# lmflow.models.vision_encoder.clip_encoder

## Classes

- `CLIPVisionTower`

## Functions

- `build_vision_tower(vision_tower_cfg, **kwargs)`

## Module Contents

### lmflow.models.vision_encoder.clip_encoder.build_vision_tower(vision_tower_cfg, **kwargs)

### class lmflow.models.vision_encoder.clip_encoder.CLIPVisionTower(vision_tower, args, delay_load=False)

Bases: `torch.nn.Module`

Attributes:

- `is_loaded = False`
- `vision_tower_name`
- `select_layer`
- `select_feature`

Methods:

- `load_model()`
- `encode_images(images, language_projection)`
- `feature_select(image_forward_outs)`
- `forward(images)`
- `prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images, language_projection=None, language_model=None, **kwargs)`

  Copied from the LLaVA code base; should be polished.

Properties:

- `dummy_feature`
- `dtype`
- `device`
- `config`
- `hidden_size`
- `num_patches`
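
Since this page lists only signatures, a short usage sketch may help. The sketch below assumes a LLaVA-style configuration: the checkpoint id `openai/clip-vit-large-patch14` and the attribute names `mm_vision_tower`, `mm_vision_select_layer`, and `mm_vision_select_feature` are assumptions, not guaranteed by this page.

```python
# Minimal sketch: build a CLIPVisionTower and extract patch features.
# The config field names and checkpoint id below are assumptions
# (LLaVA-style); adapt them to your actual setup.
from types import SimpleNamespace

import torch

from lmflow.models.vision_encoder.clip_encoder import build_vision_tower

cfg = SimpleNamespace(
    mm_vision_tower="openai/clip-vit-large-patch14",  # assumed HF checkpoint id
    mm_vision_select_layer=-2,         # hidden layer to take features from
    mm_vision_select_feature="patch",  # keep patch tokens, drop the CLS token
)

tower = build_vision_tower(cfg)  # returns a loaded CLIPVisionTower

# Dummy batch at CLIP ViT-L/14's expected input resolution (224x224).
images = torch.randn(1, 3, 224, 224, dtype=tower.dtype, device=tower.device)

with torch.no_grad():
    feats = tower(images)  # forward(images) -> (batch, num_patches, hidden_size)

print(feats.shape, tower.hidden_size, tower.num_patches)
```

With `delay_load=True`, the constructor presumably skips fetching the weights, in which case `load_model()` would need to be called explicitly before the first forward pass.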
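
The `encode_images(images, language_projection)` signature suggests projecting the vision features into a language model's embedding space. Continuing the sketch above, and assuming `language_projection` is a linear layer mapping `hidden_size` to the LM hidden dimension (the 4096 below is a placeholder, not taken from this page):

```python
# Hypothetical projection head: CLIP hidden_size -> LM embedding dim.
import torch.nn as nn

language_projection = nn.Linear(tower.hidden_size, 4096).to(tower.device, dtype=tower.dtype)

with torch.no_grad():
    projected = tower.encode_images(images, language_projection)

print(projected.shape)  # (batch, num_patches, 4096) under these assumptions
```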