| import transformers |
| from transformers import AutoConfig, PretrainedConfig |
|
|
|
|
class HCXVisionConfig(PretrainedConfig):
    """Configuration for the HCX vision-language model (``model_type="vlm"``).

    Pairs a language-model configuration (``text_config``) with a vision-encoder
    configuration (``vision_config``). Either may be passed as a config object,
    as a plain dict (rebuilt via ``CONFIG_MAPPING``), or resolved lazily from a
    ``*_model_name_or_path`` checkpoint name. The remaining parameters control
    the multimodal projector, encoder/decoder freezing, and any-resolution
    (grid) image handling.
    """

    model_type = "vlm"
    # Runtime-only KV cache state; excluded from serialization/comparison.
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        text_model_name_or_path=None,
        vision_model_name_or_path=None,
        q_former_model_name_or_path=None,
        mm_projector_type="mlp",
        use_nth_layer=-2,
        img_start_id=100271,
        video_start_id=100270,
        freeze_encoder=False,
        freeze_decoder=False,
        freeze_mm_projector=False,
        anyres=False,
        unpad=False,
        max_num_grids=-1,
        num_queries_vis_abstractor=-1,
        video_num_queries_fast=None,
        video_num_queries_slow=None,
        video_first_last_frames_slows=None,
        video_max_num_frames=None,
        ignore_index=-100,
        proj_pos_emb=True,
        proj_prenorm=False,
        use_1x1_grid=False,
        possible_resolutions=None,
        **kwargs,
    ):
        """Build the composite config.

        Args:
            text_config: Language-model config object or dict, or ``None`` to
                resolve it from ``text_model_name_or_path`` (or a legacy
                ``language_config`` entry in ``kwargs``).
            vision_config: Vision-encoder config object or dict, or ``None`` to
                resolve it from ``vision_model_name_or_path``.
            mm_projector_type: Architecture of the multimodal projector.
            use_nth_layer: Which encoder layer's features to use (negative
                indexes from the end; ``-2`` = penultimate layer).
            img_start_id / video_start_id: Special token ids marking image and
                video segments (also exposed as ``image_token_id`` /
                ``video_token_id``).
            possible_resolutions: List of supported (anyres) resolutions.
                Defaults to an empty list when ``None``. (Previously used a
                mutable ``[]`` default, which is shared across calls.)
            **kwargs: Forwarded to :class:`PretrainedConfig`.
        """
        # Imported lazily to avoid a circular import at module load time.
        from transformers import CONFIG_MAPPING

        # Legacy checkpoints store the text config under "language_config".
        if kwargs.get("language_config", None) is not None:
            text_config = CONFIG_MAPPING[kwargs["language_config"]["model_type"]](**kwargs["language_config"])
        elif text_config is None and text_model_name_or_path is not None:
            text_config = AutoConfig.from_pretrained(text_model_name_or_path, trust_remote_code=True)
        if vision_config is None and vision_model_name_or_path is not None:
            vision_config = AutoConfig.from_pretrained(vision_model_name_or_path, trust_remote_code=True)

        # Serialized configs round-trip as plain dicts; rebuild typed configs.
        if isinstance(text_config, dict):
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)

        if isinstance(vision_config, dict):
            # The standalone Qwen2.5-VL visual tower is registered at module
            # scope under its own model_type; remap so only the encoder
            # (not the full VL model) config is built.
            if vision_config["model_type"] == "qwen2_5_vl":
                vision_config["model_type"] = "qwen2_5_vl_visual"
                # NOTE(review): lexicographic string comparison — a future
                # "4.100.0" would wrongly compare < "4.52.4". Adequate for
                # current 4.5x releases; consider packaging.version.parse.
                assert transformers.__version__ >= "4.52.4", "please upgrade transformers to 4.52.4 or higher"
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)

        self.text_config = text_config
        self.vision_config = vision_config

        if text_config is not None:
            # GPT-2-style configs expose n_embd instead of hidden_size.
            self.hidden_size = text_config.hidden_size if hasattr(text_config, "hidden_size") else text_config.n_embd

        self.text_model_name_or_path = text_model_name_or_path
        self.vision_model_name_or_path = vision_model_name_or_path
        self.q_former_model_name_or_path = q_former_model_name_or_path
        self.mm_projector_type = mm_projector_type
        self.use_nth_layer = use_nth_layer
        self.freeze_encoder = freeze_encoder
        self.freeze_decoder = freeze_decoder
        self.freeze_mm_projector = freeze_mm_projector
        self.anyres = anyres
        self.unpad = unpad
        self.max_num_grids = max_num_grids
        self.num_queries_vis_abstractor = num_queries_vis_abstractor
        self.video_num_queries_fast = video_num_queries_fast
        self.video_num_queries_slow = video_num_queries_slow
        self.video_first_last_frames_slows = video_first_last_frames_slows
        self.video_max_num_frames = video_max_num_frames
        # Start ids double as the canonical image/video token ids.
        self.img_start_id = img_start_id
        self.image_token_id = img_start_id
        self.video_start_id = video_start_id
        self.video_token_id = video_start_id
        self.ignore_index = ignore_index
        self.proj_pos_emb = proj_pos_emb
        self.proj_prenorm = proj_prenorm
        self.use_1x1_grid = use_1x1_grid
        # Fix: None sentinel avoids the shared-mutable-default pitfall while
        # keeping the effective default (empty list) identical for callers.
        self.possible_resolutions = [] if possible_resolutions is None else possible_resolutions
        super().__init__(**kwargs)
        # Keep the top-level pad token in sync with the language model's.
        if self.text_config is not None:
            self.pad_token_id = self.text_config.pad_token_id
|
|
|
|
# Register this config under its model_type so AutoConfig can resolve "vlm".
AutoConfig.register("vlm", HCXVisionConfig)

# Best-effort: register the companion HyperCLOVA-X text config if the sibling
# module ships with this file. (Was a bare `except:`, which also swallowed
# KeyboardInterrupt/SystemExit; narrowed to Exception to keep the best-effort
# behavior without trapping interpreter-exit signals.)
try:
    from .configuration_hyperclovax import HyperCLOVAXConfig

    AutoConfig.register("hyperclovax", HyperCLOVAXConfig)
except Exception:
    pass

# Best-effort: expose the Qwen2.5-VL visual tower as a standalone registered
# model so the "qwen2_5_vl_visual" model_type (remapped in HCXVisionConfig)
# resolves through CONFIG_MAPPING/MODEL_MAPPING. Requires a transformers
# version that provides these classes; silently skipped otherwise.
try:
    from transformers import CONFIG_MAPPING, MODEL_MAPPING
    from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
        Qwen2_5_VisionTransformerPretrainedModel,
        Qwen2_5_VLPatchMerger,
        Qwen2_5_VLVisionConfig,
    )

    MODEL_MAPPING.register(Qwen2_5_VLVisionConfig, Qwen2_5_VisionTransformerPretrainedModel)
    CONFIG_MAPPING.register("qwen2_5_vl_visual", Qwen2_5_VLVisionConfig)
except Exception:
    pass
|
|