|
|
import transformers |
|
|
from transformers import AutoConfig, PretrainedConfig |
|
|
|
|
|
|
|
|
class HCXVisionConfig(PretrainedConfig):
    """Configuration for the HCX vision-language model (model type ``"vlm"``).

    Pairs a text (language decoder) sub-config with a vision (encoder)
    sub-config and stores the multimodal-projector / image / video options
    used by the model. Sub-configs may be passed as config objects, as plain
    dicts, or resolved automatically from pretrained model paths.
    """

    model_type = "vlm"
    # Keys excluded from model outputs when serializing at inference time.
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        text_model_name_or_path=None,
        vision_model_name_or_path=None,
        q_former_model_name_or_path=None,
        mm_projector_type="mlp",
        use_nth_layer=-2,
        img_start_id=100271,
        video_start_id=100270,
        freeze_encoder=False,
        freeze_decoder=False,
        freeze_mm_projector=False,
        anyres=False,
        unpad=False,
        max_num_grids=-1,
        num_queries_vis_abstractor=-1,
        video_num_queries_fast=None,
        video_num_queries_slow=None,
        video_first_last_frames_slows=None,
        video_max_num_frames=None,
        ignore_index=-100,
        proj_pos_emb=True,
        proj_prenorm=False,
        use_1x1_grid=False,
        possible_resolutions=None,
        **kwargs,
    ):
        """Build the composite VLM config.

        Args:
            text_config: Text sub-config (config object or dict), or ``None``
                to resolve it from ``text_model_name_or_path`` or a
                ``language_config`` kwarg.
            vision_config: Vision sub-config (config object or dict), or
                ``None`` to resolve it from ``vision_model_name_or_path``.
            text_model_name_or_path: Pretrained identifier for the text model.
            vision_model_name_or_path: Pretrained identifier for the vision
                model.
            q_former_model_name_or_path: Pretrained identifier for a Q-Former.
            mm_projector_type: Multimodal projector variant (default
                ``"mlp"``).
            use_nth_layer: Which encoder layer to tap (negative counts from
                the end).
            img_start_id: Token id marking image content; also exposed as
                ``image_token_id``.
            video_start_id: Token id marking video content; also exposed as
                ``video_token_id``.
            freeze_encoder: Freeze the vision encoder during training.
            freeze_decoder: Freeze the text decoder during training.
            freeze_mm_projector: Freeze the multimodal projector.
            anyres: Enable any-resolution image tiling.
            unpad: Unpadding option for tiled images.
            max_num_grids: Cap on image grids (``-1`` = unbounded here).
            num_queries_vis_abstractor: Query count for the visual abstractor
                (``-1`` disables).
            video_num_queries_fast: Query count for fast video frames.
            video_num_queries_slow: Query count for slow video frames.
            video_first_last_frames_slows: Slow-frame selection option.
            video_max_num_frames: Cap on sampled video frames.
            ignore_index: Label value ignored by the loss (default ``-100``).
            proj_pos_emb: Use positional embeddings in the projector.
            proj_prenorm: Apply pre-normalization in the projector.
            use_1x1_grid: Force a 1x1 tiling grid.
            possible_resolutions: Allowed resolutions for any-res tiling;
                ``None`` means an empty list.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``; may include
                ``language_config`` (dict), which takes precedence when
                building the text sub-config.
        """
        import re

        from transformers import CONFIG_MAPPING

        # Fix for the mutable-default-argument pitfall: the original default
        # ``[]`` would be shared across all instances. A ``None`` sentinel is
        # backward compatible and yields a fresh list per instance.
        if possible_resolutions is None:
            possible_resolutions = []

        # Text-config resolution order: an explicit ``language_config`` kwarg
        # wins, then a pretrained path; a directly supplied ``text_config``
        # is otherwise kept as-is.
        if kwargs.get("language_config", None) is not None:
            text_config = CONFIG_MAPPING[kwargs["language_config"]["model_type"]](**kwargs["language_config"])
        elif text_config is None and text_model_name_or_path is not None:
            text_config = AutoConfig.from_pretrained(text_model_name_or_path, trust_remote_code=True)
        if vision_config is None and vision_model_name_or_path is not None:
            vision_config = AutoConfig.from_pretrained(vision_model_name_or_path, trust_remote_code=True)

        # Promote dict sub-configs to their concrete config classes.
        if isinstance(text_config, dict):
            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)

        if isinstance(vision_config, dict):
            if vision_config["model_type"] == "qwen2_5_vl":
                # The standalone vision tower is registered under its own
                # model type by the registration block at module bottom.
                vision_config["model_type"] = "qwen2_5_vl_visual"
                # Compare numeric version components — the original plain
                # string comparison is wrong lexicographically (e.g.
                # "4.9.0" > "4.52.4"). Raise instead of ``assert`` so the
                # check survives ``python -O``.
                version = tuple(int(n) for n in re.findall(r"\d+", transformers.__version__)[:3])
                if version < (4, 52, 4):
                    raise ImportError("please upgrade transformers to 4.52.4 or higher")
            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)

        self.text_config = text_config
        self.vision_config = vision_config

        if text_config is not None:
            # GPT-2-style configs name the embedding width ``n_embd``.
            self.hidden_size = text_config.hidden_size if hasattr(text_config, "hidden_size") else text_config.n_embd

        self.text_model_name_or_path = text_model_name_or_path
        self.vision_model_name_or_path = vision_model_name_or_path
        self.q_former_model_name_or_path = q_former_model_name_or_path
        self.mm_projector_type = mm_projector_type
        self.use_nth_layer = use_nth_layer
        self.freeze_encoder = freeze_encoder
        self.freeze_decoder = freeze_decoder
        self.freeze_mm_projector = freeze_mm_projector
        self.anyres = anyres
        self.unpad = unpad
        self.max_num_grids = max_num_grids
        self.num_queries_vis_abstractor = num_queries_vis_abstractor
        self.video_num_queries_fast = video_num_queries_fast
        self.video_num_queries_slow = video_num_queries_slow
        self.video_first_last_frames_slows = video_first_last_frames_slows
        self.video_max_num_frames = video_max_num_frames
        self.img_start_id = img_start_id
        # Alias kept for code paths expecting the HF-style attribute name.
        self.image_token_id = img_start_id
        self.video_start_id = video_start_id
        self.video_token_id = video_start_id
        self.ignore_index = ignore_index
        self.proj_pos_emb = proj_pos_emb
        self.proj_prenorm = proj_prenorm
        self.use_1x1_grid = use_1x1_grid
        self.possible_resolutions = possible_resolutions

        super().__init__(**kwargs)

        # Mirror the text model's pad token so utilities that read only the
        # top-level config still see it.
        if self.text_config is not None:
            self.pad_token_id = self.text_config.pad_token_id
|
|
|
|
|
|
|
|
# Make AutoConfig.from_pretrained resolve model_type "vlm" to this class.
AutoConfig.register("vlm", HCXVisionConfig)
|
|
# Best-effort registration of the companion HyperCLOVA X text config. The
# sibling module may be absent in some distributions, so failure to import
# or register is non-fatal. The original bare ``except:`` also swallowed
# SystemExit/KeyboardInterrupt; narrow to the failures we expect.
try:
    from .configuration_hyperclovax import HyperCLOVAXConfig

    AutoConfig.register("hyperclovax", HyperCLOVAXConfig)
except (ImportError, ValueError):
    # ImportError: companion module not shipped.
    # ValueError: "hyperclovax" already registered by another module.
    pass
|
|
# Best-effort: expose Qwen2.5-VL's vision tower as a standalone model type
# ("qwen2_5_vl_visual") so HCXVisionConfig can instantiate it as its vision
# sub-config. Requires a transformers build that ships these classes; skip
# quietly otherwise. The original bare ``except:`` also swallowed
# SystemExit/KeyboardInterrupt; narrow to the failures we expect.
try:
    from transformers import CONFIG_MAPPING, MODEL_MAPPING
    from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
        Qwen2_5_VisionTransformerPretrainedModel,
        Qwen2_5_VLPatchMerger,
        Qwen2_5_VLVisionConfig,
    )

    MODEL_MAPPING.register(Qwen2_5_VLVisionConfig, Qwen2_5_VisionTransformerPretrainedModel)
    CONFIG_MAPPING.register("qwen2_5_vl_visual", Qwen2_5_VLVisionConfig)
except (ImportError, ValueError):
    # ImportError: transformers too old to include Qwen2.5-VL.
    # ValueError: mapping entry already registered.
    pass
|
|
|