# NVILA-Lite-15B-Video-hf-0626 / configuration_vila.py
# Uploaded by AndyZijianZhang with `vila-upload` (commit 3d29bb3, verified).
from typing import Any, Dict, Optional
from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
class VILAConfig(PretrainedConfig):
    """Configuration for the VILA vision-language model.

    Pairs a Qwen2 text-backbone configuration with a SigLIP vision-encoder
    configuration and records the multimodal projector / special-token
    settings used to wire the two together.
    """

    # Class attributes.
    model_type: str = "vila"
    # NOTE(review): transformers convention for `sub_configs` is usually a
    # mapping to config *classes*; instances are kept here to preserve the
    # original behavior — confirm against the targeted transformers version.
    sub_configs: Dict[str, PretrainedConfig] = {
        "text_config": Qwen2Config(),
        "vision_config": SiglipVisionConfig(),
    }
    _auto_class: Optional[str] = "AutoConfig"

    # Configuration for sub-modules.
    text_config: Qwen2Config = Qwen2Config()
    vision_config: SiglipVisionConfig = SiglipVisionConfig()

    # Model configuration.
    hidden_size: int
    image_token_id: int
    mm_hidden_size: int
    mm_projector_type: str
    mm_vision_select_feature: str
    mm_vision_select_layer: int
    video_token_id: int

    def __init__(
        self,
        text_config: Optional[Dict[str, Any]] = None,
        vision_config: Optional[Dict[str, Any]] = None,
        *,
        hidden_size: int = 1536,
        image_token_id: int = 151649,
        mm_hidden_size: int = 1152,
        mm_projector_type: str = "mlp_downsample_3x3_fix",
        mm_vision_select_feature: str = "cls_patch",
        mm_vision_select_layer: int = -2,
        video_token_id: int = 151650,
        **kwargs,
    ):
        """Build the config.

        Args:
            text_config: Keyword overrides for the Qwen2 text backbone;
                library defaults are used when omitted or empty.
            vision_config: Keyword overrides for the SigLIP vision encoder;
                library defaults are used when omitted or empty.
            hidden_size: Hidden size of the language model.
            image_token_id: Token id marking image placeholders.
            mm_hidden_size: Hidden size of the vision features fed to the
                projector.
            mm_projector_type: Identifier of the multimodal projector.
            mm_vision_select_feature: Which vision features to select.
            mm_vision_select_layer: Vision-encoder layer to take features
                from (negative indexes from the end).
            video_token_id: Token id marking video placeholders.
            **kwargs: Forwarded to ``PretrainedConfig``.
        """
        super().__init__(**kwargs)

        # An absent (or empty) mapping unpacks to the library defaults.
        self.text_config = Qwen2Config(**(text_config or {}))
        self.vision_config = SiglipVisionConfig(**(vision_config or {}))

        self.hidden_size = hidden_size
        self.image_token_id = image_token_id
        self.mm_hidden_size = mm_hidden_size
        self.mm_projector_type = mm_projector_type
        self.mm_vision_select_feature = mm_vision_select_feature
        self.mm_vision_select_layer = mm_vision_select_layer
        self.video_token_id = video_token_id