| from typing import Any, Dict, Optional |
|
|
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.models.qwen2.configuration_qwen2 import Qwen2Config |
| from transformers.models.siglip.configuration_siglip import SiglipVisionConfig |
|
|
|
|
class VILAConfig(PretrainedConfig):
    """Configuration for the VILA vision-language model.

    Pairs a Qwen2 language backbone (``text_config``) with a SigLIP vision
    encoder (``vision_config``) and records the multimodal projector settings
    that bridge them.
    """

    model_type: str = "vila"
    # NOTE(review): transformers expects ``sub_configs`` to map names to config
    # *classes*, which it calls as constructors when rebuilding a composite
    # config (e.g. in ``from_dict``). The original stored pre-built instances
    # here, which breaks that protocol and shares mutable state class-wide.
    sub_configs: Dict[str, Any] = {
        "text_config": Qwen2Config,
        "vision_config": SiglipVisionConfig,
    }
    _auto_class: Optional[str] = "AutoConfig"

    # Instance-attribute annotations only — deliberately no class-level
    # defaults. ``__init__`` always assigns these, and default *instances* at
    # class scope would be single mutable objects shared by every VILAConfig.
    text_config: Qwen2Config
    vision_config: SiglipVisionConfig
    hidden_size: int
    image_token_id: int
    mm_hidden_size: int
    mm_projector_type: str
    mm_vision_select_feature: str
    mm_vision_select_layer: int
    video_token_id: int

    def __init__(
        self,
        text_config: Optional[Dict[str, Any]] = None,
        vision_config: Optional[Dict[str, Any]] = None,
        *,
        hidden_size: int = 1536,
        image_token_id: int = 151649,
        mm_hidden_size: int = 1152,
        mm_projector_type: str = "mlp_downsample_3x3_fix",
        mm_vision_select_feature: str = "cls_patch",
        mm_vision_select_layer: int = -2,
        video_token_id: int = 151650,
        **kwargs,
    ):
        """Build a VILA configuration.

        Args:
            text_config: Keyword arguments for :class:`Qwen2Config`. ``None``
                or an empty dict yields the Qwen2 defaults.
            vision_config: Keyword arguments for :class:`SiglipVisionConfig`.
                ``None`` or an empty dict yields the SigLIP defaults.
            hidden_size: Hidden size of the language model.
            image_token_id: Vocabulary id of the image placeholder token.
            mm_hidden_size: Hidden size of the vision features fed to the
                multimodal projector.
            mm_projector_type: Identifier of the multimodal projector variant.
            mm_vision_select_feature: Which vision features to select
                (e.g. ``"cls_patch"``).
            mm_vision_select_layer: Vision encoder layer to take features
                from; negative values index from the end.
            video_token_id: Vocabulary id of the video placeholder token.
            **kwargs: Forwarded to :class:`PretrainedConfig`.
        """
        super().__init__(**kwargs)

        # Accept the serialized dict form; falsy (None / {}) means defaults,
        # matching the original behavior.
        self.text_config = Qwen2Config(**text_config) if text_config else Qwen2Config()
        self.vision_config = SiglipVisionConfig(**vision_config) if vision_config else SiglipVisionConfig()

        self.hidden_size = hidden_size
        self.image_token_id = image_token_id
        self.mm_hidden_size = mm_hidden_size
        self.mm_projector_type = mm_projector_type
        self.mm_vision_select_feature = mm_vision_select_feature
        self.mm_vision_select_layer = mm_vision_select_layer
        self.video_token_id = video_token_id
|
|