| from transformers.configuration_utils import PretrainedConfig |
| from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig |
| from transformers.models.qwen2.configuration_qwen2 import Qwen2Config |
|
|
|
|
class OpenCUAConfig(PretrainedConfig):
    """Configuration for the OpenCUA-2.5-32B multimodal model.

    Pairs a Qwen2.5-VL vision-backbone configuration with a Qwen2
    text-backbone configuration under a single composite config.

    Args:
        vision_config: Vision model configuration — either a
            ``Qwen2_5_VLVisionConfig`` instance or a plain ``dict`` of its
            constructor kwargs (e.g. loaded from ``config.json``). May be
            ``None``, in which case it is stored as-is.
        text_config: Text model configuration — either a ``Qwen2Config``
            instance or a plain ``dict`` of its constructor kwargs. May be
            ``None``, in which case it is stored as-is.
        ignore_index: Stored verbatim on the config; not consumed here.
            Presumably the label id ignored by the loss (``-100`` is the
            PyTorch cross-entropy default) — confirm against the model code.
        media_placeholder_token_id: Stored verbatim on the config; by its
            name, the token id marking media positions in the text stream —
            confirm against the model/processor code.
        pad_token_id: Padding token id, forwarded to ``PretrainedConfig``.
        **kwargs: Remaining arguments forwarded to ``PretrainedConfig``.
    """

    model_type = "opencua"

    def __init__(
        self,
        vision_config: dict | Qwen2_5_VLVisionConfig | None = None,
        text_config: dict | Qwen2Config | None = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 151664,
        pad_token_id: int = 0,
        **kwargs,
    ):
        # Normalize raw dicts (deserialized JSON) into config objects;
        # pass through ready-made objects and None unchanged.
        self.vision_config = (
            Qwen2_5_VLVisionConfig(**vision_config)
            if isinstance(vision_config, dict)
            else vision_config
        )
        self.text_config = (
            Qwen2Config(**text_config)
            if isinstance(text_config, dict)
            else text_config
        )

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        # Parent init last, per the usual HF composite-config pattern,
        # so kwargs do not clobber the attributes set above.
        super().__init__(pad_token_id=pad_token_id, **kwargs)
|
|