| from transformers import PretrainedConfig | |
class CapriConfig(PretrainedConfig):
    """Configuration object registered under the ``"capri"`` model type.

    Each constructor argument is stored, unmodified, as an instance
    attribute of the same name. Any further keyword arguments are handed to
    ``PretrainedConfig.__init__`` once those attributes have been set.

    NOTE(review): the per-field descriptions below are inferred from the
    argument names and defaults only; nothing in this class consumes the
    values. Confirm the meanings against the model/processor code.

    Args:
        text_model_name_or_path: Presumably the checkpoint id or path of
            the language model. Defaults to ``"Qwen/Qwen2.5-0.5B"``.
        vision_model_name_or_path: Presumably the checkpoint id or path of
            the vision encoder. Defaults to
            ``"google/siglip2-base-patch16-224"``.
        adapter_subdir: Presumably a sub-directory name for text adapter
            weights. Defaults to ``"text_adapter"``.
        projector_type: Name of the projector variant. Defaults to
            ``"mlp"``.
        projector_in_dim: Projector input width. Defaults to ``768``.
        projector_hidden_dim: Projector hidden width. Defaults to ``3072``.
        projector_out_dim: Projector output width. Defaults to ``896``.
        image_token: Image placeholder string. Defaults to ``"<image>"``.
        image_token_id: Token id paired with ``image_token`` (TODO confirm
            it matches the tokenizer in use). Defaults to ``151665``.
        prompt_prefix: Default prompt text. Defaults to
            ``"<image> Caption:"``.
        max_length: Length limit; whether it applies to tokenization or
            generation is not visible here. Defaults to ``64``.
        load_vision_tower_by_default: Flag stored for downstream loading
            code. Defaults to ``False``.
        processor_class: Name of the associated processor class. Defaults
            to ``"CapriProcessor"``.
        **kwargs: Forwarded verbatim to ``PretrainedConfig.__init__``.
    """

    model_type = "capri"

    def __init__(
        self,
        text_model_name_or_path="Qwen/Qwen2.5-0.5B",
        vision_model_name_or_path="google/siglip2-base-patch16-224",
        adapter_subdir="text_adapter",
        projector_type="mlp",
        projector_in_dim=768,
        projector_hidden_dim=3072,
        projector_out_dim=896,
        image_token="<image>",
        image_token_id=151665,
        prompt_prefix="<image> Caption:",
        max_length=64,
        load_vision_tower_by_default=False,
        processor_class="CapriProcessor",
        **kwargs,
    ):
        # Gather the Capri-specific settings in declaration order; dicts keep
        # insertion order, so attributes are set in the same sequence as the
        # signature lists them.
        capri_fields = {
            "text_model_name_or_path": text_model_name_or_path,
            "vision_model_name_or_path": vision_model_name_or_path,
            "adapter_subdir": adapter_subdir,
            "projector_type": projector_type,
            "projector_in_dim": projector_in_dim,
            "projector_hidden_dim": projector_hidden_dim,
            "projector_out_dim": projector_out_dim,
            "image_token": image_token,
            "image_token_id": image_token_id,
            "prompt_prefix": prompt_prefix,
            "max_length": max_length,
            "load_vision_tower_by_default": load_vision_tower_by_default,
            "processor_class": processor_class,
        }
        for attr_name, attr_value in capri_fields.items():
            # setattr goes through the normal attribute-setting path, exactly
            # like a plain ``self.<name> = value`` statement would.
            setattr(self, attr_name, attr_value)

        # The base initializer runs last and receives only the leftover
        # keyword arguments.
        super().__init__(**kwargs)