from transformers import PretrainedConfig


class CapriConfig(PretrainedConfig):
    """Configuration object registered under the ``"capri"`` model type.

    Each named constructor argument is stored unchanged on the instance
    under an attribute of the same name. Any additional keyword arguments
    are forwarded to ``PretrainedConfig.__init__`` after those attributes
    have been set.

    The descriptions below are inferred from the parameter names and
    defaults only; confirm them against the code that consumes this config.

    Args:
        text_model_name_or_path: Presumably the identifier or path of the
            text model. Defaults to ``"Qwen/Qwen2.5-0.5B"``.
        vision_model_name_or_path: Presumably the identifier or path of the
            vision model. Defaults to ``"google/siglip2-base-patch16-224"``.
        adapter_subdir: Presumably a sub-directory name for the text
            adapter. Defaults to ``"text_adapter"``.
        projector_type: Projector kind selector. Defaults to ``"mlp"``.
        projector_in_dim: Projector input width. Defaults to ``768``.
        projector_hidden_dim: Projector hidden width. Defaults to ``3072``.
        projector_out_dim: Projector output width. Defaults to ``896``.
        image_token: Image placeholder token string. Defaults to the empty
            string.
        image_token_id: Integer id paired with ``image_token``. Defaults to
            ``151665``.
        prompt_prefix: Text prefix for prompts. Defaults to ``" Caption:"``.
        max_length: Length limit. Defaults to ``64``.
        load_vision_tower_by_default: Flag stored as-is. Defaults to
            ``False``.
        processor_class: Name of the associated processor class. Defaults
            to ``"CapriProcessor"``.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "capri"

    def __init__(
        self,
        text_model_name_or_path="Qwen/Qwen2.5-0.5B",
        vision_model_name_or_path="google/siglip2-base-patch16-224",
        adapter_subdir="text_adapter",
        projector_type="mlp",
        projector_in_dim=768,
        projector_hidden_dim=3072,
        projector_out_dim=896,
        image_token="",
        image_token_id=151665,
        prompt_prefix=" Caption:",
        max_length=64,
        load_vision_tower_by_default=False,
        processor_class="CapriProcessor",
        **kwargs,
    ):
        # Backbone identifiers and adapter location.
        self.text_model_name_or_path = text_model_name_or_path
        self.vision_model_name_or_path = vision_model_name_or_path
        self.adapter_subdir = adapter_subdir

        # Projector settings.
        self.projector_type = projector_type
        self.projector_in_dim = projector_in_dim
        self.projector_hidden_dim = projector_hidden_dim
        self.projector_out_dim = projector_out_dim

        # Image placeholder token.
        # NOTE(review): the default for image_token is an empty string in
        # this file -- confirm that is intended.
        self.image_token = image_token
        self.image_token_id = image_token_id

        # Prompting / length settings.
        self.prompt_prefix = prompt_prefix
        self.max_length = max_length

        # Loading behaviour and processor association.
        self.load_vision_tower_by_default = load_vision_tower_by_default
        self.processor_class = processor_class

        # Custom fields are assigned first; the base initializer then
        # consumes the remaining keyword arguments.
        super().__init__(**kwargs)