from transformers import AutoConfig, AutoModel
from transformers import CLIPVisionModel, CLIPImageProcessor

from .base_visual_tokenizer import BaseVisualTokenizerConfig, BaseVisualTokenizer

MODEL_TYPE = "clip_visual_tokenizer"


class ClipVisualTokenizerConfig(BaseVisualTokenizerConfig):
    model_type = MODEL_TYPE

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if self.depths:
            # The CLIP vision tower is a single stage, so `depths` must contain
            # exactly one entry; it sets how many transformer layers the
            # backbone keeps.
            assert len(self.depths) == 1
            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
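
# Illustrative example (not in the original module; the value 12 is an
# assumption, and `depths` is assumed to be a kwarg accepted by
# BaseVisualTokenizerConfig):
#
#     cfg = ClipVisualTokenizerConfig(depths=[12])
#     assert cfg.backbone_kwargs['num_hidden_layers'] == 12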


class ClipVisualTokenizer(BaseVisualTokenizer):
    config_class = ClipVisualTokenizerConfig
    supports_gradient_checkpointing = True
    _no_split_modules = ["CLIPEncoderLayer"]
    _image_processor_class = CLIPImageProcessor
    _image_processor_kwargs = dict(do_center_crop=False)
    _backbone_class = CLIPVisionModel
    _backbone_name_or_path = "openai/clip-vit-large-patch14-336"

    def get_monitor_tensors(self):
        # Representative weights from the bottom and top encoder layers plus
        # the head, handy for logging weight/gradient statistics in training.
        return dict(
            backbone_bottom=self.backbone.vision_model.encoder.layers[0].self_attn.k_proj.weight,
            backbone_top=self.backbone.vision_model.encoder.layers[-1].self_attn.out_proj.weight,
            head=self.head[0].weight
        )

    def get_image_size(self):
        height = self.image_processor.crop_size["height"]
        width = self.image_processor.crop_size["width"]
        return height, width


# Register with the transformers Auto* factories so the tokenizer can be
# constructed from its model_type string or its config alone.
AutoConfig.register(MODEL_TYPE, ClipVisualTokenizerConfig)
AutoModel.register(ClipVisualTokenizerConfig, ClipVisualTokenizer)
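
# Usage sketch (illustrative, not part of the original module). Assumes the
# containing package is importable (the relative import above requires it) and
# that instantiating the model loads the default CLIP backbone, which may
# trigger a download of openai/clip-vit-large-patch14-336:
#
#     config = AutoConfig.for_model(MODEL_TYPE)   # -> ClipVisualTokenizerConfig
#     model = AutoModel.from_config(config)       # -> ClipVisualTokenizer
#     print(model.get_image_size())               # (336, 336) with the default backbone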