Converted models to transformers standard

src/models.py CHANGED (+67 -87)
@@ -1,11 +1,14 @@
-import dataclasses
-import json
-
+from PIL import Image
 import timm
+from timm import data
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import transformers
+from transformers import PreTrainedModel
+
+from src.config import TinyCLIPConfig, TinyCLIPTextConfig, TinyCLIPVisionConfig
+from src import loss
 
 
 class Projection(nn.Module):
@@ -37,105 +40,82 @@ def mean_pooling(
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(text_representation.size()).float()
     return torch.sum(text_representation * input_mask_expanded, 1) / torch.clamp(
         input_mask_expanded.sum(1), min=1e-9
-    )
+    )  # type: ignore
 
 
-class TextEncoder(nn.Module):
-    def __init__(
-        self,
-        base: nn.Module,
-        d_in: int,
-        d_out: int,
-        n_projection_layers: int,
-        cls_token: bool,
-    ) -> None:
-        super().__init__()
-        self.base = base
-        self.cls_token = cls_token
-        self.projection = projection_layers(d_in, d_out, n_projection_layers)
-        self.base.eval()
-        for p in self.base.parameters():
-            p.requires_grad = False
+class TinyCLIPTextEncoder(PreTrainedModel):
+    config_class = TinyCLIPTextConfig
 
-    def forward(self, x):
+    def __init__(self, config: TinyCLIPTextConfig):
+        super().__init__(config)
+        self.base = transformers.AutoModel.from_pretrained(config.text_model)
+        self.cls_type = config.cls_type
+        self.projection = projection_layers(
+            self.base.config.hidden_size, config.embed_dims, config.projection_layers
+        )
+
+    def forward(self, x: dict[str, torch.Tensor]):
         out = self.base(**x).last_hidden_state
-        if self.cls_token:
+        if self.cls_type:
             out = out[:, 0]  # get CLS token output
         else:
-            out = mean_pooling(out, x["attention_mask"])
+            out = mean_pooling(out, x["attention_mask"])  # type: ignore
 
         projected_vec = self.projection(out)
         return F.normalize(projected_vec, dim=-1)
 
 
-class VisionEncoder(nn.Module):
-    def __init__(
-        self, base: nn.Module, d_in: int, d_out: int, n_projection_layers: int
-    ) -> None:
-        super().__init__()
-        self.base = base
-        self.projection = projection_layers(d_in, d_out, n_projection_layers)
-        for p in self.base.parameters():
-            p.requires_grad = False
+class TinyCLIPVisionEncoder(PreTrainedModel):
+    config_class = TinyCLIPVisionConfig
+
+    def __init__(self, config: TinyCLIPVisionConfig):
+        super().__init__(config)
+        self.base = timm.create_model(config.vision_model, num_classes=0)
+        timm_config = data.resolve_data_config({}, model=self.base)
+        self.transform = data.transforms_factory.create_transform(**timm_config)
+        self.projection = projection_layers(
+            self.base.num_features, config.embed_dims, config.projection_layers
+        )
 
-    def forward(self, x):
+    def forward(self, images: list[Image.Image]):
+        x: torch.Tensor = torch.stack([self.transform(image) for image in images])  # type: ignore
+
         projected_vec = self.projection(self.base(x))
         return F.normalize(projected_vec, dim=-1)
 
 
-class Tokenizer:
-    def __init__(self, tokenizer, max_len: int) -> None:
-        self.tokenizer = tokenizer
-        self.max_len = max_len
+class TinyCLIP(PreTrainedModel):
+    config_class = TinyCLIPConfig
 
-    def __call__(self, x: str | list[str]) -> dict[str, torch.Tensor]:
-        return self.tokenizer(
-            x, max_length=self.max_len, truncation=True, padding=True, return_tensors="pt"
-        )
-
-    def decode(self, x: dict[str, torch.Tensor]) -> list[str]:
-        return [
-            self.tokenizer.decode(sentence[:sentence_len])
-            for sentence, sentence_len in zip(x["input_ids"], x["attention_mask"].sum(axis=-1))
-        ]
-
-
-@dataclasses.dataclass(frozen=True)
-class CLIPConfig:
-    cls_token: bool = True
-    n_projection_layers: int = 3
-    embed_dims: int = 512
-    vision_model: str = "edgenext_small"
-    text_model: str = "microsoft/xtremedistil-l6-h256-uncased"
-    max_len: int = 128
-
-
-def get_model():
-    with open("./clip_config.json", "r") as f:
-        config = CLIPConfig(**json.load(f))
-
-    # load text model and tokenizer
-    text_config = transformers.AutoConfig.from_pretrained("./text_model_config/")
-    text_base = transformers.AutoModel.from_config(text_config)
-    tokenizer = Tokenizer(
-        transformers.AutoTokenizer.from_pretrained("./tokenizer/"), config.max_len
-    )
-    text_encoder = TextEncoder(
-        text_base,
-        text_base.config.hidden_size,
-        config.embed_dims,
-        config.n_projection_layers,
-        config.cls_token,
-    )
-    text_encoder.load_state_dict(torch.load("./text.ckpt", map_location=torch.device("cpu")))
-
-    # load vision model and image transform
-    image_base = timm.create_model(config.vision_model, num_classes=0)
-    timm_config = timm.data.resolve_data_config({}, model=image_base)
-    transform = timm.data.transforms_factory.create_transform(**timm_config)
-    vision_encoder = VisionEncoder(
-        image_base, image_base.num_features, config.embed_dims, config.n_projection_layers
-    )
-    vision_encoder.load_state_dict(torch.load("./vision.ckpt", map_location=torch.device("cpu")))
+    def __init__(self, config: TinyCLIPConfig):
+        super().__init__(config)
+        self.text_encoder = TinyCLIPTextEncoder(config.text_config)
+        self.vision_encoder = TinyCLIPVisionEncoder(config.vision_config)
+
+        if config.freeze_text_base:
+            self.text_encoder.base.eval()
+            for param in self.text_encoder.parameters():
+                param.requires_grad = False
+
+        if config.freeze_vision_base:
+            self.vision_encoder.base.eval()
+            for param in self.vision_encoder.parameters():
+                param.requires_grad = False
+
+        self.loss_fn = loss.get_loss(config.loss_type)
+
+    def forward(
+        self,
+        text_input: dict[str, torch.Tensor],
+        vision_input: list[Image.Image],
+        return_loss: bool = False,
+    ) -> dict[str, torch.Tensor]:
+        text_output = self.text_encoder(text_input)
+        vision_output = self.vision_encoder(vision_input)
+
+        out = {"text_output": text_output, "vision_output": vision_output}
+
+        if return_loss:
+            out["loss"] = self.loss_fn(vision_output, text_output)
 
-    return text_encoder, tokenizer, vision_encoder, transform
+        return out
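For reference, a minimal usage sketch of the converted API (not part of this commit). It assumes TinyCLIPConfig() ships workable defaults (e.g. the same edgenext_small and microsoft/xtremedistil-l6-h256-uncased bases as the old CLIPConfig), that config.text_config.text_model also resolves to a tokenizer repo, and that the image paths are placeholders:

import transformers
from PIL import Image

from src.config import TinyCLIPConfig
from src.models import TinyCLIP

# Build the model from a config; checkpoint handling moves from the old
# get_model() with its manual torch.load calls to the transformers API.
config = TinyCLIPConfig()  # assumed defaults
model = TinyCLIP(config)

# forward() expects already-tokenized text plus raw PIL images; the vision
# encoder applies its own timm transform internally.
tokenizer = transformers.AutoTokenizer.from_pretrained(config.text_config.text_model)
text_input = tokenizer(
    ["a photo of a cat", "a photo of a dog"],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
images = [Image.open("cat.jpg"), Image.open("dog.jpg")]  # placeholder paths

out = model(text_input, images, return_loss=True)
print(out["text_output"].shape, out["vision_output"].shape, out["loss"])

# Subclassing PreTrainedModel gives the standard persistence round-trip:
model.save_pretrained("./tiny-clip")
model = TinyCLIP.from_pretrained("./tiny-clip")

The freeze_text_base / freeze_vision_base flags replace the unconditional freezing the old encoders hard-coded, and loss_type selects the loss that previously lived outside the model.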