# model.py
"""CLIP-style dual-encoder model: a frozen ResNet-50 vision tower and a
frozen DistilBERT text tower, each followed by a trainable projection head
that maps into a shared embedding space of size ``config.PROJECTION_DIM``."""
import torch
import torch.nn as nn
from torchvision.models import resnet50
from transformers import DistilBertModel

import config


class VisionEncoder(nn.Module):
    """Frozen ResNet-50 backbone mapping an image batch to flat feature vectors."""

    def __init__(self):
        super().__init__()
        pretrained_resnet50 = resnet50(weights='IMAGENET1K_V1')
        # Drop the final fully-connected classifier; keep conv stages + global pool.
        self.model = nn.Sequential(*list(pretrained_resnet50.children())[:-1])
        for param in self.model.parameters():
            param.requires_grad = False
        # NOTE(review): parameters are frozen, but BatchNorm running stats still
        # update whenever this module is in train() mode. If the backbone is meant
        # to be fully frozen, keep it in eval() during training — confirm intent.

    def forward(self, x):
        """Return (batch, features) image embeddings (2048-d for ResNet-50)."""
        x = self.model(x)
        return x.view(x.size(0), -1)


class TextEncoder(nn.Module):
    """Frozen DistilBERT encoder returning the [CLS]-position hidden state."""

    def __init__(self):
        super().__init__()
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask=None):
        """Return the hidden state at sequence position 0, shape (batch, hidden)."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 is the [CLS] token; used here as the sentence embedding.
        return outputs.last_hidden_state[:, 0, :]


class ProjectionHead(nn.Module):
    """Project encoder features into the shared embedding space.

    Linear -> GELU -> Linear -> Dropout, with a residual connection from the
    first projection, followed by LayerNorm (as in the CLIP training recipe
    popularized by Shariatnia's implementation).
    """

    def __init__(self, embedding_dim, projection_dim=config.PROJECTION_DIM):
        super().__init__()
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        projected = self.projection(x)
        x = self.gelu(projected)
        x = self.fc(x)
        x = self.dropout(x)
        # Residual connection back to the first projection output.
        x = x + projected
        x = self.layer_norm(x)
        return x


class CLIPModel(nn.Module):
    """Dual-encoder CLIP model combining both towers and their projections."""

    def __init__(self):
        super().__init__()
        self.vision_encoder = VisionEncoder()
        self.text_encoder = TextEncoder()
        self.image_projection = ProjectionHead(embedding_dim=config.IMAGE_EMBEDDING_DIM)
        self.text_projection = ProjectionHead(embedding_dim=config.TEXT_EMBEDDING_DIM)

    def forward(self, batch):
        """Encode a batch and return (image_embeddings, text_embeddings).

        Both outputs are L2-normalized so their dot products are cosine
        similarities, ready for a contrastive (InfoNCE) loss.

        NOTE(review): assumes ``batch`` is a mapping with keys ``'image'``,
        ``'input_ids'`` and optionally ``'attention_mask'`` — confirm against
        the training loop. (The original body was a bare ``pass`` stub that
        returned ``None``; in the app the sub-modules are used separately.)
        """
        image_features = self.vision_encoder(batch['image'])
        text_features = self.text_encoder(
            input_ids=batch['input_ids'],
            attention_mask=batch.get('attention_mask'),
        )
        image_embeddings = self.image_projection(image_features)
        text_embeddings = self.text_projection(text_features)
        image_embeddings = nn.functional.normalize(image_embeddings, dim=-1)
        text_embeddings = nn.functional.normalize(text_embeddings, dim=-1)
        return image_embeddings, text_embeddings