# model.py
import torch
import torch.nn as nn
from torchvision.models import resnet50
from transformers import DistilBertModel
import config
class VisionEncoder(nn.Module):
    """Frozen ResNet-50 backbone that turns an image batch into flat feature vectors.

    The ImageNet-pretrained classifier head is removed; only the convolutional
    stages and the global average pool remain, and all weights are frozen so
    the encoder acts as a fixed feature extractor.
    """

    def __init__(self):
        super().__init__()
        backbone = resnet50(weights='IMAGENET1K_V1')
        # Keep everything except the final fully-connected classifier layer.
        self.model = nn.Sequential(*list(backbone.children())[:-1])
        # Freeze the backbone: gradients flow only through the projection heads.
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, x):
        """Encode images and flatten the pooled features to (batch, features)."""
        features = self.model(x)
        return features.view(features.size(0), -1)
class TextEncoder(nn.Module):
    """Frozen DistilBERT encoder that yields one embedding per input sequence.

    All transformer weights are frozen; the first-token hidden state of the
    last layer is used as the sentence-level representation.
    """

    def __init__(self):
        super().__init__()
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # Freeze the language model so only the projection heads are trained.
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, input_ids, attention_mask=None):
        """Return the last hidden state of the first token, shape (batch, hidden)."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # The leading token's hidden state serves as the sequence embedding.
        cls_state = outputs.last_hidden_state[:, 0, :]
        return cls_state
class ProjectionHead(nn.Module):
    """Project an encoder embedding into the shared multimodal space.

    Architecture: Linear -> GELU -> Linear -> Dropout, with a residual
    connection from the first linear projection, then LayerNorm.

    Args:
        embedding_dim: dimensionality of the incoming encoder features.
        projection_dim: dimensionality of the shared space
            (defaults to config.PROJECTION_DIM).
    """

    def __init__(self, embedding_dim, projection_dim=config.PROJECTION_DIM):
        super().__init__()
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        """Map x of shape (batch, embedding_dim) to (batch, projection_dim)."""
        residual = self.projection(x)
        out = self.dropout(self.fc(self.gelu(residual)))
        # Residual connection around the inner MLP, then normalize.
        return self.layer_norm(out + residual)
class CLIPModel(nn.Module):
    """Dual-encoder CLIP-style model: frozen backbones plus trainable heads.

    Holds a frozen vision encoder, a frozen text encoder, and one trainable
    projection head per modality that map both into a shared embedding space.
    """

    def __init__(self):
        super().__init__()
        self.vision_encoder = VisionEncoder()
        self.text_encoder = TextEncoder()
        self.image_projection = ProjectionHead(embedding_dim=config.IMAGE_EMBEDDING_DIM)
        self.text_projection = ProjectionHead(embedding_dim=config.TEXT_EMBEDDING_DIM)

    def forward(self, batch):
        # Intentionally a no-op: the training loop is not part of this file,
        # and the app calls the encoders and projection heads individually.
        pass