# Uploaded by: Advait3009
# Commit: "Create embedder.py"
# Revision: d2a0e9f (verified)
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel
from sentence_transformers import SentenceTransformer
class MultiModalEmbedder:
    """Embed text and images with lazily loaded pretrained encoders.

    Text is encoded with a ``SentenceTransformer`` model and images with a
    CLIP checkpoint loaded through ``AutoModel``/``AutoProcessor``. Nothing is
    downloaded or moved to a device until the first ``embed_*`` call (or an
    explicit ``load_models()``).

    NOTE(review): the text and image encoders are two unrelated checkpoints,
    so their vectors presumably do not live in a shared space -- confirm that
    callers never compare a text embedding against an image embedding.
    """

    # Checkpoint identifiers passed to the respective ``from_pretrained`` /
    # constructor calls.
    TEXT_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
    IMAGE_MODEL_NAME = "openai/clip-vit-base-patch32"

    def __init__(self):
        self.text_encoder = None
        self.image_processor = None
        self.image_encoder = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_models(self):
        """Load (or reload) both the text encoder and the image encoder."""
        self._load_text_encoder()
        self._load_image_encoder()

    def _load_text_encoder(self):
        """Instantiate the sentence-transformers text encoder on ``self.device``."""
        self.text_encoder = SentenceTransformer(
            self.TEXT_MODEL_NAME,
            device=self.device,
        )

    def _load_image_encoder(self):
        """Instantiate the CLIP processor and model used for image features."""
        # Half precision is only used on GPU; on CPU the weights stay in
        # float32 so the forward pass does not depend on fp16 CPU kernels.
        dtype = torch.float16 if self.device == "cuda" else torch.float32
        self.image_processor = AutoProcessor.from_pretrained(self.IMAGE_MODEL_NAME)
        self.image_encoder = AutoModel.from_pretrained(
            self.IMAGE_MODEL_NAME,
            device_map="auto",
            torch_dtype=dtype,
        )

    def embed_text(self, text: str) -> torch.Tensor:
        """Return the text encoder's embedding of ``text`` as a tensor.

        Loads the text encoder on first use.
        """
        # Explicit ``is None``: truthiness of an nn.Module container depends
        # on its length, not on whether it has been created.
        if self.text_encoder is None:
            self._load_text_encoder()
        return self.text_encoder.encode(text, convert_to_tensor=True)

    @torch.no_grad()
    def embed_image(self, image: "Image.Image") -> torch.Tensor:
        """Return the image encoder's features for ``image``.

        Loads the image processor/encoder on first use. The forward pass runs
        without gradient tracking, and the result is moved to the CPU and
        cast to float32 after squeezing dimension 0.
        """
        if self.image_encoder is None or self.image_processor is None:
            self._load_image_encoder()

        model = self.image_encoder
        # Follow the device/dtype the model was actually loaded with, so the
        # inputs always match the weights regardless of where
        # ``device_map="auto"` placed them.
        inputs = self.image_processor(images=image, return_tensors="pt").to(
            device=model.device,
            dtype=model.dtype,
        )

        device_type = model.device.type
        # Autocast is only meaningful for the fp16 GPU path; on CPU it would
        # default to bfloat16, so it is disabled there.
        with torch.autocast(device_type=device_type, enabled=device_type == "cuda"):
            features = model.get_image_features(**inputs)
        return features.squeeze(0).cpu().to(torch.float32)

    def normalize(self, tensor: torch.Tensor) -> torch.Tensor:
        """L2-normalize ``tensor`` along its last dimension.

        An all-zero vector is returned as zeros rather than NaN, because the
        norm is clamped to a small epsilon before dividing.
        """
        return torch.nn.functional.normalize(tensor, p=2, dim=-1)