from contextlib import nullcontext

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel
from sentence_transformers import SentenceTransformer


class MultiModalEmbedder:
    """Embed text and images with lazily loaded pretrained encoders.

    Text goes through a ``SentenceTransformer`` and images through the
    ``get_image_features`` head of a CLIP checkpoint. Each encoder is loaded
    the first time it is needed (or eagerly via :meth:`load_models`).

    NOTE(review): the text and image encoders are two unrelated checkpoints,
    so their embeddings are presumably not comparable with each other —
    confirm before computing cross-modal similarities.
    """

    TEXT_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
    IMAGE_MODEL_NAME = "openai/clip-vit-base-patch32"

    def __init__(self) -> None:
        """Pick the compute device; no model is loaded here."""
        self.text_encoder = None
        self.image_processor = None
        self.image_encoder = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def _on_cuda(self) -> bool:
        """Return True when ``self.device`` names a CUDA device."""
        return torch.device(self.device).type == "cuda"

    def _image_dtype(self) -> torch.dtype:
        """Return the dtype used for the image model and its inputs.

        Half precision is only used on CUDA; on other devices the model stays
        in float32 because float16 kernels are not reliably available there.
        """
        return torch.float16 if self._on_cuda() else torch.float32

    def _load_text_encoder(self) -> None:
        """Load the sentence-transformers text encoder if it is missing."""
        if self.text_encoder is None:
            self.text_encoder = SentenceTransformer(
                self.TEXT_MODEL_NAME,
                device=self.device,
            )

    def _load_image_encoder(self) -> None:
        """Load the CLIP processor and model if they are missing."""
        if self.image_processor is None:
            self.image_processor = AutoProcessor.from_pretrained(
                self.IMAGE_MODEL_NAME
            )
        if self.image_encoder is None:
            model = AutoModel.from_pretrained(
                self.IMAGE_MODEL_NAME,
                torch_dtype=self._image_dtype(),
            )
            # Place the model explicitly on the same device the inputs are
            # sent to, instead of letting an automatic device map decide.
            self.image_encoder = model.to(self.device).eval()

    def load_models(self) -> None:
        """Load every encoder that has not been loaded yet.

        Safe to call repeatedly: encoders that are already present are kept.
        """
        self._load_text_encoder()
        self._load_image_encoder()

    def embed_text(self, text: str) -> torch.Tensor:
        """Return the text encoder's embedding of ``text`` as a tensor.

        The text encoder is loaded on first use. The call is delegated to the
        encoder's ``encode`` with ``convert_to_tensor=True``.

        NOTE(review): unlike :meth:`embed_image`, the result is not moved to
        the CPU or cast here — confirm callers do not mix the two blindly.
        """
        if self.text_encoder is None:
            self._load_text_encoder()
        return self.text_encoder.encode(text, convert_to_tensor=True)

    def embed_image(self, image: Image.Image) -> torch.Tensor:
        """Return the image features of ``image`` as a float32 CPU tensor.

        The image encoder is loaded on first use. The forward pass runs
        without gradient tracking, and a leading dimension of size one is
        squeezed away before the result is returned.
        """
        if self.image_encoder is None or self.image_processor is None:
            self._load_image_encoder()

        inputs = self.image_processor(images=image, return_tensors="pt").to(
            device=self.device, dtype=self._image_dtype()
        )

        # Autocast is only entered on CUDA; elsewhere the model already runs
        # in float32 and needs no mixed-precision context.
        if self._on_cuda():
            precision_ctx = torch.autocast(device_type="cuda")
        else:
            precision_ctx = nullcontext()

        # Inference only: avoid building an autograd graph.
        with torch.no_grad(), precision_ctx:
            features = self.image_encoder.get_image_features(**inputs)

        return features.squeeze(0).cpu().to(torch.float32)

    def normalize(self, tensor: torch.Tensor) -> torch.Tensor:
        """Scale ``tensor`` to unit L2 norm along its last dimension.

        The norm is clamped to the smallest positive normal value of its
        dtype, so an all-zero vector maps to zeros instead of NaN.
        """
        norms = tensor.norm(dim=-1, keepdim=True)
        return tensor / norms.clamp_min(torch.finfo(norms.dtype).tiny)