# Hugging Face Space source (the Space's status page showed "Runtime error" when this was captured).
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel
from sentence_transformers import SentenceTransformer
| class MultiModalEmbedder: | |
| def __init__(self): | |
| self.text_encoder = None | |
| self.image_processor = None | |
| self.image_encoder = None | |
| self.device = "cuda" if torch.cuda.is_available() else "cpu" | |
| def load_models(self): | |
| """Lazy load models with HF acceleration""" | |
| # Text encoder | |
| self.text_encoder = SentenceTransformer( | |
| 'sentence-transformers/all-MiniLM-L6-v2', | |
| device=self.device | |
| ) | |
| # Image encoder | |
| self.image_processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") | |
| self.image_encoder = AutoModel.from_pretrained("openai/clip-vit-base-patch32", | |
| device_map="auto", | |
| torch_dtype=torch.float16) | |
| def embed_text(self, text: str) -> torch.Tensor: | |
| if not self.text_encoder: | |
| self.load_models() | |
| return self.text_encoder.encode(text, convert_to_tensor=True) | |
| def embed_image(self, image: Image.Image) -> torch.Tensor: | |
| if not self.image_encoder: | |
| self.load_models() | |
| inputs = self.image_processor(images=image, return_tensors="pt").to( | |
| device=self.device, | |
| dtype=torch.float16 | |
| ) | |
| with torch.autocast(device_type=self.device): | |
| features = self.image_encoder.get_image_features(**inputs) | |
| return features.squeeze(0).cpu().to(torch.float32) | |
| def normalize(self, tensor: torch.Tensor) -> torch.Tensor: | |
| return tensor / tensor.norm(dim=-1, keepdim=True) |