Spaces:
Running
Running
| from transformers import CLIPProcessor, CLIPModel | |
| from PIL import Image | |
| import torch | |
| import io | |
| class ImageProcessor: | |
| def __init__(self): | |
| # Initialize CLIP | |
| self.device = "cuda" if torch.cuda.is_available() else "cpu" | |
| self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device) | |
| self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") | |
| def get_embedding(self, image: Image.Image): | |
| inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device) | |
| with torch.no_grad(): | |
| outputs = self.clip_model.get_image_features(**inputs) | |
| # Robustly handle different CLIP output formats | |
| if hasattr(outputs, "image_embeds"): | |
| image_features = outputs.image_embeds | |
| elif hasattr(outputs, "pooler_output"): | |
| image_features = outputs.pooler_output | |
| elif isinstance(outputs, (list, tuple)): | |
| image_features = outputs[0] | |
| else: | |
| image_features = outputs | |
| # Final check: must be a tensor | |
| if not isinstance(image_features, torch.Tensor): | |
| try: | |
| image_features = outputs[0] | |
| except: | |
| raise Exception(f"Failed to extract tensor from {type(outputs)}") | |
| # Normalize | |
| image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True) | |
| return image_features.cpu().numpy()[0].tolist() | |
| def get_text_embedding(self, text: str): | |
| """ | |
| Generate embedding for text query. | |
| """ | |
| inputs = self.clip_processor(text=text, return_tensors="pt", padding=True, truncation=True).to(self.device) | |
| with torch.no_grad(): | |
| outputs = self.clip_model.get_text_features(**inputs) | |
| # Robustly handle different CLIP output formats | |
| if hasattr(outputs, "text_embeds"): | |
| text_features = outputs.text_embeds | |
| elif hasattr(outputs, "pooler_output"): | |
| text_features = outputs.pooler_output | |
| elif isinstance(outputs, (list, tuple)): | |
| text_features = outputs[0] | |
| else: | |
| text_features = outputs | |
| # Final check: must be a tensor | |
| if not isinstance(text_features, torch.Tensor): | |
| try: | |
| text_features = outputs[0] | |
| except: | |
| raise Exception(f"Failed to extract tensor from {type(outputs)}") | |
| # Normalize | |
| text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True) | |
| return text_features.cpu().numpy()[0].tolist() | |