""" EmbeddingAgent - Generates image embeddings using MobileCLIP NEW MODULE - Adds semantic visual understanding """ import torch import numpy as np from PIL import Image from transformers import CLIPProcessor, CLIPModel class EmbeddingAgent: def __init__(self, model_name="openai/clip-vit-base-patch32"): """ Initialize MobileCLIP for image embeddings Falls back to standard CLIP if MobileCLIP unavailable """ print("[EmbeddingAgent] Loading CLIP model...") try: self.processor = CLIPProcessor.from_pretrained(model_name) self.model = CLIPModel.from_pretrained(model_name) self.model.eval() print(f"[EmbeddingAgent] Loaded: {model_name}") except Exception as e: print(f"[EmbeddingAgent] Error loading model: {e}") raise def encode_image(self, frame_bgr): """ Generate embedding vector from BGR image frame Args: frame_bgr: OpenCV BGR image (numpy array) Returns: numpy array: 512-dim embedding vector """ # Convert BGR to RGB frame_rgb = frame_bgr[:, :, ::-1] image = Image.fromarray(frame_rgb) # Process and encode inputs = self.processor(images=image, return_tensors="pt") with torch.no_grad(): image_features = self.model.get_image_features(**inputs) # Normalize embedding using torch.nn.functional embedding = torch.nn.functional.normalize(image_features, p=2, dim=-1) return embedding.cpu().numpy().flatten() def encode_text(self, text): """ Generate embedding vector from text query Useful for text-to-image search Args: text: Query string Returns: numpy array: 512-dim embedding vector """ inputs = self.processor(text=[text], return_tensors="pt", padding=True) with torch.no_grad(): text_features = self.model.get_text_features(**inputs) embedding = torch.nn.functional.normalize(text_features, p=2, dim=-1) return embedding.cpu().numpy().flatten()