| """ |
| EmbeddingAgent - Generates image and text embeddings using a Hugging Face CLIP model (default: openai/clip-vit-base-patch32) |
| NEW MODULE - Adds semantic visual understanding |
| """ |
|
|
| import torch |
| import numpy as np |
| from PIL import Image |
| from transformers import CLIPProcessor, CLIPModel |
|
|
|
|
class EmbeddingAgent:
    """Produce L2-normalised CLIP embeddings for images and text queries.

    Image and text vectors come from the same CLIP model, so they can be
    compared directly (e.g. dot product) for text-to-image search.
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        """Load the CLIP processor and model identified by ``model_name``.

        The checkpoint is loaded through ``CLIPProcessor.from_pretrained`` and
        ``CLIPModel.from_pretrained``; there is no fallback checkpoint. The
        model is switched to eval mode after loading.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.

        Raises:
            Exception: Whatever the loaders raise; the error is printed and
                then re-raised unchanged.
        """
        print("[EmbeddingAgent] Loading CLIP model...")

        try:
            self.processor = CLIPProcessor.from_pretrained(model_name)
            self.model = CLIPModel.from_pretrained(model_name)
            self.model.eval()
        except Exception as e:
            print(f"[EmbeddingAgent] Error loading model: {e}")
            raise
        else:
            print(f"[EmbeddingAgent] Loaded: {model_name}")

    def encode_image(self, frame_bgr: np.ndarray) -> np.ndarray:
        """Generate an embedding vector from an OpenCV image frame.

        Args:
            frame_bgr: OpenCV image as a numpy array. Expected layout is
                (H, W, 3) BGR; (H, W, 4) BGRA frames have their alpha channel
                dropped and 2-D grayscale frames are expanded to RGB.

        Returns:
            1-D numpy array holding the L2-normalised image embedding
            (512 values for the default checkpoint).

        Raises:
            TypeError: If ``frame_bgr`` is None (e.g. a failed OpenCV read).
        """
        if frame_bgr is None:
            raise TypeError(
                "frame_bgr is None; expected an OpenCV image array "
                "(did the frame fail to load?)"
            )

        frame = np.asarray(frame_bgr)
        if frame.ndim == 2:
            # Grayscale frame: replicate the single channel into RGB.
            image = Image.fromarray(frame).convert("RGB")
        else:
            # ``2::-1`` picks channels 2, 1, 0 -> R, G, B. For 3-channel BGR
            # this equals a plain reversal; for 4-channel BGRA it also drops
            # alpha instead of producing a mis-ordered ARGB image.
            image = Image.fromarray(frame[:, :, 2::-1])

        inputs = self.processor(images=image, return_tensors="pt")

        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
            embedding = torch.nn.functional.normalize(image_features, p=2, dim=-1)

        return embedding.cpu().numpy().flatten()

    def encode_text(self, text: str) -> np.ndarray:
        """Generate an embedding vector from a text query.

        Useful for text-to-image search against vectors from ``encode_image``.

        Args:
            text: Query string. Queries longer than the tokenizer's maximum
                length (77 tokens for standard CLIP checkpoints) are truncated.

        Returns:
            1-D numpy array holding the L2-normalised text embedding
            (512 values for the default checkpoint).
        """
        # truncation=True keeps over-long queries within the text encoder's
        # fixed context window instead of failing inside the model.
        inputs = self.processor(
            text=[text],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
            embedding = torch.nn.functional.normalize(text_features, p=2, dim=-1)

        return embedding.cpu().numpy().flatten()
|
|