visionq / agents /embedding_agent.py
NanG01's picture
fixes
57327cd
"""
EmbeddingAgent - Generates image embeddings using MobileCLIP
NEW MODULE - Adds semantic visual understanding
"""
import torch
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
class EmbeddingAgent:
    """
    Embeds images and text queries with a Hugging Face CLIP checkpoint.

    Both encoders return L2-normalized vectors, so the dot product of an
    image embedding and a text embedding equals their cosine similarity.
    """

    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        """
        Load the CLIP processor and model, and switch the model to eval mode.

        The checkpoint named by ``model_name`` is loaded through
        ``CLIPProcessor`` / ``CLIPModel``. No MobileCLIP-specific loading and
        no fallback is implemented: a load failure is printed and re-raised.

        Args:
            model_name: Checkpoint id or local path given to ``from_pretrained``
        Raises:
            Exception: Whatever ``from_pretrained`` raised, propagated unchanged
        """
        print("[EmbeddingAgent] Loading CLIP model...")
        try:
            self.processor = CLIPProcessor.from_pretrained(model_name)
            self.model = CLIPModel.from_pretrained(model_name)
            self.model.eval()
            print(f"[EmbeddingAgent] Loaded: {model_name}")
        except Exception as e:
            print(f"[EmbeddingAgent] Error loading model: {e}")
            raise

    @staticmethod
    def _to_unit_vector(features):
        """L2-normalize along the last dim and return a flat CPU numpy array."""
        unit = torch.nn.functional.normalize(features, p=2, dim=-1)
        return unit.cpu().numpy().flatten()

    def encode_image(self, frame_bgr):
        """
        Generate an embedding vector from a BGR image frame.

        Args:
            frame_bgr: OpenCV image (numpy array) shaped (H, W, 3) in BGR
                order, or (H, W, 4) in BGRA order (alpha is discarded)
        Returns:
            numpy array: 1-D L2-normalized embedding; its length is the
            checkpoint's projection size (documented as 512 for the default)
        """
        # Take channels 2, 1, 0 explicitly instead of reversing every channel:
        # identical for 3-channel BGR, and for 4-channel BGRA it drops alpha
        # rather than producing a mis-ordered ARGB image.
        frame_rgb = frame_bgr[:, :, 2::-1]
        image = Image.fromarray(frame_rgb)

        # Process and encode
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)

        return self._to_unit_vector(image_features)

    def encode_text(self, text):
        """
        Generate an embedding vector from a text query.

        Useful for text-to-image search against vectors from ``encode_image``.

        Args:
            text: Query string
        Returns:
            numpy array: 1-D L2-normalized embedding with the same length as
            the image embeddings
        """
        # truncation=True keeps over-long queries within the tokenizer's
        # maximum length; without it the text encoder fails on inputs longer
        # than its position-embedding table.
        inputs = self.processor(
            text=[text],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)

        return self._to_unit_vector(text_features)