Spaces:

NanG01
/

visionq

Running

App Files Files Community

visionq / agents /embedding_agent.py

NanG01

fixes

57327cd 2 months ago

raw

history blame contribute delete

2.28 kB

	"""
	EmbeddingAgent - Generates image and text embeddings using a Hugging Face CLIP model
	NEW MODULE - Adds semantic visual understanding
	"""

	import torch
	import numpy as np
	from PIL import Image
	from transformers import CLIPProcessor, CLIPModel


	class EmbeddingAgent:
	    """
	    Embed images and text queries with a Hugging Face CLIP checkpoint.

	    Both encoders return L2-normalized vectors, so the dot product of an
	    image embedding and a text embedding equals their cosine similarity.
	    """

	    def __init__(self, model_name="openai/clip-vit-base-patch32"):
	        """
	        Load the CLIP processor and model for ``model_name``.

	        Args:
	            model_name: Checkpoint name or path handed to ``from_pretrained``.

	        Raises:
	            Exception: Any loading failure is printed and then re-raised
	                unchanged; no alternative model is attempted.
	        """
	        print("[EmbeddingAgent] Loading CLIP model...")

	        try:
	            self.processor = CLIPProcessor.from_pretrained(model_name)
	            self.model = CLIPModel.from_pretrained(model_name)
	            # The agent only runs inference, so put the module in eval mode.
	            self.model.eval()
	            print(f"[EmbeddingAgent] Loaded: {model_name}")
	        except Exception as e:
	            print(f"[EmbeddingAgent] Error loading model: {e}")
	            raise

	    @staticmethod
	    def _to_unit_vector(features):
	        """Return ``features`` L2-normalized along the last axis as a flat numpy array."""
	        unit = torch.nn.functional.normalize(features, p=2, dim=-1)
	        return unit.cpu().numpy().flatten()

	    def encode_image(self, frame_bgr):
	        """
	        Generate embedding vector from BGR image frame

	        Args:
	            frame_bgr: OpenCV BGR image (numpy array) indexed as
	                (row, column, channel)

	        Returns:
	            numpy array: flattened, L2-normalized embedding vector
	                (512-dim for the default checkpoint)
	        """
	        # Reverse the channel axis: BGR -> RGB, which is what PIL expects.
	        frame_rgb = frame_bgr[:, :, ::-1]
	        image = Image.fromarray(frame_rgb)

	        # Resize/normalize the image into model-ready tensors.
	        inputs = self.processor(images=image, return_tensors="pt")

	        # No gradients are needed for inference.
	        with torch.no_grad():
	            image_features = self.model.get_image_features(**inputs)
	            return self._to_unit_vector(image_features)

	    def encode_text(self, text):
	        """
	        Generate embedding vector from text query
	        Useful for text-to-image search

	        Args:
	            text: Query string

	        Returns:
	            numpy array: flattened, L2-normalized embedding vector
	                (512-dim for the default checkpoint)
	        """
	        # truncation=True clips the query to the tokenizer's maximum length.
	        # CLIP's text encoder has a fixed context size, and an over-long
	        # query would otherwise fail inside the model instead of producing
	        # an embedding.
	        inputs = self.processor(
	            text=[text],
	            return_tensors="pt",
	            padding=True,
	            truncation=True,
	        )

	        # No gradients are needed for inference.
	        with torch.no_grad():
	            text_features = self.model.get_text_features(**inputs)
	            return self._to_unit_vector(text_features)