"""
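EmbeddingAgent - Generates image and text embeddings using a Hugging Face CLIP model
(MobileCLIP is not loaded by this module; the default checkpoint is standard CLIP)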
NEW MODULE - Adds semantic visual understanding
"""

import torch
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel


class EmbeddingAgent:
    """Produce L2-normalized CLIP embeddings for images and text queries.

    Wraps a Hugging Face ``CLIPProcessor`` / ``CLIPModel`` pair. Every vector
    returned by :meth:`encode_image` and :meth:`encode_text` has unit L2 norm,
    so the dot product of two returned vectors is their cosine similarity.
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32") -> None:
        """Load the CLIP processor and model identified by *model_name*.

        No fallback model is attempted: if loading fails, the error is
        printed and the original exception is re-raised to the caller.

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                processor and the model.

        Raises:
            Exception: Whatever ``from_pretrained`` raised while loading.
        """
        print("[EmbeddingAgent] Loading CLIP model...")

        try:
            self.processor = CLIPProcessor.from_pretrained(model_name)
            self.model = CLIPModel.from_pretrained(model_name)
            # Inference only: switch off dropout and other train-time behavior.
            self.model.eval()
            print(f"[EmbeddingAgent] Loaded: {model_name}")
        except Exception as e:
            print(f"[EmbeddingAgent] Error loading model: {e}")
            raise

    @staticmethod
    def _frame_to_pil_rgb(frame_bgr):
        """Convert an OpenCV-style frame into an RGB ``PIL.Image``."""
        if frame_bgr is None:
            # OpenCV readers signal a failed read with None; fail with a
            # clear message instead of "NoneType is not subscriptable".
            raise TypeError("frame_bgr must be a numpy array, got None")

        frame = np.asarray(frame_bgr)

        # A trailing single-channel axis is just grayscale in disguise.
        if frame.ndim == 3 and frame.shape[2] == 1:
            frame = frame[:, :, 0]

        if frame.ndim == 2:
            # Grayscale: let PIL replicate the single channel into RGB.
            return Image.fromarray(np.ascontiguousarray(frame)).convert("RGB")

        # ``2::-1`` selects channels 2, 1, 0 -> R, G, B. For 3-channel input
        # this equals a plain reversal; for BGRA it also drops alpha rather
        # than reversing it into the colour channels.
        frame_rgb = np.ascontiguousarray(frame[:, :, 2::-1])
        return Image.fromarray(frame_rgb)

    def encode_image(self, frame_bgr) -> np.ndarray:
        """Generate an L2-normalized embedding vector from an image frame.

        Args:
            frame_bgr: OpenCV-style image as a numpy array. BGR (H, W, 3) is
                the expected layout; BGRA (H, W, 4) has its alpha channel
                dropped, and grayscale (H, W) or (H, W, 1) is expanded to RGB.

        Returns:
            numpy array: flattened embedding. For a single frame its length
            is the model's embedding width (512 for the default model).

        Raises:
            TypeError: If ``frame_bgr`` is ``None``.
        """
        image = self._frame_to_pil_rgb(frame_bgr)

        # Process and encode
        inputs = self.processor(images=image, return_tensors="pt")

        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
            # Unit-normalize along the feature axis.
            embedding = torch.nn.functional.normalize(image_features, p=2, dim=-1)

        return embedding.cpu().numpy().flatten()

    def encode_text(self, text) -> np.ndarray:
        """Generate an L2-normalized embedding vector from a text query.

        Useful for text-to-image search against vectors produced by
        :meth:`encode_image`.

        Args:
            text: Query string. Queries that tokenize to more tokens than the
                tokenizer's maximum length are truncated rather than being
                passed to the model at full length.

        Returns:
            numpy array: flattened embedding. For a single query its length
            is the model's embedding width (512 for the default model).
        """
        # truncation=True clips over-long queries to the tokenizer's maximum
        # length; without it the text encoder receives more positions than
        # it supports and fails.
        inputs = self.processor(
            text=[text],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
            embedding = torch.nn.functional.normalize(text_features, p=2, dim=-1)

        return embedding.cpu().numpy().flatten()