File size: 3,439 Bytes
ed4f8d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Custom handler for ANCE dense retrieval on HuggingFace Inference Endpoints.

Returns dense embeddings for queries and passages.
"""

import math
from typing import Any, Dict, List, Union

import torch
from transformers import AutoModel, AutoTokenizer


class EndpointHandler:
    """Handler for ANCE embedding generation.

    Wraps an ANCE dense-retrieval model for HuggingFace Inference Endpoints:
    encodes texts into L2-normalized CLS-token embeddings, and can score a
    query against a list of passages by cosine similarity.
    """

    def __init__(self, path: str = ""):
        """Load the tokenizer and model from ``path`` and move to GPU if available.

        Args:
            path: Model directory / hub identifier passed to ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModel.from_pretrained(path)
        self.model.eval()  # inference only: disable dropout etc.

        # Move to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        print(f"ANCE loaded on {self.device}")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process inference requests.

        Accepts:
        - {"inputs": "text"} - single text
        - {"inputs": ["text1", "text2", ...]} - batch of texts
        - {"query": "...", "passages": ["...", ...]} - query + passages

        Returns:
        - {"embeddings": [[...], ...]} for 'inputs' mode
        - {"scores": [...]} for query + passages mode (one cosine score
          per passage)
        - {"error": "..."} when neither request form is supplied
        """
        inputs = data.get("inputs", None)
        query = data.get("query", None)
        passages = data.get("passages", None)

        # Mode 1: Query + Passages -> return similarity scores
        if query is not None and passages is not None:
            query_emb = self._encode([query])[0]
            passage_embs = self._encode(passages)
            # Cosine similarity of the query against each passage.
            scores = [self._cosine_similarity(query_emb, p_emb) for p_emb in passage_embs]
            return {"scores": scores}

        # Mode 2: Just inputs -> return embeddings
        if inputs is None:
            return {"error": "No inputs provided. Use 'inputs' or 'query'+'passages'."}

        if isinstance(inputs, str):
            inputs = [inputs]  # normalize a single text to a batch of one

        embeddings = self._encode(inputs)
        return {"embeddings": embeddings}

    def _encode(self, texts: List[str], max_length: int = 512) -> List[List[float]]:
        """Encode ``texts`` into L2-normalized CLS embeddings.

        Args:
            texts: Batch of input strings.
            max_length: Truncation length in tokens.

        Returns:
            One embedding (list of floats) per input text.
        """
        # Tokenize the whole batch with padding so it forms one tensor.
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(self.device)

        # Get embeddings
        with torch.no_grad():
            outputs = self.model(**encoded)
            # Use CLS token embedding as the sequence representation.
            embeddings = outputs.last_hidden_state[:, 0, :]
            # L2-normalize so that dot product equals cosine similarity.
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings.cpu().tolist()

    def _cosine_similarity(self, emb1: List[float], emb2: List[float]) -> float:
        """Return the cosine similarity of two vectors.

        Returns 0.0 if either vector has zero magnitude (avoids division
        by zero).
        """
        dot = sum(a * b for a, b in zip(emb1, emb2))
        norm1 = math.sqrt(sum(a * a for a in emb1))
        norm2 = math.sqrt(sum(b * b for b in emb2))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)