"""Custom handler for ANCE dense retrieval on HuggingFace Inference Endpoints.

Returns dense embeddings for queries and passages, or query/passage
cosine-similarity scores.
"""

import math
from typing import Any, Dict, List

import torch
from transformers import AutoModel, AutoTokenizer


class EndpointHandler:
    """Handler for ANCE embedding generation."""

    def __init__(self, path: str = ""):
        """Initialize the model and tokenizer.

        Args:
            path: Local path (or model id) passed to ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModel.from_pretrained(path)
        self.model.eval()  # inference only: disable dropout, etc.

        # Move to GPU if available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        print(f"ANCE loaded on {self.device}")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Process inference requests.

        Accepts:
            - ``{"inputs": "text"}`` — single text
            - ``{"inputs": ["text1", "text2", ...]}`` — batch of texts
            - ``{"query": "...", "passages": ["...", ...]}`` — query + passages

        Returns:
            ``{"embeddings": [...]}`` for the ``inputs`` modes,
            ``{"scores": [...]}`` for the query + passages mode, or
            ``{"error": "..."}`` when neither mode is requested.
        """
        inputs = data.get("inputs")
        query = data.get("query")
        passages = data.get("passages")

        # Mode 1: query + passages -> cosine-similarity scores.
        if query is not None and passages is not None:
            query_emb = self._encode([query])[0]
            passage_embs = self._encode(passages)
            scores = [self._cosine_similarity(query_emb, p_emb) for p_emb in passage_embs]
            return {"scores": scores}

        # Mode 2: just inputs -> embeddings.
        if inputs is None:
            return {"error": "No inputs provided. Use 'inputs' or 'query'+'passages'."}
        if isinstance(inputs, str):
            inputs = [inputs]
        embeddings = self._encode(inputs)
        return {"embeddings": embeddings}

    def _encode(self, texts: List[str], max_length: int = 512) -> List[List[float]]:
        """Encode *texts* into L2-normalized embeddings.

        Args:
            texts: Batch of input strings.
            max_length: Truncation length for the tokenizer.

        Returns:
            One embedding (list of floats) per input text.
        """
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**encoded)
            # Use the CLS-token (first position) hidden state as the embedding.
            embeddings = outputs.last_hidden_state[:, 0, :]
            # L2-normalize so that dot product == cosine similarity.
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings.cpu().tolist()

    def _cosine_similarity(self, emb1: List[float], emb2: List[float]) -> float:
        """Compute the cosine similarity between two embedding vectors.

        Returns 0.0 when either vector has zero norm (avoids division by zero).
        """
        dot = sum(a * b for a, b in zip(emb1, emb2))
        norm1 = math.sqrt(sum(a * a for a in emb1))
        norm2 = math.sqrt(sum(b * b for b in emb2))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot / (norm1 * norm2)