""" Minimal vector search implementation for HuggingFace deployment """ import json import numpy as np from pathlib import Path from typing import List, Dict, Tuple class SimpleVectorSearch: """Simple in-memory vector search""" def __init__(self, data_dir: str = "py/backend/data/embeddings"): self.data_dir = Path(data_dir) self.documents = [] self.embeddings = None self._load_embeddings() def _load_embeddings(self): """Load all embedding files""" all_docs = [] all_embeddings = [] # Load all JSON files for json_file in self.data_dir.glob("*.json"): try: with open(json_file, 'r') as f: data = json.load(f) for item in data: all_docs.append({ 'content': item['content'], 'metadata': item.get('metadata', {}) }) all_embeddings.append(item['embedding']) except Exception as e: print(f"Error loading {json_file}: {e}") self.documents = all_docs self.embeddings = np.array(all_embeddings) if all_embeddings else None def search(self, query_embedding: List[float], k: int = 3) -> List[Dict]: """Search for similar documents""" if self.embeddings is None or len(self.embeddings) == 0: return [] # Convert query to numpy array query_vec = np.array(query_embedding) # Compute cosine similarity query_norm = query_vec / (np.linalg.norm(query_vec) + 1e-10) embeddings_norm = self.embeddings / (np.linalg.norm(self.embeddings, axis=1, keepdims=True) + 1e-10) similarities = np.dot(embeddings_norm, query_norm) # Get top k indices top_indices = np.argsort(similarities)[-k:][::-1] # Return documents with scores results = [] for idx in top_indices: results.append({ 'content': self.documents[idx]['content'], 'metadata': self.documents[idx]['metadata'], 'score': float(similarities[idx]) }) return results