Spaces:
Sleeping
Sleeping
| """ | |
| Minimal vector search implementation for HuggingFace deployment | |
| """ | |
| import json | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import List, Dict, Tuple | |
class SimpleVectorSearch:
    """Simple in-memory vector search using cosine similarity.

    On construction, every ``*.json`` file in ``data_dir`` is read. Each file
    is expected to contain a list of items with a ``content`` value, an
    ``embedding`` vector and an optional ``metadata`` dict.

    Attributes:
        data_dir: Directory the embedding files are read from.
        documents: Loaded documents, each a dict with ``content`` and
            ``metadata`` keys; ``documents[i]`` corresponds to row ``i`` of
            ``embeddings``.
        embeddings: NumPy array built from the loaded embedding vectors, or
            ``None`` when nothing was loaded.
    """

    def __init__(self, data_dir: str = "py/backend/data/embeddings"):
        """Create the index and load all embedding files from ``data_dir``."""
        self.data_dir = Path(data_dir)
        self.documents = []
        self.embeddings = None
        self._load_embeddings()

    def _load_embeddings(self):
        """Load all embedding files into ``documents`` and ``embeddings``.

        Loading is best-effort: when a file cannot be read, or an item in it
        is malformed, the error is printed and the remainder of that file is
        skipped. Items accepted before the error are kept.
        """
        all_docs = []
        all_embeddings = []

        # Sorted so that document order (and therefore tie-breaking between
        # equally scored results) does not depend on filesystem order.
        for json_file in sorted(self.data_dir.glob("*.json")):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                for item in data:
                    # Read both required fields before appending anything, so
                    # a malformed item can never leave the document list and
                    # the embedding list with different lengths.
                    content = item['content']
                    embedding = item['embedding']
                    all_docs.append({
                        'content': content,
                        'metadata': item.get('metadata', {}),
                    })
                    all_embeddings.append(embedding)
            except Exception as e:
                print(f"Error loading {json_file}: {e}")

        self.documents = all_docs
        self.embeddings = np.array(all_embeddings) if all_embeddings else None

    def search(self, query_embedding: List[float], k: int = 3) -> List[Dict]:
        """Return the ``k`` documents most similar to ``query_embedding``.

        Args:
            query_embedding: Query vector; it must have the same length as
                the stored embedding vectors.
            k: Maximum number of results. Values of zero or less yield an
                empty list; values above the number of documents are clamped.

        Returns:
            A list of dicts with ``content``, ``metadata`` and ``score``
            (cosine similarity as a float), ordered by descending score.
        """
        if self.embeddings is None or len(self.embeddings) == 0:
            return []
        # Guard explicitly: ``[-0:]`` would select every row, and a negative
        # ``k`` would select an arbitrary slice.
        if k <= 0:
            return []

        # Convert query to numpy array
        query_vec = np.array(query_embedding)

        # Compute cosine similarity; the epsilon avoids division by zero for
        # all-zero vectors.
        query_norm = query_vec / (np.linalg.norm(query_vec) + 1e-10)
        row_norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
        embeddings_norm = self.embeddings / (row_norms + 1e-10)
        similarities = np.dot(embeddings_norm, query_norm)

        # Get top k indices, highest similarity first
        k = min(k, len(similarities))
        top_indices = np.argsort(similarities)[-k:][::-1]

        # Return documents with scores
        return [
            {
                'content': self.documents[idx]['content'],
                'metadata': self.documents[idx]['metadata'],
                'score': float(similarities[idx]),
            }
            for idx in top_indices
        ]