Spaces:
Sleeping
Sleeping
| """Semantic vector search engine backed by FAISS. | |
| Expected dataset format (JSON array): | |
| [ | |
| { | |
| "question": "...", | |
| "answer": "...", | |
| "embedding": [0.1, 0.2, ...] | |
| }, | |
| ... | |
| ] | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| from typing import List, Dict, Any | |
| import faiss | |
| import numpy as np | |
DEFAULT_DATASET_PATH = Path("data/stackoverflow_sample_3000.json")


class SemanticSearchEngine:
    """FAISS-based semantic search using cosine similarity via inner product.

    The dataset is read once, at construction time. Every embedding is
    L2-normalized and added to a flat inner-product index, so the score of a
    hit is the cosine similarity between the query and the stored row.

    Attributes:
        dataset_path: Location of the JSON dataset that was loaded.
        metadata: One ``{"question", "answer"}`` dict per dataset row, in the
            same order as the rows of ``embeddings``.
        embeddings: L2-normalized float32 matrix of shape ``[num_rows, dim]``.
        index: ``faiss.IndexFlatIP`` populated with ``embeddings``.
    """

    def __init__(self, dataset_path: str | Path = DEFAULT_DATASET_PATH) -> None:
        """Load the dataset at *dataset_path* and build the search index.

        Raises:
            OSError: If the dataset file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            ValueError: If the dataset is not a non-empty JSON array, or its
                embeddings do not form a 2D matrix.
            KeyError: If a row lacks "question", "answer" or "embedding".
        """
        self.dataset_path = Path(dataset_path)
        self.metadata: List[Dict[str, str]] = []
        self.embeddings: np.ndarray
        self.index: faiss.IndexFlatIP
        self._load_and_build()

    def _load_and_build(self) -> None:
        """Read the JSON dataset and populate metadata, embeddings and index."""
        with self.dataset_path.open("r", encoding="utf-8") as f:
            rows: List[Dict[str, Any]] = json.load(f)

        if not isinstance(rows, list):
            raise ValueError("Dataset must be a JSON array of objects.")
        if not rows:
            raise ValueError("Dataset is empty; expected at least one row.")

        self.metadata = [
            {
                "question": row["question"],
                "answer": row["answer"],
            }
            for row in rows
        ]

        embeddings = np.asarray([row["embedding"] for row in rows], dtype=np.float32)
        # A zero-width matrix is still 2D but cannot back an index.
        if embeddings.ndim != 2 or embeddings.shape[1] == 0:
            raise ValueError("Embeddings must be a 2D matrix [num_rows, dim].")

        self.embeddings = self._normalize(embeddings)
        dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(self.embeddings)

    @staticmethod
    def _normalize(vectors: np.ndarray) -> np.ndarray:
        """L2-normalize vectors for cosine similarity search via inner product.

        Declared as a static method because it takes no ``self`` but is
        invoked as ``self._normalize(...)``; without the decorator that call
        would pass the instance as ``vectors`` and fail with ``TypeError``.

        Args:
            vectors: 2D array-like of shape ``[n, dim]``.

        Returns:
            A new float32 array of the same shape whose rows have unit L2
            norm. All-zero rows are returned unchanged rather than divided
            by zero.
        """
        vectors = np.asarray(vectors, dtype=np.float32)
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Divide zero rows by 1 so they stay zero instead of becoming NaN.
        norms[norms == 0.0] = 1.0
        return (vectors / norms).astype(np.float32, copy=False)

    def search(self, query_embedding: List[float] | np.ndarray, top_k: int = 5) -> List[Dict[str, Any]]:
        """Search nearest neighbors and return question/answer plus similarity score.

        Args:
            query_embedding: A single query vector with the same dimension as
                the indexed embeddings. It is flattened to one row.
            top_k: Maximum number of hits to return; capped at the number of
                rows in the dataset.

        Returns:
            Dicts with keys "question", "answer" and "score", in the order
            the index returns them.

        Raises:
            ValueError: If ``top_k`` is not positive or the query dimension
                differs from the index dimension.
        """
        if top_k <= 0:
            raise ValueError("top_k must be greater than 0.")

        query = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
        if query.shape[1] != self.embeddings.shape[1]:
            raise ValueError(
                f"Query dimension {query.shape[1]} does not match index dimension {self.embeddings.shape[1]}."
            )

        query = self._normalize(query)
        scores, indices = self.index.search(query, min(top_k, len(self.metadata)))

        results: List[Dict[str, Any]] = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads missing neighbors with label -1; as a Python index
            # that would silently alias the last row, so drop those slots.
            if idx < 0:
                continue
            item = self.metadata[int(idx)]
            results.append(
                {
                    "question": item["question"],
                    "answer": item["answer"],
                    "score": float(score),
                }
            )
        return results
# Lazily-built engine for the default dataset, shared by every search() call.
_default_engine: SemanticSearchEngine | None = None


def _get_default_engine() -> SemanticSearchEngine:
    """Return the shared default-dataset engine, building it on first use."""
    global _default_engine
    if _default_engine is None:
        # Assigned only after construction succeeds, so a failed load is
        # retried on the next call instead of caching a broken state.
        _default_engine = SemanticSearchEngine(DEFAULT_DATASET_PATH)
    return _default_engine


def search(query_embedding: List[float] | np.ndarray, top_k: int = 5) -> List[Dict[str, Any]]:
    """Module-level convenience function using the default dataset path.

    The engine for ``DEFAULT_DATASET_PATH`` is created on the first call and
    reused afterwards, so the dataset is not re-read and the index is not
    rebuilt for every query. Changes made to the dataset file after the first
    call are therefore not picked up; construct a ``SemanticSearchEngine``
    directly when a fresh load is required.

    Args:
        query_embedding: A single query vector matching the dataset's
            embedding dimension.
        top_k: Maximum number of hits to return.

    Returns:
        Dicts with keys "question", "answer" and "score".

    Raises:
        ValueError: If ``top_k`` is not positive or the query dimension does
            not match the index.
    """
    return _get_default_engine().search(query_embedding=query_embedding, top_k=top_k)