SemanticSearchApp / src /search_engine.py
iqra15's picture
Upload 5 files
82af8ab verified
Raw
History Blame Contribute Delete
3.56 kB
"""Semantic vector search engine backed by FAISS.
Expected dataset format (JSON array):
[
{
"question": "...",
"answer": "...",
"embeddings": [0.1, 0.2, ...]
},
...
]
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import List, Dict, Any
import faiss
import numpy as np
DEFAULT_DATASET_PATH = Path("data/stackoverflow_sample_3000.json")
class SemanticSearchEngine:
"""FAISS-based semantic search using cosine similarity via inner product."""
def __init__(self, dataset_path: str | Path = DEFAULT_DATASET_PATH) -> None:
self.dataset_path = Path(dataset_path)
self.metadata: List[Dict[str, str]] = []
self.embeddings: np.ndarray
self.index: faiss.IndexFlatIP
self._load_and_build()
def _load_and_build(self) -> None:
with self.dataset_path.open("r", encoding="utf-8") as f:
rows: List[Dict[str, Any]] = json.load(f)
if not isinstance(rows, list):
raise ValueError("Dataset must be a JSON array of objects.")
if not rows:
raise ValueError("Dataset is empty; expected at least one row.")
self.metadata = [
{
"question": row["question"],
"answer": row["answer"],
}
for row in rows
]
embeddings = np.asarray([row["embedding"] for row in rows], dtype=np.float32)
if embeddings.ndim != 2:
raise ValueError("Embeddings must be a 2D matrix [num_rows, dim].")
self.embeddings = self._normalize(embeddings)
dim = self.embeddings.shape[1]
self.index = faiss.IndexFlatIP(dim)
self.index.add(self.embeddings)
@staticmethod
def _normalize(vectors: np.ndarray) -> np.ndarray:
"""L2-normalize vectors for cosine similarity search via inner product."""
vectors = np.asarray(vectors, dtype=np.float32)
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
norms = np.where(norms == 0.0, 1.0, norms)
return vectors / norms
def search(self, query_embedding: List[float] | np.ndarray, top_k: int = 5) -> List[Dict[str, Any]]:
"""Search nearest neighbors and return question/answer plus similarity score."""
if top_k <= 0:
raise ValueError("top_k must be greater than 0.")
query = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
if query.shape[1] != self.embeddings.shape[1]:
raise ValueError(
f"Query dimension {query.shape[1]} does not match index dimension {self.embeddings.shape[1]}."
)
query = self._normalize(query)
scores, indices = self.index.search(query, min(top_k, len(self.metadata)))
results: List[Dict[str, Any]] = []
for score, idx in zip(scores[0], indices[0]):
item = self.metadata[int(idx)]
results.append(
{
"question": item["question"],
"answer": item["answer"],
"score": float(score),
}
)
return results
def search(query_embedding: List[float] | np.ndarray, top_k: int = 5) -> List[Dict[str, Any]]:
"""Module-level convenience function using the default dataset path."""
engine = SemanticSearchEngine(DEFAULT_DATASET_PATH)
return engine.search(query_embedding=query_embedding, top_k=top_k)