Spaces:

iqra15
/

SemanticSearchApp

Sleeping

App Files Files Community

SemanticSearchApp / src /search_engine.py

iqra15

Upload 5 files

82af8ab verified 3 months ago

Raw

History Blame Contribute Delete

3.56 kB

	"""Semantic vector search engine backed by FAISS.

	Expected dataset format (JSON array):
	[
	    {
	        "question": "...",
	        "answer": "...",
	        "embedding": [0.1, 0.2, ...]
	    },
	    ...
	]
	"""

	from __future__ import annotations

	import json
	from pathlib import Path
	from typing import List, Dict, Any

	import faiss
	import numpy as np


	# Dataset used when SemanticSearchEngine is constructed without an explicit
	# path, and by the module-level search() helper. The path is relative, so it
	# is resolved against the process's current working directory.
	DEFAULT_DATASET_PATH = Path("data/stackoverflow_sample_3000.json")


	class SemanticSearchEngine:
	    """FAISS-based semantic search using cosine similarity via inner product.

	    The dataset is read once, at construction time. Every stored vector is
	    L2-normalized before being added to a ``faiss.IndexFlatIP`` index, and
	    queries are normalized the same way, so the inner-product scores
	    returned by :meth:`search` are cosine similarities.

	    Attributes:
	        dataset_path: Location of the JSON dataset that was loaded.
	        metadata: One ``{"question", "answer"}`` dict per dataset row, in
	            the same order as the rows of ``embeddings``.
	        embeddings: Normalized ``float32`` matrix of shape ``[num_rows, dim]``.
	        index: The FAISS inner-product index built from ``embeddings``.
	    """

	    # Keys under which a row may store its vector. The loader historically
	    # read "embedding", while the module docstring documented "embeddings";
	    # both are accepted, and the first one present in a row wins.
	    _EMBEDDING_KEYS = ("embedding", "embeddings")

	    def __init__(self, dataset_path: str | Path = DEFAULT_DATASET_PATH) -> None:
	        """Load the dataset at ``dataset_path`` and build the search index.

	        Args:
	            dataset_path: Path to a JSON array of question/answer/embedding
	                objects.

	        Raises:
	            OSError: If the dataset file cannot be opened.
	            ValueError: If the dataset is malformed (see ``_load_and_build``).
	            KeyError: If a row is missing a required key.
	        """
	        self.dataset_path = Path(dataset_path)
	        self.metadata: List[Dict[str, str]] = []
	        self.embeddings: np.ndarray
	        self.index: faiss.IndexFlatIP
	        self._load_and_build()

	    def _load_and_build(self) -> None:
	        """Read the JSON dataset and populate metadata, embeddings and index.

	        Raises:
	            ValueError: If the file is not a non-empty JSON array, or if the
	                row vectors do not form a 2D ``[num_rows, dim]`` matrix.
	            KeyError: If a row lacks "question", "answer", or an embedding.
	        """
	        with self.dataset_path.open("r", encoding="utf-8") as f:
	            rows: List[Dict[str, Any]] = json.load(f)

	        if not isinstance(rows, list):
	            raise ValueError("Dataset must be a JSON array of objects.")
	        if not rows:
	            raise ValueError("Dataset is empty; expected at least one row.")

	        self.metadata = [
	            {"question": row["question"], "answer": row["answer"]}
	            for row in rows
	        ]

	        embeddings = np.asarray(
	            [self._row_embedding(row) for row in rows],
	            dtype=np.float32,
	        )
	        if embeddings.ndim != 2:
	            raise ValueError("Embeddings must be a 2D matrix [num_rows, dim].")

	        self.embeddings = self._normalize(embeddings)

	        dim = self.embeddings.shape[1]
	        self.index = faiss.IndexFlatIP(dim)
	        self.index.add(self.embeddings)

	    @classmethod
	    def _row_embedding(cls, row: Dict[str, Any]) -> Any:
	        """Return the vector stored in ``row`` under any accepted key.

	        Raises:
	            KeyError: If the row has none of the accepted embedding keys.
	        """
	        for key in cls._EMBEDDING_KEYS:
	            if key in row:
	                return row[key]
	        # Report the primary key name so the error matches the loader's
	        # original behaviour for rows without any vector.
	        raise KeyError(cls._EMBEDDING_KEYS[0])

	    @staticmethod
	    def _normalize(vectors: np.ndarray) -> np.ndarray:
	        """L2-normalize each row so inner product equals cosine similarity.

	        Args:
	            vectors: 2D array-like with one vector per row.

	        Returns:
	            A new ``float32`` array of the same shape. All-zero rows are
	            returned unchanged instead of being divided by zero.
	        """
	        vectors = np.asarray(vectors, dtype=np.float32)
	        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
	        # Substitute 1.0 for zero norms so zero vectors stay zero (no NaNs).
	        norms = np.where(norms == 0.0, 1.0, norms)
	        return vectors / norms

	    def search(
	        self,
	        query_embedding: List[float] | np.ndarray,
	        top_k: int = 5,
	    ) -> List[Dict[str, Any]]:
	        """Search nearest neighbors and return question/answer plus score.

	        Args:
	            query_embedding: A single query vector whose length equals the
	                index dimension.
	            top_k: Maximum number of results; capped at the dataset size.

	        Returns:
	            Dicts with "question", "answer" and "score" (cosine similarity),
	            in the order FAISS returns them.

	        Raises:
	            ValueError: If ``top_k`` is not positive or the query dimension
	                does not match the index dimension.
	        """
	        if top_k <= 0:
	            raise ValueError("top_k must be greater than 0.")

	        # The input is flattened into one row: exactly one query per call.
	        query = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
	        if query.shape[1] != self.embeddings.shape[1]:
	            raise ValueError(
	                f"Query dimension {query.shape[1]} does not match index dimension {self.embeddings.shape[1]}."
	            )

	        query = self._normalize(query)
	        scores, indices = self.index.search(query, min(top_k, len(self.metadata)))

	        results: List[Dict[str, Any]] = []
	        for score, idx in zip(scores[0], indices[0]):
	            if idx < 0:
	                # FAISS marks "no neighbour for this slot" with label -1.
	                # Indexing metadata with -1 would silently return the last
	                # row, so such slots are skipped.
	                continue
	            item = self.metadata[int(idx)]
	            results.append(
	                {
	                    "question": item["question"],
	                    "answer": item["answer"],
	                    "score": float(score),
	                }
	            )
	        return results


	# Lazily-built engine for the default dataset, shared by every search() call
	# so the JSON file is parsed and the FAISS index built only once per process.
	_DEFAULT_ENGINE = None


	def search(query_embedding: List[float] | np.ndarray, top_k: int = 5) -> List[Dict[str, Any]]:
	    """Module-level convenience function using the default dataset path.

	    The engine for ``DEFAULT_DATASET_PATH`` is created on the first call and
	    reused afterwards; changes made to the dataset file after that first call
	    are not picked up until the process restarts.

	    Args:
	        query_embedding: A single query vector matching the index dimension.
	        top_k: Maximum number of results to return.

	    Returns:
	        The result list produced by ``SemanticSearchEngine.search``.

	    Raises:
	        ValueError: Propagated from ``SemanticSearchEngine.search`` or from
	            loading a malformed dataset on the first call.
	        OSError: If the default dataset file cannot be opened on first use.
	    """
	    global _DEFAULT_ENGINE
	    if _DEFAULT_ENGINE is None:
	        _DEFAULT_ENGINE = SemanticSearchEngine(DEFAULT_DATASET_PATH)
	    return _DEFAULT_ENGINE.search(query_embedding=query_embedding, top_k=top_k)