Spaces:

Utkarsh430
/

shlaiagent

Build error

App Files Files Community

shlaiagent / retrieval.py

Utkarsh430

The app

4fe04aa verified about 2 months ago

Raw

History Blame Contribute Delete

6.37 kB

	"""
	retrieval.py — Builds and queries a lightweight vector index over the SHL catalog.

	Architecture decision: TF-IDF + cosine similarity via scikit-learn.
	Why not sentence-transformers?
	- sentence-transformers requires torch (~2 GB download), which is hostile to a cold
	HF Space boot with a 512 MB RAM limit on free tier.
	- TF-IDF over rich text descriptions is fast to build (<1s), deterministic, and
	transparent — every interviewer can follow the math.
	- For a 35-item catalog the retrieval quality difference vs. neural embeddings is
	marginal. A neural upgrade path is straightforward (swap _build_index).

	Why FAISS-style top-k?
	- We cap recommendations at 10. TF-IDF + cosine score with top-k is sufficient.
	- We use sklearn's linear_kernel for exact dot-product similarity (no approximation
	needed at this scale).

	Interview Q: "What would you change for a 10,000-item catalog?"
	A: Switch to sentence-transformers with a pre-built FAISS index serialized to disk,
	loaded once at startup. The interface (retrieve) stays the same.

	Interview Q: "Why build the index at startup instead of per-request?"
	A: Index construction (even TF-IDF) costs roughly O(n*d) for n documents and a
	vocabulary of d terms. Doing it per request wastes CPU and adds latency. We build
	once, query many times.
	"""

	import os
	import pickle
	from typing import List, Dict, Any, Tuple

	import numpy as np
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import linear_kernel


	# Locations of the persisted index artifacts (built by scripts/build_index.py or on
	# first boot). The directory resolves to a "data" folder one level above the
	# directory containing this module; build_index() writes both pickle files there
	# and load_index() reads them back.
	_INDEX_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
	_VECTORIZER_PATH = os.path.join(_INDEX_DIR, "tfidf_vectorizer.pkl")
	_MATRIX_PATH = os.path.join(_INDEX_DIR, "tfidf_matrix.pkl")


	def _build_documents(catalog: List[Dict[str, Any]]) -> List[str]:
	    """
	    Turn every catalog item into one whitespace-joined text document.

	    Each document is made of, in order: the item name (emitted twice so it
	    carries more TF-IDF weight), the description, the test type, and the
	    space-joined contents of the "keys", "domains", "seniority" and
	    "languages" lists. Missing optional fields default to empty and empty
	    fragments are dropped, so no stray separators appear in the result.

	    Trade-off: flattening everything into one string gives up per-field
	    weighting in exchange for retrieval logic that is easy to review.

	    Args:
	        catalog: Catalog items; each must provide a "name" entry.

	    Returns:
	        One document string per catalog item, in catalog order.

	    Raises:
	        KeyError: If an item has no "name" entry.
	    """
	    list_fields = ("keys", "domains", "seniority", "languages")

	    def render(entry: Dict[str, Any]) -> str:
	        title = entry["name"]
	        # The name appears twice on purpose: a cheap way to boost its weight.
	        fragments = [
	            title,
	            title,
	            entry.get("description", ""),
	            entry.get("test_type", ""),
	        ]
	        fragments.extend(" ".join(entry.get(field, [])) for field in list_fields)
	        # filter(None, ...) discards empty fragments before joining.
	        return " ".join(filter(None, fragments))

	    return [render(entry) for entry in catalog]


	def build_index(catalog: List[Dict[str, Any]]) -> Tuple[TfidfVectorizer, Any]:
	    """
	    Fit a TF-IDF model on the catalog and persist the resulting artifacts.

	    The catalog is converted to text documents with _build_documents, a
	    TfidfVectorizer is fitted on them, and both the fitted vectorizer and the
	    document-term matrix are pickled into _INDEX_DIR so that later startups
	    can load them instead of refitting.

	    Args:
	        catalog: Catalog items to index.

	    Returns:
	        (vectorizer, tfidf_matrix) — the fitted vectorizer and the matrix
	        produced by fit_transform on the catalog documents.
	    """
	    corpus = _build_documents(catalog)

	    tfidf = TfidfVectorizer(
	        lowercase=True,
	        strip_accents="unicode",
	        ngram_range=(1, 2),  # unigrams + bigrams, e.g. "contact centre", "senior IC"
	        min_df=1,  # keep every term; the catalog is small
	        max_df=0.95,  # drop terms found in >95% of docs (stop-word effect)
	    )
	    matrix = tfidf.fit_transform(corpus)

	    # Write both artifacts to disk so the next boot can skip the fit.
	    os.makedirs(_INDEX_DIR, exist_ok=True)
	    for target, artifact in ((_VECTORIZER_PATH, tfidf), (_MATRIX_PATH, matrix)):
	        with open(target, "wb") as handle:
	            pickle.dump(artifact, handle)

	    return tfidf, matrix


	def load_index() -> Tuple[TfidfVectorizer, Any]:
	    """
	    Read the pickled vectorizer and TF-IDF matrix back from disk.

	    Returns:
	        (vectorizer, tfidf_matrix) as unpickled from _VECTORIZER_PATH and
	        _MATRIX_PATH.

	    Raises:
	        FileNotFoundError: If either artifact file does not exist yet.

	    NOTE(review): pickle.load executes whatever the file encodes, so these
	    artifacts must only ever come from a trusted build of this project.
	    """
	    artifact_paths = (_VECTORIZER_PATH, _MATRIX_PATH)

	    # Check up front that both files are present before opening either.
	    if not all(os.path.exists(path) for path in artifact_paths):
	        raise FileNotFoundError(
	            "Index artifacts not found. Run scripts/build_index.py first, "
	            "or let the server build the index on first startup."
	        )

	    loaded = []
	    for path in artifact_paths:
	        with open(path, "rb") as handle:
	            loaded.append(pickle.load(handle))

	    vectorizer, tfidf_matrix = loaded
	    return vectorizer, tfidf_matrix


	def get_or_build_index(
	    catalog: List[Dict[str, Any]]
	) -> Tuple[TfidfVectorizer, Any]:
	    """
	    Load the persisted index if it is usable; otherwise build and persist it.

	    This is the function called at server startup. It implements the
	    'startup should load precomputed artifacts if possible' requirement.

	    The persisted index is rebuilt (and overwritten via build_index) when:
	    - the artifact files are missing,
	    - a pickle file is truncated or otherwise unreadable, or
	    - the stored matrix does not have exactly one row per catalog item,
	      which means it was built from a different catalog. retrieve() maps
	      matrix row i to catalog[i], so a stale matrix would return the wrong
	      items or raise IndexError.

	    Args:
	        catalog: Catalog items the index must correspond to.

	    Returns:
	        (vectorizer, tfidf_matrix) matching the given catalog's length.
	    """
	    try:
	        vectorizer, tfidf_matrix = load_index()
	    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
	        # Missing or corrupt artifacts: fall back to a fresh build.
	        return build_index(catalog)

	    # Cheap staleness check: the row count must equal the catalog size.
	    shape = getattr(tfidf_matrix, "shape", None)
	    if not shape or shape[0] != len(catalog):
	        return build_index(catalog)

	    return vectorizer, tfidf_matrix


	def retrieve(
	    query: str,
	    vectorizer: TfidfVectorizer,
	    tfidf_matrix: Any,
	    catalog: List[Dict[str, Any]],
	    top_k: int = 10,
	    score_threshold: float = 0.05,
	) -> List[Dict[str, Any]]:
	    """
	    Return the catalog items most similar to the query, best match first.

	    Args:
	        query: Free-text query derived from conversation context.
	        vectorizer: Fitted TF-IDF vectorizer.
	        tfidf_matrix: Pre-computed TF-IDF matrix (catalog x vocabulary); row i
	            must correspond to catalog[i].
	        catalog: Original catalog list (for returning full item dicts).
	        top_k: Maximum number of results to return. Zero or a negative value
	            yields an empty list.
	        score_threshold: Minimum similarity score to include a result.

	    Returns:
	        List of catalog dicts sorted by descending score, at most top_k items.
	        Items with equal scores keep their catalog order. An empty, blank or
	        None query returns an empty list.

	    Design: similarity is the exact dot product from linear_kernel, which
	    equals cosine similarity when the TF-IDF rows are L2-normalised (the
	    TfidfVectorizer default). No approximate nearest-neighbour search is
	    needed for a catalog of a few dozen items.
	    """
	    # Guard first: nothing to search for, or nothing requested. Without the
	    # top_k check a negative value would slice off the *last* items instead
	    # of returning nothing.
	    if not query or not query.strip() or top_k <= 0:
	        return []

	    query_vec = vectorizer.transform([query.lower()])
	    scores = linear_kernel(query_vec, tfidf_matrix).flatten()

	    # Keep rows that clear the threshold, then order by score descending.
	    # sorted() is stable, so ties stay in catalog order.
	    ranked = sorted(
	        (i for i, score in enumerate(scores) if score >= score_threshold),
	        key=lambda i: scores[i],
	        reverse=True,
	    )

	    return [catalog[i] for i in ranked[:top_k]]