# Source: shlaiagent/retrieval.py (Hugging Face Space, revision 4fe04aa)
"""
retrieval.py — Builds and queries a lightweight vector index over the SHL catalog.
Architecture decision: TF-IDF + cosine similarity via scikit-learn.
Why not sentence-transformers?
- sentence-transformers requires torch (~2 GB download), which is hostile to a cold
HF Space boot with a 512 MB RAM limit on free tier.
- TF-IDF over rich text descriptions is fast to build (<1s), deterministic, and
transparent — every interviewer can follow the math.
- For a 35-item catalog the retrieval quality difference vs. neural embeddings is
marginal. A neural upgrade path is straightforward (swap _build_index).
Why FAISS-style top-k?
- We cap recommendations at 10. TF-IDF + cosine score with top-k is sufficient.
- We use sklearn's linear_kernel for exact dot-product similarity (no approximation
needed at this scale).
Interview Q: "What would you change for a 10,000-item catalog?"
A: Switch to sentence-transformers with a pre-built FAISS index serialized to disk,
loaded once at startup. The interface (retrieve) stays the same.
Interview Q: "Why build the index at startup instead of per-request?"
A: Index construction (even TF-IDF) costs O(n*d) for n catalog items and a vocabulary
of d terms. Doing it per request wastes CPU and adds latency. We build once, query many times.
"""
import os
import pickle
from typing import List, Dict, Any, Tuple
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Path for persisted index artifacts (built by scripts/build_index.py or on first boot).
_INDEX_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
_VECTORIZER_PATH = os.path.join(_INDEX_DIR, "tfidf_vectorizer.pkl")
_MATRIX_PATH = os.path.join(_INDEX_DIR, "tfidf_matrix.pkl")
def _build_documents(catalog: List[Dict[str, Any]]) -> List[str]:
"""
Construct a rich text document per catalog item.
Design: concatenate all textual fields into a single string. This gives TF-IDF
the full vocabulary of test names, descriptions, domains, and keys. Repeating
the name twice boosts its weight (a simple, defensible heuristic).
Trade-off: we lose field-level weighting. A more sophisticated approach would
use separate TF-IDF columns per field and combine scores. Avoided here to keep
the retrieval logic transparent and reviewable.
"""
docs = []
for item in catalog:
parts = [
item["name"], # repeated for weight boost
item["name"],
item.get("description", ""),
item.get("test_type", ""),
" ".join(item.get("keys", [])),
" ".join(item.get("domains", [])),
" ".join(item.get("seniority", [])),
" ".join(item.get("languages", [])),
]
docs.append(" ".join(p for p in parts if p))
return docs
def build_index(catalog: List[Dict[str, Any]]) -> Tuple[TfidfVectorizer, Any]:
    """
    Fit a TF-IDF vectorizer on the catalog documents and persist the result.

    Called once at startup (or by build_index.py). The fitted vectorizer and the
    TF-IDF matrix are pickled under _INDEX_DIR so subsequent startups can load
    them instead of rebuilding.

    Args:
        catalog: Catalog items; converted to text by _build_documents.

    Returns:
        (vectorizer, tfidf_matrix), where tfidf_matrix has one row per catalog
        item.

    Raises:
        OSError: If the index directory or artifact files cannot be written.
    """
    documents = _build_documents(catalog)
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),  # unigrams + bigrams to catch "contact centre", "senior IC"
        min_df=1,  # every term counts at this catalog size
        max_df=0.95,  # ignore terms in >95% of docs (stop-word effect)
        strip_accents="unicode",
        lowercase=True,
    )
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Persist for fast reloads. Each artifact is written to a per-process temp
    # file and then moved into place with os.replace, so a crash mid-write or a
    # second process building at the same time never leaves a truncated pickle
    # at the final path. The two files are still replaced one after the other,
    # not as a single transaction.
    os.makedirs(_INDEX_DIR, exist_ok=True)
    artifacts = (
        (_VECTORIZER_PATH, vectorizer),
        (_MATRIX_PATH, tfidf_matrix),
    )
    for final_path, artifact in artifacts:
        tmp_path = f"{final_path}.{os.getpid()}.tmp"
        try:
            with open(tmp_path, "wb") as f:
                pickle.dump(artifact, f)
            os.replace(tmp_path, final_path)
        finally:
            # After a successful replace the temp file no longer exists; if
            # anything failed, remove the partial write.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    return vectorizer, tfidf_matrix
def load_index() -> Tuple[TfidfVectorizer, Any]:
    """
    Read the pickled TF-IDF vectorizer and matrix back from disk.

    Returns:
        (vectorizer, tfidf_matrix) as unpickled from _VECTORIZER_PATH and
        _MATRIX_PATH.

    Raises:
        FileNotFoundError: If either artifact file does not exist yet.
    """
    artifact_paths = (_VECTORIZER_PATH, _MATRIX_PATH)
    # Both files must be present before anything is opened.
    if not all(os.path.exists(path) for path in artifact_paths):
        raise FileNotFoundError(
            "Index artifacts not found. Run scripts/build_index.py first, "
            "or let the server build the index on first startup."
        )
    loaded = []
    for path in artifact_paths:
        with open(path, "rb") as handle:
            loaded.append(pickle.load(handle))
    vectorizer, tfidf_matrix = loaded
    return vectorizer, tfidf_matrix
def get_or_build_index(
    catalog: List[Dict[str, Any]]
) -> Tuple[TfidfVectorizer, Any]:
    """
    Load the persisted index if it is usable; otherwise build and persist it.

    This is the function called at server startup. It implements the
    'startup should load precomputed artifacts if possible' requirement.

    The persisted index is rebuilt when:
      - the artifact files are missing, truncated, or not valid pickles;
      - the matrix row count differs from len(catalog) (stale artifacts would
        make retrieve() map scores onto the wrong catalog items);
      - the matrix column count differs from the vectorizer's vocabulary size
        (vectorizer and matrix come from different builds).

    Note: a catalog whose contents changed but whose length did not is NOT
    detected here; rerun the build in that case.

    Args:
        catalog: Catalog items the index must correspond to.

    Returns:
        (vectorizer, tfidf_matrix) consistent with the catalog's size.
    """
    try:
        vectorizer, tfidf_matrix = load_index()
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Nothing usable on disk: fall back to a fresh build.
        return build_index(catalog)

    shape = getattr(tfidf_matrix, "shape", None)
    vocabulary = getattr(vectorizer, "vocabulary_", None)
    is_consistent = (
        shape is not None
        and len(shape) == 2
        and shape[0] == len(catalog)
        and vocabulary is not None
        and shape[1] == len(vocabulary)
    )
    if not is_consistent:
        return build_index(catalog)
    return vectorizer, tfidf_matrix
def retrieve(
    query: str,
    vectorizer: TfidfVectorizer,
    tfidf_matrix: Any,
    catalog: List[Dict[str, Any]],
    top_k: int = 10,
    score_threshold: float = 0.05,
) -> List[Dict[str, Any]]:
    """
    Return the top_k catalog items most similar to the query, above score_threshold.

    Args:
        query: Free-text query derived from conversation context. An empty,
            whitespace-only, or None query yields no results.
        vectorizer: Fitted TF-IDF vectorizer.
        tfidf_matrix: Pre-computed TF-IDF matrix (catalog x vocabulary). Row i
            must correspond to catalog[i].
        catalog: Original catalog list (for returning full item dicts).
        top_k: Maximum number of results to return. Zero or negative yields no
            results.
        score_threshold: Minimum similarity score to include a result.

    Returns:
        List of catalog dicts sorted by descending score, up to top_k items.
        Items with equal scores keep their catalog order.

    Design: scores are exact dot products via linear_kernel. These equal cosine
    similarity when the TF-IDF rows are L2-normalised (TfidfVectorizer's default
    norm; presumably what the supplied vectorizer uses). No approximate
    nearest-neighbour search is needed for a small catalog.
    """
    # Guard clauses run before any vectorizer work. Without the top_k check a
    # negative value would fall through to scored[:top_k] and return "all but
    # the last k" matches instead of nothing.
    if not query or not query.strip():
        return []
    if top_k <= 0:
        return []

    query_vec = vectorizer.transform([query.lower()])
    scores = linear_kernel(query_vec, tfidf_matrix).flatten()

    # Pair each item with its score, filter by threshold, sort descending.
    scored = [
        (score, catalog[i])
        for i, score in enumerate(scores)
        if score >= score_threshold
    ]
    # list.sort is stable, so ties stay in catalog order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [item for _, item in scored[:top_k]]