Spaces:
Build error
Build error
| """ | |
| retrieval.py — Builds and queries a lightweight vector index over the SHL catalog. | |
| Architecture decision: TF-IDF + cosine similarity via scikit-learn. | |
| Why not sentence-transformers? | |
| - sentence-transformers requires torch (~2 GB download), which is hostile to a cold | |
| HF Space boot with a 512 MB RAM limit on free tier. | |
| - TF-IDF over rich text descriptions is fast to build (<1s), deterministic, and | |
| transparent — every interviewer can follow the math. | |
| - For a 35-item catalog the retrieval quality difference vs. neural embeddings is | |
| marginal. A neural upgrade path is straightforward (swap _build_index). | |
| Why exact top-k instead of a FAISS-style approximate index? | |
| - We cap recommendations at 10. TF-IDF + cosine score with top-k is sufficient. | |
| - We use sklearn's linear_kernel for exact dot-product similarity (no approximation | |
| needed at this scale). | |
| Interview Q: "What would you change for a 10,000-item catalog?" | |
| A: Switch to sentence-transformers with a pre-built FAISS index serialized to disk, | |
| loaded once at startup. The interface (retrieve) stays the same. | |
| Interview Q: "Why build the index at startup instead of per-request?" | |
| A: Index construction (even TF-IDF) is O(n*d) for n catalog items and a vocabulary of | |
| size d. Doing it per request wastes CPU and adds latency. We build once, query many times. | |
| """ | |
| import os | |
| import pickle | |
| from typing import List, Dict, Any, Tuple | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import linear_kernel | |
# Directory holding the persisted index artifacts: the "data" folder that sits
# next to this module's parent directory. Artifacts are written by build_index()
# (run from scripts/build_index.py or on first boot).
_INDEX_DIR = os.path.join(os.path.dirname(__file__), os.pardir, "data")
# Pickle of the fitted TfidfVectorizer.
_VECTORIZER_PATH = os.path.join(_INDEX_DIR, "tfidf_vectorizer.pkl")
# Pickle of the TF-IDF matrix produced by fitting on the catalog documents.
_MATRIX_PATH = os.path.join(_INDEX_DIR, "tfidf_matrix.pkl")
def _field_text(value: Any) -> str:
    """Flatten one catalog field value into a single space-separated string."""
    if value is None:
        return ""
    if isinstance(value, str):
        # A bare string must be kept whole; joining it would split it into
        # individual characters ("English" -> "E n g l i s h").
        return value
    if isinstance(value, (list, tuple)):
        # Stringify entries so a stray number cannot break str.join, and
        # drop None entries (e.g. JSON nulls inside a list).
        return " ".join(str(entry) for entry in value if entry is not None)
    return str(value)


def _build_documents(catalog: List[Dict[str, Any]]) -> List[str]:
    """
    Construct one rich text document per catalog item.

    Design: concatenate all textual fields into a single string. This gives
    TF-IDF the full vocabulary of test names, descriptions, domains, and keys.
    The name is included twice to boost its weight (a simple, defensible
    heuristic).

    Trade-off: we lose field-level weighting. A more sophisticated approach
    would use separate TF-IDF columns per field and combine scores. Avoided
    here to keep the retrieval logic transparent and reviewable.

    Args:
        catalog: Catalog items. Each item must have a "name" key; the keys
            "description", "test_type", "keys", "domains", "seniority" and
            "languages" are optional. A field may be a string, a list/tuple
            of values, or None (treated as absent).

    Returns:
        One document string per catalog item, in catalog order. Empty fields
        are skipped so they do not introduce extra separators.

    Raises:
        KeyError: If an item has no "name" key.
    """
    optional_fields = (
        "description",
        "test_type",
        "keys",
        "domains",
        "seniority",
        "languages",
    )
    docs = []
    for item in catalog:
        name = _field_text(item["name"])
        parts = [name, name]  # repeated for weight boost
        parts.extend(_field_text(item.get(field)) for field in optional_fields)
        docs.append(" ".join(part for part in parts if part))
    return docs
def build_index(catalog: List[Dict[str, Any]]) -> Tuple[TfidfVectorizer, Any]:
    """
    Fit a TF-IDF vectorizer on the catalog documents and persist the result.

    Called once at startup (or by build_index.py). The fitted vectorizer and
    the TF-IDF matrix are pickled into _INDEX_DIR so subsequent startups can
    load them instead of rebuilding.

    Args:
        catalog: Catalog items; converted to text by _build_documents.

    Returns:
        (vectorizer, tfidf_matrix): the fitted vectorizer and the matrix
        returned by fit_transform (one row per catalog item).

    Raises:
        OSError: If the artifact directory or files cannot be written.
    """
    documents = _build_documents(catalog)
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),  # unigrams + bigrams to catch "contact centre", "senior IC"
        min_df=1,  # every term counts at this catalog size
        max_df=0.95,  # ignore terms in >95% of docs (stop-word effect)
        strip_accents="unicode",
        lowercase=True,
    )
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Persist for fast reloads. Each artifact is written to a temporary file
    # and then moved into place with os.replace, so an interrupted write never
    # leaves a truncated pickle at the final path for a later load to trip on.
    # NOTE: the two files are still replaced one after the other, so the pair
    # as a whole is not updated atomically.
    os.makedirs(_INDEX_DIR, exist_ok=True)
    artifacts = (
        (_VECTORIZER_PATH, vectorizer),
        (_MATRIX_PATH, tfidf_matrix),
    )
    for final_path, artifact in artifacts:
        tmp_path = f"{final_path}.tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(artifact, f)
        os.replace(tmp_path, final_path)
    return vectorizer, tfidf_matrix
def load_index() -> Tuple[TfidfVectorizer, Any]:
    """
    Read the pickled TF-IDF artifacts back from disk.

    Returns:
        (vectorizer, tfidf_matrix) as unpickled from _VECTORIZER_PATH and
        _MATRIX_PATH, in that order.

    Raises:
        FileNotFoundError: If either artifact file does not exist yet.
    """
    artifact_paths = (_VECTORIZER_PATH, _MATRIX_PATH)
    if not all(os.path.exists(path) for path in artifact_paths):
        raise FileNotFoundError(
            "Index artifacts not found. Run scripts/build_index.py first, "
            "or let the server build the index on first startup."
        )

    # NOTE: unpickling can execute arbitrary code, so this is only safe while
    # the artifact directory contains files this application wrote itself.
    loaded = []
    for path in artifact_paths:
        with open(path, "rb") as handle:
            loaded.append(pickle.load(handle))
    vectorizer, tfidf_matrix = loaded
    return vectorizer, tfidf_matrix
def get_or_build_index(
    catalog: List[Dict[str, Any]]
) -> Tuple[TfidfVectorizer, Any]:
    """
    Load the persisted index if it is usable; otherwise build and persist it.

    This is the function called at server startup. It implements the
    'startup should load precomputed artifacts if possible' requirement.

    The persisted index is rebuilt from `catalog` when:
      - the artifact files are missing,
      - an artifact is truncated or not a valid pickle, or
      - the loaded matrix does not have one row per catalog item and one
        column per vocabulary term (e.g. the catalog changed since the
        artifacts were written).

    NOTE: the shape check cannot detect a catalog whose items were edited or
    reordered without changing the item count or vocabulary size; rebuild the
    artifacts explicitly after such changes.

    Args:
        catalog: Catalog items the index must correspond to.

    Returns:
        (vectorizer, tfidf_matrix) for the given catalog.
    """
    try:
        vectorizer, tfidf_matrix = load_index()
    except (FileNotFoundError, pickle.UnpicklingError, EOFError):
        # Missing or unreadable artifacts: rebuilding overwrites them.
        return build_index(catalog)

    # Guard against stale artifacts. Querying a matrix whose rows do not line
    # up with the catalog would return the wrong items or index out of range.
    vocabulary = getattr(vectorizer, "vocabulary_", ())
    expected_shape = (len(catalog), len(vocabulary))
    if getattr(tfidf_matrix, "shape", None) != expected_shape:
        return build_index(catalog)
    return vectorizer, tfidf_matrix
def retrieve(
    query: str,
    vectorizer: TfidfVectorizer,
    tfidf_matrix: Any,
    catalog: List[Dict[str, Any]],
    top_k: int = 10,
    score_threshold: float = 0.05,
) -> List[Dict[str, Any]]:
    """
    Return the top_k catalog items most similar to the query, above score_threshold.

    Args:
        query: Free-text query derived from conversation context.
        vectorizer: Fitted TF-IDF vectorizer.
        tfidf_matrix: Pre-computed TF-IDF matrix (catalog x vocabulary). Row i
            must correspond to catalog[i].
        catalog: Original catalog list (for returning full item dicts).
        top_k: Maximum number of results to return (capped at 10 by schema).
            Values <= 0 yield an empty list.
        score_threshold: Minimum similarity score to include a result.

    Returns:
        List of catalog dicts sorted by descending score, up to top_k items.
        Items with equal scores keep their catalog order. An empty, blank or
        None query returns an empty list.

    Design: exact similarity over a small matrix is O(n*d) — negligible for
    35 items. No approximate nearest-neighbour needed. linear_kernel computes
    plain dot products; these equal cosine similarity as long as the
    vectorizer L2-normalises its rows (scikit-learn's default, which
    build_index does not override).
    """
    # A negative top_k would otherwise slice from the end of the ranking and
    # silently return the wrong items; treat it like "no results wanted".
    if top_k <= 0:
        return []
    if not query or not query.strip():
        return []

    query_vec = vectorizer.transform([query.lower()])
    scores = linear_kernel(query_vec, tfidf_matrix).flatten()

    # Keep the row indices that clear the threshold, best score first. sorted()
    # is stable, so ties remain in catalog order.
    ranked = sorted(
        (i for i, score in enumerate(scores) if score >= score_threshold),
        key=lambda i: scores[i],
        reverse=True,
    )
    return [catalog[i] for i in ranked[:top_k]]