""" vecdb.py — 4-layer hybrid retrieval with scored ranking. Changes from original: 1. FAISS k raised from 50 → 150 (catalog ~377; old k missed items that drift semantically) 2. Lexical search now hits BOTH name AND description columns 3. New SQL layers for job_level and test_category that hard-filter the DB 4. All candidates scored before the context string is built so the synthesizer sees a ranked top-N, not a flat dump of 50+ items 5. Context string now includes scores and job_levels to help the synthesizer choose """ import json import sqlite3 from langchain_community.vectorstores import FAISS from langchain_huggingface import HuggingFaceEmbeddings EMBEDDING_MODEL = "all-MiniLM-L6-v2" # ========================================== # PART 1: Build Databases (Run Once) # ========================================== def build_databases(json_filepath="dataset.json"): with open(json_filepath, 'r') as f: data = json.load(f) conn = sqlite3.connect("shl_catalog.db") cursor = conn.cursor() cursor.execute(""" CREATE TABLE IF NOT EXISTS assessments ( entity_id TEXT PRIMARY KEY, name TEXT, link TEXT, description TEXT, job_levels TEXT, keys TEXT ) """) texts = [] metadatas = [] str_ids = [] for item in data: e_id = str(item["entity_id"]) cursor.execute(""" INSERT OR REPLACE INTO assessments (entity_id, name, link, description, job_levels, keys) VALUES (?, ?, ?, ?, ?, ?) """, ( e_id, item['name'], item['link'], item['description'], ",".join(item.get('job_levels', [])), ",".join(item.get('keys', [])) )) # Embed name + description + job_levels + keys so FAISS sees all metadata embed_text = ( f"Name: {item['name']}\n" f"Description: {item['description']}\n" f"Job Levels: {','.join(item.get('job_levels', []))}\n" f"Category: {','.join(item.get('keys', []))}" ) texts.append(embed_text) metadatas.append({ "entity_id": e_id, "name": item["name"], "link": item["link"], "test_type": item.get("keys", [""])[0] if item.get("keys") else "", "job_levels": ",".join(item.get('job_levels', [])), }) str_ids.append(e_id) conn.commit() conn.close() embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL) vector_store = FAISS.from_texts( texts=texts, embedding=embeddings, metadatas=metadatas, ids=str_ids ) vector_store.save_local("faiss_index") print("Databases built successfully!") # ========================================== # PART 2: Scored Candidate Accumulator # ========================================== def _accumulate(candidates: dict, name: str, row: tuple, score_delta: float): """ candidates is a dict keyed by assessment name. Each value: {"name", "link", "test_type", "job_levels", "score"} Score is additive — the same item can be found by multiple layers. """ if name not in candidates: candidates[name] = { "name": row[0], "link": row[2], "test_type": row[3], "job_levels": row[4] if len(row) > 4 else "", "score": 0.0, } candidates[name]["score"] += score_delta # ========================================== # PART 3: The Search Execution Engine # ========================================== def execute_rag_search(extracted_filters): """ 4-layer retrieval: Layer 1 — Keyword exact match on name (score +5 per hit) Layer 2 — Keyword match on description (score +3 per hit) Layer 3 — SQL filter on job_level (score +2) Layer 4 — SQL filter on test_category (score +2) Layer 5 — FAISS semantic search k=150 (score = 1 / (rank+1), max +1) Mandatory flagships injected at the end if not already present. """ conn = sqlite3.connect("shl_catalog.db") cursor = conn.cursor() candidates: dict = {} # name → candidate dict # ------------------------------------------------------------------ # Layer 1 & 2: Keyword lexical search (name + description) # ------------------------------------------------------------------ if extracted_filters.exact_keywords: for kw in extracted_filters.exact_keywords: # Name match — high signal cursor.execute(""" SELECT name, description, link, keys, job_levels FROM assessments WHERE name LIKE ? """, (f"%{kw}%",)) for row in cursor.fetchall(): _accumulate(candidates, row[0], row, score_delta=5.0) # Description match — medium signal (catches items where # keyword only appears in the body, e.g. "Linux Programming (General)") cursor.execute(""" SELECT name, description, link, keys, job_levels FROM assessments WHERE name NOT LIKE ? AND description LIKE ? """, (f"%{kw}%", f"%{kw}%")) for row in cursor.fetchall(): _accumulate(candidates, row[0], row, score_delta=3.0) # ------------------------------------------------------------------ # Layer 3: Job-level SQL filter # Previously this was only appended to the FAISS query string, so it # never enforced hard SQL filtering. Now it pulls real DB matches. # ------------------------------------------------------------------ if extracted_filters.job_level: cursor.execute(""" SELECT name, description, link, keys, job_levels FROM assessments WHERE job_levels LIKE ? """, (f"%{extracted_filters.job_level}%",)) for row in cursor.fetchall(): _accumulate(candidates, row[0], row, score_delta=2.0) # ------------------------------------------------------------------ # Layer 4: Test-category SQL filter # "keys" column stores the category (e.g. "Knowledge & Skills") # ------------------------------------------------------------------ if extracted_filters.test_category: cursor.execute(""" SELECT name, description, link, keys, job_levels FROM assessments WHERE keys LIKE ? """, (f"%{extracted_filters.test_category}%",)) for row in cursor.fetchall(): _accumulate(candidates, row[0], row, score_delta=2.0) conn.close() # ------------------------------------------------------------------ # Layer 5: FAISS semantic search # k=150 (~40% of catalog) vs old k=50 (~13%). # Items that drift semantically (e.g. "Global Skills Assessment" for # a broad skills query) now have room to surface. # ------------------------------------------------------------------ embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL) vector_store = FAISS.load_local( "faiss_index", embeddings, allow_dangerous_deserialization=True ) # Build a rich composite query query_parts = [extracted_filters.semantic_query] if extracted_filters.job_level: query_parts.append(f"Job Level: {extracted_filters.job_level}") if extracted_filters.test_category: query_parts.append(f"Category: {extracted_filters.test_category}") if extracted_filters.exact_keywords: query_parts.append("Tools: " + ", ".join(extracted_filters.exact_keywords)) rich_query = " | ".join(query_parts) faiss_results = vector_store.similarity_search(query=rich_query, k=150) for rank, res in enumerate(faiss_results): name = res.metadata["name"] # Reciprocal rank score: rank 0 → +1.0, rank 149 → +0.0067 rr_score = 1.0 / (rank + 1) # Build a stub row compatible with _accumulate stub = ( name, "", # description not needed after retrieval res.metadata["link"], res.metadata.get("test_type", ""), res.metadata.get("job_levels", ""), ) _accumulate(candidates, name, stub, score_delta=rr_score) # ------------------------------------------------------------------ # Flagship injection (OPQ32r + Verify G+) # Always included; given a modest baseline score so they don't # outrank genuinely specific matches. # ------------------------------------------------------------------ conn = sqlite3.connect("shl_catalog.db") cursor = conn.cursor() cursor.execute(""" SELECT name, description, link, keys, job_levels FROM assessments WHERE name = 'Occupational Personality Questionnaire OPQ32r' OR name = 'SHL Verify Interactive G+' """) for row in cursor.fetchall(): # Score 1.5 — present but won't outrank a perfect keyword match _accumulate(candidates, row[0], row, score_delta=1.5) conn.close() if not candidates: return "No assessments found matching those constraints." # ------------------------------------------------------------------ # Sort by score and build context string (top 25 to keep prompt lean) # ------------------------------------------------------------------ ranked = sorted(candidates.values(), key=lambda x: x["score"], reverse=True) top_candidates = ranked[:25] context_lines = ["RANKED CANDIDATE ASSESSMENTS (higher score = better match):"] context_lines.append( f"{'Rank':<5} {'Score':<7} {'Name':<55} {'Type':<30} {'Job Levels'}" ) context_lines.append("-" * 130) for i, c in enumerate(top_candidates, 1): context_lines.append( f"{i:<5} {c['score']:<7.2f} {c['name']:<55} {c['test_type']:<30} {c['job_levels']}" ) # Append the URL on a sub-line so synthesizer can pick it up easily context_lines.append(f" URL: {c['link']}") return "\n".join(context_lines) # ========================================== # PART 4: Comparison Lookup (unchanged logic, minor hardening) # ========================================== def execute_comparison_lookup(test_names: list) -> str: conn = sqlite3.connect("shl_catalog.db") cursor = conn.cursor() retrieved_text = "COMPARISON DATA RETRIEVED:\n" found_any = False for name in test_names: cursor.execute(""" SELECT name, description, link, keys FROM assessments WHERE name LIKE ? LIMIT 1 """, (f"%{name}%",)) row = cursor.fetchone() if row: found_any = True retrieved_text += ( f"- Name: {row[0]}\n" f" Type: {row[3]}\n" f" URL: {row[2]}\n" f" Description: {row[1]}\n\n" ) conn.close() if not found_any: return "NO DATA RETRIEVED. Could not find those specific tests in the catalog." return retrieved_text if __name__ == "__main__": build_databases()