Spaces:

levanell
/

shlproj

Running

App Files Files Community

shlproj / vecdb.py

levanell

Upload 5 files

b3d9840 verified 13 days ago

raw

history blame contribute delete

11.2 kB

	"""
	vecdb.py — 5-layer hybrid retrieval with scored ranking.

	Changes from original:
	1. FAISS k raised from 50 → 150 (catalog ~377; old k missed items that drift semantically)
	2. Lexical search now hits BOTH name AND description columns
	3. New SQL layers for job_level and test_category that hard-filter the DB
	4. All candidates scored before the context string is built so the synthesizer
	sees a ranked top-N, not a flat dump of 50+ items
	5. Context string now includes scores and job_levels to help the synthesizer choose
	"""

	import json
	import sqlite3
	from langchain_community.vectorstores import FAISS
	from langchain_huggingface import HuggingFaceEmbeddings

	# Embedding model name handed to HuggingFaceEmbeddings both when the FAISS
	# index is built and when it is queried; the two must stay identical so
	# query vectors live in the same space as the indexed vectors.
	EMBEDDING_MODEL = "all-MiniLM-L6-v2"

	# ==========================================
	# PART 1: Build Databases (Run Once)
	# ==========================================
	def build_databases(json_filepath="dataset.json"):
	    """
	    Build the SQLite catalog and the FAISS index from a JSON dataset.

	    Reads a JSON array of assessment records from ``json_filepath``, upserts
	    every record into the ``assessments`` table of ``shl_catalog.db`` and
	    saves a FAISS index over a text rendering of each record to
	    ``faiss_index``.

	    Each record must provide ``entity_id``, ``name``, ``link`` and
	    ``description``; ``job_levels`` and ``keys`` are optional lists of
	    strings (a missing or null value is treated as an empty list).

	    Raises:
	        KeyError: if a record lacks one of the required fields. Records are
	            validated before the database is opened, so nothing is written
	            in that case.
	    """
	    # JSON is UTF-8 by specification; do not rely on the platform default.
	    with open(json_filepath, 'r', encoding="utf-8") as f:
	        data = json.load(f)

	    db_rows = []
	    texts = []
	    metadatas = []
	    str_ids = []

	    for item in data:
	        e_id = str(item["entity_id"])
	        # `or []` also covers records where the key is present but null.
	        job_levels_csv = ",".join(item.get("job_levels") or [])
	        keys = item.get("keys") or []
	        keys_csv = ",".join(keys)

	        db_rows.append((
	            e_id,
	            item['name'],
	            item['link'],
	            item['description'],
	            job_levels_csv,
	            keys_csv,
	        ))

	        # Embed name + description + job_levels + keys so FAISS sees all metadata
	        texts.append(
	            f"Name: {item['name']}\n"
	            f"Description: {item['description']}\n"
	            f"Job Levels: {job_levels_csv}\n"
	            f"Category: {keys_csv}"
	        )
	        metadatas.append({
	            "entity_id": e_id,
	            "name": item["name"],
	            "link": item["link"],
	            # Only the first key is kept as the display type.
	            "test_type": keys[0] if keys else "",
	            "job_levels": job_levels_csv,
	        })
	        str_ids.append(e_id)

	    conn = sqlite3.connect("shl_catalog.db")
	    try:
	        cursor = conn.cursor()
	        cursor.execute("""
	            CREATE TABLE IF NOT EXISTS assessments (
	                entity_id TEXT PRIMARY KEY,
	                name TEXT,
	                link TEXT,
	                description TEXT,
	                job_levels TEXT,
	                keys TEXT
	            )
	        """)
	        # INSERT OR REPLACE makes re-running the build idempotent per entity_id.
	        cursor.executemany("""
	            INSERT OR REPLACE INTO assessments
	            (entity_id, name, link, description, job_levels, keys)
	            VALUES (?, ?, ?, ?, ?, ?)
	        """, db_rows)
	        conn.commit()
	    finally:
	        # Release the file handle even when an insert fails.
	        conn.close()

	    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
	    vector_store = FAISS.from_texts(
	        texts=texts,
	        embedding=embeddings,
	        metadatas=metadatas,
	        ids=str_ids
	    )
	    vector_store.save_local("faiss_index")
	    print("Databases built successfully!")


	# ==========================================
	# PART 2: Scored Candidate Accumulator
	# ==========================================
	def _accumulate(candidates: dict, name: str, row: tuple, score_delta: float):
	"""
	candidates is a dict keyed by assessment name.
	Each value: {"name", "link", "test_type", "job_levels", "score"}
	Score is additive — the same item can be found by multiple layers.
	"""
	if name not in candidates:
	candidates[name] = {
	"name": row[0],
	"link": row[2],
	"test_type": row[3],
	"job_levels": row[4] if len(row) > 4 else "",
	"score": 0.0,
	}
	candidates[name]["score"] += score_delta


	# ==========================================
	# PART 3: The Search Execution Engine
	# ==========================================
	def execute_rag_search(extracted_filters):
	    """
	    Run the 5-layer scored retrieval and return a ranked context string.

	    Scores are additive per assessment name, so an item found by several
	    layers rises in the ranking:
	    Layer 1 — Keyword substring match on name (score +5 per keyword hit)
	    Layer 2 — Keyword substring match on description only (score +3 per keyword hit)
	    Layer 3 — SQL filter on job_level (score +2)
	    Layer 4 — SQL filter on test_category (score +2)
	    Layer 5 — FAISS semantic search k=150 (score = 1 / (rank+1), max +1)
	    Mandatory flagships are injected at the end (+1.5 each).

	    Args:
	        extracted_filters: object exposing ``exact_keywords``, ``job_level``,
	            ``test_category`` and ``semantic_query`` attributes
	            (presumably an iterable of strings and three strings — confirm
	            against the caller). Falsy attributes are skipped.

	    Returns:
	        A text table of the 25 highest-scoring candidates, each followed by
	        a URL line, or "No assessments found matching those constraints."
	        when no layer produced a candidate.
	    """
	    base_select = "SELECT name, description, link, keys, job_levels FROM assessments"
	    candidates: dict = {}  # name → candidate dict

	    # One connection serves every SQL layer and is closed even on failure.
	    conn = sqlite3.connect("shl_catalog.db")
	    try:
	        cursor = conn.cursor()

	        # --------------------------------------------------------------
	        # Layer 1 & 2: Keyword lexical search (name + description)
	        # --------------------------------------------------------------
	        for kw in extracted_filters.exact_keywords or []:
	            pattern = f"%{kw}%"

	            # Name match — high signal
	            cursor.execute(base_select + " WHERE name LIKE ?", (pattern,))
	            for row in cursor.fetchall():
	                _accumulate(candidates, row[0], row, score_delta=5.0)

	            # Description match — medium signal. The NOT LIKE on name keeps
	            # an item from collecting both bonuses for the same keyword.
	            cursor.execute(
	                base_select + " WHERE name NOT LIKE ? AND description LIKE ?",
	                (pattern, pattern),
	            )
	            for row in cursor.fetchall():
	                _accumulate(candidates, row[0], row, score_delta=3.0)

	        # --------------------------------------------------------------
	        # Layer 3: Job-level SQL filter (pulls real DB matches in addition
	        # to the hint appended to the FAISS query below)
	        # --------------------------------------------------------------
	        if extracted_filters.job_level:
	            cursor.execute(
	                base_select + " WHERE job_levels LIKE ?",
	                (f"%{extracted_filters.job_level}%",),
	            )
	            for row in cursor.fetchall():
	                _accumulate(candidates, row[0], row, score_delta=2.0)

	        # --------------------------------------------------------------
	        # Layer 4: Test-category SQL filter
	        # "keys" column stores the category (e.g. "Knowledge & Skills")
	        # --------------------------------------------------------------
	        if extracted_filters.test_category:
	            cursor.execute(
	                base_select + " WHERE keys LIKE ?",
	                (f"%{extracted_filters.test_category}%",),
	            )
	            for row in cursor.fetchall():
	                _accumulate(candidates, row[0], row, score_delta=2.0)

	        # Flagship rows (OPQ32r + Verify G+) are read here so no second
	        # connection is needed; they are scored after the FAISS layer.
	        cursor.execute(
	            base_select
	            + " WHERE name = 'Occupational Personality Questionnaire OPQ32r'"
	            + " OR name = 'SHL Verify Interactive G+'"
	        )
	        flagship_rows = cursor.fetchall()
	    finally:
	        conn.close()

	    # ------------------------------------------------------------------
	    # Layer 5: FAISS semantic search, k=150
	    # ------------------------------------------------------------------
	    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
	    # NOTE(security): allow_dangerous_deserialization opts into pickle-based
	    # loading. Only load an index that was produced locally and is trusted.
	    vector_store = FAISS.load_local(
	        "faiss_index", embeddings, allow_dangerous_deserialization=True
	    )

	    # Build a rich composite query; a missing semantic query is skipped
	    # instead of breaking the join.
	    query_parts = []
	    if extracted_filters.semantic_query:
	        query_parts.append(extracted_filters.semantic_query)
	    if extracted_filters.job_level:
	        query_parts.append(f"Job Level: {extracted_filters.job_level}")
	    if extracted_filters.test_category:
	        query_parts.append(f"Category: {extracted_filters.test_category}")
	    if extracted_filters.exact_keywords:
	        query_parts.append("Tools: " + ", ".join(extracted_filters.exact_keywords))

	    # Plain " | " separator: the former " \| " was an invalid escape that
	    # put a literal backslash into the text being embedded.
	    rich_query = " | ".join(query_parts)
	    faiss_results = vector_store.similarity_search(query=rich_query, k=150)

	    for rank, res in enumerate(faiss_results):
	        name = res.metadata["name"]
	        # Reciprocal rank score: rank 0 → +1.0, rank 149 → +0.0067
	        rr_score = 1.0 / (rank + 1)
	        # Stub row in the (name, description, link, type, job_levels) layout
	        # that _accumulate expects.
	        # NOTE(review): the index metadata's "test_type" can differ from the
	        # SQL "keys" column, so the Type shown depends on which layer saw
	        # the item first — confirm this is acceptable.
	        stub = (
	            name,
	            "",  # description not needed after retrieval
	            res.metadata["link"],
	            res.metadata.get("test_type", ""),
	            res.metadata.get("job_levels", ""),
	        )
	        _accumulate(candidates, name, stub, score_delta=rr_score)

	    # ------------------------------------------------------------------
	    # Flagship injection: always included; score 1.5 keeps them present
	    # without outranking a direct keyword match on name (+5).
	    # ------------------------------------------------------------------
	    for row in flagship_rows:
	        _accumulate(candidates, row[0], row, score_delta=1.5)

	    if not candidates:
	        return "No assessments found matching those constraints."

	    # ------------------------------------------------------------------
	    # Sort by score and build context string (top 25 to keep prompt lean)
	    # ------------------------------------------------------------------
	    ranked = sorted(candidates.values(), key=lambda x: x["score"], reverse=True)
	    top_candidates = ranked[:25]

	    context_lines = ["RANKED CANDIDATE ASSESSMENTS (higher score = better match):"]
	    context_lines.append(
	        f"{'Rank':<5} {'Score':<7} {'Name':<55} {'Type':<30} {'Job Levels'}"
	    )
	    context_lines.append("-" * 130)

	    for i, c in enumerate(top_candidates, 1):
	        context_lines.append(
	            f"{i:<5} {c['score']:<7.2f} {c['name']:<55} {c['test_type']:<30} {c['job_levels']}"
	        )
	        # Append the URL on a sub-line so synthesizer can pick it up easily
	        context_lines.append(f" URL: {c['link']}")

	    return "\n".join(context_lines)


	# ==========================================
	# PART 4: Comparison Lookup (unchanged logic, minor hardening)
	# ==========================================
	def execute_comparison_lookup(test_names: list) -> str:
	    """
	    Fetch catalog details for each requested test name.

	    Every non-blank entry of ``test_names`` is matched as a substring of the
	    ``name`` column of ``assessments`` in ``shl_catalog.db``. When several
	    rows match, a row whose name equals the term (ignoring ASCII case) is
	    preferred; otherwise the earliest inserted match is used.

	    Args:
	        test_names: names (or name fragments) to look up. ``None``, blank
	            and whitespace-only entries are skipped.

	    Returns:
	        "COMPARISON DATA RETRIEVED:\\n" followed by one Name/Type/URL/
	        Description entry per resolved term, or the "NO DATA RETRIEVED..."
	        message when nothing matched.
	    """
	    entries = []
	    conn = sqlite3.connect("shl_catalog.db")
	    try:
	        cursor = conn.cursor()
	        for name in test_names or []:
	            term = "" if name is None else str(name).strip()
	            # A blank term would become the pattern '%%' and silently match
	            # an arbitrary first row of the catalog.
	            if not term:
	                continue
	            # ORDER BY makes LIMIT 1 deterministic: exact name first, then
	            # insertion order.
	            cursor.execute("""
	                SELECT name, description, link, keys
	                FROM assessments
	                WHERE name LIKE ?
	                ORDER BY (name = ? COLLATE NOCASE) DESC, rowid
	                LIMIT 1
	            """, (f"%{term}%", term))
	            row = cursor.fetchone()
	            if row:
	                entries.append(
	                    f"- Name: {row[0]}\n"
	                    f" Type: {row[3]}\n"
	                    f" URL: {row[2]}\n"
	                    f" Description: {row[1]}\n\n"
	                )
	    finally:
	        # Close the connection even if a query raises.
	        conn.close()

	    if not entries:
	        return "NO DATA RETRIEVED. Could not find those specific tests in the catalog."
	    return "COMPARISON DATA RETRIEVED:\n" + "".join(entries)


	# Script entry point: running this file directly (re)builds the SQLite
	# catalog and the FAISS index using the default dataset path.
	if __name__ == "__main__":
	    build_databases()