# Path: iris_backend/backend/src/embeddings/match_benchmark_granular.py
# Page-header residue: uploader "Saandraahh", commit message "Implemented clustering", commit 4b3a33f
import sys
import os
import time
import json
import random
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
# The UTF-8 stdout wrapper for Windows terminals is intentionally disabled:
# it was removed as potentially problematic for background logging.
# if sys.platform == "win32":
#     import io
#     sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add backend to path: append the directory three levels above this file
# to sys.path so imports can be resolved relative to it.
sys.path.append(
    os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            os.pardir,
            os.pardir,
            os.pardir,
        )
    )
)
# ---------------------------------------------------------------------
# UTILS & NOISE SIMULATION
# ---------------------------------------------------------------------
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    If either vector is ``None`` or has zero Euclidean norm the function
    returns 0.0 instead of dividing by zero. The zero-norm check happens
    before the dot product is evaluated.
    """
    if v1 is None or v2 is None:
        return 0.0

    magnitude_a = np.linalg.norm(v1)
    magnitude_b = np.linalg.norm(v2)
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0

    return np.dot(v1, v2) / (magnitude_a * magnitude_b)
def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections of labels.

    Every item is converted to ``str``, lower-cased and stripped of
    surrounding whitespace before comparison. If either normalised set is
    empty the result is 0.0.
    """
    def _normalise(items):
        return {str(item).lower().strip() for item in items}

    left = _normalise(list1)
    right = _normalise(list2)
    if not (left and right):
        return 0.0

    return len(left & right) / len(left | right)
def inject_real_world_noise(text, is_skill=False):
    """Return *text* with randomly applied resume-style noise.

    Possible transformations, each chosen with the ``random`` module:
      * leave the text untouched (first draw below 0.2);
      * for skills with a known short form, return the abbreviation;
      * for non-skill text, prepend a filler phrase;
      * lower-case the result.
    """
    # About one call in five passes the input through unchanged.
    if random.random() < 0.2:
        return text

    short_forms = {
        "Python": "Py", "PostgreSQL": "Postgres", "JavaScript": "JS",
        "React": "ReactJS", "Machine Learning": "ML", "Kubernetes": "K8s",
        "TypeScript": "TS", "Amazon Web Services": "AWS", "Google Cloud": "GCP"
    }

    # The abbreviation draw is only made for skills that have a short form.
    if is_skill and text in short_forms:
        if random.random() > 0.4:
            return short_forms[text]

    resume_prefixes = [
        "Highly skilled in",
        "Practical knowledge of",
        "Working with",
        "Extensive experience in",
    ]
    # The draw is made on every call; the prefix is applied to non-skills only.
    if random.random() > 0.7 and not is_skill:
        chosen_prefix = random.choice(resume_prefixes)
        text = f"{chosen_prefix} {text}"

    # Occasional case noise.
    if random.random() > 0.8:
        return text.lower()
    return text
# ---------------------------------------------------------------------
# DATASET GENERATION
# ---------------------------------------------------------------------
def generate_bench_dataset(num_candidates=100):
    """Build paired synthetic candidates and job-description queries.

    Domains are assigned round-robin over four fixed profiles. For every
    index one candidate (with noisy skills and an optional certification)
    and one clean query pointing at that candidate via ``relevant_id`` are
    produced.

    Returns:
        A ``(candidates, queries)`` tuple of two lists of dicts, each of
        length ``num_candidates``.
    """
    print(f"๐Ÿ› ๏ธ Generating N={num_candidates} Real-World Synthetic Dataset...")

    domains = [
        ("Cloud_Architect", ["AWS", "Terraform", "Kubernetes", "Docker"], ["Solutions Associate", "AWS Architect"]),
        ("Backend_Dev", ["Python", "FastAPI", "PostgreSQL", "Redis"], ["Python Cert", "FastAPI Expert"]),
        ("Frontend_Dev", ["React", "TypeScript", "Tailwind", "Next.js"], ["Meta React Cert", "JS Expert"]),
        ("Data_Science", ["Python", "PyTorch", "SQL", "Pandas"], ["TensorFlow Cert", "Data Pro"]),
    ]
    seniority_levels = ["Junior", "Senior", "Lead"]

    candidates = []
    queries = []  # JDs

    # One candidate and one matching query per index, so the pairs stay balanced.
    for idx in range(num_candidates):
        domain_name, skills, certs = domains[idx % len(domains)]
        level = random.choice(seniority_levels)

        # --- Candidate record (skills carry simulated noise) ---
        cand_id = f"cand_{idx}_{domain_name}"
        noisy_skills = [inject_real_world_noise(skill, True) for skill in skills]
        # Roughly half of the candidates hold the domain's first certification.
        held_certs = [certs[0]] if random.random() > 0.5 else []
        candidate = {
            "id": cand_id,
            "skills": noisy_skills,
            "tech_skills": noisy_skills,  # Project uses both
            "experience": [f"Developed {domain_name} solutions at Tech {idx}."],
            "certifications": held_certs,
            "full_text": f"{level} {domain_name}. Skills: {', '.join(noisy_skills)}",
        }
        candidates.append(candidate)

        # --- Matching query (JD), written with the clean skill names ---
        role_title = domain_name.replace('_', ' ')
        first_skill, second_skill, third_skill = skills[0], skills[1], skills[2]
        jd_text = (
            f"We are looking for a {level} {role_title}. "
            f"Must have expertise in {first_skill}, {second_skill}, and {third_skill}."
        )
        structured_jd = {
            "skills": skills,
            "tech_skills": skills,
            "experience": [f"{level} {domain_name} experience."],
            "certifications": certs,
        }
        queries.append({
            "query": jd_text,
            "relevant_id": cand_id,
            "jd_structured": structured_jd,
        })

    return candidates, queries
# ---------------------------------------------------------------------
# BENCHMARK RUNNER
# ---------------------------------------------------------------------
def run_benchmark():
    """Benchmark four JD-to-candidate matching methods on synthetic data.

    Steps:
      1. Load the ``all-MiniLM-L6-v2`` and ``BAAI/bge-m3`` models on CUDA
         when available, otherwise on CPU.
      2. Generate 250 candidate/query pairs and dump them to
         ``synthetic_dataset_adversarial.json``.
      3. Embed every candidate: flattened text with both models, plus
         per-field BGE vectors (skills, tech_skills, experience, certs).
      4. For each query, score all candidates with every method, rank them,
         and accumulate MRR, Recall@1 and Recall@3 for the paired candidate.
      5. Print a results table and write the averaged metrics to
         ``match_benchmark_results.json``.

    Both output files are written to the current working directory.
    Returns ``None``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"๐Ÿš€ Loading Models on {device}...", flush=True)

    # Load Models
    bert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    bge_model = SentenceTransformer('BAAI/bge-m3', device=device)

    candidates, queries = generate_bench_dataset(250)

    # Save the synthetic dataset for inspection. This happens before any
    # embedding is attached, so the dump holds only the generated records.
    with open("synthetic_dataset_adversarial.json", "w", encoding="utf-8") as f:
        json.dump({"candidates": candidates, "queries": queries}, f, indent=4)
    print(f"๐Ÿ’พ Saved generated synthetic dataset to 'synthetic_dataset_adversarial.json'", flush=True)

    # Pre-calculate Candidate Embeddings
    print("๐Ÿง  Indexing Candidates...")
    start_idx = time.time()
    for i, c in enumerate(candidates):
        # BERT Flattened
        c["bert_vec"] = bert_model.encode(c["full_text"])
        # BGE Flattened
        c["bge_flat_vec"] = bge_model.encode(c["full_text"])
        # BGE Granular (Project Method)
        skills_vec = bge_model.encode(" ".join(c["skills"]))
        c["bge_granular"] = {
            "skills": skills_vec,
            "tech_skills": bge_model.encode(" ".join(c["tech_skills"])),
            "experience": bge_model.encode(" ".join(c["experience"])),
            # Candidates without certifications get a zero vector, which
            # cosine_similarity scores as 0.0. Its shape is taken from a real
            # embedding instead of hard-coding the model's output width.
            "certs": (
                bge_model.encode(" ".join(c["certifications"]))
                if c["certifications"]
                else np.zeros_like(skills_vec)
            ),
        }
        if (i + 1) % 50 == 0:
            print(f" -> Indexed {i+1}/{len(candidates)} candidates...", flush=True)
    print(f"โœ… Indexed in {time.time() - start_idx:.2f}s")

    # Evaluation Loops
    methods = ["Jaccard_Baseline", "BERT_Flattened", "BGE_Flattened", "BGE_Granular_Weighted"]
    results = {m: {"mrr": 0, "r1": 0, "r3": 0} for m in methods}
    # Per-field weights of the granular score.
    weights = {"skills": 0.35, "tech_skills": 0.35, "experience": 0.20, "certs": 0.10}

    print("\nEvaluating Queries...")
    for i, q in enumerate(queries):
        target_id = q["relevant_id"]
        jd_text = q["query"]
        jd_s = q["jd_structured"]

        # Embed Query
        q_bert = bert_model.encode(jd_text)
        q_bge_flat = bge_model.encode(jd_text)
        q_bge_g = {
            "skills": bge_model.encode(" ".join(jd_s["skills"])),
            "tech_skills": bge_model.encode(" ".join(jd_s["tech_skills"])),
            "experience": bge_model.encode(" ".join(jd_s["experience"])),
            "certs": bge_model.encode(" ".join(jd_s["certifications"])),
        }
        if (i + 1) % 25 == 0:
            print(f" -> Evaluated {i+1}/{len(queries)} queries...", flush=True)

        # Calculate scores for all candidates
        cand_scores = []
        for c in candidates:
            granular = c["bge_granular"]
            # 1. Jaccard
            jac = jaccard_similarity(jd_s["skills"], c["skills"])
            # 2. BERT
            ber = cosine_similarity(q_bert, c["bert_vec"])
            # 3. BGE Flat
            bgf = cosine_similarity(q_bge_flat, c["bge_flat_vec"])
            # 4. BGE Granular Weighted
            bgg = (
                cosine_similarity(q_bge_g["skills"], granular["skills"]) * weights["skills"] +
                cosine_similarity(q_bge_g["tech_skills"], granular["tech_skills"]) * weights["tech_skills"] +
                cosine_similarity(q_bge_g["experience"], granular["experience"]) * weights["experience"] +
                cosine_similarity(q_bge_g["certs"], granular["certs"]) * weights["certs"]
            )
            cand_scores.append({
                "id": c["id"],
                "Jaccard_Baseline": jac,
                "BERT_Flattened": ber,
                "BGE_Flattened": bgf,
                "BGE_Granular_Weighted": bgg,
            })

        # Rank and Calc Metrics
        for m in methods:
            # sorted() is stable, so tied candidates keep their dataset order.
            sorted_cands = sorted(cand_scores, key=lambda x: x[m], reverse=True)
            # 1-based rank of the paired candidate. A distinct loop name is
            # used so the query index ``i`` is not visually shadowed.
            rank = next(
                pos
                for pos, scored in enumerate(sorted_cands, start=1)
                if scored["id"] == target_id
            )
            results[m]["mrr"] += (1.0 / rank)
            if rank == 1:
                results[m]["r1"] += 1
            if rank <= 3:
                results[m]["r3"] += 1

    # Print Results Table
    num_q = len(queries)
    print("\n" + "="*65)
    print(f"{'Method':<25} | {'MRR':<8} | {'Recall@1':<10} | {'Recall@3':<10}")
    print("-" * 65)
    for m in methods:
        mrr = results[m]["mrr"] / num_q
        r1 = (results[m]["r1"] / num_q) * 100
        r3 = (results[m]["r3"] / num_q) * 100
        print(f"{m:<25} | {mrr:.4f} | {r1:>8.1f}% | {r3:>8.1f}%", flush=True)
    print("="*65, flush=True)

    # Save to file (metrics are stored as fractions, not percentages).
    summary = {
        m: {
            "mrr": results[m]["mrr"] / num_q,
            "r1": results[m]["r1"] / num_q,
            "r3": results[m]["r3"] / num_q,
        }
        for m in methods
    }
    # Explicit encoding, matching the dataset dump above, so the output does
    # not depend on the platform's default locale encoding.
    with open("match_benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=4)
    print(f"\n๐Ÿ“„ Results saved to 'match_benchmark_results.json'", flush=True)
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_benchmark()