"""Synthetic benchmark for JD-to-candidate matching strategies.

Generates a noisy synthetic candidate pool plus one clean job description
(JD) per candidate, then ranks every candidate for every JD with four
methods -- skill-set Jaccard, MiniLM flat embeddings, BGE-M3 flat embeddings
and BGE-M3 per-field weighted embeddings -- and reports MRR, Recall@1 and
Recall@3 for each method.
"""

import sys
import os
import time
import json
import random

import numpy as np
import torch

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    # The scoring helpers and the dataset generator do not need the embedding
    # stack; run_benchmark() raises a clear error when it is missing.
    SentenceTransformer = None

# Set encoding for Windows terminals
# Removing potentially problematic wrapper for background logging
# if sys.platform == "win32":
#     import io
#     sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Add backend to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

# ---------------------------------------------------------------------
# CONSTANTS
# ---------------------------------------------------------------------

# Canonical skill name -> informal spelling used by the noise injector.
_ABBREVIATIONS = {
    "Python": "Py",
    "PostgreSQL": "Postgres",
    "JavaScript": "JS",
    "React": "ReactJS",
    "Machine Learning": "ML",
    "Kubernetes": "K8s",
    "TypeScript": "TS",
    "Amazon Web Services": "AWS",
    "Google Cloud": "GCP",
}

# "Messy" resume filler phrases prepended to free-text (non-skill) strings.
_FILLERS = [
    "Highly skilled in",
    "Practical knowledge of",
    "Working with",
    "Extensive experience in",
]

# (domain name, skills, certifications) templates cycled over by the generator.
_DOMAINS = [
    ("Cloud_Architect", ["AWS", "Terraform", "Kubernetes", "Docker"], ["Solutions Associate", "AWS Architect"]),
    ("Backend_Dev", ["Python", "FastAPI", "PostgreSQL", "Redis"], ["Python Cert", "FastAPI Expert"]),
    ("Frontend_Dev", ["React", "TypeScript", "Tailwind", "Next.js"], ["Meta React Cert", "JS Expert"]),
    ("Data_Science", ["Python", "PyTorch", "SQL", "Pandas"], ["TensorFlow Cert", "Data Pro"]),
]

_LEVELS = ["Junior", "Senior", "Lead"]

_METHODS = ["Jaccard_Baseline", "BERT_Flattened", "BGE_Flattened", "BGE_Granular_Weighted"]

# Per-field weights of the granular BGE score (insertion order = summation order).
_GRANULAR_WEIGHTS = {"skills": 0.35, "tech_skills": 0.35, "experience": 0.20, "certs": 0.10}

_DATASET_PATH = "synthetic_dataset_adversarial.json"
_RESULTS_PATH = "match_benchmark_results.json"


# ---------------------------------------------------------------------
# UTILS & NOISE SIMULATION
# ---------------------------------------------------------------------
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors, or 0.0 when undefined.

    The similarity is treated as 0.0 when either vector is ``None`` or has
    zero norm, so "missing" embeddings simply contribute nothing to a score.
    """
    if v1 is None or v2 is None:
        return 0.0
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(v1, v2) / (norm1 * norm2)


def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections of labels.

    Items are compared as stripped, lower-cased strings. Returns 0.0 when
    either collection is empty.
    """
    s1 = {str(x).lower().strip() for x in list1}
    s2 = {str(x).lower().strip() for x in list2}
    if not s1 or not s2:
        return 0.0
    return len(s1 & s2) / len(s1 | s2)


def inject_real_world_noise(text, is_skill=False, rng=None):
    """Simulates typos, abbreviations, and informal language.

    Args:
        text: The clean string to perturb.
        is_skill: When true, ``text`` may be swapped for a known abbreviation
            and never receives a filler prefix.
        rng: Optional ``random.Random``-like source of randomness; defaults
            to the module-level ``random`` generator.

    Returns:
        The (possibly unchanged) noisy string.
    """
    rng = random if rng is None else rng

    if rng.random() < 0.2:
        return text  # 20% keep clean

    # Apply abbreviation
    if is_skill and text in _ABBREVIATIONS and rng.random() > 0.4:
        return _ABBREVIATIONS[text]

    # Inject "Messy" Resume fillers. The draw happens before the is_skill
    # check so both kinds of text consume the same number of random values.
    if rng.random() > 0.7 and not is_skill:
        text = f"{rng.choice(_FILLERS)} {text}"

    # Random case noise
    if rng.random() > 0.8:
        text = text.lower()

    return text


# ---------------------------------------------------------------------
# DATASET GENERATION
# ---------------------------------------------------------------------
def generate_bench_dataset(num_candidates=100, seed=None):
    """Build a synthetic candidate pool and one matching JD per candidate.

    Candidates cycle through the domain templates and carry noisy skill
    strings; each JD is the clean, formal counterpart of one candidate and
    records that candidate's id as its single relevant result.

    Args:
        num_candidates: Number of candidate/JD pairs to generate.
        seed: Optional seed. When given, generation uses a private
            ``random.Random(seed)`` and is reproducible; when ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A ``(candidates, queries)`` tuple of lists of dicts.
    """
    print(f"🛠️ Generating N={num_candidates} Real-World Synthetic Dataset...")
    rng = random if seed is None else random.Random(seed)

    candidates = []
    queries = []  # JDs

    # We generate balanced pairs
    for i in range(num_candidates):
        domain_name, skills, certs = _DOMAINS[i % len(_DOMAINS)]
        level = rng.choice(_LEVELS)

        # 1. The Candidate Data
        cand_id = f"cand_{i}_{domain_name}"
        noisy_skills = [inject_real_world_noise(s, True, rng) for s in skills]
        candidates.append({
            "id": cand_id,
            "skills": noisy_skills,
            "tech_skills": noisy_skills,  # Project uses both
            "experience": [f"Developed {domain_name} solutions at Tech {i}."],
            "certifications": [certs[0]] if rng.random() > 0.5 else [],
            "full_text": f"{level} {domain_name}. Skills: {', '.join(noisy_skills)}",
        })

        # 2. The Matching Query (JD) - Formal Clean Version
        jd_text = (
            f"We are looking for a {level} {domain_name.replace('_', ' ')}. "
            f"Must have expertise in {skills[0]}, {skills[1]}, and {skills[2]}."
        )
        queries.append({
            "query": jd_text,
            "relevant_id": cand_id,
            "jd_structured": {
                # Copies keep the shared domain templates from being aliased.
                "skills": list(skills),
                "tech_skills": list(skills),
                "experience": [f"{level} {domain_name} experience."],
                "certifications": list(certs),
            },
        })

    return candidates, queries


# ---------------------------------------------------------------------
# BENCHMARK RUNNER
# ---------------------------------------------------------------------
def _encode_granular(model, structured):
    """Embed each structured field of a candidate/JD record separately.

    An empty certification list yields ``None`` instead of a fixed-size zero
    vector; ``cosine_similarity`` scores both as 0.0, and ``None`` avoids
    hard-coding the model's embedding width.
    """
    certs = structured["certifications"]
    return {
        "skills": model.encode(" ".join(structured["skills"])),
        "tech_skills": model.encode(" ".join(structured["tech_skills"])),
        "experience": model.encode(" ".join(structured["experience"])),
        "certs": model.encode(" ".join(certs)) if certs else None,
    }


def _granular_score(query_vecs, cand_vecs):
    """Return the weighted sum of per-field cosine similarities."""
    return sum(
        cosine_similarity(query_vecs[field], cand_vecs[field]) * weight
        for field, weight in _GRANULAR_WEIGHTS.items()
    )


def _rank_of_target(cand_scores, method, target_id):
    """Return the 1-based rank of ``target_id`` under ``method``.

    ``sorted`` is stable, so candidates with equal scores keep their original
    order; ties are therefore broken by position in the candidate list.
    """
    ranked = sorted(cand_scores, key=lambda row: row[method], reverse=True)
    return next(pos for pos, row in enumerate(ranked, start=1) if row["id"] == target_id)


def run_benchmark(num_candidates=250, seed=None):
    """Run the full benchmark and write the dataset and results to disk.

    Loads both embedding models, generates the synthetic dataset, indexes the
    candidates, ranks all candidates for every JD with each method, prints a
    results table and saves the summary as JSON.

    Args:
        num_candidates: Number of candidate/JD pairs to benchmark on.
        seed: Optional seed forwarded to ``generate_bench_dataset``.

    Raises:
        ValueError: If ``num_candidates`` is smaller than 1.
        ImportError: If ``sentence_transformers`` is not installed.
    """
    if num_candidates < 1:
        raise ValueError("num_candidates must be at least 1")
    if SentenceTransformer is None:
        raise ImportError("sentence-transformers is required to run the benchmark")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🚀 Loading Models on {device}...", flush=True)

    # Load Models
    bert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    bge_model = SentenceTransformer('BAAI/bge-m3', device=device)

    candidates, queries = generate_bench_dataset(num_candidates, seed=seed)

    # Save the synthetic dataset to a JSON file for inspection. This must
    # happen before embeddings (not JSON-serialisable) are attached below.
    with open(_DATASET_PATH, "w", encoding="utf-8") as f:
        json.dump({"candidates": candidates, "queries": queries}, f, indent=4)
    print("💾 Saved generated synthetic dataset to 'synthetic_dataset_adversarial.json'", flush=True)

    # Pre-calculate Candidate Embeddings
    print("🧠 Indexing Candidates...")
    start_idx = time.time()
    for i, c in enumerate(candidates):
        # BERT Flattened
        c["bert_vec"] = bert_model.encode(c["full_text"])
        # BGE Flattened
        c["bge_flat_vec"] = bge_model.encode(c["full_text"])
        # BGE Granular (Project Method)
        c["bge_granular"] = _encode_granular(bge_model, c)

        if (i + 1) % 50 == 0:
            print(f"   -> Indexed {i+1}/{len(candidates)} candidates...", flush=True)

    print(f"✅ Indexed in {time.time() - start_idx:.2f}s")

    # Evaluation Loops
    results = {m: {"mrr": 0, "r1": 0, "r3": 0} for m in _METHODS}

    print("\nEvaluating Queries...")
    for i, q in enumerate(queries):
        target_id = q["relevant_id"]
        jd_text = q["query"]
        jd_s = q["jd_structured"]

        # Embed Query
        q_bert = bert_model.encode(jd_text)
        q_bge_flat = bge_model.encode(jd_text)
        q_bge_g = _encode_granular(bge_model, jd_s)

        if (i + 1) % 25 == 0:
            print(f"   -> Evaluated {i+1}/{len(queries)} queries...", flush=True)

        # Calculate scores for all candidates
        cand_scores = [
            {
                "id": c["id"],
                "Jaccard_Baseline": jaccard_similarity(jd_s["skills"], c["skills"]),
                "BERT_Flattened": cosine_similarity(q_bert, c["bert_vec"]),
                "BGE_Flattened": cosine_similarity(q_bge_flat, c["bge_flat_vec"]),
                "BGE_Granular_Weighted": _granular_score(q_bge_g, c["bge_granular"]),
            }
            for c in candidates
        ]

        # Rank and Calc Metrics
        for m in _METHODS:
            rank = _rank_of_target(cand_scores, m, target_id)
            results[m]["mrr"] += 1.0 / rank
            if rank == 1:
                results[m]["r1"] += 1
            if rank <= 3:
                results[m]["r3"] += 1

    # Print Results Table
    num_q = len(queries)
    print("\n" + "=" * 65)
    print(f"{'Method':<25} | {'MRR':<8} | {'Recall@1':<10} | {'Recall@3':<10}")
    print("-" * 65)
    for m in _METHODS:
        mrr = results[m]["mrr"] / num_q
        r1 = (results[m]["r1"] / num_q) * 100
        r3 = (results[m]["r3"] / num_q) * 100
        print(f"{m:<25} | {mrr:.4f}   | {r1:>8.1f}% | {r3:>8.1f}%", flush=True)
    print("=" * 65, flush=True)

    # Save to file
    summary = {
        m: {
            "mrr": results[m]["mrr"] / num_q,
            "r1": results[m]["r1"] / num_q,
            "r3": results[m]["r3"] / num_q,
        }
        for m in _METHODS
    }
    with open(_RESULTS_PATH, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=4)
    print("\n📄 Results saved to 'match_benchmark_results.json'", flush=True)


if __name__ == "__main__":
    run_benchmark()