Spaces:
Sleeping
Sleeping
| import sys | |
| import os | |
| import time | |
| import json | |
| import random | |
| import numpy as np | |
| import torch | |
| from sentence_transformers import SentenceTransformer | |
| # Set encoding for Windows terminals | |
| # Removing potentially problematic wrapper for background logging | |
| # if sys.platform == "win32": | |
| # import io | |
| # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') | |
| # Add backend to path | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) | |
| # --------------------------------------------------------------------- | |
| # UTILS & NOISE SIMULATION | |
| # --------------------------------------------------------------------- | |
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a built-in ``float``.

    Args:
        v1: First vector (anything NumPy can treat as a 1-D array), or ``None``.
        v2: Second vector of the same length, or ``None``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either input is
        ``None`` or has zero norm.
    """
    if v1 is None or v2 is None:
        return 0.0
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    # A zero vector has no direction; report "no similarity" rather than
    # dividing by zero.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    # Coerce to a plain float: float32 inputs would otherwise produce an
    # np.float32, which json.dump cannot serialize and which is inconsistent
    # with the 0.0 returned by the guards above.
    return float(np.dot(v1, v2) / (norm1 * norm2))
def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections of labels.

    Items are compared as strings, case-insensitively and ignoring
    surrounding whitespace.

    Args:
        list1: Iterable of items (converted with ``str``), or ``None``.
        list2: Iterable of items (converted with ``str``), or ``None``.

    Returns:
        ``|A & B| / |A | B|`` as a float, or ``0.0`` when either side is
        ``None`` or empty.
    """
    # Treat a missing field like an empty one instead of raising TypeError.
    if list1 is None or list2 is None:
        return 0.0
    s1 = {str(x).lower().strip() for x in list1}
    s2 = {str(x).lower().strip() for x in list2}
    if not s1 or not s2:
        return 0.0
    return len(s1 & s2) / len(s1 | s2)
# Canonical skill name -> informal short form used by the noise injector.
_ABBREVIATIONS = {
    "Python": "Py", "PostgreSQL": "Postgres", "JavaScript": "JS",
    "React": "ReactJS", "Machine Learning": "ML", "Kubernetes": "K8s",
    "TypeScript": "TS", "Amazon Web Services": "AWS", "Google Cloud": "GCP",
}

# Resume-style lead-in phrases prepended to non-skill text.
_RESUME_FILLERS = (
    "Highly skilled in",
    "Practical knowledge of",
    "Working with",
    "Extensive experience in",
)


def inject_real_world_noise(text, is_skill=False, *, rng=None):
    """Randomly perturb *text* with abbreviations, filler phrases and lowercasing.

    The perturbations are applied in this order:

    1. With probability 0.2 the text is returned untouched.
    2. If ``is_skill`` is true and the text has a known abbreviation, the
       abbreviation is returned with probability 0.6.
    3. If ``is_skill`` is false, a resume-style filler phrase is prepended
       with probability 0.3.
    4. The result is lowercased with probability 0.2.

    No character-level typos are generated.

    Args:
        text: The string to perturb.
        is_skill: True when *text* is a bare skill name (enables
            abbreviation, disables filler phrases).
        rng: Optional source of randomness exposing ``random()`` and
            ``choice()`` (e.g. ``random.Random(seed)``). Defaults to the
            module-level ``random`` generator.

    Returns:
        The possibly modified string.
    """
    rng = random if rng is None else rng

    if rng.random() < 0.2:
        return text

    # The draw only happens for abbreviable skills (short-circuit), which
    # keeps the sequence of random draws stable for seeded runs.
    if is_skill and text in _ABBREVIATIONS and rng.random() > 0.4:
        return _ABBREVIATIONS[text]

    # The draw is made even for skills so the random stream does not depend
    # on ``is_skill`` at this step.
    if rng.random() > 0.7 and not is_skill:
        text = f"{rng.choice(_RESUME_FILLERS)} {text}"

    if rng.random() > 0.8:
        text = text.lower()
    return text
| # --------------------------------------------------------------------- | |
| # DATASET GENERATION | |
| # --------------------------------------------------------------------- | |
def generate_bench_dataset(num_candidates=100, seed=None):
    """Build paired synthetic candidates and job-description queries.

    Candidates cycle through four fixed domains. Each candidate gets noisy
    skill names (via ``inject_real_world_noise``) and, with probability 0.5,
    the first certification of its domain. For every candidate one clean,
    formally worded query is generated whose ``relevant_id`` points back to
    that candidate.

    Args:
        num_candidates: Number of candidate/query pairs to generate.
        seed: Optional seed. When given, the module-level ``random``
            generator is re-seeded so the dataset is reproducible; note this
            affects all other users of the global generator.

    Returns:
        A ``(candidates, queries)`` tuple of two equally long lists of dicts.
    """
    if seed is not None:
        random.seed(seed)

    print(f"๐ ๏ธ Generating N={num_candidates} Real-World Synthetic Dataset...")

    # (domain name, required skills, certifications)
    domains = [
        ("Cloud_Architect", ["AWS", "Terraform", "Kubernetes", "Docker"], ["Solutions Associate", "AWS Architect"]),
        ("Backend_Dev", ["Python", "FastAPI", "PostgreSQL", "Redis"], ["Python Cert", "FastAPI Expert"]),
        ("Frontend_Dev", ["React", "TypeScript", "Tailwind", "Next.js"], ["Meta React Cert", "JS Expert"]),
        ("Data_Science", ["Python", "PyTorch", "SQL", "Pandas"], ["TensorFlow Cert", "Data Pro"]),
    ]
    levels = ["Junior", "Senior", "Lead"]

    candidates = []
    queries = []  # job descriptions

    for i in range(num_candidates):
        domain_name, skills, certs = domains[i % len(domains)]
        level = random.choice(levels)

        # 1. The candidate: informal, noisy skill spellings.
        cand_id = f"cand_{i}_{domain_name}"
        noisy_skills = [inject_real_world_noise(s, True) for s in skills]
        candidates.append({
            "id": cand_id,
            "skills": noisy_skills,
            # Same content as "skills", but a separate list so editing one
            # field does not silently change the other.
            "tech_skills": list(noisy_skills),
            "experience": [f"Developed {domain_name} solutions at Tech {i}."],
            "certifications": [certs[0]] if random.random() > 0.5 else [],
            "full_text": f"{level} {domain_name}. Skills: {', '.join(noisy_skills)}",
        })

        # 2. The matching query (JD): formal, clean wording.
        jd_text = (
            f"We are looking for a {level} {domain_name.replace('_', ' ')}. "
            f"Must have expertise in {skills[0]}, {skills[1]}, and {skills[2]}."
        )
        queries.append({
            "query": jd_text,
            "relevant_id": cand_id,
            "jd_structured": {
                # Copies: otherwise every query of a domain would share (and
                # could mutate) the template lists above.
                "skills": list(skills),
                "tech_skills": list(skills),
                "experience": [f"{level} {domain_name} experience."],
                "certifications": list(certs),
            },
        })

    return candidates, queries
| # --------------------------------------------------------------------- | |
| # BENCHMARK RUNNER | |
| # --------------------------------------------------------------------- | |
def run_benchmark():
    """Compare four candidate-retrieval methods on a synthetic dataset.

    Steps:
        1. Load the ``all-MiniLM-L6-v2`` and ``BAAI/bge-m3`` sentence
           encoders on CUDA when available, otherwise on CPU.
        2. Generate 250 candidate/query pairs and dump them to
           ``synthetic_dataset_adversarial.json``.
        3. Embed every candidate (flattened text with both models, plus
           per-field embeddings with the BGE model).
        4. For every query, score all candidates with each method, find the
           rank of the query's own candidate and accumulate MRR, Recall@1
           and Recall@3.
        5. Print a results table and write the averaged metrics to
           ``match_benchmark_results.json``.

    Both JSON files are written to the current working directory.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"๐ Loading Models on {device}...", flush=True)

    # Load models
    bert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    bge_model = SentenceTransformer('BAAI/bge-m3', device=device)

    candidates, queries = generate_bench_dataset(250)

    # Dump the dataset before embeddings are attached to the candidate dicts;
    # the embedding arrays could not be serialized by json.
    with open("synthetic_dataset_adversarial.json", "w", encoding="utf-8") as f:
        json.dump({"candidates": candidates, "queries": queries}, f, indent=4)
    print("๐พ Saved generated synthetic dataset to 'synthetic_dataset_adversarial.json'", flush=True)

    # Pre-calculate candidate embeddings
    print("๐ง Indexing Candidates...")
    start_idx = time.time()
    for done, cand in enumerate(candidates, start=1):
        full_text = cand["full_text"]
        # Flattened representations (one vector per candidate)
        cand["bert_vec"] = bert_model.encode(full_text)
        cand["bge_flat_vec"] = bge_model.encode(full_text)

        # Granular representation (one vector per field)
        certs = cand["certifications"]
        cand["bge_granular"] = {
            "skills": bge_model.encode(" ".join(cand["skills"])),
            "tech_skills": bge_model.encode(" ".join(cand["tech_skills"])),
            "experience": bge_model.encode(" ".join(cand["experience"])),
            # No certifications -> zero vector, which cosine_similarity
            # scores as 0.0. Sized from a real embedding instead of a
            # hard-coded width.
            "certs": (
                bge_model.encode(" ".join(certs))
                if certs
                else np.zeros_like(cand["bge_flat_vec"])
            ),
        }
        if done % 50 == 0:
            print(f" -> Indexed {done}/{len(candidates)} candidates...", flush=True)
    print(f"โ Indexed in {time.time() - start_idx:.2f}s")

    # Evaluation loops
    methods = ["Jaccard_Baseline", "BERT_Flattened", "BGE_Flattened", "BGE_Granular_Weighted"]
    results = {m: {"mrr": 0, "r1": 0, "r3": 0} for m in methods}
    weights = {"skills": 0.35, "tech_skills": 0.35, "experience": 0.20, "certs": 0.10}

    print("\nEvaluating Queries...")
    for q_no, q in enumerate(queries, start=1):
        target_id = q["relevant_id"]
        jd_text = q["query"]
        jd_s = q["jd_structured"]

        # Embed the query the same three ways as the candidates
        q_bert = bert_model.encode(jd_text)
        q_bge_flat = bge_model.encode(jd_text)
        q_bge_g = {
            "skills": bge_model.encode(" ".join(jd_s["skills"])),
            "tech_skills": bge_model.encode(" ".join(jd_s["tech_skills"])),
            "experience": bge_model.encode(" ".join(jd_s["experience"])),
            "certs": bge_model.encode(" ".join(jd_s["certifications"])),
        }
        if q_no % 25 == 0:
            print(f" -> Evaluated {q_no}/{len(queries)} queries...", flush=True)

        # Score every candidate with every method
        cand_scores = []
        for cand in candidates:
            granular = cand["bge_granular"]
            # Weighted sum of per-field cosine similarities
            weighted = sum(
                cosine_similarity(q_bge_g[field], granular[field]) * weight
                for field, weight in weights.items()
            )
            cand_scores.append({
                "id": cand["id"],
                "Jaccard_Baseline": jaccard_similarity(jd_s["skills"], cand["skills"]),
                "BERT_Flattened": cosine_similarity(q_bert, cand["bert_vec"]),
                "BGE_Flattened": cosine_similarity(q_bge_flat, cand["bge_flat_vec"]),
                "BGE_Granular_Weighted": weighted,
            })

        # Rank and accumulate metrics. sorted() is stable, so candidates with
        # equal scores keep their dataset order.
        for m in methods:
            ranked = sorted(cand_scores, key=lambda x: x[m], reverse=True)
            rank = next(
                pos for pos, entry in enumerate(ranked, start=1)
                if entry["id"] == target_id
            )
            results[m]["mrr"] += 1.0 / rank
            if rank == 1:
                results[m]["r1"] += 1
            if rank <= 3:
                results[m]["r3"] += 1

    # Print results table
    num_q = len(queries)
    print("\n" + "="*65)
    print(f"{'Method':<25} | {'MRR':<8} | {'Recall@1':<10} | {'Recall@3':<10}")
    print("-" * 65)
    for m in methods:
        mrr = results[m]["mrr"] / num_q
        r1 = (results[m]["r1"] / num_q) * 100
        r3 = (results[m]["r3"] / num_q) * 100
        print(f"{m:<25} | {mrr:.4f} | {r1:>8.1f}% | {r3:>8.1f}%", flush=True)
    print("="*65, flush=True)

    # Save averaged metrics (recalls stored as fractions, not percentages)
    summary = {
        m: {
            "mrr": results[m]["mrr"] / num_q,
            "r1": results[m]["r1"] / num_q,
            "r3": results[m]["r3"] / num_q,
        }
        for m in methods
    }
    # Explicit encoding, matching the dataset dump above.
    with open("match_benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=4)
    print("\n๐ Results saved to 'match_benchmark_results.json'", flush=True)
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run_benchmark()