Spaces:
Sleeping
Sleeping
File size: 9,282 Bytes
4b3a33f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | import sys
import os
import time
import json
import random
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
# UTF-8 stdout wrapper for Windows terminals.
# Disabled (kept commented out below) because it is potentially problematic for background logging.
# if sys.platform == "win32":
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add backend to path: append the directory three levels above this file's
# own directory to the module search path.
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir)
    )
)
# ---------------------------------------------------------------------
# UTILS & NOISE SIMULATION
# ---------------------------------------------------------------------
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    The result is ``0.0`` when either argument is ``None`` or when either
    vector has a norm of exactly zero; otherwise it is the dot product
    divided by the product of the two norms.
    """
    if any(vec is None for vec in (v1, v2)):
        return 0.0

    len_a, len_b = (np.linalg.norm(vec) for vec in (v1, v2))
    if len_a == 0 or len_b == 0:
        # A zero vector has no direction; report "no similarity".
        return 0.0

    return np.dot(v1, v2) / (len_a * len_b)
def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections of labels.

    Every item is converted with ``str``, lower-cased and stripped of
    surrounding whitespace before comparison. If either normalised set is
    empty the result is ``0.0``.
    """
    def _normalise(items):
        return {str(item).lower().strip() for item in items}

    left = _normalise(list1)
    right = _normalise(list2)
    if not (left and right):
        return 0.0
    return len(left & right) / len(left | right)
def inject_real_world_noise(text, is_skill=False):
    """Return *text* with random resume-style noise applied.

    Depending on draws from the module-level ``random`` generator the input
    is returned unchanged, replaced by a short form (skills only), prefixed
    with a filler phrase (non-skills only), and/or lower-cased.
    """
    # Roughly one call in five leaves the input clean.
    if random.random() < 0.2:
        return text

    short_forms = {
        "Python": "Py",
        "PostgreSQL": "Postgres",
        "JavaScript": "JS",
        "React": "ReactJS",
        "Machine Learning": "ML",
        "Kubernetes": "K8s",
        "TypeScript": "TS",
        "Amazon Web Services": "AWS",
        "Google Cloud": "GCP",
    }
    # Known skill names are swapped for their short form; a draw is only
    # consumed here when the text is a skill that has a short form.
    if is_skill and text in short_forms and random.random() > 0.4:
        return short_forms[text]

    lead_ins = (
        "Highly skilled in",
        "Practical knowledge of",
        "Working with",
        "Extensive experience in",
    )
    # The draw comes before the is_skill test, so it is consumed for skills
    # as well even though only non-skill text can receive a filler.
    if random.random() > 0.7 and not is_skill:
        text = f"{random.choice(lead_ins)} {text}"

    # Occasional case noise.
    return text.lower() if random.random() > 0.8 else text
# ---------------------------------------------------------------------
# DATASET GENERATION
# ---------------------------------------------------------------------
def generate_bench_dataset(num_candidates=100, seed=None):
    """Build a synthetic candidate pool with one job-description query each.

    Candidates cycle through four fixed domains. A candidate's skills are
    passed through ``inject_real_world_noise``; the paired query keeps the
    clean skill names and names that candidate as its single relevant hit.

    Args:
        num_candidates: Number of candidate/query pairs to generate.
        seed: Optional seed. When given, the module-level ``random``
            generator is re-seeded so repeated calls yield the same
            dataset. ``None`` (the default) leaves the generator untouched.

    Returns:
        A ``(candidates, queries)`` tuple of lists of plain dicts.
        ``queries[i]["relevant_id"]`` equals ``candidates[i]["id"]``.
    """
    if seed is not None:
        random.seed(seed)

    print(f"🛠️ Generating N={num_candidates} Real-World Synthetic Dataset...")

    # (domain name, core skills, certifications)
    domains = [
        ("Cloud_Architect", ["AWS", "Terraform", "Kubernetes", "Docker"], ["Solutions Associate", "AWS Architect"]),
        ("Backend_Dev", ["Python", "FastAPI", "PostgreSQL", "Redis"], ["Python Cert", "FastAPI Expert"]),
        ("Frontend_Dev", ["React", "TypeScript", "Tailwind", "Next.js"], ["Meta React Cert", "JS Expert"]),
        ("Data_Science", ["Python", "PyTorch", "SQL", "Pandas"], ["TensorFlow Cert", "Data Pro"]),
    ]

    candidates = []
    queries = []  # Job descriptions (JDs)

    # Balanced pairs: domains are assigned round-robin.
    for i in range(num_candidates):
        domain_name, skills, certs = domains[i % len(domains)]
        level = random.choice(["Junior", "Senior", "Lead"])

        # 1. The candidate record (noisy skills).
        cand_id = f"cand_{i}_{domain_name}"
        noisy_skills = [inject_real_world_noise(s, True) for s in skills]
        candidates.append({
            "id": cand_id,
            "skills": noisy_skills,
            # The project uses both fields; copy so they are independent lists.
            "tech_skills": list(noisy_skills),
            "experience": [f"Developed {domain_name} solutions at Tech {i}."],
            "certifications": [certs[0]] if random.random() > 0.5 else [],
            "full_text": f"{level} {domain_name}. Skills: {', '.join(noisy_skills)}",
        })

        # 2. The matching query (JD): formal, clean wording.
        jd_text = (
            f"We are looking for a {level} {domain_name.replace('_', ' ')}. "
            f"Must have expertise in {skills[0]}, {skills[1]}, and {skills[2]}."
        )
        queries.append({
            "query": jd_text,
            "relevant_id": cand_id,
            "jd_structured": {
                # Copies: otherwise every query of a domain would share the
                # same list objects and a mutation would affect all of them.
                "skills": list(skills),
                "tech_skills": list(skills),
                "experience": [f"{level} {domain_name} experience."],
                "certifications": list(certs),
            },
        })

    return candidates, queries
# ---------------------------------------------------------------------
# BENCHMARK RUNNER
# ---------------------------------------------------------------------
def run_benchmark():
    """Run the matching benchmark end to end and report ranking metrics.

    Steps:
      1. Load the 'all-MiniLM-L6-v2' and 'BAAI/bge-m3' SentenceTransformer
         models on CUDA when available, otherwise on CPU.
      2. Generate 250 synthetic candidate/query pairs and dump them to
         'synthetic_dataset_adversarial.json'.
      3. Embed every candidate: flattened text with both models plus one
         BGE vector per structured field.
      4. Score every query against every candidate with four methods
         (Jaccard on skills, BERT flattened, BGE flattened, BGE per-field
         weighted sum).
      5. Accumulate MRR, Recall@1 and Recall@3 per method, print a table
         and write the averages to 'match_benchmark_results.json'.

    Returns:
        None. Results are printed and written to the two JSON files.
    """
    # NOTE(review): the leading emoji in the status messages were restored
    # from mis-encoded text; the glyphs on the "Loading Models" and
    # "Results saved" lines could not be recovered exactly - confirm.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🚀 Loading Models on {device}...", flush=True)

    # Load models
    bert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    bge_model = SentenceTransformer('BAAI/bge-m3', device=device)

    candidates, queries = generate_bench_dataset(250)

    # Save the synthetic dataset for inspection. This happens before the
    # embeddings are attached to the candidate dicts below.
    with open("synthetic_dataset_adversarial.json", "w", encoding="utf-8") as f:
        json.dump({"candidates": candidates, "queries": queries}, f, indent=4)
    print("💾 Saved generated synthetic dataset to 'synthetic_dataset_adversarial.json'", flush=True)

    # Pre-calculate candidate embeddings
    print("🧠 Indexing Candidates...")
    start_idx = time.time()
    for i, c in enumerate(candidates):
        # BERT flattened
        c["bert_vec"] = bert_model.encode(c["full_text"])
        # BGE flattened
        c["bge_flat_vec"] = bge_model.encode(c["full_text"])
        # BGE granular (project method): one vector per structured field.
        c["bge_granular"] = {
            "skills": bge_model.encode(" ".join(c["skills"])),
            "tech_skills": bge_model.encode(" ".join(c["tech_skills"])),
            "experience": bge_model.encode(" ".join(c["experience"])),
            # No certifications -> zero vector, for which cosine_similarity
            # returns 0.0, so the certs term contributes nothing.
            "certs": bge_model.encode(" ".join(c["certifications"])) if c["certifications"] else np.zeros(1024),
        }
        if (i + 1) % 50 == 0:
            print(f" -> Indexed {i+1}/{len(candidates)} candidates...", flush=True)
    print(f"✅ Indexed in {time.time() - start_idx:.2f}s")

    # Evaluation loops
    methods = ["Jaccard_Baseline", "BERT_Flattened", "BGE_Flattened", "BGE_Granular_Weighted"]
    results = {m: {"mrr": 0, "r1": 0, "r3": 0} for m in methods}
    weights = {"skills": 0.35, "tech_skills": 0.35, "experience": 0.20, "certs": 0.10}

    print("\nEvaluating Queries...")
    for i, q in enumerate(queries):
        target_id = q["relevant_id"]
        jd_text = q["query"]
        jd_s = q["jd_structured"]

        # Embed query
        q_bert = bert_model.encode(jd_text)
        q_bge_flat = bge_model.encode(jd_text)
        q_bge_g = {
            "skills": bge_model.encode(" ".join(jd_s["skills"])),
            "tech_skills": bge_model.encode(" ".join(jd_s["tech_skills"])),
            "experience": bge_model.encode(" ".join(jd_s["experience"])),
            "certs": bge_model.encode(" ".join(jd_s["certifications"])),
        }
        if (i + 1) % 25 == 0:
            print(f" -> Evaluated {i+1}/{len(queries)} queries...", flush=True)

        # Calculate scores for all candidates
        cand_scores = []
        for c in candidates:
            c_bge_g = c["bge_granular"]
            # 1. Jaccard
            jac = jaccard_similarity(jd_s["skills"], c["skills"])
            # 2. BERT
            ber = cosine_similarity(q_bert, c["bert_vec"])
            # 3. BGE flat
            bgf = cosine_similarity(q_bge_flat, c["bge_flat_vec"])
            # 4. BGE granular weighted
            bgg = (
                cosine_similarity(q_bge_g["skills"], c_bge_g["skills"]) * weights["skills"] +
                cosine_similarity(q_bge_g["tech_skills"], c_bge_g["tech_skills"]) * weights["tech_skills"] +
                cosine_similarity(q_bge_g["experience"], c_bge_g["experience"]) * weights["experience"] +
                cosine_similarity(q_bge_g["certs"], c_bge_g["certs"]) * weights["certs"]
            )
            cand_scores.append({
                "id": c["id"],
                "Jaccard_Baseline": jac,
                "BERT_Flattened": ber,
                "BGE_Flattened": bgf,
                "BGE_Granular_Weighted": bgg,
            })

        # Rank and calculate metrics
        for m in methods:
            # sorted() is stable, so candidates with equal scores keep their
            # generation order; tied scores are therefore not broken randomly.
            ranked = sorted(cand_scores, key=lambda row: row[m], reverse=True)
            rank = next(
                pos for pos, row in enumerate(ranked, start=1)
                if row["id"] == target_id
            )
            results[m]["mrr"] += 1.0 / rank
            if rank == 1:
                results[m]["r1"] += 1
            if rank <= 3:
                results[m]["r3"] += 1

    # Print results table
    num_q = len(queries)
    print("\n" + "=" * 65)
    print(f"{'Method':<25} | {'MRR':<8} | {'Recall@1':<10} | {'Recall@3':<10}")
    print("-" * 65)
    for m in methods:
        mrr = results[m]["mrr"] / num_q
        r1 = (results[m]["r1"] / num_q) * 100
        r3 = (results[m]["r3"] / num_q) * 100
        print(f"{m:<25} | {mrr:.4f} | {r1:>8.1f}% | {r3:>8.1f}%", flush=True)
    print("=" * 65, flush=True)

    # Save to file (recall values are stored as fractions, not percentages).
    summary = {
        m: {
            "mrr": results[m]["mrr"] / num_q,
            "r1": results[m]["r1"] / num_q,
            "r3": results[m]["r3"] / num_q,
        }
        for m in methods
    }
    with open("match_benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=4)
    print("\n📊 Results saved to 'match_benchmark_results.json'", flush=True)
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run_benchmark()
|