import sys
import os
import time
import json
import random
import numpy as np
# Set encoding for Windows terminals
if sys.platform == "win32":
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add backend to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
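# NOTE: the relative '../../..' assumes this script sits three directory levels
# below the repository root, so that the `backend` package is importable from there.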
from backend.src.embeddings.local_embedder import generate_embedding
def cosine_similarity(v1, v2):
    """Cosine similarity between two vectors (assumes neither is the zero vector)."""
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
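# Quick sanity check (illustrative): a vector is perfectly similar to itself.
# >>> v = np.array([1.0, 2.0, 3.0])
# >>> round(float(cosine_similarity(v, v)), 6)
# 1.0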
def inject_noise(text, is_skill=False):
    """Simulates real-world messy resumes with abbreviations, typos, and lowercasing."""
    if random.random() < 0.3:  # 30% chance to leave perfectly clean
        return text
    abbreviations = {
        "Python": "Py", "PostgreSQL": "Postgres", "JavaScript": "JS",
        "React": "ReactJS", "Machine Learning": "ML", "Amazon Web Services": "AWS",
        "Kubernetes": "K8s", "TypeScript": "TS", "User Experience": "UX"
    }
    if is_skill and text in abbreviations and random.random() > 0.5:
        return abbreviations[text]
    # Randomly lowercase everything (common in lazy resumes)
    if random.random() > 0.7:
        text = text.lower()
    return text
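# Example (non-deterministic): inject_noise("PostgreSQL", is_skill=True) may return
# "PostgreSQL" (left clean), "Postgres" (abbreviated), or "postgresql" (lowercased).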
def generate_adversarial_dataset():
    """Generates 200 candidates with intentional distractors and noise.

    Each of the 40 (domain, level) pairs yields 5 candidates (1 golden target,
    2 distractors, 2 random fillers): 10 domains x 4 levels x 5 = 200.
    """
    print("Building N=200 Adversarial Candidate Pool...")
    domains = [
        ("Frontend_React", ["React", "JavaScript", "Tailwind", "CSS", "TypeScript"]),
        ("Frontend_Angular", ["Angular", "JavaScript", "SCSS", "HTML", "TypeScript"]),
        ("Backend_Python", ["Python", "FastAPI", "PostgreSQL", "Docker", "Linux"]),
        ("Backend_Java", ["Java", "Spring Boot", "MySQL", "Kafka", "Kubernetes"]),
        ("Data_Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning"]),
        ("Data_Engineer", ["Spark", "Airflow", "Python", "SQL", "AWS"]),
        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "AWS"]),
        ("Mobile_iOS", ["Swift", "Objective-C", "iOS", "XCode", "CoreData"]),
        ("Mobile_Android", ["Kotlin", "Java", "Android Studio", "Jetpack", "Firebase"]),
        ("Cybersecurity", ["Network Security", "Penetration Testing", "Firewalls", "Linux", "Python"])
    ]
    levels = ["Junior", "Mid-Level", "Senior", "Lead"]
    candidates = []
    golden_dataset = []
    cand_counter = 1
    # Generate 40 queries (10 domains x 4 levels)
    for domain_name, base_skills in domains:
        for level in levels:
            # 1. The Target Candidate (Golden)
            target_id = f"cand_{cand_counter}_TARGET_{level}_{domain_name}"
            target_skills = [inject_noise(s, True) for s in base_skills]
            candidates.append({
                "id": target_id,
                "headline": f"{level} {domain_name.replace('_', ' ')} Engineer",
                "summary": inject_noise(f"Experienced {level} professional in {domain_name}. Passionate about building scalable architectures."),
                "skills": target_skills,
                "experience": [inject_noise(f"Built systems using {target_skills[0]} and {target_skills[1]}.")]
            })
            cand_counter += 1
            # The query (clean, formal HR language)
            query = f"Hiring a {level} professional in {domain_name.replace('_', ' ')}. Must have strong experience with {base_skills[0]}, {base_skills[1]}, and {base_skills[2]}."
            golden_dataset.append({"query": query, "relevant_id": target_id})
            # 2. Seniority Distractor (wrong level, perfect skills)
            distractor_level = "Senior" if level == "Junior" else "Junior"
            candidates.append({
                "id": f"cand_{cand_counter}_DISTRACTOR_LEVEL_{domain_name}",
                "headline": f"{distractor_level} {domain_name.replace('_', ' ')} Engineer",
                "summary": f"A {distractor_level} developer specializing in {domain_name}.",
                "skills": base_skills,  # Same exact skills to confuse the model
                "experience": [f"Worked extensively with {base_skills[0]}."]
            })
            cand_counter += 1
            # 3. Skill Distractor (right level, core skill swapped out)
            altered_skills = base_skills.copy()
            altered_skills[0] = "C++"  # Replace the core skill with an off-domain one
            candidates.append({
                "id": f"cand_{cand_counter}_DISTRACTOR_SKILL_{domain_name}",
                "headline": f"{level} Software Engineer",
                "summary": f"Focuses on {altered_skills[0]} and backend architecture.",
                "skills": altered_skills,
                "experience": [f"Maintained legacy {altered_skills[0]} codebases."]
            })
            cand_counter += 1
            # 4 & 5. Random Noise Candidates (fill out the 200)
            for _ in range(2):
                rand_domain = random.choice(domains)
                candidates.append({
                    "id": f"cand_{cand_counter}_RANDOM",
                    "headline": f"{random.choice(levels)} {rand_domain[0]} Dev",
                    "summary": "Looking for new opportunities. Hobbies: hiking, dog walking, photography.",
                    "skills": [inject_noise(s, True) for s in rand_domain[1]],
                    "experience": ["General software development tasks."]
                })
                cand_counter += 1
    return candidates, golden_dataset
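# Note: generation is stochastic (random noise, random fillers). For a reproducible
# candidate pool, one could seed Python's RNG before calling the generator, e.g.:
#     random.seed(42)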
def evaluate_adversarial():
    print("🚀 Starting Adversarial Robustness Evaluation...")
    candidates, golden_dataset = generate_adversarial_dataset()
    print(f"📊 Dataset: {len(golden_dataset)} Queries | {len(candidates)} Candidates")
    print("⚠️ Warning: Embedding 200 profiles on CPU will take time. Please wait...\n")
    # 1. Embed Candidates (Flattening)
    candidate_embeddings = []
    start_time = time.time()
    for i, c in enumerate(candidates):
        rich_text = f"Headline: {c['headline']}. Summary: {c['summary']} Skills: {', '.join(c['skills'])}. Experience: {' '.join(c['experience'])}"
        candidate_embeddings.append({
            "id": c["id"],
            "vec": generate_embedding(rich_text)
        })
        if (i + 1) % 20 == 0:
            print(f" -> Embedded {i + 1}/{len(candidates)} candidates...")
    print(f"✅ Embedding complete in {time.time() - start_time:.2f} seconds.\n")
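    # Note: profiles are embedded one at a time; on CPU this dominates the runtime.
    # Persisting the 200 vectors to disk between runs would be a straightforward speedup.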
    # 2. Evaluate Queries
    mrr_total = 0.0
    hits_at_1 = 0
    hits_at_3 = 0
    hits_at_5 = 0
    for item in golden_dataset:
        query_vec = generate_embedding(item["query"])
        target_id = item["relevant_id"]
        scores = [(c_emb["id"], cosine_similarity(query_vec, c_emb["vec"])) for c_emb in candidate_embeddings]
        scores.sort(key=lambda x: x[1], reverse=True)
        rank = -1
        for idx, (cid, sim) in enumerate(scores):
            if cid == target_id:
                rank = idx + 1
                break
        if rank != -1:
            mrr_total += 1.0 / rank
            if rank == 1:
                hits_at_1 += 1
            if rank <= 3:
                hits_at_3 += 1
            if rank <= 5:
                hits_at_5 += 1
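    # Metric definitions (standard IR formulas):
    #   MRR      = (1/|Q|) * sum over queries of 1/rank_of_target
    #   Recall@k = fraction of queries whose golden target lands in the top k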
    # 3. Final Aggregation
    num_queries = len(golden_dataset)
    final_mrr = mrr_total / num_queries
    recall_1 = hits_at_1 / num_queries
    recall_3 = hits_at_3 / num_queries
    recall_5 = hits_at_5 / num_queries
    print("=" * 45)
    print("🛡️ ADVERSARIAL RETRIEVAL METRICS (N=200)")
    print("=" * 45)
    print(f"MRR (Mean Reciprocal Rank): {final_mrr:.4f}")
    print("-" * 45)
    print(f"Recall@1 (R@1): {recall_1*100:.1f}%")
    print(f"Recall@3 (R@3): {recall_3*100:.1f}%")
    print(f"Recall@5 (R@5): {recall_5*100:.1f}%")
    print("=" * 45)
    # Save to JSON for the guide/paper
    with open("quality_metrics_adversarial.json", "w") as f:
        json.dump({
            "dataset": "N=200 Adversarial (Noise + Distractors)",
            "mrr": final_mrr,
            "recall_1": recall_1,
            "recall_3": recall_3,
            "recall_5": recall_5
        }, f, indent=4)
    print("📄 Results saved to 'quality_metrics_adversarial.json'")
if __name__ == "__main__":
    evaluate_adversarial()