import sys
import os
import time
import json
import random

import numpy as np

# Force UTF-8 stdout on Windows terminals so the emoji in the report print cleanly.
if sys.platform == "win32":
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Add backend to path so the local embedder package resolves when run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

from backend.src.embeddings.local_embedder import generate_embedding


def cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        v1: First vector (array-like of floats).
        v2: Second vector (array-like of floats).

    Returns:
        Cosine similarity in [-1, 1]. Returns 0.0 when either vector has
        zero norm, instead of dividing by zero (which previously produced
        NaN plus a runtime warning).
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom


def inject_noise(text, is_skill=False):
    """Simulates real-world messy resumes with abbreviations, typos, and lowercasing.

    Args:
        text: Clean source string.
        is_skill: When True, the text may be swapped for a common industry
            abbreviation (e.g. "Python" -> "Py").

    Returns:
        The (possibly) noised string. Roughly 30% of inputs are left untouched.
    """
    if random.random() < 0.3:  # 30% chance to leave perfectly clean
        return text

    abbreviations = {
        "Python": "Py",
        "PostgreSQL": "Postgres",
        "JavaScript": "JS",
        "React": "ReactJS",
        "Machine Learning": "ML",
        "Amazon Web Services": "AWS",
        "Kubernetes": "K8s",
        "TypeScript": "TS",
        "User Experience": "UX",
    }
    if is_skill and text in abbreviations and random.random() > 0.5:
        return abbreviations[text]

    # Randomly lowercase everything (common in lazy resumes)
    if random.random() > 0.7:
        text = text.lower()
    return text


def generate_adversarial_dataset():
    """Generates 200 candidates with intentional distractors and noise.

    For each of the 40 (domain, level) query combinations, five candidates
    are produced:
      1. the golden target (noised skills/summary),
      2. a seniority distractor (identical skills, wrong level),
      3. a skill distractor (right level, core skill replaced with "C++"),
      4-5. two random-noise filler profiles.

    Returns:
        Tuple of (candidates, golden_dataset). Each golden_dataset entry
        maps a query string to the id of its single relevant candidate.
    """
    print("Building N=200 Adversarial Candidate Pool...")
    domains = [
        ("Frontend_React", ["React", "JavaScript", "Tailwind", "CSS", "TypeScript"]),
        ("Frontend_Angular", ["Angular", "JavaScript", "SCSS", "HTML", "TypeScript"]),
        ("Backend_Python", ["Python", "FastAPI", "PostgreSQL", "Docker", "Linux"]),
        ("Backend_Java", ["Java", "Spring Boot", "MySQL", "Kafka", "Kubernetes"]),
        ("Data_Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning"]),
        ("Data_Engineer", ["Spark", "Airflow", "Python", "SQL", "AWS"]),
        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "AWS"]),
        ("Mobile_iOS", ["Swift", "Objective-C", "iOS", "XCode", "CoreData"]),
        ("Mobile_Android", ["Kotlin", "Java", "Android Studio", "Jetpack", "Firebase"]),
        ("Cybersecurity", ["Network Security", "Penetration Testing", "Firewalls", "Linux", "Python"]),
    ]
    levels = ["Junior", "Mid-Level", "Senior", "Lead"]
    candidates = []
    golden_dataset = []
    cand_counter = 1

    # Generate 40 Queries (10 domains x 4 levels)
    for domain_name, base_skills in domains:
        for level in levels:
            # 1. The Target Candidate (Golden)
            target_id = f"cand_{cand_counter}_TARGET_{level}_{domain_name}"
            target_skills = [inject_noise(s, True) for s in base_skills]
            candidates.append({
                "id": target_id,
                "headline": f"{level} {domain_name.replace('_', ' ')} Engineer",
                "summary": inject_noise(f"Experienced {level} professional in {domain_name}. Passionate about building scalable architectures."),
                "skills": target_skills,
                "experience": [inject_noise(f"Built systems using {target_skills[0]} and {target_skills[1]}.")]
            })
            cand_counter += 1

            # The Query (Clean, formal HR language)
            query = f"Hiring a {level} professional in {domain_name.replace('_', ' ')}. Must have strong experience with {base_skills[0]}, {base_skills[1]}, and {base_skills[2]}."
            golden_dataset.append({"query": query, "relevant_id": target_id})

            # 2. Seniority Distractor (Wrong level, perfect skills)
            distractor_level = "Senior" if level == "Junior" else "Junior"
            candidates.append({
                "id": f"cand_{cand_counter}_DISTRACTOR_LEVEL_{domain_name}",
                "headline": f"{distractor_level} {domain_name.replace('_', ' ')} Engineer",
                "summary": f"A {distractor_level} developer specializing in {domain_name}.",
                "skills": base_skills,  # Same exact skills to confuse the model
                "experience": [f"Worked extensively with {base_skills[0]}."]
            })
            cand_counter += 1

            # 3. Skill Distractor (Right level, missing core skill, has similar skill)
            altered_skills = base_skills.copy()
            altered_skills[0] = "C++"  # Replace core skill with something irrelevant
            candidates.append({
                "id": f"cand_{cand_counter}_DISTRACTOR_SKILL_{domain_name}",
                "headline": f"{level} Software Engineer",
                "summary": f"Focuses on {altered_skills[0]} and backend architecture.",
                "skills": altered_skills,
                "experience": [f"Maintained legacy {altered_skills[0]} codebases."]
            })
            cand_counter += 1

            # 4 & 5. Random Noise Candidates (Fill out the 200)
            for _ in range(2):
                rand_domain = random.choice(domains)
                candidates.append({
                    "id": f"cand_{cand_counter}_RANDOM",
                    "headline": f"{random.choice(levels)} {rand_domain[0]} Dev",
                    "summary": "Looking for new opportunities. Hobbies: hiking, dog walking, photography.",
                    "skills": [inject_noise(s, True) for s in rand_domain[1]],
                    "experience": ["General software development tasks."]
                })
                cand_counter += 1

    return candidates, golden_dataset


def evaluate_adversarial():
    """Embed the adversarial pool, run all queries, and report MRR / Recall@k.

    Writes the aggregate metrics to 'quality_metrics_adversarial.json' in the
    current working directory and prints a human-readable summary.
    """
    print("🚀 Starting Adversarial Robustness Evaluation...")
    candidates, golden_dataset = generate_adversarial_dataset()
    print(f"📊 Dataset: {len(golden_dataset)} Queries | {len(candidates)} Candidates")
    print("⚠️ Warning: Embedding 200 profiles on CPU will take time. Please wait...\n")

    # 1. Embed Candidates (Flattening): one rich text string per profile.
    candidate_embeddings = []
    start_time = time.time()
    for i, c in enumerate(candidates):
        rich_text = f"Headline: {c['headline']}. Summary: {c['summary']} Skills: {', '.join(c['skills'])}. Experience: {' '.join(c['experience'])}"
        candidate_embeddings.append({
            "id": c["id"],
            "vec": generate_embedding(rich_text)
        })
        if (i + 1) % 20 == 0:
            # FIX: progress denominator was hard-coded to 200; use the real count.
            print(f" -> Embedded {i+1}/{len(candidates)} candidates...")
    print(f"✅ Embedding complete in {time.time() - start_time:.2f} seconds.\n")

    # 2. Evaluate Queries: rank every candidate per query, find the target's rank.
    mrr_total = 0
    hits_at_1 = 0
    hits_at_3 = 0
    hits_at_5 = 0
    for item in golden_dataset:
        query_vec = generate_embedding(item["query"])
        target_id = item["relevant_id"]
        scores = [(c_emb["id"], cosine_similarity(query_vec, c_emb["vec"]))
                  for c_emb in candidate_embeddings]
        scores.sort(key=lambda x: x[1], reverse=True)
        rank = -1  # -1 means the target was not found (should not happen)
        for idx, (cid, sim) in enumerate(scores):
            if cid == target_id:
                rank = idx + 1
                break
        if rank != -1:
            mrr_total += (1.0 / rank)
            if rank == 1:
                hits_at_1 += 1
            if rank <= 3:
                hits_at_3 += 1
            if rank <= 5:
                hits_at_5 += 1

    # 3. Final Aggregation
    num_queries = len(golden_dataset)
    final_mrr = mrr_total / num_queries
    recall_1 = hits_at_1 / num_queries
    recall_3 = hits_at_3 / num_queries
    recall_5 = hits_at_5 / num_queries

    print("=" * 45)
    print("🛡️ ADVERSARIAL RETRIEVAL METRICS (N=200)")
    print("=" * 45)
    print(f"MRR (Mean Reciprocal Rank): {final_mrr:.4f}")
    print("-" * 45)
    print(f"Recall@1 (R@1): {recall_1*100:.1f}%")
    print(f"Recall@3 (R@3): {recall_3*100:.1f}%")
    print(f"Recall@5 (R@5): {recall_5*100:.1f}%")
    print("=" * 45)

    # Save to JSON for the guide/paper
    with open("quality_metrics_adversarial.json", "w") as f:
        json.dump({
            "dataset": "N=200 Adversarial (Noise + Distractors)",
            "mrr": final_mrr,
            "recall_1": recall_1,
            "recall_3": recall_3,
            # FIX: recall_5 was computed and printed but never persisted.
            "recall_5": recall_5
        }, f, indent=4)
    print("📄 Results securely saved to 'quality_metrics_adversarial.json'")


if __name__ == "__main__":
    evaluate_adversarial()