File size: 8,318 Bytes
4b3a33f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import sys
import os
import time
import json
import random
import numpy as np

# Set encoding for Windows terminals
# (wraps stdout in a UTF-8 writer so the emoji used in the report below
# don't raise UnicodeEncodeError on the default cp1252 console codec)
if sys.platform == "win32":
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Add backend to path
# NOTE(review): assumes this script lives three directory levels below the
# repo root so that `backend` becomes importable — confirm against layout.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

from backend.src.embeddings.local_embedder import generate_embedding

def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1, v2: array-likes of equal length.

    Returns:
        float in [-1, 1]; 0.0 when either vector has zero norm (guards
        against a ZeroDivisionError / NaN from an all-zero embedding).
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

def inject_noise(text, is_skill=False):
    """Simulates real-world messy resumes with abbreviations, typos, and lowercasing."""
    # Roughly 30% of entries are left perfectly clean.
    if random.random() < 0.3:
        return text

    short_forms = {
        "Python": "Py",
        "PostgreSQL": "Postgres",
        "JavaScript": "JS",
        "React": "ReactJS",
        "Machine Learning": "ML",
        "Amazon Web Services": "AWS",
        "Kubernetes": "K8s",
        "TypeScript": "TS",
        "User Experience": "UX",
    }

    # Skill entries sometimes get swapped for their common abbreviation.
    if is_skill and text in short_forms and random.random() > 0.5:
        return short_forms[text]

    # Otherwise the text may be fully lowercased (common in lazy resumes).
    return text.lower() if random.random() > 0.7 else text

def generate_adversarial_dataset():
    """Generates 200 candidates with intentional distractors and noise."""
    print("Building N=200 Adversarial Candidate Pool...")

    domains = [
        ("Frontend_React", ["React", "JavaScript", "Tailwind", "CSS", "TypeScript"]),
        ("Frontend_Angular", ["Angular", "JavaScript", "SCSS", "HTML", "TypeScript"]),
        ("Backend_Python", ["Python", "FastAPI", "PostgreSQL", "Docker", "Linux"]),
        ("Backend_Java", ["Java", "Spring Boot", "MySQL", "Kafka", "Kubernetes"]),
        ("Data_Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning"]),
        ("Data_Engineer", ["Spark", "Airflow", "Python", "SQL", "AWS"]),
        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "AWS"]),
        ("Mobile_iOS", ["Swift", "Objective-C", "iOS", "XCode", "CoreData"]),
        ("Mobile_Android", ["Kotlin", "Java", "Android Studio", "Jetpack", "Firebase"]),
        ("Cybersecurity", ["Network Security", "Penetration Testing", "Firewalls", "Linux", "Python"]),
    ]
    levels = ["Junior", "Mid-Level", "Senior", "Lead"]

    pool = []
    gold = []
    next_id = 1

    # 40 query/target pairs: one per (domain, level) combination, each
    # accompanied by two targeted distractors and two random-noise fillers.
    for domain, core_skills in domains:
        pretty_domain = domain.replace('_', ' ')
        for level in levels:
            # 1. The golden target: right level, right skills (with noise).
            target_id = f"cand_{next_id}_TARGET_{level}_{domain}"
            noisy_skills = [inject_noise(s, True) for s in core_skills]
            pool.append({
                "id": target_id,
                "headline": f"{level} {pretty_domain} Engineer",
                "summary": inject_noise(f"Experienced {level} professional in {domain}. Passionate about building scalable architectures."),
                "skills": noisy_skills,
                "experience": [inject_noise(f"Built systems using {noisy_skills[0]} and {noisy_skills[1]}.")],
            })
            next_id += 1

            # The query itself, phrased in clean, formal HR language.
            query = f"Hiring a {level} professional in {pretty_domain}. Must have strong experience with {core_skills[0]}, {core_skills[1]}, and {core_skills[2]}."
            gold.append({"query": query, "relevant_id": target_id})

            # 2. Seniority distractor: perfect skill match, wrong level.
            wrong_level = "Senior" if level == "Junior" else "Junior"
            pool.append({
                "id": f"cand_{next_id}_DISTRACTOR_LEVEL_{domain}",
                "headline": f"{wrong_level} {pretty_domain} Engineer",
                "summary": f"A {wrong_level} developer specializing in {domain}.",
                "skills": core_skills,  # identical skill list, to confuse the model
                "experience": [f"Worked extensively with {core_skills[0]}."],
            })
            next_id += 1

            # 3. Skill distractor: right level, core skill swapped out.
            swapped = core_skills.copy()
            swapped[0] = "C++"  # replace the core skill with something irrelevant
            pool.append({
                "id": f"cand_{next_id}_DISTRACTOR_SKILL_{domain}",
                "headline": f"{level} Software Engineer",
                "summary": f"Focuses on {swapped[0]} and backend architecture.",
                "skills": swapped,
                "experience": [f"Maintained legacy {swapped[0]} codebases."],
            })
            next_id += 1

            # 4 & 5. Two random-noise candidates to fill out the 200.
            for _ in range(2):
                picked = random.choice(domains)
                pool.append({
                    "id": f"cand_{next_id}_RANDOM",
                    "headline": f"{random.choice(levels)} {picked[0]} Dev",
                    "summary": "Looking for new opportunities. Hobbies: hiking, dog walking, photography.",
                    "skills": [inject_noise(s, True) for s in picked[1]],
                    "experience": ["General software development tasks."],
                })
                next_id += 1

    return pool, gold

def evaluate_adversarial():
    """Run the adversarial retrieval benchmark and persist the metrics.

    Embeds every generated candidate profile, ranks the full pool against
    each golden query by cosine similarity, and reports MRR plus
    Recall@{1,3,5}. Metrics are printed and saved to
    'quality_metrics_adversarial.json'.

    Fixes vs. previous revision: the saved JSON now includes recall_5
    (it was computed and printed but silently dropped from the file), and
    the embedding progress message uses the actual pool size instead of a
    hard-coded 200.
    """
    print("πŸš€ Starting Adversarial Robustness Evaluation...")

    candidates, golden_dataset = generate_adversarial_dataset()

    print(f"πŸ“Š Dataset: {len(golden_dataset)} Queries | {len(candidates)} Candidates")
    print("⚠️  Warning: Embedding 200 profiles on CPU will take time. Please wait...\n")

    # 1. Embed candidates: flatten each structured profile into one rich string.
    candidate_embeddings = []
    start_time = time.time()
    total = len(candidates)

    for i, c in enumerate(candidates):
        rich_text = f"Headline: {c['headline']}. Summary: {c['summary']} Skills: {', '.join(c['skills'])}. Experience: {' '.join(c['experience'])}"
        candidate_embeddings.append({
            "id": c["id"],
            "vec": generate_embedding(rich_text)
        })
        if (i + 1) % 20 == 0:
            print(f"  -> Embedded {i + 1}/{total} candidates...")

    print(f"βœ… Embedding complete in {time.time() - start_time:.2f} seconds.\n")

    # 2. Evaluate queries: rank every candidate per query and locate the
    #    golden target's 1-based rank.
    mrr_total = 0.0
    hits_at_1 = 0
    hits_at_3 = 0
    hits_at_5 = 0

    for item in golden_dataset:
        query_vec = generate_embedding(item["query"])
        target_id = item["relevant_id"]

        scores = sorted(
            ((c_emb["id"], cosine_similarity(query_vec, c_emb["vec"]))
             for c_emb in candidate_embeddings),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # 1-based rank of the golden target; -1 if (impossibly) absent.
        rank = next((idx + 1 for idx, (cid, _) in enumerate(scores) if cid == target_id), -1)

        if rank != -1:
            mrr_total += 1.0 / rank
            if rank == 1: hits_at_1 += 1
            if rank <= 3: hits_at_3 += 1
            if rank <= 5: hits_at_5 += 1

    # 3. Final aggregation and report.
    num_queries = len(golden_dataset)
    final_mrr = mrr_total / num_queries
    recall_1 = hits_at_1 / num_queries
    recall_3 = hits_at_3 / num_queries
    recall_5 = hits_at_5 / num_queries

    print("=" * 45)
    print("πŸ›‘οΈ ADVERSARIAL RETRIEVAL METRICS (N=200)")
    print("=" * 45)
    print(f"MRR (Mean Reciprocal Rank):  {final_mrr:.4f}")
    print("-" * 45)
    print(f"Recall@1  (R@1):             {recall_1*100:.1f}%")
    print(f"Recall@3  (R@3):             {recall_3*100:.1f}%")
    print(f"Recall@5  (R@5):             {recall_5*100:.1f}%")
    print("=" * 45)

    # Save to JSON for the guide/paper — now including recall_5.
    with open("quality_metrics_adversarial.json", "w") as f:
        json.dump({
            "dataset": "N=200 Adversarial (Noise + Distractors)",
            "mrr": final_mrr,
            "recall_1": recall_1,
            "recall_3": recall_3,
            "recall_5": recall_5
        }, f, indent=4)

    print("πŸ“„ Results securely saved to 'quality_metrics_adversarial.json'")

if __name__ == "__main__":
    # Entry point: run the full benchmark when executed as a script.
    evaluate_adversarial()