# iris_backend/backend/src/embeddings/evaluate_quality.py
# Commit 4b3a33f by Saandraahh: "Implemented clustering"
import sys
import os
import time
import json
import random
import numpy as np
# Set encoding for Windows terminals.
# The report below prints emoji/unicode; re-wrap stdout in UTF-8 so it does
# not raise UnicodeEncodeError on the default Windows console codec.
if sys.platform == "win32":
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add backend to path: walk three levels up (embeddings -> src -> backend ->
# repo root) so the absolute 'backend.*' import resolves when this file is
# run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
from backend.src.embeddings.local_embedder import generate_embedding
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1, v2: 1-D array-likes of equal length.

    Returns:
        float in [-1.0, 1.0]. Returns 0.0 when either vector has zero
        magnitude, guarding against a division-by-zero NaN for a
        degenerate (all-zero) embedding.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product
def inject_noise(text, is_skill=False):
    """Corrupt *text* the way messy real-world resumes are.

    Roughly 30% of inputs pass through untouched. Otherwise, known skills
    may be swapped for their common abbreviation, and any text may be
    fully lowercased (as in carelessly written resumes).
    """
    # ~30% chance to leave the text perfectly clean.
    clean_roll = random.random()
    if clean_roll < 0.3:
        return text

    abbreviations = {
        "Python": "Py", "PostgreSQL": "Postgres", "JavaScript": "JS",
        "React": "ReactJS", "Machine Learning": "ML", "Amazon Web Services": "AWS",
        "Kubernetes": "K8s", "TypeScript": "TS", "User Experience": "UX"
    }
    # Skills have a 50/50 shot at being abbreviated (e.g. "Python" -> "Py").
    short_form = abbreviations.get(text) if is_skill else None
    if short_form is not None and random.random() > 0.5:
        return short_form

    # Lazy resumes frequently arrive in all lowercase (~30% of the time).
    return text.lower() if random.random() > 0.7 else text
def generate_adversarial_dataset():
    """Generates 200 candidates with intentional distractors and noise.

    Builds 40 golden queries (10 domains x 4 seniority levels). For every
    query, five candidates are created:
      1. the exact target (noised via inject_noise),
      2. a seniority distractor (wrong level, identical skills),
      3. a skill distractor (right level, core skill replaced),
      4-5. two random-noise fillers from arbitrary domains.
    40 queries x 5 candidates = 200 profiles total.

    Returns:
        tuple: (candidates, golden_dataset). Each candidate is a dict with
        keys id/headline/summary/skills/experience; each golden item is
        {"query": str, "relevant_id": str} naming the single relevant
        candidate for that query.
    """
    print("Building N=200 Adversarial Candidate Pool...")
    # (domain_name, core skill stack) pairs — 10 distinct engineering domains.
    domains = [
        ("Frontend_React", ["React", "JavaScript", "Tailwind", "CSS", "TypeScript"]),
        ("Frontend_Angular", ["Angular", "JavaScript", "SCSS", "HTML", "TypeScript"]),
        ("Backend_Python", ["Python", "FastAPI", "PostgreSQL", "Docker", "Linux"]),
        ("Backend_Java", ["Java", "Spring Boot", "MySQL", "Kafka", "Kubernetes"]),
        ("Data_Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning"]),
        ("Data_Engineer", ["Spark", "Airflow", "Python", "SQL", "AWS"]),
        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "AWS"]),
        ("Mobile_iOS", ["Swift", "Objective-C", "iOS", "XCode", "CoreData"]),
        ("Mobile_Android", ["Kotlin", "Java", "Android Studio", "Jetpack", "Firebase"]),
        ("Cybersecurity", ["Network Security", "Penetration Testing", "Firewalls", "Linux", "Python"])
    ]
    levels = ["Junior", "Mid-Level", "Senior", "Lead"]
    candidates = []
    golden_dataset = []
    cand_counter = 1
    # Generate 40 Queries (10 domains x 4 levels)
    for domain_name, base_skills in domains:
        for level in levels:
            # 1. The Target Candidate (Golden) — the one profile each query
            # should retrieve; its text is noised to simulate a messy resume.
            target_id = f"cand_{cand_counter}_TARGET_{level}_{domain_name}"
            target_skills = [inject_noise(s, True) for s in base_skills]
            candidates.append({
                "id": target_id,
                "headline": f"{level} {domain_name.replace('_', ' ')} Engineer",
                "summary": inject_noise(f"Experienced {level} professional in {domain_name}. Passionate about building scalable architectures."),
                "skills": target_skills,
                "experience": [inject_noise(f"Built systems using {target_skills[0]} and {target_skills[1]}.")]
            })
            cand_counter += 1
            # The Query (Clean, formal HR language) — deliberately unlike the
            # noisy candidate text to stress abbreviation/casing robustness.
            query = f"Hiring a {level} professional in {domain_name.replace('_', ' ')}. Must have strong experience with {base_skills[0]}, {base_skills[1]}, and {base_skills[2]}."
            golden_dataset.append({"query": query, "relevant_id": target_id})
            # 2. Seniority Distractor (Wrong level, perfect skills)
            distractor_level = "Senior" if level == "Junior" else "Junior"
            candidates.append({
                "id": f"cand_{cand_counter}_DISTRACTOR_LEVEL_{domain_name}",
                "headline": f"{distractor_level} {domain_name.replace('_', ' ')} Engineer",
                "summary": f"A {distractor_level} developer specializing in {domain_name}.",
                "skills": base_skills,  # Same exact skills to confuse the model
                "experience": [f"Worked extensively with {base_skills[0]}."]
            })
            cand_counter += 1
            # 3. Skill Distractor (Right level, missing core skill, has similar skill)
            altered_skills = base_skills.copy()
            altered_skills[0] = "C++"  # Replace core skill with something irrelevant
            candidates.append({
                "id": f"cand_{cand_counter}_DISTRACTOR_SKILL_{domain_name}",
                "headline": f"{level} Software Engineer",
                "summary": f"Focuses on {altered_skills[0]} and backend architecture.",
                "skills": altered_skills,
                "experience": [f"Maintained legacy {altered_skills[0]} codebases."]
            })
            cand_counter += 1
            # 4 & 5. Random Noise Candidates (Fill out the 200)
            for _ in range(2):
                rand_domain = random.choice(domains)
                candidates.append({
                    "id": f"cand_{cand_counter}_RANDOM",
                    "headline": f"{random.choice(levels)} {rand_domain[0]} Dev",
                    "summary": "Looking for new opportunities. Hobbies: hiking, dog walking, photography.",
                    "skills": [inject_noise(s, True) for s in rand_domain[1]],
                    "experience": ["General software development tasks."]
                })
                cand_counter += 1
    return candidates, golden_dataset
def evaluate_adversarial():
    """Run the adversarial retrieval benchmark and report MRR / Recall@k.

    Embeds every generated candidate profile as one flattened text blob,
    then for each golden query ranks all candidates by cosine similarity
    and records where the single relevant candidate lands. Metrics are
    printed and saved to 'quality_metrics_adversarial.json'.
    """
    print("🚀 Starting Adversarial Robustness Evaluation...")
    candidates, golden_dataset = generate_adversarial_dataset()
    total_candidates = len(candidates)
    print(f"📊 Dataset: {len(golden_dataset)} Queries | {total_candidates} Candidates")
    # Use the real count rather than a hardcoded 200 so the message stays
    # correct if the dataset generator changes.
    print(f"⚠️ Warning: Embedding {total_candidates} profiles on CPU will take time. Please wait...\n")
    # 1. Embed Candidates (Flattening each structured profile into one string)
    candidate_embeddings = []
    start_time = time.time()
    for i, c in enumerate(candidates):
        rich_text = f"Headline: {c['headline']}. Summary: {c['summary']} Skills: {', '.join(c['skills'])}. Experience: {' '.join(c['experience'])}"
        candidate_embeddings.append({
            "id": c["id"],
            "vec": generate_embedding(rich_text)
        })
        if (i+1) % 20 == 0:
            # Progress against the actual pool size (was hardcoded to 200).
            print(f" -> Embedded {i+1}/{total_candidates} candidates...")
    print(f"✅ Embedding complete in {time.time() - start_time:.2f} seconds.\n")
    # 2. Evaluate Queries: rank every candidate per query, find the target's rank.
    mrr_total = 0
    hits_at_1 = 0
    hits_at_3 = 0
    hits_at_5 = 0
    for item in golden_dataset:
        query_vec = generate_embedding(item["query"])
        target_id = item["relevant_id"]
        scores = [(c_emb["id"], cosine_similarity(query_vec, c_emb["vec"])) for c_emb in candidate_embeddings]
        scores.sort(key=lambda x: x[1], reverse=True)
        # 1-based rank of the golden candidate; -1 if (impossibly) absent.
        rank = -1
        for idx, (cid, sim) in enumerate(scores):
            if cid == target_id:
                rank = idx + 1
                break
        if rank != -1:
            mrr_total += (1.0 / rank)
            if rank == 1: hits_at_1 += 1
            if rank <= 3: hits_at_3 += 1
            if rank <= 5: hits_at_5 += 1
    # 3. Final Aggregation
    num_queries = len(golden_dataset)
    final_mrr = mrr_total / num_queries
    recall_1 = hits_at_1 / num_queries
    recall_3 = hits_at_3 / num_queries
    recall_5 = hits_at_5 / num_queries
    print("="*45)
    print("🛡️ ADVERSARIAL RETRIEVAL METRICS (N=200)")
    print("="*45)
    print(f"MRR (Mean Reciprocal Rank): {final_mrr:.4f}")
    print("-" * 45)
    print(f"Recall@1 (R@1): {recall_1*100:.1f}%")
    print(f"Recall@3 (R@3): {recall_3*100:.1f}%")
    print(f"Recall@5 (R@5): {recall_5*100:.1f}%")
    print("="*45)
    # Save to JSON for the guide/paper. Explicit encoding avoids a
    # locale-dependent codec on Windows; recall_5 was computed and printed
    # but previously never persisted — include it.
    with open("quality_metrics_adversarial.json", "w", encoding="utf-8") as f:
        json.dump({
            "dataset": "N=200 Adversarial (Noise + Distractors)",
            "mrr": final_mrr,
            "recall_1": recall_1,
            "recall_3": recall_3,
            "recall_5": recall_5
        }, f, indent=4)
    print("📄 Results securely saved to 'quality_metrics_adversarial.json'")
# Script entry point: run the full adversarial evaluation end-to-end.
if __name__ == "__main__":
    evaluate_adversarial()