# iris_backend/backend/src/embeddings/evaluate_quality.py
# Commit 4b3a33f by Saandraahh: "Implemented clustering"
import sys
import os
import time
import json
import random
import numpy as np
# Set encoding for Windows terminals.
# The report below prints emoji/unicode; re-wrap stdout in UTF-8 so it does
# not raise UnicodeEncodeError on the default Windows console codec.
if sys.platform == "win32":
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Add backend to path: walk three levels up (embeddings -> src -> backend ->
# repo root) so the absolute 'backend.*' import resolves when this file is
# run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
from backend.src.embeddings.local_embedder import generate_embedding
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    Args:
        v1, v2: 1-D array-likes of equal length.

    Returns:
        float in [-1.0, 1.0]. Returns 0.0 when either vector has zero
        magnitude, guarding against a division-by-zero NaN for a
        degenerate (all-zero) embedding.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product
def inject_noise(text, is_skill=False):
    """Corrupt *text* the way messy real-world resumes are.

    Roughly 30% of inputs pass through untouched. Otherwise, known skills
    may be swapped for their common abbreviation, and any text may be
    fully lowercased (as in carelessly written resumes).
    """
    # ~30% chance to leave the text perfectly clean.
    clean_roll = random.random()
    if clean_roll < 0.3:
        return text

    abbreviations = {
        "Python": "Py", "PostgreSQL": "Postgres", "JavaScript": "JS",
        "React": "ReactJS", "Machine Learning": "ML", "Amazon Web Services": "AWS",
        "Kubernetes": "K8s", "TypeScript": "TS", "User Experience": "UX"
    }
    # Skills have a 50/50 shot at being abbreviated (e.g. "Python" -> "Py").
    short_form = abbreviations.get(text) if is_skill else None
    if short_form is not None and random.random() > 0.5:
        return short_form

    # Lazy resumes frequently arrive in all lowercase (~30% of the time).
    return text.lower() if random.random() > 0.7 else text
def generate_adversarial_dataset():
    """Generates 200 candidates with intentional distractors and noise.

    Builds 40 golden queries (10 domains x 4 seniority levels). For every
    query, five candidates are created:
      1. the exact target (noised via inject_noise),
      2. a seniority distractor (wrong level, identical skills),
      3. a skill distractor (right level, core skill replaced),
      4-5. two random-noise fillers from arbitrary domains.
    40 queries x 5 candidates = 200 profiles total.

    Returns:
        tuple: (candidates, golden_dataset). Each candidate is a dict with
        keys id/headline/summary/skills/experience; each golden item is
        {"query": str, "relevant_id": str} naming the single relevant
        candidate for that query.
    """
    print("Building N=200 Adversarial Candidate Pool...")
    # (domain_name, core skill stack) pairs — 10 distinct engineering domains.
    domains = [
        ("Frontend_React", ["React", "JavaScript", "Tailwind", "CSS", "TypeScript"]),
        ("Frontend_Angular", ["Angular", "JavaScript", "SCSS", "HTML", "TypeScript"]),
        ("Backend_Python", ["Python", "FastAPI", "PostgreSQL", "Docker", "Linux"]),
        ("Backend_Java", ["Java", "Spring Boot", "MySQL", "Kafka", "Kubernetes"]),
        ("Data_Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning"]),
        ("Data_Engineer", ["Spark", "Airflow", "Python", "SQL", "AWS"]),
        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "AWS"]),
        ("Mobile_iOS", ["Swift", "Objective-C", "iOS", "XCode", "CoreData"]),
        ("Mobile_Android", ["Kotlin", "Java", "Android Studio", "Jetpack", "Firebase"]),
        ("Cybersecurity", ["Network Security", "Penetration Testing", "Firewalls", "Linux", "Python"])
    ]
    levels = ["Junior", "Mid-Level", "Senior", "Lead"]
    candidates = []
    golden_dataset = []
    cand_counter = 1
    # Generate 40 Queries (10 domains x 4 levels)
    for domain_name, base_skills in domains:
        for level in levels:
            # 1. The Target Candidate (Golden) — the one profile each query
            # should retrieve; its text is noised to simulate a messy resume.
            target_id = f"cand_{cand_counter}_TARGET_{level}_{domain_name}"
            target_skills = [inject_noise(s, True) for s in base_skills]
            candidates.append({
                "id": target_id,
                "headline": f"{level} {domain_name.replace('_', ' ')} Engineer",
                "summary": inject_noise(f"Experienced {level} professional in {domain_name}. Passionate about building scalable architectures."),
                "skills": target_skills,
                "experience": [inject_noise(f"Built systems using {target_skills[0]} and {target_skills[1]}.")]
            })
            cand_counter += 1
            # The Query (Clean, formal HR language) — deliberately unlike the
            # noisy candidate text to stress abbreviation/casing robustness.
            query = f"Hiring a {level} professional in {domain_name.replace('_', ' ')}. Must have strong experience with {base_skills[0]}, {base_skills[1]}, and {base_skills[2]}."
            golden_dataset.append({"query": query, "relevant_id": target_id})
            # 2. Seniority Distractor (Wrong level, perfect skills)
            distractor_level = "Senior" if level == "Junior" else "Junior"
            candidates.append({
                "id": f"cand_{cand_counter}_DISTRACTOR_LEVEL_{domain_name}",
                "headline": f"{distractor_level} {domain_name.replace('_', ' ')} Engineer",
                "summary": f"A {distractor_level} developer specializing in {domain_name}.",
                "skills": base_skills,  # Same exact skills to confuse the model
                "experience": [f"Worked extensively with {base_skills[0]}."]
            })
            cand_counter += 1
            # 3. Skill Distractor (Right level, missing core skill, has similar skill)
            altered_skills = base_skills.copy()
            altered_skills[0] = "C++"  # Replace core skill with something irrelevant
            candidates.append({
                "id": f"cand_{cand_counter}_DISTRACTOR_SKILL_{domain_name}",
                "headline": f"{level} Software Engineer",
                "summary": f"Focuses on {altered_skills[0]} and backend architecture.",
                "skills": altered_skills,
                "experience": [f"Maintained legacy {altered_skills[0]} codebases."]
            })
            cand_counter += 1
            # 4 & 5. Random Noise Candidates (Fill out the 200)
            for _ in range(2):
                rand_domain = random.choice(domains)
                candidates.append({
                    "id": f"cand_{cand_counter}_RANDOM",
                    "headline": f"{random.choice(levels)} {rand_domain[0]} Dev",
                    "summary": "Looking for new opportunities. Hobbies: hiking, dog walking, photography.",
                    "skills": [inject_noise(s, True) for s in rand_domain[1]],
                    "experience": ["General software development tasks."]
                })
                cand_counter += 1
    return candidates, golden_dataset
def evaluate_adversarial():
    """Run the adversarial retrieval benchmark and report MRR / Recall@k.

    Embeds every generated candidate profile as one flattened text blob,
    then for each golden query ranks all candidates by cosine similarity
    and records where the single relevant candidate lands. Metrics are
    printed and saved to 'quality_metrics_adversarial.json'.
    """
    print("🚀 Starting Adversarial Robustness Evaluation...")
    candidates, golden_dataset = generate_adversarial_dataset()
    total_candidates = len(candidates)
    print(f"📊 Dataset: {len(golden_dataset)} Queries | {total_candidates} Candidates")
    # Use the real count rather than a hardcoded 200 so the message stays
    # correct if the dataset generator changes.
    print(f"⚠️ Warning: Embedding {total_candidates} profiles on CPU will take time. Please wait...\n")
    # 1. Embed Candidates (Flattening each structured profile into one string)
    candidate_embeddings = []
    start_time = time.time()
    for i, c in enumerate(candidates):
        rich_text = f"Headline: {c['headline']}. Summary: {c['summary']} Skills: {', '.join(c['skills'])}. Experience: {' '.join(c['experience'])}"
        candidate_embeddings.append({
            "id": c["id"],
            "vec": generate_embedding(rich_text)
        })
        if (i+1) % 20 == 0:
            # Progress against the actual pool size (was hardcoded to 200).
            print(f" -> Embedded {i+1}/{total_candidates} candidates...")
    print(f"✅ Embedding complete in {time.time() - start_time:.2f} seconds.\n")
    # 2. Evaluate Queries: rank every candidate per query, find the target's rank.
    mrr_total = 0
    hits_at_1 = 0
    hits_at_3 = 0
    hits_at_5 = 0
    for item in golden_dataset:
        query_vec = generate_embedding(item["query"])
        target_id = item["relevant_id"]
        scores = [(c_emb["id"], cosine_similarity(query_vec, c_emb["vec"])) for c_emb in candidate_embeddings]
        scores.sort(key=lambda x: x[1], reverse=True)
        # 1-based rank of the golden candidate; -1 if (impossibly) absent.
        rank = -1
        for idx, (cid, sim) in enumerate(scores):
            if cid == target_id:
                rank = idx + 1
                break
        if rank != -1:
            mrr_total += (1.0 / rank)
            if rank == 1: hits_at_1 += 1
            if rank <= 3: hits_at_3 += 1
            if rank <= 5: hits_at_5 += 1
    # 3. Final Aggregation
    num_queries = len(golden_dataset)
    final_mrr = mrr_total / num_queries
    recall_1 = hits_at_1 / num_queries
    recall_3 = hits_at_3 / num_queries
    recall_5 = hits_at_5 / num_queries
    print("="*45)
    print("🛡️ ADVERSARIAL RETRIEVAL METRICS (N=200)")
    print("="*45)
    print(f"MRR (Mean Reciprocal Rank): {final_mrr:.4f}")
    print("-" * 45)
    print(f"Recall@1 (R@1): {recall_1*100:.1f}%")
    print(f"Recall@3 (R@3): {recall_3*100:.1f}%")
    print(f"Recall@5 (R@5): {recall_5*100:.1f}%")
    print("="*45)
    # Save to JSON for the guide/paper. Explicit encoding avoids a
    # locale-dependent codec on Windows; recall_5 was computed and printed
    # but previously never persisted — include it.
    with open("quality_metrics_adversarial.json", "w", encoding="utf-8") as f:
        json.dump({
            "dataset": "N=200 Adversarial (Noise + Distractors)",
            "mrr": final_mrr,
            "recall_1": recall_1,
            "recall_3": recall_3,
            "recall_5": recall_5
        }, f, indent=4)
    print("📄 Results securely saved to 'quality_metrics_adversarial.json'")
# Script entry point: run the full adversarial evaluation end-to-end.
if __name__ == "__main__":
    evaluate_adversarial()