Spaces:

sameer2026
/

iris_backend

Sleeping

App Files Files Community

iris_backend / backend /src /embeddings /profile_entities_bench.py

Saandraahh

Implemented clustering

4b3a33f 2 months ago

raw

history blame contribute delete

4.79 kB

	import sys
	import os
	import time
	import numpy as np
	import json

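	# Add the directory three levels up (the parent of the `backend` package) to sys.path so the `backend.*` import below resolves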
	sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

	from backend.src.embeddings.local_embedder import generate_embedding, generate_list_embedding

	def generate_structured_profiles(num_samples=50):
	    """Generate synthetic resumes split into specific entity fields.

	    Profiles cycle through five engineering domains and five seniority
	    levels. Each profile draws 5 to 8 distinct skills from its domain's pool
	    using a generator seeded with the profile index, so the output is the
	    same on every call and the process-global NumPy random state is left
	    untouched.

	    Args:
	        num_samples: Number of profiles to build. Zero or a negative value
	            yields an empty list.

	    Returns:
	        A list of dicts, each with the keys ``profile_id``, ``headline``,
	        ``summary``, ``skills`` (list of ``str``) and ``experience``
	        (list of three ``str``).
	    """
	    print(f"Generating {num_samples} structured synthetic profiles...")

	    domains = [
	        ("Frontend", ["React", "JavaScript", "Tailwind", "CSS", "HTML", "Redux", "TypeScript", "Jest"]),
	        ("Backend", ["Python", "FastAPI", "PostgreSQL", "Docker", "AWS", "Linux", "Redis", "Kafka"]),
	        ("Data Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning", "NLP", "TensorFlow", "R"]),
	        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "Jenkins", "AWS", "Bash", "Ansible"]),
	        ("Mobile", ["Swift", "Kotlin", "React Native", "Flutter", "iOS", "Android", "Firebase", "SQLite"]),
	    ]
	    levels = ["Junior", "Mid-Level", "Senior", "Lead", "Principal"]

	    profiles = []
	    for i in range(num_samples):
	        # NOTE: both lists have five entries and share the same index, so a
	        # domain is always paired with the same level (Frontend/Junior,
	        # Backend/Mid-Level, ...). Kept as-is so generated data is unchanged.
	        domain_name, domain_skills = domains[i % len(domains)]
	        level = levels[i % len(levels)]

	        # A private legacy generator seeded with ``i`` yields the same draws
	        # as ``np.random.seed(i)`` followed by the module-level functions,
	        # without reseeding the global RNG that callers may depend on.
	        rng = np.random.RandomState(i)
	        # Skill count is drawn first, then the subset (5 to 8 skills).
	        skill_count = rng.randint(5, 9)
	        # ``tolist()`` converts NumPy string scalars to plain ``str``.
	        skills_subset = rng.choice(domain_skills, size=skill_count, replace=False).tolist()

	        profiles.append({
	            "profile_id": f"cand_{i+1}_{domain_name.lower()}",
	            "headline": f"{level} {domain_name} Engineer",
	            "summary": (
	                f"Dedicated {level} {domain_name} professional with a proven track record "
	                "of building scalable systems and working in agile environments. "
	                "Passionate about clean code and modern architectures."
	            ),
	            "skills": skills_subset,
	            "experience": [
	                f"{level} Engineer at TechCorp: Spearheaded the migration to cloud infrastructure and improved system performance by 40%.",
	                "Software Developer at Startup Inc: Developed RESTful APIs and collaborated with the frontend team to deliver features.",
	                "Intern at Legacy Systems: Assisted in maintaining codebases and writing unit tests.",
	            ],
	        })
	    return profiles

	def _elapsed_ms(func, payload):
	    """Call ``func(payload)`` and return how long the call took, in milliseconds."""
	    began = time.perf_counter()
	    func(payload)
	    return (time.perf_counter() - began) * 1000


	def _format_row(label, samples_ms, label_width):
	    """Format one results-table row with the mean and std dev of ``samples_ms``."""
	    return f"{label:<{label_width}} | {np.mean(samples_ms):<20.2f} | {np.std(samples_ms):<15.2f}"


	def profile_entities_scaled():
	    """Benchmark embedding latency per profile entity and save a text report.

	    Builds 50 synthetic profiles, then for each one times four calls:
	    ``generate_embedding`` on the headline and on the summary, and
	    ``generate_list_embedding`` on the skills list and on the experience
	    list. The mean and standard deviation (``np.std`` default, i.e.
	    population) per entity, the mean total per profile and the resulting
	    throughput are printed and written to
	    ``entity_benchmark_scaled_results.txt`` in the current working directory.

	    NOTE(review): no warm-up call precedes the timed loop. If the embedder
	    initialises lazily, the first profile's latency would include that
	    one-off cost and inflate the mean/std dev. Confirm against
	    ``local_embedder``.
	    """
	    num_samples = 50
	    profiles = generate_structured_profiles(num_samples)

	    print(f"\n🚀 Starting Entity-to-Embedding Efficiency Benchmark (N={num_samples})...")

	    # Per-profile latencies in milliseconds, one list per entity type.
	    headline_times = []
	    summary_times = []
	    skills_times = []
	    exp_times = []
	    total_times = []

	    for i, p in enumerate(profiles):
	        # perf_counter is monotonic and high-resolution, unlike time.time(),
	        # which can jump when the system clock is adjusted.
	        start_total = time.perf_counter()

	        # 1. Profile Headline
	        headline_times.append(_elapsed_ms(generate_embedding, p["headline"]))

	        # 2. Profile Summary
	        summary_times.append(_elapsed_ms(generate_embedding, p["summary"]))

	        # 3. Profile Skills (Batch)
	        skills_times.append(_elapsed_ms(generate_list_embedding, p["skills"]))

	        # 4. Profile Experience (Batch)
	        exp_times.append(_elapsed_ms(generate_list_embedding, p["experience"]))

	        # Total for the whole profile, including bookkeeping between calls.
	        total_times.append((time.perf_counter() - start_total) * 1000)

	        if (i + 1) % 10 == 0:
	            print(f" -> Processed {i + 1}/{num_samples} profiles...")

	    # Wide enough for the longest label ("Experience (List)", 17 chars) so
	    # every row lines up; the full row still fits under the 60-char rule.
	    label_width = 18
	    rule = "-" * 60
	    mean_total = np.mean(total_times)

	    results = [
	        "IRIS Entity-to-Embedding Efficiency Results (Scaled)",
	        f"Total Profiles Evaluated: {num_samples}",
	        rule,
	        f"{'Entity Type':<{label_width}} | {'Mean Latency (ms)':<20} | {'Std Dev (ms)':<15}",
	        rule,
	        _format_row("Headline", headline_times, label_width),
	        _format_row("Summary", summary_times, label_width),
	        _format_row("Skills (List)", skills_times, label_width),
	        _format_row("Experience (List)", exp_times, label_width),
	        rule,
	        f"MEAN TOTAL PER PROFILE: {mean_total:.2f} ms",
	        f"Average Throughput: {1000 / mean_total:.3f} profiles/sec",
	    ]

	    output_text = "\n".join(results)
	    print("\n" + output_text)

	    # Explicit encoding keeps the report independent of the platform default.
	    with open("entity_benchmark_scaled_results.txt", "w", encoding="utf-8") as f:
	        f.write(output_text)

	    print("\n📄 Results saved to 'entity_benchmark_scaled_results.txt'.")

	# Script entry point: run the benchmark only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    profile_entities_scaled()