# File: iris_backend/backend/src/embeddings/profile_entities_bench.py
# Repository page header (author avatar: Saandraahh; latest commit: "Implemented clustering", 4b3a33f)
import sys
import os
import time
import numpy as np
import json
# Add backend to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
from backend.src.embeddings.local_embedder import generate_embedding, generate_list_embedding
def generate_structured_profiles(num_samples=50):
    """Generate synthetic resumes split into specific entity fields.

    Each profile is a dict with the keys ``profile_id``, ``headline``,
    ``summary``, ``skills`` (5 to 8 distinct skills drawn from the profile's
    domain, as plain ``str`` objects) and ``experience`` (three sentences).

    Generation is deterministic: profile ``i`` draws its skills from a private
    generator seeded with ``i``, so repeated calls return identical data and
    the process-wide NumPy random state is left untouched.

    Args:
        num_samples: Number of profiles to generate. Values below 1 yield an
            empty list.

    Returns:
        A list of ``num_samples`` profile dicts.
    """
    print(f"Generating {num_samples} structured synthetic profiles...")

    domains = [
        ("Frontend", ["React", "JavaScript", "Tailwind", "CSS", "HTML", "Redux", "TypeScript", "Jest"]),
        ("Backend", ["Python", "FastAPI", "PostgreSQL", "Docker", "AWS", "Linux", "Redis", "Kafka"]),
        ("Data Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning", "NLP", "TensorFlow", "R"]),
        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "Jenkins", "AWS", "Bash", "Ansible"]),
        ("Mobile", ["Swift", "Kotlin", "React Native", "Flutter", "iOS", "Android", "Firebase", "SQLite"]),
    ]
    levels = ["Junior", "Mid-Level", "Senior", "Lead", "Principal"]

    profiles = []
    for i in range(num_samples):
        domain_name, domain_skills = domains[i % len(domains)]
        # Advance the level only once per full pass over the domains. Indexing
        # both lists with ``i % 5`` would lock each domain to a single level
        # (only 5 of the 25 possible combinations would ever be produced).
        level = levels[(i // len(domains)) % len(levels)]

        # A per-profile legacy generator reproduces the draw sequence of
        # ``np.random.seed(i)`` without clobbering the global RNG state.
        rng = np.random.RandomState(i)
        # Randomize skills count slightly per profile (5 to 8 skills).
        skill_count = rng.randint(5, 9)
        # ``tolist()`` converts numpy.str_ items into plain Python strings.
        skills_subset = rng.choice(domain_skills, size=skill_count, replace=False).tolist()

        profile = {
            "profile_id": f"cand_{i+1}_{domain_name.lower()}",
            "headline": f"{level} {domain_name} Engineer",
            "summary": f"Dedicated {level} {domain_name} professional with a proven track record of building scalable systems and working in agile environments. Passionate about clean code and modern architectures.",
            "skills": skills_subset,
            "experience": [
                f"{level} Engineer at TechCorp: Spearheaded the migration to cloud infrastructure and improved system performance by 40%.",
                "Software Developer at Startup Inc: Developed RESTful APIs and collaborated with the frontend team to deliver features.",
                "Intern at Legacy Systems: Assisted in maintaining codebases and writing unit tests.",
            ],
        }
        profiles.append(profile)

    return profiles
def _timed_ms(func, payload):
    """Call ``func(payload)`` and return the elapsed time in milliseconds."""
    started = time.perf_counter()
    func(payload)
    return (time.perf_counter() - started) * 1000


def profile_entities_scaled(num_samples=50, output_path="entity_benchmark_scaled_results.txt"):
    """Benchmark per-entity embedding latency over synthetic profiles.

    For every generated profile the headline and summary are passed to
    ``generate_embedding`` and the skills and experience lists to
    ``generate_list_embedding``; each call is timed separately, along with
    the total time per profile. The mean and standard deviation per entity
    type, the mean total and the derived throughput are printed and written
    to ``output_path``.

    Args:
        num_samples: Number of synthetic profiles to evaluate (at least 1).
        output_path: File that receives the plain-text results table.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1, since no statistics
            can be computed from an empty sample.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    profiles = generate_structured_profiles(num_samples)
    # Emoji are written as \N{...} escapes so the source stays pure ASCII and
    # cannot be mangled by a mis-decoded copy of this file.
    print(f"\n\N{ROCKET} Starting Entity-to-Embedding Efficiency Benchmark (N={num_samples})...")

    # Tracking arrays
    headline_times = []
    summary_times = []
    skills_times = []
    exp_times = []
    total_times = []

    # NOTE(review): there is no warm-up pass, so any one-off initialisation
    # cost inside the embedder (if it has one) lands on the first headline
    # sample -- confirm whether a warm-up call is wanted.
    for i, p in enumerate(profiles):
        # perf_counter is the clock intended for measuring short durations;
        # time.time() can jump when the system clock is adjusted.
        start_total = time.perf_counter()

        # 1. Profile Headline
        headline_times.append(_timed_ms(generate_embedding, p["headline"]))
        # 2. Profile Summary
        summary_times.append(_timed_ms(generate_embedding, p["summary"]))
        # 3. Profile Skills (Batch)
        skills_times.append(_timed_ms(generate_list_embedding, p["skills"]))
        # 4. Profile Experience (Batch)
        exp_times.append(_timed_ms(generate_list_embedding, p["experience"]))

        # Total
        total_times.append((time.perf_counter() - start_total) * 1000)

        if (i + 1) % 10 == 0:
            print(f" -> Processed {i + 1}/{num_samples} profiles...")

    # Calculate statistics
    entity_samples = [
        ("Headline", headline_times),
        ("Summary", summary_times),
        ("Skills (List)", skills_times),
        ("Experience (List)", exp_times),
    ]
    # The label column must fit the longest label (17 characters); a narrower
    # width pushes that row's separators out of line with the others.
    label_width = max(len(label) for label, _ in entity_samples)
    rule = "-" * 60
    mean_total = np.mean(total_times)

    results = [
        "IRIS Entity-to-Embedding Efficiency Results (Scaled)",
        f"Total Profiles Evaluated: {num_samples}",
        rule,
        f"{'Entity Type':<{label_width}} | {'Mean Latency (ms)':<20} | {'Std Dev (ms)':<15}",
        rule,
        *[
            f"{label:<{label_width}} | {np.mean(samples):<20.2f} | {np.std(samples):<15.2f}"
            for label, samples in entity_samples
        ],
        rule,
        f"MEAN TOTAL PER PROFILE: {mean_total:.2f} ms",
        f"Average Throughput: {1000 / mean_total:.3f} profiles/sec",
    ]

    output_text = "\n".join(results)
    print("\n" + output_text)

    # Explicit encoding keeps the report identical across platforms.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output_text)
    print(f"\n\N{PAGE FACING UP} Results saved to '{output_path}'.")
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    profile_entities_scaled()