"""Benchmark per-entity embedding latency over synthetic candidate profiles.

Generates structured synthetic resumes (headline, summary, skills,
experience), embeds each field with the project's local embedder, and
reports mean / standard-deviation latency per field plus overall throughput.
"""
import sys
import os
import time
import json
from typing import Any, Callable, Dict, List

import numpy as np

# Add backend to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

# The project import must come after the sys.path tweak above.
from backend.src.embeddings.local_embedder import generate_embedding, generate_list_embedding  # noqa: E402

# (domain name, skill pool) pairs that profiles are drawn from.
_DOMAINS = [
    ("Frontend", ["React", "JavaScript", "Tailwind", "CSS", "HTML", "Redux", "TypeScript", "Jest"]),
    ("Backend", ["Python", "FastAPI", "PostgreSQL", "Docker", "AWS", "Linux", "Redis", "Kafka"]),
    ("Data Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning", "NLP", "TensorFlow", "R"]),
    ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "Jenkins", "AWS", "Bash", "Ansible"]),
    ("Mobile", ["Swift", "Kotlin", "React Native", "Flutter", "iOS", "Android", "Firebase", "SQLite"]),
]
_LEVELS = ["Junior", "Mid-Level", "Senior", "Lead", "Principal"]

_RESULTS_FILE = "entity_benchmark_scaled_results.txt"

# Width of the first table column; sized so the longest label
# ("Experience (List)", 17 characters) fits without breaking alignment.
_LABEL_WIDTH = 18
_RULE = "-" * 60


def generate_structured_profiles(num_samples: int = 50) -> List[Dict[str, Any]]:
    """Generate synthetic resumes split into specific entity fields.

    Args:
        num_samples: Number of profiles to create. Values <= 0 yield an
            empty list.

    Returns:
        A list of dicts with the keys ``profile_id``, ``headline``,
        ``summary``, ``skills`` (5 to 8 plain strings) and ``experience``
        (3 strings). Output is deterministic: profile ``i`` is always built
        from the same seed.
    """
    print(f"Generating {num_samples} structured synthetic profiles...")

    profiles = []
    for i in range(num_samples):
        # NOTE(review): both lists have five entries and both are indexed by
        # i % 5, so each domain is always paired with the same level
        # (Frontend is always Junior, ...). Left as-is to keep the generated
        # data unchanged.
        domain_name, domain_skills = _DOMAINS[i % len(_DOMAINS)]
        level = _LEVELS[i % len(_LEVELS)]

        # A per-profile RandomState draws the same values as
        # np.random.seed(i) followed by the module-level functions, but does
        # not reseed NumPy's global RNG as a side effect.
        rng = np.random.RandomState(i)
        # Randomize skills count slightly per profile (5 to 8 skills).
        # The count is drawn before the choice, matching the original order.
        skill_count = rng.randint(5, 9)
        # .tolist() yields plain Python str instead of np.str_ scalars.
        skills_subset = rng.choice(domain_skills, size=skill_count, replace=False).tolist()

        profiles.append({
            "profile_id": f"cand_{i+1}_{domain_name.lower()}",
            "headline": f"{level} {domain_name} Engineer",
            "summary": (
                f"Dedicated {level} {domain_name} professional with a proven track record of "
                "building scalable systems and working in agile environments. "
                "Passionate about clean code and modern architectures."
            ),
            "skills": skills_subset,
            "experience": [
                f"{level} Engineer at TechCorp: Spearheaded the migration to cloud infrastructure and improved system performance by 40%.",
                "Software Developer at Startup Inc: Developed RESTful APIs and collaborated with the frontend team to deliver features.",
                "Intern at Legacy Systems: Assisted in maintaining codebases and writing unit tests.",
            ],
        })

    return profiles


def _timed_ms(embed_fn: Callable[[Any], Any], payload: Any) -> float:
    """Call ``embed_fn(payload)`` and return the elapsed time in milliseconds."""
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # with system clock adjustments and may be coarse on some platforms.
    start = time.perf_counter()
    embed_fn(payload)
    return (time.perf_counter() - start) * 1000


def _stat_row(label: str, samples: List[float]) -> str:
    """Format one table row: label, mean latency and standard deviation."""
    return f"{label:<{_LABEL_WIDTH}} | {np.mean(samples):<20.2f} | {np.std(samples):<15.2f}"


def profile_entities_scaled(
    num_samples: int = 50,
    output_path: str = _RESULTS_FILE,
    warmup_runs: int = 0,
) -> None:
    """Benchmark embedding latency for each profile entity and save a report.

    For every synthetic profile, the headline and summary are embedded with
    ``generate_embedding`` and the skills and experience lists with
    ``generate_list_embedding``. Per-entity and total latencies are
    collected, summarized, printed, and written to ``output_path``.

    Args:
        num_samples: Number of synthetic profiles to benchmark (>= 1).
        output_path: File the text report is written to (overwritten).
        warmup_runs: Number of untimed embedding passes to run first. The
            default of 0 times every call, including the first one.
            NOTE(review): if the embedder initializes lazily on first use
            (not visible from this file), the first timed sample will include
            that cost; set this to 1 or more to exclude it.

    Raises:
        ValueError: If ``num_samples`` is less than 1, since no statistics
            can be computed from an empty run.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be >= 1, got {num_samples}")

    profiles = generate_structured_profiles(num_samples)

    # Emoji are written as escapes so the source stays pure ASCII and cannot
    # be corrupted by a wrong file encoding (rocket, page-facing-up).
    print(f"\n\U0001F680 Starting Entity-to-Embedding Efficiency Benchmark (N={num_samples})...")

    for _ in range(warmup_runs):
        generate_embedding(profiles[0]["headline"])
        generate_list_embedding(profiles[0]["skills"])

    # Tracking arrays (all values in milliseconds)
    headline_times = []
    summary_times = []
    skills_times = []
    exp_times = []
    total_times = []

    for i, p in enumerate(profiles):
        start_total = time.perf_counter()

        # 1. Profile Headline
        headline_times.append(_timed_ms(generate_embedding, p["headline"]))
        # 2. Profile Summary
        summary_times.append(_timed_ms(generate_embedding, p["summary"]))
        # 3. Profile Skills (Batch)
        skills_times.append(_timed_ms(generate_list_embedding, p["skills"]))
        # 4. Profile Experience (Batch)
        exp_times.append(_timed_ms(generate_list_embedding, p["experience"]))

        # Total
        total_times.append((time.perf_counter() - start_total) * 1000)

        if (i + 1) % 10 == 0:
            print(f"  -> Processed {i + 1}/{num_samples} profiles...")

    # Calculate statistics
    mean_total = np.mean(total_times)
    # Guard against a zero mean (possible on a coarse timer with a trivial
    # embedder) instead of relying on NumPy's divide-by-zero warning.
    throughput = 1000 / mean_total if mean_total > 0 else float("inf")

    header = f"{'Entity Type':<{_LABEL_WIDTH}} | {'Mean Latency (ms)':<20} | {'Std Dev (ms)':<15}"
    results = [
        "IRIS Entity-to-Embedding Efficiency Results (Scaled)",
        f"Total Profiles Evaluated: {num_samples}",
        _RULE,
        header,
        _RULE,
        _stat_row("Headline", headline_times),
        _stat_row("Summary", summary_times),
        _stat_row("Skills (List)", skills_times),
        _stat_row("Experience (List)", exp_times),
        _RULE,
        f"MEAN TOTAL PER PROFILE: {mean_total:.2f} ms",
        f"Average Throughput: {throughput:.3f} profiles/sec",
    ]

    output_text = "\n".join(results)
    print("\n" + output_text)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output_text)
    print(f"\n\U0001F4C4 Results saved to '{output_path}'.")


if __name__ == "__main__":
    profile_entities_scaled()