Spaces:
Sleeping
Sleeping
| import sys | |
| import os | |
| import time | |
| import numpy as np | |
| import json | |
| # Add backend to path | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) | |
| from backend.src.embeddings.local_embedder import generate_embedding, generate_list_embedding | |
def generate_structured_profiles(num_samples=50):
    """Generate synthetic resumes split into specific entity fields.

    Profiles cycle through five engineering domains and five seniority
    levels by index. Each profile receives a random subset of 5 to 8 skills
    taken from its domain's skill list. The subset is drawn from a generator
    seeded with the profile index, so repeated calls return the same data.

    Args:
        num_samples: Number of profiles to build. ``0`` yields an empty list.

    Returns:
        A list of dicts, one per profile, with the keys ``profile_id`` (str),
        ``headline`` (str), ``summary`` (str), ``skills`` (list of str) and
        ``experience`` (list of three str).
    """
    print(f"Generating {num_samples} structured synthetic profiles...")

    domains = [
        ("Frontend", ["React", "JavaScript", "Tailwind", "CSS", "HTML", "Redux", "TypeScript", "Jest"]),
        ("Backend", ["Python", "FastAPI", "PostgreSQL", "Docker", "AWS", "Linux", "Redis", "Kafka"]),
        ("Data Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning", "NLP", "TensorFlow", "R"]),
        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "Jenkins", "AWS", "Bash", "Ansible"]),
        ("Mobile", ["Swift", "Kotlin", "React Native", "Flutter", "iOS", "Android", "Firebase", "SQLite"]),
    ]
    levels = ["Junior", "Mid-Level", "Senior", "Lead", "Principal"]

    profiles = []
    for i in range(num_samples):
        # NOTE(review): both lists have five entries, so the same domain is
        # always paired with the same level (e.g. Frontend is always Junior).
        # Confirm whether that lack of variety is intended.
        domain_name, domain_skills = domains[i % len(domains)]
        level = levels[i % len(levels)]

        # A private generator seeded per profile keeps the draw reproducible
        # without reseeding NumPy's global RNG, which would silently change
        # the random numbers seen by whoever called this function.
        rng = np.random.RandomState(i)
        skill_count = rng.randint(5, 9)  # upper bound is exclusive: 5 to 8 skills
        # .tolist() converts the NumPy string scalars into plain Python str.
        skills_subset = rng.choice(domain_skills, size=skill_count, replace=False).tolist()

        profile = {
            "profile_id": f"cand_{i+1}_{domain_name.lower()}",
            "headline": f"{level} {domain_name} Engineer",
            "summary": f"Dedicated {level} {domain_name} professional with a proven track record of building scalable systems and working in agile environments. Passionate about clean code and modern architectures.",
            "skills": skills_subset,
            "experience": [
                f"{level} Engineer at TechCorp: Spearheaded the migration to cloud infrastructure and improved system performance by 40%.",
                "Software Developer at Startup Inc: Developed RESTful APIs and collaborated with the frontend team to deliver features.",
                "Intern at Legacy Systems: Assisted in maintaining codebases and writing unit tests.",
            ],
        }
        profiles.append(profile)
    return profiles
def _elapsed_ms(func, payload):
    """Call ``func(payload)`` and return how long it took, in milliseconds."""
    start = time.perf_counter()
    func(payload)
    return (time.perf_counter() - start) * 1000


def profile_entities_scaled(num_samples=50):
    """Benchmark embedding latency per profile entity and save a report.

    Builds ``num_samples`` synthetic profiles, then times one embedding call
    each for the headline, the summary, the skills list and the experience
    list of every profile. Mean and standard deviation per entity type, the
    mean total per profile and the resulting throughput are printed and
    written to ``entity_benchmark_scaled_results.txt`` in the current
    working directory.

    Args:
        num_samples: Number of synthetic profiles to evaluate. Defaults to
            50, the size the benchmark was originally hard-coded to.

    Raises:
        ValueError: If ``num_samples`` is less than 1; the statistics are
            undefined for an empty run.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    profiles = generate_structured_profiles(num_samples)
    print(f"\n๐ Starting Entity-to-Embedding Efficiency Benchmark (N={num_samples})...")

    # Per-entity latency samples in milliseconds, one entry per profile.
    summary_times = []
    headline_times = []
    skills_times = []
    exp_times = []
    total_times = []

    for i, p in enumerate(profiles):
        # perf_counter is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_total = time.perf_counter()

        # 1. Profile Headline
        headline_times.append(_elapsed_ms(generate_embedding, p["headline"]))
        # 2. Profile Summary
        summary_times.append(_elapsed_ms(generate_embedding, p["summary"]))
        # 3. Profile Skills (Batch)
        skills_times.append(_elapsed_ms(generate_list_embedding, p["skills"]))
        # 4. Profile Experience (Batch)
        exp_times.append(_elapsed_ms(generate_list_embedding, p["experience"]))

        # Total covers all four embedding calls for this profile.
        total_times.append((time.perf_counter() - start_total) * 1000)

        if (i + 1) % 10 == 0:
            print(f" -> Processed {i + 1}/{num_samples} profiles...")

    # Calculate statistics
    mean_total = np.mean(total_times)
    results = [
        "IRIS Entity-to-Embedding Efficiency Results (Scaled)",
        f"Total Profiles Evaluated: {num_samples}",
        "-" * 60,
        f"{'Entity Type':<15} | {'Mean Latency (ms)':<20} | {'Std Dev (ms)':<15}",
        "-" * 60,
        f"{'Headline':<15} | {np.mean(headline_times):<20.2f} | {np.std(headline_times):<15.2f}",
        f"{'Summary':<15} | {np.mean(summary_times):<20.2f} | {np.std(summary_times):<15.2f}",
        f"{'Skills (List)':<15} | {np.mean(skills_times):<20.2f} | {np.std(skills_times):<15.2f}",
        f"{'Experience (List)':<15}| {np.mean(exp_times):<20.2f} | {np.std(exp_times):<15.2f}",
        "-" * 60,
        f"MEAN TOTAL PER PROFILE: {mean_total:.2f} ms",
        # mean_total is in milliseconds, so 1000 / mean_total is profiles per second.
        f"Average Throughput: {1000 / mean_total:.3f} profiles/sec",
    ]
    output_text = "\n".join(results)
    print("\n" + output_text)

    # Explicit encoding keeps the report identical regardless of platform locale.
    with open("entity_benchmark_scaled_results.txt", "w", encoding="utf-8") as f:
        f.write(output_text)
    print("\n๐ Results saved to 'entity_benchmark_scaled_results.txt'.")
# Run the benchmark only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    profile_entities_scaled()