File size: 4,785 Bytes
4b3a33f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import sys
import os
import time
import numpy as np
import json

# Put the directory three levels above this file on sys.path so the `backend` package is importable
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))

from backend.src.embeddings.local_embedder import generate_embedding, generate_list_embedding

def generate_structured_profiles(num_samples=50):
    """Generate deterministic synthetic resumes split into entity fields.

    Every profile is a dict with the keys ``profile_id``, ``headline``,
    ``summary``, ``skills`` (5 to 8 distinct strings drawn from the domain's
    skill pool) and ``experience`` (three sentences).

    Args:
        num_samples: Number of profiles to build.

    Returns:
        A list of ``num_samples`` profile dicts. Repeated calls with the same
        ``num_samples`` return equal data, and the global NumPy random state
        is left untouched.
    """
    print(f"Generating {num_samples} structured synthetic profiles...")

    domains = [
        ("Frontend", ["React", "JavaScript", "Tailwind", "CSS", "HTML", "Redux", "TypeScript", "Jest"]),
        ("Backend", ["Python", "FastAPI", "PostgreSQL", "Docker", "AWS", "Linux", "Redis", "Kafka"]),
        ("Data Science", ["Python", "Pandas", "PyTorch", "SQL", "Machine Learning", "NLP", "TensorFlow", "R"]),
        ("DevOps", ["Kubernetes", "Docker", "Terraform", "CI/CD", "Jenkins", "AWS", "Bash", "Ansible"]),
        ("Mobile", ["Swift", "Kotlin", "React Native", "Flutter", "iOS", "Android", "Firebase", "SQLite"]),
    ]
    levels = ["Junior", "Mid-Level", "Senior", "Lead", "Principal"]

    profiles = []
    for i in range(num_samples):
        domain_name, domain_skills = domains[i % len(domains)]
        # Step the level once per full pass over the domains. Indexing both
        # lists with ``i % 5`` would pin each domain to a single level and
        # yield only five distinct headline/summary texts.
        level = levels[(i // len(domains)) % len(levels)]

        # A private generator seeded per profile keeps the output reproducible
        # without reseeding the process-wide NumPy RNG. RandomState(i) yields
        # the same draws as the legacy ``np.random.seed(i)`` global calls.
        rng = np.random.RandomState(i)
        skill_count = rng.randint(5, 9)  # upper bound is exclusive: 5..8
        drawn = rng.choice(domain_skills, size=skill_count, replace=False)
        # Convert numpy.str_ items to builtin str for downstream consumers.
        skills_subset = [str(skill) for skill in drawn]

        profile = {
            "profile_id": f"cand_{i+1}_{domain_name.lower()}",
            "headline": f"{level} {domain_name} Engineer",
            "summary": f"Dedicated {level} {domain_name} professional with a proven track record of building scalable systems and working in agile environments. Passionate about clean code and modern architectures.",
            "skills": skills_subset,
            "experience": [
                f"{level} Engineer at TechCorp: Spearheaded the migration to cloud infrastructure and improved system performance by 40%.",
                "Software Developer at Startup Inc: Developed RESTful APIs and collaborated with the frontend team to deliver features.",
                "Intern at Legacy Systems: Assisted in maintaining codebases and writing unit tests.",
            ],
        }
        profiles.append(profile)
    return profiles

def _elapsed_ms(func, payload):
    """Call ``func(payload)`` and return how long it took, in milliseconds."""
    started = time.perf_counter()
    func(payload)
    return (time.perf_counter() - started) * 1000


def profile_entities_scaled(num_samples=50, output_path="entity_benchmark_scaled_results.txt"):
    """Benchmark embedding latency per profile entity and save a text report.

    For each synthetic profile the headline and summary are embedded with
    ``generate_embedding`` and the skills and experience lists with
    ``generate_list_embedding``. Per-entity and per-profile wall-clock times
    are collected, summarised as mean / standard deviation, printed, and
    written to ``output_path``.

    Args:
        num_samples: Number of synthetic profiles to benchmark (at least 1).
        output_path: File the report is written to; a relative path resolves
            against the current working directory.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1, since no statistics
            can be computed from an empty run.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    profiles = generate_structured_profiles(num_samples)

    print(f"\n🚀 Starting Entity-to-Embedding Efficiency Benchmark (N={num_samples})...")

    # (report label, profile key, embedding function), in measurement order.
    stages = (
        ("Headline", "headline", generate_embedding),
        ("Summary", "summary", generate_embedding),
        ("Skills (List)", "skills", generate_list_embedding),
        ("Experience (List)", "experience", generate_list_embedding),
    )
    timings = {label: [] for label, _, _ in stages}
    total_times = []

    # NOTE(review): if the embedder loads its model lazily, the first profile's
    # timings include that one-off cost -- consider a warm-up call; confirm
    # against local_embedder.
    for done, profile in enumerate(profiles, start=1):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_total = time.perf_counter()
        for label, field, embed in stages:
            timings[label].append(_elapsed_ms(embed, profile[field]))
        total_times.append((time.perf_counter() - start_total) * 1000)

        if done % 10 == 0:
            print(f"  -> Processed {done}/{num_samples} profiles...")

    # Size the first column to the longest label so every row lines up; a
    # fixed width of 15 is too narrow for "Experience (List)".
    label_width = max(len("Entity Type"), *(len(label) for label, _, _ in stages))
    rule = "-" * 60

    mean_total = float(np.mean(total_times))
    # Guard against a zero mean on clocks too coarse to register the work.
    throughput = 1000 / mean_total if mean_total > 0 else float("inf")

    results = [
        "IRIS Entity-to-Embedding Efficiency Results (Scaled)",
        f"Total Profiles Evaluated: {num_samples}",
        rule,
        f"{'Entity Type':<{label_width}} | {'Mean Latency (ms)':<20} | {'Std Dev (ms)':<15}",
        rule,
    ]
    results.extend(
        f"{label:<{label_width}} | {np.mean(timings[label]):<20.2f} | {np.std(timings[label]):<15.2f}"
        for label, _, _ in stages
    )
    results.extend([
        rule,
        f"MEAN TOTAL PER PROFILE: {mean_total:.2f} ms",
        f"Average Throughput:     {throughput:.3f} profiles/sec",
    ])

    output_text = "\n".join(results)
    print("\n" + output_text)

    # Explicit encoding keeps the report identical across platforms.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output_text)

    print(f"\n📄 Results saved to '{output_path}'.")

# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    profile_entities_scaled()