File size: 5,436 Bytes
db4f25a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae024d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db4f25a
 
ae024d5
 
 
db4f25a
 
ae024d5
 
 
 
 
 
 
 
db4f25a
 
ae024d5
 
 
 
 
db4f25a
 
 
 
 
 
 
 
 
 
 
 
ae024d5
db4f25a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import json
import random
import os
import argparse
from typing import List, Dict, Any

def load_json(file_path: str) -> List[Dict[str, Any]]:
    """Parse the UTF-8 encoded JSON file at *file_path* and return its content.

    The return annotation reflects how the caller in this file uses the
    result (a list of mentor dicts); the parsed content itself is not
    validated here.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Vietnamese goal-sentence templates; {skills} and {domains} are filled in per mentee.
# Built once at import time instead of on every loop iteration.
_GOAL_TEMPLATES = [
    "Tôi muốn học về {skills}.",
    "Em đang tìm mentor có kinh nghiệm về {skills} trong lĩnh vực {domains}.",
    "Mình cần người hướng dẫn về {skills} để phát triển sự nghiệp.",
    "Tôi muốn nâng cao kỹ năng {skills}.",
    "Em là sinh viên muốn tìm hiểu về {skills} và {domains}.",
    "Mình đang làm việc trong ngành {domains} và muốn học thêm về {skills}.",
    "Cần tìm mentor giỏi về {skills}.",
    "Mục tiêu của tôi là trở thành chuyên gia về {skills}.",
    "Em muốn chuyển ngành sang {domains} nên cần học về {skills}.",
]

# Text substituted for {domains} when the mentee has no domains.
_DEFAULT_DOMAIN_TEXT = "công nghệ"


def _name_lookup(mentors: List[Dict[str, Any]], field: str) -> Dict[Any, Any]:
    """Map id -> name over every dict entry stored under *field* across all mentors."""
    names = {}
    for mentor in mentors:
        for entry in mentor.get(field) or []:
            # Skip non-dict or incomplete entries instead of raising KeyError.
            if isinstance(entry, dict) and 'id' in entry and 'name' in entry:
                names[entry['id']] = entry['name']
    return names


def _rank_relevant_mentors(
    mentors: List[Dict[str, Any]],
    skill_ids: List[Any],
    domain_ids: List[Any],
    min_skill_match: float,
) -> List[Any]:
    """Return the ids of mentors matching the query, best skill coverage first."""
    wanted_skills = set(skill_ids)
    wanted_domains = set(domain_ids)
    if not wanted_skills:
        return []

    scored = []
    for mentor in mentors:
        # Domain filter applies only when the mentee requested domains.
        if wanted_domains and wanted_domains.isdisjoint(mentor.get('domain_ids') or []):
            continue
        shared = wanted_skills.intersection(mentor.get('skill_ids') or [])
        # Coverage is measured against the unique requested skills so that
        # duplicate ids in the selection cannot deflate the score.
        coverage = len(shared) / len(wanted_skills)
        if coverage >= min_skill_match:
            scored.append((coverage, mentor['mentor_id']))

    # list.sort is stable, so mentors with equal coverage keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [mentor_id for _, mentor_id in scored]


def generate_synthetic_data(
    mentors: List[Dict[str, Any]],
    num_mentees: int = 50,
    *,
    max_ground_truth: int = 20,
    min_skill_match: float = 0.5,
) -> List[Dict[str, Any]]:
    """Generate synthetic mentee records with ground-truth mentor matches.

    Each mentee is modelled on a randomly chosen mentor: it requests 1-3 of
    that mentor's ``skill_ids`` and, when the mentor has any, 1-2 of its
    ``domain_ids``, and inherits the mentor's ``career_id``. A Vietnamese
    goal sentence is rendered from the names of the selected skills/domains
    (falling back to the raw id when no name is known).

    A mentor counts as relevant for a mentee when it shares at least one
    requested domain (only checked if domains were requested) and covers at
    least ``min_skill_match`` of the requested skills.

    Args:
        mentors: Mentor dicts. Keys read here: ``mentor_id``, ``skill_ids``,
            ``domain_ids``, ``career_id`` and, for display names, ``skills``
            and ``domains`` (lists of ``{"id": ..., "name": ...}`` dicts).
        num_mentees: Number of mentees to generate.
        max_ground_truth: Maximum number of mentor ids kept per mentee,
            taken after ranking by skill coverage (highest first).
        min_skill_match: Minimum fraction of requested skills a mentor must
            cover to be relevant.

    Returns:
        A list of ``num_mentees`` dicts with keys ``mentee_id`` (1-based,
        consecutive), ``goals``, ``career_id``, ``domain_ids``,
        ``skill_ids`` and ``ground_truth_mentors``. The list is empty when
        no mentor has any ``skill_ids``.

    Raises:
        KeyError: If a relevant mentor has no ``mentor_id`` key.
    """
    # id -> display name, used only to render the goal sentence.
    all_skills = _name_lookup(mentors, 'skills')
    all_domains = _name_lookup(mentors, 'domains')

    print(f"Generating {num_mentees} synthetic mentees...")

    # Only mentors with skills can seed a mentee. Sampling from this subset
    # (rather than skipping iterations) yields exactly num_mentees records
    # with gap-free ids, and an empty input no longer crashes random.choice.
    eligible_mentors = [m for m in mentors if m.get('skill_ids')]
    if not eligible_mentors:
        return []

    synthetic_mentees = []
    for mentee_id in range(1, num_mentees + 1):
        # Base the mentee on a real mentor so at least one match exists.
        target_mentor = random.choice(eligible_mentors)
        mentor_skills = target_mentor['skill_ids']
        mentor_domains = target_mentor.get('domain_ids') or []

        # Randomly pick a subset of skills/domains to form the query.
        num_skills = random.randint(1, min(len(mentor_skills), 3))
        selected_skills = random.sample(mentor_skills, num_skills)

        if mentor_domains:
            num_domains = random.randint(1, min(len(mentor_domains), 2))
            selected_domains = random.sample(mentor_domains, num_domains)
        else:
            selected_domains = []

        # Render the goal sentence from names, falling back to the raw id.
        skill_names = [all_skills.get(sid, str(sid)) for sid in selected_skills]
        domain_names = [all_domains.get(did, str(did)) for did in selected_domains]
        template = random.choice(_GOAL_TEMPLATES)
        goals = template.format(
            skills=", ".join(skill_names),
            domains=", ".join(domain_names) if domain_names else _DEFAULT_DOMAIN_TEXT,
        )

        # Rank before truncating so the cap keeps the best matches rather
        # than whichever mentors happen to come first in the input.
        ranked_ids = _rank_relevant_mentors(
            mentors, selected_skills, selected_domains, min_skill_match
        )

        synthetic_mentees.append({
            "mentee_id": mentee_id,
            "goals": goals,
            "career_id": target_mentor.get('career_id'),
            "domain_ids": selected_domains,
            "skill_ids": selected_skills,
            "ground_truth_mentors": ranked_ids[:max_ground_truth],
        })

    return synthetic_mentees

def main():
    """Command-line entry point.

    Parses ``mentors_file``, ``output_file`` and the optional ``--count``
    (default 50), loads the mentors, generates the synthetic mentees and
    writes them to ``output_file`` as indented UTF-8 JSON, printing progress
    along the way.
    """
    cli = argparse.ArgumentParser(description="Generate synthetic evaluation data")
    cli.add_argument("mentors_file", help="Path to input mentors JSON file")
    cli.add_argument("output_file", help="Path to output evaluation JSON file")
    cli.add_argument("--count", type=int, default=50, help="Number of mentees to generate")
    options = cli.parse_args()

    source_path = options.mentors_file
    target_path = options.output_file

    print(f"Loading mentors from {source_path}...")
    mentor_records = load_json(source_path)

    mentees = generate_synthetic_data(mentor_records, options.count)

    print(f"Writing {len(mentees)} mentees to {target_path}...")
    # ensure_ascii=False keeps the Vietnamese goal text readable in the output file.
    with open(target_path, mode='w', encoding='utf-8') as sink:
        json.dump(mentees, sink, ensure_ascii=False, indent=2)
    print("Done.")

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()