# Page-export residue (not program code): Spaces status "Sleeping"; file size 5,436 bytes;
# revisions db4f25a / ae024d5. The viewer's line-number gutter (1-129) has been dropped.
import json
import random
import os
import argparse
from typing import List, Dict, Any
def load_json(file_path: str) -> List[Dict[str, Any]]:
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content.

    The return annotation says "list of dicts", but nothing is validated here:
    whatever top-level value the JSON document holds is returned as parsed.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
# Vietnamese goal sentences. Every template uses {skills}; some also use {domains}.
_GOAL_TEMPLATES = (
    "Tôi muốn học về {skills}.",
    "Em đang tìm mentor có kinh nghiệm về {skills} trong lĩnh vực {domains}.",
    "Mình cần người hướng dẫn về {skills} để phát triển sự nghiệp.",
    "Tôi muốn nâng cao kỹ năng {skills}.",
    "Em là sinh viên muốn tìm hiểu về {skills} và {domains}.",
    "Mình đang làm việc trong ngành {domains} và muốn học thêm về {skills}.",
    "Cần tìm mentor giỏi về {skills}.",
    "Mục tiêu của tôi là trở thành chuyên gia về {skills}.",
    "Em muốn chuyển ngành sang {domains} nên cần học về {skills}.",
)

# Domain wording substituted into a goal sentence when no domain was selected.
_DEFAULT_DOMAIN_LABEL = "công nghệ"


def _collect_names(mentors, field):
    """Map id -> name for every dict entry stored under ``field`` on any mentor."""
    names = {}
    for mentor in mentors:
        for entry in mentor.get(field) or []:
            if isinstance(entry, dict):
                names[entry['id']] = entry['name']
    return names


def _unique_ids(mentor, field):
    """Return the mentor's ids under ``field`` with duplicates dropped, order kept."""
    return list(dict.fromkeys(mentor.get(field) or []))


def _rank_relevant_mentors(profiles, wanted_skills, wanted_domains, min_skill_match):
    """Return ids of the mentors matching a request, best skill coverage first."""
    wanted_skill_set = set(wanted_skills)
    wanted_domain_set = set(wanted_domains)
    scored = []
    for mentor, skill_set, domain_set in profiles:
        # The domain filter only applies when the mentee asked for domains.
        if wanted_domain_set and not (domain_set & wanted_domain_set):
            continue
        coverage = len(skill_set & wanted_skill_set) / len(wanted_skill_set)
        if coverage >= min_skill_match:
            scored.append((coverage, mentor['mentor_id']))
    # list.sort is stable, so mentors with equal coverage keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [mentor_id for _, mentor_id in scored]


def generate_synthetic_data(
    mentors: List[Dict[str, Any]],
    num_mentees: int = 50,
    max_ground_truth: int = 20,
    min_skill_match: float = 0.5,
) -> List[Dict[str, Any]]:
    """Build synthetic mentee records, each with a list of relevant mentors.

    Every mentee is modelled on a randomly chosen mentor that has at least one
    skill id: it requests 1-3 of that mentor's skills and, when the mentor has
    domains, 1-2 of its domains. A mentor counts as relevant when it shares at
    least one requested domain (only checked if domains were requested) and
    covers at least ``min_skill_match`` of the requested skills.

    Args:
        mentors: Mentor dicts. Read keys: ``skill_ids``, ``domain_ids``,
            ``career_id``, ``mentor_id`` and, for display names, ``skills`` /
            ``domains`` (lists of dicts with ``id`` and ``name``).
        num_mentees: Number of mentee records to produce.
        max_ground_truth: Upper bound on ``ground_truth_mentors`` per mentee.
        min_skill_match: Minimum fraction of requested skills a mentor must
            cover to be considered relevant.

    Returns:
        A list of ``num_mentees`` dicts with keys ``mentee_id`` (1-based and
        contiguous), ``goals``, ``career_id``, ``domain_ids``, ``skill_ids``
        and ``ground_truth_mentors`` (ranked by skill coverage, highest first).
        The list is empty when no mentor has any skill id.

    Raises:
        ValueError: If ``max_ground_truth`` is negative.
        KeyError: If a relevant mentor has no ``mentor_id``, or a skill/domain
            dict lacks ``id`` or ``name``.
    """
    if max_ground_truth < 0:
        raise ValueError("max_ground_truth must be non-negative")

    skill_names_by_id = _collect_names(mentors, 'skills')
    domain_names_by_id = _collect_names(mentors, 'domains')

    # Build the per-mentor id sets once instead of once per generated mentee.
    profiles = []
    personas = []
    for mentor in mentors:
        skill_ids = _unique_ids(mentor, 'skill_ids')
        domain_ids = _unique_ids(mentor, 'domain_ids')
        profiles.append((mentor, set(skill_ids), set(domain_ids)))
        # Only mentors with skills can seed a mentee; sampling from these alone
        # guarantees the requested count and contiguous mentee ids.
        if skill_ids:
            personas.append((mentor, skill_ids, domain_ids))

    print(f"Generating {num_mentees} synthetic mentees...")
    if not personas:
        return []

    synthetic_mentees = []
    for index in range(num_mentees):
        target_mentor, mentor_skills, mentor_domains = random.choice(personas)

        num_skills = random.randint(1, min(len(mentor_skills), 3))
        selected_skills = random.sample(mentor_skills, num_skills)
        if mentor_domains:
            num_domains = random.randint(1, min(len(mentor_domains), 2))
            selected_domains = random.sample(mentor_domains, num_domains)
        else:
            selected_domains = []

        # Fall back to the raw id when no display name is known.
        skill_names = [skill_names_by_id.get(sid, str(sid)) for sid in selected_skills]
        domain_names = [domain_names_by_id.get(did, str(did)) for did in selected_domains]
        skills_str = ", ".join(skill_names)
        domains_str = ", ".join(domain_names) if domain_names else _DEFAULT_DOMAIN_LABEL
        goals = random.choice(_GOAL_TEMPLATES).format(skills=skills_str, domains=domains_str)

        # Rank before truncating so the best matches survive the cut.
        ranked = _rank_relevant_mentors(
            profiles, selected_skills, selected_domains, min_skill_match
        )

        synthetic_mentees.append({
            "mentee_id": index + 1,
            "goals": goals,
            "career_id": target_mentor.get('career_id'),
            "domain_ids": selected_domains,
            "skill_ids": selected_skills,
            "ground_truth_mentors": ranked[:max_ground_truth],
        })

    return synthetic_mentees
def main():
    """Command-line entry point.

    Parses two positional paths (input mentors JSON, output evaluation JSON)
    and an optional ``--count``, generates that many synthetic mentees from
    the loaded mentors, and writes them to the output path as indented,
    non-ASCII-preserving JSON.
    """
    cli = argparse.ArgumentParser(description="Generate synthetic evaluation data")
    cli.add_argument("mentors_file", help="Path to input mentors JSON file")
    cli.add_argument("output_file", help="Path to output evaluation JSON file")
    cli.add_argument("--count", type=int, default=50, help="Number of mentees to generate")
    options = cli.parse_args()

    source_path = options.mentors_file
    target_path = options.output_file

    print(f"Loading mentors from {source_path}...")
    mentor_records = load_json(source_path)
    mentee_records = generate_synthetic_data(mentor_records, options.count)

    print(f"Writing {len(mentee_records)} mentees to {target_path}...")
    with open(target_path, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps the Vietnamese goal text readable in the file.
        json.dump(mentee_records, sink, ensure_ascii=False, indent=2)
    print("Done.")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|