Spaces:
Sleeping
Sleeping
| import json | |
| import random | |
| import os | |
| import argparse | |
| from typing import List, Dict, Any | |
def load_json(file_path: str) -> List[Dict[str, Any]]:
    """Parse the JSON document stored at ``file_path``.

    The file is opened as UTF-8 text, read in full, and decoded with the
    standard ``json`` module.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever value the document decodes to. The annotation says "list of
        dicts", but nothing here validates that shape.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
# Vietnamese goal sentences; ``{skills}`` and ``{domains}`` are filled in per mentee.
_GOAL_TEMPLATES = (
    "Tôi muốn học về {skills}.",
    "Em đang tìm mentor có kinh nghiệm về {skills} trong lĩnh vực {domains}.",
    "Mình cần người hướng dẫn về {skills} để phát triển sự nghiệp.",
    "Tôi muốn nâng cao kỹ năng {skills}.",
    "Em là sinh viên muốn tìm hiểu về {skills} và {domains}.",
    "Mình đang làm việc trong ngành {domains} và muốn học thêm về {skills}.",
    "Cần tìm mentor giỏi về {skills}.",
    "Mục tiêu của tôi là trở thành chuyên gia về {skills}.",
    "Em muốn chuyển ngành sang {domains} nên cần học về {skills}.",
)

# Text substituted for ``{domains}`` when the mentee has no selected domain.
_DEFAULT_DOMAIN_LABEL = "công nghệ"

# A mentor is relevant when it covers at least this fraction of the requested skills.
_MIN_SKILL_MATCH = 0.5

# Upper bound on the number of ground-truth mentor ids stored per mentee.
_MAX_GROUND_TRUTH = 20


def _rank_relevant_mentors(
    mentors: List[Dict[str, Any]],
    selected_skills: List[Any],
    selected_domains: List[Any],
) -> List[Any]:
    """Return the ids of relevant mentors, best skill coverage first.

    A mentor is relevant when it shares at least one domain with the mentee
    (only checked when the mentee has domains) and covers at least
    ``_MIN_SKILL_MATCH`` of the requested skills. At most
    ``_MAX_GROUND_TRUTH`` ids are returned.
    """
    if not selected_skills:
        return []

    wanted_skills = set(selected_skills)
    wanted_domains = set(selected_domains)
    scored = []
    for mentor in mentors:
        # Domain filter applies only when the mentee actually asked for domains.
        if wanted_domains and wanted_domains.isdisjoint(mentor.get('domain_ids') or []):
            continue
        covered = wanted_skills.intersection(mentor.get('skill_ids') or [])
        score = len(covered) / len(selected_skills)
        if score >= _MIN_SKILL_MATCH:
            scored.append((score, mentor['mentor_id']))

    # Rank before truncating so the cut keeps the strongest matches rather
    # than whichever mentors happen to come first in the input. The sort is
    # stable, so equally scored mentors keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [mentor_id for _, mentor_id in scored[:_MAX_GROUND_TRUTH]]


def generate_synthetic_data(mentors: List[Dict[str, Any]], num_mentees: int = 50) -> List[Dict[str, Any]]:
    """Build synthetic mentee queries with ground-truth mentor matches.

    Each mentee is modelled on a randomly chosen mentor: a random subset of
    that mentor's ``skill_ids`` (1 to 3) and ``domain_ids`` (1 to 2, or none
    when the mentor has no domains) becomes the mentee's request, and a
    Vietnamese goal sentence is rendered from it. Only mentors with a
    non-empty ``skill_ids`` list are used as models, so exactly
    ``num_mentees`` records are produced whenever at least one such mentor
    exists.

    Args:
        mentors: Mentor records. The code reads ``skill_ids``, ``domain_ids``,
            ``career_id`` and ``mentor_id``, plus optional ``skills`` and
            ``domains`` lists of ``{"id", "name"}`` dicts used to turn ids
            into display names.
        num_mentees: Number of mentee records to generate.

    Returns:
        A list of dicts with keys ``mentee_id`` (1-based, consecutive),
        ``goals``, ``career_id``, ``domain_ids``, ``skill_ids`` and
        ``ground_truth_mentors`` (relevant mentor ids, best skill coverage
        first, capped at 20). The list is empty when no mentor has skills.

    Raises:
        KeyError: If a relevant mentor has no ``mentor_id``, or a dict in
            ``skills`` / ``domains`` has no ``id`` / ``name``.
    """
    # id -> display name lookups, gathered across every mentor.
    all_skills = {}
    all_domains = {}
    for m in mentors:
        for s in m.get('skills', []):
            if isinstance(s, dict):
                all_skills[s['id']] = s['name']
        for d in m.get('domains', []):
            if isinstance(d, dict):
                all_domains[d['id']] = d['name']

    print(f"Generating {num_mentees} synthetic mentees...")

    # A mentor without skills cannot seed a query. Filtering up front (instead
    # of skipping inside the loop) keeps the requested count and avoids
    # random.choice() failing on an empty list.
    eligible = [m for m in mentors if m.get('skill_ids')]
    if not eligible:
        print("No mentors with skill_ids found; nothing to generate.")
        return []

    synthetic_mentees = []
    for mentee_id in range(1, num_mentees + 1):
        # Model the mentee on a random mentor so at least one match exists.
        target_mentor = random.choice(eligible)
        mentor_skills = target_mentor['skill_ids']
        mentor_domains = target_mentor.get('domain_ids') or []

        num_skills = random.randint(1, min(len(mentor_skills), 3))
        selected_skills = random.sample(mentor_skills, num_skills)
        if mentor_domains:
            num_domains = random.randint(1, min(len(mentor_domains), 2))
            selected_domains = random.sample(mentor_domains, num_domains)
        else:
            selected_domains = []

        # Unknown ids fall back to their string form in the goal sentence.
        skill_names = [all_skills.get(sid, str(sid)) for sid in selected_skills]
        domain_names = [all_domains.get(did, str(did)) for did in selected_domains]
        goals = random.choice(_GOAL_TEMPLATES).format(
            skills=", ".join(skill_names),
            domains=", ".join(domain_names) if domain_names else _DEFAULT_DOMAIN_LABEL,
        )

        synthetic_mentees.append({
            "mentee_id": mentee_id,
            "goals": goals,
            "career_id": target_mentor.get('career_id'),
            "domain_ids": selected_domains,
            "skill_ids": selected_skills,
            "ground_truth_mentors": _rank_relevant_mentors(
                mentors, selected_skills, selected_domains
            ),
        })

    return synthetic_mentees
def main():
    """Command-line entry point.

    Parses two positional arguments (input mentors JSON, output JSON) and an
    optional ``--count`` (default 50), generates that many synthetic mentees
    from the loaded mentors, and writes them to the output file as indented,
    non-ASCII-preserving UTF-8 JSON. Progress is reported on stdout.
    """
    cli = argparse.ArgumentParser(description="Generate synthetic evaluation data")
    cli.add_argument("mentors_file", help="Path to input mentors JSON file")
    cli.add_argument("output_file", help="Path to output evaluation JSON file")
    cli.add_argument("--count", type=int, default=50, help="Number of mentees to generate")
    options = cli.parse_args()

    source_path = options.mentors_file
    target_path = options.output_file

    print(f"Loading mentors from {source_path}...")
    mentor_records = load_json(source_path)
    mentees = generate_synthetic_data(mentor_records, options.count)

    print(f"Writing {len(mentees)} mentees to {target_path}...")
    with open(target_path, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps the Vietnamese text readable in the file.
        json.dump(mentees, sink, ensure_ascii=False, indent=2)
    print("Done.")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()