"""Generate synthetic mentee evaluation data from a mentors JSON file."""

import json
import random
import os
import argparse
from typing import List, Dict, Any

# Vietnamese goal templates. Every template uses {skills}; some also use
# {domains}. str.format ignores the unused keyword, so both are always passed.
_GOAL_TEMPLATES = [
    "Tôi muốn học về {skills}.",
    "Em đang tìm mentor có kinh nghiệm về {skills} trong lĩnh vực {domains}.",
    "Mình cần người hướng dẫn về {skills} để phát triển sự nghiệp.",
    "Tôi muốn nâng cao kỹ năng {skills}.",
    "Em là sinh viên muốn tìm hiểu về {skills} và {domains}.",
    "Mình đang làm việc trong ngành {domains} và muốn học thêm về {skills}.",
    "Cần tìm mentor giỏi về {skills}.",
    "Mục tiêu của tôi là trở thành chuyên gia về {skills}.",
    "Em muốn chuyển ngành sang {domains} nên cần học về {skills}.",
]

# Text substituted for {domains} when the mentee has no domain selected.
_DEFAULT_DOMAIN_LABEL = "công nghệ"

_MAX_SKILLS_PER_MENTEE = 3    # upper bound on sampled skills per mentee
_MAX_DOMAINS_PER_MENTEE = 2   # upper bound on sampled domains per mentee
_MIN_SKILL_MATCH = 0.5        # fraction of requested skills a mentor must cover
_MAX_GROUND_TRUTH = 20        # cap on the ground-truth list length


def load_json(file_path: str) -> List[Dict[str, Any]]:
    """Read *file_path* as UTF-8 JSON and return the parsed content."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _collect_names(mentors: List[Dict[str, Any]], key: str) -> Dict[Any, Any]:
    """Map ``id -> name`` over every dict entry found under *key* in *mentors*."""
    names = {}
    for mentor in mentors:
        for entry in mentor.get(key) or []:
            # Non-dict entries (e.g. bare ids) carry no name and are skipped.
            if isinstance(entry, dict):
                names[entry['id']] = entry['name']
    return names


def _rank_relevant_mentors(
    mentors: List[Dict[str, Any]],
    skill_ids: List[Any],
    domain_ids: List[Any],
) -> List[Any]:
    """Return the ids of the best-matching mentors, strongest match first.

    A mentor is relevant when it shares at least one domain with
    *domain_ids* (only checked if *domain_ids* is non-empty) and covers at
    least ``_MIN_SKILL_MATCH`` of *skill_ids*. At most ``_MAX_GROUND_TRUTH``
    ids are returned.
    """
    if not skill_ids:
        return []

    # Built once per mentee instead of once per mentor.
    wanted_skills = set(skill_ids)
    wanted_domains = set(domain_ids)

    scored = []
    for mentor in mentors:
        if wanted_domains and wanted_domains.isdisjoint(mentor.get('domain_ids') or []):
            continue  # no domain in common
        covered = wanted_skills.intersection(mentor.get('skill_ids') or [])
        score = len(covered) / len(skill_ids)
        if score >= _MIN_SKILL_MATCH:
            scored.append((score, mentor['mentor_id']))

    # Sort before truncating so the cap drops the weakest matches rather than
    # whichever mentors happen to come late in the input. The sort is stable,
    # so mentors with equal scores keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [mentor_id for _, mentor_id in scored[:_MAX_GROUND_TRUTH]]


def generate_synthetic_data(mentors: List[Dict[str, Any]], num_mentees: int = 50) -> List[Dict[str, Any]]:
    """Build synthetic mentee records with ground-truth relevant mentors.

    Each mentee is modelled on a randomly chosen mentor: it requests a random
    subset of that mentor's ``skill_ids`` (1 to 3) and ``domain_ids`` (up to
    2), inherits the mentor's ``career_id``, and gets a Vietnamese goal
    sentence built from the matching skill/domain names (falling back to the
    stringified id when no name is known).

    Args:
        mentors: Mentor records. Reads the keys ``skill_ids``, ``domain_ids``,
            ``career_id``, ``mentor_id`` and, for display names, ``skills`` /
            ``domains`` (lists of ``{"id": ..., "name": ...}`` dicts).
        num_mentees: Number of mentee records to produce.

    Returns:
        A list of mentee dicts with keys ``mentee_id`` (1-based, contiguous),
        ``goals``, ``career_id``, ``domain_ids``, ``skill_ids`` and
        ``ground_truth_mentors``. The list is empty when no mentor has any
        ``skill_ids`` to model a mentee on.
    """
    skill_names_by_id = _collect_names(mentors, 'skills')
    domain_names_by_id = _collect_names(mentors, 'domains')

    print(f"Generating {num_mentees} synthetic mentees...")

    # Only mentors with skills can serve as a persona. Sampling from this
    # subset guarantees exactly num_mentees records with gap-free ids, and
    # avoids random.choice() failing on an empty list.
    persona_pool = [m for m in mentors if m.get('skill_ids')]
    if not persona_pool:
        return []

    synthetic_mentees = []
    for i in range(num_mentees):
        # Model the mentee on a real mentor so at least one match exists.
        target_mentor = random.choice(persona_pool)
        mentor_skills = target_mentor['skill_ids']
        mentor_domains = target_mentor.get('domain_ids') or []

        num_skills = random.randint(1, min(len(mentor_skills), _MAX_SKILLS_PER_MENTEE))
        selected_skills = random.sample(mentor_skills, num_skills)

        if mentor_domains:
            num_domains = random.randint(1, min(len(mentor_domains), _MAX_DOMAINS_PER_MENTEE))
            selected_domains = random.sample(mentor_domains, num_domains)
        else:
            selected_domains = []

        skills_str = ", ".join(skill_names_by_id.get(sid, str(sid)) for sid in selected_skills)
        domain_names = [domain_names_by_id.get(did, str(did)) for did in selected_domains]
        domains_str = ", ".join(domain_names) if domain_names else _DEFAULT_DOMAIN_LABEL
        goals = random.choice(_GOAL_TEMPLATES).format(skills=skills_str, domains=domains_str)

        synthetic_mentees.append({
            "mentee_id": i + 1,
            "goals": goals,
            "career_id": target_mentor.get('career_id'),
            "domain_ids": selected_domains,
            "skill_ids": selected_skills,
            "ground_truth_mentors": _rank_relevant_mentors(mentors, selected_skills, selected_domains),
        })

    return synthetic_mentees


def main():
    """Command-line entry point: read mentors, generate mentees, write JSON."""
    parser = argparse.ArgumentParser(description="Generate synthetic evaluation data")
    parser.add_argument("mentors_file", help="Path to input mentors JSON file")
    parser.add_argument("output_file", help="Path to output evaluation JSON file")
    parser.add_argument("--count", type=int, default=50, help="Number of mentees to generate")

    args = parser.parse_args()

    print(f"Loading mentors from {args.mentors_file}...")
    mentors = load_json(args.mentors_file)

    data = generate_synthetic_data(mentors, args.count)

    print(f"Writing {len(data)} mentees to {args.output_file}...")
    # ensure_ascii=False keeps the Vietnamese goal text human-readable.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print("Done.")


if __name__ == "__main__":
    main()