# Spaces: jakenb/mentorme858 (Sleeping) — File size: 5,436 Bytes

import json
import random
import os
import argparse
from typing import List, Dict, Any

def load_json(file_path: str) -> List[Dict[str, Any]]:
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content.

    The parsed value is returned exactly as ``json`` produced it; no schema
    validation is performed here.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Vietnamese goal sentences; every template uses {skills}, some also {domains}.
# Defined once at module level instead of being rebuilt for every mentee.
_GOAL_TEMPLATES = (
    "Tôi muốn học về {skills}.",
    "Em đang tìm mentor có kinh nghiệm về {skills} trong lĩnh vực {domains}.",
    "Mình cần người hướng dẫn về {skills} để phát triển sự nghiệp.",
    "Tôi muốn nâng cao kỹ năng {skills}.",
    "Em là sinh viên muốn tìm hiểu về {skills} và {domains}.",
    "Mình đang làm việc trong ngành {domains} và muốn học thêm về {skills}.",
    "Cần tìm mentor giỏi về {skills}.",
    "Mục tiêu của tôi là trở thành chuyên gia về {skills}.",
    "Em muốn chuyển ngành sang {domains} nên cần học về {skills}.",
)

# A mentor is relevant when it covers at least this share of requested skills.
_MIN_SKILL_MATCH = 0.5

# Upper bound on the number of mentor ids stored per mentee.
_MAX_GROUND_TRUTH = 20


def _collect_names(mentors: List[Dict[str, Any]], field: str) -> Dict[Any, Any]:
    """Build an id -> name map from the dict entries listed under ``field``."""
    names = {}
    for mentor in mentors:
        for entry in mentor.get(field) or []:
            # Entries without both keys are skipped; callers fall back to str(id).
            if isinstance(entry, dict) and 'id' in entry and 'name' in entry:
                names[entry['id']] = entry['name']
    return names


def _rank_relevant_mentors(
    mentors: List[Dict[str, Any]],
    selected_skills: List[Any],
    selected_domains: List[Any],
) -> List[Any]:
    """Return ids of relevant mentors, best skill coverage first."""
    wanted_skills = set(selected_skills)
    wanted_domains = set(selected_domains)
    scored = []
    for mentor in mentors:
        # Domain filter applies only when the mentee asked for domains.
        if wanted_domains and wanted_domains.isdisjoint(mentor.get('domain_ids') or []):
            continue
        covered = wanted_skills.intersection(mentor.get('skill_ids') or [])
        score = len(covered) / len(selected_skills)
        if score >= _MIN_SKILL_MATCH:
            scored.append((score, mentor['mentor_id']))
    # Stable sort: equal scores keep their input order, higher scores move up,
    # so truncation below really keeps the best matches.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [mentor_id for _, mentor_id in scored[:_MAX_GROUND_TRUTH]]


def generate_synthetic_data(mentors: List[Dict[str, Any]], num_mentees: int = 50) -> List[Dict[str, Any]]:
    """Generate synthetic mentee records with ground-truth mentor matches.

    Each mentee is modelled on a randomly chosen mentor that has at least one
    entry in ``skill_ids``: it requests 1-3 of that mentor's skills and, when
    the mentor has ``domain_ids``, 1-2 of its domains. A Vietnamese goal
    sentence is rendered from those selections.

    A mentor counts as relevant for a mentee when it shares at least one
    requested domain (only checked if domains were requested) and covers at
    least 50% of the requested skills. Relevant mentors are ordered by skill
    coverage (ties keep input order) and capped at 20 ids.

    Args:
        mentors: Mentor records. Reads ``skill_ids``, ``domain_ids``,
            ``career_id`` and ``mentor_id``, plus optional ``skills`` /
            ``domains`` lists of ``{"id", "name"}`` dicts used for display
            names.
        num_mentees: Number of mentee records to produce.

    Returns:
        A list of exactly ``num_mentees`` dicts with keys ``mentee_id``
        (1-based, contiguous), ``goals``, ``career_id``, ``domain_ids``,
        ``skill_ids`` and ``ground_truth_mentors``. The list is empty when no
        mentor has any skills or ``num_mentees`` is not positive.

    Raises:
        KeyError: If a relevant mentor record has no ``mentor_id``.
    """
    all_skills = _collect_names(mentors, 'skills')
    all_domains = _collect_names(mentors, 'domains')

    print(f"Generating {num_mentees} synthetic mentees...")

    # Only mentors with skills can seed a mentee. Filtering up front (rather
    # than skipping iterations) guarantees the requested count and gap-free ids.
    eligible_mentors = [m for m in mentors if m.get('skill_ids')]
    if not eligible_mentors:
        print("No mentors with skill_ids available; nothing to generate.")
        return []

    synthetic_mentees = []
    for mentee_id in range(1, num_mentees + 1):
        # Base the mentee on a real mentor so at least one match exists.
        target_mentor = random.choice(eligible_mentors)
        mentor_skills = target_mentor['skill_ids']
        mentor_domains = target_mentor.get('domain_ids') or []

        num_skills = random.randint(1, min(len(mentor_skills), 3))
        selected_skills = random.sample(mentor_skills, num_skills)

        if mentor_domains:
            num_domains = random.randint(1, min(len(mentor_domains), 2))
            selected_domains = random.sample(mentor_domains, num_domains)
        else:
            selected_domains = []

        # Unknown ids are shown as their string form.
        skill_names = [all_skills.get(sid, str(sid)) for sid in selected_skills]
        domain_names = [all_domains.get(did, str(did)) for did in selected_domains]

        template = random.choice(_GOAL_TEMPLATES)
        skills_str = ", ".join(skill_names)
        # Generic fallback ("technology") for templates that mention a domain.
        domains_str = ", ".join(domain_names) if domain_names else "công nghệ"
        goals = template.format(skills=skills_str, domains=domains_str)

        synthetic_mentees.append({
            "mentee_id": mentee_id,
            "goals": goals,
            "career_id": target_mentor.get('career_id'),
            "domain_ids": selected_domains,
            "skill_ids": selected_skills,
            "ground_truth_mentors": _rank_relevant_mentors(
                mentors, selected_skills, selected_domains
            ),
        })

    return synthetic_mentees

def main():
    """Command-line entry point.

    Parses the mentors input path, the output path and the optional
    ``--count`` option, generates the synthetic mentees and writes them to the
    output file as indented UTF-8 JSON.
    """
    cli = argparse.ArgumentParser(description="Generate synthetic evaluation data")
    cli.add_argument("mentors_file", help="Path to input mentors JSON file")
    cli.add_argument("output_file", help="Path to output evaluation JSON file")
    cli.add_argument("--count", type=int, default=50, help="Number of mentees to generate")
    options = cli.parse_args()

    source_path = options.mentors_file
    target_path = options.output_file

    print(f"Loading mentors from {source_path}...")
    mentor_records = load_json(source_path)

    mentee_records = generate_synthetic_data(mentor_records, options.count)

    print(f"Writing {len(mentee_records)} mentees to {target_path}...")
    with open(target_path, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps the Vietnamese goal text human-readable.
        json.dump(mentee_records, sink, ensure_ascii=False, indent=2)
    print("Done.")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()