# mentorme858/scripts/generate_evaluation_data.py
# Nguyễn Thanh Tùng
# Improve prompt with semantic names
# ae024d5
import argparse
import json
import os
import random
from typing import Any, Dict, List, Optional
def load_json(file_path: str) -> List[Dict[str, Any]]:
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to (annotated as a list of
        dictionaries, which is what the rest of this script passes around).
    """
    with open(file_path, encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)
# Vietnamese goal sentences; ``{skills}`` / ``{domains}`` are filled per mentee.
_GOAL_TEMPLATES = (
    "Tôi muốn học về {skills}.",
    "Em đang tìm mentor có kinh nghiệm về {skills} trong lĩnh vực {domains}.",
    "Mình cần người hướng dẫn về {skills} để phát triển sự nghiệp.",
    "Tôi muốn nâng cao kỹ năng {skills}.",
    "Em là sinh viên muốn tìm hiểu về {skills} và {domains}.",
    "Mình đang làm việc trong ngành {domains} và muốn học thêm về {skills}.",
    "Cần tìm mentor giỏi về {skills}.",
    "Mục tiêu của tôi là trở thành chuyên gia về {skills}.",
    "Em muốn chuyển ngành sang {domains} nên cần học về {skills}.",
)


def _collect_names(mentors: List[Dict[str, Any]], field: str) -> Dict[Any, Any]:
    """Build an id -> name lookup from the dict entries stored under ``field``."""
    names: Dict[Any, Any] = {}
    for mentor in mentors:
        for entry in mentor.get(field) or []:
            # Entries that are not dicts, or lack id/name, carry no usable label;
            # callers fall back to the stringified id for those.
            if isinstance(entry, dict) and 'id' in entry and 'name' in entry:
                names[entry['id']] = entry['name']
    return names


def _unique(values: List[Any]) -> List[Any]:
    """Return ``values`` without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(values))


def generate_synthetic_data(
    mentors: List[Dict[str, Any]],
    num_mentees: int = 50,
    seed: Optional[int] = None,
    max_ground_truth: int = 20,
) -> List[Dict[str, Any]]:
    """Generate synthetic mentee queries together with their relevant mentors.

    Each mentee is modelled on a randomly chosen mentor that has at least one
    skill id, so every query is guaranteed to have at least one matching
    mentor. A mentor counts as relevant for a mentee when

    1. it shares at least one domain id with the mentee (only checked when the
       mentee has domain ids), and
    2. it covers at least 50% of the mentee's requested skill ids.

    Relevant mentors are ranked by skill coverage (highest first, ties keep
    the input order) before the list is truncated, so the truncated ground
    truth holds the best matches rather than whichever came first in the file.

    Args:
        mentors: Mentor records. The keys read are ``mentor_id``,
            ``skill_ids``, ``domain_ids``, ``career_id`` and, for display
            names, ``skills`` / ``domains`` (lists of ``{"id", "name"}``
            dicts).
        num_mentees: Number of mentees to generate.
        seed: When given, a private random generator seeded with this value is
            used so the output is reproducible. When ``None`` the module-level
            ``random`` functions are used, as before.
        max_ground_truth: Maximum number of mentor ids kept per mentee.

    Returns:
        A list of mentee dicts with the keys ``mentee_id`` (1-based,
        contiguous), ``goals``, ``career_id``, ``domain_ids``, ``skill_ids``
        and ``ground_truth_mentors``. The list is empty when no mentor has any
        skill ids.

    Raises:
        KeyError: If a mentor judged relevant has no ``mentor_id`` key.
    """
    # Unseeded calls keep drawing from the global generator, so callers that
    # call random.seed() themselves see no change in behavior.
    rng = random if seed is None else random.Random(seed)

    all_skills = _collect_names(mentors, 'skills')
    all_domains = _collect_names(mentors, 'domains')

    print(f"Generating {num_mentees} synthetic mentees...")

    # Only mentors with skills can seed a query; drawing from this pool (instead
    # of skipping an iteration) guarantees exactly num_mentees results.
    candidates = [m for m in mentors if m.get('skill_ids')]
    if not candidates:
        print("No mentors with skill_ids found; nothing to generate.")
        return []

    # Per-mentor id sets do not change between mentees, so build them once.
    mentor_index = [
        (m, set(m.get('skill_ids') or []), set(m.get('domain_ids') or []))
        for m in mentors
    ]
    limit = max(0, max_ground_truth)

    synthetic_mentees = []
    for mentee_id in range(1, num_mentees + 1):
        target_mentor = rng.choice(candidates)

        # De-duplicate so a repeated id cannot be sampled twice and distort
        # the coverage ratio below.
        skill_pool = _unique(target_mentor['skill_ids'])
        domain_pool = _unique(target_mentor.get('domain_ids') or [])

        selected_skills = rng.sample(
            skill_pool, rng.randint(1, min(len(skill_pool), 3))
        )
        if domain_pool:
            selected_domains = rng.sample(
                domain_pool, rng.randint(1, min(len(domain_pool), 2))
            )
        else:
            selected_domains = []

        # Build the goal sentence; ids without a known name appear as text.
        skill_names = [all_skills.get(sid, str(sid)) for sid in selected_skills]
        domain_names = [all_domains.get(did, str(did)) for did in selected_domains]
        skills_str = ", ".join(skill_names)
        domains_str = ", ".join(domain_names) if domain_names else "công nghệ"
        goals = rng.choice(_GOAL_TEMPLATES).format(
            skills=skills_str, domains=domains_str
        )

        # Score every mentor against the query to derive the ground truth.
        wanted_skills = set(selected_skills)
        wanted_domains = set(selected_domains)
        scored = []
        for mentor, m_skills, m_domains in mentor_index:
            if wanted_domains and not (m_domains & wanted_domains):
                continue  # domains requested but none shared
            coverage = len(m_skills & wanted_skills) / len(wanted_skills)
            if coverage >= 0.5:
                scored.append((coverage, mentor['mentor_id']))

        # list.sort is stable, so equally good mentors keep their input order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        ground_truth = [mentor_id_ for _, mentor_id_ in scored[:limit]]

        synthetic_mentees.append({
            "mentee_id": mentee_id,
            "goals": goals,
            "career_id": target_mentor.get('career_id'),
            "domain_ids": selected_domains,
            "skill_ids": selected_skills,
            "ground_truth_mentors": ground_truth,
        })

    return synthetic_mentees
def main():
    """Command-line entry point.

    Reads the mentors JSON file named on the command line, generates
    ``--count`` synthetic mentees from it and writes them to the output file
    as indented, non-ASCII-escaped JSON.
    """
    cli = argparse.ArgumentParser(description="Generate synthetic evaluation data")
    cli.add_argument("mentors_file", help="Path to input mentors JSON file")
    cli.add_argument("output_file", help="Path to output evaluation JSON file")
    cli.add_argument("--count", type=int, default=50, help="Number of mentees to generate")
    options = cli.parse_args()

    source_path = options.mentors_file
    target_path = options.output_file

    print(f"Loading mentors from {source_path}...")
    mentor_records = load_json(source_path)

    mentee_records = generate_synthetic_data(mentor_records, options.count)

    print(f"Writing {len(mentee_records)} mentees to {target_path}...")
    with open(target_path, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps the Vietnamese goal text readable in the file.
        json.dump(mentee_records, sink, ensure_ascii=False, indent=2)

    print("Done.")


if __name__ == "__main__":
    main()