Spaces:

jakenb
/

mentorme858

Sleeping

mentorme858 / scripts /generate_evaluation_data.py

Nguyễn Thanh Tùng

Improve prompt with semantic names

ae024d5 21 days ago

5.44 kB

	import json
	import random
	import os
	import argparse
	from typing import List, Dict, Any

	def load_json(file_path: str) -> List[Dict[str, Any]]:
	    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content.

	    The return annotation reflects how this script uses the result (a list of
	    mentor dicts); the function itself returns whatever the JSON document holds.
	    """
	    with open(file_path, mode='r', encoding='utf-8') as handle:
	        raw_text = handle.read()
	    return json.loads(raw_text)

	# Vietnamese goal sentences. Every template uses {skills}; some also use {domains}.
	# Kept at module level so the list is not rebuilt for every generated mentee.
	_GOAL_TEMPLATES = [
	    "Tôi muốn học về {skills}.",
	    "Em đang tìm mentor có kinh nghiệm về {skills} trong lĩnh vực {domains}.",
	    "Mình cần người hướng dẫn về {skills} để phát triển sự nghiệp.",
	    "Tôi muốn nâng cao kỹ năng {skills}.",
	    "Em là sinh viên muốn tìm hiểu về {skills} và {domains}.",
	    "Mình đang làm việc trong ngành {domains} và muốn học thêm về {skills}.",
	    "Cần tìm mentor giỏi về {skills}.",
	    "Mục tiêu của tôi là trở thành chuyên gia về {skills}.",
	    "Em muốn chuyển ngành sang {domains} nên cần học về {skills}.",
	]

	# Substituted for {domains} when no domain was selected for the mentee.
	_DEFAULT_DOMAIN_LABEL = "công nghệ"


	def _rank_relevant_mentors(mentor_index: List[Any], wanted_skills: List[Any],
	                           wanted_domains: List[Any], limit: int) -> List[Any]:
	    """Return up to ``limit`` relevant mentor IDs, best skill coverage first."""
	    skill_query = set(wanted_skills)
	    domain_query = set(wanted_domains)

	    scored = []
	    for mentor, mentor_skill_set, mentor_domain_set in mentor_index:
	        # The domain filter only applies when the query names at least one domain.
	        if domain_query and domain_query.isdisjoint(mentor_domain_set):
	            continue
	        coverage = len(skill_query & mentor_skill_set) / len(wanted_skills)
	        if coverage >= 0.5:
	            scored.append((coverage, mentor['mentor_id']))

	    # Rank before truncating so the cap drops the weakest matches, not the
	    # ones that merely appear late in the input. The sort is stable, so
	    # mentors with equal coverage keep their input order.
	    scored.sort(key=lambda pair: pair[0], reverse=True)
	    return [mentor_id for _, mentor_id in scored[:limit]]


	def generate_synthetic_data(mentors: List[Dict[str, Any]], num_mentees: int = 50,
	                            max_ground_truth: int = 20) -> List[Dict[str, Any]]:
	    """Generate synthetic mentee queries with ground-truth mentor matches.

	    Each mentee is modelled on a randomly chosen mentor: it requests 1-3 of
	    that mentor's ``skill_ids`` and, when the mentor has any, 1-2 of its
	    ``domain_ids``, and inherits its ``career_id``. A Vietnamese goal sentence
	    is built from the selected skill/domain names.

	    A mentor counts as relevant (ground truth) for a mentee when:
	      1. it shares at least one selected domain (only checked when the mentee
	         has selected domains), and
	      2. it covers at least 50% of the selected skills.

	    Args:
	        mentors: Mentor records. Reads ``mentor_id``, ``skill_ids``,
	            ``domain_ids``, ``career_id`` and, for display names, ``skills`` /
	            ``domains`` lists of ``{"id": ..., "name": ...}`` dicts.
	        num_mentees: Number of mentees to generate.
	        max_ground_truth: Maximum number of relevant mentor IDs kept per
	            mentee; the best-covering mentors are kept.

	    Returns:
	        A list of mentee dicts with keys ``mentee_id`` (1-based), ``goals``,
	        ``career_id``, ``domain_ids``, ``skill_ids`` and
	        ``ground_truth_mentors``. The list is empty when no mentor has any
	        ``skill_ids``.
	    """
	    # ID -> display name lookups; IDs without a known name fall back to str(id).
	    all_skills = {}
	    all_domains = {}
	    for m in mentors:
	        for s in m.get('skills', []):
	            if isinstance(s, dict):
	                all_skills[s['id']] = s['name']
	        for d in m.get('domains', []):
	            if isinstance(d, dict):
	                all_domains[d['id']] = d['name']

	    print(f"Generating {num_mentees} synthetic mentees...")

	    # Only mentors with skills can seed a query. Sampling from this subset
	    # (instead of skipping an unlucky pick) yields exactly num_mentees records
	    # with contiguous IDs, and avoids random.choice() failing on an empty list.
	    eligible = [m for m in mentors if m.get('skill_ids')]
	    if not eligible:
	        if num_mentees > 0:
	            print("No mentors with skill_ids available; no mentees generated.")
	        return []

	    # Build each mentor's skill/domain sets once rather than once per mentee.
	    mentor_index = [
	        (m, set(m.get('skill_ids') or ()), set(m.get('domain_ids') or ()))
	        for m in mentors
	    ]

	    synthetic_mentees = []
	    for mentee_id in range(1, num_mentees + 1):
	        # Base the persona on a real mentor so at least one match exists.
	        target_mentor = random.choice(eligible)
	        mentor_skills = target_mentor['skill_ids']
	        mentor_domains = target_mentor.get('domain_ids') or []

	        # Randomly pick a subset of skills/domains to form the query.
	        num_skills = random.randint(1, min(len(mentor_skills), 3))
	        selected_skills = random.sample(mentor_skills, num_skills)

	        if mentor_domains:
	            num_domains = random.randint(1, min(len(mentor_domains), 2))
	            selected_domains = random.sample(mentor_domains, num_domains)
	        else:
	            selected_domains = []

	        # Build the goal sentence from human-readable names.
	        skill_names = [all_skills.get(sid, str(sid)) for sid in selected_skills]
	        domain_names = [all_domains.get(did, str(did)) for did in selected_domains]
	        template = random.choice(_GOAL_TEMPLATES)
	        goals = template.format(
	            skills=", ".join(skill_names),
	            domains=", ".join(domain_names) if domain_names else _DEFAULT_DOMAIN_LABEL,
	        )

	        synthetic_mentees.append({
	            "mentee_id": mentee_id,
	            "goals": goals,
	            "career_id": target_mentor.get('career_id'),
	            "domain_ids": selected_domains,
	            "skill_ids": selected_skills,
	            "ground_truth_mentors": _rank_relevant_mentors(
	                mentor_index, selected_skills, selected_domains, max_ground_truth
	            ),
	        })

	    return synthetic_mentees

	def main():
	    """Command-line entry point.

	    Reads the mentors JSON file named on the command line, generates
	    ``--count`` synthetic mentees (default 50) and writes them to the output
	    file as indented, non-ASCII-escaped JSON.
	    """
	    cli = argparse.ArgumentParser(description="Generate synthetic evaluation data")
	    cli.add_argument("mentors_file", help="Path to input mentors JSON file")
	    cli.add_argument("output_file", help="Path to output evaluation JSON file")
	    cli.add_argument("--count", type=int, default=50, help="Number of mentees to generate")
	    options = cli.parse_args()

	    source_path = options.mentors_file
	    target_path = options.output_file

	    print(f"Loading mentors from {source_path}...")
	    mentees = generate_synthetic_data(load_json(source_path), options.count)

	    print(f"Writing {len(mentees)} mentees to {target_path}...")
	    with open(target_path, 'w', encoding='utf-8') as sink:
	        # ensure_ascii=False keeps the Vietnamese goal text readable in the file.
	        json.dump(mentees, sink, ensure_ascii=False, indent=2)
	    print("Done.")

	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()