|
|
""" |
|
|
Synthetic Resume Section Data Generator |
|
|
|
|
|
Generates realistic resume section text across 8 categories for training |
|
|
a text classifier. Uses template-based generation with randomized entities, |
|
|
synonym replacement, and structural variation to produce diverse examples. |
|
|
|
|
|
Author: Lorenzo Scaturchio (gr8monk3ys) |
|
|
""" |
|
|
|
|
|
import csv |
|
|
import random |
|
|
import itertools |
|
|
from pathlib import Path |
|
|
from typing import Optional |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Given names; paired with LAST_NAMES to build contact details and the
# GitHub usernames shown in project entries.
FIRST_NAMES = [
    "James", "Mary", "Robert", "Patricia", "John", "Jennifer",
    "Michael", "Linda", "David", "Elizabeth", "William", "Barbara",
    "Richard", "Susan", "Joseph", "Jessica", "Thomas", "Sarah",
    "Charles", "Karen", "Daniel", "Lisa", "Matthew", "Nancy",
    "Anthony", "Betty", "Mark", "Sandra", "Aisha", "Wei",
    "Carlos", "Priya", "Olga", "Hiroshi", "Fatima", "Liam",
    "Sofia", "Andrei", "Mei", "Alejandro", "Yuki", "Omar",
    "Elena", "Raj",
]

# Family names; see FIRST_NAMES.
LAST_NAMES = [
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia",
    "Miller", "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez",
    "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore",
    "Jackson", "Martin", "Lee", "Perez", "Thompson", "White",
    "Harris", "Sanchez", "Clark", "Patel", "Chen", "Kim",
    "Nakamura", "Ivanov", "Silva", "Okafor",
]

# Employer names used in experience headers, summaries and award issuers.
COMPANIES = [
    "Google", "Microsoft", "Amazon", "Apple", "Meta",
    "Netflix", "Stripe", "Airbnb", "Uber", "Salesforce",
    "Adobe", "IBM", "Oracle", "Intel", "Tesla",
    "SpaceX", "Palantir", "Snowflake", "Databricks", "Confluent",
    "JPMorgan Chase", "Goldman Sachs", "Morgan Stanley", "Deloitte",
    "McKinsey & Company", "Boston Consulting Group", "Accenture",
    "Lockheed Martin", "Boeing", "Raytheon", "General Electric",
    "Procter & Gamble", "Johnson & Johnson", "Pfizer", "Moderna",
    "Shopify", "Square", "Twilio", "Cloudflare", "HashiCorp",
    "DataRobot", "Hugging Face", "OpenAI", "Anthropic", "Cohere",
    "Startup XYZ", "TechCorp Inc.", "InnovateTech", "DataDriven LLC",
]
|
|
|
|
|
# Institutions named in education entries and as award issuers.
UNIVERSITIES = [
    "Massachusetts Institute of Technology",
    "Stanford University",
    "Harvard University",
    "University of California, Berkeley",
    "Carnegie Mellon University",
    "Georgia Institute of Technology",
    "University of Michigan",
    "University of Illinois Urbana-Champaign",
    "California Institute of Technology",
    "Princeton University",
    "Columbia University",
    "University of Washington",
    "University of Texas at Austin",
    "Cornell University",
    "University of Pennsylvania",
    "University of Southern California",
    "New York University",
    "University of Wisconsin-Madison",
    "Duke University",
    "Northwestern University",
    "University of California, Los Angeles",
    "Rice University",
    "University of Maryland",
    "Purdue University",
    "Ohio State University",
    "Arizona State University",
    "University of Virginia",
    "University of Florida",
    "Boston University",
    "Northeastern University",
]

# (full degree name, abbreviation) pairs.
DEGREES = [
    ("Bachelor of Science", "B.S."),
    ("Bachelor of Arts", "B.A."),
    ("Master of Science", "M.S."),
    ("Master of Arts", "M.A."),
    ("Master of Business Administration", "MBA"),
    ("Doctor of Philosophy", "Ph.D."),
    ("Associate of Science", "A.S."),
    ("Bachelor of Engineering", "B.Eng."),
    ("Master of Engineering", "M.Eng."),
]

# Fields of study; also reused as coursework and summary specialties.
MAJORS = [
    "Computer Science", "Software Engineering", "Data Science", "Electrical Engineering",
    "Mechanical Engineering", "Information Technology", "Mathematics", "Statistics",
    "Business Administration", "Economics", "Finance", "Biomedical Engineering",
    "Chemical Engineering", "Civil Engineering", "Physics", "Biology",
    "Artificial Intelligence", "Machine Learning", "Human-Computer Interaction",
    "Cybersecurity", "Information Systems", "Operations Research",
]

# Minor subjects for the optional "Minor in ..." line.
MINORS = [
    "Mathematics", "Statistics", "Psychology", "Business", "Economics", "Philosophy",
    "Linguistics", "Physics", "Data Science", "Communication", "Sociology", "History",
]

# GPA figures kept as strings so they are emitted exactly as written.
GPA_VALUES = [
    "3.5", "3.6", "3.7", "3.8", "3.9", "4.0", "3.52", "3.65", "3.78", "3.85", "3.92", "3.45",
]

# Graduation years 2015 through 2026 inclusive.
GRAD_YEARS = [*range(2015, 2027)]
|
|
|
|
|
# Role names for experience headers and professional summaries.
JOB_TITLES = [
    "Software Engineer", "Senior Software Engineer", "Staff Engineer", "Principal Engineer",
    "Engineering Manager", "Tech Lead", "Data Scientist", "Senior Data Scientist",
    "Machine Learning Engineer", "ML Research Scientist", "Data Engineer", "Data Analyst",
    "Product Manager", "Senior Product Manager", "Program Manager", "DevOps Engineer",
    "Site Reliability Engineer", "Cloud Architect", "Full Stack Developer", "Frontend Engineer",
    "Backend Engineer", "Mobile Developer", "iOS Engineer", "Android Developer",
    "QA Engineer", "Security Engineer", "Solutions Architect", "Research Scientist",
    "AI Engineer", "NLP Engineer", "Quantitative Analyst", "Financial Analyst",
    "Business Analyst", "UX Designer", "UI Engineer", "Technical Writer",
    "Intern", "Software Engineering Intern", "Data Science Intern",
]

# Programming languages listed in skills, projects, experience and summaries.
PROGRAMMING_LANGUAGES = [
    "Python", "Java", "JavaScript", "TypeScript", "C++",
    "C", "C#", "Go", "Rust", "Kotlin",
    "Swift", "Ruby", "PHP", "Scala", "R",
    "MATLAB", "Julia", "Haskell", "Elixir", "Dart",
]

# Frameworks, libraries and platforms.
FRAMEWORKS = [
    "React", "Angular", "Vue.js", "Next.js", "Django", "Flask", "FastAPI",
    "Spring Boot", "Express.js", "Node.js", "Rails", "TensorFlow", "PyTorch", "Keras",
    "scikit-learn", "Pandas", "NumPy", "Spark", "Hadoop", "Kubernetes", "Docker",
    "AWS", "GCP", "Azure", "Terraform", "Ansible", ".NET", "Laravel",
    "Svelte", "Remix", "Astro",
]

# Developer tooling, data stores and design/ops products.
TOOLS = [
    "Git", "GitHub", "GitLab", "Jira", "Confluence", "Slack", "VS Code",
    "IntelliJ", "PyCharm", "Vim", "Emacs", "PostgreSQL", "MySQL", "MongoDB",
    "Redis", "Elasticsearch", "Kafka", "RabbitMQ", "Airflow", "dbt", "Snowflake",
    "Tableau", "Power BI", "Grafana", "Prometheus", "Datadog", "Jenkins", "CircleCI",
    "GitHub Actions", "ArgoCD", "Figma", "Sketch", "Adobe XD", "Postman", "Swagger",
]

# Non-technical skills for the optional soft-skills line.
SOFT_SKILLS = [
    "Leadership", "Communication", "Team Collaboration", "Problem Solving",
    "Critical Thinking", "Time Management", "Project Management", "Agile Methodologies",
    "Scrum", "Cross-functional Collaboration", "Mentoring", "Strategic Planning",
    "Stakeholder Management", "Technical Writing", "Public Speaking", "Negotiation",
]
|
|
|
|
|
# Certification titles sampled for the certifications section.
CERTIFICATIONS_LIST = [
    # Cloud vendors
    "AWS Certified Solutions Architect - Associate",
    "AWS Certified Developer - Associate",
    "AWS Certified Machine Learning - Specialty",
    "Google Cloud Professional Data Engineer",
    "Google Cloud Professional ML Engineer",
    "Microsoft Azure Fundamentals (AZ-900)",
    "Microsoft Azure Data Scientist Associate (DP-100)",
    # Kubernetes
    "Certified Kubernetes Administrator (CKA)",
    "Certified Kubernetes Application Developer (CKAD)",
    # Security
    "Certified Information Systems Security Professional (CISSP)",
    "CompTIA Security+",
    # Project delivery
    "Project Management Professional (PMP)",
    "Certified ScrumMaster (CSM)",
    # Data / ML / infrastructure products
    "TensorFlow Developer Certificate",
    "Databricks Certified Data Engineer Associate",
    "Snowflake SnowPro Core Certification",
    "HashiCorp Terraform Associate",
    "Cisco Certified Network Associate (CCNA)",
    "Oracle Certified Professional, Java SE",
    "Red Hat Certified System Administrator (RHCSA)",
    # Online courses
    "Deep Learning Specialization (Coursera)",
    "Machine Learning by Stanford (Coursera)",
    "Professional Scrum Master I (PSM I)",
]

# Award and honor titles sampled for the awards section.
AWARDS_LIST = [
    "Dean's List",
    "Summa Cum Laude",
    "Magna Cum Laude",
    "Cum Laude",
    "Phi Beta Kappa",
    "Tau Beta Pi",
    "National Merit Scholar",
    "Employee of the Quarter",
    "Spot Bonus Award",
    "President's Club",
    "Best Paper Award",
    "Innovation Award",
    "Hackathon Winner",
    "Outstanding Graduate Student Award",
    "Research Fellowship",
    "Teaching Assistant Excellence Award",
    "Community Service Award",
    "IEEE Best Student Paper",
    "ACM ICPC Regional Finalist",
    "Google Code Jam Qualifier",
    "Facebook Hacker Cup Participant",
    "Patent Holder",
    "Top Performer Award",
    "Rising Star Award",
]
|
|
|
|
|
# "City, ST" locations used in experience headers and contact blocks.
CITIES = [
    "San Francisco, CA", "New York, NY",
    "Seattle, WA", "Austin, TX",
    "Boston, MA", "Chicago, IL",
    "Los Angeles, CA", "Denver, CO",
    "Portland, OR", "Atlanta, GA",
    "Washington, DC", "San Jose, CA",
    "Raleigh, NC", "Pittsburgh, PA",
    "Minneapolis, MN", "Dallas, TX",
    "Miami, FL", "Phoenix, AZ",
    "San Diego, CA", "Philadelphia, PA",
]

# Full month names in calendar order.
MONTHS = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]

# Three-letter month abbreviations, in the same order as MONTHS.
MONTHS_SHORT = [
    "Jan", "Feb", "Mar", "Apr",
    "May", "Jun", "Jul", "Aug",
    "Sep", "Oct", "Nov", "Dec",
]
|
|
|
|
|
# First half of a generated project name ("<adjective> <noun>").
PROJECT_ADJECTIVES = [
    "Real-time", "Scalable", "Distributed", "Cloud-native", "AI-powered",
    "Automated", "Interactive", "Cross-platform", "Open-source", "End-to-end",
    "High-performance", "Serverless", "Event-driven", "Microservice-based", "Full-stack",
]

# Second half of a generated project name.
PROJECT_NOUNS = [
    "Dashboard", "Platform", "Pipeline", "Application",
    "System", "API", "Framework", "Tool",
    "Service", "Engine", "Chatbot", "Recommendation System",
    "Search Engine", "Analytics Platform", "Monitoring System", "Marketplace",
]

# str.format templates describing a measurable outcome. The placeholders
# that appear are pct, amount, users, events, f1_old and f1_new; _impact()
# supplies all of them, and each template uses only the ones it names.
IMPACT_METRICS = [
    "reduced latency by {pct}%",
    "improved throughput by {pct}%",
    "increased user engagement by {pct}%",
    "decreased error rate by {pct}%",
    "saved ${amount}K annually",
    "reduced costs by {pct}%",
    "improved accuracy by {pct}%",
    "increased conversion rate by {pct}%",
    "served {users} daily active users",
    "processed {events} events per second",
    "reduced deployment time from hours to minutes",
    "cut onboarding time by {pct}%",
    "automated {pct}% of manual processes",
    "improved model F1 score from 0.{f1_old} to 0.{f1_new}",
]
|
|
|
|
|
# Three-digit area codes for generated phone numbers.
PHONE_AREA_CODES = [
    "415", "650", "408", "510",
    "212", "646", "718", "206",
    "512", "617", "312", "213",
    "303", "503", "404", "202",
]

# URL prefixes placed in front of a generated LinkedIn handle.
LINKEDIN_PREFIXES = ["linkedin.com/in/", "www.linkedin.com/in/"]

# URL prefixes placed in front of a generated GitHub username.
GITHUB_PREFIXES = ["github.com/", "www.github.com/"]

# Mail domains for generated e-mail addresses.
DOMAINS = [
    "gmail.com",
    "outlook.com",
    "yahoo.com",
    "protonmail.com",
    "icloud.com",
    "hotmail.com",
    "mail.com",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
SYNONYMS = { |
|
|
"developed": ["built", "created", "engineered", "designed", "implemented", "constructed", "authored"], |
|
|
"managed": ["led", "oversaw", "directed", "supervised", "coordinated", "administered"], |
|
|
"improved": ["enhanced", "optimized", "upgraded", "refined", "boosted", "strengthened"], |
|
|
"implemented": ["deployed", "executed", "delivered", "rolled out", "launched", "shipped"], |
|
|
"analyzed": ["examined", "evaluated", "assessed", "investigated", "studied", "reviewed"], |
|
|
"collaborated": ["partnered", "worked closely with", "teamed up with", "cooperated with"], |
|
|
"responsible for": ["in charge of", "accountable for", "tasked with", "owned"], |
|
|
"utilized": ["leveraged", "employed", "used", "applied", "harnessed"], |
|
|
"achieved": ["accomplished", "attained", "reached", "secured", "delivered"], |
|
|
"experience": ["expertise", "background", "proficiency", "track record"], |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _pick(pool, k=1):
    """Draw up to ``k`` items from ``pool`` at random, without reuse.

    The request is capped at the pool size, so asking for more items than
    exist returns every item (in random order) instead of raising.
    """
    available = len(pool)
    count = k if k < available else available
    return random.sample(pool, count)
|
|
|
|
|
|
|
|
def _pick_one(pool):
    """Return one randomly chosen element of ``pool``."""
    chosen = random.choice(pool)
    return chosen
|
|
|
|
|
|
|
|
def _date_range(allow_present: bool = True) -> str:
    """Return a random, chronologically ordered date range string.

    The start falls in 2014-2024. The end is either an open-ended marker
    ("Present", "Current" or "Now"; 30% of the time when ``allow_present``
    is true) or a date up to six years after the start, capped at 2026.
    Both ends share one format: abbreviated month + year, full month +
    year, or year only. The separator is chosen at random.

    Args:
        allow_present: Whether the range may end with an open-ended marker.

    Returns:
        A string such as ``"Mar 2019 - Aug 2021"`` or ``"2020 to Present"``.
    """
    start_year = random.randint(2014, 2024)
    # Months are tracked as indices so both ends can be compared and then
    # rendered from MONTHS or MONTHS_SHORT, which share the same ordering.
    last_month = len(MONTHS_SHORT) - 1
    start_month = random.randint(0, last_month)
    fmt = random.choice(["short", "long", "year_only"])

    def _render(month_index, year):
        """Format one end of the range according to ``fmt``."""
        if fmt == "short":
            return f"{MONTHS_SHORT[month_index]} {year}"
        if fmt == "long":
            return f"{MONTHS[month_index]} {year}"
        return str(year)

    if allow_present and random.random() < 0.3:
        end_str = random.choice(["Present", "Current", "Now"])
    else:
        end_year = random.randint(start_year, min(start_year + 6, 2026))
        # Within a single calendar year the end month must not precede the
        # start month, otherwise the range would run backwards.
        earliest_end = start_month if end_year == start_year else 0
        end_month = random.randint(earliest_end, last_month)
        end_str = _render(end_month, end_year)

    start_str = _render(start_month, start_year)

    sep = random.choice([" - ", " – ", " to ", "–", "-"])
    return f"{start_str}{sep}{end_str}"
|
|
|
|
|
|
|
|
def _impact():
    """Fill a randomly chosen IMPACT_METRICS template with random figures.

    Every placeholder value is drawn on each call, whether or not the
    chosen template uses it.
    """
    chosen = _pick_one(IMPACT_METRICS)
    # Built in this order so the random draws happen in a fixed sequence.
    fillers = {
        "pct": random.randint(10, 85),
        "amount": random.randint(50, 500),
        "users": random.choice(["10K", "50K", "100K", "500K", "1M", "5M"]),
        "events": random.choice(["1K", "10K", "50K", "100K", "1M"]),
        "f1_old": random.randint(65, 80),
        "f1_new": random.randint(82, 97),
    }
    return chosen.format(**fillers)
|
|
|
|
|
|
|
|
def _synonym_replace(text: str) -> str:
    """Randomly swap words or phrases for synonyms (text augmentation).

    The text is split on whitespace and scanned left to right. At each
    position the longest run of tokens that matches a SYNONYMS key
    (ignoring case and trailing ``.,;:``) is replaced with probability
    0.3. Multi-word keys such as "responsible for" are therefore matched
    as phrases. A leading capital and any trailing punctuation are
    carried over to the replacement.

    Args:
        text: Input text.

    Returns:
        The augmented text. Tokens are re-joined with single spaces, so
        runs of whitespace (including newlines) are collapsed.
    """
    words = text.split()
    # Longest key, measured in tokens; bounds the look-ahead window.
    longest = max((len(key.split()) for key in SYNONYMS), default=1)

    result = []
    pos = 0
    while pos < len(words):
        consumed = 0
        # Try the widest window first so a phrase wins over its first word.
        for span in range(min(longest, len(words) - pos), 0, -1):
            chunk = " ".join(words[pos:pos + span])
            core = chunk.rstrip(".,;:")
            key = core.lower()
            if key in SYNONYMS and random.random() < 0.3:
                replacement = _pick_one(SYNONYMS[key])
                if chunk[0].isupper():
                    replacement = replacement.capitalize()
                # Re-attach whatever punctuation was stripped for the lookup.
                result.append(replacement + chunk[len(core):])
                consumed = span
                break
        if not consumed:
            result.append(words[pos])
            consumed = 1
        pos += consumed
    return " ".join(result)
|
|
|
|
|
|
|
|
def _bullet():
    """Pick the marker that prefixes list items; may be the empty string."""
    markers = ["•", "-", "●", "*", "▪", ""]
    return random.choice(markers)
|
|
|
|
|
|
|
|
def _reorder_bullets(bullets: list) -> list:
    """Return a randomly permuted copy; the caller's list is left untouched."""
    permuted = bullets[:]
    random.shuffle(permuted)
    return permuted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_education() -> str:
    """Build the text of one synthetic education section.

    One of three layouts is picked uniformly at random: a single degree
    with optional details, a compact list of two or three degrees, or a
    single degree followed by campus activities.
    """

    def _detailed_degree():
        """One degree plus optional GPA, minor, coursework, honor and thesis."""
        school = _pick_one(UNIVERSITIES)
        degree_name, degree_abbrev = _pick_one(DEGREES)
        field = _pick_one(MAJORS)
        grad_year = _pick_one(GRAD_YEARS)

        layout = random.choice(["full", "short", "inline"])
        if layout == "short":
            out = [f"{degree_abbrev} {field}, {school} ({grad_year})"]
        elif layout == "inline":
            out = [f"{school} — {degree_name} in {field}, {grad_year}"]
        else:
            # "full": three separate header lines.
            out = [
                f"{degree_name} in {field}",
                f"{school}",
                f"Graduated: {_pick_one(MONTHS)} {grad_year}",
            ]

        # GPA (60% of entries).
        if random.random() < 0.6:
            out.append(f"GPA: {_pick_one(GPA_VALUES)}/4.0")

        # Minor (30%).
        if random.random() < 0.3:
            out.append(f"Minor in {_pick_one(MINORS)}")

        # Coursework line (50%): three to six subjects.
        if random.random() < 0.5:
            course_pool = MAJORS + [
                "Algorithms",
                "Data Structures",
                "Operating Systems",
                "Database Systems",
                "Computer Networks",
                "Linear Algebra",
                "Probability and Statistics",
                "Deep Learning",
                "Natural Language Processing",
                "Computer Vision",
                "Distributed Systems",
            ]
            chosen = _pick(course_pool, k=random.randint(3, 6))
            heading = random.choice(["Relevant Coursework:", "Key Courses:", "Coursework:"])
            out.append(f"{heading} {', '.join(chosen)}")

        # Academic honor (30%).
        if random.random() < 0.3:
            out.append(random.choice([
                "Summa Cum Laude",
                "Magna Cum Laude",
                "Cum Laude",
                "Dean's List (all semesters)",
                "Honors Program",
                "University Scholar",
            ]))

        # Ph.D. entries always carry a dissertation; M.S. entries get a
        # thesis 40% of the time.
        doctoral = "Ph.D." in degree_abbrev
        if doctoral or ("M.S." in degree_abbrev and random.random() < 0.4):
            topic = random.choice([
                "Transformer-based approaches to document classification",
                "Scalable distributed systems for real-time data processing",
                "Graph neural networks for molecular property prediction",
                "Federated learning in healthcare applications",
                "Efficient attention mechanisms for long-sequence modeling",
                "Reinforcement learning for autonomous navigation",
            ])
            kind = "Dissertation" if doctoral else "Thesis"
            out.append(f'{kind}: "{topic}"')

        return "\n".join(out)

    def _degree_list():
        """Two or three one-line degree entries."""
        rows = []
        for _ in range(random.randint(2, 3)):
            school = _pick_one(UNIVERSITIES)
            _degree_name, degree_abbrev = _pick_one(DEGREES)
            field = _pick_one(MAJORS)
            grad_year = _pick_one(GRAD_YEARS)
            gpa_suffix = ""
            if random.random() < 0.5:
                gpa_suffix = f" | GPA: {_pick_one(GPA_VALUES)}"
            rows.append(f"{degree_abbrev} in {field}, {school}, {grad_year}{gpa_suffix}")
        return "\n".join(rows)

    def _degree_with_activities():
        """One degree followed by one to three campus activities."""
        school = _pick_one(UNIVERSITIES)
        degree_name, _degree_abbrev = _pick_one(DEGREES)
        field = _pick_one(MAJORS)
        grad_year = _pick_one(GRAD_YEARS)
        out = [
            f"{school}",
            f"{degree_name} in {field} | {_pick_one(MONTHS)} {grad_year}",
        ]

        activity_pool = [
            "Teaching Assistant for Introduction to Computer Science",
            "President, Computer Science Student Association",
            "Member, ACM Student Chapter",
            "Undergraduate Research Assistant, ML Lab",
            "Peer Tutor, Mathematics Department",
            "Captain, University Programming Competition Team",
            "Volunteer, Engineering Outreach Program",
            "Member, Honors College",
            "Study Abroad Program, Technical University of Munich",
            "Resident Advisor, Engineering Living-Learning Community",
        ]
        activities = random.sample(activity_pool, k=random.randint(1, 3))

        marker = _bullet()
        out.extend(f"{marker} {item}" if marker else item for item in activities)
        return "\n".join(out)

    builder = random.choice([_detailed_degree, _degree_list, _degree_with_activities])
    return builder()
|
|
|
|
|
|
|
|
def generate_experience() -> str:
    """Build the text of one synthetic work-experience section.

    The section holds one role (weight 0.7) or two roles (weight 0.3),
    separated by a blank line. Each role is a header followed by two to
    five achievement bullets.
    """

    def _single_role():
        """Render one role: header line(s) plus bullets."""
        title = _pick_one(JOB_TITLES)
        company = _pick_one(COMPANIES)
        city = _pick_one(CITIES)
        date_range = _date_range()

        header = random.choice([
            f"{title} | {company} | {city} | {date_range}",
            f"{title}, {company}\n{city} | {date_range}",
            f"{company} — {title}\n{date_range} | {city}",
            f"{title}\n{company}, {city}\n{date_range}",
        ])
        lines = [header]

        # All eleven candidate bullets are rendered, in this order, before
        # any are selected.
        candidates = []

        artifact = random.choice(['microservices', 'APIs', 'web applications', 'data pipelines', 'ML models', 'backend systems', 'frontend components'])
        languages = ', '.join(_pick(PROGRAMMING_LANGUAGES, k=random.randint(1, 3)))
        frameworks = ', '.join(_pick(FRAMEWORKS, k=random.randint(1, 2)))
        candidates.append(f"Developed and maintained {artifact} using {languages} and {frameworks}")

        team_size = random.randint(3, 15)
        deliverable = random.choice(['product features', 'platform improvements', 'system migrations', 'infrastructure upgrades'])
        candidates.append(f"Collaborated with cross-functional teams of {team_size} engineers to deliver {deliverable} on schedule")

        built = random.choice(['CI/CD pipelines', 'testing frameworks', 'monitoring solutions', 'data models', 'caching strategies', 'authentication systems'])
        candidates.append(f"Designed and implemented {built} that {_impact()}")

        legacy = random.choice(['legacy monolith', 'on-premise infrastructure', 'batch processing system', 'manual workflows'])
        target = random.choice(['cloud-native architecture', 'microservices', 'real-time streaming', 'automated pipelines'])
        candidates.append(f"Led migration of {legacy} to {target}")

        mentees = random.randint(2, 8)
        candidates.append(f"Mentored {mentees} junior engineers through code reviews, pair programming, and technical design sessions")

        tuned = random.choice(['database queries', 'API response times', 'model inference', 'data processing pipelines', 'search indexing'])
        candidates.append(f"Optimized {tuned} resulting in {_impact()}")

        doc_kind = random.choice(['RFCs', 'design docs', 'runbooks', 'architecture decision records'])
        doc_topic = random.choice(['system design', 'API contracts', 'deployment procedures', 'incident response'])
        candidates.append(f"Wrote comprehensive technical documentation and {doc_kind} for {doc_topic}")

        mode = random.choice(['real-time', 'batch', 'streaming', 'event-driven'])
        system = random.choice(['data pipeline', 'ETL process', 'analytics system', 'feature store'])
        volume = random.choice(['1M+', '10M+', '100M+', '1B+'])
        cadence = random.choice(['daily', 'per hour', 'in real-time'])
        candidates.append(f"Built {mode} {system} processing {volume} records {cadence}")

        adopted_framework = _pick_one(FRAMEWORKS)
        adopted_tool = _pick_one(TOOLS)
        candidates.append(f"Spearheaded adoption of {adopted_framework} and {adopted_tool}, {_impact()}")

        experiment = random.choice(['recommendation engine', 'search ranking', 'pricing model', 'onboarding flow', 'notification system'])
        candidates.append(f"Conducted A/B testing and experimentation for {experiment}, {_impact()}")

        trait = random.choice(['distributed', 'fault-tolerant', 'highly available', 'horizontally scalable'])
        rps = random.choice(['10K', '50K', '100K', '1M'])
        uptime = random.choice(['99.9%', '99.95%', '99.99%'])
        candidates.append(f"Architected {trait} system handling {rps} requests per second with {uptime} uptime")

        wanted = random.randint(2, 5)
        chosen = random.sample(candidates, min(wanted, len(candidates)))
        chosen = _reorder_bullets(chosen)

        marker = _bullet()
        lines.extend(f"{marker} {text}" if marker else text for text in chosen)
        return "\n".join(lines)

    role_count = random.choices([1, 2], weights=[0.7, 0.3])[0]
    rendered = []
    for _ in range(role_count):
        rendered.append(_single_role())
    return "\n\n".join(rendered)
|
|
|
|
|
|
|
|
def generate_skills() -> str:
    """Generate a realistic skills section.

    One of three layouts is picked uniformly at random:

    * categorized - labelled lines such as ``Languages: Python, Go``;
    * flat - a single shuffled, delimiter-separated list;
    * proficiency - lines grouped by level (``Expert: ...``).

    Returns:
        The section text. It is never empty.
    """

    def _labelled_group(pool, low, high, labels):
        """Draw ``low``-``high`` items from ``pool`` and pair them with a label."""
        items = _pick(pool, k=random.randint(low, high))
        return random.choice(labels), items

    def _t_categorized():
        """Labelled lines, one per skill category that was drawn."""
        language_group = (
            PROGRAMMING_LANGUAGES, 3, 7,
            ["Languages", "Programming Languages", "Programming"],
        )
        # (inclusion probability, group spec) in output order.
        optional_groups = [
            (0.9, language_group),
            (0.9, (FRAMEWORKS, 3, 7, ["Frameworks", "Frameworks & Libraries", "Technologies"])),
            (0.8, (TOOLS, 3, 7, ["Tools", "Developer Tools", "Tools & Platforms"])),
            (0.4, (SOFT_SKILLS, 2, 5, ["Soft Skills", "Other Skills", "Additional Skills"])),
        ]
        categories = [
            _labelled_group(*spec)
            for chance, spec in optional_groups
            if random.random() < chance
        ]

        # Every group is optional, so roughly 0.1% of draws select none of
        # them. Fall back to a languages line rather than emit an empty
        # string as a "skills" example.
        if not categories:
            categories.append(_labelled_group(*language_group))

        sep = random.choice([": ", " - ", " — "])
        lines = []
        for label, items in categories:
            joiner = random.choice([", ", " | ", " · ", " / "])
            lines.append(f"{label}{sep}{joiner.join(items)}")
        return "\n".join(lines)

    def _t_flat():
        """A single shuffled list mixing languages, frameworks and tools."""
        all_skills = (
            _pick(PROGRAMMING_LANGUAGES, k=random.randint(3, 6))
            + _pick(FRAMEWORKS, k=random.randint(3, 6))
            + _pick(TOOLS, k=random.randint(2, 4))
        )
        random.shuffle(all_skills)
        joiner = random.choice([", ", " | ", " · ", " • "])
        return joiner.join(all_skills)

    def _t_proficiency():
        """Two to four proficiency levels, each with two to five skills."""
        lines = []
        levels = ["Expert", "Advanced", "Proficient", "Intermediate", "Familiar"]
        used = set()
        for level in random.sample(levels, k=random.randint(2, 4)):
            # A skill already listed under one level is not repeated.
            pool = [s for s in PROGRAMMING_LANGUAGES + FRAMEWORKS + TOOLS if s not in used]
            items = _pick(pool, k=random.randint(2, 5))
            used.update(items)
            lines.append(f"{level}: {', '.join(items)}")
        return "\n".join(lines)

    templates = [_t_categorized, _t_flat, _t_proficiency]
    return random.choice(templates)()
|
|
|
|
|
|
|
|
def generate_projects() -> str:
    """Generate a realistic projects section.

    Produces one to three project entries separated by blank lines. Each
    entry has a header (name plus technologies), an optional repository
    URL (30% of entries) and two to four description bullets.

    Returns:
        The section text.
    """

    def _single_project():
        """Render one project entry."""
        adj = _pick_one(PROJECT_ADJECTIVES)
        noun = _pick_one(PROJECT_NOUNS)
        name = f"{adj} {noun}"
        techs = _pick(PROGRAMMING_LANGUAGES + FRAMEWORKS, k=random.randint(2, 5))
        tech_list = ", ".join(techs)

        header_styles = [
            f"{name} | {tech_list}",
            f"{name}\nTechnologies: {tech_list}",
            f"{name} ({tech_list})",
        ]
        lines = [random.choice(header_styles)]

        # Optional repository link; the slug is the lower-cased,
        # hyphenated project name.
        if random.random() < 0.3:
            username = _pick_one(FIRST_NAMES).lower() + _pick_one(LAST_NAMES).lower()
            lines.append(f"github.com/{username}/{name.lower().replace(' ', '-')}")

        # Acronyms such as "API" stay upper-case, and the article follows
        # the noun's first letter ("an API", "an engine", "a dashboard").
        noun_phrase = noun if noun.isupper() else noun.lower()
        article = "an" if noun_phrase[0].lower() in "aeiou" else "a"

        descriptions = [
            f"Built {article} {noun_phrase} that {random.choice(['processes', 'analyzes', 'visualizes', 'aggregates', 'transforms'])} {random.choice(['user data', 'financial data', 'text documents', 'sensor data', 'social media feeds', 'medical records'])} in real-time",
            f"Implemented {random.choice(['REST API', 'GraphQL API', 'gRPC service', 'WebSocket server', 'event-driven architecture'])} with {random.choice(['authentication', 'rate limiting', 'caching', 'pagination', 'logging'])} support",
            f"Trained {random.choice(['classification', 'regression', 'NLP', 'computer vision', 'recommendation'])} model achieving {random.choice(['92%', '95%', '97%', '89%', '94%'])} {random.choice(['accuracy', 'F1 score', 'AUC-ROC'])} on test set",
            f"Deployed to {random.choice(['AWS', 'GCP', 'Azure', 'Heroku', 'Vercel', 'Railway'])} with {random.choice(['Docker', 'Kubernetes', 'serverless', 'auto-scaling'])} configuration",
            f"Attracted {random.choice(['100+', '500+', '1K+', '5K+'])} GitHub stars and {random.choice(['20+', '50+', '100+'])} contributors from the open-source community",
            f"Features {random.choice(['real-time notifications', 'responsible UI', 'role-based access control', 'data export', 'interactive visualizations', 'natural language search'])}".replace("responsible UI", "responsive UI"),
        ]

        b = _bullet()
        for desc in random.sample(descriptions, k=random.randint(2, 4)):
            lines.append(f"{b} {desc}" if b else desc)

        return "\n".join(lines)

    n_projects = random.randint(1, 3)
    return "\n\n".join(_single_project() for _ in range(n_projects))
|
|
|
|
|
|
|
|
def generate_summary() -> str:
    """Generate a realistic professional summary / objective section.

    One of five paragraph templates is picked uniformly at random and
    filled with a job title, years of experience (2-15), one to three
    specialties and other random details. Values that appear side by
    side (specialties, the two example employers, the two "intersection"
    topics) are always distinct.

    Returns:
        A single-paragraph summary.
    """
    years = random.randint(2, 15)
    specialty_pool = MAJORS + [
        "full-stack development", "distributed systems", "machine learning",
        "data engineering", "cloud architecture", "mobile development",
        "DevOps", "backend development", "frontend development",
        "natural language processing", "computer vision",
    ]

    # The pool holds some topics in two casings ("Machine Learning" and
    # "machine learning"), so drop case-insensitive repeats after sampling.
    specialties = []
    seen = set()
    for topic in _pick(specialty_pool, k=random.randint(1, 3)):
        if topic.lower() not in seen:
            seen.add(topic.lower())
            specialties.append(topic)

    def _results_driven():
        """Track-record paragraph naming two different employers."""
        employers = " and ".join(_pick(COMPANIES, k=2))
        return f"Results-driven {_pick_one(JOB_TITLES).lower()} with {years}+ years of experience in {' and '.join(specialties)}. Proven track record of {random.choice(['delivering high-impact solutions', 'building scalable systems', 'driving technical excellence', 'leading cross-functional teams'])} at companies like {employers}. Passionate about {random.choice(['clean code', 'system design', 'open source', 'mentorship', 'continuous learning', 'innovation'])} and {random.choice(['building products that scale', 'solving complex problems', 'leveraging data-driven insights', 'improving developer experience'])}."

    def _specialist():
        """Skills-focused paragraph ending with what the candidate seeks."""
        return f"Experienced {_pick_one(JOB_TITLES).lower()} specializing in {', '.join(specialties)}. Skilled in {', '.join(_pick(PROGRAMMING_LANGUAGES, k=3))} with deep expertise in {', '.join(_pick(FRAMEWORKS, k=2))}. {random.choice(['Strong background in', 'Demonstrated ability in', 'Track record of'])} {random.choice(['building distributed systems at scale', 'developing ML models for production', 'architecting cloud-native applications', 'leading agile engineering teams'])}. Seeking to {random.choice(['contribute to cutting-edge products', 'drive technical innovation', 'solve challenging problems', 'build impactful technology'])} at a {random.choice(['fast-growing startup', 'leading technology company', 'mission-driven organization'])}."

    def _achievements():
        """Paragraph built around three impact metrics."""
        return f"{_pick_one(JOB_TITLES)} with {years} years of experience building {random.choice(['enterprise-scale', 'consumer-facing', 'B2B', 'data-intensive'])} applications. Key achievements include {_impact()}, {_impact()}, and {_impact()}. Proficient in {', '.join(_pick(PROGRAMMING_LANGUAGES, k=3))} and {', '.join(_pick(FRAMEWORKS, k=2))}."

    def _objective():
        """One-sentence objective statement."""
        return f"Motivated {random.choice(['professional', 'engineer', 'developer', 'technologist'])} seeking a {_pick_one(JOB_TITLES).lower()} role where I can apply my expertise in {' and '.join(specialties)} to {random.choice(['build innovative products', 'solve real-world problems', 'drive business impact', 'push the boundaries of technology'])}."

    def _first_person():
        """First-person paragraph about the intersection of two topics."""
        first_topic = _pick_one(specialties)
        # Prefer a second drawn specialty; when only one was drawn, take a
        # different topic from the wider pool so the pair never repeats.
        alternatives = [t for t in specialties if t != first_topic] or [
            t for t in specialty_pool if t.lower() != first_topic.lower()
        ]
        second_topic = _pick_one(alternatives)
        return f"I am a {_pick_one(JOB_TITLES).lower()} who thrives at the intersection of {first_topic} and {second_topic}. Over the past {years} years, I have {random.choice(['shipped products used by millions', 'built ML systems processing petabytes of data', 'led engineering teams through rapid growth', 'contributed to open-source projects with thousands of stars'])}. I bring a {random.choice(['data-driven', 'user-centric', 'systems-thinking', 'first-principles'])} approach to every problem I tackle."

    # Only the chosen template is rendered.
    templates = [_results_driven, _specialist, _achievements, _objective, _first_person]
    return random.choice(templates)()
|
|
|
|
|
|
|
|
def generate_certifications() -> str:
    """Build the text of one synthetic certifications section.

    Two to six distinct certifications are listed, each with an issue
    year between 2019 and 2025 in one of four formats. When a non-empty
    bullet marker is drawn, half of the time every entry is prefixed
    with it.
    """
    how_many = random.randint(2, 6)
    chosen = _pick(CERTIFICATIONS_LIST, k=how_many)

    entries = []
    for cert in chosen:
        year = random.randint(2019, 2025)
        issued_long = _pick_one(MONTHS)
        issued_short = _pick_one(MONTHS_SHORT)
        # Optional expiry two or three years after issue (30%).
        expiry = ""
        if random.random() < 0.3:
            expiry = f" | Expires: {_pick_one(MONTHS_SHORT)} {year + random.randint(2, 3)}"

        variants = [
            f"{cert} ({year})",
            f"{cert} — Issued {issued_long} {year}",
            f"{cert}, {year}",
            f"{cert}\n Issued: {issued_short} {year}" + expiry,
        ]
        entries.append(random.choice(variants))

    marker = _bullet()
    if marker and random.random() < 0.5:
        entries = [f"{marker} {entry}" for entry in entries]
    return "\n".join(entries)
|
|
|
|
|
|
|
|
def generate_contact() -> str:
    """Build the text of one synthetic contact-information section.

    The full name always comes first. E-mail, phone, city, LinkedIn,
    GitHub and a personal site follow with probabilities 0.8, 0.7, 0.6,
    0.5, 0.4 and 0.2. Fields are joined by a newline or an inline
    separator.
    """
    first = _pick_one(FIRST_NAMES)
    last = _pick_one(LAST_NAMES)
    city = _pick_one(CITIES)
    area_code = _pick_one(PHONE_AREA_CODES)
    first_lc = first.lower()
    last_lc = last.lower()

    local_part = random.choice([
        f"{first_lc}.{last_lc}",
        f"{first_lc}{last_lc}",
        f"{first[0].lower()}{last_lc}",
        f"{first_lc}_{last_lc}",
        f"{first_lc}{random.randint(1, 99)}",
    ])
    email = f"{local_part}@{_pick_one(DOMAINS)}"

    exchange = random.randint(100, 999)
    subscriber = random.randint(1000, 9999)
    phone = f"({area_code}) {exchange}-{subscriber}"

    linkedin_user = f"{first_lc}-{last_lc}-{random.randint(100, 999)}"
    github_user = f"{first_lc}{last_lc}"

    parts = [f"{first} {last}"]

    # Fields whose values are already rendered: (probability, text).
    for chance, value in ((0.8, email), (0.7, phone), (0.6, city)):
        if random.random() < chance:
            parts.append(value)

    if random.random() < 0.5:
        parts.append(f"{_pick_one(LINKEDIN_PREFIXES)}{linkedin_user}")
    if random.random() < 0.4:
        parts.append(f"{_pick_one(GITHUB_PREFIXES)}{github_user}")
    if random.random() < 0.2:
        # Personal site under .dev or .com, equally likely.
        if random.random() < 0.5:
            site = f"{github_user}.dev"
        else:
            site = f"{first_lc}{last_lc}.com"
        parts.append(site)

    # "\n" is listed twice so the multi-line layout is drawn twice as
    # often as either inline separator.
    delimiter = random.choice(["\n", " | ", " · ", "\n"])
    return delimiter.join(parts)
|
|
|
|
|
|
|
|
def generate_awards() -> str:
    """Build one synthetic awards & honors section.

    Between two and six awards are drawn from ``AWARDS_LIST``. Each one gets
    a random year (2015-2025), a granting organisation and one of four line
    layouts.

    Returns:
        The award lines joined by newlines. When ``_bullet()`` yields a
        truthy marker, every line is prefixed with it 60% of the time.
    """
    professional_bodies = [
        "ACM", "IEEE", "Google", "Facebook", "Microsoft",
        "National Science Foundation", "Department of Education",
    ]

    entries = []
    for award in _pick(AWARDS_LIST, k=random.randint(2, 6)):
        year = random.randint(2015, 2025)

        # One candidate of each kind is drawn every time; only one is used.
        org_candidates = [
            _pick_one(UNIVERSITIES),
            _pick_one(COMPANIES),
            random.choice(professional_bodies),
        ]
        org = random.choice(org_candidates)

        layouts = [
            f"{award}, {org} ({year})",
            f"{award} — {org}, {year}",
            f"{award} ({year})\n Awarded by {org}",
            f"{award}, {year}",
        ]
        entries.append(random.choice(layouts))

    bullet = _bullet()
    # Short-circuit keeps the RNG untouched when there is no bullet marker.
    if not bullet or random.random() >= 0.6:
        return "\n".join(entries)
    return "\n".join(f"{bullet} {entry}" for entry in entries)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Candidate heading strings for each section label. One of these may be
# prepended to a generated section body by generate_example().
SECTION_HEADERS = {
    "education": [
        "EDUCATION", "Education", "Academic Background",
        "ACADEMIC BACKGROUND", "Education & Training",
    ],
    "experience": [
        "EXPERIENCE", "Experience", "WORK EXPERIENCE", "Work Experience",
        "PROFESSIONAL EXPERIENCE", "Professional Experience",
        "Employment History",
    ],
    "skills": [
        "SKILLS", "Skills", "TECHNICAL SKILLS", "Technical Skills",
        "Core Competencies", "CORE COMPETENCIES", "Technologies",
    ],
    "projects": [
        "PROJECTS", "Projects", "PERSONAL PROJECTS", "Personal Projects",
        "SIDE PROJECTS", "Selected Projects", "Portfolio",
    ],
    "summary": [
        "SUMMARY", "Summary", "PROFESSIONAL SUMMARY", "Professional Summary",
        "OBJECTIVE", "Objective", "PROFILE", "Profile", "About Me", "ABOUT",
    ],
    "certifications": [
        "CERTIFICATIONS", "Certifications", "CERTIFICATES", "Certificates",
        "Licenses & Certifications", "PROFESSIONAL CERTIFICATIONS",
    ],
    "contact": [
        "CONTACT", "Contact", "CONTACT INFORMATION", "Contact Information",
        "Personal Information",
    ],
    "awards": [
        "AWARDS", "Awards", "HONORS & AWARDS", "Honors & Awards",
        "ACHIEVEMENTS", "Achievements", "Awards & Honors", "RECOGNITION",
    ],
}
|
|
|
|
|
# Dispatch table: section label -> zero-argument function that produces the
# text for that section. Key order determines the order in which
# generate_dataset() walks the categories.
GENERATORS = dict(
    education=generate_education,
    experience=generate_experience,
    skills=generate_skills,
    projects=generate_projects,
    summary=generate_summary,
    certifications=generate_certifications,
    contact=generate_contact,
    awards=generate_awards,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_example(label: str, include_header: bool = False, augment: bool = False) -> str:
    """
    Produce one synthetic section text for ``label``.

    Args:
        label: Key into ``GENERATORS`` / ``SECTION_HEADERS``; an unknown
            label raises ``KeyError``.
        include_header: Allows a section heading to be prepended. Even when
            true, the heading is only added half of the time.
        augment: Enables the random perturbations described below.

    Returns:
        The generated text. With ``augment`` set, three independent
        perturbations may be applied in order: synonym replacement (40%),
        stripping plus a trailing newline (20%), and a leading space (10%).
    """
    body = GENERATORS[label]()

    # Optional heading, separated from the body by one of three dividers.
    if include_header and random.random() < 0.5:
        heading = _pick_one(SECTION_HEADERS[label])
        divider = random.choice(["\n", "\n\n", "\n---\n"])
        body = f"{heading}{divider}{body}"

    if not augment:
        return body

    if random.random() < 0.4:
        body = _synonym_replace(body)

    # Whitespace noise: normalise the edges, then maybe add a leading space.
    if random.random() < 0.2:
        body = body.strip() + "\n"
    if random.random() < 0.1:
        body = " " + body

    return body
|
|
|
|
|
|
|
|
def generate_dataset(
    examples_per_category: int = 80,
    augmented_copies: int = 2,
    include_header_prob: float = 0.4,
    seed: int = 42,
) -> list[dict]:
    """
    Generate the full shuffled synthetic dataset.

    For every label in ``GENERATORS`` this emits ``examples_per_category``
    plain examples, each followed by ``augmented_copies`` examples generated
    with ``augment=True``. The augmented examples are freshly generated
    texts, not edits of the preceding plain example; they only share its
    ``include_header`` decision.

    Note that this reseeds the module-level ``random`` generator.

    Args:
        examples_per_category: Plain examples per category.
        augmented_copies: Augmented examples emitted after each plain one.
        include_header_prob: Probability that a group of examples is
            generated with ``include_header=True``.
        seed: Seed passed to ``random.seed`` for reproducibility.

    Returns:
        List of dicts with 'text' and 'label' keys, in shuffled order.
    """
    random.seed(seed)
    records = []

    # First variant is the plain example, the rest are augmented ones.
    augment_flags = [False] + [True] * augmented_copies

    for label in GENERATORS:
        for _ in range(examples_per_category):
            with_header = random.random() < include_header_prob
            for use_augment in augment_flags:
                sample = generate_example(
                    label, include_header=with_header, augment=use_augment
                )
                records.append({"text": sample, "label": label})

    random.shuffle(records)
    return records
|
|
|
|
|
|
|
|
def save_to_csv(dataset: list[dict], path: str) -> None:
    """Write the dataset to ``path`` as a CSV with 'text' and 'label' columns.

    Missing parent directories are created. A one-line confirmation with the
    row count and destination is printed afterwards.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # newline="" lets the csv module control line endings, which matters
    # because the text field contains embedded newlines.
    with target.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=["text", "label"])
        out.writeheader()
        for row in dataset:
            out.writerow(row)

    print(f"Saved {len(dataset)} examples to {target}")
|
|
|
|
|
|
|
|
def load_as_hf_dataset(dataset: list[dict], seed: int = 42):
    """Convert to HuggingFace Dataset with train/val/test splits.

    The data is split 80/10/10 into train/validation/test, stratified on the
    'label' column.

    Args:
        dataset: List of dicts with 'text' and 'label' keys.
        seed: Seed used for both splits (default 42, the previously
            hard-coded value).

    Returns:
        A ``DatasetDict`` with "train", "validation" and "test" splits. The
        'label' column is a ``ClassLabel`` feature (integer ids plus names)
        rather than raw strings.
    """
    # Imported lazily so the generator itself works without `datasets`.
    from datasets import Dataset, DatasetDict

    ds = Dataset.from_list(dataset)

    # train_test_split(stratify_by_column=...) only accepts ClassLabel
    # columns; from_list() produces a plain string column, which makes the
    # split raise ValueError. Encode the labels first.
    # NOTE(review): class_encode_column appears to assign ids in sorted label
    # order, which would match get_label_mapping() — confirm against the
    # installed `datasets` version.
    ds = ds.class_encode_column("label")

    # 80% train, then the remaining 20% is halved into validation and test.
    train_test = ds.train_test_split(test_size=0.2, seed=seed, stratify_by_column="label")
    val_test = train_test["test"].train_test_split(test_size=0.5, seed=seed, stratify_by_column="label")

    return DatasetDict({
        "train": train_test["train"],
        "validation": val_test["train"],
        "test": val_test["test"],
    })
|
|
|
|
|
|
|
|
def get_label_mapping(dataset: list[dict]) -> tuple[dict, dict]:
    """Build the label-to-id and id-to-label lookup tables.

    Ids are assigned by enumerating the distinct labels in sorted order,
    starting at 0.

    Returns:
        A ``(label2id, id2label)`` tuple.
    """
    distinct_labels = sorted({row["label"] for row in dataset})
    id2label = dict(enumerate(distinct_labels))
    label2id = {name: idx for idx, name in id2label.items()}
    return label2id, id2label
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line entry point: generate the dataset, write it to CSV, and
# optionally print per-label statistics and a few sample texts.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Generate synthetic resume section data")
    parser.add_argument("--examples-per-category", type=int, default=80,
                        help="Number of base examples per category (default: 80)")
    parser.add_argument("--augmented-copies", type=int, default=2,
                        help="Number of augmented copies per example (default: 2)")
    parser.add_argument("--output", type=str, default="data/resume_sections.csv",
                        help="Output CSV path (default: data/resume_sections.csv)")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed (default: 42)")
    parser.add_argument("--print-stats", action="store_true",
                        help="Print dataset statistics")
    parser.add_argument("--print-samples", type=int, default=0,
                        help="Print N sample examples")

    args = parser.parse_args()

    # Derive the category count from GENERATORS so the estimate stays right
    # if a category is added or removed (it was hard-coded to 8).
    expected_total = args.examples_per_category * (1 + args.augmented_copies) * len(GENERATORS)

    print(f"Generating dataset with {args.examples_per_category} base examples per category...")
    print(f"Augmented copies per example: {args.augmented_copies}")
    print(f"Total expected examples: {expected_total}")

    dataset = generate_dataset(
        examples_per_category=args.examples_per_category,
        augmented_copies=args.augmented_copies,
        seed=args.seed,
    )

    save_to_csv(dataset, args.output)

    if args.print_stats:
        from collections import Counter

        counts = Counter(d["label"] for d in dataset)
        print("\nDataset Statistics:")
        print(f" Total examples: {len(dataset)}")
        print(f" Categories: {len(counts)}")
        for label, count in sorted(counts.items()):
            print(f" {label}: {count}")
        # An empty dataset (e.g. --examples-per-category 0) would otherwise
        # raise ZeroDivisionError here.
        avg_len = sum(len(d["text"]) for d in dataset) / len(dataset) if dataset else 0
        print(f" Average text length: {avg_len:.0f} chars")

    if args.print_samples > 0:
        print(f"\n{'='*60}")
        print(f"Sample Examples (first {args.print_samples}):")
        print(f"{'='*60}")
        for i, example in enumerate(dataset[:args.print_samples]):
            print(f"\n--- Example {i+1} [{example['label']}] ---")
            # Long texts are truncated to 300 characters for display.
            print(example["text"][:300])
            if len(example["text"]) > 300:
                print("...")
|
|
|