# Synthetic training-data generator for resume information extraction and
# resume-to-job matching (instruction-tuning format, train/val/test splits).
# Standard library
import json
import os
from typing import Dict, List, Tuple

# Third-party
import numpy as np
import pandas as pd
class DataGenerator:
    """Generate synthetic instruction-tuning data for two tasks.

    The two tasks are resume field extraction ("extraction") and
    resume-vs-job-description scoring ("matching").  All generators are
    stateless, so they are exposed as static methods.  (Previously the
    methods lacked both ``self`` and ``@staticmethod``: class-level calls
    worked, but calling them on an instance raised ``TypeError``.)
    """

    @staticmethod
    def generate_extraction_samples(num_samples: int = 1000) -> List[Dict]:
        """Generate resume extraction training samples.

        Args:
            num_samples: Number of synthetic resumes to produce.

        Returns:
            A list of dicts, each with:
              - "input": plain-text resume
              - "output": JSON string of the fields to extract
              - "task": "extraction"
        """
        companies = ["TechCorp", "DataFlow", "CloudSys", "AI Labs", "WebDev Inc",
                     "FinTech Solutions", "Health Systems", "E-commerce Plus"]
        roles = ["Developer", "Senior Developer", "Data Scientist", "ML Engineer",
                 "Product Manager", "DevOps Engineer", "Frontend Engineer", "Backend Engineer"]
        skills_pool = ["Python", "Django", "Flask", "FastAPI", "PostgreSQL", "MongoDB",
                       "React", "Vue.js", "AWS", "GCP", "Docker", "Kubernetes",
                       "Machine Learning", "NLP", "TensorFlow", "PyTorch", "Git",
                       "SQL", "REST API", "GraphQL", "Redis", "Elasticsearch"]
        universities = ["MIT", "Stanford", "Carnegie Mellon", "Berkeley", "Harvard",
                        "University of Washington", "State University", "Tech Institute"]
        degrees = ["BS Computer Science", "BS Data Science", "MS Computer Science",
                   "MS Artificial Intelligence", "BS Engineering"]

        samples = []
        for i in range(num_samples):
            name = f"Candidate_{i+1}"
            email = f"candidate{i+1}@email.com"
            phone = f"555-{np.random.randint(1000, 9999)}"

            # Experience: 1-3 random entries.
            num_exp = np.random.randint(1, 4)
            experience = []
            for _ in range(num_exp):
                experience.append({
                    "company": np.random.choice(companies),
                    "role": np.random.choice(roles),
                    "duration": f"{np.random.randint(1, 7)} years",
                    "description": "Led projects and mentored team members"
                })

            # Skills: 3-9 distinct entries (replace=False prevents duplicates).
            num_skills = np.random.randint(3, 10)
            skills = list(np.random.choice(skills_pool, num_skills, replace=False))

            # Education: a single degree record.
            education = [{
                "degree": np.random.choice(degrees),
                "university": np.random.choice(universities),
                "graduation_year": str(np.random.randint(2015, 2023))
            }]

            # Certifications: 0-2 placeholder names.
            certifications = [f"Cert_{j}" for j in range(np.random.randint(0, 3))]

            resume_text = f"""
Resume of {name}
Email: {email} | Phone: {phone}
EXPERIENCE:
{chr(10).join([f"- {exp['company']}: {exp['role']} ({exp['duration']})" for exp in experience])}
SKILLS:
{', '.join(skills)}
EDUCATION:
{chr(10).join([f"- {edu['degree']} from {edu['university']} ({edu['graduation_year']})" for edu in education])}
CERTIFICATIONS:
{chr(10).join(certifications) if certifications else "None"}
"""
            # Ground-truth structure the model should learn to emit.
            extracted_data = {
                "name": name,
                "email": email,
                "phone": phone,
                "skills": skills,
                "experience": experience,
                "education": education,
                "certifications": certifications
            }
            samples.append({
                "input": resume_text.strip(),
                "output": json.dumps(extracted_data, indent=2),
                "task": "extraction"
            })
        return samples

    @staticmethod
    def generate_matching_samples(num_samples: int = 500) -> List[Dict]:
        """Generate resume-job matching training samples.

        Args:
            num_samples: Number of resume/job pairs to produce.

        Returns:
            A list of dicts with "input" (resume + job description text),
            "output" (JSON match verdict), and "task" ("matching").
        """
        job_titles = ["Senior Python Developer", "Data Scientist", "ML Engineer",
                      "Full-Stack Developer", "DevOps Engineer", "Product Manager"]
        skills_pool = ["Python", "Django", "PostgreSQL", "AWS", "Docker", "Kubernetes",
                       "Machine Learning", "React", "Node.js", "SQL"]

        samples = []
        for i in range(num_samples):
            # Create job description with 3-6 distinct required skills
            # (always >= 3, so the score division below cannot hit zero).
            job_title = np.random.choice(job_titles)
            required_skills = list(np.random.choice(skills_pool, np.random.randint(3, 7), replace=False))
            job_desc = f"""
Job Title: {job_title}
Required Skills:
{', '.join(required_skills)}
Experience: 3+ years in relevant role
Education: BS in Computer Science or related field
"""
            # Create a resume with 3-7 distinct skills.
            resume_skills = list(np.random.choice(skills_pool, np.random.randint(3, 8), replace=False))
            resume = f"Skills: {', '.join(resume_skills)}\nExperience: {np.random.randint(1, 8)} years"

            # Score = percentage of required skills covered, capped at 100.
            matching_skills = list(set(resume_skills) & set(required_skills))
            match_score = min(100, int((len(matching_skills) / len(required_skills)) * 100))
            matching_data = {
                "match_score": match_score,
                "matching_skills": matching_skills,
                "missing_skills": [s for s in required_skills if s not in resume_skills],
                "recommendation": "Recommend interview" if match_score >= 70 else "Consider further review"
            }
            samples.append({
                "input": f"Resume:\n{resume}\n\nJob Description:\n{job_desc}",
                "output": json.dumps(matching_data, indent=2),
                "task": "matching"
            })
        return samples

    @staticmethod
    def create_instruction_dataset(extraction_samples: List[Dict],
                                   matching_samples: List[Dict]) -> List[Dict]:
        """Convert raw samples to instruction-following records.

        Args:
            extraction_samples: Output of :meth:`generate_extraction_samples`.
            matching_samples: Output of :meth:`generate_matching_samples`.

        Returns:
            Combined list of {"instruction", "input", "output", "task"} dicts,
            extraction records first, matching records second.
        """
        dataset = []
        # Extraction task instructions
        for sample in extraction_samples:
            dataset.append({
                "instruction": "Extract structured information from the resume. Return valid JSON.",
                "input": sample["input"],
                "output": sample["output"],
                "task": "extraction"
            })
        # Matching task instructions
        for sample in matching_samples:
            dataset.append({
                "instruction": "Compare the resume against the job description and provide a match score (0-100) with reasoning. Return valid JSON.",
                "input": sample["input"],
                "output": sample["output"],
                "task": "matching"
            })
        return dataset
def prepare_data(output_dir: str = "data/processed"):
    """Generate, shuffle, split, and persist the synthetic datasets.

    Writes ``train.json`` / ``validation.json`` / ``test.json`` under
    *output_dir* using an 80/10/10 split.

    Args:
        output_dir: Directory for the JSON files (created if missing).

    Returns:
        Tuple of (train_data, val_data, test_data) lists.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("Generating extraction samples...")
    extraction_samples = DataGenerator.generate_extraction_samples(1000)
    print("Generating matching samples...")
    matching_samples = DataGenerator.generate_matching_samples(500)
    print("Creating instruction dataset...")
    full_dataset = DataGenerator.create_instruction_dataset(extraction_samples, matching_samples)

    # Shuffle in place, then carve 80/10/10 train/val/test partitions.
    np.random.shuffle(full_dataset)
    total = len(full_dataset)
    cut_train = int(0.8 * total)
    cut_val = int(0.9 * total)
    train_data = full_dataset[:cut_train]
    val_data = full_dataset[cut_train:cut_val]
    test_data = full_dataset[cut_val:]

    # Persist each split as pretty-printed JSON.
    splits = (("train", train_data), ("validation", val_data), ("test", test_data))
    for stem, records in splits:
        with open(f"{output_dir}/{stem}.json", "w") as f:
            json.dump(records, f, indent=2)

    print("✅ Data prepared successfully!")
    print(f" - Train samples: {len(train_data)}")
    print(f" - Validation samples: {len(val_data)}")
    print(f" - Test samples: {len(test_data)}")
    print(f" - Total: {total}")
    return train_data, val_data, test_data
# Script entry point: generate, split, and save the datasets.
if __name__ == "__main__":
    prepare_data()