resume-llm-api / src /data_preparation.py
mhr-212's picture
Upload folder using huggingface_hub
7e0c689 verified
import json
import os
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
class DataGenerator:
    """Generate synthetic training data for both tasks.

    The two tasks are resume information extraction and resume/job matching.
    Every method is a static method, so the class acts purely as a namespace
    and never needs to be instantiated.
    """

    @staticmethod
    def _rng(seed: Optional[int] = None):
        """Return the random source to draw from.

        Args:
            seed: ``None`` to use NumPy's global legacy RNG (``np.random``),
                otherwise the seed for a private ``RandomState``.

        Returns:
            An object exposing ``randint`` and ``choice``.
        """
        # The ``np.random`` module and ``RandomState`` share the same legacy
        # method names, so callers can use either interchangeably.
        return np.random if seed is None else np.random.RandomState(seed)

    @staticmethod
    def generate_extraction_samples(num_samples: int = 1000,
                                    seed: Optional[int] = None) -> List[Dict]:
        """Generate resume extraction training samples.

        Args:
            num_samples: Number of synthetic resumes to create.
            seed: Optional seed for a private RNG. When ``None`` the global
                NumPy RNG is used, so ``np.random.seed`` set by the caller
                still controls the output.

        Returns:
            A list of dicts with keys ``"input"`` (plain-text resume),
            ``"output"`` (JSON string of the structured fields) and
            ``"task"`` (always ``"extraction"``).
        """
        rng = DataGenerator._rng(seed)

        companies = ["TechCorp", "DataFlow", "CloudSys", "AI Labs", "WebDev Inc",
                     "FinTech Solutions", "Health Systems", "E-commerce Plus"]
        roles = ["Developer", "Senior Developer", "Data Scientist", "ML Engineer",
                 "Product Manager", "DevOps Engineer", "Frontend Engineer", "Backend Engineer"]
        skills_pool = ["Python", "Django", "Flask", "FastAPI", "PostgreSQL", "MongoDB",
                       "React", "Vue.js", "AWS", "GCP", "Docker", "Kubernetes",
                       "Machine Learning", "NLP", "TensorFlow", "PyTorch", "Git",
                       "SQL", "REST API", "GraphQL", "Redis", "Elasticsearch"]
        universities = ["MIT", "Stanford", "Carnegie Mellon", "Berkeley", "Harvard",
                        "University of Washington", "State University", "Tech Institute"]
        degrees = ["BS Computer Science", "BS Data Science", "MS Computer Science",
                   "MS Artificial Intelligence", "BS Engineering"]

        samples = []
        for i in range(num_samples):
            name = f"Candidate_{i+1}"
            email = f"candidate{i+1}@email.com"
            phone = f"555-{rng.randint(1000, 9999)}"

            # Experience: one to three past positions.
            num_exp = rng.randint(1, 4)
            experience = []
            for _ in range(num_exp):
                # str(...) turns NumPy scalar strings into plain Python strings.
                experience.append({
                    "company": str(rng.choice(companies)),
                    "role": str(rng.choice(roles)),
                    "duration": f"{rng.randint(1, 7)} years",
                    "description": "Led projects and mentored team members"
                })

            # Skills: three to nine distinct entries from the pool.
            num_skills = rng.randint(3, 10)
            skills = rng.choice(skills_pool, num_skills, replace=False).tolist()

            # Education: always exactly one entry.
            education = [{
                "degree": str(rng.choice(degrees)),
                "university": str(rng.choice(universities)),
                "graduation_year": str(rng.randint(2015, 2023))
            }]

            # Certifications: zero to two placeholder names.
            certifications = [f"Cert_{j}" for j in range(rng.randint(0, 3))]

            experience_lines = [
                f"- {exp['company']}: {exp['role']} ({exp['duration']})"
                for exp in experience
            ]
            education_lines = [
                f"- {edu['degree']} from {edu['university']} ({edu['graduation_year']})"
                for edu in education
            ]
            resume_text = "\n".join([
                f"Resume of {name}",
                f"Email: {email} | Phone: {phone}",
                "EXPERIENCE:",
                *experience_lines,
                "SKILLS:",
                ", ".join(skills),
                "EDUCATION:",
                *education_lines,
                "CERTIFICATIONS:",
                "\n".join(certifications) if certifications else "None",
            ])

            extracted_data = {
                "name": name,
                "email": email,
                "phone": phone,
                "skills": skills,
                "experience": experience,
                "education": education,
                "certifications": certifications
            }
            samples.append({
                "input": resume_text.strip(),
                "output": json.dumps(extracted_data, indent=2),
                "task": "extraction"
            })
        return samples

    @staticmethod
    def generate_matching_samples(num_samples: int = 500,
                                  seed: Optional[int] = None) -> List[Dict]:
        """Generate resume-job matching training samples.

        Args:
            num_samples: Number of resume/job pairs to create.
            seed: Optional seed for a private RNG. When ``None`` the global
                NumPy RNG is used.

        Returns:
            A list of dicts with keys ``"input"`` (resume plus job
            description text), ``"output"`` (JSON string holding the match
            score, matching/missing skills and a recommendation) and
            ``"task"`` (always ``"matching"``).
        """
        rng = DataGenerator._rng(seed)

        job_titles = ["Senior Python Developer", "Data Scientist", "ML Engineer",
                      "Full-Stack Developer", "DevOps Engineer", "Product Manager"]
        skills_pool = ["Python", "Django", "PostgreSQL", "AWS", "Docker", "Kubernetes",
                       "Machine Learning", "React", "Node.js", "SQL"]

        samples = []
        for _ in range(num_samples):
            # Create job description
            job_title = rng.choice(job_titles)
            required_skills = rng.choice(
                skills_pool, rng.randint(3, 7), replace=False
            ).tolist()
            job_desc = (
                "\n"
                f"Job Title: {job_title}\n"
                "Required Skills:\n"
                f"{', '.join(required_skills)}\n"
                "Experience: 3+ years in relevant role\n"
                "Education: BS in Computer Science or related field\n"
            )

            # Create matching resume
            resume_skills = rng.choice(
                skills_pool, rng.randint(3, 8), replace=False
            ).tolist()
            resume = f"Skills: {', '.join(resume_skills)}\nExperience: {rng.randint(1, 8)} years"

            # Calculate match score based on skill overlap. Filtering the
            # required list (instead of intersecting two sets) keeps the
            # result in a stable order, so the emitted JSON does not depend
            # on string hash randomisation.
            resume_skill_set = set(resume_skills)
            matching_skills = [s for s in required_skills if s in resume_skill_set]
            missing_skills = [s for s in required_skills if s not in resume_skill_set]
            # matching_skills is a subset of required_skills (never empty,
            # at least 3 entries), so the ratio is always within 0..1.
            match_score = int((len(matching_skills) / len(required_skills)) * 100)

            matching_data = {
                "match_score": match_score,
                "matching_skills": matching_skills,
                "missing_skills": missing_skills,
                "recommendation": "Recommend interview" if match_score >= 70 else "Consider further review"
            }
            samples.append({
                "input": f"Resume:\n{resume}\n\nJob Description:\n{job_desc}",
                "output": json.dumps(matching_data, indent=2),
                "task": "matching"
            })
        return samples

    @staticmethod
    def create_instruction_dataset(extraction_samples: List[Dict],
                                   matching_samples: List[Dict]) -> List[Dict]:
        """Convert samples to instruction-following format.

        Args:
            extraction_samples: Samples carrying ``"input"`` and ``"output"``
                keys for the extraction task.
            matching_samples: Samples carrying ``"input"`` and ``"output"``
                keys for the matching task.

        Returns:
            A single list with all extraction records first, followed by all
            matching records. Each record has ``"instruction"``, ``"input"``,
            ``"output"`` and ``"task"`` keys.
        """
        extraction_instruction = "Extract structured information from the resume. Return valid JSON."
        matching_instruction = "Compare the resume against the job description and provide a match score (0-100) with reasoning. Return valid JSON."

        # Extraction task instructions
        dataset = [
            {
                "instruction": extraction_instruction,
                "input": sample["input"],
                "output": sample["output"],
                "task": "extraction"
            }
            for sample in extraction_samples
        ]
        # Matching task instructions
        dataset.extend(
            {
                "instruction": matching_instruction,
                "input": sample["input"],
                "output": sample["output"],
                "task": "matching"
            }
            for sample in matching_samples
        )
        return dataset
def prepare_data(output_dir: str = "data/processed",
                 num_extraction: int = 1000,
                 num_matching: int = 500,
                 seed: Optional[int] = None) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    """Main function to prepare all data.

    Generates the synthetic extraction and matching samples, converts them to
    instruction format, shuffles them, splits them 80/10/10 into
    train/validation/test and writes each split as JSON into ``output_dir``.

    Args:
        output_dir: Directory that receives ``train.json``,
            ``validation.json`` and ``test.json``; created if missing.
        num_extraction: Number of extraction samples to generate.
        num_matching: Number of matching samples to generate.
        seed: When given, NumPy's global RNG is seeded with it before any
            sample is generated, making the random draws and the shuffle
            repeatable. ``None`` leaves the global RNG state untouched.

    Returns:
        The ``(train, validation, test)`` lists of instruction records.
    """
    os.makedirs(output_dir, exist_ok=True)

    if seed is not None:
        # Generation and the shuffle below both draw from the global RNG.
        np.random.seed(seed)

    print("Generating extraction samples...")
    extraction_samples = DataGenerator.generate_extraction_samples(num_extraction)
    print("Generating matching samples...")
    matching_samples = DataGenerator.generate_matching_samples(num_matching)
    print("Creating instruction dataset...")
    full_dataset = DataGenerator.create_instruction_dataset(extraction_samples, matching_samples)

    # Split into train/val/test (80% / 10% / 10%) after an in-place shuffle.
    np.random.shuffle(full_dataset)
    total = len(full_dataset)
    train_idx = int(0.8 * total)
    val_idx = int(0.9 * total)
    train_data = full_dataset[:train_idx]
    val_data = full_dataset[train_idx:val_idx]
    test_data = full_dataset[val_idx:]

    # Save datasets
    splits = (
        ("train.json", train_data),
        ("validation.json", val_data),
        ("test.json", test_data),
    )
    for filename, rows in splits:
        with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
            json.dump(rows, f, indent=2)

    print("✅ Data prepared successfully!")
    print(f" - Train samples: {len(train_data)}")
    print(f" - Validation samples: {len(val_data)}")
    print(f" - Test samples: {len(test_data)}")
    print(f" - Total: {total}")
    return train_data, val_data, test_data
# Script entry point: when the file is executed directly (not imported),
# build the datasets using prepare_data's default arguments.
if __name__ == "__main__":
    prepare_data()