jobopportunity / generate_dataset.py
sivan26's picture
Rename jobs.csv to generate_dataset.py
82cf82c verified
---
## **4. Dataset — data/jobs.csv**
You’ll need to create a folder called `data` in your Space and inside it a `jobs.csv` file.
Because the dataset is large (1,230 rows: 41 roles × 30 postings each), use the **Python script** below to generate it inside your Hugging Face Space instead of pasting every row by hand:
Create a file named **generate_dataset.py** in the Space with the following contents, then run it once to write `data/jobs.csv`:
```python
import os, random, pandas as pd
# Catalogue of industries. Each entry maps an industry name to:
#   "roles"  - the job titles that exist in that industry
#   "skills" - the pool of skills associated with that industry
industries = {
    "Software": {
        "roles": [
            "Backend Engineer",
            "Frontend Engineer",
            "Full-Stack Developer",
            "ML Engineer",
            "Data Engineer",
            "QA Engineer",
            "DevOps Engineer",
            "Product Manager",
        ],
        "skills": [
            "Python", "JavaScript", "TypeScript", "React", "Node.js",
            "Django", "Flask", "REST APIs", "GraphQL", "SQL",
            "NoSQL", "Docker", "Kubernetes", "CI/CD", "Git",
            "AWS", "GCP", "Unit Testing", "Agile", "Scrum",
        ],
    },
    "Data": {
        "roles": [
            "Data Scientist",
            "Data Analyst",
            "Analytics Engineer",
            "BI Developer",
            "NLP Engineer",
            "Computer Vision Engineer",
        ],
        "skills": [
            "Python", "Pandas", "NumPy", "Scikit-learn", "TensorFlow",
            "PyTorch", "XGBoost", "Statistics", "A/B Testing", "SQL",
            "Data Visualization", "Tableau", "Power BI",
            "Feature Engineering", "Model Deployment", "Airflow", "MLflow",
        ],
    },
    "Cybersecurity": {
        "roles": [
            "Security Analyst",
            "SOC Analyst",
            "Security Engineer",
            "GRC Specialist",
            "Penetration Tester",
        ],
        "skills": [
            "Network Security", "SIEM", "IDS/IPS", "Vulnerability Management",
            "Python", "Linux", "Windows", "Cloud Security",
            "Risk Assessment", "Incident Response", "OWASP", "Threat Modeling",
        ],
    },
    "Finance": {
        "roles": [
            "Financial Analyst",
            "Risk Analyst",
            "Quantitative Analyst",
            "Portfolio Analyst",
            "FinOps Specialist",
        ],
        "skills": [
            "Excel", "Financial Modeling", "Valuation", "SQL",
            "Power BI", "Statistics", "Python", "R",
            "Forecasting", "Accounting", "Risk Management",
            "Regulatory Compliance",
        ],
    },
    "Marketing": {
        "roles": [
            "Digital Marketing Specialist",
            "SEO Specialist",
            "Content Strategist",
            "Growth Marketer",
            "Marketing Analyst",
        ],
        "skills": [
            "SEO", "SEM", "Google Analytics", "Content Marketing",
            "Copywriting", "Email Marketing", "Social Media", "A/B Testing",
            "SQL", "Excel", "NoCode", "Canva",
        ],
    },
    "Sales": {
        "roles": [
            "Sales Development Rep",
            "Account Executive",
            "Sales Operations Analyst",
            "Customer Success Manager",
        ],
        "skills": [
            "CRM", "Salesforce", "HubSpot", "Prospecting", "Negotiation",
            "Communication", "Pipeline Management", "Excel", "Presentation",
            "Customer Empathy",
        ],
    },
    "Design": {
        "roles": [
            "Product Designer",
            "UX Researcher",
            "UI Designer",
            "Design Technologist",
        ],
        "skills": [
            "Figma", "Prototyping", "User Research", "Wireframing",
            "Design Systems", "Accessibility", "HTML", "CSS", "JavaScript",
            "Usability Testing",
        ],
    },
    "Operations": {
        "roles": [
            "Operations Analyst",
            "Project Manager",
            "Program Coordinator",
            "Supply Chain Analyst",
        ],
        "skills": [
            "Project Management", "Scrum", "Kanban", "Excel", "SQL",
            "Process Improvement", "Stakeholder Management",
            "Risk Management", "Presentation",
        ],
    },
}

# Seniority labels, listed from Intern up to Lead.
levels = [
    "Intern",
    "Junior",
    "Mid",
    "Senior",
    "Lead",
]

# Work-arrangement / region labels a posting can carry.
locations = [
    "Remote",
    "Hybrid - US",
    "Hybrid - EU",
    "Onsite - US",
    "Onsite - EU",
    "Onsite - Asia",
]
def synthesize_description(role, industry, skills, *, max_highlights=5):
    """Return a short synthetic job description for *role* in *industry*.

    The text is a fixed responsibilities sentence followed by a
    "Preferred skills" sentence listing skills sampled at random (without
    replacement) from *skills* using the module-level ``random`` generator.

    Args:
        role: Job title placed at the start of the description.
        industry: Industry name placed after the role.
        skills: Sequence of skill names to sample highlights from.
        max_highlights: Upper bound on how many skills are listed. Fewer are
            listed when *skills* is shorter; values below zero are treated
            as zero.

    Returns:
        The description string. When no skill is sampled (empty *skills* or
        ``max_highlights <= 0``) the "Preferred skills" sentence is left out
        instead of being emitted with an empty list.
    """
    # Clamp so random.sample never receives a negative or oversized k.
    count = max(0, min(max_highlights, len(skills)))
    highlights = random.sample(skills, k=count)

    summary = (
        f"{role} in {industry}. Responsibilities include building and improving systems, "
        "collaborating cross-functionally, and delivering measurable outcomes."
    )
    if not highlights:
        return summary
    return f"{summary} Preferred skills: {', '.join(highlights)}."
# ---------------------------------------------------------------------------
# Generate the synthetic postings and export them to data/jobs.csv.
# ---------------------------------------------------------------------------
random.seed(7)  # fixed seed so every run draws the same random sequence

rows = []
job_id = 1000  # bumped before each use, so the first posting gets id 1001

for industry, detail in industries.items():
    skill_pool = detail["skills"]
    for role in detail["roles"]:
        # Thirty postings are produced for every role.
        for _ in range(30):
            job_id += 1

            # The random draws below stay in this exact order: each one
            # advances the shared generator, so reordering them would
            # change the generated data.
            # Weights pair positionally with `levels` (the middle entry is
            # the most likely, the first and last the least).
            level = random.choices(levels, weights=[1, 2, 4, 2, 1], k=1)[0]
            loc = random.choice(locations)

            # Lower bound is a whole number of thousands in 40k..120k; the
            # upper bound sits 10k..80k above it.
            salary_low = 1000 * random.randint(40, 120)
            salary_high = salary_low + 1000 * random.randint(10, 80)

            # Up to seven skills, de-duplicated and listed alphabetically.
            picked = random.sample(skill_pool, k=min(7, len(skill_pool)))
            req_skills = ", ".join(sorted(set(picked)))

            title = f"{level} {role}"
            desc = synthesize_description(role, industry, skill_pool)

            rows.append(
                dict(
                    job_id=job_id,
                    title=title,
                    industry=industry,
                    level=level,
                    location=loc,
                    required_skills=req_skills,
                    description=desc,
                    salary_range_usd=f"{salary_low}-{salary_high}",
                )
            )

# Shuffle all rows (fixed random_state) and renumber the index from zero.
df = (
    pd.DataFrame(rows)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

# Create the output folder if it does not exist yet, then write the CSV
# without the DataFrame index column.
os.makedirs("data", exist_ok=True)
df.to_csv("data/jobs.csv", index=False)
print(f"Generated {len(df)} jobs in data/jobs.csv")