Spaces:
Build error
Build error
| --- | |
| ## **4. Dataset — data/jobs.csv** | |
| Your Space needs a folder called `data` containing a `jobs.csv` file; the script below creates both for you. | |
| Because the dataset is large (1,230 rows: 41 roles × 30 postings each), here is a **Python script** that generates it directly inside your Hugging Face Space, so there is no need to paste every row here: | |
| Create a file named **generate_dataset.py** in the Space: | |
| ```python | |
import os
import random

import pandas as pd

# Catalogue of industries. Each entry lists the job roles offered in that
# industry ("roles") and the pool of skills its postings draw from ("skills").
industries = {
    "Software": {
        "roles": [
            "Backend Engineer", "Frontend Engineer", "Full-Stack Developer",
            "ML Engineer", "Data Engineer", "QA Engineer", "DevOps Engineer",
            "Product Manager",
        ],
        "skills": [
            "Python", "JavaScript", "TypeScript", "React", "Node.js", "Django",
            "Flask", "REST APIs", "GraphQL", "SQL", "NoSQL", "Docker",
            "Kubernetes", "CI/CD", "Git", "AWS", "GCP", "Unit Testing", "Agile",
            "Scrum",
        ],
    },
    "Data": {
        "roles": [
            "Data Scientist", "Data Analyst", "Analytics Engineer",
            "BI Developer", "NLP Engineer", "Computer Vision Engineer",
        ],
        "skills": [
            "Python", "Pandas", "NumPy", "Scikit-learn", "TensorFlow", "PyTorch",
            "XGBoost", "Statistics", "A/B Testing", "SQL", "Data Visualization",
            "Tableau", "Power BI", "Feature Engineering", "Model Deployment",
            "Airflow", "MLflow",
        ],
    },
    "Cybersecurity": {
        "roles": [
            "Security Analyst", "SOC Analyst", "Security Engineer",
            "GRC Specialist", "Penetration Tester",
        ],
        "skills": [
            "Network Security", "SIEM", "IDS/IPS", "Vulnerability Management",
            "Python", "Linux", "Windows", "Cloud Security", "Risk Assessment",
            "Incident Response", "OWASP", "Threat Modeling",
        ],
    },
    "Finance": {
        "roles": [
            "Financial Analyst", "Risk Analyst", "Quantitative Analyst",
            "Portfolio Analyst", "FinOps Specialist",
        ],
        "skills": [
            "Excel", "Financial Modeling", "Valuation", "SQL", "Power BI",
            "Statistics", "Python", "R", "Forecasting", "Accounting",
            "Risk Management", "Regulatory Compliance",
        ],
    },
    "Marketing": {
        "roles": [
            "Digital Marketing Specialist", "SEO Specialist",
            "Content Strategist", "Growth Marketer", "Marketing Analyst",
        ],
        "skills": [
            "SEO", "SEM", "Google Analytics", "Content Marketing", "Copywriting",
            "Email Marketing", "Social Media", "A/B Testing", "SQL", "Excel",
            "NoCode", "Canva",
        ],
    },
    "Sales": {
        "roles": [
            "Sales Development Rep", "Account Executive",
            "Sales Operations Analyst", "Customer Success Manager",
        ],
        "skills": [
            "CRM", "Salesforce", "HubSpot", "Prospecting", "Negotiation",
            "Communication", "Pipeline Management", "Excel", "Presentation",
            "Customer Empathy",
        ],
    },
    "Design": {
        "roles": [
            "Product Designer", "UX Researcher", "UI Designer",
            "Design Technologist",
        ],
        "skills": [
            "Figma", "Prototyping", "User Research", "Wireframing",
            "Design Systems", "Accessibility", "HTML", "CSS", "JavaScript",
            "Usability Testing",
        ],
    },
    "Operations": {
        "roles": [
            "Operations Analyst", "Project Manager", "Program Coordinator",
            "Supply Chain Analyst",
        ],
        "skills": [
            "Project Management", "Scrum", "Kanban", "Excel", "SQL",
            "Process Improvement", "Stakeholder Management", "Risk Management",
            "Presentation",
        ],
    },
}

# Seniority levels a posting can carry, in ascending order.
levels = ["Intern", "Junior", "Mid", "Senior", "Lead"]

# Work arrangements / regions a posting can be assigned to.
locations = [
    "Remote",
    "Hybrid - US",
    "Hybrid - EU",
    "Onsite - US",
    "Onsite - EU",
    "Onsite - Asia",
]
def synthesize_description(role, industry, skills):
    """Build a short synthetic job description for one posting.

    The text opens with "<role> in <industry>." and a fixed responsibilities
    sentence, then lists up to five entries drawn at random from ``skills``
    (fewer when the pool is smaller) as the preferred skills.
    """
    sample_size = min(5, len(skills))
    preferred = ", ".join(random.sample(skills, k=sample_size))
    intro = (
        f"{role} in {industry}. Responsibilities include building and improving "
        "systems, collaborating cross-functionally, and delivering measurable "
        "outcomes. "
    )
    return f"{intro}Preferred skills: {preferred}."
# Fixed seed so every run of the script reproduces the same postings.
random.seed(7)

rows = []
job_id = 1000  # incremented before use, so the first posting gets id 1001

# Emit 30 postings for every (industry, role) pair.
for industry, detail in industries.items():
    skill_pool = detail["skills"]
    required_count = min(7, len(skill_pool))
    for role in detail["roles"]:
        for _ in range(30):
            job_id += 1
            # NOTE: the order of the random draws below determines the seeded
            # output; keep it stable.
            (level,) = random.choices(levels, weights=[1, 2, 4, 2, 1])
            loc = random.choice(locations)
            salary_low = 1000 * random.randint(40, 120)
            salary_high = salary_low + 1000 * random.randint(10, 80)
            picked = random.sample(skill_pool, k=required_count)
            req_skills = ", ".join(sorted(set(picked)))
            desc = synthesize_description(role, industry, skill_pool)
            record = {
                "job_id": job_id,
                "title": f"{level} {role}",
                "industry": industry,
                "level": level,
                "location": loc,
                "required_skills": req_skills,
                "description": desc,
                "salary_range_usd": f"{salary_low}-{salary_high}",
            }
            rows.append(record)

# Shuffle the rows (deterministically) so postings are not grouped by industry.
shuffled = pd.DataFrame(rows).sample(frac=1.0, random_state=42)
df = shuffled.reset_index(drop=True)

# Write the dataset, creating the output folder if it does not exist yet.
os.makedirs("data", exist_ok=True)
df.to_csv("data/jobs.csv", index=False)
print(f"Generated {len(df)} jobs in data/jobs.csv")