# gapguide-api/seed_data/role_curation.yaml
# Snapshot from commit ffd36e0 ("Deploy GapGuide backend (Docker)").
# Role curation — Round 4 (2026-04-19)
#
# Driven by REAL Pakistani job-board evidence (Glassdoor PK / Indeed PK / LinkedIn PK postings).
# Previous rounds R1-R3 included roadmap/curriculum-level fundamentals (DSA, OOP, Agile, etc.)
# that — while real interview prep concerns — are NOT what real job postings list as requirements.
# R4 removes them to match what employers actually ask for.
#
# Target: 10-18 skills per role. Mandatory tier capped at ~10.
# Total: 147 kept entries across 10 roles.
#
# RULE: A skill listed under a role's `keep` IS kept. Anything in onet_roles_raw.yaml NOT listed
# there is dropped (with the reason logged to onet_roles_curation_log.md). Each kept entry may
# override is_mandatory / required_level / weight; omit a field to keep its raw value.
#
# Sources cited in research/06-dataset-sourcing.md §10 (R4 sources subsection).
# Per-role curation table.
#
# Layout:
#   curation:
#     "<role name>":
#       keep:          list of skills retained for the role. An entry may override
#                      is_mandatory / required_level / weight; omitted fields keep
#                      their raw values.
#       drop_reasons:  mapping of skill name -> reason it is not kept for this role.
#
# Nesting is significant: `keep` and `drop_reasons` must be indented under their role,
# and each role under `curation`, otherwise the repeated keys collide.
curation:
  "Data Scientist":
    keep:
      - { name: "Python" }
      - { name: "SQL" }
      - { name: "Pandas" }
      - { name: "NumPy" }
      - { name: "Scikit-learn" }
      - { name: "PyTorch" }
      - { name: "Statistical Hypothesis Testing" }
      - { name: "Git" }
      - { name: "TensorFlow", is_mandatory: false, weight: 0.9 }
      - { name: "AWS", is_mandatory: false, weight: 0.8 }
      - { name: "Tableau", is_mandatory: false, weight: 0.8 }
      - { name: "Apache Spark", is_mandatory: false, weight: 0.7 }
      - { name: "Feature Engineering" }
      - { name: "Microsoft Power BI", is_mandatory: false, weight: 0.6 }
      - { name: "Matplotlib" }
      - { name: "Docker", is_mandatory: false, weight: 0.6 }  # MLOps overlap
    drop_reasons:
      "C++": "Low-level numerics only; not in PK DS postings"
      "SAS": "Enterprise legacy"
      "MATLAB": "Academic only"
      "Java": "Rare in DS workflow"
      "Microsoft Azure": "Pick one cloud — AWS retained as primary"
      "Apache Hadoop": "Spark replaced it"
      "Microsoft PowerPoint": "Presentation tool, not measurable"
      "R": "R4: PK DS market is Python+SQL; R rarely required (Glassdoor PK Data Scientist evidence)"
      "Microsoft Excel": "R4: not core for DS (Excel is for DA/BI roles)"
      "Snowflake": "R4: warehouse skill — Data Engineer territory"
      "Jupyter Notebook": "R4: assumed knowledge, not learnable on slider"
      "Seaborn": "R4: covered by Matplotlib retention"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement (per job-board evidence)"

  "Data Analyst":
    keep:
      - { name: "SQL" }
      - { name: "Microsoft Excel", required_level: "ADVANCED" }  # R3: collapsed Excel + Excel Advanced
      - { name: "Microsoft Power BI" }
      - { name: "Python", is_mandatory: true, weight: 0.9 }
      - { name: "Tableau", is_mandatory: false, weight: 0.8 }
      - { name: "Pandas", is_mandatory: false, weight: 0.7 }
      - { name: "DAX", is_mandatory: false, weight: 0.7 }  # for Power BI users
      - { name: "Power Query / M Language", is_mandatory: false, weight: 0.6 }
      - { name: "Data Visualization" }
      - { name: "Statistical Hypothesis Testing", is_mandatory: false, weight: 0.5 }
    drop_reasons:
      "IBM SPSS Statistics": "Enterprise legacy"
      "Microsoft Access": "Legacy desktop DB"
      "Microsoft Office": "Too generic"
      "Microsoft PowerPoint": "Presentation tool"
      "SAS": "Enterprise legacy; PK market is Python/SQL"
      "AWS": "Junior DA in PK rarely owns cloud infra"
      "Microsoft Excel Advanced": "R3: collapsed into Microsoft Excel with required_level=ADVANCED"
      "R": "R4: PK DA postings rarely list R; demoted then dropped"

  "Machine Learning Engineer":
    keep:
      - { name: "Python" }
      - { name: "PyTorch" }
      - { name: "Scikit-learn" }
      - { name: "SQL" }
      - { name: "Docker" }
      - { name: "AWS" }
      - { name: "Pandas" }
      - { name: "NumPy" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "TensorFlow", is_mandatory: false, weight: 0.9 }
      - { name: "MLflow" }
      - { name: "Kubernetes", is_mandatory: false, weight: 0.7 }
      - { name: "FastAPI" }
      - { name: "Apache Spark", is_mandatory: false, weight: 0.7 }
      - { name: "Linux", is_mandatory: false, weight: 0.6 }
    drop_reasons:
      "C++": "Niche optimization only"
      "R": "Statistics-heavy — DS territory"
      "SAS": "Enterprise legacy"
      "MATLAB": "Academic only"
      "Microsoft Excel": "Not core to ML eng"
      "Microsoft PowerPoint": "Presentation tool"
      "Microsoft Power BI": "BI tool"
      "Tableau": "BI tool"
      "Snowflake": "Data Engineer territory"
      "Apache Hadoop": "Spark replaced it"
      "Java": "Rare in ML pipelines"
      "Microsoft Azure": "Pick one cloud — AWS retained"
      "Ansible": "DevOps territory"
      "C": "Legacy systems language"
      "Go": "Rare in ML pipelines"
      "Terraform": "DevOps territory"
      "JavaScript": "Frontend, not ML"
      "Jenkins": "Replaced by GitHub Actions; not ML-eng-specific"
      "NoSQL": "Generic"
      "Scala": "Spark+Scala niche, declining"
      "Splunk Enterprise": "Observability niche"
      "ONNX": "R4: niche optimization, not in PK ML postings"
      "Bash": "R4: assumed background skill"
      "Object-Oriented Programming": "R4: not in postings (interview prep)"
      "Data Structures & Algorithms": "R4: not in postings (interview prep)"
      "Agile / Scrum": "R4: rarely in ML eng postings"

  "Backend Developer":
    keep:
      - { name: "Python" }
      - { name: "Django" }
      - { name: "Node.js" }
      - { name: "PostgreSQL" }
      - { name: "SQL" }
      - { name: "REST API Design" }
      - { name: "Docker" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "JWT / OAuth", is_mandatory: true, weight: 0.8 }
      - { name: "FastAPI", is_mandatory: false, weight: 0.8 }
      - { name: "TypeScript", is_mandatory: false, weight: 0.7 }
      - { name: "AWS", is_mandatory: false, weight: 0.7 }
      - { name: "NoSQL", is_mandatory: false, weight: 0.6 }  # MongoDB-equivalent
      - { name: "Laravel" }
      - { name: "PHP" }
    drop_reasons:
      "C": "Not Python/Node-stack backend"
      "C#": "Microsoft .NET stack — separate market"
      "C++": "Game/systems, not web backend"
      "CSS": "Frontend"
      "HTML": "Frontend"
      "XML": "Legacy"
      "JSON": "Assumed knowledge"
      ".NET": "Microsoft enterprise stack — separate market"
      "Java": "Spring stack — separate market"
      "Spring Boot": "Java stack"
      "Spring Framework": "Java stack"
      "Angular": "Frontend"
      "React": "Frontend"
      "Apache Kafka": "Data Engineer territory"
      "Terraform": "DevOps territory"
      "Microsoft Azure": "Pick one cloud — AWS retained"
      "Jenkins": "DevOps territory"
      "Apache Spark": "Data Eng territory"
      "Jira": "PM tool"
      "Flask": "R4: declining; FastAPI/Django dominate PK postings"
      "Redis": "R4: rarely listed by name in PK postings (assumed within Postgres/MongoDB choice)"
      "Linux": "R4: assumed background"
      "Kubernetes": "R4: senior-only in PK; Docker covers entry-level"
      "pytest": "R4: rarely listed by name in postings"
      "Celery": "R4: rarely listed by name"
      "nginx": "R4: rarely listed by name"
      "Pydantic": "R4: rarely listed by name (implicit with FastAPI)"
      "Object-Oriented Programming": "R4: assumed for interviews, not posting requirement"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement"
      "Agile / Scrum": "R4: not in posting requirement lists"
      "CI/CD Pipelines": "R4: not as concept skill in postings"
      "GitHub Actions": "R4: rare standalone in PK BE postings"
      ".NET / C#": "R4: separate Pakistani market, deserves its own role tier"
      "Unit Testing": "R4: not in posting requirement lists"

  "Frontend Developer":
    keep:
      - { name: "React" }
      - { name: "JavaScript" }
      - { name: "TypeScript" }
      - { name: "HTML" }
      - { name: "CSS" }
      - { name: "REST API Design" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "Next.js" }
      - { name: "Tailwind CSS", is_mandatory: false, weight: 0.8 }
      - { name: "Redux" }  # R4: explicitly cited in PK MERN postings
      - { name: "Node.js", is_mandatory: false, weight: 0.7 }
      - { name: "Vite", is_mandatory: false, weight: 0.5 }
      - { name: "TanStack Query", is_mandatory: false, weight: 0.4 }
    drop_reasons:
      "C#": "Backend"
      "C++": "Not FE"
      "Java": "Backend"
      "PHP": "Backend (separate Full Stack stack)"
      "Python": "Backend"
      "Go": "Not FE"
      "SQL": "Backend/data"
      "NoSQL": "Backend/data"
      "MongoDB": "Backend/data"
      "MySQL": "Backend/data"
      "PostgreSQL": "Backend/data"
      "Apache Kafka": "Data Eng"
      "Kubernetes": "DevOps territory"
      "Microsoft Azure": "DevOps territory"
      "AWS": "DevOps territory"
      "Spring Boot": "Java backend"
      "Spring Framework": "Java backend"
      "Jenkins": "DevOps"
      "jQuery": "Legacy"
      "WordPress": "CMS — separate small-agency market"
      "JSON": "Assumed"
      "Jira": "PM tool"
      "Vue.js": "R4: separate role market in PK; React dominates"
      "Angular": "R4: separate role market in PK"
      "Vitest": "R4: testing rarely in PK FE postings"
      "Playwright": "R4: testing rarely in PK FE postings"
      "Docker": "R4: rarely listed for FE-only roles"
      "Web Accessibility (a11y)": "R4: rarely a posted requirement in PK"
      "Responsive Design": "R4: assumed within React+Tailwind work"
      "GraphQL": "R4: rare in PK FE postings"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement"
      "Agile / Scrum": "R4: not in FE posting requirement lists"

  "Full Stack Developer":
    keep:
      - { name: "JavaScript" }
      - { name: "TypeScript" }
      - { name: "React" }
      - { name: "Node.js" }
      - { name: "Python" }
      - { name: "PostgreSQL" }
      - { name: "REST API Design" }
      - { name: "SQL" }
      - { name: "Docker" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "HTML", is_mandatory: true, weight: 0.9 }
      - { name: "CSS", is_mandatory: true, weight: 0.9 }
      - { name: "Next.js" }
      - { name: "Django", is_mandatory: false, weight: 0.8 }
      - { name: "MongoDB", is_mandatory: false, weight: 0.8 }
      - { name: "Tailwind CSS", is_mandatory: false, weight: 0.7 }
      - { name: "AWS", is_mandatory: false, weight: 0.7 }
    drop_reasons:
      "C": "Not web full-stack"
      "C#": ".NET niche — separate stack"
      "C++": "Not web full-stack"
      "XML": "Legacy"
      "JSON": "Assumed knowledge"
      ".NET": ".NET niche — separate stack"
      "Java": "Spring niche — separate stack"
      "Spring Boot": "Java stack"
      "Spring Framework": "Java stack"
      "Apache Kafka": "Data Eng"
      "Terraform": "DevOps"
      "Microsoft Azure": "Pick one cloud — AWS retained"
      "MySQL": "PostgreSQL covers SQL needs"
      "Angular": "Pick one FE framework — React retained"
      "Vue.js": "Pick one FE framework — React retained"
      "Prisma": "R3: Python client archived 2025-04, community-only"
      "Go": "Niche for full-stack web in PK"
      "PHP": "R4: separate Laravel/PHP stack market — Backend covers"
      "Laravel": "R4: separate Laravel/PHP stack market — Backend covers"
      "Jenkins": "DevOps"
      "jQuery": "Legacy"
      "WordPress": "CMS — separate market"
      "Jira": "PM tool"
      "GraphQL": "R4: rare in PK FS postings"
      "Linux": "R4: assumed background"
      "Vite": "R4: assumed within Next.js/React tooling"
      "Vitest": "R4: testing rarely in PK postings"
      "JWT / OAuth": "R4: covered by Backend role; FS focuses on stack"
      "NoSQL": "Generic — MongoDB covers it"
      "Docker Compose": "Reviewer R2: redundant with Docker"
      "Object-Oriented Programming": "R4: not in posting requirement lists"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement"
      "Agile / Scrum": "R4: not in posting requirement lists"
      "CI/CD Pipelines": "R4: not as concept skill in PK FS postings"
      "GitHub Actions": "R4: rare standalone for FS"

  "Data Engineer":
    keep:
      - { name: "Python" }
      - { name: "SQL" }
      - { name: "Apache Spark" }
      - { name: "Apache Airflow" }
      - { name: "ETL / ELT" }
      - { name: "AWS" }
      - { name: "Git" }
      - { name: "Snowflake", is_mandatory: false, weight: 0.8 }
      - { name: "dbt", is_mandatory: false, weight: 0.8 }
      - { name: "Apache Kafka" }
      - { name: "Google BigQuery" }
      - { name: "Docker", is_mandatory: false, weight: 0.7 }
      - { name: "Microsoft Azure", is_mandatory: false, weight: 0.6 }
      - { name: "Data Modeling", is_mandatory: false, weight: 0.6 }
    drop_reasons:
      "Microsoft Access": "Legacy desktop DB"
      "Microsoft Office": "Too generic"
      "Microsoft Outlook": "Email client"
      "Microsoft PowerPoint": "Presentation tool"
      "Microsoft Power BI": "BI Analyst territory"
      "Tableau": "BI Analyst territory"
      "R": "BI/stats territory; Python+SQL covers DE"
      "Data Modeling (Kimball)": "R3: renamed to plain 'Data Modeling'"
      "Apache Iceberg": "R4: rare in PK entry-level DE postings; Delta Lake (also dropped) was the safer pick but neither needed"
      "Delta Lake": "R4: rare in PK entry-level DE postings"
      "Databricks": "R4: senior-only in PK postings"
      "Parquet": "R4: rarely listed by name in postings (implicit with Spark/Airflow)"
      "Java": "R4: declining for DE; Python+Scala dominate"
      "Microsoft Excel": "R4: rare for DE postings"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement"

  "DevOps Engineer":
    keep:
      - { name: "Docker" }
      - { name: "Kubernetes" }
      - { name: "Terraform" }
      - { name: "AWS" }
      - { name: "Linux" }
      - { name: "Bash Scripting" }
      - { name: "CI/CD Pipelines" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "Python", is_mandatory: true, weight: 0.8 }
      - { name: "GitHub Actions", is_mandatory: false, weight: 0.8 }
      - { name: "Microsoft Azure", is_mandatory: false, weight: 0.7 }
      - { name: "Ansible", is_mandatory: false, weight: 0.7 }
      - { name: "Prometheus" }
      - { name: "Grafana" }
      - { name: "Jenkins", is_mandatory: false, weight: 0.5 }
    drop_reasons:
      "Bash": "Duplicate of Bash Scripting"
      "Amazon Web Services AWS CloudFormation": "Terraform won"
      "C": "Not DevOps day-to-day"
      "C++": "Not DevOps day-to-day"
      "Java": "Rare in modern DevOps"
      "JavaScript": "Not DevOps"
      "Go": "K8s/Terraform internals only"
      "Microsoft Active Directory": "Windows enterprise niche"
      "Microsoft PowerShell": "Windows admin only"
      "Linux Administration": "R2: duplicate of Linux"
      "Jira": "R2: PM tool consistency"
      "ArgoCD": "R4: senior-only in PK postings"
      "Helm": "R4: senior-only in PK postings"
      "Computer Networking Fundamentals": "R4: assumed background, not in posting requirement lists"
      "Google Cloud Platform": "R4: rare in PK DevOps postings (AWS/Azure dominate)"
      "SQL": "R4: debugging-only, not core skill"

  "AI Engineer (GenAI / LLM)":
    keep:
      - { name: "Python" }
      - { name: "HuggingFace Transformers" }
      - { name: "LangChain" }
      - { name: "OpenAI API" }
      # R4: renamed from "Retrieval-Augmented Generation (RAG)" to canonical job-posting term
      - { name: "RAG" }
      - { name: "Prompt Engineering" }
      - { name: "PyTorch" }
      - { name: "Docker" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "AWS", is_mandatory: false, weight: 0.8 }
      - { name: "Microsoft Azure", is_mandatory: false, weight: 0.7 }
      - { name: "Anthropic Claude API" }
      - { name: "pgvector" }
      - { name: "FastAPI" }
      - { name: "LangGraph", is_mandatory: false, weight: 0.6 }
      - { name: "Sentence Transformers", is_mandatory: false, weight: 0.6 }
    drop_reasons:
      "C": "Legacy"
      "C++": "Niche"
      "Bash": "Background"
      "Go": "Rare in AI eng"
      "Scala": "Big data, not LLM eng"
      "Java": "Rare in AI eng"
      "R": "Statistics, not LLM eng"
      "JavaScript": "Not AI eng"
      "Apache Hadoop": "Big data — Data Eng"
      "Apache Spark": "Big data — Data Eng"
      "Ansible": "DevOps"
      "Terraform": "DevOps"
      "Jenkins": "DevOps"
      "Linux": "Background"
      "NoSQL": "Generic — vector DBs are the relevant DB type"
      "Splunk Enterprise": "Observability niche"
      "TensorFlow": "PyTorch dominates research/LLM"
      "Retrieval-Augmented Generation (RAG)": "R4: renamed to canonical 'RAG'"
      "Fine-tuning (LoRA/PEFT)": "R3: renamed to 'LLM Fine-tuning'; R4: dropped entirely (rare in postings)"
      "LlamaIndex": "R4: overlaps LangChain; rarely both required in PK postings"
      "Pinecone": "R4: open-source vector DBs (pgvector, Chroma) winning"
      "LLM Evaluations": "R4: concept rarely listed in postings"
      "LLM Observability": "R4: concept rarely listed in postings"
      "LLM Fine-tuning": "R4: rare in entry-level PK postings"
      "Kubernetes": "R4: senior-only in PK AI eng postings"
      "SQL": "R4: rare for AI Eng day-to-day"
      "Object-Oriented Programming": "R4: interview prep"
      "Data Structures & Algorithms": "R4: interview prep"

  "Business Intelligence Analyst":
    keep:
      - { name: "Microsoft Power BI" }
      - { name: "DAX" }
      - { name: "SQL" }
      - { name: "Microsoft Excel", required_level: "ADVANCED" }
      - { name: "Power Query / M Language", is_mandatory: true, weight: 0.9 }
      - { name: "Tableau", is_mandatory: false, weight: 0.7 }
      - { name: "Snowflake", is_mandatory: false, weight: 0.7 }
      - { name: "Python", is_mandatory: false, weight: 0.6 }
      - { name: "Star Schema Modeling" }
      - { name: "Data Storytelling" }
    drop_reasons:
      "Microsoft Office": "Too generic — Microsoft Excel covers it"
      "Microsoft PowerPoint": "Presentation, not BI engineering"
      "Oracle Cloud": "Enterprise niche"
      "SAP": "ERP — niche"
      "SAS": "Enterprise legacy"
      "AWS": "BI Analyst consumes from cloud DWH, doesn't own infra"
      "Microsoft Azure": "Same as AWS — BI Analyst doesn't own cloud infra"
      "Microsoft Excel Advanced": "R3: collapsed into Microsoft Excel with required_level=ADVANCED"
      "Looker": "R4: rare in PK BI postings (Power BI/Tableau dominate)"
      "dbt": "R4: more for Data Eng; rarely in PK BI postings"
      "R": "R4: PK BI market is Power BI+SQL+Excel; R rarely required"