# Role curation — Round 4 (2026-04-19)
#
# Driven by REAL Pakistani job-board evidence (Glassdoor PK / Indeed PK / LinkedIn PK postings).
# Previous rounds R1-R3 included roadmap/curriculum-level fundamentals (DSA, OOP, Agile, etc.)
# that — while real interview prep concerns — are NOT what real job postings list as requirements.
# R4 removes them to match what employers actually ask for.
#
# Target: 10-18 skills per role. Mandatory tier capped at ~10.
# Total: ~146 entries across 10 roles.
#
# RULE: A skill in this list IS kept. Anything in onet_roles_raw.yaml NOT listed here is dropped
# (with reason logged to onet_roles_curation_log.md). Each entry can override is_mandatory /
# required_level / weight. Omit to keep raw values.
#
# Sources cited in research/06-dataset-sourcing.md §10 (R4 sources subsection).
#
# Layout: `curation` maps a role title to two children:
#   keep         - list of skills to retain; each item carries `name` plus any optional overrides
#   drop_reasons - map of skill name -> short reason recorded for that role
---
curation:
  "Data Scientist":
    keep:
      - { name: "Python" }
      - { name: "SQL" }
      - { name: "Pandas" }
      - { name: "NumPy" }
      - { name: "Scikit-learn" }
      - { name: "PyTorch" }
      - { name: "Statistical Hypothesis Testing" }
      - { name: "Git" }
      - { name: "TensorFlow", is_mandatory: false, weight: 0.9 }
      - { name: "AWS", is_mandatory: false, weight: 0.8 }
      - { name: "Tableau", is_mandatory: false, weight: 0.8 }
      - { name: "Apache Spark", is_mandatory: false, weight: 0.7 }
      - { name: "Feature Engineering" }
      - { name: "Microsoft Power BI", is_mandatory: false, weight: 0.6 }
      - { name: "Matplotlib" }
      - { name: "Docker", is_mandatory: false, weight: 0.6 }  # MLOps overlap
    drop_reasons:
      "C++": "Low-level numerics only; not in PK DS postings"
      "SAS": "Enterprise legacy"
      "MATLAB": "Academic only"
      "Java": "Rare in DS workflow"
      "Microsoft Azure": "Pick one cloud — AWS retained as primary"
      "Apache Hadoop": "Spark replaced it"
      "Microsoft PowerPoint": "Presentation tool, not measurable"
      "R": "R4: PK DS market is Python+SQL; R rarely required (Glassdoor PK Data Scientist evidence)"
      "Microsoft Excel": "R4: not core for DS (Excel is for DA/BI roles)"
      "Snowflake": "R4: warehouse skill — Data Engineer territory"
      "Jupyter Notebook": "R4: assumed knowledge, not learnable on slider"
      "Seaborn": "R4: covered by Matplotlib retention"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement (per job-board evidence)"

  "Data Analyst":
    keep:
      - { name: "SQL" }
      - { name: "Microsoft Excel", required_level: "ADVANCED" }  # R3: collapsed Excel + Excel Advanced
      - { name: "Microsoft Power BI" }
      - { name: "Python", is_mandatory: true, weight: 0.9 }
      - { name: "Tableau", is_mandatory: false, weight: 0.8 }
      - { name: "Pandas", is_mandatory: false, weight: 0.7 }
      - { name: "DAX", is_mandatory: false, weight: 0.7 }  # for Power BI users
      - { name: "Power Query / M Language", is_mandatory: false, weight: 0.6 }
      - { name: "Data Visualization" }
      - { name: "Statistical Hypothesis Testing", is_mandatory: false, weight: 0.5 }
    drop_reasons:
      "IBM SPSS Statistics": "Enterprise legacy"
      "Microsoft Access": "Legacy desktop DB"
      "Microsoft Office": "Too generic"
      "Microsoft PowerPoint": "Presentation tool"
      "SAS": "Enterprise legacy; PK market is Python/SQL"
      "AWS": "Junior DA in PK rarely owns cloud infra"
      "Microsoft Excel Advanced": "R3: collapsed into Microsoft Excel with required_level=ADVANCED"
      "R": "R4: PK DA postings rarely list R; demoted then dropped"

  "Machine Learning Engineer":
    keep:
      - { name: "Python" }
      - { name: "PyTorch" }
      - { name: "Scikit-learn" }
      - { name: "SQL" }
      - { name: "Docker" }
      - { name: "AWS" }
      - { name: "Pandas" }
      - { name: "NumPy" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "TensorFlow", is_mandatory: false, weight: 0.9 }
      - { name: "MLflow" }
      - { name: "Kubernetes", is_mandatory: false, weight: 0.7 }
      - { name: "FastAPI" }
      - { name: "Apache Spark", is_mandatory: false, weight: 0.7 }
      - { name: "Linux", is_mandatory: false, weight: 0.6 }
    drop_reasons:
      "C++": "Niche optimization only"
      "R": "Statistics-heavy — DS territory"
      "SAS": "Enterprise legacy"
      "MATLAB": "Academic only"
      "Microsoft Excel": "Not core to ML eng"
      "Microsoft PowerPoint": "Presentation tool"
      "Microsoft Power BI": "BI tool"
      "Tableau": "BI tool"
      "Snowflake": "Data Engineer territory"
      "Apache Hadoop": "Spark replaced it"
      "Java": "Rare in ML pipelines"
      "Microsoft Azure": "Pick one cloud — AWS retained"
      "Ansible": "DevOps territory"
      "C": "Legacy systems language"
      "Go": "Rare in ML pipelines"
      "Terraform": "DevOps territory"
      "JavaScript": "Frontend, not ML"
      "Jenkins": "Replaced by GitHub Actions; not ML-eng-specific"
      "NoSQL": "Generic"
      "Scala": "Spark+Scala niche, declining"
      "Splunk Enterprise": "Observability niche"
      "ONNX": "R4: niche optimization, not in PK ML postings"
      "Bash": "R4: assumed background skill"
      "Object-Oriented Programming": "R4: not in postings (interview prep)"
      "Data Structures & Algorithms": "R4: not in postings (interview prep)"
      "Agile / Scrum": "R4: rarely in ML eng postings"

  "Backend Developer":
    keep:
      - { name: "Python" }
      - { name: "Django" }
      - { name: "Node.js" }
      - { name: "PostgreSQL" }
      - { name: "SQL" }
      - { name: "REST API Design" }
      - { name: "Docker" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "JWT / OAuth", is_mandatory: true, weight: 0.8 }
      - { name: "FastAPI", is_mandatory: false, weight: 0.8 }
      - { name: "TypeScript", is_mandatory: false, weight: 0.7 }
      - { name: "AWS", is_mandatory: false, weight: 0.7 }
      - { name: "NoSQL", is_mandatory: false, weight: 0.6 }  # MongoDB-equivalent
      - { name: "Laravel" }
      - { name: "PHP" }
    drop_reasons:
      "C": "Not Python/Node-stack backend"
      "C#": "Microsoft .NET stack — separate market"
      "C++": "Game/systems, not web backend"
      "CSS": "Frontend"
      "HTML": "Frontend"
      "XML": "Legacy"
      "JSON": "Assumed knowledge"
      ".NET": "Microsoft enterprise stack — separate market"
      "Java": "Spring stack — separate market"
      "Spring Boot": "Java stack"
      "Spring Framework": "Java stack"
      "Angular": "Frontend"
      "React": "Frontend"
      "Apache Kafka": "Data Engineer territory"
      "Terraform": "DevOps territory"
      "Microsoft Azure": "Pick one cloud — AWS retained"
      "Jenkins": "DevOps territory"
      "Apache Spark": "Data Eng territory"
      "Jira": "PM tool"
      "Flask": "R4: declining; FastAPI/Django dominate PK postings"
      "Redis": "R4: rarely listed by name in PK postings (assumed within Postgres/MongoDB choice)"
      "Linux": "R4: assumed background"
      "Kubernetes": "R4: senior-only in PK; Docker covers entry-level"
      "pytest": "R4: rarely listed by name in postings"
      "Celery": "R4: rarely listed by name"
      "nginx": "R4: rarely listed by name"
      "Pydantic": "R4: rarely listed by name (implicit with FastAPI)"
      "Object-Oriented Programming": "R4: assumed for interviews, not posting requirement"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement"
      "Agile / Scrum": "R4: not in posting requirement lists"
      "CI/CD Pipelines": "R4: not as concept skill in postings"
      "GitHub Actions": "R4: rare standalone in PK BE postings"
      ".NET / C#": "R4: separate Pakistani market, deserves its own role tier"
      "Unit Testing": "R4: not in posting requirement lists"

  "Frontend Developer":
    keep:
      - { name: "React" }
      - { name: "JavaScript" }
      - { name: "TypeScript" }
      - { name: "HTML" }
      - { name: "CSS" }
      - { name: "REST API Design" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "Next.js" }
      - { name: "Tailwind CSS", is_mandatory: false, weight: 0.8 }
      - { name: "Redux" }  # R4: explicitly cited in PK MERN postings
      - { name: "Node.js", is_mandatory: false, weight: 0.7 }
      - { name: "Vite", is_mandatory: false, weight: 0.5 }
      - { name: "TanStack Query", is_mandatory: false, weight: 0.4 }
    drop_reasons:
      "C#": "Backend"
      "C++": "Not FE"
      "Java": "Backend"
      "PHP": "Backend (separate Full Stack stack)"
      "Python": "Backend"
      "Go": "Not FE"
      "SQL": "Backend/data"
      "NoSQL": "Backend/data"
      "MongoDB": "Backend/data"
      "MySQL": "Backend/data"
      "PostgreSQL": "Backend/data"
      "Apache Kafka": "Data Eng"
      "Kubernetes": "DevOps territory"
      "Microsoft Azure": "DevOps territory"
      "AWS": "DevOps territory"
      "Spring Boot": "Java backend"
      "Spring Framework": "Java backend"
      "Jenkins": "DevOps"
      "jQuery": "Legacy"
      "WordPress": "CMS — separate small-agency market"
      "JSON": "Assumed"
      "Jira": "PM tool"
      "Vue.js": "R4: separate role market in PK; React dominates"
      "Angular": "R4: separate role market in PK"
      "Vitest": "R4: testing rarely in PK FE postings"
      "Playwright": "R4: testing rarely in PK FE postings"
      "Docker": "R4: rarely listed for FE-only roles"
      "Web Accessibility (a11y)": "R4: rarely a posted requirement in PK"
      "Responsive Design": "R4: assumed within React+Tailwind work"
      "GraphQL": "R4: rare in PK FE postings"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement"
      "Agile / Scrum": "R4: not in FE posting requirement lists"

  "Full Stack Developer":
    keep:
      - { name: "JavaScript" }
      - { name: "TypeScript" }
      - { name: "React" }
      - { name: "Node.js" }
      - { name: "Python" }
      - { name: "PostgreSQL" }
      - { name: "REST API Design" }
      - { name: "SQL" }
      - { name: "Docker" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "HTML", is_mandatory: true, weight: 0.9 }
      - { name: "CSS", is_mandatory: true, weight: 0.9 }
      - { name: "Next.js" }
      - { name: "Django", is_mandatory: false, weight: 0.8 }
      - { name: "MongoDB", is_mandatory: false, weight: 0.8 }
      - { name: "Tailwind CSS", is_mandatory: false, weight: 0.7 }
      - { name: "AWS", is_mandatory: false, weight: 0.7 }
    drop_reasons:
      "C": "Not web full-stack"
      "C#": ".NET niche — separate stack"
      "C++": "Not web full-stack"
      "XML": "Legacy"
      "JSON": "Assumed knowledge"
      ".NET": ".NET niche — separate stack"
      "Java": "Spring niche — separate stack"
      "Spring Boot": "Java stack"
      "Spring Framework": "Java stack"
      "Apache Kafka": "Data Eng"
      "Terraform": "DevOps"
      "Microsoft Azure": "Pick one cloud — AWS retained"
      "MySQL": "PostgreSQL covers SQL needs"
      "Angular": "Pick one FE framework — React retained"
      "Vue.js": "Pick one FE framework — React retained"
      "Prisma": "R3: Python client archived 2025-04, community-only"
      "Go": "Niche for full-stack web in PK"
      "PHP": "R4: separate Laravel/PHP stack market — Backend covers"
      "Laravel": "R4: separate Laravel/PHP stack market — Backend covers"
      "Jenkins": "DevOps"
      "jQuery": "Legacy"
      "WordPress": "CMS — separate market"
      "Jira": "PM tool"
      "GraphQL": "R4: rare in PK FS postings"
      "Linux": "R4: assumed background"
      "Vite": "R4: assumed within Next.js/React tooling"
      "Vitest": "R4: testing rarely in PK postings"
      "JWT / OAuth": "R4: covered by Backend role; FS focuses on stack"
      "NoSQL": "Generic — MongoDB covers it"
      "Docker Compose": "Reviewer R2: redundant with Docker"
      "Object-Oriented Programming": "R4: not in posting requirement lists"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement"
      "Agile / Scrum": "R4: not in posting requirement lists"
      "CI/CD Pipelines": "R4: not as concept skill in PK FS postings"
      "GitHub Actions": "R4: rare standalone for FS"

  "Data Engineer":
    keep:
      - { name: "Python" }
      - { name: "SQL" }
      - { name: "Apache Spark" }
      - { name: "Apache Airflow" }
      - { name: "ETL / ELT" }
      - { name: "AWS" }
      - { name: "Git" }
      - { name: "Snowflake", is_mandatory: false, weight: 0.8 }
      - { name: "dbt", is_mandatory: false, weight: 0.8 }
      - { name: "Apache Kafka" }
      - { name: "Google BigQuery" }
      - { name: "Docker", is_mandatory: false, weight: 0.7 }
      - { name: "Microsoft Azure", is_mandatory: false, weight: 0.6 }
      - { name: "Data Modeling", is_mandatory: false, weight: 0.6 }
    drop_reasons:
      "Microsoft Access": "Legacy desktop DB"
      "Microsoft Office": "Too generic"
      "Microsoft Outlook": "Email client"
      "Microsoft PowerPoint": "Presentation tool"
      "Microsoft Power BI": "BI Analyst territory"
      "Tableau": "BI Analyst territory"
      "R": "BI/stats territory; Python+SQL covers DE"
      "Data Modeling (Kimball)": "R3: renamed to plain 'Data Modeling'"
      "Apache Iceberg": "R4: rare in PK entry-level DE postings; Delta Lake (also dropped) was the safer pick but neither needed"
      "Delta Lake": "R4: rare in PK entry-level DE postings"
      "Databricks": "R4: senior-only in PK postings"
      "Parquet": "R4: rarely listed by name in postings (implicit with Spark/Airflow)"
      "Java": "R4: declining for DE; Python+Scala dominate"
      "Microsoft Excel": "R4: rare for DE postings"
      "Data Structures & Algorithms": "R4: interview prep, not posting requirement"

  "DevOps Engineer":
    keep:
      - { name: "Docker" }
      - { name: "Kubernetes" }
      - { name: "Terraform" }
      - { name: "AWS" }
      - { name: "Linux" }
      - { name: "Bash Scripting" }
      - { name: "CI/CD Pipelines" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "Python", is_mandatory: true, weight: 0.8 }
      - { name: "GitHub Actions", is_mandatory: false, weight: 0.8 }
      - { name: "Microsoft Azure", is_mandatory: false, weight: 0.7 }
      - { name: "Ansible", is_mandatory: false, weight: 0.7 }
      - { name: "Prometheus" }
      - { name: "Grafana" }
      - { name: "Jenkins", is_mandatory: false, weight: 0.5 }
    drop_reasons:
      "Bash": "Duplicate of Bash Scripting"
      "Amazon Web Services AWS CloudFormation": "Terraform won"
      "C": "Not DevOps day-to-day"
      "C++": "Not DevOps day-to-day"
      "Java": "Rare in modern DevOps"
      "JavaScript": "Not DevOps"
      "Go": "K8s/Terraform internals only"
      "Microsoft Active Directory": "Windows enterprise niche"
      "Microsoft PowerShell": "Windows admin only"
      "Linux Administration": "R2: duplicate of Linux"
      "Jira": "R2: PM tool consistency"
      "ArgoCD": "R4: senior-only in PK postings"
      "Helm": "R4: senior-only in PK postings"
      "Computer Networking Fundamentals": "R4: assumed background, not in posting requirement lists"
      "Google Cloud Platform": "R4: rare in PK DevOps postings (AWS/Azure dominate)"
      "SQL": "R4: debugging-only, not core skill"

  "AI Engineer (GenAI / LLM)":
    keep:
      - { name: "Python" }
      - { name: "HuggingFace Transformers" }
      - { name: "LangChain" }
      - { name: "OpenAI API" }
      - { name: "RAG" }  # R4: renamed from "Retrieval-Augmented Generation (RAG)" to canonical job-posting term
      - { name: "Prompt Engineering" }
      - { name: "PyTorch" }
      - { name: "Docker" }
      - { name: "Git" }
      - { name: "GitHub" }
      - { name: "AWS", is_mandatory: false, weight: 0.8 }
      - { name: "Microsoft Azure", is_mandatory: false, weight: 0.7 }
      - { name: "Anthropic Claude API" }
      - { name: "pgvector" }
      - { name: "FastAPI" }
      - { name: "LangGraph", is_mandatory: false, weight: 0.6 }
      - { name: "Sentence Transformers", is_mandatory: false, weight: 0.6 }
    drop_reasons:
      "C": "Legacy"
      "C++": "Niche"
      "Bash": "Background"
      "Go": "Rare in AI eng"
      "Scala": "Big data, not LLM eng"
      "Java": "Rare in AI eng"
      "R": "Statistics, not LLM eng"
      "JavaScript": "Not AI eng"
      "Apache Hadoop": "Big data — Data Eng"
      "Apache Spark": "Big data — Data Eng"
      "Ansible": "DevOps"
      "Terraform": "DevOps"
      "Jenkins": "DevOps"
      "Linux": "Background"
      "NoSQL": "Generic — vector DBs are the relevant DB type"
      "Splunk Enterprise": "Observability niche"
      "TensorFlow": "PyTorch dominates research/LLM"
      "Retrieval-Augmented Generation (RAG)": "R4: renamed to canonical 'RAG'"
      "Fine-tuning (LoRA/PEFT)": "R3: renamed to 'LLM Fine-tuning'; R4: dropped entirely (rare in postings)"
      "LlamaIndex": "R4: overlaps LangChain; rarely both required in PK postings"
      "Pinecone": "R4: open-source vector DBs (pgvector, Chroma) winning"
      "LLM Evaluations": "R4: concept rarely listed in postings"
      "LLM Observability": "R4: concept rarely listed in postings"
      "LLM Fine-tuning": "R4: rare in entry-level PK postings"
      "Kubernetes": "R4: senior-only in PK AI eng postings"
      "SQL": "R4: rare for AI Eng day-to-day"
      "Object-Oriented Programming": "R4: interview prep"
      "Data Structures & Algorithms": "R4: interview prep"

  "Business Intelligence Analyst":
    keep:
      - { name: "Microsoft Power BI" }
      - { name: "DAX" }
      - { name: "SQL" }
      - { name: "Microsoft Excel", required_level: "ADVANCED" }
      - { name: "Power Query / M Language", is_mandatory: true, weight: 0.9 }
      - { name: "Tableau", is_mandatory: false, weight: 0.7 }
      - { name: "Snowflake", is_mandatory: false, weight: 0.7 }
      - { name: "Python", is_mandatory: false, weight: 0.6 }
      - { name: "Star Schema Modeling" }
      - { name: "Data Storytelling" }
    drop_reasons:
      "Microsoft Office": "Too generic — Microsoft Excel covers it"
      "Microsoft PowerPoint": "Presentation, not BI engineering"
      "Oracle Cloud": "Enterprise niche"
      "SAP": "ERP — niche"
      "SAS": "Enterprise legacy"
      "AWS": "BI Analyst consumes from cloud DWH, doesn't own infra"
      "Microsoft Azure": "Same as AWS — BI Analyst doesn't own cloud infra"
      "Microsoft Excel Advanced": "R3: collapsed into Microsoft Excel with required_level=ADVANCED"
      "Looker": "R4: rare in PK BI postings (Power BI/Tableau dominate)"
      "dbt": "R4: more for Data Eng; rarely in PK BI postings"
      "R": "R4: PK BI market is Power BI+SQL+Excel; R rarely required"