gapguide-api / scripts /parse_onet_dump.py
arifRB's picture
Deploy GapGuide backend (Docker)
ffd36e0 verified
Raw
History Blame Contribute Delete
12.2 kB
"""Parse the O*NET v30.2 bulk database into a normalized seed YAML.
Reads:
backend/seed_data/onet_dump/db_30_2_text/Technology Skills.txt
backend/seed_data/onet_dump/db_30_2_text/Occupation Data.txt
backend/seed_data/role_soc_mapping.yaml
backend/seed_data/manual_skill_augmentation.yaml
Writes:
backend/seed_data/onet_roles_raw.yaml
Filter rule: keep technology_skills where Hot Technology=Y AND In Demand=Y.
That intersection produces 11-35 skills per role (verified) — the right granularity
for a usable RoleSkill list. Manual augmentation adds modern frameworks O*NET misses.
Run from repo root:
cd backend && python scripts/parse_onet_dump.py
"""
from __future__ import annotations
import csv
import sys
from pathlib import Path
import yaml
REPO_ROOT = Path(__file__).resolve().parent.parent
SEED_DIR = REPO_ROOT / "seed_data"
ONET_DIR = SEED_DIR / "onet_dump" / "db_30_2_text"
ROLE_MAPPING_FILE = SEED_DIR / "role_soc_mapping.yaml"
AUGMENTATION_FILE = SEED_DIR / "manual_skill_augmentation.yaml"
OUTPUT_FILE = SEED_DIR / "onet_roles_raw.yaml"
def load_occupation_data() -> dict[str, dict[str, str]]:
"""SOC code -> {title, description}."""
path = ONET_DIR / "Occupation Data.txt"
out: dict[str, dict[str, str]] = {}
with path.open(encoding="utf-8") as f:
reader = csv.DictReader(f, delimiter="\t")
for row in reader:
out[row["O*NET-SOC Code"]] = {
"title": row["Title"],
"description": row["Description"],
}
return out
def load_tech_skills_for_socs(socs: set[str]) -> dict[str, list[dict]]:
"""SOC code -> [{example, commodity_title, hot, in_demand}].
Only returns rows where Hot Technology=Y AND In Demand=Y.
"""
path = ONET_DIR / "Technology Skills.txt"
out: dict[str, list[dict]] = {soc: [] for soc in socs}
with path.open(encoding="utf-8") as f:
reader = csv.DictReader(f, delimiter="\t")
for row in reader:
soc = row["O*NET-SOC Code"]
if soc not in socs:
continue
if row["Hot Technology"] != "Y" or row["In Demand"] != "Y":
continue
out[soc].append({
"example": row["Example"],
"commodity_title": row["Commodity Title"],
"hot": True,
"in_demand": True,
})
return out
def normalize_skill_name(raw: str) -> str:
"""Map O*NET's verbose skill names to cleaner display names.
Examples:
"Structured query language SQL" -> "SQL"
"Amazon Web Services AWS software" -> "AWS"
"Hypertext markup language HTML" -> "HTML"
"Cascading style sheets CSS" -> "CSS"
"Microsoft Azure software" -> "Microsoft Azure"
"Oracle Java" -> "Java"
"JavaScript Object Notation JSON" -> "JSON"
"Extensible markup language XML" -> "XML"
"Microsoft .NET Framework" -> ".NET"
"Spring Framework" / "Spring Boot" -> kept as-is
"The MathWorks MATLAB" -> "MATLAB"
"Google Angular" -> "Angular"
"IBM Terraform" -> "Terraform"
Cleaner names = better matches against manual augmentation entries and
against terms a student would actually type.
"""
s = raw.strip()
# Strip trailing " software"
if s.endswith(" software"):
s = s[: -len(" software")]
# Common acronym extractions: take the trailing all-caps token if the rest
# is a verbose expansion of it.
aliases = {
"Structured query language SQL": "SQL",
"Amazon Web Services AWS": "AWS",
"Hypertext markup language HTML": "HTML",
"Cascading style sheets CSS": "CSS",
"JavaScript Object Notation JSON": "JSON",
"Extensible markup language XML": "XML",
"The MathWorks MATLAB": "MATLAB",
"Google Angular": "Angular",
"IBM Terraform": "Terraform",
"Oracle Java": "Java",
"Microsoft .NET Framework": ".NET",
"Apache Hive": "Apache Hive",
"Apache Hadoop": "Apache Hadoop",
"Apache Spark": "Apache Spark",
"Apache Kafka": "Apache Kafka",
"Apache Airflow": "Apache Airflow",
"Apache Cassandra": "Apache Cassandra",
"Atlassian JIRA": "Jira",
"Atlassian Confluence": "Confluence",
"Microsoft Azure": "Microsoft Azure",
"Microsoft Excel": "Microsoft Excel",
"Microsoft Power BI": "Microsoft Power BI",
"Microsoft PowerPoint": "Microsoft PowerPoint",
"Microsoft Access": "Microsoft Access",
"Jenkins CI": "Jenkins",
"Node.js": "Node.js",
}
return aliases.get(s, s)
def categorize_skill(name: str, commodity_title: str) -> str:
"""Best-effort skill category from O*NET commodity title or exact name match.
Order matters: most specific buckets first (BI / Cloud / ML / Infra / Framework
/ Database / Language) before generic Tools fallback. Match on EXACT skill
names where possible to avoid substring false positives (e.g., the single
letter "r" matching "PowerPoint").
"""
n = name.strip()
n_lower = n.lower()
c_lower = commodity_title.lower() if commodity_title else ""
# Data/Analytics tools — check before BI because commodity_title for many
# data tools (Spark, Hadoop) is "Business intelligence and data analysis software"
data_names = {"Apache Spark", "Apache Hadoop", "Apache Hive", "Apache Kafka",
"Apache Airflow", "Apache Cassandra", "dbt", "Pandas", "NumPy",
"Matplotlib", "Seaborn", "Alteryx software", "Alteryx"}
if n in data_names:
return "Data"
# Most specific first
bi_names = {"Microsoft Power BI", "Tableau", "Looker", "Microsoft Excel"}
if n in bi_names or "business intelligence" in c_lower:
return "BI"
cloud_names = {"AWS", "Microsoft Azure", "Google Cloud Platform", "Snowflake",
"Google BigQuery", "Amazon Redshift", "Amazon Elastic Compute Cloud EC2",
"Amazon Simple Storage Service S3"}
if n in cloud_names or "cloud-based" in c_lower:
return "Cloud"
ml_names = {"PyTorch", "TensorFlow", "Scikit-learn", "HuggingFace Transformers",
"Apache MXNet", "MLflow", "Sentence Transformers", "ONNX"}
if n in ml_names:
return "ML"
infra_names = {"Docker", "Kubernetes", "Terraform", "Ansible", "Linux",
"Linux Administration", "Bash Scripting", "Docker Compose"}
if n in infra_names:
return "Infra"
framework_names = {"React", "Angular", "Vue.js", "Django", "Flask", "FastAPI",
"Spring Boot", "Spring Framework", "Next.js", "Node.js",
"Tailwind CSS", "TanStack Query", "Express", "LangChain",
"LlamaIndex"}
if n in framework_names:
return "Framework"
cicd_names = {"Jenkins", "GitHub Actions", "Atlassian JIRA", "Jira",
"Atlassian Confluence", "Confluence"}
if n in cicd_names:
return "CI/CD"
db_names = {"PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch",
"NoSQL", "pgvector", "Pinecone"}
if n in db_names or ("data base" in c_lower and n not in cloud_names):
return "Database"
# Languages — match on EXACT name only to avoid "r" matching "PowerPoint"
language_names = {"Python", "Java", "C", "C#", "C++", "JavaScript", "TypeScript",
"Go", "Ruby", "Swift", "Kotlin", "Bash", "MATLAB", "R", "SAS",
"PHP", "Rust", "Scala", "HTML", "CSS", "XML", "JSON", ".NET"}
if n in language_names:
return "Language"
if n in ("Git", "GitHub", "Vite", "Jupyter Notebook"):
return "Tools"
if "version" in c_lower or "monitoring" in c_lower:
return "Tools"
return "Tools"
def main() -> int:
if not ONET_DIR.exists():
print(f"ERROR: O*NET dump not found at {ONET_DIR}", file=sys.stderr)
print("Download db_30_2_text.zip from "
"https://www.onetcenter.org/dl_files/database/db_30_2_text.zip "
"and extract under backend/seed_data/onet_dump/", file=sys.stderr)
return 1
role_mapping = yaml.safe_load(ROLE_MAPPING_FILE.read_text(encoding="utf-8"))
augmentation = yaml.safe_load(AUGMENTATION_FILE.read_text(encoding="utf-8"))
# Collect all SOCs we need to query (across composite roles)
all_socs: set[str] = set()
for role in role_mapping["roles"]:
all_socs.update(role["socs"])
occupation_data = load_occupation_data()
tech_skills = load_tech_skills_for_socs(all_socs)
# Verify every SOC was found
missing = [soc for soc in all_socs if soc not in occupation_data]
if missing:
print(f"ERROR: SOC codes not found in Occupation Data.txt: {missing}",
file=sys.stderr)
return 1
output = {"roles": []}
for role in role_mapping["roles"]:
name = role["name"]
socs = role["socs"]
# Pick the title/description from the first SOC, unless overridden
primary_soc = socs[0]
title_source = occupation_data[primary_soc]
description = role.get("description_override") or title_source["description"]
# Union tech skills across composite SOCs, dedupe by normalized name
seen_names: set[str] = set()
skills: list[dict] = []
for soc in socs:
for entry in tech_skills.get(soc, []):
normalized = normalize_skill_name(entry["example"])
if normalized in seen_names:
continue
seen_names.add(normalized)
skills.append({
"skill_name": normalized,
"category": categorize_skill(normalized, entry["commodity_title"]),
"source": "onet",
"source_soc": soc,
"is_mandatory": True, # Hot+InDemand baseline
"weight": 1.0, # Admin can override
"required_level": "INTERMEDIATE", # Admin can override
})
# Apply manual augmentation
for aug in augmentation.get("augmentation", {}).get(name, []):
normalized = aug["skill"].strip()
if normalized in seen_names:
# Augmentation upgrades the existing entry's flags/weight
for s in skills:
if s["skill_name"] == normalized:
s["category"] = aug.get("category", s["category"])
s["is_mandatory"] = aug["is_mandatory"]
s["weight"] = aug["weight"]
s["required_level"] = aug["required_level"]
s["source"] = "onet+manual"
break
else:
seen_names.add(normalized)
skills.append({
"skill_name": normalized,
"category": aug.get("category", "Tools"),
"source": "manual",
"source_soc": None,
"is_mandatory": aug["is_mandatory"],
"weight": aug["weight"],
"required_level": aug["required_level"],
})
output["roles"].append({
"name": name,
"industry": role["industry"],
"description": description,
"primary_soc": primary_soc,
"all_socs": socs,
"skills": skills,
})
print(f" {name}: {len(skills)} skills "
f"({sum(1 for s in skills if s['source'].startswith('onet'))} from O*NET, "
f"{sum(1 for s in skills if s['source'] == 'manual')} manual)")
OUTPUT_FILE.write_text(
yaml.safe_dump(output, sort_keys=False, allow_unicode=True, width=200),
encoding="utf-8",
)
print(f"\nWrote {OUTPUT_FILE}")
print(f" {len(output['roles'])} roles, "
f"{sum(len(r['skills']) for r in output['roles'])} total RoleSkill entries")
return 0
if __name__ == "__main__":
sys.exit(main())