"""Parse the O*NET v30.2 bulk database into a normalized seed YAML. Reads: backend/seed_data/onet_dump/db_30_2_text/Technology Skills.txt backend/seed_data/onet_dump/db_30_2_text/Occupation Data.txt backend/seed_data/role_soc_mapping.yaml backend/seed_data/manual_skill_augmentation.yaml Writes: backend/seed_data/onet_roles_raw.yaml Filter rule: keep technology_skills where Hot Technology=Y AND In Demand=Y. That intersection produces 11-35 skills per role (verified) — the right granularity for a usable RoleSkill list. Manual augmentation adds modern frameworks O*NET misses. Run from repo root: cd backend && python scripts/parse_onet_dump.py """ from __future__ import annotations import csv import sys from pathlib import Path import yaml REPO_ROOT = Path(__file__).resolve().parent.parent SEED_DIR = REPO_ROOT / "seed_data" ONET_DIR = SEED_DIR / "onet_dump" / "db_30_2_text" ROLE_MAPPING_FILE = SEED_DIR / "role_soc_mapping.yaml" AUGMENTATION_FILE = SEED_DIR / "manual_skill_augmentation.yaml" OUTPUT_FILE = SEED_DIR / "onet_roles_raw.yaml" def load_occupation_data() -> dict[str, dict[str, str]]: """SOC code -> {title, description}.""" path = ONET_DIR / "Occupation Data.txt" out: dict[str, dict[str, str]] = {} with path.open(encoding="utf-8") as f: reader = csv.DictReader(f, delimiter="\t") for row in reader: out[row["O*NET-SOC Code"]] = { "title": row["Title"], "description": row["Description"], } return out def load_tech_skills_for_socs(socs: set[str]) -> dict[str, list[dict]]: """SOC code -> [{example, commodity_title, hot, in_demand}]. Only returns rows where Hot Technology=Y AND In Demand=Y. """ path = ONET_DIR / "Technology Skills.txt" out: dict[str, list[dict]] = {soc: [] for soc in socs} with path.open(encoding="utf-8") as f: reader = csv.DictReader(f, delimiter="\t") for row in reader: soc = row["O*NET-SOC Code"] if soc not in socs: continue if row["Hot Technology"] != "Y" or row["In Demand"] != "Y": continue out[soc].append({ "example": row["Example"], "commodity_title": row["Commodity Title"], "hot": True, "in_demand": True, }) return out def normalize_skill_name(raw: str) -> str: """Map O*NET's verbose skill names to cleaner display names. Examples: "Structured query language SQL" -> "SQL" "Amazon Web Services AWS software" -> "AWS" "Hypertext markup language HTML" -> "HTML" "Cascading style sheets CSS" -> "CSS" "Microsoft Azure software" -> "Microsoft Azure" "Oracle Java" -> "Java" "JavaScript Object Notation JSON" -> "JSON" "Extensible markup language XML" -> "XML" "Microsoft .NET Framework" -> ".NET" "Spring Framework" / "Spring Boot" -> kept as-is "The MathWorks MATLAB" -> "MATLAB" "Google Angular" -> "Angular" "IBM Terraform" -> "Terraform" Cleaner names = better matches against manual augmentation entries and against terms a student would actually type. """ s = raw.strip() # Strip trailing " software" if s.endswith(" software"): s = s[: -len(" software")] # Common acronym extractions: take the trailing all-caps token if the rest # is a verbose expansion of it. aliases = { "Structured query language SQL": "SQL", "Amazon Web Services AWS": "AWS", "Hypertext markup language HTML": "HTML", "Cascading style sheets CSS": "CSS", "JavaScript Object Notation JSON": "JSON", "Extensible markup language XML": "XML", "The MathWorks MATLAB": "MATLAB", "Google Angular": "Angular", "IBM Terraform": "Terraform", "Oracle Java": "Java", "Microsoft .NET Framework": ".NET", "Apache Hive": "Apache Hive", "Apache Hadoop": "Apache Hadoop", "Apache Spark": "Apache Spark", "Apache Kafka": "Apache Kafka", "Apache Airflow": "Apache Airflow", "Apache Cassandra": "Apache Cassandra", "Atlassian JIRA": "Jira", "Atlassian Confluence": "Confluence", "Microsoft Azure": "Microsoft Azure", "Microsoft Excel": "Microsoft Excel", "Microsoft Power BI": "Microsoft Power BI", "Microsoft PowerPoint": "Microsoft PowerPoint", "Microsoft Access": "Microsoft Access", "Jenkins CI": "Jenkins", "Node.js": "Node.js", } return aliases.get(s, s) def categorize_skill(name: str, commodity_title: str) -> str: """Best-effort skill category from O*NET commodity title or exact name match. Order matters: most specific buckets first (BI / Cloud / ML / Infra / Framework / Database / Language) before generic Tools fallback. Match on EXACT skill names where possible to avoid substring false positives (e.g., the single letter "r" matching "PowerPoint"). """ n = name.strip() n_lower = n.lower() c_lower = commodity_title.lower() if commodity_title else "" # Data/Analytics tools — check before BI because commodity_title for many # data tools (Spark, Hadoop) is "Business intelligence and data analysis software" data_names = {"Apache Spark", "Apache Hadoop", "Apache Hive", "Apache Kafka", "Apache Airflow", "Apache Cassandra", "dbt", "Pandas", "NumPy", "Matplotlib", "Seaborn", "Alteryx software", "Alteryx"} if n in data_names: return "Data" # Most specific first bi_names = {"Microsoft Power BI", "Tableau", "Looker", "Microsoft Excel"} if n in bi_names or "business intelligence" in c_lower: return "BI" cloud_names = {"AWS", "Microsoft Azure", "Google Cloud Platform", "Snowflake", "Google BigQuery", "Amazon Redshift", "Amazon Elastic Compute Cloud EC2", "Amazon Simple Storage Service S3"} if n in cloud_names or "cloud-based" in c_lower: return "Cloud" ml_names = {"PyTorch", "TensorFlow", "Scikit-learn", "HuggingFace Transformers", "Apache MXNet", "MLflow", "Sentence Transformers", "ONNX"} if n in ml_names: return "ML" infra_names = {"Docker", "Kubernetes", "Terraform", "Ansible", "Linux", "Linux Administration", "Bash Scripting", "Docker Compose"} if n in infra_names: return "Infra" framework_names = {"React", "Angular", "Vue.js", "Django", "Flask", "FastAPI", "Spring Boot", "Spring Framework", "Next.js", "Node.js", "Tailwind CSS", "TanStack Query", "Express", "LangChain", "LlamaIndex"} if n in framework_names: return "Framework" cicd_names = {"Jenkins", "GitHub Actions", "Atlassian JIRA", "Jira", "Atlassian Confluence", "Confluence"} if n in cicd_names: return "CI/CD" db_names = {"PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch", "NoSQL", "pgvector", "Pinecone"} if n in db_names or ("data base" in c_lower and n not in cloud_names): return "Database" # Languages — match on EXACT name only to avoid "r" matching "PowerPoint" language_names = {"Python", "Java", "C", "C#", "C++", "JavaScript", "TypeScript", "Go", "Ruby", "Swift", "Kotlin", "Bash", "MATLAB", "R", "SAS", "PHP", "Rust", "Scala", "HTML", "CSS", "XML", "JSON", ".NET"} if n in language_names: return "Language" if n in ("Git", "GitHub", "Vite", "Jupyter Notebook"): return "Tools" if "version" in c_lower or "monitoring" in c_lower: return "Tools" return "Tools" def main() -> int: if not ONET_DIR.exists(): print(f"ERROR: O*NET dump not found at {ONET_DIR}", file=sys.stderr) print("Download db_30_2_text.zip from " "https://www.onetcenter.org/dl_files/database/db_30_2_text.zip " "and extract under backend/seed_data/onet_dump/", file=sys.stderr) return 1 role_mapping = yaml.safe_load(ROLE_MAPPING_FILE.read_text(encoding="utf-8")) augmentation = yaml.safe_load(AUGMENTATION_FILE.read_text(encoding="utf-8")) # Collect all SOCs we need to query (across composite roles) all_socs: set[str] = set() for role in role_mapping["roles"]: all_socs.update(role["socs"]) occupation_data = load_occupation_data() tech_skills = load_tech_skills_for_socs(all_socs) # Verify every SOC was found missing = [soc for soc in all_socs if soc not in occupation_data] if missing: print(f"ERROR: SOC codes not found in Occupation Data.txt: {missing}", file=sys.stderr) return 1 output = {"roles": []} for role in role_mapping["roles"]: name = role["name"] socs = role["socs"] # Pick the title/description from the first SOC, unless overridden primary_soc = socs[0] title_source = occupation_data[primary_soc] description = role.get("description_override") or title_source["description"] # Union tech skills across composite SOCs, dedupe by normalized name seen_names: set[str] = set() skills: list[dict] = [] for soc in socs: for entry in tech_skills.get(soc, []): normalized = normalize_skill_name(entry["example"]) if normalized in seen_names: continue seen_names.add(normalized) skills.append({ "skill_name": normalized, "category": categorize_skill(normalized, entry["commodity_title"]), "source": "onet", "source_soc": soc, "is_mandatory": True, # Hot+InDemand baseline "weight": 1.0, # Admin can override "required_level": "INTERMEDIATE", # Admin can override }) # Apply manual augmentation for aug in augmentation.get("augmentation", {}).get(name, []): normalized = aug["skill"].strip() if normalized in seen_names: # Augmentation upgrades the existing entry's flags/weight for s in skills: if s["skill_name"] == normalized: s["category"] = aug.get("category", s["category"]) s["is_mandatory"] = aug["is_mandatory"] s["weight"] = aug["weight"] s["required_level"] = aug["required_level"] s["source"] = "onet+manual" break else: seen_names.add(normalized) skills.append({ "skill_name": normalized, "category": aug.get("category", "Tools"), "source": "manual", "source_soc": None, "is_mandatory": aug["is_mandatory"], "weight": aug["weight"], "required_level": aug["required_level"], }) output["roles"].append({ "name": name, "industry": role["industry"], "description": description, "primary_soc": primary_soc, "all_socs": socs, "skills": skills, }) print(f" {name}: {len(skills)} skills " f"({sum(1 for s in skills if s['source'].startswith('onet'))} from O*NET, " f"{sum(1 for s in skills if s['source'] == 'manual')} manual)") OUTPUT_FILE.write_text( yaml.safe_dump(output, sort_keys=False, allow_unicode=True, width=200), encoding="utf-8", ) print(f"\nWrote {OUTPUT_FILE}") print(f" {len(output['roles'])} roles, " f"{sum(len(r['skills']) for r in output['roles'])} total RoleSkill entries") return 0 if __name__ == "__main__": sys.exit(main())