Spaces:

arifRB
/

gapguide-api

Sleeping

App Files Files Community

gapguide-api / scripts /parse_onet_dump.py

arifRB

Deploy GapGuide backend (Docker)

ffd36e0 verified 15 days ago

Raw

History Blame Contribute Delete

12.2 kB

	"""Parse the O*NET v30.2 bulk database into a normalized seed YAML.

	Reads:
	backend/seed_data/onet_dump/db_30_2_text/Technology Skills.txt
	backend/seed_data/onet_dump/db_30_2_text/Occupation Data.txt
	backend/seed_data/role_soc_mapping.yaml
	backend/seed_data/manual_skill_augmentation.yaml

	Writes:
	backend/seed_data/onet_roles_raw.yaml

	Filter rule: keep technology_skills where Hot Technology=Y AND In Demand=Y.
	That intersection produces 11-35 skills per role (verified) — the right granularity
	for a usable RoleSkill list. Manual augmentation adds modern frameworks O*NET misses.

	Run from repo root:
	cd backend && python scripts/parse_onet_dump.py
	"""

	from __future__ import annotations

	import csv
	import sys
	from pathlib import Path

	import yaml

	REPO_ROOT = Path(__file__).resolve().parent.parent
	SEED_DIR = REPO_ROOT / "seed_data"
	ONET_DIR = SEED_DIR / "onet_dump" / "db_30_2_text"

	ROLE_MAPPING_FILE = SEED_DIR / "role_soc_mapping.yaml"
	AUGMENTATION_FILE = SEED_DIR / "manual_skill_augmentation.yaml"
	OUTPUT_FILE = SEED_DIR / "onet_roles_raw.yaml"


	def load_occupation_data() -> dict[str, dict[str, str]]:
	"""SOC code -> {title, description}."""
	path = ONET_DIR / "Occupation Data.txt"
	out: dict[str, dict[str, str]] = {}
	with path.open(encoding="utf-8") as f:
	reader = csv.DictReader(f, delimiter="\t")
	for row in reader:
	out[row["O*NET-SOC Code"]] = {
	"title": row["Title"],
	"description": row["Description"],
	}
	return out


	def load_tech_skills_for_socs(socs: set[str]) -> dict[str, list[dict]]:
	"""SOC code -> [{example, commodity_title, hot, in_demand}].

	Only returns rows where Hot Technology=Y AND In Demand=Y.
	"""
	path = ONET_DIR / "Technology Skills.txt"
	out: dict[str, list[dict]] = {soc: [] for soc in socs}
	with path.open(encoding="utf-8") as f:
	reader = csv.DictReader(f, delimiter="\t")
	for row in reader:
	soc = row["O*NET-SOC Code"]
	if soc not in socs:
	continue
	if row["Hot Technology"] != "Y" or row["In Demand"] != "Y":
	continue
	out[soc].append({
	"example": row["Example"],
	"commodity_title": row["Commodity Title"],
	"hot": True,
	"in_demand": True,
	})
	return out


	def normalize_skill_name(raw: str) -> str:
	"""Map O*NET's verbose skill names to cleaner display names.

	Examples:
	"Structured query language SQL" -> "SQL"
	"Amazon Web Services AWS software" -> "AWS"
	"Hypertext markup language HTML" -> "HTML"
	"Cascading style sheets CSS" -> "CSS"
	"Microsoft Azure software" -> "Microsoft Azure"
	"Oracle Java" -> "Java"
	"JavaScript Object Notation JSON" -> "JSON"
	"Extensible markup language XML" -> "XML"
	"Microsoft .NET Framework" -> ".NET"
	"Spring Framework" / "Spring Boot" -> kept as-is
	"The MathWorks MATLAB" -> "MATLAB"
	"Google Angular" -> "Angular"
	"IBM Terraform" -> "Terraform"
	Cleaner names = better matches against manual augmentation entries and
	against terms a student would actually type.
	"""
	s = raw.strip()
	# Strip trailing " software"
	if s.endswith(" software"):
	s = s[: -len(" software")]
	# Common acronym extractions: take the trailing all-caps token if the rest
	# is a verbose expansion of it.
	aliases = {
	"Structured query language SQL": "SQL",
	"Amazon Web Services AWS": "AWS",
	"Hypertext markup language HTML": "HTML",
	"Cascading style sheets CSS": "CSS",
	"JavaScript Object Notation JSON": "JSON",
	"Extensible markup language XML": "XML",
	"The MathWorks MATLAB": "MATLAB",
	"Google Angular": "Angular",
	"IBM Terraform": "Terraform",
	"Oracle Java": "Java",
	"Microsoft .NET Framework": ".NET",
	"Apache Hive": "Apache Hive",
	"Apache Hadoop": "Apache Hadoop",
	"Apache Spark": "Apache Spark",
	"Apache Kafka": "Apache Kafka",
	"Apache Airflow": "Apache Airflow",
	"Apache Cassandra": "Apache Cassandra",
	"Atlassian JIRA": "Jira",
	"Atlassian Confluence": "Confluence",
	"Microsoft Azure": "Microsoft Azure",
	"Microsoft Excel": "Microsoft Excel",
	"Microsoft Power BI": "Microsoft Power BI",
	"Microsoft PowerPoint": "Microsoft PowerPoint",
	"Microsoft Access": "Microsoft Access",
	"Jenkins CI": "Jenkins",
	"Node.js": "Node.js",
	}
	return aliases.get(s, s)


	def categorize_skill(name: str, commodity_title: str) -> str:
	"""Best-effort skill category from O*NET commodity title or exact name match.

	Order matters: most specific buckets first (BI / Cloud / ML / Infra / Framework
	/ Database / Language) before generic Tools fallback. Match on EXACT skill
	names where possible to avoid substring false positives (e.g., the single
	letter "r" matching "PowerPoint").
	"""
	n = name.strip()
	n_lower = n.lower()
	c_lower = commodity_title.lower() if commodity_title else ""

	# Data/Analytics tools — check before BI because commodity_title for many
	# data tools (Spark, Hadoop) is "Business intelligence and data analysis software"
	data_names = {"Apache Spark", "Apache Hadoop", "Apache Hive", "Apache Kafka",
	"Apache Airflow", "Apache Cassandra", "dbt", "Pandas", "NumPy",
	"Matplotlib", "Seaborn", "Alteryx software", "Alteryx"}
	if n in data_names:
	return "Data"

	# Most specific first
	bi_names = {"Microsoft Power BI", "Tableau", "Looker", "Microsoft Excel"}
	if n in bi_names or "business intelligence" in c_lower:
	return "BI"

	cloud_names = {"AWS", "Microsoft Azure", "Google Cloud Platform", "Snowflake",
	"Google BigQuery", "Amazon Redshift", "Amazon Elastic Compute Cloud EC2",
	"Amazon Simple Storage Service S3"}
	if n in cloud_names or "cloud-based" in c_lower:
	return "Cloud"

	ml_names = {"PyTorch", "TensorFlow", "Scikit-learn", "HuggingFace Transformers",
	"Apache MXNet", "MLflow", "Sentence Transformers", "ONNX"}
	if n in ml_names:
	return "ML"

	infra_names = {"Docker", "Kubernetes", "Terraform", "Ansible", "Linux",
	"Linux Administration", "Bash Scripting", "Docker Compose"}
	if n in infra_names:
	return "Infra"

	framework_names = {"React", "Angular", "Vue.js", "Django", "Flask", "FastAPI",
	"Spring Boot", "Spring Framework", "Next.js", "Node.js",
	"Tailwind CSS", "TanStack Query", "Express", "LangChain",
	"LlamaIndex"}
	if n in framework_names:
	return "Framework"

	cicd_names = {"Jenkins", "GitHub Actions", "Atlassian JIRA", "Jira",
	"Atlassian Confluence", "Confluence"}
	if n in cicd_names:
	return "CI/CD"

	db_names = {"PostgreSQL", "MySQL", "MongoDB", "Redis", "Elasticsearch",
	"NoSQL", "pgvector", "Pinecone"}
	if n in db_names or ("data base" in c_lower and n not in cloud_names):
	return "Database"

	# Languages — match on EXACT name only to avoid "r" matching "PowerPoint"
	language_names = {"Python", "Java", "C", "C#", "C++", "JavaScript", "TypeScript",
	"Go", "Ruby", "Swift", "Kotlin", "Bash", "MATLAB", "R", "SAS",
	"PHP", "Rust", "Scala", "HTML", "CSS", "XML", "JSON", ".NET"}
	if n in language_names:
	return "Language"

	if n in ("Git", "GitHub", "Vite", "Jupyter Notebook"):
	return "Tools"

	if "version" in c_lower or "monitoring" in c_lower:
	return "Tools"

	return "Tools"


	def main() -> int:
	if not ONET_DIR.exists():
	print(f"ERROR: O*NET dump not found at {ONET_DIR}", file=sys.stderr)
	print("Download db_30_2_text.zip from "
	"https://www.onetcenter.org/dl_files/database/db_30_2_text.zip "
	"and extract under backend/seed_data/onet_dump/", file=sys.stderr)
	return 1

	role_mapping = yaml.safe_load(ROLE_MAPPING_FILE.read_text(encoding="utf-8"))
	augmentation = yaml.safe_load(AUGMENTATION_FILE.read_text(encoding="utf-8"))

	# Collect all SOCs we need to query (across composite roles)
	all_socs: set[str] = set()
	for role in role_mapping["roles"]:
	all_socs.update(role["socs"])

	occupation_data = load_occupation_data()
	tech_skills = load_tech_skills_for_socs(all_socs)

	# Verify every SOC was found
	missing = [soc for soc in all_socs if soc not in occupation_data]
	if missing:
	print(f"ERROR: SOC codes not found in Occupation Data.txt: {missing}",
	file=sys.stderr)
	return 1

	output = {"roles": []}

	for role in role_mapping["roles"]:
	name = role["name"]
	socs = role["socs"]

	# Pick the title/description from the first SOC, unless overridden
	primary_soc = socs[0]
	title_source = occupation_data[primary_soc]
	description = role.get("description_override") or title_source["description"]

	# Union tech skills across composite SOCs, dedupe by normalized name
	seen_names: set[str] = set()
	skills: list[dict] = []
	for soc in socs:
	for entry in tech_skills.get(soc, []):
	normalized = normalize_skill_name(entry["example"])
	if normalized in seen_names:
	continue
	seen_names.add(normalized)
	skills.append({
	"skill_name": normalized,
	"category": categorize_skill(normalized, entry["commodity_title"]),
	"source": "onet",
	"source_soc": soc,
	"is_mandatory": True, # Hot+InDemand baseline
	"weight": 1.0, # Admin can override
	"required_level": "INTERMEDIATE", # Admin can override
	})

	# Apply manual augmentation
	for aug in augmentation.get("augmentation", {}).get(name, []):
	normalized = aug["skill"].strip()
	if normalized in seen_names:
	# Augmentation upgrades the existing entry's flags/weight
	for s in skills:
	if s["skill_name"] == normalized:
	s["category"] = aug.get("category", s["category"])
	s["is_mandatory"] = aug["is_mandatory"]
	s["weight"] = aug["weight"]
	s["required_level"] = aug["required_level"]
	s["source"] = "onet+manual"
	break
	else:
	seen_names.add(normalized)
	skills.append({
	"skill_name": normalized,
	"category": aug.get("category", "Tools"),
	"source": "manual",
	"source_soc": None,
	"is_mandatory": aug["is_mandatory"],
	"weight": aug["weight"],
	"required_level": aug["required_level"],
	})

	output["roles"].append({
	"name": name,
	"industry": role["industry"],
	"description": description,
	"primary_soc": primary_soc,
	"all_socs": socs,
	"skills": skills,
	})

	print(f" {name}: {len(skills)} skills "
	f"({sum(1 for s in skills if s['source'].startswith('onet'))} from O*NET, "
	f"{sum(1 for s in skills if s['source'] == 'manual')} manual)")

	OUTPUT_FILE.write_text(
	yaml.safe_dump(output, sort_keys=False, allow_unicode=True, width=200),
	encoding="utf-8",
	)
	print(f"\nWrote {OUTPUT_FILE}")
	print(f" {len(output['roles'])} roles, "
	f"{sum(len(r['skills']) for r in output['roles'])} total RoleSkill entries")
	return 0


	if __name__ == "__main__":
	sys.exit(main())