Spaces:
Sleeping
Sleeping
"""Apply role_curation.yaml to onet_roles_raw.yaml.

Reads:
    backend/seed_data/onet_roles_raw.yaml
    backend/seed_data/role_curation.yaml
    backend/seed_data/manual_skill_augmentation.yaml (for skills mentioned in keep but absent from raw)

Writes:
    backend/seed_data/onet_roles_curated.yaml (final seed-ready file)
    backend/seed_data/onet_roles_curation_log.md (per-role audit log of keeps + drops + reasons)

Logic:
    For each role:
    1. Take its keep list from role_curation.yaml.
    2. For each keep entry, find it in raw skills OR in augmentation; if it is found nowhere,
       report an error and abort without writing any output.
    3. Apply optional weight / is_mandatory / required_level / category overrides.
    4. Anything in raw NOT in keep is DROPPED — record reason from drop_reasons map (or "no reason given").
    5. Write curated YAML with same shape as raw, plus a header comment carrying the generation
       timestamp at the top.

    Roles that have no curation rule are passed through unchanged, with a warning.
"""
| from __future__ import annotations | |
| import sys | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| import yaml | |
# Filesystem layout, resolved relative to this script: REPO_ROOT is the parent
# of the directory that contains this file, and every input/output lives in
# its "seed_data" subdirectory.
REPO_ROOT = Path(__file__).resolve().parent.parent
SEED_DIR = REPO_ROOT.joinpath("seed_data")

# Inputs read by main().
RAW_FILE = SEED_DIR.joinpath("onet_roles_raw.yaml")
CURATION_FILE = SEED_DIR.joinpath("role_curation.yaml")
AUGMENTATION_FILE = SEED_DIR.joinpath("manual_skill_augmentation.yaml")

# Outputs written by main().
OUTPUT_FILE = SEED_DIR.joinpath("onet_roles_curated.yaml")
LOG_FILE = SEED_DIR.joinpath("onet_roles_curation_log.md")
def find_skill(skills: list[dict], name: str) -> dict | None:
    """Return the first entry of ``skills`` whose ``"skill_name"`` equals ``name``.

    Entries are examined in list order and the matching dict itself (not a
    copy) is returned. ``None`` is returned when nothing matches. An entry
    without a ``"skill_name"`` key that is reached before a match raises
    ``KeyError``.
    """
    matching = (candidate for candidate in skills if candidate["skill_name"] == name)
    return next(matching, None)
def synth_from_augmentation(role_name: str, skill_name: str, aug_doc: dict | None) -> dict | None:
    """Build a raw-shaped skill record for ``skill_name`` from the augmentation YAML.

    This is the fallback for a keep entry that is not among a role's raw skills.

    Args:
        role_name: Key looked up under the document's top-level
            ``"augmentation"`` mapping.
        skill_name: Value compared against each augmentation entry's
            ``"skill"`` field.
        aug_doc: Parsed augmentation document. ``None`` (what
            ``yaml.safe_load`` yields for an empty file) and missing or null
            sections are treated as "no augmentation available".

    Returns:
        A new dict with the keys ``skill_name``, ``category`` (defaulting to
        ``"Tools"``), ``source`` (always ``"manual"``), ``source_soc`` (always
        ``None``), ``is_mandatory``, ``weight`` and ``required_level``; or
        ``None`` when the role has no entry for the skill.

    Raises:
        KeyError: If the matching entry lacks ``is_mandatory``, ``weight`` or
            ``required_level``, or if an entry reached before the match has no
            ``skill`` key.
    """
    # Use `or` rather than .get() defaults: a YAML key with an empty value
    # parses to None, and a .get() default does not replace an explicit None.
    role_entries = ((aug_doc or {}).get("augmentation") or {}).get(role_name) or []
    for entry in role_entries:
        if entry["skill"] != skill_name:
            continue
        return {
            "skill_name": skill_name,
            "category": entry.get("category", "Tools"),
            "source": "manual",
            "source_soc": None,
            "is_mandatory": entry["is_mandatory"],
            "weight": entry["weight"],
            "required_level": entry["required_level"],
        }
    return None
def _utc_timestamp() -> str:
    """Return the current UTC time as ISO-8601 with a single ``Z`` designator."""
    # isoformat() on an aware UTC datetime already ends in "+00:00"; appending
    # "Z" on top of that yields the malformed "...+00:00Z", so swap the offset.
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


def _render_log_section(role_name: str, kept: list[dict], dropped: list[tuple[str, str]]) -> str:
    """Render one role's markdown audit section: kept skills, then dropped skills."""
    kept_lines = "\n".join(
        f"- `{s['skill_name']}` "
        f"({'mandatory' if s['is_mandatory'] else 'optional'}, "
        f"weight={s['weight']}, level={s['required_level']}, source={s['source']})"
        for s in kept
    )
    if dropped:
        dropped_lines = "\n".join(
            f"- `{dropped_name}` — {reason}" for dropped_name, reason in dropped
        )
    else:
        dropped_lines = "_none_"
    return (
        f"## {role_name}\n\n"
        f"**Kept ({len(kept)} skills):**\n\n"
        + kept_lines
        + f"\n\n**Dropped ({len(dropped)} skills):**\n\n"
        + dropped_lines
        + "\n\n"
    )


def main() -> int:
    """Curate every raw role and write the curated YAML plus the audit log.

    Reads RAW_FILE, CURATION_FILE and AUGMENTATION_FILE. For each raw role
    that has a curation rule, the skill list is rebuilt in the order of the
    rule's ``keep`` entries (duplicates ignored), with per-entry overrides
    applied; raw skills that were not kept are recorded as dropped. Roles
    without a rule are passed through unchanged with a warning on stderr.

    Returns:
        0 after OUTPUT_FILE and LOG_FILE have been written; 1 if any keep
        entry could not be resolved in raw or augmentation data, in which
        case nothing is written.
    """
    raw = yaml.safe_load(RAW_FILE.read_text(encoding="utf-8"))
    # An empty YAML file (or an empty section) parses to None; normalise those
    # to empty containers so the lookups below do not crash.
    curation = yaml.safe_load(CURATION_FILE.read_text(encoding="utf-8")) or {}
    augmentation = yaml.safe_load(AUGMENTATION_FILE.read_text(encoding="utf-8")) or {}
    rules = curation.get("curation") or {}

    # One timestamp for both artefacts so the log and the seed file agree.
    generated_at = _utc_timestamp()

    out_roles: list[dict] = []
    log_sections: list[str] = [
        f"# Role Curation Log\n\n"
        f"Generated {generated_at} by `scripts/curate_roles.py`\n\n"
        f"This log shows, per role, which skills were kept and which were dropped (with reason).\n"
        f"Source of truth for keep decisions: `seed_data/role_curation.yaml`. Edit there + re-run.\n\n"
    ]

    # Keep going after an unresolved keep entry so every problem is reported
    # in a single run; the flag blocks the writes at the end.
    fatal = False
    for role in raw["roles"]:
        name = role["name"]
        rule = rules.get(name)
        if not rule:
            print(f"WARN: no curation rule for '{name}', keeping all skills as-is",
                  file=sys.stderr)
            out_roles.append(role)
            continue

        keep_entries = rule.get("keep") or []
        drop_reasons = rule.get("drop_reasons") or {}

        # Build the curated skill list, in the order specified by `keep`.
        kept: list[dict] = []
        kept_names: set[str] = set()
        for entry in keep_entries:
            skill_name = entry["name"]
            if skill_name in kept_names:
                continue  # duplicate keep entry: first occurrence wins
            base = find_skill(role["skills"], skill_name)
            if base is None:
                # Not in raw: fall back to the manual augmentation data.
                base = synth_from_augmentation(name, skill_name, augmentation)
            if base is None:
                print(f"FATAL: keep '{skill_name}' for '{name}' not in raw nor augmentation",
                      file=sys.stderr)
                fatal = True
                continue
            curated = dict(base)  # shallow copy so the raw record is not mutated
            for key in ("is_mandatory", "weight", "required_level", "category"):
                if key in entry:
                    curated[key] = entry[key]
            kept.append(curated)
            kept_names.add(skill_name)

        # Anything in raw that was not kept is dropped, with its logged reason.
        dropped: list[tuple[str, str]] = [
            (
                s["skill_name"],
                drop_reasons.get(
                    s["skill_name"],
                    "no reason given (please add to role_curation.yaml drop_reasons)",
                ),
            )
            for s in role["skills"]
            if s["skill_name"] not in kept_names
        ]

        # drop_reasons naming a skill that raw never had is probably a typo.
        # Sorted (by string form) so the warnings come out in a stable order.
        raw_names = {s["skill_name"] for s in role["skills"]}
        for orphan in sorted(set(drop_reasons) - raw_names, key=str):
            print(f" WARN [{name}]: drop_reasons mentions '{orphan}' but it wasn't in raw",
                  file=sys.stderr)

        out_roles.append({
            "name": name,
            "industry": role["industry"],
            "description": role["description"],
            "primary_soc": role["primary_soc"],
            "all_socs": role["all_socs"],
            "skills": kept,
        })
        log_sections.append(_render_log_section(name, kept, dropped))
        print(f" {name}: {len(kept)} kept, {len(dropped)} dropped")

    if fatal:
        print("\nABORT: see FATAL messages above. Fix role_curation.yaml and retry.",
              file=sys.stderr)
        return 1

    OUTPUT_FILE.write_text(
        f"# Curated role+skill seed for GapGuide\n"
        f"# Generated {generated_at} by scripts/curate_roles.py\n"
        f"# Edit seed_data/role_curation.yaml then re-run; do NOT hand-edit this file.\n\n"
        + yaml.safe_dump({"roles": out_roles}, sort_keys=False, allow_unicode=True, width=200),
        encoding="utf-8",
    )
    LOG_FILE.write_text("\n".join(log_sections), encoding="utf-8")

    print(f"\nWrote {OUTPUT_FILE}")
    print(f"Wrote {LOG_FILE}")
    print(f"Total: {len(out_roles)} roles, {sum(len(r['skills']) for r in out_roles)} kept skills")
    return 0
# Script entry point: run the curation and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())