"""Apply role_curation.yaml to onet_roles_raw.yaml. Reads: backend/seed_data/onet_roles_raw.yaml backend/seed_data/role_curation.yaml backend/seed_data/manual_skill_augmentation.yaml (for skills mentioned in keep but absent from raw) Writes: backend/seed_data/onet_roles_curated.yaml (final seed-ready file) backend/seed_data/onet_roles_curation_log.md (per-role audit log of keeps + drops + reasons) Logic: For each role: 1. Take its keep list from role_curation.yaml. 2. For each keep entry, find it in raw skills OR in augmentation; if found nowhere, error. 3. Apply optional weight / is_mandatory / required_level overrides. 4. Anything in raw NOT in keep is DROPPED — record reason from drop_reasons map (or "no reason given"). 5. Write curated YAML with same shape as raw, plus a curated_at timestamp at the top. """ from __future__ import annotations import sys from datetime import datetime, timezone from pathlib import Path import yaml REPO_ROOT = Path(__file__).resolve().parent.parent SEED_DIR = REPO_ROOT / "seed_data" RAW_FILE = SEED_DIR / "onet_roles_raw.yaml" CURATION_FILE = SEED_DIR / "role_curation.yaml" AUGMENTATION_FILE = SEED_DIR / "manual_skill_augmentation.yaml" OUTPUT_FILE = SEED_DIR / "onet_roles_curated.yaml" LOG_FILE = SEED_DIR / "onet_roles_curation_log.md" def find_skill(skills: list[dict], name: str) -> dict | None: for s in skills: if s["skill_name"] == name: return s return None def synth_from_augmentation(role_name: str, skill_name: str, aug_doc: dict) -> dict | None: """If a keep entry isn't in raw, see if it lives in the augmentation YAML.""" role_aug = aug_doc.get("augmentation", {}).get(role_name, []) for entry in role_aug: if entry["skill"] == skill_name: return { "skill_name": skill_name, "category": entry.get("category", "Tools"), "source": "manual", "source_soc": None, "is_mandatory": entry["is_mandatory"], "weight": entry["weight"], "required_level": entry["required_level"], } return None def main() -> int: raw = yaml.safe_load(RAW_FILE.read_text(encoding="utf-8")) curation = yaml.safe_load(CURATION_FILE.read_text(encoding="utf-8")) augmentation = yaml.safe_load(AUGMENTATION_FILE.read_text(encoding="utf-8")) out_roles: list[dict] = [] log_sections: list[str] = [] log_sections.append( f"# Role Curation Log\n\n" f"Generated {datetime.now(timezone.utc).isoformat()}Z by `scripts/curate_roles.py`\n\n" f"This log shows, per role, which skills were kept and which were dropped (with reason).\n" f"Source of truth for keep decisions: `seed_data/role_curation.yaml`. Edit there + re-run.\n\n" ) fatal = False for role in raw["roles"]: name = role["name"] rule = curation.get("curation", {}).get(name) if not rule: print(f"WARN: no curation rule for '{name}', keeping all skills as-is", file=sys.stderr) out_roles.append(role) continue keep_entries = rule.get("keep", []) drop_reasons = rule.get("drop_reasons", {}) # Build the curated skill list, in the order specified by `keep` kept: list[dict] = [] kept_names: set[str] = set() for entry in keep_entries: skill_name = entry["name"] if skill_name in kept_names: continue base = find_skill(role["skills"], skill_name) if base is None: # Try augmentation fallback base = synth_from_augmentation(name, skill_name, augmentation) if base is None: print(f"FATAL: keep '{skill_name}' for '{name}' not in raw nor augmentation", file=sys.stderr) fatal = True continue entry_dict = dict(base) # shallow copy # Apply overrides if present for k in ("is_mandatory", "weight", "required_level", "category"): if k in entry: entry_dict[k] = entry[k] kept.append(entry_dict) kept_names.add(skill_name) # Anything in raw not kept is dropped — log reason dropped: list[tuple[str, str]] = [] raw_names = {s["skill_name"] for s in role["skills"]} for s in role["skills"]: if s["skill_name"] not in kept_names: reason = drop_reasons.get(s["skill_name"], "no reason given (please add to role_curation.yaml drop_reasons)") dropped.append((s["skill_name"], reason)) # Sanity: any keep referenced a skill that is NOT in raw and NOT in augmentation? Already errored above. # Also: drop_reasons referencing a skill that wasn't in raw — warn. for orphan in set(drop_reasons.keys()) - raw_names: print(f" WARN [{name}]: drop_reasons mentions '{orphan}' but it wasn't in raw", file=sys.stderr) out_roles.append({ "name": name, "industry": role["industry"], "description": role["description"], "primary_soc": role["primary_soc"], "all_socs": role["all_socs"], "skills": kept, }) log_sections.append( f"## {name}\n\n" f"**Kept ({len(kept)} skills):**\n\n" + "\n".join( f"- `{s['skill_name']}` " f"({'mandatory' if s['is_mandatory'] else 'optional'}, " f"weight={s['weight']}, level={s['required_level']}, source={s['source']})" for s in kept ) + f"\n\n**Dropped ({len(dropped)} skills):**\n\n" + ("\n".join(f"- `{name}` — {reason}" for name, reason in dropped) if dropped else "_none_") + "\n\n" ) print(f" {name}: {len(kept)} kept, {len(dropped)} dropped") if fatal: print("\nABORT: see FATAL messages above. Fix role_curation.yaml and retry.", file=sys.stderr) return 1 OUTPUT_FILE.write_text( f"# Curated role+skill seed for GapGuide\n" f"# Generated {datetime.now(timezone.utc).isoformat()}Z by scripts/curate_roles.py\n" f"# Edit seed_data/role_curation.yaml then re-run; do NOT hand-edit this file.\n\n" + yaml.safe_dump({"roles": out_roles}, sort_keys=False, allow_unicode=True, width=200), encoding="utf-8", ) LOG_FILE.write_text("\n".join(log_sections), encoding="utf-8") print(f"\nWrote {OUTPUT_FILE}") print(f"Wrote {LOG_FILE}") print(f"Total: {len(out_roles)} roles, {sum(len(r['skills']) for r in out_roles)} kept skills") return 0 if __name__ == "__main__": sys.exit(main())