# Source: gapguide-api/scripts/curate_roles.py (commit ffd36e0, "Deploy GapGuide backend (Docker)")
"""Apply role_curation.yaml to onet_roles_raw.yaml.
Reads:
backend/seed_data/onet_roles_raw.yaml
backend/seed_data/role_curation.yaml
backend/seed_data/manual_skill_augmentation.yaml (for skills mentioned in keep but absent from raw)
Writes:
backend/seed_data/onet_roles_curated.yaml (final seed-ready file)
backend/seed_data/onet_roles_curation_log.md (per-role audit log of keeps + drops + reasons)
Logic:
For each role:
1. Take its keep list from role_curation.yaml.
2. For each keep entry, find it in raw skills OR in augmentation; if found nowhere, error.
3. Apply optional is_mandatory / weight / required_level / category overrides from the keep entry.
4. Anything in raw NOT in keep is DROPPED — record reason from drop_reasons map (or "no reason given").
5. Write curated YAML with same shape as raw, preceded by a comment header carrying the generation timestamp.
"""
from __future__ import annotations
import sys
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Directory two levels above this script's resolved location.
REPO_ROOT = Path(__file__).resolve().parent.parent
# All inputs and outputs of this script live under <REPO_ROOT>/seed_data.
SEED_DIR = REPO_ROOT.joinpath("seed_data")

# Inputs (read by main()).
RAW_FILE = SEED_DIR.joinpath("onet_roles_raw.yaml")
CURATION_FILE = SEED_DIR.joinpath("role_curation.yaml")
AUGMENTATION_FILE = SEED_DIR.joinpath("manual_skill_augmentation.yaml")

# Outputs (written by main() only when no FATAL error occurred).
OUTPUT_FILE = SEED_DIR.joinpath("onet_roles_curated.yaml")
LOG_FILE = SEED_DIR.joinpath("onet_roles_curation_log.md")
def find_skill(skills: list[dict], name: str) -> dict | None:
    """Return the first entry of *skills* whose ``skill_name`` equals *name*.

    The matching dict itself (not a copy) is returned; ``None`` is returned
    when no entry matches. Entries lacking a ``skill_name`` key raise
    ``KeyError`` if they are reached before a match.
    """
    matches = (candidate for candidate in skills if candidate["skill_name"] == name)
    return next(matches, None)
def synth_from_augmentation(role_name: str, skill_name: str, aug_doc: dict) -> dict | None:
    """Build a skill record for *skill_name* from the augmentation document.

    Used when a keep entry is not present in a role's raw skills.

    Args:
        role_name: Role whose augmentation list is searched.
        skill_name: Value matched against each augmentation entry's ``skill`` key.
        aug_doc: Parsed augmentation YAML. ``None`` (what ``yaml.safe_load``
            returns for an empty file) is treated as "no augmentation".

    Returns:
        A dict shaped like a raw skill record (``source`` set to ``"manual"``,
        ``source_soc`` set to ``None``), or ``None`` when the role has no
        matching augmentation entry.

    Raises:
        KeyError: if a matching entry lacks ``is_mandatory``, ``weight`` or
            ``required_level`` (only ``category`` has a default).
    """
    # A YAML key with no value parses as None, so every level of the lookup
    # falls back to an empty container instead of calling .get()/iterating on None.
    per_role = (aug_doc or {}).get("augmentation") or {}
    role_aug = per_role.get(role_name) or []
    for entry in role_aug:
        if entry["skill"] == skill_name:
            return {
                "skill_name": skill_name,
                "category": entry.get("category", "Tools"),
                "source": "manual",
                "source_soc": None,
                "is_mandatory": entry["is_mandatory"],
                "weight": entry["weight"],
                "required_level": entry["required_level"],
            }
    return None
def _utc_timestamp() -> str:
    """Return the current UTC time as an ISO-8601 string ending in a single ``Z``."""
    # isoformat() on an aware UTC datetime already ends in "+00:00"; appending
    # "Z" on top of that produced the malformed suffix "+00:00Z".
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


def _curate_role(
    role: dict, rule: dict, augmentation: dict
) -> tuple[list[dict], list[tuple[str, str]], bool]:
    """Apply one role's curation rule; return ``(kept, dropped, ok)``.

    ``kept`` follows the order of the rule's ``keep`` list, ``dropped`` pairs
    each raw skill name that was not kept with its reason, and ``ok`` is False
    when a keep entry was found in neither raw nor augmentation.
    """
    name = role["name"]
    # `keep:` / `drop_reasons:` with no value parse as None in YAML.
    keep_entries = rule.get("keep") or []
    drop_reasons = rule.get("drop_reasons") or {}
    ok = True

    kept: list[dict] = []
    kept_names: set[str] = set()
    for entry in keep_entries:
        skill_name = entry["name"]
        if skill_name in kept_names:
            # The first occurrence wins; say so, because any overrides on the
            # duplicate entry are discarded.
            print(f" WARN [{name}]: duplicate keep entry '{skill_name}' ignored",
                  file=sys.stderr)
            continue
        base = find_skill(role["skills"], skill_name)
        if base is None:
            # Not in raw: fall back to the manual augmentation document.
            base = synth_from_augmentation(name, skill_name, augmentation)
        if base is None:
            print(f"FATAL: keep '{skill_name}' for '{name}' not in raw nor augmentation",
                  file=sys.stderr)
            ok = False
            continue
        curated = dict(base)  # shallow copy so the raw record is not mutated
        for key in ("is_mandatory", "weight", "required_level", "category"):
            if key in entry:
                curated[key] = entry[key]
        kept.append(curated)
        kept_names.add(skill_name)

    # Every raw skill that was not kept is reported as dropped, with its reason.
    raw_names = {s["skill_name"] for s in role["skills"]}
    dropped: list[tuple[str, str]] = [
        (
            s["skill_name"],
            drop_reasons.get(
                s["skill_name"],
                "no reason given (please add to role_curation.yaml drop_reasons)",
            ),
        )
        for s in role["skills"]
        if s["skill_name"] not in kept_names
    ]

    # drop_reasons naming a skill that is absent from raw is probably a typo.
    # Iterate in file order so the warnings are deterministic between runs.
    for orphan in drop_reasons:
        if orphan not in raw_names:
            print(f" WARN [{name}]: drop_reasons mentions '{orphan}' but it wasn't in raw",
                  file=sys.stderr)

    return kept, dropped, ok


def _render_log_section(role_name: str, kept: list[dict], dropped: list[tuple[str, str]]) -> str:
    """Render the Markdown audit section (kept + dropped skills) for one role."""
    kept_lines = "\n".join(
        f"- `{s['skill_name']}` "
        f"({'mandatory' if s['is_mandatory'] else 'optional'}, "
        f"weight={s['weight']}, level={s['required_level']}, source={s['source']})"
        for s in kept
    )
    if dropped:
        dropped_lines = "\n".join(f"- `{skill}` — {reason}" for skill, reason in dropped)
    else:
        dropped_lines = "_none_"
    return (
        f"## {role_name}\n\n"
        f"**Kept ({len(kept)} skills):**\n\n"
        + kept_lines
        + f"\n\n**Dropped ({len(dropped)} skills):**\n\n"
        + dropped_lines
        + "\n\n"
    )


def main() -> int:
    """Curate the raw roles file and write the curated seed plus an audit log.

    Reads RAW_FILE, CURATION_FILE and AUGMENTATION_FILE. Roles without a
    curation rule are passed through unchanged (with a warning on stderr).
    For the others, only the skills in the rule's ``keep`` list are retained,
    with per-entry overrides applied.

    Returns:
        0 after writing OUTPUT_FILE and LOG_FILE; 1 (nothing written) when any
        keep entry could not be resolved in raw or augmentation.
    """
    raw = yaml.safe_load(RAW_FILE.read_text(encoding="utf-8"))
    # An empty YAML document parses as None; treat that as "no rules" /
    # "no augmentation" rather than crashing on None.get().
    curation = yaml.safe_load(CURATION_FILE.read_text(encoding="utf-8")) or {}
    augmentation = yaml.safe_load(AUGMENTATION_FILE.read_text(encoding="utf-8")) or {}
    rules = curation.get("curation") or {}

    # One timestamp for both outputs, so the log and the seed file agree.
    generated_at = _utc_timestamp()

    out_roles: list[dict] = []
    log_sections: list[str] = [
        f"# Role Curation Log\n\n"
        f"Generated {generated_at} by `scripts/curate_roles.py`\n\n"
        f"This log shows, per role, which skills were kept and which were dropped (with reason).\n"
        f"Source of truth for keep decisions: `seed_data/role_curation.yaml`. Edit there + re-run.\n\n"
    ]
    fatal = False

    for role in raw["roles"]:
        name = role["name"]
        rule = rules.get(name)
        if not rule:
            print(f"WARN: no curation rule for '{name}', keeping all skills as-is",
                  file=sys.stderr)
            out_roles.append(role)
            continue

        kept, dropped, ok = _curate_role(role, rule, augmentation)
        if not ok:
            fatal = True

        out_roles.append({
            "name": name,
            "industry": role["industry"],
            "description": role["description"],
            "primary_soc": role["primary_soc"],
            "all_socs": role["all_socs"],
            "skills": kept,
        })
        log_sections.append(_render_log_section(name, kept, dropped))
        print(f" {name}: {len(kept)} kept, {len(dropped)} dropped")

    if fatal:
        # Keep processing all roles first so every problem is reported in one run.
        print("\nABORT: see FATAL messages above. Fix role_curation.yaml and retry.",
              file=sys.stderr)
        return 1

    OUTPUT_FILE.write_text(
        f"# Curated role+skill seed for GapGuide\n"
        f"# Generated {generated_at} by scripts/curate_roles.py\n"
        f"# Edit seed_data/role_curation.yaml then re-run; do NOT hand-edit this file.\n\n"
        + yaml.safe_dump({"roles": out_roles}, sort_keys=False, allow_unicode=True, width=200),
        encoding="utf-8",
    )
    LOG_FILE.write_text("\n".join(log_sections), encoding="utf-8")
    print(f"\nWrote {OUTPUT_FILE}")
    print(f"Wrote {LOG_FILE}")
    print(f"Total: {len(out_roles)} roles, {sum(len(r['skills']) for r in out_roles)} kept skills")
    return 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())