#!/usr/bin/env python3
"""Build the skill list CSV from Skills.csv and CASE Common Core standards JSON.

Output columns:
- index (1-based)
- skill_code
- full_description

The output contains only skill codes that appear in both:
1) Skills.csv (column: node_code)
2) CASE JSON CFItems (field: humanCodingScheme)

Codes are compared after normalization (``3.MD.C.7.a`` and ``3.MD.C.7a`` are
treated as the same code), but the output keeps the spelling found in
Skills.csv. Rows are sorted by skill_code.

The default output path is ``../Data/Skill_Set.csv``. All default paths are
relative paths and are therefore resolved against the current working
directory.
"""

from __future__ import annotations

import argparse
import csv
import html
import json
import re
from pathlib import Path
from typing import Dict, Set

# Match only things that start like real markup: "<" immediately followed by a
# letter, "/", "!" or "?". A bare "<" followed by a space or digit is ordinary
# text (e.g. "3 < 5 and 7 > 2" in a math standard) and must be preserved.
_TAG_RE = re.compile(r"<[A-Za-z/!?][^>]*>")
_WS_RE = re.compile(r"\s+")
# A dot that sits between a digit and a single trailing letter ("7.a" -> "7a").
_DOT_BEFORE_SUFFIX_RE = re.compile(r"(?<=\d)\.(?=[A-Za-z]$)")


def _clean_text(text: str) -> str:
    """Normalize whitespace and strip simple HTML tags from a description.

    HTML entities are decoded first, then tags are removed, then every run of
    whitespace is collapsed to a single space and the result is trimmed.
    ``None`` or an empty value yields ``""``.
    """
    text = html.unescape(text or "")
    text = _TAG_RE.sub("", text)
    return _WS_RE.sub(" ", text).strip()


def _normalize_skill_code(code: str) -> str:
    """Normalize equivalent skill code formats to a common representation.

    Example:
    - 3.MD.C.7a  -> 3.MD.C.7a
    - 3.MD.C.7.a -> 3.MD.C.7a

    Surrounding whitespace is removed; ``None`` or an empty value yields ``""``.
    """
    normalized = (code or "").strip()
    return _DOT_BEFORE_SUFFIX_RE.sub("", normalized)


def load_skill_codes(skills_csv_path: Path) -> Set[str]:
    """Read unique skill codes from the ``node_code`` column of Skills.csv.

    Args:
        skills_csv_path: Path to a CSV file with a header row.

    Returns:
        The set of non-empty, whitespace-trimmed ``node_code`` values, exactly
        as spelled in the file (no normalization is applied here).

    Raises:
        ValueError: If the header has no ``node_code`` column.
    """
    codes: Set[str] = set()
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise glue itself onto the first header name ("\ufeffnode_code").
    with skills_csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if "node_code" not in (reader.fieldnames or []):
            raise ValueError(
                f"Missing required column 'node_code' in {skills_csv_path}"
            )
        for row in reader:
            code = (row.get("node_code") or "").strip()
            if code:
                codes.add(code)
    return codes


def load_case_mapping(case_json_path: Path) -> Dict[str, str]:
    """Map skill code -> full standard statement from CASE JSON CFItems.

    Args:
        case_json_path: Path to a JSON document whose top-level object holds a
            ``CFItems`` list.

    Returns:
        A dict keyed by the normalized ``humanCodingScheme`` with the cleaned
        ``fullStatement`` as value. Items without a code, without a statement,
        or whose ``CFItemType`` is neither ``Standard`` nor ``Component`` are
        skipped. A missing or null ``CFItems`` yields an empty dict.
    """
    # "utf-8-sig" tolerates an optional BOM, which json.load rejects otherwise.
    with case_json_path.open("r", encoding="utf-8-sig") as f:
        payload = json.load(f)

    mapping: Dict[str, str] = {}
    # `or []` also covers an explicit `"CFItems": null`.
    for item in payload.get("CFItems") or []:
        raw_code = (item.get("humanCodingScheme") or "").strip()
        code = _normalize_skill_code(raw_code)
        if not code:
            continue

        # Keep both Standards and Components.
        # In CASE, codes like 3.MD.C.7.a are often CFItemType=Component.
        if item.get("CFItemType") not in {"Standard", "Component"}:
            continue

        statement = _clean_text(item.get("fullStatement") or "")
        if not statement:
            continue

        # Keep first non-empty definition if duplicates appear.
        if code not in mapping:
            mapping[code] = statement

    return mapping


def write_output(output_csv_path: Path, rows: list[tuple[str, str]]) -> None:
    """Write the final CSV with a 1-based index column.

    Args:
        output_csv_path: Destination file; missing parent directories are
            created and an existing file is overwritten.
        rows: ``(skill_code, full_description)`` pairs in output order.
    """
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)
    with output_csv_path.open("w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["index", "skill_code", "full_description"])
        for idx, (code, desc) in enumerate(rows, start=1):
            writer.writerow([idx, code, desc])


def main(argv: list[str] | None = None) -> None:
    """Parse command-line options, build the skill list CSV, print a summary.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behavior.
    """
    # NOTE(review): the description says "Skill_List.csv" while the default
    # output file is "Skill_Set.csv" -- confirm which name is intended.
    parser = argparse.ArgumentParser(
        description="Create Skill_List.csv from Skills.csv and CASE standards JSON."
    )
    parser.add_argument(
        "--skills-csv",
        type=Path,
        default=Path("../Data/Skills.csv"),
        help="Path to Skills.csv",
    )
    parser.add_argument(
        "--case-json",
        type=Path,
        default=Path("../Data/CASE-Common Core State Standards for Math.json"),
        help="Path to CASE JSON",
    )
    parser.add_argument(
        "--output-csv",
        type=Path,
        default=Path("../Data/Skill_Set.csv"),
        help="Output path for Skill_Set.csv",
    )
    args = parser.parse_args(argv)

    skills_csv = args.skills_csv.resolve()
    case_json = args.case_json.resolve()
    output_csv = args.output_csv.resolve()

    skill_codes = load_skill_codes(skills_csv)
    case_map = load_case_mapping(case_json)

    # Single pass over the sorted codes: each code is normalized once and lands
    # in exactly one of the two lists, both of which stay sorted.
    rows: list[tuple[str, str]] = []
    missing: list[str] = []
    for code in sorted(skill_codes):
        statement = case_map.get(_normalize_skill_code(code))
        if statement is None:
            missing.append(code)
        else:
            rows.append((code, statement))

    write_output(output_csv, rows)

    print(f"Wrote {len(rows)} skills to {output_csv}")
    print(f"Unique skill codes in Skills.csv: {len(skill_codes)}")
    print(f"Missing codes not found in CASE: {len(missing)}")
    if missing:
        preview = ", ".join(missing[:20])
        suffix = " ..." if len(missing) > 20 else ""
        print(f"Missing preview: {preview}{suffix}")


if __name__ == "__main__":
    main()