| |
| """Build Skill_List.csv from Skills.csv and CASE Common Core standards JSON. |
| |
| Output columns: |
| - index (1-based) |
| - skill_code |
| - full_description |
| |
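| The output contains only skill codes that appear in both: |
| 1) Skills.csv (column: node_code) |
| 2) CASE JSON CFItems (field: humanCodingScheme; only items whose CFItemType is |
| "Standard" or "Component" and whose fullStatement is non-empty) |
| |
| Codes are compared after normalization, so "3.MD.C.7.a" matches "3.MD.C.7a". |
| |
| Rows are sorted by skill_code. By default the output is written to |
| ../Data/Skill_Set.csv (override with --output-csv). |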
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import csv |
| import html |
| import json |
| import re |
| from pathlib import Path |
| from typing import Dict, Set |
|
|
# Matches anything shaped like a markup tag: "<", one or more characters that
# are not ">", then ">".
_TAG_RE = re.compile(r"<[^>]+>")
# Matches any run of one or more whitespace characters.
_WS_RE = re.compile(r"\s+")
# Matches a "." that directly follows a digit and directly precedes a single
# ASCII letter ending the string (e.g. the last dot in "3.MD.C.7.a").
_DOT_BEFORE_SUFFIX_RE = re.compile(r"(?<=\d)\.(?=[A-Za-z]$)")
|
|
|
|
def _clean_text(text: str) -> str:
    """Return *text* as plain, single-spaced text.

    The steps run in this order: HTML entities are decoded, every match of
    ``_TAG_RE`` is removed, each whitespace run is collapsed to one space,
    and leading/trailing whitespace is trimmed. A falsy *text* (``None`` or
    ``""``) yields ``""``.
    """
    decoded = html.unescape(text if text else "")
    without_tags = _TAG_RE.sub("", decoded)
    return _WS_RE.sub(" ", without_tags).strip()
|
|
|
|
def _normalize_skill_code(code: str) -> str:
    """Return *code* in the canonical form used for matching.

    Surrounding whitespace is trimmed and a dot sitting between a digit and
    a single trailing letter is dropped. A falsy *code* yields ``""``.

    Example:
    - 3.MD.C.7a  -> 3.MD.C.7a   (already canonical)
    - 3.MD.C.7.a -> 3.MD.C.7a
    """
    trimmed = code.strip() if code else ""
    return _DOT_BEFORE_SUFFIX_RE.sub("", trimmed)
|
|
|
|
def load_skill_codes(skills_csv_path: Path) -> Set[str]:
    """Read the unique, non-empty skill codes from the ``node_code`` column.

    Args:
        skills_csv_path: Path to a CSV file whose header row includes a
            ``node_code`` column.

    Returns:
        The set of distinct ``node_code`` values with surrounding whitespace
        stripped. Blank or missing values are skipped.

    Raises:
        ValueError: If the header row has no ``node_code`` column.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading BOM.
    # With plain "utf-8" the BOM is glued onto the first header name, so a
    # file whose first column is node_code would fail the check below.
    with skills_csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if "node_code" not in (reader.fieldnames or []):
            raise ValueError(
                f"Missing required column 'node_code' in {skills_csv_path}"
            )
        # Short rows yield None for missing columns, hence the ``or ""``.
        stripped = ((row.get("node_code") or "").strip() for row in reader)
        return {code for code in stripped if code}
|
|
|
|
def load_case_mapping(case_json_path: Path) -> Dict[str, str]:
    """Map normalized skill code -> cleaned full statement from CASE CFItems.

    Args:
        case_json_path: Path to a CASE JSON document whose top-level object
            holds a ``CFItems`` list.

    Returns:
        A dict keyed by the normalized ``humanCodingScheme`` of each item.
        An item is included only if its code is non-empty, its
        ``CFItemType`` is ``"Standard"`` or ``"Component"``, and its cleaned
        ``fullStatement`` is non-empty. When several items share a code, the
        first one in file order wins.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged and also drops a leading BOM,
    # which the json parser otherwise rejects ("Unexpected UTF-8 BOM").
    with case_json_path.open("r", encoding="utf-8-sig") as f:
        payload = json.load(f)

    mapping: Dict[str, str] = {}
    # ``or []`` covers both a missing key and an explicit ``"CFItems": null``.
    for item in payload.get("CFItems") or []:
        raw_code = (item.get("humanCodingScheme") or "").strip()
        code = _normalize_skill_code(raw_code)
        if not code:
            continue

        if item.get("CFItemType") not in {"Standard", "Component"}:
            continue

        statement = _clean_text(item.get("fullStatement") or "")
        if not statement:
            continue

        # Keep the first statement seen for a code; later duplicates are ignored.
        mapping.setdefault(code, statement)

    return mapping
|
|
|
|
def write_output(output_csv_path: Path, rows: list[tuple[str, str]]) -> None:
    """Write *rows* to *output_csv_path* as CSV, numbering them from 1.

    Missing parent directories are created. The file starts with the header
    ``index, skill_code, full_description``; each ``(code, description)``
    pair then becomes one row prefixed with its 1-based position.
    """
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)
    header = ["index", "skill_code", "full_description"]
    numbered = (
        [position, skill_code, description]
        for position, (skill_code, description) in enumerate(rows, start=1)
    )
    with output_csv_path.open("w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(numbered)
|
|
|
|
def main() -> None:
    """Command-line entry point.

    Parses ``--skills-csv``, ``--case-json`` and ``--output-csv``, loads the
    skill codes and the CASE mapping, and writes one row per skill code whose
    normalized form is present in the mapping (sorted by the code as it
    appears in Skills.csv). It then prints a short summary that previews up
    to 20 unmatched codes.
    """
    cli = argparse.ArgumentParser(
        description="Create Skill_List.csv from Skills.csv and CASE standards JSON."
    )
    cli.add_argument(
        "--skills-csv",
        type=Path,
        default=Path("../Data/Skills.csv"),
        help="Path to Skills.csv",
    )
    cli.add_argument(
        "--case-json",
        type=Path,
        default=Path("../Data/CASE-Common Core State Standards for Math.json"),
        help="Path to CASE JSON",
    )
    cli.add_argument(
        "--output-csv",
        type=Path,
        default=Path("../Data/Skill_Set.csv"),
        help="Output path for Skill_Set.csv",
    )
    options = cli.parse_args()

    # Relative paths are resolved against the current working directory.
    skills_path = options.skills_csv.resolve()
    case_path = options.case_json.resolve()
    out_path = options.output_csv.resolve()

    skill_codes = load_skill_codes(skills_path)
    case_map = load_case_mapping(case_path)

    # One sorted pass splits the codes into matched rows and missing codes;
    # both lists therefore come out sorted by the raw Skills.csv code.
    rows: list[tuple[str, str]] = []
    missing: list[str] = []
    for code in sorted(skill_codes):
        description = case_map.get(_normalize_skill_code(code))
        if description is None:
            missing.append(code)
        else:
            rows.append((code, description))

    write_output(out_path, rows)

    print(f"Wrote {len(rows)} skills to {out_path}")
    print(f"Unique skill codes in Skills.csv: {len(skill_codes)}")
    print(f"Missing codes not found in CASE: {len(missing)}")
    if missing:
        preview = ", ".join(missing[:20])
        suffix = " ..." if len(missing) > 20 else ""
        print(f"Missing preview: {preview}{suffix}")
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|