File size: 4,829 Bytes
6256eb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
"""Build Skill_List.csv from Skills.csv and CASE Common Core standards JSON.

Output columns:
- index (1-based)
- skill_code
- full_description

The output contains only skill codes that appear in both:
1) Skills.csv (column: node_code)
2) CASE JSON CFItems (field: humanCodingScheme)

Rows are sorted by skill_code.
"""

from __future__ import annotations

import argparse
import csv
import html
import json
import re
from pathlib import Path
from typing import Dict, Set

_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")
_DOT_BEFORE_SUFFIX_RE = re.compile(r"(?<=\d)\.(?=[A-Za-z]$)")


def _clean_text(text: str) -> str:
    """Normalize whitespace and strip simple HTML tags from a description."""
    text = html.unescape(text or "")
    text = _TAG_RE.sub("", text)
    text = _WS_RE.sub(" ", text).strip()
    return text


def _normalize_skill_code(code: str) -> str:
    """Normalize equivalent skill code formats to a common representation.

    Example:
    - 3.MD.C.7a  -> 3.MD.C.7a
    - 3.MD.C.7.a -> 3.MD.C.7a
    """
    normalized = (code or "").strip()
    normalized = _DOT_BEFORE_SUFFIX_RE.sub("", normalized)
    return normalized


def load_skill_codes(skills_csv_path: Path) -> Set[str]:
    """Collect the distinct, non-empty node_code values from Skills.csv.

    Raises:
        ValueError: if the CSV lacks a 'node_code' column.
    """
    with skills_csv_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        if "node_code" not in (reader.fieldnames or []):
            raise ValueError(
                f"Missing required column 'node_code' in {skills_csv_path}"
            )
        return {
            stripped
            for row in reader
            if (stripped := (row.get("node_code") or "").strip())
        }


def load_case_mapping(case_json_path: Path) -> Dict[str, str]:
    """Build a skill code -> full standard statement lookup from CASE JSON CFItems.

    Only items whose CFItemType is Standard or Component are kept (in CASE,
    dotted sub-codes like 3.MD.C.7.a are often Components). Codes are
    normalized, statements are cleaned, and when the same normalized code
    appears more than once the first non-empty statement wins.
    """
    payload = json.loads(case_json_path.read_text(encoding="utf-8"))

    mapping: Dict[str, str] = {}
    for item in payload.get("CFItems", []):
        # Keep both Standards and Components; skip every other item type.
        if item.get("CFItemType") not in ("Standard", "Component"):
            continue
        code = _normalize_skill_code((item.get("humanCodingScheme") or "").strip())
        statement = _clean_text(item.get("fullStatement") or "")
        if code and statement:
            # setdefault keeps the first non-empty definition on duplicates.
            mapping.setdefault(code, statement)
    return mapping


def write_output(output_csv_path: Path, rows: list[tuple[str, str]]) -> None:
    """Write final CSV with a 1-based index."""
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)
    with output_csv_path.open("w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["index", "skill_code", "full_description"])
        for idx, (code, desc) in enumerate(rows, start=1):
            writer.writerow([idx, code, desc])


def main() -> None:
    """CLI entry point: join Skills.csv codes with CASE statements and write the CSV.

    Fixes the argparse description, which advertised "Skill_List.csv" even
    though the tool's default output path (and its --output-csv help text)
    is Skill_Set.csv. Also normalizes each skill code exactly once instead
    of re-normalizing it for matching, row building, and missing reporting.
    """
    parser = argparse.ArgumentParser(
        description="Create Skill_Set.csv from Skills.csv and CASE standards JSON."
    )
    parser.add_argument(
        "--skills-csv",
        type=Path,
        default=Path("../Data/Skills.csv"),
        help="Path to Skills.csv",
    )
    parser.add_argument(
        "--case-json",
        type=Path,
        default=Path("../Data/CASE-Common Core State Standards for Math.json"),
        help="Path to CASE JSON",
    )
    parser.add_argument(
        "--output-csv",
        type=Path,
        default=Path("../Data/Skill_Set.csv"),
        help="Output path for Skill_Set.csv",
    )

    args = parser.parse_args()

    skills_csv = args.skills_csv.resolve()
    case_json = args.case_json.resolve()
    output_csv = args.output_csv.resolve()

    skill_codes = load_skill_codes(skills_csv)
    case_map = load_case_mapping(case_json)

    # Normalize each code once so matching, row building, and the missing
    # report all agree on the same canonical form.
    normalized = {code: _normalize_skill_code(code) for code in skill_codes}

    matched_codes = sorted(code for code, norm in normalized.items() if norm in case_map)
    rows = [(code, case_map[normalized[code]]) for code in matched_codes]

    write_output(output_csv, rows)

    missing = sorted(code for code, norm in normalized.items() if norm not in case_map)
    print(f"Wrote {len(rows)} skills to {output_csv}")
    print(f"Unique skill codes in Skills.csv: {len(skill_codes)}")
    print(f"Missing codes not found in CASE: {len(missing)}")
    if missing:
        preview = ", ".join(missing[:20])
        suffix = " ..." if len(missing) > 20 else ""
        print(f"Missing preview: {preview}{suffix}")


if __name__ == "__main__":
    main()