#!/usr/bin/env python3
"""Build Skill_List.csv from Skills.csv and CASE Common Core standards JSON.
Output columns:
- index (1-based)
- skill_code
- full_description
The output contains only skill codes that appear in both:
1) Skills.csv (column: node_code)
2) CASE JSON CFItems (field: humanCodingScheme)
Rows are sorted by skill_code.
"""
from __future__ import annotations
import argparse
import csv
import html
import json
import re
from pathlib import Path
from typing import Dict, Set
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")
_DOT_BEFORE_SUFFIX_RE = re.compile(r"(?<=\d)\.(?=[A-Za-z]$)")
def _clean_text(text: str) -> str:
"""Normalize whitespace and strip simple HTML tags from a description."""
text = html.unescape(text or "")
text = _TAG_RE.sub("", text)
text = _WS_RE.sub(" ", text).strip()
return text
def _normalize_skill_code(code: str) -> str:
"""Normalize equivalent skill code formats to a common representation.
Example:
- 3.MD.C.7a -> 3.MD.C.7a
- 3.MD.C.7.a -> 3.MD.C.7a
"""
normalized = (code or "").strip()
normalized = _DOT_BEFORE_SUFFIX_RE.sub("", normalized)
return normalized
def load_skill_codes(skills_csv_path: Path) -> Set[str]:
    """Return the unique, non-empty ``node_code`` values from Skills.csv.

    Raises:
        ValueError: if the CSV lacks a ``node_code`` column.
    """
    with skills_csv_path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        if "node_code" not in (reader.fieldnames or []):
            raise ValueError(
                f"Missing required column 'node_code' in {skills_csv_path}"
            )
        # Whitespace-only cells are treated the same as empty ones.
        return {
            stripped
            for row in reader
            if (stripped := (row.get("node_code") or "").strip())
        }
def load_case_mapping(case_json_path: Path) -> Dict[str, str]:
    """Map normalized skill code -> cleaned full statement from CASE CFItems.

    Only items whose CFItemType is "Standard" or "Component" contribute;
    when a code appears more than once, the first non-empty statement wins.
    """
    payload = json.loads(case_json_path.read_text(encoding="utf-8"))
    mapping: Dict[str, str] = {}
    for item in payload.get("CFItems", []):
        # Keep both Standards and Components: in CASE data, codes like
        # 3.MD.C.7.a are often CFItemType=Component.
        if item.get("CFItemType") not in {"Standard", "Component"}:
            continue
        code = _normalize_skill_code((item.get("humanCodingScheme") or "").strip())
        statement = _clean_text(item.get("fullStatement") or "")
        # Skip blanks; keep the first definition when duplicates appear.
        if code and statement and code not in mapping:
            mapping[code] = statement
    return mapping
def write_output(output_csv_path: Path, rows: list[tuple[str, str]]) -> None:
    """Write (code, description) pairs to CSV, numbering rows from 1.

    Creates any missing parent directories before writing.
    """
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)
    header = ["index", "skill_code", "full_description"]
    numbered = [[i, code, desc] for i, (code, desc) in enumerate(rows, start=1)]
    with output_csv_path.open("w", encoding="utf-8", newline="") as handle:
        csv.writer(handle).writerows([header] + numbered)
def main() -> None:
    """CLI entry point: build the skill-set CSV.

    Intersects the skill codes found in Skills.csv with the standards in the
    CASE JSON, writes the matched (code, description) rows sorted by code,
    and prints a summary including any codes missing from CASE.
    """
    parser = argparse.ArgumentParser(
        # Fixed: previously advertised "Skill_List.csv", which contradicted
        # the default output path and the --output-csv help text below.
        description="Create Skill_Set.csv from Skills.csv and CASE standards JSON."
    )
    parser.add_argument(
        "--skills-csv",
        type=Path,
        default=Path("../Data/Skills.csv"),
        help="Path to Skills.csv",
    )
    parser.add_argument(
        "--case-json",
        type=Path,
        default=Path("../Data/CASE-Common Core State Standards for Math.json"),
        help="Path to CASE JSON",
    )
    parser.add_argument(
        "--output-csv",
        type=Path,
        default=Path("../Data/Skill_Set.csv"),
        help="Output path for Skill_Set.csv",
    )
    args = parser.parse_args()
    skills_csv = args.skills_csv.resolve()
    case_json = args.case_json.resolve()
    output_csv = args.output_csv.resolve()

    skill_codes = load_skill_codes(skills_csv)
    case_map = load_case_mapping(case_json)

    # Normalize each code exactly once so matching, lookup, and the missing
    # report all agree (the original re-normalized on every comparison).
    normalized = {code: _normalize_skill_code(code) for code in skill_codes}
    matched_codes = sorted(code for code in skill_codes if normalized[code] in case_map)
    rows = [(code, case_map[normalized[code]]) for code in matched_codes]
    write_output(output_csv, rows)

    missing = sorted(code for code in skill_codes if normalized[code] not in case_map)
    print(f"Wrote {len(rows)} skills to {output_csv}")
    print(f"Unique skill codes in Skills.csv: {len(skill_codes)}")
    print(f"Missing codes not found in CASE: {len(missing)}")
    if missing:
        # Show at most 20 missing codes to keep console output readable.
        preview = ", ".join(missing[:20])
        suffix = " ..." if len(missing) > 20 else ""
        print(f"Missing preview: {preview}{suffix}")
if __name__ == "__main__":
main()