FoundationalASSIST / Code /build_skill_set.py
martinakaduc's picture
Upload folder using huggingface_hub
6256eb9 verified
#!/usr/bin/env python3
"""Build Skill_List.csv from Skills.csv and CASE Common Core standards JSON.
Output columns:
- index (1-based)
- skill_code
- full_description
The output contains only skill codes that appear in both:
1) Skills.csv (column: node_code)
2) CASE JSON CFItems (field: humanCodingScheme)
Rows are sorted by skill_code.
"""
from __future__ import annotations
import argparse
import csv
import html
import json
import re
from pathlib import Path
from typing import Dict, Set
# Matches markup only: "<" must be followed immediately by a letter, "/", "!"
# or "?" (elements, closing tags, comments/doctype, processing instructions).
# A bare "<" used as a comparison operator ("x < c ... y > d") is therefore
# left alone instead of swallowing everything up to the next ">".
_TAG_RE = re.compile(r"<[A-Za-z/!?][^>]*>")
# Any run of whitespace (collapsed to a single space by _clean_text).
_WS_RE = re.compile(r"\s+")
# A "." that sits between a digit and one final ASCII letter, e.g. "7.a".
_DOT_BEFORE_SUFFIX_RE = re.compile(r"(?<=\d)\.(?=[A-Za-z]$)")


def _clean_text(text: str) -> str:
    """Return *text* as plain, single-spaced text.

    Steps, in order:
    1. Treat a falsy input (``None`` or ``""``) as the empty string.
    2. Decode HTML character references (``&amp;`` -> ``&``).
    3. Remove HTML-like tags (replaced by nothing, not by a space).
    4. Collapse every whitespace run to one space and trim both ends.

    Only spans that look like markup are removed, so inequality signs in
    math statements such as ``x > c or x < c`` survive. A ``<`` directly
    followed by a letter (``a<b and c>d``) is still read as a tag.
    """
    decoded = html.unescape(text or "")
    without_tags = _TAG_RE.sub("", decoded)
    return _WS_RE.sub(" ", without_tags).strip()
def _normalize_skill_code(code: str) -> str:
    """Return the canonical spelling of a skill code.

    Surrounding whitespace is removed, and a dot that separates a digit from
    a single trailing letter is dropped, so both spellings of a lettered
    sub-item compare equal:

    - 3.MD.C.7a  -> 3.MD.C.7a
    - 3.MD.C.7.a -> 3.MD.C.7a

    A falsy input (``None`` or ``""``) yields the empty string.
    """
    trimmed = code.strip() if code else ""
    return _DOT_BEFORE_SUFFIX_RE.sub("", trimmed)
def load_skill_codes(skills_csv_path: Path) -> Set[str]:
    """Read the unique, non-empty skill codes from a Skills.csv file.

    Args:
        skills_csv_path: CSV file with a header row containing ``node_code``.

    Returns:
        The set of ``node_code`` values with surrounding whitespace removed.
        Blank values are skipped. Codes are returned as written in the file
        (no format normalization is applied here).

    Raises:
        ValueError: if the header has no ``node_code`` column.
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark.
    # With plain "utf-8" the BOM stays attached to the first header name
    # ("\ufeffnode_code"), and the column check below would reject the file.
    with skills_csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if "node_code" not in (reader.fieldnames or []):
            raise ValueError(
                f"Missing required column 'node_code' in {skills_csv_path}"
            )
        # A short row yields None for a missing cell, hence the `or ""`.
        stripped = ((row.get("node_code") or "").strip() for row in reader)
        return {code for code in stripped if code}
def load_case_mapping(case_json_path: Path) -> Dict[str, str]:
    """Map normalized skill code -> cleaned full statement from CASE CFItems.

    Args:
        case_json_path: JSON document whose top-level object may carry a
            ``CFItems`` list of item objects.

    Returns:
        A dict keyed by the normalized ``humanCodingScheme`` of each item,
        with the cleaned ``fullStatement`` as value. An item is skipped when
        its code is empty, its ``CFItemType`` is neither ``"Standard"`` nor
        ``"Component"``, or its cleaned statement is empty. When several
        items share a normalized code, the first usable one wins.
    """
    # "utf-8-sig" also accepts a BOM-prefixed file; json.load refuses text
    # that still starts with a BOM.
    with case_json_path.open("r", encoding="utf-8-sig") as f:
        payload = json.load(f)

    mapping: Dict[str, str] = {}
    # `or []` covers both a missing key and an explicit `"CFItems": null`.
    for item in payload.get("CFItems") or []:
        raw_code = (item.get("humanCodingScheme") or "").strip()
        code = _normalize_skill_code(raw_code)
        if not code:
            continue
        # Keep both Standards and Components.
        # In CASE, codes like 3.MD.C.7.a are often CFItemType=Component.
        if item.get("CFItemType") not in {"Standard", "Component"}:
            continue
        statement = _clean_text(item.get("fullStatement") or "")
        if not statement:
            continue
        # Keep first non-empty definition if duplicates appear.
        mapping.setdefault(code, statement)
    return mapping
def write_output(output_csv_path: Path, rows: list[tuple[str, str]]) -> None:
    """Write the skill table as CSV, numbering the rows from 1.

    The parent directory is created when missing. The file starts with the
    header ``index, skill_code, full_description`` followed by one line per
    ``(skill_code, full_description)`` pair, in the order given.
    """
    header = ["index", "skill_code", "full_description"]
    numbered = (
        [position, skill_code, description]
        for position, (skill_code, description) in enumerate(rows, start=1)
    )
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)
    with output_csv_path.open("w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(numbered)
def main() -> None:
    """Command-line entry point: build Skill_Set.csv and print a summary.

    Reads the skill codes from ``--skills-csv`` and the code -> statement
    mapping from ``--case-json``, then writes to ``--output-csv`` one row per
    skill code whose normalized form is present in the mapping, sorted by
    skill code. Afterwards prints how many rows were written, how many unique
    codes the CSV held, and how many had no match (with up to 20 examples).

    The default paths are relative, so they are resolved against the current
    working directory.
    """
    parser = argparse.ArgumentParser(
        description="Create Skill_Set.csv from Skills.csv and CASE standards JSON."
    )
    parser.add_argument(
        "--skills-csv",
        type=Path,
        default=Path("../Data/Skills.csv"),
        help="Path to Skills.csv",
    )
    parser.add_argument(
        "--case-json",
        type=Path,
        default=Path("../Data/CASE-Common Core State Standards for Math.json"),
        help="Path to CASE JSON",
    )
    parser.add_argument(
        "--output-csv",
        type=Path,
        default=Path("../Data/Skill_Set.csv"),
        help="Output path for Skill_Set.csv",
    )
    args = parser.parse_args()

    skills_csv = args.skills_csv.resolve()
    case_json = args.case_json.resolve()
    output_csv = args.output_csv.resolve()

    skill_codes = load_skill_codes(skills_csv)
    case_map = load_case_mapping(case_json)

    # One sorted pass splits the codes into matched rows and misses; both
    # lists inherit the sort order. The output keeps the code exactly as it
    # was read from Skills.csv; only the lookup key is normalized.
    rows = []
    missing = []
    for code in sorted(skill_codes):
        description = case_map.get(_normalize_skill_code(code))
        if description is None:
            missing.append(code)
        else:
            rows.append((code, description))

    write_output(output_csv, rows)

    print(f"Wrote {len(rows)} skills to {output_csv}")
    print(f"Unique skill codes in Skills.csv: {len(skill_codes)}")
    print(f"Missing codes not found in CASE: {len(missing)}")
    if missing:
        preview = ", ".join(missing[:20])
        suffix = " ..." if len(missing) > 20 else ""
        print(f"Missing preview: {preview}{suffix}")


if __name__ == "__main__":
    main()