FoundationalASSIST / Code /build_skill_set.py

Upload folder using huggingface_hub

6256eb9 verified 13 days ago

4.83 kB

	#!/usr/bin/env python3
	"""Build Skill_Set.csv from Skills.csv and CASE Common Core standards JSON.

	Output columns:
	- index (1-based)
	- skill_code
	- full_description

	The output contains only skill codes that appear in both:
	1) Skills.csv (column: node_code)
	2) CASE JSON CFItems (field: humanCodingScheme)

	Rows are sorted by skill_code.
	"""

	from __future__ import annotations

	import argparse
	import csv
	import html
	import json
	import re
	from pathlib import Path
	from typing import Dict, Set

	# Markup only: a tag must open with a letter (optionally preceded by "/"),
	# or with "<!" for comments/doctype declarations. Requiring this keeps math
	# inequalities such as "x < c ... x > c" from being mistaken for one long
	# tag and deleted, which a bare "<[^>]+>" pattern would do.
	_TAG_RE = re.compile(r"<(?:/?[A-Za-z]|!)[^>]*>")
	# Any run of one or more whitespace characters.
	_WS_RE = re.compile(r"\s+")
	# A "." sitting between a digit and a single letter that ends the string,
	# e.g. the last dot in "3.MD.C.7.a".
	_DOT_BEFORE_SUFFIX_RE = re.compile(r"(?<=\d)\.(?=[A-Za-z]$)")


	def _clean_text(text: str) -> str:
	    """Return *text* as plain, single-spaced prose.

	    HTML entities are decoded first, then everything matching ``_TAG_RE`` is
	    removed, and finally each whitespace run is collapsed to one space with
	    both ends trimmed. A falsy input (``None`` or ``""``) yields ``""``.
	    """
	    decoded = html.unescape(text or "")
	    without_tags = _TAG_RE.sub("", decoded)
	    return _WS_RE.sub(" ", without_tags).strip()


	def _normalize_skill_code(code: str) -> str:
	    """Collapse equivalent spellings of a skill code into one form.

	    Surrounding whitespace is trimmed, and a dot separating a digit from a
	    single trailing letter is dropped. A falsy input yields ``""``.

	    Example:
	    - 3.MD.C.7a  -> 3.MD.C.7a  (already canonical)
	    - 3.MD.C.7.a -> 3.MD.C.7a
	    """
	    trimmed = code.strip() if code else ""
	    return _DOT_BEFORE_SUFFIX_RE.sub("", trimmed)


	def load_skill_codes(skills_csv_path: Path) -> Set[str]:
	    """Collect the distinct, non-blank values of the ``node_code`` column.

	    The file is decoded as ``utf-8-sig`` so that a UTF-8 byte-order mark is
	    discarded instead of being glued onto the first header name, where it
	    would make a leading ``node_code`` column look absent. Files without a
	    BOM are read exactly as with plain ``utf-8``.

	    Args:
	        skills_csv_path: Location of the CSV file to read.

	    Returns:
	        The set of whitespace-trimmed ``node_code`` values. Blank cells are
	        skipped and duplicates collapse to one entry.

	    Raises:
	        ValueError: If the header row has no ``node_code`` column (this also
	            covers a completely empty file, which has no header at all).
	    """
	    with skills_csv_path.open("r", encoding="utf-8-sig", newline="") as f:
	        reader = csv.DictReader(f)
	        if "node_code" not in (reader.fieldnames or []):
	            raise ValueError(
	                f"Missing required column 'node_code' in {skills_csv_path}"
	            )
	        # Rows shorter than the header give None for the missing cells,
	        # hence the `or ""` before stripping.
	        stripped = ((row.get("node_code") or "").strip() for row in reader)
	        return {code for code in stripped if code}


	def load_case_mapping(case_json_path: Path) -> Dict[str, str]:
	    """Build a lookup of normalized skill code -> cleaned standard statement.

	    Reads the ``CFItems`` list of the CASE JSON document. An item contributes
	    an entry only when its ``humanCodingScheme`` normalizes to a non-empty
	    code, its ``CFItemType`` is ``Standard`` or ``Component``, and its
	    ``fullStatement`` is non-empty after cleaning. When several items share a
	    code, the first one with a usable statement is kept.
	    """
	    with case_json_path.open("r", encoding="utf-8") as f:
	        payload = json.load(f)

	    # Both kinds are accepted: in CASE, codes like 3.MD.C.7.a are often
	    # CFItemType=Component rather than Standard.
	    accepted_types = {"Standard", "Component"}

	    mapping: Dict[str, str] = {}
	    for item in payload.get("CFItems", []):
	        code = _normalize_skill_code((item.get("humanCodingScheme") or "").strip())
	        if code and item.get("CFItemType") in accepted_types:
	            statement = _clean_text(item.get("fullStatement") or "")
	            if statement:
	                # setdefault leaves an earlier definition untouched.
	                mapping.setdefault(code, statement)

	    return mapping


	def write_output(output_csv_path: Path, rows: list[tuple[str, str]]) -> None:
	    """Write *rows* to a CSV file, numbering them from 1.

	    Missing parent directories are created. The file starts with the header
	    ``index, skill_code, full_description`` followed by one line per
	    ``(skill_code, full_description)`` pair in the order given.
	    """
	    output_csv_path.parent.mkdir(parents=True, exist_ok=True)
	    header = ["index", "skill_code", "full_description"]
	    with output_csv_path.open("w", encoding="utf-8", newline="") as f:
	        out = csv.writer(f)
	        out.writerow(header)
	        out.writerows(
	            (position, code, desc)
	            for position, (code, desc) in enumerate(rows, start=1)
	        )


	def main() -> None:
	    """Command-line entry point: build the skill-set CSV and print a summary.

	    Parses ``--skills-csv``, ``--case-json`` and ``--output-csv`` (the
	    defaults are relative paths, so they resolve against the current working
	    directory), loads the skill codes and the CASE code -> statement mapping,
	    and writes one row per skill code whose normalized form is present in
	    the mapping. Rows are ordered by the skill code as spelled in Skills.csv.
	    Afterwards it prints how many rows were written, how many unique codes
	    were read, how many had no CASE match, and a preview of up to 20
	    unmatched codes.
	    """
	    parser = argparse.ArgumentParser(
	        # Names the file this script actually produces (see --output-csv).
	        description="Create Skill_Set.csv from Skills.csv and CASE standards JSON."
	    )
	    parser.add_argument(
	        "--skills-csv",
	        type=Path,
	        default=Path("../Data/Skills.csv"),
	        help="Path to Skills.csv",
	    )
	    parser.add_argument(
	        "--case-json",
	        type=Path,
	        default=Path("../Data/CASE-Common Core State Standards for Math.json"),
	        help="Path to CASE JSON",
	    )
	    parser.add_argument(
	        "--output-csv",
	        type=Path,
	        default=Path("../Data/Skill_Set.csv"),
	        help="Output path for Skill_Set.csv",
	    )

	    args = parser.parse_args()

	    skills_csv = args.skills_csv.resolve()
	    case_json = args.case_json.resolve()
	    output_csv = args.output_csv.resolve()

	    skill_codes = load_skill_codes(skills_csv)
	    case_map = load_case_mapping(case_json)

	    # Single pass over the sorted codes: each code is normalized once and
	    # lands in exactly one of the two lists, both of which stay sorted.
	    rows: list[tuple[str, str]] = []
	    missing: list[str] = []
	    for code in sorted(skill_codes):
	        description = case_map.get(_normalize_skill_code(code))
	        if description is None:
	            missing.append(code)
	        else:
	            rows.append((code, description))

	    write_output(output_csv, rows)

	    print(f"Wrote {len(rows)} skills to {output_csv}")
	    print(f"Unique skill codes in Skills.csv: {len(skill_codes)}")
	    print(f"Missing codes not found in CASE: {len(missing)}")
	    if missing:
	        preview = ", ".join(missing[:20])
	        suffix = " ..." if len(missing) > 20 else ""
	        print(f"Missing preview: {preview}{suffix}")


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()