#!/usr/bin/env python3
"""Build the skill list CSV from Skills.csv and CASE Common Core standards JSON.

The output file defaults to ../Data/Skill_Set.csv (override with --output-csv).

Output columns:
- index (1-based)
- skill_code
- full_description

The output contains only skill codes that appear in both:
1) Skills.csv (column: node_code)
2) CASE JSON CFItems (field: humanCodingScheme)

Rows are sorted by skill_code.
"""

from __future__ import annotations

import argparse
import csv
import html
import json
import re
from pathlib import Path
from typing import Dict, Set

# Matches markup only: an opening or closing tag whose name starts with a
# letter (``<p>``, ``</em>``, ``<br/>``) or a ``<!...>`` comment/declaration.
# A ``<`` followed by a space or digit is deliberately NOT treated as a tag, so
# inequalities in math standards ("x < c or x > c") are not deleted as if the
# text between the two symbols were one big tag.
_TAG_RE = re.compile(r"<(?:/?[A-Za-z]|!)[^<>]*>")
# One or more consecutive whitespace characters.
_WS_RE = re.compile(r"\s+")
# A "." that directly follows a digit and directly precedes a single ASCII
# letter ending the string, e.g. the last dot in "3.MD.C.7.a".
_DOT_BEFORE_SUFFIX_RE = re.compile(r"(?<=\d)\.(?=[A-Za-z]$)")


def _clean_text(text: str) -> str:
    """Return *text* as plain, single-spaced text.

    Markup tags are removed first and HTML entities are decoded afterwards.
    Decoding first would turn escaped literals such as ``&lt;`` and ``&gt;``
    into real angle brackets, and the tag pattern would then delete the text
    between them (``a&lt;b and c&gt;d`` would come out as ``ad``).

    Args:
        text: Raw description; ``None`` or an empty string is treated as "".

    Returns:
        The description without tags, with entities decoded, every run of
        whitespace collapsed to a single space, and both ends stripped.
    """
    without_tags = _TAG_RE.sub("", text or "")
    # Entities are decoded only after tag removal so escaped "<" / ">" survive.
    decoded = html.unescape(without_tags)
    return _WS_RE.sub(" ", decoded).strip()


def _normalize_skill_code(code: str) -> str:
    """Return the canonical spelling of a skill code.

    Surrounding whitespace is removed, and a dot that sits between a digit
    and a single trailing letter is dropped, so both spellings of a lettered
    sub-item compare equal. A falsy input yields an empty string.

    Example:
    - 3.MD.C.7a  -> 3.MD.C.7a
    - 3.MD.C.7.a -> 3.MD.C.7a
    """
    trimmed = code.strip() if code else ""
    return _DOT_BEFORE_SUFFIX_RE.sub("", trimmed)


def load_skill_codes(skills_csv_path: Path) -> Set[str]:
    """Read the unique, non-empty skill codes from the ``node_code`` column.

    Args:
        skills_csv_path: CSV file whose header row contains ``node_code``.

    Returns:
        The set of codes with surrounding whitespace removed; blank cells
        are skipped.

    Raises:
        ValueError: If the header row has no ``node_code`` column.
    """
    codes: Set[str] = set()
    # "utf-8-sig" reads plain UTF-8 as before but also drops a leading
    # byte-order mark. With plain "utf-8" the BOM stays glued to the first
    # header name ("\ufeffnode_code"), which fails the column check below.
    with skills_csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if "node_code" not in (reader.fieldnames or []):
            raise ValueError(
                f"Missing required column 'node_code' in {skills_csv_path}"
            )
        for row in reader:
            # A short row yields None for the missing cell; treat it as blank.
            code = (row.get("node_code") or "").strip()
            if code:
                codes.add(code)
    return codes


def load_case_mapping(case_json_path: Path) -> Dict[str, str]:
    """Build a lookup of normalized skill code -> cleaned full statement.

    Only entries of the JSON ``CFItems`` list whose ``CFItemType`` is
    ``Standard`` or ``Component`` are considered (lettered sub-items such as
    3.MD.C.7.a are often typed as Component). Entries without a code or
    without a non-empty statement are skipped; when several entries share a
    normalized code, the first one with a statement is kept.
    """
    with case_json_path.open("r", encoding="utf-8") as handle:
        document = json.load(handle)

    accepted_types = {"Standard", "Component"}
    statements: Dict[str, str] = {}
    for entry in document.get("CFItems", []):
        skill_code = _normalize_skill_code(
            (entry.get("humanCodingScheme") or "").strip()
        )
        if skill_code and entry.get("CFItemType") in accepted_types:
            description = _clean_text(entry.get("fullStatement") or "")
            if description:
                # setdefault leaves an earlier definition of the code intact.
                statements.setdefault(skill_code, description)

    return statements


def write_output(output_csv_path: Path, rows: list[tuple[str, str]]) -> None:
    """Write the header and one numbered line per (code, description) pair.

    Missing parent directories of *output_csv_path* are created. Numbering
    starts at 1 and follows the order of *rows*.
    """
    header = ["index", "skill_code", "full_description"]
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)
    with output_csv_path.open("w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(
            [position, skill_code, description]
            for position, (skill_code, description) in enumerate(rows, start=1)
        )


def main() -> None:
    """Parse CLI options, match skill codes against CASE, and write the CSV.

    Skill codes whose normalized form has a CASE statement are written,
    sorted by skill code, to the output CSV. A summary is printed afterwards,
    including a preview of up to 20 codes that had no CASE statement.
    """
    parser = argparse.ArgumentParser(
        # Names the file actually written by default (see --output-csv).
        description="Create Skill_Set.csv from Skills.csv and CASE standards JSON."
    )
    parser.add_argument(
        "--skills-csv",
        type=Path,
        default=Path("../Data/Skills.csv"),
        help="Path to Skills.csv",
    )
    parser.add_argument(
        "--case-json",
        type=Path,
        default=Path("../Data/CASE-Common Core State Standards for Math.json"),
        help="Path to CASE JSON",
    )
    parser.add_argument(
        "--output-csv",
        type=Path,
        default=Path("../Data/Skill_Set.csv"),
        help="Output path for Skill_Set.csv",
    )

    args = parser.parse_args()

    # Relative paths, including the defaults, resolve against the current
    # working directory.
    skills_csv = args.skills_csv.resolve()
    case_json = args.case_json.resolve()
    output_csv = args.output_csv.resolve()

    skill_codes = load_skill_codes(skills_csv)
    case_map = load_case_mapping(case_json)

    # Split the codes into matched rows and missing codes in one sorted pass,
    # normalizing each code once. The original spelling from Skills.csv is
    # what gets written and reported.
    rows: list[tuple[str, str]] = []
    missing: list[str] = []
    for code in sorted(skill_codes):
        description = case_map.get(_normalize_skill_code(code))
        if description is None:
            missing.append(code)
        else:
            rows.append((code, description))

    write_output(output_csv, rows)

    print(f"Wrote {len(rows)} skills to {output_csv}")
    print(f"Unique skill codes in Skills.csv: {len(skill_codes)}")
    print(f"Missing codes not found in CASE: {len(missing)}")
    if missing:
        preview = ", ".join(missing[:20])
        suffix = " ..." if len(missing) > 20 else ""
        print(f"Missing preview: {preview}{suffix}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()