"""Rule-based CHAT transcript reviewer for clinical-readiness QA.

The reviewer flags likely formatting and ASR-segmentation issues before a
transcript is used for feature extraction. It never edits transcript text.
"""

from __future__ import annotations

import re
import tempfile
from pathlib import Path
from typing import Any

# Main speaker tier: "*" + three uppercase letters + ":"; group 1 is the
# speaker code, group 2 is the rest of the line (the utterance text).
SPEAKER_RE = re.compile(r"^\*([A-Z]{3}):\s*(.*)$")
# Case-insensitive phrases used to flag child tiers that read like an adult
# prompt.
ADULT_PROMPT_RE = re.compile(r"\b(what is|tell me|can you|do you)\b", re.I)
# Any single character in the range ก-๙ (U+0E01 through U+0E59).
THAI_RE = re.compile(r"[ก-๙]")
# Confidence annotation, case-insensitive. Accepts either a "%conf" /
# "%confidence" prefix (optional colon) or a label starting with "asr",
# "diarization" or "diarisation" that contains "confidence" (optionally
# followed by "score"/"scores") and then ":" or "=". Group 1 captures the
# numeric value, restricted by the pattern to 0, 0.<digits>, 1 or 1.0…
CONFIDENCE_RE = re.compile(
    r"(?:%conf(?:idence)?\s*:?\s*|(?:asr|diari[sz]ation)[^\n:]*confidence(?:\s+scores?)?\s*[:=]\s*)(0(?:\.\d+)?|1(?:\.0+)?)",
    re.I,
)
# Accepted utterance endings, allowing trailing whitespace only:
# ".", "?", "!", "+...", "+..", "+/.", "+//." and "+/?".
TERMINATOR_RE = re.compile(
    r"(\.|\?|!|\+\.\.\.|\+\.\.|\+\/\.|\+\/\/\.|\+\/\?)\s*$"
)

# Issue codes that force a "fail" status in review_cha_text when reported
# with severity "error", regardless of the numeric score.
STRUCTURAL_CODES = {
    "MISSING_BEGIN",
    "MISSING_END",
    "MISSING_PARTICIPANTS",
    "MISSING_ID",
    "MISSING_CHI_TIER",
    "MALFORMED_SPEAKER_TIER",
}

# Warning codes that cost 5 score points in review_cha_text instead of the
# default 8 for other warnings.
LIGHT_WARNING_PENALTY_CODES = {"LANG_TAG_MISMATCH", "LOW_ASR_CONFIDENCE"}


def _issue(
    severity: str,
    code: str,
    message: str,
    line: int | None,
    suggestion: str,
) -> dict[str, Any]:
    return {
        "severity": severity,
        "code": code,
        "message": message,
        "line": line,
        "suggestion": suggestion,
    }


def _marker_counts(text: str) -> dict[str, int]:
    speaker_lines = [
        match.group(2).strip()
        for line in text.splitlines()
        if (match := SPEAKER_RE.match(line.strip()))
    ]
    return {
        "xxx": len(re.findall(r"\bxxx\b", text)),
        "yyy": len(re.findall(r"\byyy\b", text)),
        "zero_vocalization": sum(
            1 for utterance in speaker_lines
            if re.fullmatch(r"0\s*[.?!]?", utterance)
        ),
        "laugh": len(re.findall(r"&=laugh\b", text, flags=re.I)),
        "gasp": len(re.findall(r"&=gasp\b", text, flags=re.I)),
        "repetition": len(re.findall(r"\[/\]", text)),
    }


def _languages_header(lines: list[str]) -> str:
    for line in lines:
        if line.lower().startswith("@languages"):
            return line.lower()
    return ""


def _confidence_values(text: str) -> list[float]:
    """Extract every confidence value annotated in the transcript text.

    Args:
        text: Full transcript text.

    Returns:
        The values captured by CONFIDENCE_RE, converted to float, in order
        of appearance. A capture that cannot be converted is skipped.
    """
    parsed: list[float] = []
    for hit in CONFIDENCE_RE.finditer(text):
        raw = hit.group(1)
        try:
            number = float(raw)
        except ValueError:
            continue
        parsed.append(number)
    return parsed


def _run_pylangacq_parse_check(text: str) -> list[dict[str, Any]]:
    """Check whether pylangacq can parse the transcript.

    The text is written to a temporary ``.cha`` file which is passed to
    ``pylangacq.read_chat``. If that call raises, it is retried once with
    ``strict=False``. The temporary file is always removed afterwards,
    including when writing it fails.

    Args:
        text: Full CHAT transcript text.

    Returns:
        An empty list when one of the parse attempts succeeds; a single
        "info" issue (``PYLANGACQ_PARSE_SKIPPED``) when pylangacq cannot be
        imported; a single "error" issue (``PYLANGACQ_PARSE_FAILED``)
        carrying the exception text when both attempts raise.

    Raises:
        Any exception raised while creating or writing the temporary file
        propagates to the caller unchanged.
    """
    try:
        import pylangacq
    except ImportError:
        return [
            _issue(
                "info",
                "PYLANGACQ_PARSE_SKIPPED",
                "pylangacq is not installed, so CHAT parse validation was skipped.",
                None,
                "Install pylangacq to enable parser-level transcript validation.",
            )
        ]

    # delete=False so the file can be closed and then reopened by path;
    # removal is handled by the finally clause below.
    tmp = tempfile.NamedTemporaryFile(
        suffix=".cha", mode="w", delete=False, encoding="utf-8"
    )
    tmp_path = Path(tmp.name)

    try:
        # The write sits inside the try/finally so a failed write (e.g. text
        # that cannot be encoded as UTF-8) no longer leaves the file behind.
        # It is outside the parse-error handler so write errors still
        # propagate rather than being reported as a parse failure.
        with tmp:
            tmp.write(text)

        try:
            try:
                pylangacq.read_chat(str(tmp_path))
            except Exception:
                # NOTE(review): assumes the installed pylangacq accepts a
                # ``strict`` keyword; if it does not, the resulting error is
                # what gets reported below — confirm against the pinned
                # version.
                pylangacq.read_chat(str(tmp_path), strict=False)
        except Exception as exc:  # noqa: BLE001
            return [
                _issue(
                    "error",
                    "PYLANGACQ_PARSE_FAILED",
                    f"pylangacq could not parse this CHAT transcript: {exc}",
                    None,
                    "Review CHAT headers, speaker tiers, terminators, and dependent tiers before feature extraction.",
                )
            ]
    finally:
        tmp_path.unlink(missing_ok=True)

    return []


def review_cha_text(text: str) -> dict[str, Any]:
    """Run all rule-based checks over CHAT text and grade the result.

    Checks run in this order: required headers, per-line speaker tiers,
    presence of a child tier, Thai text versus the @Languages header,
    average annotated confidence, and the optional pylangacq parse check.

    Args:
        text: Full CHAT transcript text.

    Returns:
        A dict with:
          * "quality_score": int in 0..100 (100 minus 20 per error, 8 per
            warning or 5 for the light-penalty warning codes, 2 per info).
          * "status": "fail" on any structural error or a score below 60;
            otherwise "needs_review" on any warning or a score below 85;
            otherwise "pass".
          * "summary": line/utterance counts, marker counts and the average
            confidence (None when no confidence values were found).
          * "issues": every finding, in the order it was detected.
    """
    lines = text.splitlines()
    stripped = [entry.strip() for entry in lines]
    found: list[dict[str, Any]] = []

    # --- Required headers -------------------------------------------------
    # @Begin/@End are matched against whitespace-stripped lines; the other
    # two are prefix-matched against the raw lines.
    header_checks = (
        (
            "@Begin" in stripped,
            "MISSING_BEGIN",
            "Missing @Begin header.",
            "Add @Begin as the first CHAT header line.",
        ),
        (
            "@End" in stripped,
            "MISSING_END",
            "Missing @End footer.",
            "Add @End after the final transcript tier.",
        ),
        (
            any(entry.startswith("@Participants") for entry in lines),
            "MISSING_PARTICIPANTS",
            "Missing @Participants header.",
            "Add @Participants with CHI and adult speaker roles.",
        ),
        (
            any(entry.startswith("@ID") for entry in lines),
            "MISSING_ID",
            "Missing @ID participant metadata.",
            "Add at least one @ID line, including the child participant.",
        ),
    )
    for present, code, message, suggestion in header_checks:
        if not present:
            found.append(_issue("error", code, message, None, suggestion))

    # --- Speaker tiers ----------------------------------------------------
    total_tiers = 0
    child_tiers = 0
    saw_thai = False

    for number, tier in enumerate(stripped, start=1):
        if not tier.startswith("*"):
            continue

        parsed = SPEAKER_RE.match(tier)
        if parsed is None:
            found.append(_issue(
                "error",
                "MALFORMED_SPEAKER_TIER",
                "Speaker tier does not follow CHAT-like *XXX: format.",
                number,
                "Use a three-letter uppercase speaker code such as *CHI: or *MOT:.",
            ))
            continue

        speaker = parsed.group(1)
        utterance = parsed.group(2).strip()
        is_child = speaker == "CHI"

        total_tiers += 1
        if is_child:
            child_tiers += 1
        saw_thai = saw_thai or THAI_RE.search(utterance) is not None

        if utterance == "":
            # An empty tier gets its own warning and skips the content checks.
            found.append(_issue(
                "warning",
                "EMPTY_UTTERANCE",
                "Speaker tier has no utterance text.",
                number,
                "Remove the empty tier or add the reviewed utterance text.",
            ))
            continue

        word_total = len(re.findall(r"\S+", utterance))
        content_checks = (
            (
                TERMINATOR_RE.search(utterance) is None,
                "MISSING_TERMINATOR",
                "Utterance does not end with a reasonable CHAT terminator.",
                "End utterance lines with ., ?, !, +..., +.., +/., +//., or +/? as appropriate.",
            ),
            (
                word_total > 40 or len(utterance) > 250,
                "LONG_UTTERANCE",
                "Very long utterance may indicate ASR segmentation problems.",
                "Review segmentation and split long turns before feature extraction.",
            ),
            (
                is_child and ADULT_PROMPT_RE.search(utterance) is not None,
                "SUSPICIOUS_CHI_PROMPT",
                "Child tier contains wording that looks like an adult prompt.",
                "Confirm speaker assignment before using this transcript for features.",
            ),
        )
        for triggered, code, message, suggestion in content_checks:
            if triggered:
                found.append(_issue("warning", code, message, number, suggestion))

    # --- Whole-transcript checks -------------------------------------------
    if not child_tiers:
        found.append(_issue(
            "error",
            "MISSING_CHI_TIER",
            "No child speaker tier (*CHI:) was found.",
            None,
            "Add or correct child speaker tiers before feature extraction.",
        ))

    if saw_thai and "tha" not in _languages_header(lines):
        found.append(_issue(
            "warning",
            "LANG_TAG_MISMATCH",
            "Thai characters were found in speaker utterances, but @Languages does not include tha.",
            None,
            "Add 'tha' to @Languages if using Thai words.",
        ))

    scores = _confidence_values(text)
    mean_confidence = round(sum(scores) / len(scores), 4) if scores else None
    if mean_confidence is not None and mean_confidence < 0.6:
        found.append(_issue(
            "warning",
            "LOW_ASR_CONFIDENCE",
            "Average ASR/diarization confidence is below 0.60.",
            None,
            "Human review is recommended before feature extraction or risk estimate interpretation.",
        ))

    found.extend(_run_pylangacq_parse_check(text))

    # --- Scoring ------------------------------------------------------------
    def penalty(item: dict[str, Any]) -> int:
        # Points deducted for a single finding.
        severity = item["severity"]
        if severity == "error":
            return 20
        if severity == "warning":
            return 5 if item["code"] in LIGHT_WARNING_PENALTY_CODES else 8
        if severity == "info":
            return 2
        return 0

    quality = 100 - sum(penalty(item) for item in found)
    quality = min(100, max(0, quality))

    structural_failure = any(
        item["code"] in STRUCTURAL_CODES and item["severity"] == "error"
        for item in found
    )
    any_warning = any(item["severity"] == "warning" for item in found)

    if structural_failure or quality < 60:
        verdict = "fail"
    elif any_warning or quality < 85:
        verdict = "needs_review"
    else:
        verdict = "pass"

    summary = {
        "line_count": len(lines),
        "utterance_count": total_tiers,
        "child_utterance_count": child_tiers,
        "marker_counts": _marker_counts(text),
        "average_confidence": mean_confidence,
    }
    return {
        "quality_score": quality,
        "status": verdict,
        "summary": summary,
        "issues": found,
    }


def review_cha_file(path: str | Path) -> dict[str, Any]:
    """Review a CHAT file from disk.

    The file is decoded as UTF-8. A leading byte-order mark (U+FEFF), if
    present, is dropped during decoding: left in place it would be glued to
    the first line, and since ``str.strip()`` does not remove it, a first
    line of ``@Begin`` would fail the reviewer's exact-match header check.
    Files without a byte-order mark decode exactly as before.

    Args:
        path: Location of the ``.cha`` file.

    Returns:
        The review result produced by ``review_cha_text`` for the file's
        contents.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    cha_path = Path(path)
    # "utf-8-sig" behaves like "utf-8" but skips a leading BOM when decoding.
    return review_cha_text(cha_path.read_text(encoding="utf-8-sig"))