"""Rule-based CHAT transcript reviewer for clinical-readiness QA.

The reviewer flags likely formatting and ASR-segmentation issues before a
transcript is used for feature extraction. It never edits transcript text.
"""

from __future__ import annotations

import re
import tempfile
from pathlib import Path
from typing import Any

# "*XXX: utterance" main tier with a three-letter uppercase speaker code.
SPEAKER_RE = re.compile(r"^\*([A-Z]{3}):\s*(.*)$")
# Phrases that typically come from the adult examiner rather than the child.
ADULT_PROMPT_RE = re.compile(r"\b(what is|tell me|can you|do you)\b", re.I)
THAI_RE = re.compile(r"[ก-๙]")
# Confidence annotations such as "%conf: 0.82" or "ASR confidence: 0.7".
CONFIDENCE_RE = re.compile(
    r"(?:%conf(?:idence)?\s*:?\s*|(?:asr|diari[sz]ation)[^\n:]*confidence(?:\s+scores?)?\s*[:=]\s*)(0(?:\.\d+)?|1(?:\.0+)?)",
    re.I,
)
TERMINATOR_RE = re.compile(
    r"(\.|\?|!|\+\.\.\.|\+\.\.|\+\/\.|\+\/\/\.|\+\/\?)\s*$"
)
# Material that may legally follow the terminator on a main tier: media
# bullets delimited by \x15 (NAK) and postcodes of the form "[+ text]".
# It is stripped before the terminator check so that time-aligned ASR output
# is not flagged on every utterance.
TRAILING_ANNOTATION_RE = re.compile(
    r"(?:\s*(?:\x15[^\x15\n]*\x15|\[\+\s[^\]\n]*\]))+\s*$"
)

# Error codes that force a "fail" status regardless of the numeric score.
STRUCTURAL_CODES = {
    "MISSING_BEGIN",
    "MISSING_END",
    "MISSING_PARTICIPANTS",
    "MISSING_ID",
    "MISSING_CHI_TIER",
    "MALFORMED_SPEAKER_TIER",
}
# Warnings that cost 5 points instead of the default 8.
LIGHT_WARNING_PENALTY_CODES = {"LANG_TAG_MISMATCH", "LOW_ASR_CONFIDENCE"}


def _issue(
    severity: str,
    code: str,
    message: str,
    line: int | None,
    suggestion: str,
) -> dict[str, Any]:
    """Build one issue record.

    ``line`` is the 1-based transcript line the issue refers to, or ``None``
    for file-level issues.
    """
    return {
        "severity": severity,
        "code": code,
        "message": message,
        "line": line,
        "suggestion": suggestion,
    }


def _marker_counts(text: str) -> dict[str, int]:
    """Count CHAT markers of interest in *text*.

    ``zero_vocalization`` is counted per well-formed speaker tier whose whole
    utterance is ``0`` (optionally followed by ``.``, ``?`` or ``!``); every
    other marker is counted over the entire text, headers and dependent tiers
    included.
    """
    utterances = []
    for line in text.splitlines():
        match = SPEAKER_RE.match(line.strip())
        if match:
            utterances.append(match.group(2).strip())
    return {
        "xxx": len(re.findall(r"\bxxx\b", text)),
        "yyy": len(re.findall(r"\byyy\b", text)),
        "zero_vocalization": sum(
            1
            for utterance in utterances
            if re.fullmatch(r"0\s*[.?!]?", utterance)
        ),
        "laugh": len(re.findall(r"&=laugh\b", text, flags=re.I)),
        "gasp": len(re.findall(r"&=gasp\b", text, flags=re.I)),
        "repetition": len(re.findall(r"\[/\]", text)),
    }


def _languages_header(lines: list[str]) -> str:
    """Return the first ``@Languages`` line, lower-cased, or ``""`` if absent."""
    for line in lines:
        lowered = line.lower()
        if lowered.startswith("@languages"):
            return lowered
    return ""


def _confidence_values(text: str) -> list[float]:
    """Extract every confidence score matched by ``CONFIDENCE_RE``.

    The pattern only captures ``0``, ``0.<digits>``, ``1`` or ``1.0…``, so the
    float conversion cannot fail.
    """
    return [float(match.group(1)) for match in CONFIDENCE_RE.finditer(text)]


def _main_tiers(lines: list[str]) -> list[tuple[int, str]]:
    """Return ``(line_number, text)`` for every main (``*``) tier.

    Whitespace-indented lines that follow a main tier are folded into it
    (joined with a single space), so an utterance wrapped over several lines
    is reviewed as a whole. Continuation lines that follow a ``%`` dependent
    tier or an ``@`` header belong to that tier and are not folded in.
    ``line_number`` is the 1-based number of the tier's first line.
    """
    tiers: list[tuple[int, list[str]]] = []
    in_main_tier = False
    for idx, raw_line in enumerate(lines, start=1):
        stripped = raw_line.strip()
        if stripped.startswith("*"):
            tiers.append((idx, [stripped]))
            in_main_tier = True
        elif not stripped:
            # Blank lines neither extend nor terminate a tier.
            continue
        elif stripped[0] in "%@":
            in_main_tier = False
        elif raw_line[0] in " \t":
            if in_main_tier:
                tiers[-1][1].append(stripped)
        else:
            in_main_tier = False
    return [(idx, " ".join(parts)) for idx, parts in tiers]


def _run_pylangacq_parse_check(text: str) -> list[dict[str, Any]]:
    """Validate *text* with pylangacq when it is installed.

    Returns an empty list when parsing succeeds, one ``info`` issue when
    pylangacq is unavailable, or one ``error`` issue when both the default
    parse and the ``strict=False`` retry fail. Errors while writing the
    temporary file propagate to the caller; the file is removed either way.
    """
    try:
        import pylangacq
    except ImportError:
        return [
            _issue(
                "info",
                "PYLANGACQ_PARSE_SKIPPED",
                "pylangacq is not installed, so CHAT parse validation was skipped.",
                None,
                "Install pylangacq to enable parser-level transcript validation.",
            )
        ]

    # delete=False so the file can be closed and then reopened by pylangacq
    # (an open NamedTemporaryFile cannot be reopened on Windows).
    handle = tempfile.NamedTemporaryFile(
        suffix=".cha", mode="w", delete=False, encoding="utf-8"
    )
    tmp_path = Path(handle.name)
    try:
        # The write sits inside the try/finally so that a failed write
        # (encoding error, full disk) does not leave the temp file behind.
        with handle:
            handle.write(text)
        try:
            try:
                pylangacq.read_chat(str(tmp_path))
            except Exception:
                # NOTE(review): assumes the installed pylangacq accepts
                # ``strict`` on read_chat - confirm against its documentation.
                pylangacq.read_chat(str(tmp_path), strict=False)
        except Exception as exc:  # noqa: BLE001
            return [
                _issue(
                    "error",
                    "PYLANGACQ_PARSE_FAILED",
                    f"pylangacq could not parse this CHAT transcript: {exc}",
                    None,
                    "Review CHAT headers, speaker tiers, terminators, and dependent tiers before feature extraction.",
                )
            ]
    finally:
        tmp_path.unlink(missing_ok=True)
    return []


def review_cha_text(text: str) -> dict[str, Any]:
    """Review CHAT text and return score, status, summary, and issues.

    The result has the keys ``quality_score`` (0-100), ``status`` (``"pass"``,
    ``"needs_review"`` or ``"fail"``), ``summary`` and ``issues``. Each issue
    is a dict with ``severity``, ``code``, ``message``, ``line`` and
    ``suggestion``.

    Scoring starts at 100 and subtracts 20 per error, 8 per warning (5 for the
    codes in ``LIGHT_WARNING_PENALTY_CODES``) and 2 per info issue. Any
    structural error, or a score below 60, yields ``"fail"``; any warning, or
    a score below 85, yields ``"needs_review"``.
    """
    # A UTF-8 byte-order mark would otherwise stay attached to the first line
    # and make an "@Begin" on that line look absent.
    if text.startswith("\ufeff"):
        text = text[1:]

    lines = text.splitlines()
    issues: list[dict[str, Any]] = []
    utterance_count = 0
    child_utterance_count = 0
    thai_in_utterances = False

    required_headers = (
        (
            any(line.strip() == "@Begin" for line in lines),
            "MISSING_BEGIN",
            "Missing @Begin header.",
            "Add @Begin as the first CHAT header line.",
        ),
        (
            any(line.strip() == "@End" for line in lines),
            "MISSING_END",
            "Missing @End footer.",
            "Add @End after the final transcript tier.",
        ),
        (
            any(line.startswith("@Participants") for line in lines),
            "MISSING_PARTICIPANTS",
            "Missing @Participants header.",
            "Add @Participants with CHI and adult speaker roles.",
        ),
        (
            any(line.startswith("@ID") for line in lines),
            "MISSING_ID",
            "Missing @ID participant metadata.",
            "Add at least one @ID line, including the child participant.",
        ),
    )
    for present, code, message, suggestion in required_headers:
        if not present:
            issues.append(_issue("error", code, message, None, suggestion))

    for idx, tier in _main_tiers(lines):
        match = SPEAKER_RE.match(tier)
        if not match:
            issues.append(_issue(
                "error",
                "MALFORMED_SPEAKER_TIER",
                "Speaker tier does not follow CHAT-like *XXX: format.",
                idx,
                "Use a three-letter uppercase speaker code such as *CHI: or *MOT:.",
            ))
            continue

        speaker, utterance = match.groups()
        utterance = utterance.strip()
        utterance_count += 1
        if speaker == "CHI":
            child_utterance_count += 1
        if THAI_RE.search(utterance):
            thai_in_utterances = True

        if not utterance:
            issues.append(_issue(
                "warning",
                "EMPTY_UTTERANCE",
                "Speaker tier has no utterance text.",
                idx,
                "Remove the empty tier or add the reviewed utterance text.",
            ))
            continue

        # Media bullets and postcodes legitimately follow the terminator, so
        # they are removed before looking for it.
        spoken = TRAILING_ANNOTATION_RE.sub("", utterance)
        if not TERMINATOR_RE.search(spoken):
            issues.append(_issue(
                "warning",
                "MISSING_TERMINATOR",
                "Utterance does not end with a reasonable CHAT terminator.",
                idx,
                "End utterance lines with ., ?, !, +..., +.., +/., +//., or +/? as appropriate.",
            ))

        token_count = len(utterance.split())
        if token_count > 40 or len(utterance) > 250:
            issues.append(_issue(
                "warning",
                "LONG_UTTERANCE",
                "Very long utterance may indicate ASR segmentation problems.",
                idx,
                "Review segmentation and split long turns before feature extraction.",
            ))

        if speaker == "CHI" and ADULT_PROMPT_RE.search(utterance):
            issues.append(_issue(
                "warning",
                "SUSPICIOUS_CHI_PROMPT",
                "Child tier contains wording that looks like an adult prompt.",
                idx,
                "Confirm speaker assignment before using this transcript for features.",
            ))

    if child_utterance_count == 0:
        issues.append(_issue(
            "error",
            "MISSING_CHI_TIER",
            "No child speaker tier (*CHI:) was found.",
            None,
            "Add or correct child speaker tiers before feature extraction.",
        ))

    if thai_in_utterances and "tha" not in _languages_header(lines):
        issues.append(_issue(
            "warning",
            "LANG_TAG_MISMATCH",
            "Thai characters were found in speaker utterances, but @Languages does not include tha.",
            None,
            "Add 'tha' to @Languages if using Thai words.",
        ))

    confidences = _confidence_values(text)
    average_confidence = None
    if confidences:
        average_confidence = round(sum(confidences) / len(confidences), 4)
        if average_confidence < 0.6:
            issues.append(_issue(
                "warning",
                "LOW_ASR_CONFIDENCE",
                "Average ASR/diarization confidence is below 0.60.",
                None,
                "Human review is recommended before feature extraction or risk estimate interpretation.",
            ))

    issues.extend(_run_pylangacq_parse_check(text))

    score = 100
    for issue in issues:
        severity = issue["severity"]
        if severity == "error":
            score -= 20
        elif severity == "warning":
            score -= 5 if issue["code"] in LIGHT_WARNING_PENALTY_CODES else 8
        elif severity == "info":
            score -= 2
    score = max(0, min(100, score))

    has_structural_error = any(
        issue["severity"] == "error" and issue["code"] in STRUCTURAL_CODES
        for issue in issues
    )
    has_warning = any(issue["severity"] == "warning" for issue in issues)
    if has_structural_error or score < 60:
        status = "fail"
    elif has_warning or score < 85:
        status = "needs_review"
    else:
        status = "pass"

    return {
        "quality_score": score,
        "status": status,
        "summary": {
            "line_count": len(lines),
            "utterance_count": utterance_count,
            "child_utterance_count": child_utterance_count,
            "marker_counts": _marker_counts(text),
            "average_confidence": average_confidence,
        },
        "issues": issues,
    }


def review_cha_file(path: str | Path) -> dict[str, Any]:
    """Review a CHAT file from disk.

    The file is decoded as UTF-8; a leading byte-order mark, if present, is
    dropped by the ``utf-8-sig`` codec.
    """
    cha_path = Path(path)
    return review_cha_text(cha_path.read_text(encoding="utf-8-sig"))