asd-screening-tool / src /transcript_reviewer.py
Paoo
Deploy pastel clinical readiness dashboard
50cf534
"""Rule-based CHAT transcript reviewer for clinical-readiness QA.
The reviewer flags likely formatting and ASR-segmentation issues before a
transcript is used for feature extraction. It never edits transcript text.
"""
from __future__ import annotations
import re
import tempfile
from pathlib import Path
from typing import Any
# A main speaker tier: "*", three uppercase letters, ":", then the utterance.
# Group 1 is the speaker code, group 2 is the rest of the line.
SPEAKER_RE = re.compile(r"^\*([A-Z]{3}):\s*(.*)$")

# Phrases used to flag a child tier whose wording looks like an adult prompt.
ADULT_PROMPT_RE = re.compile(
    r"\b(what is|tell me|can you|do you)\b",
    re.IGNORECASE,
)

# Any character in the range from U+0E01 (ก) through U+0E59 (๙).
THAI_RE = re.compile(r"[ก-๙]")

# A confidence value between 0 and 1, introduced either by a "%conf" /
# "%confidence" prefix or by an "asr ... confidence" / "diarization ...
# confidence" label followed by ":" or "=". Group 1 is the number.
CONFIDENCE_RE = re.compile(
    r"(?:%conf(?:idence)?\s*:?\s*|(?:asr|diari[sz]ation)[^\n:]*confidence(?:\s+scores?)?\s*[:=]\s*)(0(?:\.\d+)?|1(?:\.0+)?)",
    re.IGNORECASE,
)

# Accepted utterance endings, optionally followed by trailing whitespace.
TERMINATOR_RE = re.compile(
    r"(\.|\?|!|\+\.\.\.|\+\.\.|\+\/\.|\+\/\/\.|\+\/\?)\s*$"
)

# Issue codes that force a "fail" status when reported with severity "error".
STRUCTURAL_CODES = {
    "MALFORMED_SPEAKER_TIER",
    "MISSING_BEGIN",
    "MISSING_CHI_TIER",
    "MISSING_END",
    "MISSING_ID",
    "MISSING_PARTICIPANTS",
}

# Warning codes that cost fewer score points than other warnings.
LIGHT_WARNING_PENALTY_CODES = {"LOW_ASR_CONFIDENCE", "LANG_TAG_MISMATCH"}
def _issue(
severity: str,
code: str,
message: str,
line: int | None,
suggestion: str,
) -> dict[str, Any]:
return {
"severity": severity,
"code": code,
"message": message,
"line": line,
"suggestion": suggestion,
}
def _marker_counts(text: str) -> dict[str, int]:
    """Count selected transcript markers in *text*.

    ``xxx``, ``yyy``, ``&=laugh``, ``&=gasp`` and ``[/]`` are counted over the
    whole text. ``zero_vocalization`` counts speaker tiers whose utterance is
    only ``0``, optionally followed by whitespace and one of ``.``, ``?``,
    ``!``.

    Returns:
        A dict with the keys ``xxx``, ``yyy``, ``zero_vocalization``,
        ``laugh``, ``gasp`` and ``repetition``.
    """
    # Collect the utterance part of every line that is a speaker tier.
    utterances = []
    for raw in text.splitlines():
        tier = SPEAKER_RE.match(raw.strip())
        if tier:
            utterances.append(tier.group(2).strip())

    def occurrences(pattern: str, flags: int = 0) -> int:
        return len(re.findall(pattern, text, flags=flags))

    zero_only = [
        utterance
        for utterance in utterances
        if re.fullmatch(r"0\s*[.?!]?", utterance)
    ]

    return {
        "xxx": occurrences(r"\bxxx\b"),
        "yyy": occurrences(r"\byyy\b"),
        "zero_vocalization": len(zero_only),
        "laugh": occurrences(r"&=laugh\b", re.I),
        "gasp": occurrences(r"&=gasp\b", re.I),
        "repetition": occurrences(r"\[/\]"),
    }
def _languages_header(lines: list[str]) -> str:
for line in lines:
if line.lower().startswith("@languages"):
return line.lower()
return ""
def _confidence_values(text: str) -> list[float]:
    """Extract every confidence number that ``CONFIDENCE_RE`` finds in *text*.

    Returns:
        The captured numbers as floats, in order of appearance. A capture
        that cannot be converted to ``float`` is skipped.
    """
    parsed: list[float] = []
    for hit in CONFIDENCE_RE.finditer(text):
        raw_score = hit.group(1)
        try:
            number = float(raw_score)
        except ValueError:
            # Skip a capture that is not a valid float literal.
            continue
        parsed.append(number)
    return parsed
def _run_pylangacq_parse_check(text: str) -> list[dict[str, Any]]:
    """Validate *text* with the optional ``pylangacq`` CHAT parser.

    The text is written to a temporary ``.cha`` file that is passed to
    ``pylangacq.read_chat``. If that call raises, it is retried once with
    ``strict=False``; only a failure of the retry is reported, quoting the
    retry's exception.

    Args:
        text: Full CHAT transcript text.

    Returns:
        An empty list when a parse attempt succeeds; a single ``info`` issue
        (``PYLANGACQ_PARSE_SKIPPED``) when pylangacq cannot be imported; or a
        single ``error`` issue (``PYLANGACQ_PARSE_FAILED``) when both parse
        attempts raise.

    Raises:
        Exception: Errors raised while writing the temporary file (for
            example an encoding error) propagate to the caller. The temporary
            file is removed first.
    """
    try:
        import pylangacq
    except ImportError:
        return [
            _issue(
                "info",
                "PYLANGACQ_PARSE_SKIPPED",
                "pylangacq is not installed, so CHAT parse validation was skipped.",
                None,
                "Install pylangacq to enable parser-level transcript validation.",
            )
        ]

    # delete=False keeps the file on disk after it is closed so that it can
    # be reopened by path; removal is handled by the ``finally`` below.
    tmp = tempfile.NamedTemporaryFile(
        suffix=".cha", mode="w", delete=False, encoding="utf-8"
    )
    tmp_path = Path(tmp.name)
    try:
        # The write sits inside the try/finally so that a failed write or
        # flush cannot leave the temporary file behind.
        with tmp:
            tmp.write(text)
        try:
            try:
                pylangacq.read_chat(str(tmp_path))
            except Exception:
                # Second attempt with strict=False before reporting failure.
                pylangacq.read_chat(str(tmp_path), strict=False)
        except Exception as exc:  # noqa: BLE001
            return [
                _issue(
                    "error",
                    "PYLANGACQ_PARSE_FAILED",
                    f"pylangacq could not parse this CHAT transcript: {exc}",
                    None,
                    "Review CHAT headers, speaker tiers, terminators, and dependent tiers before feature extraction.",
                )
            ]
    finally:
        tmp_path.unlink(missing_ok=True)
    return []
def _missing_header_issues(lines: list[str]) -> list[dict[str, Any]]:
    """Return one error issue for each required CHAT header absent from *lines*."""
    found: list[dict[str, Any]] = []
    if not any(line.strip() == "@Begin" for line in lines):
        found.append(_issue(
            "error", "MISSING_BEGIN", "Missing @Begin header.", None,
            "Add @Begin as the first CHAT header line.",
        ))
    if not any(line.strip() == "@End" for line in lines):
        found.append(_issue(
            "error", "MISSING_END", "Missing @End footer.", None,
            "Add @End after the final transcript tier.",
        ))
    if not any(line.startswith("@Participants") for line in lines):
        found.append(_issue(
            "error", "MISSING_PARTICIPANTS", "Missing @Participants header.",
            None, "Add @Participants with CHI and adult speaker roles.",
        ))
    if not any(line.startswith("@ID") for line in lines):
        found.append(_issue(
            "error", "MISSING_ID", "Missing @ID participant metadata.",
            None, "Add at least one @ID line, including the child participant.",
        ))
    return found


def _utterance_warnings(
    line_no: int, speaker: str, utterance: str
) -> list[dict[str, Any]]:
    """Return the warnings for one well-formed speaker tier (text already stripped)."""
    if not utterance:
        # An empty tier gets this single warning and no further checks.
        return [_issue(
            "warning",
            "EMPTY_UTTERANCE",
            "Speaker tier has no utterance text.",
            line_no,
            "Remove the empty tier or add the reviewed utterance text.",
        )]

    warnings: list[dict[str, Any]] = []
    if not TERMINATOR_RE.search(utterance):
        warnings.append(_issue(
            "warning",
            "MISSING_TERMINATOR",
            "Utterance does not end with a reasonable CHAT terminator.",
            line_no,
            "End utterance lines with ., ?, !, +..., +.., +/., +//., or +/? as appropriate.",
        ))
    token_count = len(re.findall(r"\S+", utterance))
    if token_count > 40 or len(utterance) > 250:
        warnings.append(_issue(
            "warning",
            "LONG_UTTERANCE",
            "Very long utterance may indicate ASR segmentation problems.",
            line_no,
            "Review segmentation and split long turns before feature extraction.",
        ))
    if speaker == "CHI" and ADULT_PROMPT_RE.search(utterance):
        warnings.append(_issue(
            "warning",
            "SUSPICIOUS_CHI_PROMPT",
            "Child tier contains wording that looks like an adult prompt.",
            line_no,
            "Confirm speaker assignment before using this transcript for features.",
        ))
    return warnings


def _scan_speaker_tiers(
    lines: list[str],
) -> tuple[list[dict[str, Any]], int, int, bool]:
    """Check every line that starts with ``*`` and gather tier statistics.

    Returns:
        ``(issues, utterance_count, child_utterance_count, thai_in_utterances)``.
        Malformed tiers are reported but not counted as utterances; empty
        tiers are counted.
    """
    issues: list[dict[str, Any]] = []
    utterance_count = 0
    child_utterance_count = 0
    thai_in_utterances = False
    for idx, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line.startswith("*"):
            continue
        match = SPEAKER_RE.match(line)
        if not match:
            issues.append(_issue(
                "error",
                "MALFORMED_SPEAKER_TIER",
                "Speaker tier does not follow CHAT-like *XXX: format.",
                idx,
                "Use a three-letter uppercase speaker code such as *CHI: or *MOT:.",
            ))
            continue
        speaker, utterance = match.groups()
        utterance = utterance.strip()
        utterance_count += 1
        if speaker == "CHI":
            child_utterance_count += 1
        if THAI_RE.search(utterance):
            thai_in_utterances = True
        issues.extend(_utterance_warnings(idx, speaker, utterance))
    return issues, utterance_count, child_utterance_count, thai_in_utterances


def _quality_score(issues: list[dict[str, Any]]) -> int:
    """Start at 100, subtract a penalty per issue, and clamp to 0..100."""
    score = 100
    for issue in issues:
        severity = issue["severity"]
        if severity == "error":
            score -= 20
        elif severity == "warning":
            score -= 5 if issue["code"] in LIGHT_WARNING_PENALTY_CODES else 8
        elif severity == "info":
            score -= 2
    return max(0, min(100, score))


def _review_status(issues: list[dict[str, Any]], score: int) -> str:
    """Map the issues and score to ``"fail"``, ``"needs_review"`` or ``"pass"``."""
    has_structural_error = any(
        issue["severity"] == "error" and issue["code"] in STRUCTURAL_CODES
        for issue in issues
    )
    has_warning = any(issue["severity"] == "warning" for issue in issues)
    if has_structural_error or score < 60:
        return "fail"
    if has_warning or score < 85:
        return "needs_review"
    return "pass"


def review_cha_text(text: str) -> dict[str, Any]:
    """Review CHAT text and return score, status, summary, and issues.

    Issues are collected in this order: missing headers, per-line speaker
    tier findings, missing child tier, language-tag mismatch, low average
    confidence, and finally the pylangacq parse check.

    Args:
        text: Full CHAT transcript text.

    Returns:
        A dict with ``quality_score`` (int, 0-100), ``status`` (``"fail"``,
        ``"needs_review"`` or ``"pass"``), ``summary`` (line, utterance and
        child-utterance counts, marker counts, and the average confidence or
        ``None`` when no confidence values were found) and ``issues`` (list
        of issue dicts).
    """
    lines = text.splitlines()
    issues: list[dict[str, Any]] = _missing_header_issues(lines)

    (
        tier_issues,
        utterance_count,
        child_utterance_count,
        thai_in_utterances,
    ) = _scan_speaker_tiers(lines)
    issues.extend(tier_issues)

    if child_utterance_count == 0:
        issues.append(_issue(
            "error",
            "MISSING_CHI_TIER",
            "No child speaker tier (*CHI:) was found.",
            None,
            "Add or correct child speaker tiers before feature extraction.",
        ))
    if thai_in_utterances and "tha" not in _languages_header(lines):
        issues.append(_issue(
            "warning",
            "LANG_TAG_MISMATCH",
            "Thai characters were found in speaker utterances, but @Languages does not include tha.",
            None,
            "Add 'tha' to @Languages if using Thai words.",
        ))

    confidences = _confidence_values(text)
    average_confidence = None
    if confidences:
        # The threshold is applied to the rounded value that is reported.
        average_confidence = round(sum(confidences) / len(confidences), 4)
        if average_confidence < 0.6:
            issues.append(_issue(
                "warning",
                "LOW_ASR_CONFIDENCE",
                "Average ASR/diarization confidence is below 0.60.",
                None,
                "Human review is recommended before feature extraction or risk estimate interpretation.",
            ))

    issues.extend(_run_pylangacq_parse_check(text))

    score = _quality_score(issues)
    return {
        "quality_score": score,
        "status": _review_status(issues, score),
        "summary": {
            "line_count": len(lines),
            "utterance_count": utterance_count,
            "child_utterance_count": child_utterance_count,
            "marker_counts": _marker_counts(text),
            "average_confidence": average_confidence,
        },
        "issues": issues,
    }
def review_cha_file(path: str | Path) -> dict[str, Any]:
    """Review a CHAT file from disk.

    Args:
        path: Location of the ``.cha`` file.

    Returns:
        The result of ``review_cha_text`` for the file's contents.
    """
    cha_path = Path(path)
    # "utf-8-sig" decodes UTF-8 with or without a byte-order mark. With plain
    # "utf-8" a BOM stays in the text as "\ufeff" on the first line, and
    # str.strip() does not remove it, so a leading header line such as
    # "@Begin" would no longer compare equal and be reported as missing.
    return review_cha_text(cha_path.read_text(encoding="utf-8-sig"))