Spaces:
Running
Running
| """Deterministic dictation grading: align a blind transcription against a | |
| known reference and emit a strict JSON report. | |
| See specs/ocr.md. The VLM transcription step lives elsewhere; this module | |
| never sees the image and is fully testable with a fixed transcription string. | |
| """ | |
| import re | |
| import unicodedata | |
| from difflib import SequenceMatcher | |
| from typing import Literal, NamedTuple | |
| from pydantic import BaseModel | |
# Closed set of verdicts a graded word can carry in the report.
Status = Literal["correct", "misspelled", "missing", "extra"]

# A word is a maximal run of word characters (Unicode letters/digits/_);
# any other non-space character is a standalone punctuation token.
# Whitespace matches neither alternative, so it only separates tokens.
_TOKEN_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)
class Token(NamedTuple):
    """One lexical unit produced by ``tokenize``: a word or a punctuation mark."""

    # Surface form exactly as matched in the NFC-normalized input.
    text: str
    # True when the token starts with a word character, False for punctuation.
    is_word: bool
class GradeOptions(NamedTuple):
    """Pedagogy knobs for grading (see specs/ocr.md §8). Defaults match the
    spec: case matters, punctuation is reported but not counted, diacritics
    are graded."""

    # When False, comparison keys ignore letter case.
    case_sensitive: bool = True
    # When True, punctuation tokens are aligned and graded alongside words.
    # NOTE(review): when False, ``_word_texts`` leaves punctuation out of the
    # alignment entirely, so it does not appear in the word list at all —
    # confirm this matches the "reported but not counted" wording in the spec.
    grade_punctuation: bool = False
    # When False, combining marks are stripped from comparison keys.
    grade_diacritics: bool = True
def comparison_key(word: str, options: GradeOptions) -> str:
    """Map a surface word to the key used for alignment/equality.

    The surface form is preserved for display; only this key decides whether
    two words count as 'the same' under the active grading options.

    Args:
        word: Surface form of a single token.
        options: Active grading options; ``case_sensitive`` and
            ``grade_diacritics`` are consulted here.

    Returns:
        The NFC-normalized key, case-folded when case is not graded and with
        combining marks removed when diacritics are not graded.

    Never maps ß→ss or ö→oe: diacritic *grading* only strips combining marks
    (accents), and ß is not a combining mark, so it always stays distinct —
    including when case is ignored.
    """
    key = unicodedata.normalize("NFC", word)
    if not options.case_sensitive:
        # str.casefold() expands both ß and ẞ to "ss", which would make
        # "Straße" and "Strasse" compare equal. Case folding is defined per
        # character, so fold character by character and pin both sharp-s
        # forms to "ß" instead of letting them expand.
        key = "".join(
            "\u00df" if ch in "\u00df\u1e9e" else ch.casefold() for ch in key
        )
    if not options.grade_diacritics:
        # Decompose so accents become separate combining marks, drop them,
        # then recompose whatever is left.
        decomposed = unicodedata.normalize("NFD", key)
        key = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
        key = unicodedata.normalize("NFC", key)
    return key
def tokenize(text: str) -> list[Token]:
    """NFC-normalize ``text`` and split it into word and punctuation tokens.

    Python's ``\\w`` does not match combining marks, so a mark that has no
    precomposed NFC form (e.g. the tilde in "q̃", Thai or Devanagari vowel
    signs) would otherwise cut a word in two and surface as a stray
    punctuation token. Any mark that directly follows a word is therefore
    folded back into that word, together with word characters that continue
    right after it.

    Args:
        text: Raw text to tokenize.

    Returns:
        Tokens in reading order; ``is_word`` is False for punctuation.
    """
    text = unicodedata.normalize("NFC", text)
    tokens: list[Token] = []
    prev_end = -1  # end offset of the previous match, to detect adjacency
    for match in _TOKEN_RE.finditer(text):
        piece = match.group()
        is_word = re.match(r"\w", piece) is not None
        # General category M* = Mn/Mc/Me, i.e. any kind of combining mark.
        is_mark = unicodedata.category(piece[0]).startswith("M")
        follows_word = (
            bool(tokens) and tokens[-1].is_word and match.start() == prev_end
        )
        # A word piece can only be adjacent to a preceding word token when a
        # mark was just merged into it (``\w+`` is otherwise maximal), so
        # both cases mean "same word, keep going".
        if follows_word and (is_mark or is_word):
            tokens[-1] = Token(tokens[-1].text + piece, True)
        else:
            tokens.append(Token(piece, is_word=is_word))
        prev_end = match.end()
    return tokens
def graphemes(text: str) -> list[str]:
    """Split a string into grapheme clusters (a base char plus any trailing
    combining marks), so combining diacritics don't desync a character diff.

    A character is attached to the preceding cluster when it has a non-zero
    canonical combining class or its general category is a mark (Mn/Mc/Me).
    The category check matters because many marks — e.g. Devanagari vowel
    signs — have combining class 0 and would otherwise be split off from
    their base character.

    Args:
        text: String to split; it is not normalized here.

    Returns:
        Clusters in order. A mark with no preceding character starts its own
        cluster. Empty input yields an empty list.
    """
    clusters: list[str] = []
    for ch in text:
        is_mark = bool(unicodedata.combining(ch)) or unicodedata.category(
            ch
        ).startswith("M")
        if clusters and is_mark:
            clusters[-1] += ch
        else:
            clusters.append(ch)
    return clusters
def char_diff(expected: str, read: str) -> str:
    """Human-readable grapheme-level diff describing how ``read`` deviates from
    ``expected`` (e.g. ``ß→ss``, ``-t``, ``+e``). Empty string if identical.

    Args:
        expected: Reference surface form.
        read: Transcribed surface form.

    Returns:
        The non-equal edit operations, left to right, joined with ", ":
        ``old→new`` for a replacement, ``-x`` for a deletion, ``+x`` for an
        insertion.
    """
    a, b = graphemes(expected), graphemes(read)
    # autojunk=False: the default heuristic stops anchoring on graphemes that
    # are frequent in ``read`` once it has 200+ of them (possible for unspaced
    # scripts, where one token can be a whole clause), which would make the
    # diff depend on token length.
    matcher = SequenceMatcher(a=a, b=b, autojunk=False)
    parts: list[str] = []
    for op, i1, i2, j1, j2 in matcher.get_opcodes():
        if op == "equal":
            continue
        exp_chunk, read_chunk = "".join(a[i1:i2]), "".join(b[j1:j2])
        if op == "replace":
            parts.append(f"{exp_chunk}→{read_chunk}")
        elif op == "delete":
            parts.append(f"-{exp_chunk}")
        elif op == "insert":
            parts.append(f"+{read_chunk}")
    return ", ".join(parts)
class Word(BaseModel):
    """One graded word in the report (specs/ocr.md §7).

    ``expected`` is None for ``extra``; ``read`` is None for ``missing``;
    ``diff`` is present only for ``misspelled``."""

    # 0-based position of this entry in the report's word list.
    index: int
    # Surface form from the reference text.
    expected: str | None
    # Surface form from the transcription.
    read: str | None
    status: Status
    # Grapheme-level description of the deviation, e.g. "ß→ss".
    diff: str | None = None
def _word_texts(text: str, options: GradeOptions) -> list[str]:
    """Collect the surface forms that take part in alignment.

    Word tokens are always kept; punctuation tokens are kept only when
    ``options.grade_punctuation`` is set.
    """
    surfaces: list[str] = []
    for token in tokenize(text):
        if not (token.is_word or options.grade_punctuation):
            continue
        surfaces.append(token.text)
    return surfaces
def align_words(
    reference: str, transcription: str, options: GradeOptions | None = None
) -> list[Word]:
    """Align the blind transcription against the reference at word level and
    classify each token (specs/ocr.md stages 3-4). Deterministic, no model.

    Args:
        reference: The known-correct text.
        transcription: The text read from the learner's writing.
        options: Grading options; defaults to ``GradeOptions()`` when None.

    Returns:
        One ``Word`` per aligned position, in reading order, with ``index``
        equal to its position in the returned list.
    """
    if options is None:
        options = GradeOptions()
    ref = _word_texts(reference, options)
    read = _word_texts(transcription, options)
    # Alignment runs on comparison keys; surface forms are what gets reported.
    ref_keys = [comparison_key(w, options) for w in ref]
    read_keys = [comparison_key(w, options) for w in read]
    words: list[Word] = []

    def emit(expected: str | None, got: str | None, status: Status) -> None:
        # Only misspellings carry a diff; an empty diff is stored as None.
        diff = char_diff(expected, got) if status == "misspelled" else None
        words.append(
            Word(
                index=len(words),
                expected=expected,
                read=got,
                status=status,
                diff=diff or None,
            )
        )

    # autojunk=False: by default SequenceMatcher stops anchoring matches on
    # any element that makes up more than ~1% of a 200+ element second
    # sequence. In a long dictation that is every common word, and it can
    # turn a correctly written run into one big "replace".
    matcher = SequenceMatcher(a=ref_keys, b=read_keys, autojunk=False)
    for op, i1, i2, j1, j2 in matcher.get_opcodes():
        if op == "equal":
            for i, j in zip(range(i1, i2), range(j1, j2)):
                emit(ref[i], read[j], "correct")
        elif op == "replace":
            # Pair up positionally as misspellings; leftovers are missing/extra.
            paired = min(i2 - i1, j2 - j1)
            for k in range(paired):
                emit(ref[i1 + k], read[j1 + k], "misspelled")
            for i in range(i1 + paired, i2):
                emit(ref[i], None, "missing")
            for j in range(j1 + paired, j2):
                emit(None, read[j], "extra")
        elif op == "delete":
            for i in range(i1, i2):
                emit(ref[i], None, "missing")
        elif op == "insert":
            for j in range(j1, j2):
                emit(None, read[j], "extra")
    return words
class Summary(BaseModel):
    """Tally over the graded words. ``total`` counts reference words only
    (correct + misspelled + missing); extras don't inflate it."""

    # correct + misspelled + missing.
    total: int
    # Number of entries with each status.
    correct: int
    misspelled: int
    missing: int
    extra: int
    # correct / total rounded to 4 decimal places; 0.0 when total is 0.
    accuracy: float
class GradeReport(BaseModel):
    """The strict JSON grading report (specs/ocr.md §7)."""

    # Language tag, passed through unchanged from the caller of ``grade``.
    lang: str
    # NFC-normalized reference text.
    reference: str
    # NFC-normalized transcription text.
    transcription: str
    # Per-word grading results in reading order.
    words: list[Word]
    # Counts and accuracy derived from ``words``.
    summary: Summary
def _summarize(words: list[Word]) -> Summary:
    """Count the graded words per status and derive the accuracy.

    ``total`` covers reference words only (correct, misspelled, missing).
    Accuracy is ``correct / total`` rounded to four decimals, or 0.0 when
    there are no reference words.
    """
    tally = dict.fromkeys(("correct", "misspelled", "missing", "extra"), 0)
    for word in words:
        tally[word.status] = tally[word.status] + 1

    reference_words = tally["correct"] + tally["misspelled"] + tally["missing"]
    if reference_words:
        accuracy = round(tally["correct"] / reference_words, 4)
    else:
        accuracy = 0.0

    return Summary(
        total=reference_words,
        accuracy=accuracy,
        correct=tally["correct"],
        misspelled=tally["misspelled"],
        missing=tally["missing"],
        extra=tally["extra"],
    )
def grade(
    reference: str,
    transcription: str,
    lang: str,
    options: GradeOptions | None = None,
) -> GradeReport:
    """Produce the validated grading report for one transcription
    (specs/ocr.md stages 3-5). Works on text only; the image/VLM never
    enters here.

    The report carries the NFC-normalized reference and transcription, the
    per-word alignment, and the summary computed from it.
    """
    active_options = options or GradeOptions()
    graded_words = align_words(reference, transcription, active_options)
    nfc_reference = unicodedata.normalize("NFC", reference)
    nfc_transcription = unicodedata.normalize("NFC", transcription)
    tally = _summarize(graded_words)
    return GradeReport(
        lang=lang,
        reference=nfc_reference,
        transcription=nfc_transcription,
        words=graded_words,
        summary=tally,
    )
# One-character marker shown in front of each word line, keyed by status.
_MARKS = {"correct": "✓", "misspelled": "✗", "missing": "·", "extra": "+"}


def format_text_report(report: GradeReport) -> str:
    """Render a GradeReport as a human-readable plain-text report (derivable
    purely from the JSON, specs/ocr.md §2).

    Args:
        report: The grading report to render.

    Returns:
        A header line with the score and counts, followed by one line per
        graded word, joined with newlines.
    """
    s = report.summary
    lines = [
        f"[{report.lang}] {s.correct}/{s.total} correct "
        f"({s.accuracy * 100:.0f}%) "
        f"misspelled={s.misspelled} missing={s.missing} extra={s.extra}",
    ]
    for w in report.words:
        mark = _MARKS[w.status]
        if w.status == "correct":
            lines.append(f" {mark} {w.read}")
        elif w.status == "misspelled":
            line = f" {mark} {w.expected} → {w.read}"
            # ``diff`` is optional on the model; a report loaded from JSON
            # may omit it, and "[None]" is noise rather than information.
            if w.diff:
                line += f" [{w.diff}]"
            lines.append(line)
        elif w.status == "missing":
            lines.append(f" {mark} {w.expected} (missing)")
        else:  # extra
            lines.append(f" {mark} {w.read} (extra)")
    return "\n".join(lines)