"""Text normalisation helpers for benchmark scoring.

Provides a best-effort Devanagari -> Latin transliteration, a text
normaliser that canonicalises a fixed list of spelling variants, and a
word-error-rate (WER) function built on top of both.
"""

from __future__ import annotations

import re
import unicodedata

# Independent vowel letters.
_DEVANAGARI_VOWELS = {
    "अ": "a", "आ": "aa", "इ": "i", "ई": "i", "उ": "u", "ऊ": "oo",
    "ऋ": "ri", "ॠ": "ri", "ऌ": "li", "ॡ": "li",
    "ए": "e", "ऐ": "ai", "ओ": "o", "औ": "au",
    "ऑ": "o", "ऒ": "o", "ऍ": "e", "ॲ": "a",
}

# Precomposed nukta consonants, written as escapes so the keys are guaranteed
# to be single code points (editors and normalisers often decompose them).
_NUKTA_CONSONANTS = {
    "\u0958": "q",   # qa   (ka + nukta)
    "\u0959": "kh",  # khha (kha + nukta)
    "\u095a": "g",   # ghha (ga + nukta)
    "\u095b": "z",   # za   (ja + nukta)
    "\u095e": "f",   # fa   (pha + nukta)
    "\u095c": "d",   # dddha (dda + nukta)
    "\u095d": "dh",  # rha  (ddha + nukta)
    "\u0929": "n",   # nnna (na + nukta)
    "\u0931": "r",   # rra  (ra + nukta)
    "\u095f": "y",   # yya  (ya + nukta)
}

# Consonant letters mapped to their Latin form WITHOUT the inherent vowel.
_DEVANAGARI_CONSONANTS = {
    "क": "k", "ख": "kh", "ग": "g", "घ": "gh", "ङ": "ng",
    "च": "ch", "छ": "chh", "ज": "j", "झ": "jh", "ञ": "ny",
    "ट": "t", "ठ": "th", "ड": "d", "ढ": "dh", "ण": "n",
    "त": "t", "थ": "th", "द": "d", "ध": "dh", "न": "n",
    "प": "p", "फ": "ph", "ब": "b", "भ": "bh", "म": "m",
    "य": "y", "र": "r", "ल": "l", "व": "v",
    "श": "sh", "ष": "sh", "स": "s", "ह": "h", "ळ": "l",
    **_NUKTA_CONSONANTS,
}

# Dependent vowel signs (matras), keyed by code point.
_DEVANAGARI_MATRAS = {
    "\u093e": "a",   # aa
    "\u093f": "i",   # i
    "\u0940": "i",   # ii
    "\u0941": "u",   # u
    "\u0942": "oo",  # uu
    "\u0943": "ri",  # vocalic r
    "\u0944": "ri",  # vocalic rr
    "\u0962": "li",  # vocalic l
    "\u0963": "li",  # vocalic ll
    "\u0947": "e",   # e
    "\u0948": "ai",  # ai
    "\u094b": "o",   # o
    "\u094c": "au",  # au
    # Matra counterparts of the independent vowels ऍ / ऑ / ऒ mapped above.
    # Without them a word such as "कॉफी" leaked a raw combining mark that
    # normalize_text() then turned into a space, splitting the word.
    "\u0945": "e",   # candra e
    "\u0949": "o",   # candra o
    "\u094a": "o",   # short o
}

_DEVANAGARI_SIGNS = {
    "\u0902": "n",  # anusvara
    "\u0901": "n",  # chandrabindu
    "\u0903": "h",  # visarga
    "\u093d": "",   # avagraha (dropped)
}

_VIRAMA = "\u094d"
_NUKTA = "\u093c"
_DANDAS = ("\u0964", "\u0965")

# base consonant -> precomposed nukta consonant, derived from the canonical
# decomposition so decomposed input (base + U+093C) resolves to the same entry.
_NUKTA_FOLD = {
    unicodedata.normalize("NFD", composed)[0]: composed
    for composed in _NUKTA_CONSONANTS
}

_DEVANAGARI_DIGITS = str.maketrans("०१२३४५६७८९", "0123456789")

# (regex pattern, canonical spelling) pairs applied in order by normalize_text.
_ASCII_ALIASES = [
    (r"\bpyaj\b", "pyaaz"),
    (r"\bpyaaj\b", "pyaaz"),
    (r"\bpyaaja\b", "pyaaz"),
    (r"\baloo\b", "aloo"),
    (r"\baaloo\b", "aloo"),
    (r"\bdoodh\b", "doodh"),
    (r"\bduudh\b", "doodh"),
    (r"\btamatar\b", "tamatar"),
    (r"\btamaatara\b", "tamatar"),
    (r"\btamaatar\b", "tamatar"),
    (r"\bdhaniya\b", "dhaniya"),
    (r"\bdhania\b", "dhaniya"),
    (r"\bdahee\b", "dahi"),
    (r"\bdehee\b", "dahi"),
    (r"\bph?ridg[e]?\b", "fridge"),
    (r"\bphrij\b", "fridge"),
    (r"\bfrij\b", "fridge"),
    (r"\bmrpi?\b", "mrp"),
    (r"\bskipa?\b", "skip"),
    (r"\brekord\b", "record"),
    (r"\bkuntop\b", "counter"),
    (r"\bcaruntav\b", "counter"),
    (r"\bkaruntav\b", "counter"),
    (r"\bpantr(?:y|ii|i)\b", "pantry"),
    (r"\bbathrum\b", "bathroom"),
    (r"\bbathroo?m\b", "bathroom"),
    (r"\bshel[fv]\b", "shelf"),
    (r"\beks[a-z]*pay[a-z]*ri\b", "expiry"),
    (r"\beks[a-z]*pari\b", "expiry"),
    (r"\beks[a-z]*pari?y\b", "expiry"),
    (r"\bsarph\b", "surf"),
    (r"\bsaraph\b", "surf"),
    (r"\bsurf\b", "surf"),
    (r"\bexc[eiy]l\b", "excel"),
    (r"\beksel\b", "excel"),
    (r"\biksel\b", "excel"),
    (r"\balredi\b", "already"),
    (r"\bolaredi\b", "already"),
    (r"\bcola?gate\b", "colgate"),
    (r"\bbreda\b", "bread"),
    (r"\bbreada\b", "bread"),
    (r"\bchawala\b", "chawal"),
]

# Compiled once at import time; order matches _ASCII_ALIASES.
_COMPILED_ALIASES = tuple(
    (re.compile(pattern), replacement) for pattern, replacement in _ASCII_ALIASES
)

_DEVANAGARI_RE = re.compile(r"[\u0900-\u097f]")
# Letters, signs and marks only: excludes the dandas, the Devanagari digits
# and the abbreviation sign (U+0964-U+0970), which end a word rather than
# continue it.
_DEVANAGARI_WORD_RE = re.compile(r"[\u0900-\u0963\u0971-\u097f]")
_WHITESPACE_RE = re.compile(r"\s+")
_NON_WORD_RE = re.compile(r"[^\w\s]")


def transliterate_devanagari(text: str) -> str:
    """Best-effort Devanagari -> Latin transliteration for benchmark scoring.

    Rules applied per character:

    * A consonant followed by a virama is emitted bare; followed by a matra
      it takes that vowel; followed by another Devanagari letter or sign it
      takes the inherent ``a``; otherwise (end of input, whitespace,
      punctuation, danda, digit) it is emitted bare.
    * A consonant followed by a nukta (U+093C) is folded to its precomposed
      nukta form when one is known; the nukta is consumed either way.
    * Dandas become ``"."``, Devanagari digits become ASCII digits, and
      characters with no mapping pass through unchanged.

    Args:
        text: Input text; ``None`` or an empty string yields ``""``.

    Returns:
        The transliterated text, NFKC-normalised.
    """
    out: list[str] = []
    chars = text or ""
    length = len(chars)
    i = 0
    while i < length:
        ch = chars[i]
        if ch in _DEVANAGARI_CONSONANTS:
            # Decomposed nukta forms (base + U+093C) are what NFC-normalised
            # text contains, so fold them before looking anything up.
            if i + 1 < length and chars[i + 1] == _NUKTA:
                ch = _NUKTA_FOLD.get(ch, ch)
                i += 1
            base = _DEVANAGARI_CONSONANTS[ch]
            nxt = chars[i + 1] if i + 1 < length else ""
            if nxt == _VIRAMA:
                out.append(base)
                i += 1
            elif nxt in _DEVANAGARI_MATRAS:
                out.append(base + _DEVANAGARI_MATRAS[nxt])
                i += 1
            elif nxt and _DEVANAGARI_WORD_RE.match(nxt):
                # The word continues, so the inherent vowel is pronounced.
                out.append(base + "a")
            else:
                # Word-final consonant: no inherent vowel.
                out.append(base)
        elif ch in _DEVANAGARI_VOWELS:
            out.append(_DEVANAGARI_VOWELS[ch])
        elif ch in _DEVANAGARI_MATRAS:
            out.append(_DEVANAGARI_MATRAS[ch])
        elif ch in _DEVANAGARI_SIGNS:
            out.append(_DEVANAGARI_SIGNS[ch])
        elif ch == _VIRAMA or ch == _NUKTA:
            # Stray combining marks carry no sound of their own.
            pass
        elif ch in _DANDAS:
            out.append(".")
        else:
            out.append(ch)
        i += 1

    transliterated = "".join(out).translate(_DEVANAGARI_DIGITS)
    return unicodedata.normalize("NFKC", transliterated)


def normalize_text(text: str, *, transliterate: bool = False) -> str:
    """Normalize benchmark text for fair WER/slot comparisons.

    The text is lower-cased and stripped, optionally transliterated (only
    when it actually contains Devanagari), passed through the alias table,
    and finally has non-word characters replaced by spaces and whitespace
    runs collapsed.

    Args:
        text: Input text; ``None`` or an empty string yields ``""``.
        transliterate: When true, Devanagari content is converted to Latin
            before the alias rules run.

    Returns:
        The normalised, single-space-separated string.
    """
    normalized = (text or "").lower().strip()
    if transliterate and _DEVANAGARI_RE.search(normalized):
        normalized = transliterate_devanagari(normalized)
    # Aliases run before punctuation stripping; their \b anchors already
    # treat punctuation as a word boundary.
    for pattern, replacement in _COMPILED_ALIASES:
        normalized = pattern.sub(replacement, normalized)
    normalized = _NON_WORD_RE.sub(" ", normalized)
    return _WHITESPACE_RE.sub(" ", normalized).strip()


def compute_wer(
    reference: str,
    hypothesis: str,
    *,
    transliterate_hypothesis: bool = False,
) -> float:
    """Word error rate with optional Devanagari transliteration on hypothesis.

    Both strings go through ``normalize_text``; only the hypothesis is ever
    transliterated. The result is the word-level edit distance divided by the
    number of reference words, so it can exceed 1.0 when the hypothesis is
    much longer than the reference.

    Args:
        reference: Ground-truth text.
        hypothesis: Text being scored.
        transliterate_hypothesis: Transliterate Devanagari in the hypothesis
            before comparing.

    Returns:
        The WER. With an empty reference: 0.0 if the hypothesis is also
        empty, otherwise 1.0.
    """
    ref_words = normalize_text(reference).split()
    hyp_words = normalize_text(
        hypothesis, transliterate=transliterate_hypothesis
    ).split()
    if not ref_words:
        return 0.0 if not hyp_words else 1.0

    # Edit distance using two rolling rows instead of the full matrix.
    previous = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            if ref_word == hyp_word:
                current.append(previous[j - 1])
            else:
                current.append(
                    1 + min(previous[j], current[j - 1], previous[j - 1])
                )
        previous = current
    return previous[-1] / len(ref_words)