Spaces:
Running
Running
Sync ShopStack 2026-06-15: corrections panel, empty-state rewrite, market-source suppression
8294cde verified | from __future__ import annotations | |
| import re | |
| import unicodedata | |
# Lookup tables for the Devanagari -> Latin transliterator.  Every key is a
# single Unicode code point, because the tables are probed one character at a
# time.

# Independent vowel letters (the form a vowel takes when it does not attach
# to a consonant).
_DEVANAGARI_VOWELS = {
    "अ": "a",
    "आ": "aa",
    "इ": "i",
    "ई": "i",
    "उ": "u",
    "ऊ": "oo",
    "ऋ": "ri",
    "ॠ": "ri",
    "ऌ": "li",
    "ॡ": "li",
    "ए": "e",
    "ऐ": "ai",
    "ओ": "o",
    "औ": "au",
    "ऑ": "o",
    "ऒ": "o",
    "ऍ": "e",
    "ॲ": "a",
}

# Consonant letters, mapped WITHOUT the inherent vowel.
_DEVANAGARI_CONSONANTS = {
    "क": "k",
    "ख": "kh",
    "ग": "g",
    "घ": "gh",
    "ङ": "ng",
    "च": "ch",
    "छ": "chh",
    "ज": "j",
    "झ": "jh",
    "ञ": "ny",
    "ट": "t",
    "ठ": "th",
    "ड": "d",
    "ढ": "dh",
    "ण": "n",
    "त": "t",
    "थ": "th",
    "द": "d",
    "ध": "dh",
    "न": "n",
    "प": "p",
    "फ": "ph",
    "ब": "b",
    "भ": "bh",
    "म": "m",
    "य": "y",
    "र": "r",
    "ल": "l",
    "व": "v",
    "श": "sh",
    "ष": "sh",
    "स": "s",
    "ह": "h",
    "ळ": "l",
    # Precomposed nukta letters.  They are spelled as code-point escapes so
    # that each key stays ONE character even if this file is NFC-normalized
    # (NFC rewrites U+0958..U+095F as base letter + U+093C, which would turn
    # the key into a two-character string that can never match).
    "\u0958": "q",  # QA   (ka + nukta)
    "\u0959": "kh",  # KHHA (kha + nukta)
    "\u095a": "g",  # GHHA (ga + nukta)
    "\u095b": "z",  # ZA   (ja + nukta)
    "\u095e": "f",  # FA   (pha + nukta)
    "\u095c": "d",  # DDDHA (dda + nukta)
    "\u095d": "dh",  # RHA  (ddha + nukta)
    "\u0929": "n",  # NNNA (na + nukta)
    "\u0931": "r",  # RRA  (ra + nukta)
    "\u095f": "y",  # YYA  (ya + nukta)
}

# Dependent vowel signs (matras).  Written as escapes because bare combining
# marks are hard to read and easy to corrupt inside a string literal.
_DEVANAGARI_MATRAS = {
    "\u093e": "a",  # sign AA
    "\u093f": "i",  # sign I
    "\u0940": "i",  # sign II
    "\u0941": "u",  # sign U
    "\u0942": "oo",  # sign UU
    "\u0943": "ri",  # sign vocalic R
    "\u0944": "ri",  # sign vocalic RR
    "\u0962": "li",  # sign vocalic L
    "\u0963": "li",  # sign vocalic LL
    "\u0947": "e",  # sign E
    "\u0948": "ai",  # sign AI
    "\u094b": "o",  # sign O
    "\u094c": "au",  # sign AU
    # Candra / short vowel signs.  Their independent forms are mapped in
    # _DEVANAGARI_VOWELS, so the dependent forms get the same Latin value.
    "\u0945": "e",  # sign candra E
    "\u0946": "e",  # sign short E
    "\u0949": "o",  # sign candra O
    "\u094a": "o",  # sign short O
}

# Other signs that can follow a letter.
_DEVANAGARI_SIGNS = {
    "\u0902": "n",  # anusvara
    "\u0901": "n",  # candrabindu
    "\u0903": "h",  # visarga
    "\u093d": "",  # avagraha: dropped from the output
}
# Spelling rules applied to normalized text: each entry is
# (regex pattern, replacement).  Patterns are anchored with \b so they only
# rewrite whole words.  A few entries map a spelling to itself and therefore
# never change the text.
_ASCII_ALIASES = [
    (r"\bpyaj\b", "pyaaz"),
    (r"\bpyaaj\b", "pyaaz"),
    (r"\bpyaaja\b", "pyaaz"),
    (r"\baloo\b", "aloo"),
    (r"\baaloo\b", "aloo"),
    (r"\bdoodh\b", "doodh"),
    (r"\bduudh\b", "doodh"),
    (r"\btamatar\b", "tamatar"),
    (r"\btamaatara\b", "tamatar"),
    (r"\btamaatar\b", "tamatar"),
    (r"\bdhaniya\b", "dhaniya"),
    (r"\bdhania\b", "dhaniya"),
    (r"\bdahee\b", "dahi"),
    (r"\bdehee\b", "dahi"),
    (r"\bph?ridg[e]?\b", "fridge"),
    (r"\bphrij\b", "fridge"),
    (r"\bfrij\b", "fridge"),
    (r"\bmrpi?\b", "mrp"),
    (r"\bskipa?\b", "skip"),
    (r"\brekord\b", "record"),
    (r"\bkuntop\b", "counter"),
    (r"\bcaruntav\b", "counter"),
    (r"\bkaruntav\b", "counter"),
    (r"\bpantr(?:y|ii|i)\b", "pantry"),
    (r"\bbathrum\b", "bathroom"),
    (r"\bbathroo?m\b", "bathroom"),
    (r"\bshel[fv]\b", "shelf"),
    (r"\beks[a-z]*pay[a-z]*ri\b", "expiry"),
    (r"\beks[a-z]*pari\b", "expiry"),
    (r"\beks[a-z]*pari?y\b", "expiry"),
    (r"\bsarph\b", "surf"),
    (r"\bsaraph\b", "surf"),
    (r"\bsurf\b", "surf"),
    (r"\bexc[eiy]l\b", "excel"),
    (r"\beksel\b", "excel"),
    (r"\biksel\b", "excel"),
    (r"\balredi\b", "already"),
    (r"\bolaredi\b", "already"),
    (r"\bcola?gate\b", "colgate"),
    (r"\bbreda\b", "bread"),
    (r"\bbreada\b", "bread"),
    (r"\bchawala\b", "chawal"),
]

# Matches any single character in the Devanagari block.
_DEVANAGARI_RE = re.compile(r"[\u0900-\u097f]")

# Matches a run of whitespace (used to collapse it to one space).
_WHITESPACE_RE = re.compile(r"\s+")

# Matches punctuation/symbols that should be blanked out before tokenizing.
# `\w` does not cover combining marks, so a plain [^\w\s] would also blank
# Devanagari matras, virama and anusvara and chop every Devanagari word into
# fragments.  The two extra ranges keep the whole Devanagari block except
# U+0964..U+0970: danda, double danda and the abbreviation sign are still
# removed, while the digits U+0966..U+096F survive because `\w` matches them.
_NON_WORD_RE = re.compile(r"[^\w\s\u0900-\u0963\u0971-\u097f]")
# Combining marks the transliterator handles explicitly.
_NUKTA = "\u093c"
_VIRAMA = "\u094d"

# Base consonants whose nukta form reads differently from the bare letter.
# Any other base + nukta keeps the base letter's own Latin value.
_NUKTA_LATIN = {
    "\u0915": "q",  # ka + nukta
    "\u091c": "z",  # ja + nukta
    "\u092b": "f",  # pha + nukta
}

# Devanagari digits U+0966..U+096F -> ASCII digits.
_DEVANAGARI_DIGITS = str.maketrans(
    "\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f",
    "0123456789",
)


def transliterate_devanagari(text: str) -> str:
    """Best-effort Devanagari -> Latin transliteration for benchmark scoring.

    Characters are mapped one at a time through the module's vowel,
    consonant, matra and sign tables; anything unmapped passes through
    unchanged.  A consonant is written

    * bare when followed by a virama (which is consumed),
    * with the matra's vowel when followed by a matra (which is consumed),
    * with an inherent "a" when followed by another Devanagari letter/sign,
    * bare at the end of a word.

    Args:
        text: Input string; ``None`` or empty yields ``""``.

    Returns:
        The transliterated string with Devanagari digits converted to ASCII
        digits and the result NFKC-normalized.
    """
    # NFD splits precomposed nukta letters into base + U+093C, so both
    # encodings of e.g. "za" take the same path below.  The NFKC pass at the
    # end recomposes whatever non-Devanagari text NFD took apart.
    chars = unicodedata.normalize("NFD", text or "")
    length = len(chars)
    out: list[str] = []
    i = 0
    while i < length:
        ch = chars[i]
        if ch in _DEVANAGARI_VOWELS:
            out.append(_DEVANAGARI_VOWELS[ch])
        elif ch in _DEVANAGARI_CONSONANTS:
            base = _DEVANAGARI_CONSONANTS[ch]
            if i + 1 < length and chars[i + 1] == _NUKTA:
                # Fold the nukta into the consonant instead of letting it
                # count as "another Devanagari character" and leak through.
                base = _NUKTA_LATIN.get(ch, base)
                i += 1
            nxt = chars[i + 1] if i + 1 < length else ""
            if nxt == _VIRAMA:
                out.append(base)
                i += 1
            elif nxt in _DEVANAGARI_MATRAS:
                out.append(base + _DEVANAGARI_MATRAS[nxt])
                i += 1
            elif (
                nxt
                and _DEVANAGARI_RE.match(nxt)
                # U+0964..U+0970 are dandas, digits and the abbreviation
                # sign: they end the word just like a space does, so no
                # inherent vowel is added in front of them.
                and not "\u0964" <= nxt <= "\u0970"
            ):
                out.append(base + "a")
            else:
                out.append(base)
        elif ch in _DEVANAGARI_MATRAS:
            out.append(_DEVANAGARI_MATRAS[ch])
        elif ch in _DEVANAGARI_SIGNS:
            out.append(_DEVANAGARI_SIGNS[ch])
        elif ch == _VIRAMA or ch == _NUKTA:
            # Stray virama/nukta with no consonant in front: drop it.
            pass
        elif ch == "\u0964" or ch == "\u0965":
            # Danda / double danda become a full stop.
            out.append(".")
        else:
            out.append(ch)
        i += 1
    transliterated = "".join(out).translate(_DEVANAGARI_DIGITS)
    return unicodedata.normalize("NFKC", transliterated)
def normalize_text(text: str, *, transliterate: bool = False) -> str:
    """Normalize benchmark text for fair WER/slot comparisons.

    Steps, in order: lowercase and trim; optionally transliterate when the
    text contains Devanagari; apply every ``_ASCII_ALIASES`` rewrite; replace
    each character matched by ``_NON_WORD_RE`` with a space; collapse
    whitespace runs to a single space and trim again.

    Args:
        text: Raw text; ``None`` or empty yields ``""``.
        transliterate: When true, Devanagari input is passed through
            ``transliterate_devanagari`` before the alias rules run.

    Returns:
        The normalized, single-spaced string.
    """
    cleaned = (text or "").lower().strip()

    wants_latin = transliterate and _DEVANAGARI_RE.search(cleaned)
    if wants_latin:
        cleaned = transliterate_devanagari(cleaned)

    # Rules run in list order, each on the output of the previous one.
    for spelling, canonical in _ASCII_ALIASES:
        cleaned = re.sub(spelling, canonical, cleaned)

    spaced = _NON_WORD_RE.sub(" ", cleaned)
    return _WHITESPACE_RE.sub(" ", spaced).strip()
def compute_wer(reference: str, hypothesis: str, *, transliterate_hypothesis: bool = False) -> float:
    """Word error rate with optional Devanagari transliteration on hypothesis.

    Both strings go through ``normalize_text`` and are split on whitespace;
    only the hypothesis is ever transliterated.  The result is the word-level
    edit distance (substitution, insertion and deletion each cost 1) divided
    by the number of reference words.

    Args:
        reference: Ground-truth text.
        hypothesis: Text being scored.
        transliterate_hypothesis: Forwarded to ``normalize_text`` as
            ``transliterate`` for the hypothesis only.

    Returns:
        The error rate.  With an empty reference it is ``0.0`` when the
        hypothesis is empty too and ``1.0`` otherwise.
    """
    ref_tokens = normalize_text(reference).split()
    hyp_tokens = normalize_text(hypothesis, transliterate=transliterate_hypothesis).split()

    if not ref_tokens:
        return 1.0 if hyp_tokens else 0.0

    # Rolling-row edit distance: `previous[col]` is the distance between the
    # reference prefix handled so far and the first `col` hypothesis tokens.
    previous = list(range(len(hyp_tokens) + 1))
    for row, ref_token in enumerate(ref_tokens, start=1):
        current = [row]
        for col, hyp_token in enumerate(hyp_tokens, start=1):
            if ref_token == hyp_token:
                current.append(previous[col - 1])
            else:
                cheapest = min(previous[col], current[col - 1], previous[col - 1])
                current.append(1 + cheapest)
        previous = current

    return previous[-1] / len(ref_tokens)