shopstack / benchmarks /modal /text_normalization.py
pranaysuyash's picture
Sync ShopStack 2026-06-15: corrections panel, empty-state rewrite, market-source suppression
8294cde verified
Raw
History Blame Contribute Delete
5.98 kB
from __future__ import annotations
import re
import unicodedata
# Independent (word-initial / standalone) Devanagari vowel letters mapped to
# the Latin spelling emitted by transliterate_devanagari().
# The mapping is lossy: several letters share one output (e.g. both "i"
# vowels become "i", and four different letters become "o" or "e").
_DEVANAGARI_VOWELS = {
"अ": "a",
"आ": "aa",
"इ": "i",
"ई": "i",
"उ": "u",
"ऊ": "oo",
"ऋ": "ri",
"ॠ": "ri",
"ऌ": "li",
"ॡ": "li",
"ए": "e",
"ऐ": "ai",
"ओ": "o",
"औ": "au",
"ऑ": "o",
"ऒ": "o",
"ऍ": "e",
"ॲ": "a",
}
# Devanagari consonant letters mapped to their Latin spelling WITHOUT the
# inherent vowel; transliterate_devanagari() decides whether to append "a",
# a matra spelling, or nothing.
# The mapping is lossy: distinct letters can share one output (for example
# two different letters map to "t" and two to "sh").
# NOTE(review): the last ten entries are nukta (dotted) forms. The
# transliterator looks characters up one code point at a time, so such a key
# can only match if it is stored as a single precomposed code point rather
# than "consonant + U+093C" — confirm how these literals are encoded.
_DEVANAGARI_CONSONANTS = {
"क": "k",
"ख": "kh",
"ग": "g",
"घ": "gh",
"ङ": "ng",
"च": "ch",
"छ": "chh",
"ज": "j",
"झ": "jh",
"ञ": "ny",
"ट": "t",
"ठ": "th",
"ड": "d",
"ढ": "dh",
"ण": "n",
"त": "t",
"थ": "th",
"द": "d",
"ध": "dh",
"न": "n",
"प": "p",
"फ": "ph",
"ब": "b",
"भ": "bh",
"म": "m",
"य": "y",
"र": "r",
"ल": "l",
"व": "v",
"श": "sh",
"ष": "sh",
"स": "s",
"ह": "h",
"ळ": "l",
"क़": "q",
"ख़": "kh",
"ग़": "g",
"ज़": "z",
"फ़": "f",
"ड़": "d",
"ढ़": "dh",
"ऩ": "n",
"ऱ": "r",
"य़": "y",
}
# Dependent vowel signs (matras) mapped to their Latin spelling.
# transliterate_devanagari() appends the value to the preceding consonant's
# spelling in place of the inherent "a"; a matra that appears without a
# preceding consonant is emitted on its own.
_DEVANAGARI_MATRAS = {
"ा": "a",
"ि": "i",
"ी": "i",
"ु": "u",
"ू": "oo",
"ृ": "ri",
"ॄ": "ri",
"ॢ": "li",
"ॣ": "li",
"े": "e",
"ै": "ai",
"ो": "o",
"ौ": "au",
}
# Other Devanagari signs mapped to a Latin approximation.
# The two nasalisation marks both become "n", the visarga becomes "h", and
# the last entry maps to the empty string, i.e. it is dropped from the output.
_DEVANAGARI_SIGNS = {
"ं": "n",
"ँ": "n",
"ः": "h",
"ऽ": "",
}
# Regex patterns for alternative Latin spellings, grouped by the canonical
# token each group collapses to. Dict order is significant: it fixes the order
# in which the substitutions are later applied. A few patterns match the
# canonical spelling itself and are therefore no-ops.
_ALIAS_PATTERNS_BY_CANONICAL = {
    "pyaaz": (r"\bpyaj\b", r"\bpyaaj\b", r"\bpyaaja\b"),
    "aloo": (r"\baloo\b", r"\baaloo\b"),
    "doodh": (r"\bdoodh\b", r"\bduudh\b"),
    "tamatar": (r"\btamatar\b", r"\btamaatara\b", r"\btamaatar\b"),
    "dhaniya": (r"\bdhaniya\b", r"\bdhania\b"),
    "dahi": (r"\bdahee\b", r"\bdehee\b"),
    "fridge": (r"\bph?ridg[e]?\b", r"\bphrij\b", r"\bfrij\b"),
    "mrp": (r"\bmrpi?\b",),
    "skip": (r"\bskipa?\b",),
    "record": (r"\brekord\b",),
    "counter": (r"\bkuntop\b", r"\bcaruntav\b", r"\bkaruntav\b"),
    "pantry": (r"\bpantr(?:y|ii|i)\b",),
    "bathroom": (r"\bbathrum\b", r"\bbathroo?m\b"),
    "shelf": (r"\bshel[fv]\b",),
    "expiry": (
        r"\beks[a-z]*pay[a-z]*ri\b",
        r"\beks[a-z]*pari\b",
        r"\beks[a-z]*pari?y\b",
    ),
    "surf": (r"\bsarph\b", r"\bsaraph\b", r"\bsurf\b"),
    "excel": (r"\bexc[eiy]l\b", r"\beksel\b", r"\biksel\b"),
    "already": (r"\balredi\b", r"\bolaredi\b"),
    "colgate": (r"\bcola?gate\b",),
    "bread": (r"\bbreda\b", r"\bbreada\b"),
    "chawal": (r"\bchawala\b",),
}

# Flat ``(pattern, replacement)`` list consumed by normalize_text(), which
# applies the pairs one after another with ``re.sub``.
_ASCII_ALIASES = [
    (variant_pattern, canonical)
    for canonical, variant_patterns in _ALIAS_PATTERNS_BY_CANONICAL.items()
    for variant_pattern in variant_patterns
]
# Matches any single code point in the Devanagari Unicode block.
_DEVANAGARI_RE = re.compile(r"[\u0900-\u097f]")
# Matches a run of whitespace; used to collapse it to one space.
_WHITESPACE_RE = re.compile(r"\s+")
# Matches punctuation/symbols that normalisation replaces with a space.
# Python's ``\w`` covers only alphanumerics and "_", so it does NOT match
# combining marks. Without the explicit ranges below every matra, virama,
# anusvara and nukta would be blanked out and an un-transliterated Devanagari
# word would be shattered into one token per base letter, distorting WER.
_NON_WORD_RE = re.compile(
    r"[^\w\s"
    r"\u0900-\u0903"  # candrabindu, anusvara, visarga
    r"\u093a-\u094f"  # nukta, dependent vowel signs, virama
    r"\u0951-\u0957"  # accent marks and additional vowel signs
    r"\u0962\u0963"  # vocalic l / ll vowel signs
    r"]"
)
def transliterate_devanagari(text: str) -> str:
    """Best-effort Devanagari -> Latin transliteration for benchmark scoring.

    Rules, applied left to right:

    * Independent vowels, matras and signs are replaced via the module tables.
    * A consonant followed by a virama is emitted bare (conjunct).
    * A consonant followed by a matra is emitted with the matra's spelling.
    * A consonant followed by another Devanagari letter or sign gets the
      inherent vowel "a". It does NOT get one at the end of a word, which
      includes the position before a danda, the abbreviation sign or a
      Devanagari digit (so "...र।" yields "r." rather than "ra.").
    * A consonant followed by a combining nukta (U+093C) is looked up as the
      dotted letter; precomposed dotted letters in the input are decomposed
      first so both encodings give the same result. A nukta with no table
      entry is dropped.
    * Danda and double danda become ".", Devanagari digits become ASCII
      digits, and every other character is passed through unchanged.

    Args:
        text: Input string; ``None`` or empty is treated as "".

    Returns:
        The transliterated string, NFKC-normalised.
    """
    nukta = "\u093c"
    virama = "\u094d"
    danda, double_danda, abbreviation_sign = "\u0964", "\u0965", "\u0970"
    devanagari_digits = "०१२३४५६७८९"
    # Characters inside the Devanagari block that end a word instead of
    # continuing it; a consonant before one of these keeps no inherent vowel.
    word_enders = set(devanagari_digits) | {danda, double_danda, abbreviation_sign}

    # Split precomposed dotted letters into "base + nukta" so that both input
    # encodings take the same path below. Only Devanagari code points are
    # touched; everything else is left exactly as given.
    chars = _DEVANAGARI_RE.sub(
        lambda match: unicodedata.normalize("NFD", match.group()), text or ""
    )

    # "base + nukta" -> Latin, derived from the consonant table regardless of
    # whether its dotted keys were written precomposed or decomposed.
    nukta_forms: dict[str, str] = {}
    if nukta in chars:
        for key, latin in _DEVANAGARI_CONSONANTS.items():
            decomposed = unicodedata.normalize("NFD", key)
            if len(decomposed) == 2 and decomposed[1] == nukta:
                nukta_forms[decomposed] = latin

    out: list[str] = []
    length = len(chars)
    i = 0
    while i < length:
        ch = chars[i]
        if ch in _DEVANAGARI_VOWELS:
            out.append(_DEVANAGARI_VOWELS[ch])
        elif ch in _DEVANAGARI_CONSONANTS:
            base = _DEVANAGARI_CONSONANTS[ch]
            if i + 1 < length and chars[i + 1] == nukta:
                # Dotted consonant: prefer its own spelling and consume the dot.
                base = nukta_forms.get(ch + nukta, base)
                i += 1
            nxt = chars[i + 1] if i + 1 < length else ""
            if nxt == virama:
                out.append(base)
                i += 1
            elif nxt in _DEVANAGARI_MATRAS:
                out.append(base + _DEVANAGARI_MATRAS[nxt])
                i += 1
            elif nxt and _DEVANAGARI_RE.match(nxt) and nxt not in word_enders:
                out.append(base + "a")
            else:
                out.append(base)
        elif ch in _DEVANAGARI_MATRAS:
            out.append(_DEVANAGARI_MATRAS[ch])
        elif ch in _DEVANAGARI_SIGNS:
            out.append(_DEVANAGARI_SIGNS[ch])
        elif ch == virama or ch == nukta:
            # Stray combining marks carry no sound of their own.
            pass
        elif ch == danda or ch == double_danda:
            out.append(".")
        else:
            out.append(ch)
        i += 1

    transliterated = "".join(out)
    transliterated = transliterated.translate(
        str.maketrans(devanagari_digits, "0123456789")
    )
    return unicodedata.normalize("NFKC", transliterated)
def normalize_text(text: str, *, transliterate: bool = False) -> str:
    """Normalize benchmark text for fair WER/slot comparisons.

    Steps: lowercase and trim; optionally transliterate when the text
    contains Devanagari; rewrite known spelling variants to their canonical
    token; replace every character matched by ``_NON_WORD_RE`` with a space;
    collapse whitespace runs to a single space.

    Args:
        text: Raw text; ``None`` or empty yields "".
        transliterate: When true, Devanagari in ``text`` is converted to
            Latin before the alias rewrite.

    Returns:
        The normalized, single-spaced string.
    """
    if not text:
        return ""

    cleaned = text.lower().strip()

    has_devanagari = _DEVANAGARI_RE.search(cleaned) is not None
    if transliterate and has_devanagari:
        cleaned = transliterate_devanagari(cleaned)

    # Order matters: each substitution sees the result of the previous one.
    for alias_pattern, canonical in _ASCII_ALIASES:
        cleaned = re.sub(alias_pattern, canonical, cleaned)

    without_punctuation = _NON_WORD_RE.sub(" ", cleaned)
    single_spaced = _WHITESPACE_RE.sub(" ", without_punctuation)
    return single_spaced.strip()
def compute_wer(reference: str, hypothesis: str, *, transliterate_hypothesis: bool = False) -> float:
    """Word error rate with optional Devanagari transliteration on hypothesis.

    Both strings go through ``normalize_text`` and are split on whitespace;
    only the hypothesis is ever transliterated. The score is the word-level
    edit distance (substitutions, insertions, deletions, each costing 1)
    divided by the number of reference words.

    Args:
        reference: Ground-truth text.
        hypothesis: Text to score against the reference.
        transliterate_hypothesis: Forwarded to ``normalize_text`` as
            ``transliterate`` for the hypothesis only.

    Returns:
        The error rate. With an empty reference the result is 0.0 when the
        hypothesis is empty too, otherwise 1.0.
    """
    ref_tokens = normalize_text(reference).split()
    hyp_tokens = normalize_text(hypothesis, transliterate=transliterate_hypothesis).split()

    if not ref_tokens:
        return 1.0 if hyp_tokens else 0.0

    # Rolling two-row edit-distance table: ``previous`` is the row for the
    # reference prefix one word shorter than the one being filled in.
    previous = list(range(len(hyp_tokens) + 1))
    for row, ref_token in enumerate(ref_tokens, start=1):
        current = [row]
        for col, hyp_token in enumerate(hyp_tokens, start=1):
            if ref_token == hyp_token:
                cost = previous[col - 1]
            else:
                cost = 1 + min(previous[col], current[col - 1], previous[col - 1])
            current.append(cost)
        previous = current

    return previous[-1] / len(ref_tokens)