Spaces:

pranaysuyash
/

shopstack

Running

App Files Files Community

shopstack / benchmarks /modal /text_normalization.py

pranaysuyash

Sync ShopStack 2026-06-15: corrections panel, empty-state rewrite, market-source suppression

8294cde verified 18 days ago

Raw

History Blame Contribute Delete

5.98 kB

	from __future__ import annotations

	import re
	import unicodedata


	# Independent Devanagari vowel letters -> Latin spelling.
	# The mapping is lossy: several letters share one spelling (both i-vowels
	# map to "i", and three different o-vowels map to "o").
	_DEVANAGARI_VOWELS = {
	    "अ": "a", "आ": "aa",
	    "इ": "i", "ई": "i",
	    "उ": "u", "ऊ": "oo",
	    "ऋ": "ri", "ॠ": "ri",
	    "ऌ": "li", "ॡ": "li",
	    "ए": "e", "ऐ": "ai",
	    "ओ": "o", "औ": "au",
	    "ऑ": "o", "ऒ": "o",
	    "ऍ": "e", "ॲ": "a",
	}

	# Devanagari consonant letters -> Latin spelling of the bare consonant
	# (no inherent vowel is included in the values).
	# The mapping is lossy: distinct letters can share one spelling, e.g. two
	# different letters map to "t" and two to "sh".
	_DEVANAGARI_CONSONANTS = {
	"क": "k",
	"ख": "kh",
	"ग": "g",
	"घ": "gh",
	"ङ": "ng",
	"च": "ch",
	"छ": "chh",
	"ज": "j",
	"झ": "jh",
	"ञ": "ny",
	"ट": "t",
	"ठ": "th",
	"ड": "d",
	"ढ": "dh",
	"ण": "n",
	"त": "t",
	"थ": "th",
	"द": "d",
	"ध": "dh",
	"न": "n",
	"प": "p",
	"फ": "ph",
	"ब": "b",
	"भ": "bh",
	"म": "m",
	"य": "y",
	"र": "r",
	"ल": "l",
	"व": "v",
	"श": "sh",
	"ष": "sh",
	"स": "s",
	"ह": "h",
	"ळ": "l",
	# Nukta-marked letters.
	# NOTE(review): each of these can be encoded either as one precomposed code
	# point or as base letter + U+093C; which form these keys use cannot be
	# told from the rendered glyphs. Confirm against how the table is looked up.
	"क़": "q",
	"ख़": "kh",
	"ग़": "g",
	"ज़": "z",
	"फ़": "f",
	"ड़": "d",
	"ढ़": "dh",
	"ऩ": "n",
	"ऱ": "r",
	"य़": "y",
	}

	# Dependent vowel signs (matras) -> Latin spelling appended to the preceding
	# consonant. Note that the aa-sign maps to a single "a".
	_DEVANAGARI_MATRAS = {
	    "ा": "a",
	    "ि": "i",
	    "ी": "i",
	    "ु": "u",
	    "ू": "oo",
	    "ृ": "ri",
	    "ॄ": "ri",
	    "ॢ": "li",
	    "ॣ": "li",
	    "े": "e",
	    "ै": "ai",
	    "ो": "o",
	    "ौ": "au",
	    # Candra / short vowel signs, common in loanwords. Without these entries
	    # the sign is not recognised as a matra and would be emitted as a raw
	    # combining mark after an unwanted inherent "a".
	    "\u0945": "e",  # candra e sign
	    "\u0946": "e",  # short e sign
	    "\u0949": "o",  # candra o sign
	    "\u094a": "o",  # short o sign
	}

	# Non-vowel marks: anusvara and candrabindu are spelled "n", visarga is
	# spelled "h", and avagraha maps to the empty string (i.e. it is dropped).
	_DEVANAGARI_SIGNS = {"ं": "n", "ँ": "n", "ः": "h", "ऽ": ""}

	# Ordered (regex pattern, canonical spelling) pairs used to fold spelling
	# variants of the same word onto one form. Every pattern is anchored with
	# \b on both sides, so only whole words are rewritten. Entries whose
	# replacement equals the only string they match are no-ops and merely list
	# the canonical spelling.
	_ASCII_ALIASES = [
	    (r"\bpyaj\b", "pyaaz"),
	    (r"\bpyaaj\b", "pyaaz"),
	    (r"\bpyaaja\b", "pyaaz"),
	    (r"\baloo\b", "aloo"),
	    (r"\baaloo\b", "aloo"),
	    (r"\bdoodh\b", "doodh"),
	    (r"\bduudh\b", "doodh"),
	    (r"\btamatar\b", "tamatar"),
	    (r"\btamaatara\b", "tamatar"),
	    (r"\btamaatar\b", "tamatar"),
	    (r"\bdhaniya\b", "dhaniya"),
	    (r"\bdhania\b", "dhaniya"),
	    (r"\bdahee\b", "dahi"),
	    (r"\bdehee\b", "dahi"),
	    (r"\bph?ridg[e]?\b", "fridge"),
	    (r"\bphrij\b", "fridge"),
	    (r"\bfrij\b", "fridge"),
	    (r"\bmrpi?\b", "mrp"),
	    (r"\bskipa?\b", "skip"),
	    (r"\brekord\b", "record"),
	    (r"\bkuntop\b", "counter"),
	    (r"\bcaruntav\b", "counter"),
	    (r"\bkaruntav\b", "counter"),
	    # Plain "|" alternation: an escaped "\|" would match a literal pipe
	    # character and the variants "pantri"/"pantrii" would never be folded.
	    (r"\bpantr(?:y|ii|i)\b", "pantry"),
	    (r"\bbathrum\b", "bathroom"),
	    (r"\bbathroo?m\b", "bathroom"),
	    (r"\bshel[fv]\b", "shelf"),
	    (r"\beks[a-z]pay[a-z]ri\b", "expiry"),
	    (r"\beks[a-z]*pari\b", "expiry"),
	    (r"\beks[a-z]*pari?y\b", "expiry"),
	    (r"\bsarph\b", "surf"),
	    (r"\bsaraph\b", "surf"),
	    (r"\bsurf\b", "surf"),
	    (r"\bexc[eiy]l\b", "excel"),
	    (r"\beksel\b", "excel"),
	    (r"\biksel\b", "excel"),
	    (r"\balredi\b", "already"),
	    (r"\bolaredi\b", "already"),
	    (r"\bcola?gate\b", "colgate"),
	    (r"\bbreda\b", "bread"),
	    (r"\bbreada\b", "bread"),
	    (r"\bchawala\b", "chawal"),
	]

	# Matches a single code point anywhere in the Devanagari block U+0900-U+097F.
	_DEVANAGARI_RE = re.compile(r"[\u0900-\u097f]")
	# Matches one or more consecutive whitespace characters.
	_WHITESPACE_RE = re.compile(r"\s+")
	# Matches characters to be blanked out as punctuation/symbols.
	# `\w` only covers alphanumerics and "_", so combining marks are NOT word
	# characters for `re`. The Devanagari marks (candrabindu/anusvara/visarga,
	# nukta, vowel signs, virama, stress/length marks, vocalic l signs) are
	# therefore whitelisted explicitly; otherwise every vowel sign would be
	# replaced and a Devanagari word would fall apart into bare consonants.
	# Danda, double danda and the abbreviation sign are still matched.
	_NON_WORD_RE = re.compile(
	    r"[^\w\s\u0900-\u0903\u093a-\u094f\u0951-\u0957\u0962\u0963]"
	)


	def transliterate_devanagari(text: str) -> str:
	    """Best-effort Devanagari -> Latin transliteration for benchmark scoring.

	    Vowel letters, vowel signs and other signs are replaced through the
	    module lookup tables. A consonant is emitted together with the vowel sign
	    that follows it, bare when a virama follows, and with an inherent "a"
	    when another Devanagari letter or sign follows. Dandas become ".",
	    Devanagari digits become ASCII digits, characters not covered by any
	    table pass through, and the result is NFKC-normalized.

	    Args:
	        text: String to transliterate; a falsy value is treated as "".

	    Returns:
	        The transliterated string.
	    """
	    virama = "\u094d"
	    nukta = "\u093c"

	    # NFD splits precomposed nukta letters into base letter + nukta, so both
	    # encodings of e.g. "za" take the same path below. The final NFKC pass
	    # recomposes anything else that NFD took apart.
	    chars = unicodedata.normalize("NFD", text or "")

	    # The consonant table may spell its nukta keys in either encoding; fold
	    # the keys to NFD as well so that `base + nukta` lookups always hit.
	    # Only built when the input actually contains a nukta.
	    nukta_consonants: dict[str, str] = {}
	    if nukta in chars:
	        nukta_consonants = {
	            unicodedata.normalize("NFD", key): value
	            for key, value in _DEVANAGARI_CONSONANTS.items()
	        }

	    out: list[str] = []
	    length = len(chars)
	    i = 0
	    while i < length:
	        ch = chars[i]
	        if ch in _DEVANAGARI_VOWELS:
	            out.append(_DEVANAGARI_VOWELS[ch])
	        elif ch in _DEVANAGARI_CONSONANTS:
	            base = _DEVANAGARI_CONSONANTS[ch]
	            if i + 1 < length and chars[i + 1] == nukta:
	                # Prefer the dedicated nukta spelling; fall back to the plain
	                # consonant. The nukta is consumed either way so it never
	                # leaks into the Latin output.
	                base = nukta_consonants.get(ch + nukta, base)
	                i += 1
	            nxt = chars[i + 1] if i + 1 < length else ""
	            if nxt == virama:
	                # Virama suppresses the inherent vowel; consume it.
	                out.append(base)
	                i += 1
	            elif nxt in _DEVANAGARI_MATRAS:
	                # Consonant + explicit vowel sign; consume the sign.
	                out.append(base + _DEVANAGARI_MATRAS[nxt])
	                i += 1
	            elif (
	                nxt
	                and _DEVANAGARI_RE.match(nxt)
	                and not ("\u0964" <= nxt <= "\u0970")
	            ):
	                # Inherent "a" only inside a word. U+0964-U+0970 (dandas,
	                # digits, abbreviation sign) end the word just like a space,
	                # so a word-final consonant stays bare before them.
	                out.append(base + "a")
	            else:
	                out.append(base)
	        elif ch in _DEVANAGARI_MATRAS:
	            out.append(_DEVANAGARI_MATRAS[ch])
	        elif ch in _DEVANAGARI_SIGNS:
	            out.append(_DEVANAGARI_SIGNS[ch])
	        elif ch == virama or ch == nukta:
	            # Stray virama/nukta with no consonant to attach to: drop it.
	            pass
	        elif ch == "।" or ch == "॥":
	            out.append(".")
	        else:
	            out.append(ch)
	        i += 1

	    transliterated = "".join(out)
	    transliterated = transliterated.translate(str.maketrans("०१२३४५६७८९", "0123456789"))
	    transliterated = unicodedata.normalize("NFKC", transliterated)
	    return transliterated


	def normalize_text(text: str, *, transliterate: bool = False) -> str:
	    """Canonicalise benchmark text so WER/slot comparisons are fair.

	    Steps, in order: lowercase and trim; optionally transliterate when the
	    text contains Devanagari; apply every ``_ASCII_ALIASES`` substitution;
	    replace each ``_NON_WORD_RE`` match with a space; collapse whitespace.

	    Args:
	        text: Raw text; a falsy value is treated as "".
	        transliterate: When true and the text contains a Devanagari code
	            point, run it through ``transliterate_devanagari`` first.

	    Returns:
	        The normalized, single-spaced string with no leading/trailing space.
	    """
	    cleaned = (text or "").lower().strip()

	    has_devanagari = transliterate and _DEVANAGARI_RE.search(cleaned) is not None
	    if has_devanagari:
	        cleaned = transliterate_devanagari(cleaned)

	    # Alias order matters: each substitution sees the previous one's output.
	    for alias_pattern, canonical in _ASCII_ALIASES:
	        cleaned = re.sub(alias_pattern, canonical, cleaned)

	    without_punctuation = _NON_WORD_RE.sub(" ", cleaned)
	    return _WHITESPACE_RE.sub(" ", without_punctuation).strip()


	def compute_wer(reference: str, hypothesis: str, *, transliterate_hypothesis: bool = False) -> float:
	    """Return the word error rate of *hypothesis* against *reference*.

	    Both strings are passed through ``normalize_text`` and split on spaces;
	    only the hypothesis is optionally transliterated. The rate is the
	    word-level edit distance (insert/delete/substitute, each cost 1) divided
	    by the number of reference words.

	    Args:
	        reference: Ground-truth text.
	        hypothesis: Text to score.
	        transliterate_hypothesis: Forwarded to ``normalize_text`` as
	            ``transliterate`` for the hypothesis only.

	    Returns:
	        The error rate. With an empty reference the result is 0.0 when the
	        hypothesis is empty too, otherwise 1.0.
	    """
	    ref_tokens = normalize_text(reference).split()
	    hyp_tokens = normalize_text(hypothesis, transliterate=transliterate_hypothesis).split()
	    if not ref_tokens:
	        return 1.0 if hyp_tokens else 0.0

	    # Rolling-row edit distance: `previous` holds the costs for the reference
	    # prefix one word shorter than the row currently being filled.
	    previous = list(range(len(hyp_tokens) + 1))
	    for row, ref_token in enumerate(ref_tokens, start=1):
	        current = [row]
	        for col, hyp_token in enumerate(hyp_tokens, start=1):
	            if ref_token == hyp_token:
	                cost = previous[col - 1]
	            else:
	                cost = 1 + min(previous[col], current[col - 1], previous[col - 1])
	            current.append(cost)
	        previous = current
	    return previous[-1] / len(ref_tokens)