from __future__ import annotations

import re
import unicodedata


# Independent (standalone) Devanagari vowel letters -> Latin spelling.
# transliterate_devanagari emits the mapped value whenever one of these
# letters appears in the input.
# The romanization is deliberately loose rather than a formal scheme: short
# and long "i" both map to "i", while long "u" is spelled "oo" and long "a"
# is spelled "aa".
_DEVANAGARI_VOWELS = {
    "अ": "a",
    "आ": "aa",
    "इ": "i",
    "ई": "i",
    "उ": "u",
    "ऊ": "oo",
    "ऋ": "ri",
    "ॠ": "ri",
    "ऌ": "li",
    "ॡ": "li",
    "ए": "e",
    "ऐ": "ai",
    "ओ": "o",
    "औ": "au",
    "ऑ": "o",
    "ऒ": "o",
    "ऍ": "e",
    "ॲ": "a",
}

# Devanagari consonant letters -> Latin spelling WITHOUT the inherent vowel.
# transliterate_devanagari decides separately whether to append "a", a
# dependent vowel sign, or nothing after the consonant.
# Several distinct letters intentionally share one Latin spelling here
# (e.g. both "t" letters, both "sh" letters), so the mapping is lossy.
_DEVANAGARI_CONSONANTS = {
    "क": "k",
    "ख": "kh",
    "ग": "g",
    "घ": "gh",
    "ङ": "ng",
    "च": "ch",
    "छ": "chh",
    "ज": "j",
    "झ": "jh",
    "ञ": "ny",
    "ट": "t",
    "ठ": "th",
    "ड": "d",
    "ढ": "dh",
    "ण": "n",
    "त": "t",
    "थ": "th",
    "द": "d",
    "ध": "dh",
    "न": "n",
    "प": "p",
    "फ": "ph",
    "ब": "b",
    "भ": "bh",
    "म": "m",
    "य": "y",
    "र": "r",
    "ल": "l",
    "व": "v",
    "श": "sh",
    "ष": "sh",
    "स": "s",
    "ह": "h",
    "ळ": "l",
    # Nukta (dotted) variants of the letters above.
    # NOTE(review): each of these can be encoded either as one precomposed
    # code point or as base letter + combining nukta (U+093C); the two
    # encodings are different strings and therefore different dict keys.
    # Confirm which encoding these literals actually use.
    "क़": "q",
    "ख़": "kh",
    "ग़": "g",
    "ज़": "z",
    "फ़": "f",
    "ड़": "d",
    "ढ़": "dh",
    "ऩ": "n",
    "ऱ": "r",
    "य़": "y",
}

# Dependent vowel signs (matras) -> Latin spelling.
# When a matra directly follows a consonant, transliterate_devanagari emits
# consonant + this value instead of the inherent "a"; a matra met on its own
# is emitted as just this value.
# Note the aa-matra is spelled "a" here although the independent letter for
# the same vowel is spelled "aa" in _DEVANAGARI_VOWELS.
_DEVANAGARI_MATRAS = {
    "ा": "a",
    "ि": "i",
    "ी": "i",
    "ु": "u",
    "ू": "oo",
    "ृ": "ri",
    "ॄ": "ri",
    "ॢ": "li",
    "ॣ": "li",
    "े": "e",
    "ै": "ai",
    "ो": "o",
    "ौ": "au",
}

# Other Devanagari signs -> Latin spelling, emitted as-is by
# transliterate_devanagari: anusvara and candrabindu (nasalization) become
# "n", visarga becomes "h", and avagraha is dropped (empty string).
_DEVANAGARI_SIGNS = {
    "ं": "n",
    "ँ": "n",
    "ः": "h",
    "ऽ": "",
}

# (regex pattern, replacement) pairs that fold spelling variants of
# romanized words onto one canonical spelling.
# normalize_text applies every pair, in list order, with re.sub on the
# lower-cased (and optionally transliterated) text, before punctuation is
# stripped. All patterns are anchored with \b on both sides, so they only
# rewrite whole words.
# A few entries map a spelling onto itself (aloo, doodh, tamatar, dhaniya,
# surf); those substitutions are no-ops.
_ASCII_ALIASES = [
    (r"\bpyaj\b", "pyaaz"),
    (r"\bpyaaj\b", "pyaaz"),
    (r"\bpyaaja\b", "pyaaz"),
    (r"\baloo\b", "aloo"),
    (r"\baaloo\b", "aloo"),
    (r"\bdoodh\b", "doodh"),
    (r"\bduudh\b", "doodh"),
    (r"\btamatar\b", "tamatar"),
    (r"\btamaatara\b", "tamatar"),
    (r"\btamaatar\b", "tamatar"),
    (r"\bdhaniya\b", "dhaniya"),
    (r"\bdhania\b", "dhaniya"),
    (r"\bdahee\b", "dahi"),
    (r"\bdehee\b", "dahi"),
    (r"\bph?ridg[e]?\b", "fridge"),
    (r"\bphrij\b", "fridge"),
    (r"\bfrij\b", "fridge"),
    (r"\bmrpi?\b", "mrp"),
    (r"\bskipa?\b", "skip"),
    (r"\brekord\b", "record"),
    (r"\bkuntop\b", "counter"),
    (r"\bcaruntav\b", "counter"),
    (r"\bkaruntav\b", "counter"),
    (r"\bpantr(?:y|ii|i)\b", "pantry"),
    (r"\bbathrum\b", "bathroom"),
    (r"\bbathroo?m\b", "bathroom"),
    (r"\bshel[fv]\b", "shelf"),
    (r"\beks[a-z]*pay[a-z]*ri\b", "expiry"),
    (r"\beks[a-z]*pari\b", "expiry"),
    (r"\beks[a-z]*pari?y\b", "expiry"),
    (r"\bsarph\b", "surf"),
    (r"\bsaraph\b", "surf"),
    (r"\bsurf\b", "surf"),
    (r"\bexc[eiy]l\b", "excel"),
    (r"\beksel\b", "excel"),
    (r"\biksel\b", "excel"),
    (r"\balredi\b", "already"),
    (r"\bolaredi\b", "already"),
    (r"\bcola?gate\b", "colgate"),
    (r"\bbreda\b", "bread"),
    (r"\bbreada\b", "bread"),
    (r"\bchawala\b", "chawal"),
]

# Matches one code point from the Devanagari block (U+0900-U+097F). Used to
# detect whether text needs transliteration and whether a neighbouring
# character is Devanagari.
_DEVANAGARI_RE = re.compile(r"[\u0900-\u097f]")
# Matches a run of one or more whitespace characters (collapsed to a single
# space by normalize_text).
_WHITESPACE_RE = re.compile(r"\s+")
# Matches one character that is neither a regex word character (\w) nor
# whitespace. Combining marks are not \w in Python's re module, so they match
# this pattern as well.
_NON_WORD_RE = re.compile(r"[^\w\s]")


def _nukta_consonant_forms() -> dict:
    """Return ``base consonant + U+093C`` -> Latin for every nukta consonant.

    Keys of ``_DEVANAGARI_CONSONANTS`` are canonically decomposed (NFD) so the
    result is the same whether the table stores a nukta letter as a single
    precomposed code point or as base letter + combining nukta.
    """
    forms = {}
    for key, latin in _DEVANAGARI_CONSONANTS.items():
        decomposed = unicodedata.normalize("NFD", key)
        if len(decomposed) == 2 and decomposed[1] == "\u093c":
            forms[decomposed] = latin
    return forms


def transliterate_devanagari(text: str) -> str:
    """Best-effort Devanagari -> Latin transliteration for benchmark scoring.

    The input is walked one code point at a time:

    * independent vowels, matras and signs are replaced via their tables;
    * a consonant followed by a virama is emitted bare, a consonant followed
      by a matra is emitted with that vowel, and a consonant followed by
      another Devanagari letter or mark gets the inherent ``"a"``;
    * a consonant at the end of a word (followed by nothing, by a
      non-Devanagari character, or by Devanagari punctuation/digits such as
      the danda) is emitted bare;
    * a consonant followed by a combining nukta (U+093C) is looked up as the
      corresponding nukta letter; an unknown combination keeps the base
      consonant's spelling and the nukta is dropped;
    * danda / double danda become ``"."``; everything else passes through.

    Devanagari digits are converted to ASCII digits and the result is
    NFKC-normalized.

    Args:
        text: Text to transliterate; ``None`` or ``""`` yields ``""``.

    Returns:
        The transliterated string.
    """
    nukta = "\u093c"
    virama = "\u094d"
    nukta_forms = _nukta_consonant_forms()

    # NFD splits every precomposed nukta letter into base + nukta so that both
    # encodings of the same letter take the same path below. Characters that
    # merely pass through are unaffected overall, because the final NFKC of an
    # NFD string equals the NFKC of the original string.
    chars = unicodedata.normalize("NFD", text or "")
    length = len(chars)
    out: list[str] = []
    i = 0
    while i < length:
        ch = chars[i]
        i += 1
        if ch in _DEVANAGARI_VOWELS:
            out.append(_DEVANAGARI_VOWELS[ch])
        elif ch in _DEVANAGARI_CONSONANTS:
            base = _DEVANAGARI_CONSONANTS[ch]
            if i < length and chars[i] == nukta:
                # Fold the nukta into the consonant instead of leaking a raw
                # combining mark into the output.
                base = nukta_forms.get(ch + nukta, base)
                i += 1
            nxt = chars[i] if i < length else ""
            if nxt == virama:
                out.append(base)
                i += 1
            elif nxt in _DEVANAGARI_MATRAS:
                out.append(base + _DEVANAGARI_MATRAS[nxt])
                i += 1
            elif (
                nxt
                and _DEVANAGARI_RE.match(nxt)
                and unicodedata.category(nxt)[0] in "LM"
            ):
                # Inherent vowel only when the word continues with another
                # Devanagari letter or mark. Danda and digits are in the same
                # Unicode block but end the word, so they must not add "a".
                out.append(base + "a")
            else:
                out.append(base)
        elif ch in _DEVANAGARI_MATRAS:
            out.append(_DEVANAGARI_MATRAS[ch])
        elif ch in _DEVANAGARI_SIGNS:
            out.append(_DEVANAGARI_SIGNS[ch])
        elif ch == virama or ch == nukta:
            # Stray virama / nukta carries no sound of its own.
            pass
        elif ch == "।" or ch == "॥":
            out.append(".")
        else:
            out.append(ch)

    transliterated = "".join(out)
    transliterated = transliterated.translate(str.maketrans("०१२३४५६७८९", "0123456789"))
    transliterated = unicodedata.normalize("NFKC", transliterated)
    return transliterated


def _space_unless_mark(match: re.Match) -> str:
    """Keep a combining mark unchanged; replace any other match with a space."""
    ch = match.group()
    return ch if unicodedata.category(ch).startswith("M") else " "


def normalize_text(text: str, *, transliterate: bool = False) -> str:
    """Normalize benchmark text for fair WER/slot comparisons.

    Steps, in order: lower-case and trim; optionally transliterate Devanagari
    to Latin (only when the text contains Devanagari); apply every
    ``_ASCII_ALIASES`` substitution; replace punctuation/symbols with spaces;
    collapse whitespace runs to a single space.

    Args:
        text: Raw text; ``None`` or ``""`` yields ``""``.
        transliterate: When true, Devanagari content is transliterated before
            aliasing.

    Returns:
        The normalized, single-space-separated string.
    """
    normalized = (text or "").lower().strip()
    if transliterate and _DEVANAGARI_RE.search(normalized):
        normalized = transliterate_devanagari(normalized)

    for pattern, replacement in _ASCII_ALIASES:
        normalized = re.sub(pattern, replacement, normalized)

    # Python's \w does not cover combining marks (Unicode categories Mn/Mc/Me),
    # so a plain substitution would break words apart at every vowel sign or
    # virama of text that is still in an Indic script. Only non-mark
    # characters are blanked out.
    normalized = _NON_WORD_RE.sub(_space_unless_mark, normalized)
    normalized = _WHITESPACE_RE.sub(" ", normalized).strip()
    return normalized


def compute_wer(reference: str, hypothesis: str, *, transliterate_hypothesis: bool = False) -> float:
    """Return the word error rate of ``hypothesis`` against ``reference``.

    Both strings are passed through ``normalize_text`` and split on spaces;
    only the hypothesis is transliterated, and only when
    ``transliterate_hypothesis`` is true. The result is the word-level edit
    distance (insertions, deletions, substitutions all cost 1) divided by the
    number of reference words.

    An empty reference scores 0.0 against an empty hypothesis and 1.0
    against anything else.
    """
    ref_tokens = normalize_text(reference).split()
    hyp_tokens = normalize_text(hypothesis, transliterate=transliterate_hypothesis).split()

    if not ref_tokens:
        return 1.0 if hyp_tokens else 0.0

    # Row for the empty reference prefix: j insertions reach hypothesis[:j].
    prev_row = list(range(len(hyp_tokens) + 1))
    for row_idx, ref_tok in enumerate(ref_tokens, start=1):
        # Column 0: row_idx deletions reach the empty hypothesis prefix.
        cur_row = [row_idx]
        for col_idx, hyp_tok in enumerate(hyp_tokens, start=1):
            if ref_tok == hyp_tok:
                cost = prev_row[col_idx - 1]
            else:
                cost = 1 + min(
                    prev_row[col_idx],
                    cur_row[col_idx - 1],
                    prev_row[col_idx - 1],
                )
            cur_row.append(cost)
        prev_row = cur_row

    return prev_row[-1] / len(ref_tokens)