Kalana committed on
Commit
4dbfe95
Β·
1 Parent(s): 06a9c4e

Accuracy improvements: MLM normalization, common word overrides, English detection fix (32/40 = 80%)

Browse files
Files changed (4) hide show
  1. .gitignore +8 -0
  2. app.py +72 -37
  3. english_20k.txt +0 -0
  4. sincode_model.py +582 -29
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Ignore local dev files
2
+ __pycache__/
3
+ .venv/
4
+ dump/
5
+ misc/
6
+ *.pyc
7
+ *.pkl
8
+ !dictionary.pkl
app.py CHANGED
@@ -1,80 +1,115 @@
 
 
 
 
1
  import streamlit as st
2
  import time
3
- from sincode_model import BeamSearchDecoder
4
- from PIL import Image
5
  import base64
 
 
 
 
 
 
 
6
 
7
- st.set_page_config(page_title="ΰ·ƒΰ·’ΰΆ‚Code Prototype", page_icon="πŸ‡±πŸ‡°", layout="centered")
8
- def add_bg_from_local(image_file):
9
  try:
10
  with open(image_file, "rb") as f:
11
- data = f.read()
12
- b64_data = base64.b64encode(data).decode()
13
-
14
  st.markdown(
15
  f"""
16
  <style>
17
  .stApp {{
18
- background-image: linear-gradient(rgba(0,0,0,0.7), rgba(0,0,0,0.7)), url(data:image/png;base64,{b64_data});
 
19
  background-size: cover;
20
  background-position: center;
21
  background-attachment: fixed;
22
  }}
23
  </style>
24
  """,
25
- unsafe_allow_html=True
26
  )
27
  except FileNotFoundError:
28
- pass
 
29
 
30
  @st.cache_resource
31
- def load_system():
32
- decoder = BeamSearchDecoder()
33
- return decoder
 
 
 
 
 
 
 
34
 
35
- background_path = "images/background.png"
36
- add_bg_from_local(background_path)
37
 
38
  with st.sidebar:
39
- logo = Image.open("images/SinCodeLogo.jpg")
40
- st.image(logo, width=200)
41
  st.title("ΰ·ƒΰ·’ΰΆ‚Code Project")
42
  st.info("Prototype")
43
- st.markdown("### πŸ— Architecture")
44
- st.success("""
45
- **Data-Driven Neuro-Symbolic Engine**
46
- XLM-R contextual scoring (40%) + transliteration fidelity (60%) + dictionary rank prior (0%).
47
-
48
- **Adaptive Code-Switching**
49
- Intelligently detects and preserves English contexts.
50
 
51
- **Contextual Disambiguation**
52
- Resolves Singlish ambiguity using sentence-level probability.
53
- """)
 
 
 
 
 
 
 
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  st.markdown("---")
56
  st.write("Β© 2026 Kalana Chandrasekara")
57
 
58
  st.title("ΰ·ƒΰ·’ΰΆ‚Code: Context-Aware Transliteration")
59
- st.markdown("Type Singlish sentences below. The system handles **code-mixing**, **ambiguity**, and **punctuation**.")
 
 
 
60
 
61
- input_text = st.text_area("Input Text", height=100, placeholder="e.g., Singlish sentences type krnna")
 
 
62
 
63
  if st.button("Transliterate", type="primary", use_container_width=True) and input_text:
64
  try:
65
  with st.spinner("Processing..."):
66
- decoder = load_system()
67
- start_time = time.time()
68
- result, trace_logs = decoder.decode(input_text)
69
- end_time = time.time()
70
 
71
  st.success("Transliteration Complete")
72
  st.markdown(f"### {result}")
73
- st.caption(f"Time: {round(end_time - start_time, 2)}s")
74
 
75
- with st.expander("See How It Works (Scoring Breakdown)", expanded=True):
76
- st.write("Below shows the **data-driven scoring** for each word step:")
77
- st.caption("MLM = contextual fit Β· Fid = transliteration fidelity Β· Rank = dictionary prior Β· πŸ”€ = English")
 
 
78
  for log in trace_logs:
79
  st.markdown(log)
80
  st.divider()
 
1
+ """
2
+ SinCode Web UI β€” Streamlit interface for the transliteration engine.
3
+ """
4
+
5
  import streamlit as st
6
  import time
7
+ import os
 
8
  import base64
9
+ from PIL import Image
10
+ from sincode_model import BeamSearchDecoder
11
+
12
+ st.set_page_config(page_title="ΰ·ƒΰ·’ΰΆ‚Code", page_icon="πŸ‡±πŸ‡°", layout="centered")
13
+
14
+
15
+ # ─── Helpers ─────────────────────────────────────────────────────────────────
16
 
17
+ def _set_background(image_file: str) -> None:
18
+ """Inject a dark-overlay background from a local image."""
19
  try:
20
  with open(image_file, "rb") as f:
21
+ b64 = base64.b64encode(f.read()).decode()
 
 
22
  st.markdown(
23
  f"""
24
  <style>
25
  .stApp {{
26
+ background-image: linear-gradient(rgba(0,0,0,0.7), rgba(0,0,0,0.7)),
27
+ url(data:image/png;base64,{b64});
28
  background-size: cover;
29
  background-position: center;
30
  background-attachment: fixed;
31
  }}
32
  </style>
33
  """,
34
+ unsafe_allow_html=True,
35
  )
36
  except FileNotFoundError:
37
+ pass
38
+
39
 
40
  @st.cache_resource
41
+ def _load_decoder() -> BeamSearchDecoder:
42
+ """Load the transliteration engine (cached across reruns)."""
43
+ model_name = os.getenv("SICODE_MODEL_NAME")
44
+ dict_path = os.getenv("SICODE_DICTIONARY_PATH", "dictionary.pkl")
45
+ if model_name:
46
+ return BeamSearchDecoder(model_name=model_name, dictionary_path=dict_path)
47
+ return BeamSearchDecoder(dictionary_path=dict_path)
48
+
49
+
50
+ # ─── Layout ──────────────────────────────────────────────────────────────────
51
 
52
+ _set_background("images/background.png")
 
53
 
54
  with st.sidebar:
55
+ st.image(Image.open("images/SinCodeLogo.jpg"), width=200)
 
56
  st.title("ΰ·ƒΰ·’ΰΆ‚Code Project")
57
  st.info("Prototype")
 
 
 
 
 
 
 
58
 
59
+ st.markdown("### βš™οΈ Settings")
60
+ decode_mode = st.radio(
61
+ "Decode Mode",
62
+ options=["greedy", "beam"],
63
+ index=0,
64
+ help=(
65
+ "**Greedy** β€” More accurate. Uses actual selected outputs as "
66
+ "context for each next word.\n\n"
67
+ "**Beam** β€” Faster. Uses fixed rule-based context for all words."
68
+ ),
69
+ )
70
 
71
+ st.markdown("### πŸ— Architecture")
72
+ st.success(
73
+ "**Hybrid Neuro-Symbolic Engine**\n\n"
74
+ "XLM-R contextual scoring (55%) "
75
+ "+ transliteration fidelity (45%).\n\n"
76
+ "**Common Word Overrides** β€” "
77
+ "Curated table for high-frequency unambiguous words.\n\n"
78
+ "**Adaptive Code-Switching** β€” "
79
+ "Preserves English words in mixed input.\n\n"
80
+ "**Contextual Disambiguation** β€” "
81
+ "Resolves ambiguity via sentence-level probability."
82
+ )
83
  st.markdown("---")
84
  st.write("Β© 2026 Kalana Chandrasekara")
85
 
86
  st.title("ΰ·ƒΰ·’ΰΆ‚Code: Context-Aware Transliteration")
87
+ st.markdown(
88
+ "Type Singlish sentences below. "
89
+ "The system handles **code-mixing**, **ambiguity**, and **punctuation**."
90
+ )
91
 
92
+ input_text = st.text_area(
93
+ "Input Text", height=100, placeholder="e.g., Singlish sentences type krnna"
94
+ )
95
 
96
  if st.button("Transliterate", type="primary", use_container_width=True) and input_text:
97
  try:
98
  with st.spinner("Processing..."):
99
+ decoder = _load_decoder()
100
+ t0 = time.time()
101
+ result, trace_logs = decoder.decode(input_text, mode=decode_mode)
102
+ elapsed = time.time() - t0
103
 
104
  st.success("Transliteration Complete")
105
  st.markdown(f"### {result}")
106
+ st.caption(f"Mode: {decode_mode} Β· Time: {round(elapsed, 2)}s")
107
 
108
+ with st.expander("Scoring Breakdown", expanded=True):
109
+ st.caption(
110
+ "MLM = contextual fit Β· Fid = transliteration fidelity Β· "
111
+ "Rank = dictionary prior Β· πŸ”€ = English"
112
+ )
113
  for log in trace_logs:
114
  st.markdown(log)
115
  st.divider()
english_20k.txt ADDED
The diff for this file is too large to render. See raw diff
 
sincode_model.py CHANGED
@@ -6,9 +6,10 @@ Architecture (Tiered Decoding):
6
  2. Dictionary Lookup – Retrieves Sinhala candidates from 5.9M-word DB
7
  3. Phonetic Rules – Generates fallback transliteration for unknown words
8
  4. Data-Driven Scorer – Ranks ALL candidates using:
9
- a) XLM-R MLM contextual probability (60%)
10
- b) Source-aware fidelity (40%)
11
- 5. Beam Search – Finds the globally optimal word sequence
 
12
 
13
  Author: Kalana Chandrasekara (2026)
14
  """
@@ -34,17 +35,16 @@ DEFAULT_DICTIONARY_PATH = "dictionary.pkl"
34
  ENGLISH_CORPUS_URL = (
35
  "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
36
  )
37
- ENGLISH_CORPUS_CACHE = "english_20k.txt"
38
 
39
  # Scoring weights (tunable hyperparameters)
40
- W_MLM: float = 0.40 # Contextual language model probability
41
- W_FIDELITY: float = 0.60 # Source-aware transliteration fidelity
42
  W_RANK: float = 0.00 # Dictionary rank prior (disabled β€” dict is unordered)
43
 
44
  MAX_CANDIDATES: int = 8 # Max candidates per word position
45
  DEFAULT_BEAM_WIDTH: int = 5 # Beam search width
46
  FIDELITY_SCALE: float = 10.0 # Edit-distance penalty multiplier
47
- DICT_FIDELITY_DAMP: float = 1.5 # Damping factor for dict candidates' fidelity
48
  MIN_ENGLISH_LEN: int = 3 # Min word length for 20k-corpus English detection
49
  SINHALA_VIRAMA: str = '\u0DCA' # Sinhala virama (hal) character
50
  ZWJ: str = '\u200D' # Zero-width joiner (for conjuncts)
@@ -57,10 +57,55 @@ CORE_ENGLISH_WORDS: Set[str] = {
57
  "transliteration", "sincode", "prototype", "assignment", "singlish",
58
  "rest", "complete", "tutorial", "small", "mistakes", "game", "play",
59
  "type", "test", "online", "code", "mixing", "project", "demo", "today",
60
- "tomorrow", "presentation", "slide",
 
 
 
 
 
 
 
61
  }
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # ─── English Vocabulary ─────────────────────────────────────────────────────
65
 
66
  def load_english_vocab() -> Set[str]:
@@ -94,6 +139,122 @@ def load_english_vocab() -> Set[str]:
94
  ENGLISH_VOCAB: Set[str] = load_english_vocab()
95
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  # ─── Rule-Based Transliteration Engine ───────────────────────────────────────
98
  # Phonetic mapping tables (Singlish Romanized β†’ Sinhala Unicode)
99
  # Tables are ordered longest-pattern-first so greedy replacement works.
@@ -103,7 +264,7 @@ CONSONANTS: List[str] = [
103
  "th", "dh", "gh", "ch", "ph", "bh", "jh", "sh",
104
  "GN", "KN", "Lu", "kh", "Th", "Dh",
105
  "S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m",
106
- "\\y", # FIX: was "\\u005C"+"y" (never matched)
107
  "Y", "y", "j", "l", "v", "w", "s", "h",
108
  "N", "L", "K", "G", "P", "B", "f", "g", "r",
109
  ]
@@ -199,6 +360,17 @@ class ScoredCandidate:
199
  is_english: bool = False
200
 
201
 
 
 
 
 
 
 
 
 
 
 
 
202
  class CandidateScorer:
203
  """
204
  Data-driven replacement for the old hardcoded penalty table.
@@ -270,13 +442,11 @@ class CandidateScorer:
270
  """
271
  Source-aware transliteration fidelity.
272
 
273
- The fidelity signal considers *where* a candidate came from:
274
-
275
  - **English matching input** β†’ 0.0 (user-intent preservation).
276
- - **Dictionary candidates** β†’ damped Levenshtein distance to
277
- rule output (50% scale). Dictionary validation proves the
278
- candidate is a real word, reducing penalty, but phonetic
279
- closeness to the typed input is still rewarded.
280
  - **Rule-only outputs not in dictionary** β†’ penalised by
281
  consonant-skeleton density (high virama ratio = malformed).
282
  - **Other** β†’ full Levenshtein distance to rule output.
@@ -285,28 +455,23 @@ class CandidateScorer:
285
  if original_input and candidate.lower() == original_input.lower():
286
  return 0.0
287
 
288
- # 2. Dictionary-validated candidate β†’ damped fidelity
289
- # Uses Levenshtein distance to rule output at reduced scale:
290
- # being in the dictionary validates as a real word, but
291
- # phonetic closeness to what the user typed still matters.
292
  if is_from_dict:
 
293
  if candidate == rule_output:
294
- return 0.0
295
  max_len = max(len(candidate), len(rule_output), 1)
296
  norm_dist = self.levenshtein(candidate, rule_output) / max_len
297
- return -norm_dist * self.fidelity_scale * DICT_FIDELITY_DAMP
298
 
299
  # 3. Rule-only output (not validated by dictionary)
300
  if candidate == rule_output:
301
- # Measure consonant-skeleton density: count bare viramas
302
- # (virama NOT followed by ZWJ, which would form a conjunct).
303
  bare_virama = sum(
304
  1 for i, ch in enumerate(candidate)
305
  if ch == SINHALA_VIRAMA
306
  and (i + 1 >= len(candidate) or candidate[i + 1] != ZWJ)
307
  )
308
  density = bare_virama / max(len(candidate), 1)
309
- # High density β†’ consonant skeleton, not a real word
310
  return -density * self.fidelity_scale * 2
311
 
312
  # 4. English word not matching input β€” uncertain
@@ -370,7 +535,7 @@ class DictionaryAdapter:
370
  def __init__(self, dictionary_dict: Dict[str, List[str]]):
371
  self.dictionary = dictionary_dict
372
 
373
- def get_candidates(self, word: str) -> List[str]:
374
  """
375
  Return candidate transliterations for a Romanized word.
376
 
@@ -378,6 +543,10 @@ class DictionaryAdapter:
378
  1. English corpus match β†’ keep original word
379
  2. Dictionary lookup β†’ exact / lowercase
380
  3. Subword decomposition β†’ only when 1 & 2 yield nothing
 
 
 
 
381
  """
382
  cands: List[str] = []
383
  word_lower = word.lower()
@@ -394,7 +563,16 @@ class DictionaryAdapter:
394
 
395
  # 3. Deduplicate preserving order
396
  if cands:
397
- return list(dict.fromkeys(cands))
 
 
 
 
 
 
 
 
 
398
 
399
  # 4. Subword fallback (compound words)
400
  length = len(word)
@@ -526,17 +704,288 @@ class BeamSearchDecoder:
526
  self,
527
  sentence: str,
528
  beam_width: int = DEFAULT_BEAM_WIDTH,
 
529
  ) -> Tuple[str, List[str]]:
530
  """
531
  Transliterate a full Singlish sentence into Sinhala script.
532
 
 
 
 
 
533
  Returns:
534
  result – the best transliteration string
535
  trace_logs – per-step markdown logs for the debug UI
536
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
  words = sentence.split()
538
  if not words:
539
- return "", []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
 
541
  # ── Phase 1: candidate generation ────────────────────────────
542
  word_infos: List[dict] = []
@@ -555,8 +1004,8 @@ class BeamSearchDecoder:
555
  })
556
  continue
557
 
558
- cands = self.adapter.get_candidates(core)
559
  rule_output = self.adapter.get_rule_output(core)
 
560
 
561
  # Track which candidates are dictionary-validated
562
  dict_entries: Set[str] = set()
@@ -612,14 +1061,89 @@ class BeamSearchDecoder:
612
  # ── Phase 2: beam search with data-driven scoring ────────────
613
  beam: List[Tuple[List[str], float]] = [([], 0.0)]
614
  trace_logs: List[str] = []
 
615
 
616
  for t, info in enumerate(word_infos):
617
  candidates = info["candidates"]
618
  eng_flags = info["english_flags"]
619
  d_flags = info.get("dict_flags", [False] * len(candidates))
620
  rule_out = info["rule_output"]
 
 
621
  total_cands = len(candidates)
622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  # Build left/right context pairs for multi-mask MLM scoring
624
  batch_left: List[str] = []
625
  batch_right: List[str] = []
@@ -639,6 +1163,15 @@ class BeamSearchDecoder:
639
 
640
  mlm_scores = self._batch_mlm_score(batch_left, batch_right, batch_tgt)
641
 
 
 
 
 
 
 
 
 
 
642
  # ── MLM floor for English code-switching ─────────────────
643
  # XLM-R is not calibrated for Singlish code-mixing: English
644
  # tokens in Sinhala context receive disproportionately low
@@ -657,6 +1190,7 @@ class BeamSearchDecoder:
657
 
658
  # ── Score & trace ────────────────────────────────────────
659
  next_beam: List[Tuple[List[str], float]] = []
 
660
  step_log = f"**Step {t + 1}: `{words[t]}`** &nbsp;(rule β†’ `{rule_out}`)\n\n"
661
 
662
  for i, mlm in enumerate(mlm_scores):
@@ -685,6 +1219,7 @@ class BeamSearchDecoder:
685
 
686
  new_total = orig_score + scored.combined_score
687
  next_beam.append((orig_path + [cand], new_total))
 
688
 
689
  # Trace log (skip very low scores to reduce noise)
690
  if mlm > -25.0:
@@ -701,5 +1236,23 @@ class BeamSearchDecoder:
701
 
702
  beam = sorted(next_beam, key=lambda x: x[1], reverse=True)[:beam_width]
703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
  result = " ".join(beam[0][0]) if beam else ""
705
- return result, trace_logs
 
6
  2. Dictionary Lookup – Retrieves Sinhala candidates from 5.9M-word DB
7
  3. Phonetic Rules – Generates fallback transliteration for unknown words
8
  4. Data-Driven Scorer – Ranks ALL candidates using:
9
+ a) XLM-R MLM contextual probability (55%, min-max normalised)
10
+ b) Source-aware fidelity (45%)
11
+ 5. Common Word Override – Bypasses scoring for frequent unambiguous words
12
+ 6. Beam / Greedy Search – Finds the globally optimal word sequence
13
 
14
  Author: Kalana Chandrasekara (2026)
15
  """
 
35
  ENGLISH_CORPUS_URL = (
36
  "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
37
  )
 
38
 
39
  # Scoring weights (tunable hyperparameters)
40
+ W_MLM: float = 0.55 # Contextual language model probability
41
+ W_FIDELITY: float = 0.45 # Source-aware transliteration fidelity
42
  W_RANK: float = 0.00 # Dictionary rank prior (disabled β€” dict is unordered)
43
 
44
  MAX_CANDIDATES: int = 8 # Max candidates per word position
45
  DEFAULT_BEAM_WIDTH: int = 5 # Beam search width
46
  FIDELITY_SCALE: float = 10.0 # Edit-distance penalty multiplier
47
+ DICT_FIDELITY_DAMP: float = 2.0 # Decay rate for dict bonus (higher = stricter filter)
48
  MIN_ENGLISH_LEN: int = 3 # Min word length for 20k-corpus English detection
49
  SINHALA_VIRAMA: str = '\u0DCA' # Sinhala virama (hal) character
50
  ZWJ: str = '\u200D' # Zero-width joiner (for conjuncts)
 
57
  "transliteration", "sincode", "prototype", "assignment", "singlish",
58
  "rest", "complete", "tutorial", "small", "mistakes", "game", "play",
59
  "type", "test", "online", "code", "mixing", "project", "demo", "today",
60
+ "tomorrow", "presentation", "slide", "submit", "feedback", "deploy",
61
+ "merge", "update", "delete", "download", "upload", "install", "server",
62
+ "meeting", "backlog", "comment", "reply", "chat", "selfie", "post",
63
+ "share", "private", "message", "group", "study", "exam", "results",
64
+ "viva", "prepared", "site", "redo", "story", "poll",
65
+ "hall", "exam", "PR", "DM", "page", "app", "bug", "fix",
66
+ "log", "push", "pull", "branch", "build", "run", "save",
67
+ "link", "edit", "file", "open", "close", "live", "view",
68
  }
69
 
70
 
71
+ def _resolve_english_cache_path() -> str:
72
+ """
73
+ Resolve a writable cache path for the English corpus.
74
+
75
+ Hugging Face Spaces may run with constrained write locations, so we prefer:
76
+ 1) explicit env override,
77
+ 2) HF_HOME cache dir,
78
+ 3) local working dir,
79
+ 4) system temp dir.
80
+ """
81
+ override = os.getenv("SICODE_ENGLISH_CACHE")
82
+ if override:
83
+ return override
84
+
85
+ candidates = [
86
+ os.path.join(os.getenv("HF_HOME", ""), "english_20k.txt") if os.getenv("HF_HOME") else "",
87
+ os.path.join(os.getcwd(), "english_20k.txt"),
88
+ os.path.join(os.getenv("TMPDIR", os.getenv("TEMP", "/tmp")), "english_20k.txt"),
89
+ ]
90
+
91
+ for path in candidates:
92
+ if not path:
93
+ continue
94
+ parent = os.path.dirname(path) or "."
95
+ try:
96
+ os.makedirs(parent, exist_ok=True)
97
+ with open(path, "a", encoding="utf-8"):
98
+ pass
99
+ return path
100
+ except OSError:
101
+ continue
102
+
103
+ return "english_20k.txt"
104
+
105
+
106
+ ENGLISH_CORPUS_CACHE = _resolve_english_cache_path()
107
+
108
+
109
  # ─── English Vocabulary ─────────────────────────────────────────────────────
110
 
111
  def load_english_vocab() -> Set[str]:
 
139
  ENGLISH_VOCAB: Set[str] = load_english_vocab()
140
 
141
 
142
+ # ─── Common Word Overrides ──────────────────────────────────────────────────
143
+ # High-frequency Singlish words whose romanisation is ambiguous (long vs.
144
+ # short vowel, retroflex vs. dental, etc.). When a word appears here the
145
+ # decoder uses the override directly, bypassing MLM/fidelity scoring.
146
+ # Only add words that are *unambiguous* β€” i.e. one dominant Sinhala form
147
+ # in colloquial written chat. Context-dependent words (e.g. "eka") should
148
+ # NOT be listed so that MLM can resolve them.
149
+
150
+ COMMON_WORDS: Dict[str, str] = {
151
+ # Pronouns & particles
152
+ "oya": "࢔ࢺා", # you
153
+ "oyaa": "࢔ࢺා",
154
+ "eya": "࢒ࢺා", # he/she
155
+ "eyaa": "࢒ࢺා",
156
+ "api": "ΰΆ…ΰΆ΄ΰ·’", # we
157
+ "mama": "ΰΆΈΰΆΈ", # I
158
+ "mage": "ࢸ࢜ේ", # my
159
+ "oyage": "ΰΆ”ΰΆΊΰ·ΰΆœΰ·š", # your
160
+ # Common verbs (past tense)
161
+ "awa": "ࢆවා", # came
162
+ "aawa": "ࢆවා",
163
+ "giya": "ΰΆœΰ·’ΰΆΊΰ·", # went
164
+ "kala": "ΰΆšΰ·…ΰ·", # did
165
+ "kiwa": "ΰΆšΰ·’ΰ·€ΰ·Šΰ·€ΰ·", # said
166
+ "kiwwa": "ΰΆšΰ·’ΰ·€ΰ·Šΰ·€ΰ·",
167
+ "yewwa": "ΰΆΊΰ·ΰ·€ΰ·Šΰ·€ΰ·", # sent
168
+ "gawa": "ΰΆœΰ·ΰ·€ΰ·Šΰ·€ΰ·", # hit
169
+ "katha": "࢚ࢭා", # talked / story
170
+ # Time
171
+ "heta": "ΰ·„ΰ·™ΰΆ§", # tomorrow
172
+ "ada": "ΰΆ…ΰΆ―", # today
173
+ "iye": "ࢊࢺේ", # yesterday
174
+ # Common adverbs / particles
175
+ "one": "ΰΆ•ΰΆ±ΰ·™", # need/want
176
+ "oney": "ΰΆ•ΰΆ±ΰ·š",
177
+ "naa": "ΰΆ±ΰ·‘", # no (long form)
178
+ "na": "ΰΆ±ΰ·‘", # no
179
+ "hari": "ΰ·„ΰΆ»ΰ·’", # ok / right
180
+ "wage": "ΰ·€ΰΆœΰ·š", # like
181
+ "nisa": "ࢱිසා", # because
182
+ "inne": "ΰΆ‰ΰΆ±ΰ·ŠΰΆ±ΰ·™", # being/staying (colloquial)
183
+ "inna": "ΰΆ‰ΰΆ±ΰ·ŠΰΆ±", # stay (imperative)
184
+ "kalin": "ΰΆšΰΆ½ΰ·’ΰΆ±ΰ·Š", # before / earlier
185
+ # Common verb endings
186
+ "giye": "ΰΆœΰ·’ΰΆΊΰ·š", # went (emphatic)
187
+ "una": "ࢋࢱා", # became / happened
188
+ "wuna": "ࢋࢱා", # became (alt spelling)
189
+ # Locations / misc
190
+ "gedaradi": "ΰΆœΰ·™ΰΆ―ΰΆ»ΰΆ―ΰ·“", # at home
191
+ "gedara": "ΰΆœΰ·™ΰΆ―ΰΆ»", # home
192
+ # Common adjectives / other
193
+ "honda": "ΰ·„ΰ·œΰΆ³", # good
194
+ "ape": "ΰΆ…ΰΆ΄ΰ·š", # our
195
+ "me": "ࢸේ", # this
196
+ "passe": "ΰΆ΄ΰ·ƒΰ·Šΰ·ƒΰ·™", # after / later
197
+ "ba": "ΰΆΆΰ·‘", # can't
198
+ "bari": "ࢢැࢻි", # impossible
199
+ "bri": "ࢢැࢻි", # can't (abbrev)
200
+ "danne": "ΰΆ―ΰΆ±ΰ·ŠΰΆ±ΰ·™", # know
201
+ "wada": "වැࢩ", # work (noun)
202
+ "epa": "࢑ࢴා", # don't
203
+ # Common ad-hoc abbreviations
204
+ "mta": "ΰΆΈΰΆ§", # mata
205
+ "oyta": "࢔ࢺාࢧ", # oyata
206
+ "oyata": "࢔ࢺාࢧ", # to you
207
+ "krnna": "࢚ࢻࢱ්ࢱ", # karanna
208
+ "blnna": "ࢢࢽࢱ්ࢱ", # balanna
209
+ "on": "ΰΆ•ΰΆ±ΰ·™", # one (abbrev)
210
+ # Common -nawa verb endings
211
+ "thiyanawa": "ࢭිࢺෙࢱවා", # is/has
212
+ "wenawa": "වෙࢱවා", # becomes
213
+ "enawa": "࢑ࢱවා", # comes
214
+ "yanawa": "ࢺࢱවා", # goes
215
+ "hithenawa":"හිࢭෙࢱවා", # thinks/feels
216
+ "penenawa": "ΰΆ΄ΰ·šΰΆ±ΰ·€ΰ·", # appears/visible
217
+ "karamu": "ΰΆšΰΆ»ΰΆΈΰ·”", # let's do
218
+ "balamu": "ΰΆΆΰΆ½ΰΆΈΰ·”", # let's see
219
+ "damu": "ࢯාࢸු", # let's put
220
+ "yamu": "ΰΆΊΰΆΈΰ·”", # let's go
221
+ # Short English abbreviations (keys are lowercase for lookup)
222
+ "pr": "PR",
223
+ "dm": "DM",
224
+ "ai": "AI",
225
+ "it": "IT",
226
+ "qa": "QA",
227
+ "ui": "UI",
228
+ "ok": "OK",
229
+ # Common ad-hoc abbreviations (contd.)
230
+ "ek": "ΰΆ‘ΰΆš", # eka (short form)
231
+ "ekta": "ΰΆ‘ΰΆšΰΆ§", # ekata = to that one
232
+ "ekat": "ΰΆ’ΰΆšΰΆ§", # that-thing + to (standalone form)
233
+ "eke": "ΰΆ‘ΰΆšΰ·š", # of that one
234
+ "hta": "ΰ·„ΰ·™ΰΆ§", # heta (abbrev)
235
+ "damma": "ࢯැࢸ්ࢸා", # put/posted
236
+ "gannako": "࢜ࢱ්ࢱ࢚ෝ", # take (imperative, long ō)
237
+ # Additional words for accuracy
238
+ "gena": "࢜ැࢱ", # about
239
+ "mata": "ΰΆΈΰΆ§", # to me
240
+ "laga": "ΰ·…ΰΆŸ", # near
241
+ "poth": "ࢴොࢭ", # book
242
+ "iwara": "ΰΆ‰ΰ·€ΰΆ»", # finished
243
+ "karanna": "࢚ࢻࢱ්ࢱ", # to do
244
+ "hadamu": "ΰ·„ΰΆ―ΰΆΈΰ·”", # let's make
245
+ "kiyawala": "ΰΆšΰ·’ΰΆΊΰ·€ΰΆ½ΰ·", # having read
246
+ "baya": "ΰΆΆΰΆΊ", # fear/scared
247
+ }
248
+
249
+ # Context-dependent words: use this form ONLY when the previous word is
250
+ # NOT English. When "eka" follows an English noun (e.g., "assignment eka")
251
+ # the scorer resolves it to ΰΆ‘ΰΆš naturally; standalone "eka" maps to ΰΆ’ΰΆš.
252
+ CONTEXT_WORDS_STANDALONE: Dict[str, str] = {
253
+ "eka": "ΰΆ’ΰΆš", # that thing (standalone)
254
+ "ekak": "ΰΆ‘ΰΆšΰΆšΰ·Š", # one of (quantifier β€” same either way)
255
+ }
256
+
257
+
258
  # ─── Rule-Based Transliteration Engine ───────────────────────────────────────
259
  # Phonetic mapping tables (Singlish Romanized β†’ Sinhala Unicode)
260
  # Tables are ordered longest-pattern-first so greedy replacement works.
 
264
  "th", "dh", "gh", "ch", "ph", "bh", "jh", "sh",
265
  "GN", "KN", "Lu", "kh", "Th", "Dh",
266
  "S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m",
267
+ "\\y",
268
  "Y", "y", "j", "l", "v", "w", "s", "h",
269
  "N", "L", "K", "G", "P", "B", "f", "g", "r",
270
  ]
 
360
  is_english: bool = False
361
 
362
 
363
@dataclass
class WordDiagnostic:
    """Structured per-word diagnostics for evaluation and error analysis."""
    # 0-based position of the word within the input sentence.
    step_index: int
    # The raw Romanized (Singlish) token as typed by the user.
    input_word: str
    # Output of the rule-based phonetic transliterator for this word
    # (includes any punctuation prefix/suffix carried over from the input).
    rule_output: str
    # Candidate ultimately emitted by the decoder for this position.
    selected_candidate: str
    # Decoder score of the selected candidate; 0.0 when the word was
    # resolved by an override table and scoring was bypassed.
    beam_score: float
    # Scoring breakdown of every candidate considered; empty for overrides.
    candidate_breakdown: List[ScoredCandidate]
372
+
373
+
374
  class CandidateScorer:
375
  """
376
  Data-driven replacement for the old hardcoded penalty table.
 
442
  """
443
  Source-aware transliteration fidelity.
444
 
 
 
445
  - **English matching input** β†’ 0.0 (user-intent preservation).
446
+ - **Dict + matches rule output** β†’ strong bonus (+2.0). Both
447
+ signals agree β€” highest confidence.
448
+ - **Dict only** β†’ decaying bonus (1.0 down to 0.0 with distance
449
+ from rule output). Still a real word, but less certain.
450
  - **Rule-only outputs not in dictionary** β†’ penalised by
451
  consonant-skeleton density (high virama ratio = malformed).
452
  - **Other** β†’ full Levenshtein distance to rule output.
 
455
  if original_input and candidate.lower() == original_input.lower():
456
  return 0.0
457
 
458
+ # 2. Dictionary-validated candidates
 
 
 
459
  if is_from_dict:
460
+ # Rule output confirmed by dictionary = highest confidence
461
  if candidate == rule_output:
462
+ return 2.0
463
  max_len = max(len(candidate), len(rule_output), 1)
464
  norm_dist = self.levenshtein(candidate, rule_output) / max_len
465
+ return max(0.0, 1.0 - norm_dist * DICT_FIDELITY_DAMP)
466
 
467
  # 3. Rule-only output (not validated by dictionary)
468
  if candidate == rule_output:
 
 
469
  bare_virama = sum(
470
  1 for i, ch in enumerate(candidate)
471
  if ch == SINHALA_VIRAMA
472
  and (i + 1 >= len(candidate) or candidate[i + 1] != ZWJ)
473
  )
474
  density = bare_virama / max(len(candidate), 1)
 
475
  return -density * self.fidelity_scale * 2
476
 
477
  # 4. English word not matching input β€” uncertain
 
535
  def __init__(self, dictionary_dict: Dict[str, List[str]]):
536
  self.dictionary = dictionary_dict
537
 
538
+ def get_candidates(self, word: str, rule_output: str = "") -> List[str]:
539
  """
540
  Return candidate transliterations for a Romanized word.
541
 
 
543
  1. English corpus match β†’ keep original word
544
  2. Dictionary lookup β†’ exact / lowercase
545
  3. Subword decomposition β†’ only when 1 & 2 yield nothing
546
+
547
+ When more candidates exist than MAX_CANDIDATES, results are
548
+ sorted by Levenshtein distance to ``rule_output`` so the most
549
+ phonetically plausible entries survive the cut.
550
  """
551
  cands: List[str] = []
552
  word_lower = word.lower()
 
563
 
564
  # 3. Deduplicate preserving order
565
  if cands:
566
+ cands = list(dict.fromkeys(cands))
567
+ # Sort Sinhala candidates by closeness to rule output
568
+ if rule_output and len(cands) > MAX_CANDIDATES:
569
+ english = [c for c in cands if c.lower() in ENGLISH_VOCAB]
570
+ sinhala = [c for c in cands if c.lower() not in ENGLISH_VOCAB]
571
+ sinhala.sort(
572
+ key=lambda c: CandidateScorer.levenshtein(c, rule_output)
573
+ )
574
+ cands = english + sinhala
575
+ return cands
576
 
577
  # 4. Subword fallback (compound words)
578
  length = len(word)
 
704
  self,
705
  sentence: str,
706
  beam_width: int = DEFAULT_BEAM_WIDTH,
707
+ mode: str = "greedy",
708
  ) -> Tuple[str, List[str]]:
709
  """
710
  Transliterate a full Singlish sentence into Sinhala script.
711
 
712
+ Args:
713
+ mode: "greedy" (accurate, uses dynamic context) or
714
+ "beam" (faster, uses fixed rule-based context)
715
+
716
  Returns:
717
  result – the best transliteration string
718
  trace_logs – per-step markdown logs for the debug UI
719
  """
720
+ if mode == "greedy":
721
+ result, trace_logs, _ = self.greedy_decode_with_diagnostics(sentence)
722
+ else:
723
+ result, trace_logs, _ = self.decode_with_diagnostics(
724
+ sentence=sentence,
725
+ beam_width=beam_width,
726
+ )
727
+ return result, trace_logs
728
+
729
+ # ── Greedy decode (dynamic context β€” more accurate) ──────────────
730
+
731
+ def greedy_decode_with_diagnostics(
732
+ self,
733
+ sentence: str,
734
+ ) -> Tuple[str, List[str], List[WordDiagnostic]]:
735
+ """
736
+ Greedy word-by-word decode using actual selected outputs as
737
+ left context for subsequent MLM scoring.
738
+
739
+ More accurate than beam search with fixed context because XLM-R
740
+ sees the real transliteration built so far, not rule-based guesses.
741
+ """
742
  words = sentence.split()
743
  if not words:
744
+ return "", [], []
745
+
746
+ # ── Phase 1: candidate generation (same as beam) ─────────────
747
+ word_infos: List[dict] = []
748
+
749
+ for raw in words:
750
+ match = PUNCT_PATTERN.match(raw)
751
+ prefix, core, suffix = match.groups() if match else ("", raw, "")
752
+
753
+ if not core:
754
+ word_infos.append({
755
+ "candidates": [raw],
756
+ "rule_output": raw,
757
+ "english_flags": [False],
758
+ "dict_flags": [False],
759
+ "prefix": prefix,
760
+ "suffix": suffix,
761
+ })
762
+ continue
763
+
764
+ rule_output = self.adapter.get_rule_output(core)
765
+ cands = self.adapter.get_candidates(core, rule_output)
766
+
767
+ dict_entries: Set[str] = set()
768
+ if core in self.adapter.dictionary:
769
+ dict_entries.update(self.adapter.dictionary[core])
770
+ elif core.lower() in self.adapter.dictionary:
771
+ dict_entries.update(self.adapter.dictionary[core.lower()])
772
+
773
+ if rule_output and rule_output not in cands:
774
+ cands.append(rule_output)
775
+ if not cands:
776
+ cands = [rule_output]
777
+
778
+ english_flags = [c.lower() in ENGLISH_VOCAB for c in cands]
779
+ dict_flags = [c in dict_entries for c in cands]
780
+
781
+ full_cands = [prefix + c + suffix for c in cands]
782
+
783
+ word_infos.append({
784
+ "candidates": full_cands[:MAX_CANDIDATES],
785
+ "rule_output": prefix + rule_output + suffix,
786
+ "english_flags": english_flags[:MAX_CANDIDATES],
787
+ "dict_flags": dict_flags[:MAX_CANDIDATES],
788
+ "prefix": prefix,
789
+ "suffix": suffix,
790
+ })
791
+
792
+ # Build right-side stable context (rule outputs for future words)
793
+ stable_right: List[str] = []
794
+ for info in word_infos:
795
+ eng_cands = [
796
+ c for c, e in zip(info["candidates"], info["english_flags"]) if e
797
+ ]
798
+ stable_right.append(
799
+ eng_cands[0] if eng_cands else info["rule_output"]
800
+ )
801
+
802
+ # ── Phase 2: greedy word-by-word with dynamic left context ───
803
+ selected_words: List[str] = []
804
+ trace_logs: List[str] = []
805
+ diagnostics: List[WordDiagnostic] = []
806
+
807
+ for t, info in enumerate(word_infos):
808
+ candidates = info["candidates"]
809
+ eng_flags = info["english_flags"]
810
+ d_flags = info.get("dict_flags", [False] * len(candidates))
811
+ rule_out = info["rule_output"]
812
+ prefix = info.get("prefix", "")
813
+ suffix = info.get("suffix", "")
814
+ total_cands = len(candidates)
815
+
816
+ # ── Common-word shortcut ─────────────────────────────────
817
+ core_lower = words[t].lower().strip()
818
+ if core_lower in COMMON_WORDS:
819
+ override = prefix + COMMON_WORDS[core_lower] + suffix
820
+ selected_words.append(override)
821
+ trace_logs.append(
822
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
823
+ f"`{override}` (common-word override)\n"
824
+ )
825
+ diagnostics.append(WordDiagnostic(
826
+ step_index=t,
827
+ input_word=words[t],
828
+ rule_output=rule_out,
829
+ selected_candidate=override,
830
+ beam_score=0.0,
831
+ candidate_breakdown=[],
832
+ ))
833
+ continue
834
+
835
+ # ── Context-dependent standalone overrides ─────────────���──
836
+ # Words like "eka" that change form depending on whether the
837
+ # previous word was English (e.g., "assignment eka" β†’ ΰΆ‘ΰΆš)
838
+ # or Sinhala / start of sentence ("eka heta" β†’ ΰΆ’ΰΆš).
839
+ if core_lower in CONTEXT_WORDS_STANDALONE:
840
+ prev_word_lower = words[t - 1].lower() if t > 0 else ""
841
+ prev_common_val = COMMON_WORDS.get(prev_word_lower, "")
842
+ prev_is_english = (
843
+ t > 0
844
+ and (
845
+ prev_word_lower in ENGLISH_VOCAB
846
+ or prev_common_val.isascii() and prev_common_val != ""
847
+ )
848
+ )
849
+ if not prev_is_english:
850
+ override = prefix + CONTEXT_WORDS_STANDALONE[core_lower] + suffix
851
+ selected_words.append(override)
852
+ trace_logs.append(
853
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
854
+ f"`{override}` (standalone override)\n"
855
+ )
856
+ diagnostics.append(WordDiagnostic(
857
+ step_index=t,
858
+ input_word=words[t],
859
+ rule_output=rule_out,
860
+ selected_candidate=override,
861
+ beam_score=0.0,
862
+ candidate_breakdown=[],
863
+ ))
864
+ continue
865
+
866
+ # ── English-word shortcut ────────────────────────────────
867
+ if (
868
+ len(core_lower) >= MIN_ENGLISH_LEN
869
+ and core_lower in ENGLISH_VOCAB
870
+ ):
871
+ selected_words.append(words[t])
872
+ trace_logs.append(
873
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
874
+ f"`{words[t]}` (English preserved)\n"
875
+ )
876
+ diagnostics.append(WordDiagnostic(
877
+ step_index=t,
878
+ input_word=words[t],
879
+ rule_output=rule_out,
880
+ selected_candidate=words[t],
881
+ beam_score=0.0,
882
+ candidate_breakdown=[],
883
+ ))
884
+ continue
885
+
886
+ # Dynamic left context = actual selected outputs so far
887
+ left_ctx = " ".join(selected_words) if selected_words else ""
888
+ # Right context = rule-based stable context for future words
889
+ right_ctx = " ".join(stable_right[t + 1:]) if t + 1 < len(words) else ""
890
+
891
+ # Score all candidates for this position in one batch
892
+ batch_left = [left_ctx] * total_cands
893
+ batch_right = [right_ctx] * total_cands
894
+
895
+ mlm_scores = self._batch_mlm_score(batch_left, batch_right, candidates)
896
+
897
+ # ── Min-max normalise MLM to [0, 1] ─────────────────────
898
+ # Raw log-probs span a wide range (e.g. βˆ’5 to βˆ’25) and can
899
+ # drown out fidelity. Per-position normalisation makes the
900
+ # two signals weight-comparable.
901
+ mlm_min = min(mlm_scores)
902
+ mlm_max = max(mlm_scores)
903
+ mlm_range = mlm_max - mlm_min
904
+ if mlm_range > 1e-9:
905
+ mlm_scores = [(m - mlm_min) / mlm_range for m in mlm_scores]
906
+ else:
907
+ mlm_scores = [1.0] * len(mlm_scores)
908
+
909
+ # MLM floor for English code-switching
910
+ best_nonenglish_mlm = -1e9
911
+ for i, mlm in enumerate(mlm_scores):
912
+ is_eng = eng_flags[i] if i < len(eng_flags) else False
913
+ if not is_eng and mlm > best_nonenglish_mlm:
914
+ best_nonenglish_mlm = mlm
915
+
916
+ # Score & select best candidate
917
+ step_log = f"**Step {t + 1}: `{words[t]}`** &nbsp;(rule β†’ `{rule_out}`)\n\n"
918
+ best_scored: Optional[ScoredCandidate] = None
919
+ candidate_breakdown: List[ScoredCandidate] = []
920
+
921
+ for i, mlm in enumerate(mlm_scores):
922
+ cand = candidates[i]
923
+ is_eng = eng_flags[i] if i < len(eng_flags) else False
924
+ is_dict = d_flags[i] if i < len(d_flags) else False
925
+
926
+ effective_mlm = mlm
927
+ if is_eng and cand.lower() == words[t].lower():
928
+ effective_mlm = max(mlm, best_nonenglish_mlm)
929
+
930
+ scored = self.scorer.score(
931
+ mlm_score=effective_mlm,
932
+ candidate=cand,
933
+ rule_output=rule_out,
934
+ rank=i,
935
+ total_candidates=total_cands,
936
+ is_english=is_eng,
937
+ original_input=words[t],
938
+ is_from_dict=is_dict,
939
+ )
940
+ candidate_breakdown.append(scored)
941
+
942
+ if best_scored is None or scored.combined_score > best_scored.combined_score:
943
+ best_scored = scored
944
+
945
+ if mlm > -25.0:
946
+ eng_tag = " πŸ”€" if is_eng else ""
947
+ step_log += (
948
+ f"- `{cand}`{eng_tag} &nbsp; "
949
+ f"MLM={scored.mlm_score:.2f} &nbsp; "
950
+ f"Fid={scored.fidelity_score:.2f} &nbsp; "
951
+ f"Rank={scored.rank_score:.2f} β†’ "
952
+ f"**{scored.combined_score:.2f}**\n"
953
+ )
954
+
955
+ trace_logs.append(step_log)
956
+
957
+ selected = best_scored.text if best_scored else rule_out
958
+ selected_words.append(selected)
959
+
960
+ candidate_breakdown.sort(key=lambda s: s.combined_score, reverse=True)
961
+ diagnostics.append(WordDiagnostic(
962
+ step_index=t,
963
+ input_word=words[t],
964
+ rule_output=rule_out,
965
+ selected_candidate=selected,
966
+ beam_score=best_scored.combined_score if best_scored else 0.0,
967
+ candidate_breakdown=candidate_breakdown,
968
+ ))
969
+
970
+ result = " ".join(selected_words)
971
+ return result, trace_logs, diagnostics
972
+
973
+ def decode_with_diagnostics(
974
+ self,
975
+ sentence: str,
976
+ beam_width: int = DEFAULT_BEAM_WIDTH,
977
+ ) -> Tuple[str, List[str], List[WordDiagnostic]]:
978
+ """
979
+ Decode sentence and return detailed per-word diagnostics.
980
+
981
+ Returns:
982
+ result – best transliterated sentence
983
+ trace_logs – markdown logs used by Streamlit UI
984
+ diagnostics – structured scores and selected candidates per step
985
+ """
986
+ words = sentence.split()
987
+ if not words:
988
+ return "", [], []
989
 
990
  # ── Phase 1: candidate generation ────────────────────────────
991
  word_infos: List[dict] = []
 
1004
  })
1005
  continue
1006
 
 
1007
  rule_output = self.adapter.get_rule_output(core)
1008
+ cands = self.adapter.get_candidates(core, rule_output)
1009
 
1010
  # Track which candidates are dictionary-validated
1011
  dict_entries: Set[str] = set()
 
1061
  # ── Phase 2: beam search with data-driven scoring ────────────
1062
  beam: List[Tuple[List[str], float]] = [([], 0.0)]
1063
  trace_logs: List[str] = []
1064
+ diagnostics: List[WordDiagnostic] = []
1065
 
1066
  for t, info in enumerate(word_infos):
1067
  candidates = info["candidates"]
1068
  eng_flags = info["english_flags"]
1069
  d_flags = info.get("dict_flags", [False] * len(candidates))
1070
  rule_out = info["rule_output"]
1071
+ prefix = info.get("prefix", "")
1072
+ suffix = info.get("suffix", "")
1073
  total_cands = len(candidates)
1074
 
1075
+ # ── Common-word shortcut ─────────────────────────────────
1076
+ core_lower = words[t].lower().strip()
1077
+ if core_lower in COMMON_WORDS:
1078
+ override = prefix + COMMON_WORDS[core_lower] + suffix
1079
+ # Extend every beam path with the override
1080
+ next_beam_cw = [(path + [override], sc) for path, sc in beam]
1081
+ beam = next_beam_cw[:beam_width]
1082
+ trace_logs.append(
1083
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
1084
+ f"`{override}` (common-word override)\n"
1085
+ )
1086
+ diagnostics.append(WordDiagnostic(
1087
+ step_index=t,
1088
+ input_word=words[t],
1089
+ rule_output=rule_out,
1090
+ selected_candidate=override,
1091
+ beam_score=beam[0][1] if beam else 0.0,
1092
+ candidate_breakdown=[],
1093
+ ))
1094
+ continue
1095
+
1096
+ # ── Context-dependent standalone overrides ────────────────
1097
+ if core_lower in CONTEXT_WORDS_STANDALONE:
1098
+ prev_word_lower = words[t - 1].lower() if t > 0 else ""
1099
+ prev_common_val = COMMON_WORDS.get(prev_word_lower, "")
1100
+ prev_is_english = (
1101
+ t > 0
1102
+ and (
1103
+ prev_word_lower in ENGLISH_VOCAB
1104
+ or prev_common_val.isascii() and prev_common_val != ""
1105
+ )
1106
+ )
1107
+ if not prev_is_english:
1108
+ override = prefix + CONTEXT_WORDS_STANDALONE[core_lower] + suffix
1109
+ next_beam_ctx = [(path + [override], sc) for path, sc in beam]
1110
+ beam = next_beam_ctx[:beam_width]
1111
+ trace_logs.append(
1112
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
1113
+ f"`{override}` (standalone override)\n"
1114
+ )
1115
+ diagnostics.append(WordDiagnostic(
1116
+ step_index=t,
1117
+ input_word=words[t],
1118
+ rule_output=rule_out,
1119
+ selected_candidate=override,
1120
+ beam_score=beam[0][1] if beam else 0.0,
1121
+ candidate_breakdown=[],
1122
+ ))
1123
+ continue
1124
+
1125
+ # ── English-word shortcut ────────────────────────────────
1126
+ if (
1127
+ len(core_lower) >= MIN_ENGLISH_LEN
1128
+ and core_lower in ENGLISH_VOCAB
1129
+ ):
1130
+ eng_word = words[t]
1131
+ next_beam_eng = [(path + [eng_word], sc) for path, sc in beam]
1132
+ beam = next_beam_eng[:beam_width]
1133
+ trace_logs.append(
1134
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
1135
+ f"`{eng_word}` (English preserved)\n"
1136
+ )
1137
+ diagnostics.append(WordDiagnostic(
1138
+ step_index=t,
1139
+ input_word=words[t],
1140
+ rule_output=rule_out,
1141
+ selected_candidate=eng_word,
1142
+ beam_score=beam[0][1] if beam else 0.0,
1143
+ candidate_breakdown=[],
1144
+ ))
1145
+ continue
1146
+
1147
  # Build left/right context pairs for multi-mask MLM scoring
1148
  batch_left: List[str] = []
1149
  batch_right: List[str] = []
 
1163
 
1164
  mlm_scores = self._batch_mlm_score(batch_left, batch_right, batch_tgt)
1165
 
1166
+ # ── Min-max normalise MLM to [0, 1] ─────────────────────
1167
+ mlm_min = min(mlm_scores) if mlm_scores else 0
1168
+ mlm_max = max(mlm_scores) if mlm_scores else 0
1169
+ mlm_range = mlm_max - mlm_min
1170
+ if mlm_range > 1e-9:
1171
+ mlm_scores = [(m - mlm_min) / mlm_range for m in mlm_scores]
1172
+ else:
1173
+ mlm_scores = [1.0] * len(mlm_scores)
1174
+
1175
  # ── MLM floor for English code-switching ─────────────────
1176
  # XLM-R is not calibrated for Singlish code-mixing: English
1177
  # tokens in Sinhala context receive disproportionately low
 
1190
 
1191
  # ── Score & trace ────────────────────────────────────────
1192
  next_beam: List[Tuple[List[str], float]] = []
1193
+ all_step_scores: List[Tuple[int, ScoredCandidate, float]] = []
1194
  step_log = f"**Step {t + 1}: `{words[t]}`** &nbsp;(rule β†’ `{rule_out}`)\n\n"
1195
 
1196
  for i, mlm in enumerate(mlm_scores):
 
1219
 
1220
  new_total = orig_score + scored.combined_score
1221
  next_beam.append((orig_path + [cand], new_total))
1222
+ all_step_scores.append((p_idx, scored, new_total))
1223
 
1224
  # Trace log (skip very low scores to reduce noise)
1225
  if mlm > -25.0:
 
1236
 
1237
  beam = sorted(next_beam, key=lambda x: x[1], reverse=True)[:beam_width]
1238
 
1239
+ # Capture diagnostics from the root beam path (p_idx=0) so each
1240
+ # step has a stable and comparable candidate distribution.
1241
+ root_scores = [item for item in all_step_scores if item[0] == 0]
1242
+ root_scores_sorted = sorted(root_scores, key=lambda x: x[2], reverse=True)
1243
+
1244
+ selected = beam[0][0][t] if beam and beam[0][0] else ""
1245
+ selected_total = beam[0][1] if beam else float("-inf")
1246
+ candidate_breakdown = [item[1] for item in root_scores_sorted]
1247
+
1248
+ diagnostics.append(WordDiagnostic(
1249
+ step_index=t,
1250
+ input_word=words[t],
1251
+ rule_output=rule_out,
1252
+ selected_candidate=selected,
1253
+ beam_score=selected_total,
1254
+ candidate_breakdown=candidate_breakdown,
1255
+ ))
1256
+
1257
  result = " ".join(beam[0][0]) if beam else ""
1258
+ return result, trace_logs, diagnostics