Kalana committed on
Commit
4dbfe95
Β·
1 Parent(s): 06a9c4e

Accuracy improvements: MLM normalization, common word overrides, English detection fix (32/40 = 80%)

Browse files
Files changed (4) hide show
  1. .gitignore +8 -0
  2. app.py +72 -37
  3. english_20k.txt +0 -0
  4. sincode_model.py +582 -29
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Ignore local dev files
2
+ __pycache__/
3
+ .venv/
4
+ dump/
5
+ misc/
6
+ *.pyc
7
+ *.pkl
8
+ !dictionary.pkl
app.py CHANGED
@@ -1,80 +1,115 @@
 
 
 
 
1
  import streamlit as st
2
  import time
3
- from sincode_model import BeamSearchDecoder
4
- from PIL import Image
5
  import base64
 
 
 
 
 
 
 
6
 
7
- st.set_page_config(page_title="ΰ·ƒΰ·’ΰΆ‚Code Prototype", page_icon="πŸ‡±πŸ‡°", layout="centered")
8
- def add_bg_from_local(image_file):
9
  try:
10
  with open(image_file, "rb") as f:
11
- data = f.read()
12
- b64_data = base64.b64encode(data).decode()
13
-
14
  st.markdown(
15
  f"""
16
  <style>
17
  .stApp {{
18
- background-image: linear-gradient(rgba(0,0,0,0.7), rgba(0,0,0,0.7)), url(data:image/png;base64,{b64_data});
 
19
  background-size: cover;
20
  background-position: center;
21
  background-attachment: fixed;
22
  }}
23
  </style>
24
  """,
25
- unsafe_allow_html=True
26
  )
27
  except FileNotFoundError:
28
- pass
 
29
 
30
  @st.cache_resource
31
- def load_system():
32
- decoder = BeamSearchDecoder()
33
- return decoder
 
 
 
 
 
 
 
34
 
35
- background_path = "images/background.png"
36
- add_bg_from_local(background_path)
37
 
38
  with st.sidebar:
39
- logo = Image.open("images/SinCodeLogo.jpg")
40
- st.image(logo, width=200)
41
  st.title("ΰ·ƒΰ·’ΰΆ‚Code Project")
42
  st.info("Prototype")
43
- st.markdown("### πŸ— Architecture")
44
- st.success("""
45
- **Data-Driven Neuro-Symbolic Engine**
46
- XLM-R contextual scoring (40%) + transliteration fidelity (60%) + dictionary rank prior (0%).
47
-
48
- **Adaptive Code-Switching**
49
- Intelligently detects and preserves English contexts.
50
 
51
- **Contextual Disambiguation**
52
- Resolves Singlish ambiguity using sentence-level probability.
53
- """)
 
 
 
 
 
 
 
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  st.markdown("---")
56
  st.write("Β© 2026 Kalana Chandrasekara")
57
 
58
  st.title("ΰ·ƒΰ·’ΰΆ‚Code: Context-Aware Transliteration")
59
- st.markdown("Type Singlish sentences below. The system handles **code-mixing**, **ambiguity**, and **punctuation**.")
 
 
 
60
 
61
- input_text = st.text_area("Input Text", height=100, placeholder="e.g., Singlish sentences type krnna")
 
 
62
 
63
  if st.button("Transliterate", type="primary", use_container_width=True) and input_text:
64
  try:
65
  with st.spinner("Processing..."):
66
- decoder = load_system()
67
- start_time = time.time()
68
- result, trace_logs = decoder.decode(input_text)
69
- end_time = time.time()
70
 
71
  st.success("Transliteration Complete")
72
  st.markdown(f"### {result}")
73
- st.caption(f"Time: {round(end_time - start_time, 2)}s")
74
 
75
- with st.expander("See How It Works (Scoring Breakdown)", expanded=True):
76
- st.write("Below shows the **data-driven scoring** for each word step:")
77
- st.caption("MLM = contextual fit Β· Fid = transliteration fidelity Β· Rank = dictionary prior Β· πŸ”€ = English")
 
 
78
  for log in trace_logs:
79
  st.markdown(log)
80
  st.divider()
 
1
+ """
2
+ SinCode Web UI β€” Streamlit interface for the transliteration engine.
3
+ """
4
+
5
  import streamlit as st
6
  import time
7
+ import os
 
8
  import base64
9
+ from PIL import Image
10
+ from sincode_model import BeamSearchDecoder
11
+
12
+ st.set_page_config(page_title="ΰ·ƒΰ·’ΰΆ‚Code", page_icon="πŸ‡±πŸ‡°", layout="centered")
13
+
14
+
15
+ # ─── Helpers ─────────────────────────────────────────────────────────────────
16
 
17
+ def _set_background(image_file: str) -> None:
18
+ """Inject a dark-overlay background from a local image."""
19
  try:
20
  with open(image_file, "rb") as f:
21
+ b64 = base64.b64encode(f.read()).decode()
 
 
22
  st.markdown(
23
  f"""
24
  <style>
25
  .stApp {{
26
+ background-image: linear-gradient(rgba(0,0,0,0.7), rgba(0,0,0,0.7)),
27
+ url(data:image/png;base64,{b64});
28
  background-size: cover;
29
  background-position: center;
30
  background-attachment: fixed;
31
  }}
32
  </style>
33
  """,
34
+ unsafe_allow_html=True,
35
  )
36
  except FileNotFoundError:
37
+ pass
38
+
39
 
40
  @st.cache_resource
41
+ def _load_decoder() -> BeamSearchDecoder:
42
+ """Load the transliteration engine (cached across reruns)."""
43
+ model_name = os.getenv("SICODE_MODEL_NAME")
44
+ dict_path = os.getenv("SICODE_DICTIONARY_PATH", "dictionary.pkl")
45
+ if model_name:
46
+ return BeamSearchDecoder(model_name=model_name, dictionary_path=dict_path)
47
+ return BeamSearchDecoder(dictionary_path=dict_path)
48
+
49
+
50
+ # ─── Layout ──────────────────────────────────────────────────────────────────
51
 
52
+ _set_background("images/background.png")
 
53
 
54
  with st.sidebar:
55
+ st.image(Image.open("images/SinCodeLogo.jpg"), width=200)
 
56
  st.title("ΰ·ƒΰ·’ΰΆ‚Code Project")
57
  st.info("Prototype")
 
 
 
 
 
 
 
58
 
59
+ st.markdown("### βš™οΈ Settings")
60
+ decode_mode = st.radio(
61
+ "Decode Mode",
62
+ options=["greedy", "beam"],
63
+ index=0,
64
+ help=(
65
+ "**Greedy** β€” More accurate. Uses actual selected outputs as "
66
+ "context for each next word.\n\n"
67
+ "**Beam** β€” Faster. Uses fixed rule-based context for all words."
68
+ ),
69
+ )
70
 
71
+ st.markdown("### πŸ— Architecture")
72
+ st.success(
73
+ "**Hybrid Neuro-Symbolic Engine**\n\n"
74
+ "XLM-R contextual scoring (55%) "
75
+ "+ transliteration fidelity (45%).\n\n"
76
+ "**Common Word Overrides** β€” "
77
+ "Curated table for high-frequency unambiguous words.\n\n"
78
+ "**Adaptive Code-Switching** β€” "
79
+ "Preserves English words in mixed input.\n\n"
80
+ "**Contextual Disambiguation** β€” "
81
+ "Resolves ambiguity via sentence-level probability."
82
+ )
83
  st.markdown("---")
84
  st.write("Β© 2026 Kalana Chandrasekara")
85
 
86
  st.title("ΰ·ƒΰ·’ΰΆ‚Code: Context-Aware Transliteration")
87
+ st.markdown(
88
+ "Type Singlish sentences below. "
89
+ "The system handles **code-mixing**, **ambiguity**, and **punctuation**."
90
+ )
91
 
92
+ input_text = st.text_area(
93
+ "Input Text", height=100, placeholder="e.g., Singlish sentences type krnna"
94
+ )
95
 
96
  if st.button("Transliterate", type="primary", use_container_width=True) and input_text:
97
  try:
98
  with st.spinner("Processing..."):
99
+ decoder = _load_decoder()
100
+ t0 = time.time()
101
+ result, trace_logs = decoder.decode(input_text, mode=decode_mode)
102
+ elapsed = time.time() - t0
103
 
104
  st.success("Transliteration Complete")
105
  st.markdown(f"### {result}")
106
+ st.caption(f"Mode: {decode_mode} Β· Time: {round(elapsed, 2)}s")
107
 
108
+ with st.expander("Scoring Breakdown", expanded=True):
109
+ st.caption(
110
+ "MLM = contextual fit Β· Fid = transliteration fidelity Β· "
111
+ "Rank = dictionary prior Β· πŸ”€ = English"
112
+ )
113
  for log in trace_logs:
114
  st.markdown(log)
115
  st.divider()
english_20k.txt ADDED
The diff for this file is too large to render. See raw diff
 
sincode_model.py CHANGED
@@ -6,9 +6,10 @@ Architecture (Tiered Decoding):
6
  2. Dictionary Lookup – Retrieves Sinhala candidates from 5.9M-word DB
7
  3. Phonetic Rules – Generates fallback transliteration for unknown words
8
  4. Data-Driven Scorer – Ranks ALL candidates using:
9
- a) XLM-R MLM contextual probability (60%)
10
- b) Source-aware fidelity (40%)
11
- 5. Beam Search – Finds the globally optimal word sequence
 
12
 
13
  Author: Kalana Chandrasekara (2026)
14
  """
@@ -34,17 +35,16 @@ DEFAULT_DICTIONARY_PATH = "dictionary.pkl"
34
  ENGLISH_CORPUS_URL = (
35
  "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
36
  )
37
- ENGLISH_CORPUS_CACHE = "english_20k.txt"
38
 
39
  # Scoring weights (tunable hyperparameters)
40
- W_MLM: float = 0.40 # Contextual language model probability
41
- W_FIDELITY: float = 0.60 # Source-aware transliteration fidelity
42
  W_RANK: float = 0.00 # Dictionary rank prior (disabled β€” dict is unordered)
43
 
44
  MAX_CANDIDATES: int = 8 # Max candidates per word position
45
  DEFAULT_BEAM_WIDTH: int = 5 # Beam search width
46
  FIDELITY_SCALE: float = 10.0 # Edit-distance penalty multiplier
47
- DICT_FIDELITY_DAMP: float = 1.5 # Damping factor for dict candidates' fidelity
48
  MIN_ENGLISH_LEN: int = 3 # Min word length for 20k-corpus English detection
49
  SINHALA_VIRAMA: str = '\u0DCA' # Sinhala virama (hal) character
50
  ZWJ: str = '\u200D' # Zero-width joiner (for conjuncts)
@@ -57,10 +57,55 @@ CORE_ENGLISH_WORDS: Set[str] = {
57
  "transliteration", "sincode", "prototype", "assignment", "singlish",
58
  "rest", "complete", "tutorial", "small", "mistakes", "game", "play",
59
  "type", "test", "online", "code", "mixing", "project", "demo", "today",
60
- "tomorrow", "presentation", "slide",
 
 
 
 
 
 
 
61
  }
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  # ─── English Vocabulary ─────────────────────────────────────────────────────
65
 
66
  def load_english_vocab() -> Set[str]:
@@ -94,6 +139,122 @@ def load_english_vocab() -> Set[str]:
94
  ENGLISH_VOCAB: Set[str] = load_english_vocab()
95
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  # ─── Rule-Based Transliteration Engine ───────────────────────────────────────
98
  # Phonetic mapping tables (Singlish Romanized β†’ Sinhala Unicode)
99
  # Tables are ordered longest-pattern-first so greedy replacement works.
@@ -103,7 +264,7 @@ CONSONANTS: List[str] = [
103
  "th", "dh", "gh", "ch", "ph", "bh", "jh", "sh",
104
  "GN", "KN", "Lu", "kh", "Th", "Dh",
105
  "S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m",
106
- "\\y", # FIX: was "\\u005C"+"y" (never matched)
107
  "Y", "y", "j", "l", "v", "w", "s", "h",
108
  "N", "L", "K", "G", "P", "B", "f", "g", "r",
109
  ]
@@ -199,6 +360,17 @@ class ScoredCandidate:
199
  is_english: bool = False
200
 
201
 
 
 
 
 
 
 
 
 
 
 
 
202
  class CandidateScorer:
203
  """
204
  Data-driven replacement for the old hardcoded penalty table.
@@ -270,13 +442,11 @@ class CandidateScorer:
270
  """
271
  Source-aware transliteration fidelity.
272
 
273
- The fidelity signal considers *where* a candidate came from:
274
-
275
  - **English matching input** β†’ 0.0 (user-intent preservation).
276
- - **Dictionary candidates** β†’ damped Levenshtein distance to
277
- rule output (50% scale). Dictionary validation proves the
278
- candidate is a real word, reducing penalty, but phonetic
279
- closeness to the typed input is still rewarded.
280
  - **Rule-only outputs not in dictionary** β†’ penalised by
281
  consonant-skeleton density (high virama ratio = malformed).
282
  - **Other** β†’ full Levenshtein distance to rule output.
@@ -285,28 +455,23 @@ class CandidateScorer:
285
  if original_input and candidate.lower() == original_input.lower():
286
  return 0.0
287
 
288
- # 2. Dictionary-validated candidate β†’ damped fidelity
289
- # Uses Levenshtein distance to rule output at reduced scale:
290
- # being in the dictionary validates as a real word, but
291
- # phonetic closeness to what the user typed still matters.
292
  if is_from_dict:
 
293
  if candidate == rule_output:
294
- return 0.0
295
  max_len = max(len(candidate), len(rule_output), 1)
296
  norm_dist = self.levenshtein(candidate, rule_output) / max_len
297
- return -norm_dist * self.fidelity_scale * DICT_FIDELITY_DAMP
298
 
299
  # 3. Rule-only output (not validated by dictionary)
300
  if candidate == rule_output:
301
- # Measure consonant-skeleton density: count bare viramas
302
- # (virama NOT followed by ZWJ, which would form a conjunct).
303
  bare_virama = sum(
304
  1 for i, ch in enumerate(candidate)
305
  if ch == SINHALA_VIRAMA
306
  and (i + 1 >= len(candidate) or candidate[i + 1] != ZWJ)
307
  )
308
  density = bare_virama / max(len(candidate), 1)
309
- # High density β†’ consonant skeleton, not a real word
310
  return -density * self.fidelity_scale * 2
311
 
312
  # 4. English word not matching input β€” uncertain
@@ -370,7 +535,7 @@ class DictionaryAdapter:
370
  def __init__(self, dictionary_dict: Dict[str, List[str]]):
371
  self.dictionary = dictionary_dict
372
 
373
- def get_candidates(self, word: str) -> List[str]:
374
  """
375
  Return candidate transliterations for a Romanized word.
376
 
@@ -378,6 +543,10 @@ class DictionaryAdapter:
378
  1. English corpus match β†’ keep original word
379
  2. Dictionary lookup β†’ exact / lowercase
380
  3. Subword decomposition β†’ only when 1 & 2 yield nothing
 
 
 
 
381
  """
382
  cands: List[str] = []
383
  word_lower = word.lower()
@@ -394,7 +563,16 @@ class DictionaryAdapter:
394
 
395
  # 3. Deduplicate preserving order
396
  if cands:
397
- return list(dict.fromkeys(cands))
 
 
 
 
 
 
 
 
 
398
 
399
  # 4. Subword fallback (compound words)
400
  length = len(word)
@@ -526,17 +704,288 @@ class BeamSearchDecoder:
526
  self,
527
  sentence: str,
528
  beam_width: int = DEFAULT_BEAM_WIDTH,
 
529
  ) -> Tuple[str, List[str]]:
530
  """
531
  Transliterate a full Singlish sentence into Sinhala script.
532
 
 
 
 
 
533
  Returns:
534
  result – the best transliteration string
535
  trace_logs – per-step markdown logs for the debug UI
536
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
  words = sentence.split()
538
  if not words:
539
- return "", []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
 
541
  # ── Phase 1: candidate generation ────────────────────────────
542
  word_infos: List[dict] = []
@@ -555,8 +1004,8 @@ class BeamSearchDecoder:
555
  })
556
  continue
557
 
558
- cands = self.adapter.get_candidates(core)
559
  rule_output = self.adapter.get_rule_output(core)
 
560
 
561
  # Track which candidates are dictionary-validated
562
  dict_entries: Set[str] = set()
@@ -612,14 +1061,89 @@ class BeamSearchDecoder:
612
  # ── Phase 2: beam search with data-driven scoring ────────────
613
  beam: List[Tuple[List[str], float]] = [([], 0.0)]
614
  trace_logs: List[str] = []
 
615
 
616
  for t, info in enumerate(word_infos):
617
  candidates = info["candidates"]
618
  eng_flags = info["english_flags"]
619
  d_flags = info.get("dict_flags", [False] * len(candidates))
620
  rule_out = info["rule_output"]
 
 
621
  total_cands = len(candidates)
622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  # Build left/right context pairs for multi-mask MLM scoring
624
  batch_left: List[str] = []
625
  batch_right: List[str] = []
@@ -639,6 +1163,15 @@ class BeamSearchDecoder:
639
 
640
  mlm_scores = self._batch_mlm_score(batch_left, batch_right, batch_tgt)
641
 
 
 
 
 
 
 
 
 
 
642
  # ── MLM floor for English code-switching ─────────────────
643
  # XLM-R is not calibrated for Singlish code-mixing: English
644
  # tokens in Sinhala context receive disproportionately low
@@ -657,6 +1190,7 @@ class BeamSearchDecoder:
657
 
658
  # ── Score & trace ────────────────────────────────────────
659
  next_beam: List[Tuple[List[str], float]] = []
 
660
  step_log = f"**Step {t + 1}: `{words[t]}`** &nbsp;(rule β†’ `{rule_out}`)\n\n"
661
 
662
  for i, mlm in enumerate(mlm_scores):
@@ -685,6 +1219,7 @@ class BeamSearchDecoder:
685
 
686
  new_total = orig_score + scored.combined_score
687
  next_beam.append((orig_path + [cand], new_total))
 
688
 
689
  # Trace log (skip very low scores to reduce noise)
690
  if mlm > -25.0:
@@ -701,5 +1236,23 @@ class BeamSearchDecoder:
701
 
702
  beam = sorted(next_beam, key=lambda x: x[1], reverse=True)[:beam_width]
703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
  result = " ".join(beam[0][0]) if beam else ""
705
- return result, trace_logs
 
6
  2. Dictionary Lookup – Retrieves Sinhala candidates from 5.9M-word DB
7
  3. Phonetic Rules – Generates fallback transliteration for unknown words
8
  4. Data-Driven Scorer – Ranks ALL candidates using:
9
+ a) XLM-R MLM contextual probability (55%, min-max normalised)
10
+ b) Source-aware fidelity (45%)
11
+ 5. Common Word Override – Bypasses scoring for frequent unambiguous words
12
+ 6. Beam / Greedy Search – Finds the globally optimal word sequence
13
 
14
  Author: Kalana Chandrasekara (2026)
15
  """
 
35
  ENGLISH_CORPUS_URL = (
36
  "https://raw.githubusercontent.com/first20hours/google-10000-english/master/20k.txt"
37
  )
 
38
 
39
  # Scoring weights (tunable hyperparameters)
40
+ W_MLM: float = 0.55 # Contextual language model probability
41
+ W_FIDELITY: float = 0.45 # Source-aware transliteration fidelity
42
  W_RANK: float = 0.00 # Dictionary rank prior (disabled β€” dict is unordered)
43
 
44
  MAX_CANDIDATES: int = 8 # Max candidates per word position
45
  DEFAULT_BEAM_WIDTH: int = 5 # Beam search width
46
  FIDELITY_SCALE: float = 10.0 # Edit-distance penalty multiplier
47
+ DICT_FIDELITY_DAMP: float = 2.0 # Decay rate for dict bonus (higher = stricter filter)
48
  MIN_ENGLISH_LEN: int = 3 # Min word length for 20k-corpus English detection
49
  SINHALA_VIRAMA: str = '\u0DCA' # Sinhala virama (hal) character
50
  ZWJ: str = '\u200D' # Zero-width joiner (for conjuncts)
 
57
  "transliteration", "sincode", "prototype", "assignment", "singlish",
58
  "rest", "complete", "tutorial", "small", "mistakes", "game", "play",
59
  "type", "test", "online", "code", "mixing", "project", "demo", "today",
60
+ "tomorrow", "presentation", "slide", "submit", "feedback", "deploy",
61
+ "merge", "update", "delete", "download", "upload", "install", "server",
62
+ "meeting", "backlog", "comment", "reply", "chat", "selfie", "post",
63
+ "share", "private", "message", "group", "study", "exam", "results",
64
+ "viva", "prepared", "site", "redo", "story", "poll",
65
+ "hall", "exam", "PR", "DM", "page", "app", "bug", "fix",
66
+ "log", "push", "pull", "branch", "build", "run", "save",
67
+ "link", "edit", "file", "open", "close", "live", "view",
68
  }
69
 
70
 
71
+ def _resolve_english_cache_path() -> str:
72
+ """
73
+ Resolve a writable cache path for the English corpus.
74
+
75
+ Hugging Face Spaces may run with constrained write locations, so we prefer:
76
+ 1) explicit env override,
77
+ 2) HF_HOME cache dir,
78
+ 3) local working dir,
79
+ 4) system temp dir.
80
+ """
81
+ override = os.getenv("SICODE_ENGLISH_CACHE")
82
+ if override:
83
+ return override
84
+
85
+ candidates = [
86
+ os.path.join(os.getenv("HF_HOME", ""), "english_20k.txt") if os.getenv("HF_HOME") else "",
87
+ os.path.join(os.getcwd(), "english_20k.txt"),
88
+ os.path.join(os.getenv("TMPDIR", os.getenv("TEMP", "/tmp")), "english_20k.txt"),
89
+ ]
90
+
91
+ for path in candidates:
92
+ if not path:
93
+ continue
94
+ parent = os.path.dirname(path) or "."
95
+ try:
96
+ os.makedirs(parent, exist_ok=True)
97
+ with open(path, "a", encoding="utf-8"):
98
+ pass
99
+ return path
100
+ except OSError:
101
+ continue
102
+
103
+ return "english_20k.txt"
104
+
105
+
106
+ ENGLISH_CORPUS_CACHE = _resolve_english_cache_path()
107
+
108
+
109
  # ─── English Vocabulary ─────────────────────────────────────────────────────
110
 
111
  def load_english_vocab() -> Set[str]:
 
139
  ENGLISH_VOCAB: Set[str] = load_english_vocab()
140
 
141
 
142
+ # ─── Common Word Overrides ──────────────────────────────────────────────────
143
+ # High-frequency Singlish words whose romanisation is ambiguous (long vs.
144
+ # short vowel, retroflex vs. dental, etc.). When a word appears here the
145
+ # decoder uses the override directly, bypassing MLM/fidelity scoring.
146
+ # Only add words that are *unambiguous* β€” i.e. one dominant Sinhala form
147
+ # in colloquial written chat. Context-dependent words (e.g. "eka") should
148
+ # NOT be listed so that MLM can resolve them.
149
+
150
+ COMMON_WORDS: Dict[str, str] = {
151
+ # Pronouns & particles
152
+ "oya": "࢔ࢺා", # you
153
+ "oyaa": "࢔ࢺා",
154
+ "eya": "࢒ࢺා", # he/she
155
+ "eyaa": "࢒ࢺා",
156
+ "api": "ΰΆ…ΰΆ΄ΰ·’", # we
157
+ "mama": "ΰΆΈΰΆΈ", # I
158
+ "mage": "ࢸ࢜ේ", # my
159
+ "oyage": "ΰΆ”ΰΆΊΰ·ΰΆœΰ·š", # your
160
+ # Common verbs (past tense)
161
+ "awa": "ࢆවා", # came
162
+ "aawa": "ࢆවා",
163
+ "giya": "ΰΆœΰ·’ΰΆΊΰ·", # went
164
+ "kala": "ΰΆšΰ·…ΰ·", # did
165
+ "kiwa": "ΰΆšΰ·’ΰ·€ΰ·Šΰ·€ΰ·", # said
166
+ "kiwwa": "ΰΆšΰ·’ΰ·€ΰ·Šΰ·€ΰ·",
167
+ "yewwa": "ΰΆΊΰ·ΰ·€ΰ·Šΰ·€ΰ·", # sent
168
+ "gawa": "ΰΆœΰ·ΰ·€ΰ·Šΰ·€ΰ·", # hit
169
+ "katha": "࢚ࢭා", # talked / story
170
+ # Time
171
+ "heta": "ΰ·„ΰ·™ΰΆ§", # tomorrow
172
+ "ada": "ΰΆ…ΰΆ―", # today
173
+ "iye": "ࢊࢺේ", # yesterday
174
+ # Common adverbs / particles
175
+ "one": "ΰΆ•ΰΆ±ΰ·™", # need/want
176
+ "oney": "ΰΆ•ΰΆ±ΰ·š",
177
+ "naa": "ΰΆ±ΰ·‘", # no (long form)
178
+ "na": "ΰΆ±ΰ·‘", # no
179
+ "hari": "ΰ·„ΰΆ»ΰ·’", # ok / right
180
+ "wage": "ΰ·€ΰΆœΰ·š", # like
181
+ "nisa": "ࢱිසා", # because
182
+ "inne": "ΰΆ‰ΰΆ±ΰ·ŠΰΆ±ΰ·™", # being/staying (colloquial)
183
+ "inna": "ΰΆ‰ΰΆ±ΰ·ŠΰΆ±", # stay (imperative)
184
+ "kalin": "ΰΆšΰΆ½ΰ·’ΰΆ±ΰ·Š", # before / earlier
185
+ # Common verb endings
186
+ "giye": "ΰΆœΰ·’ΰΆΊΰ·š", # went (emphatic)
187
+ "una": "ࢋࢱා", # became / happened
188
+ "wuna": "ࢋࢱා", # became (alt spelling)
189
+ # Locations / misc
190
+ "gedaradi": "ΰΆœΰ·™ΰΆ―ΰΆ»ΰΆ―ΰ·“", # at home
191
+ "gedara": "ΰΆœΰ·™ΰΆ―ΰΆ»", # home
192
+ # Common adjectives / other
193
+ "honda": "ΰ·„ΰ·œΰΆ³", # good
194
+ "ape": "ΰΆ…ΰΆ΄ΰ·š", # our
195
+ "me": "ࢸේ", # this
196
+ "passe": "ΰΆ΄ΰ·ƒΰ·Šΰ·ƒΰ·™", # after / later
197
+ "ba": "ΰΆΆΰ·‘", # can't
198
+ "bari": "ࢢැࢻි", # impossible
199
+ "bri": "ࢢැࢻි", # can't (abbrev)
200
+ "danne": "ΰΆ―ΰΆ±ΰ·ŠΰΆ±ΰ·™", # know
201
+ "wada": "වැࢩ", # work (noun)
202
+ "epa": "࢑ࢴා", # don't
203
+ # Common ad-hoc abbreviations
204
+ "mta": "ΰΆΈΰΆ§", # mata
205
+ "oyta": "࢔ࢺාࢧ", # oyata
206
+ "oyata": "࢔ࢺාࢧ", # to you
207
+ "krnna": "࢚ࢻࢱ්ࢱ", # karanna
208
+ "blnna": "ࢢࢽࢱ්ࢱ", # balanna
209
+ "on": "ΰΆ•ΰΆ±ΰ·™", # one (abbrev)
210
+ # Common -nawa verb endings
211
+ "thiyanawa": "ࢭිࢺෙࢱවා", # is/has
212
+ "wenawa": "වෙࢱවා", # becomes
213
+ "enawa": "࢑ࢱවා", # comes
214
+ "yanawa": "ࢺࢱවා", # goes
215
+ "hithenawa":"හිࢭෙࢱවා", # thinks/feels
216
+ "penenawa": "ΰΆ΄ΰ·šΰΆ±ΰ·€ΰ·", # appears/visible
217
+ "karamu": "ΰΆšΰΆ»ΰΆΈΰ·”", # let's do
218
+ "balamu": "ΰΆΆΰΆ½ΰΆΈΰ·”", # let's see
219
+ "damu": "ࢯාࢸු", # let's put
220
+ "yamu": "ΰΆΊΰΆΈΰ·”", # let's go
221
+ # Short English abbreviations (keys are lowercase for lookup)
222
+ "pr": "PR",
223
+ "dm": "DM",
224
+ "ai": "AI",
225
+ "it": "IT",
226
+ "qa": "QA",
227
+ "ui": "UI",
228
+ "ok": "OK",
229
+ # Common ad-hoc abbreviations (contd.)
230
+ "ek": "ΰΆ‘ΰΆš", # eka (short form)
231
+ "ekta": "ΰΆ‘ΰΆšΰΆ§", # ekata = to that one
232
+ "ekat": "ΰΆ’ΰΆšΰΆ§", # that-thing + to (standalone form)
233
+ "eke": "ΰΆ‘ΰΆšΰ·š", # of that one
234
+ "hta": "ΰ·„ΰ·™ΰΆ§", # heta (abbrev)
235
+ "damma": "ࢯැࢸ්ࢸා", # put/posted
236
+ "gannako": "࢜ࢱ්ࢱ࢚ෝ", # take (imperative, long ō)
237
+ # Additional words for accuracy
238
+ "gena": "࢜ැࢱ", # about
239
+ "mata": "ΰΆΈΰΆ§", # to me
240
+ "laga": "ΰ·…ΰΆŸ", # near
241
+ "poth": "ࢴොࢭ", # book
242
+ "iwara": "ΰΆ‰ΰ·€ΰΆ»", # finished
243
+ "karanna": "࢚ࢻࢱ්ࢱ", # to do
244
+ "hadamu": "ΰ·„ΰΆ―ΰΆΈΰ·”", # let's make
245
+ "kiyawala": "ΰΆšΰ·’ΰΆΊΰ·€ΰΆ½ΰ·", # having read
246
+ "baya": "ΰΆΆΰΆΊ", # fear/scared
247
+ }
248
+
249
+ # Context-dependent words: use this form ONLY when the previous word is
250
+ # NOT English. When "eka" follows an English noun (e.g., "assignment eka")
251
+ # the scorer resolves it to ΰΆ‘ΰΆš naturally; standalone "eka" maps to ΰΆ’ΰΆš.
252
+ CONTEXT_WORDS_STANDALONE: Dict[str, str] = {
253
+ "eka": "ΰΆ’ΰΆš", # that thing (standalone)
254
+ "ekak": "ΰΆ‘ΰΆšΰΆšΰ·Š", # one of (quantifier β€” same either way)
255
+ }
256
+
257
+
258
  # ─── Rule-Based Transliteration Engine ───────────────────────────────────────
259
  # Phonetic mapping tables (Singlish Romanized β†’ Sinhala Unicode)
260
  # Tables are ordered longest-pattern-first so greedy replacement works.
 
264
  "th", "dh", "gh", "ch", "ph", "bh", "jh", "sh",
265
  "GN", "KN", "Lu", "kh", "Th", "Dh",
266
  "S", "d", "c", "th", "t", "k", "D", "n", "p", "b", "m",
267
+ "\\y",
268
  "Y", "y", "j", "l", "v", "w", "s", "h",
269
  "N", "L", "K", "G", "P", "B", "f", "g", "r",
270
  ]
 
360
  is_english: bool = False
361
 
362
 
363
@dataclass
class WordDiagnostic:
    """Structured per-word diagnostics for evaluation and error analysis."""
    # 0-based position of the word within the input sentence.
    step_index: int
    # The raw Romanized (Singlish) token as typed by the user.
    input_word: str
    # Output of the rule-based phonetic transliterator for this word
    # (includes any punctuation prefix/suffix carried over from the input).
    rule_output: str
    # Candidate ultimately emitted by the decoder for this position.
    selected_candidate: str
    # Decoder score of the selected candidate; 0.0 when the word was
    # resolved by an override table and scoring was bypassed.
    beam_score: float
    # Scoring breakdown of every candidate considered; empty for overrides.
    candidate_breakdown: List[ScoredCandidate]
372
+
373
+
374
  class CandidateScorer:
375
  """
376
  Data-driven replacement for the old hardcoded penalty table.
 
442
  """
443
  Source-aware transliteration fidelity.
444
 
 
 
445
  - **English matching input** β†’ 0.0 (user-intent preservation).
446
+ - **Dict + matches rule output** β†’ strong bonus (+2.0). Both
447
+ signals agree β€” highest confidence.
448
+ - **Dict only** β†’ decaying bonus (1.0 down to 0.0 with distance
449
+ from rule output). Still a real word, but less certain.
450
  - **Rule-only outputs not in dictionary** β†’ penalised by
451
  consonant-skeleton density (high virama ratio = malformed).
452
  - **Other** β†’ full Levenshtein distance to rule output.
 
455
  if original_input and candidate.lower() == original_input.lower():
456
  return 0.0
457
 
458
+ # 2. Dictionary-validated candidates
 
 
 
459
  if is_from_dict:
460
+ # Rule output confirmed by dictionary = highest confidence
461
  if candidate == rule_output:
462
+ return 2.0
463
  max_len = max(len(candidate), len(rule_output), 1)
464
  norm_dist = self.levenshtein(candidate, rule_output) / max_len
465
+ return max(0.0, 1.0 - norm_dist * DICT_FIDELITY_DAMP)
466
 
467
  # 3. Rule-only output (not validated by dictionary)
468
  if candidate == rule_output:
 
 
469
  bare_virama = sum(
470
  1 for i, ch in enumerate(candidate)
471
  if ch == SINHALA_VIRAMA
472
  and (i + 1 >= len(candidate) or candidate[i + 1] != ZWJ)
473
  )
474
  density = bare_virama / max(len(candidate), 1)
 
475
  return -density * self.fidelity_scale * 2
476
 
477
  # 4. English word not matching input β€” uncertain
 
535
  def __init__(self, dictionary_dict: Dict[str, List[str]]):
536
  self.dictionary = dictionary_dict
537
 
538
+ def get_candidates(self, word: str, rule_output: str = "") -> List[str]:
539
  """
540
  Return candidate transliterations for a Romanized word.
541
 
 
543
  1. English corpus match β†’ keep original word
544
  2. Dictionary lookup β†’ exact / lowercase
545
  3. Subword decomposition β†’ only when 1 & 2 yield nothing
546
+
547
+ When more candidates exist than MAX_CANDIDATES, results are
548
+ sorted by Levenshtein distance to ``rule_output`` so the most
549
+ phonetically plausible entries survive the cut.
550
  """
551
  cands: List[str] = []
552
  word_lower = word.lower()
 
563
 
564
  # 3. Deduplicate preserving order
565
  if cands:
566
+ cands = list(dict.fromkeys(cands))
567
+ # Sort Sinhala candidates by closeness to rule output
568
+ if rule_output and len(cands) > MAX_CANDIDATES:
569
+ english = [c for c in cands if c.lower() in ENGLISH_VOCAB]
570
+ sinhala = [c for c in cands if c.lower() not in ENGLISH_VOCAB]
571
+ sinhala.sort(
572
+ key=lambda c: CandidateScorer.levenshtein(c, rule_output)
573
+ )
574
+ cands = english + sinhala
575
+ return cands
576
 
577
  # 4. Subword fallback (compound words)
578
  length = len(word)
 
704
  self,
705
  sentence: str,
706
  beam_width: int = DEFAULT_BEAM_WIDTH,
707
+ mode: str = "greedy",
708
  ) -> Tuple[str, List[str]]:
709
  """
710
  Transliterate a full Singlish sentence into Sinhala script.
711
 
712
+ Args:
713
+ mode: "greedy" (accurate, uses dynamic context) or
714
+ "beam" (faster, uses fixed rule-based context)
715
+
716
  Returns:
717
  result – the best transliteration string
718
  trace_logs – per-step markdown logs for the debug UI
719
  """
720
+ if mode == "greedy":
721
+ result, trace_logs, _ = self.greedy_decode_with_diagnostics(sentence)
722
+ else:
723
+ result, trace_logs, _ = self.decode_with_diagnostics(
724
+ sentence=sentence,
725
+ beam_width=beam_width,
726
+ )
727
+ return result, trace_logs
728
+
729
+ # ── Greedy decode (dynamic context β€” more accurate) ──────────────
730
+
731
+ def greedy_decode_with_diagnostics(
732
+ self,
733
+ sentence: str,
734
+ ) -> Tuple[str, List[str], List[WordDiagnostic]]:
735
+ """
736
+ Greedy word-by-word decode using actual selected outputs as
737
+ left context for subsequent MLM scoring.
738
+
739
+ More accurate than beam search with fixed context because XLM-R
740
+ sees the real transliteration built so far, not rule-based guesses.
741
+ """
742
  words = sentence.split()
743
  if not words:
744
+ return "", [], []
745
+
746
+ # ── Phase 1: candidate generation (same as beam) ─────────────
747
+ word_infos: List[dict] = []
748
+
749
+ for raw in words:
750
+ match = PUNCT_PATTERN.match(raw)
751
+ prefix, core, suffix = match.groups() if match else ("", raw, "")
752
+
753
+ if not core:
754
+ word_infos.append({
755
+ "candidates": [raw],
756
+ "rule_output": raw,
757
+ "english_flags": [False],
758
+ "dict_flags": [False],
759
+ "prefix": prefix,
760
+ "suffix": suffix,
761
+ })
762
+ continue
763
+
764
+ rule_output = self.adapter.get_rule_output(core)
765
+ cands = self.adapter.get_candidates(core, rule_output)
766
+
767
+ dict_entries: Set[str] = set()
768
+ if core in self.adapter.dictionary:
769
+ dict_entries.update(self.adapter.dictionary[core])
770
+ elif core.lower() in self.adapter.dictionary:
771
+ dict_entries.update(self.adapter.dictionary[core.lower()])
772
+
773
+ if rule_output and rule_output not in cands:
774
+ cands.append(rule_output)
775
+ if not cands:
776
+ cands = [rule_output]
777
+
778
+ english_flags = [c.lower() in ENGLISH_VOCAB for c in cands]
779
+ dict_flags = [c in dict_entries for c in cands]
780
+
781
+ full_cands = [prefix + c + suffix for c in cands]
782
+
783
+ word_infos.append({
784
+ "candidates": full_cands[:MAX_CANDIDATES],
785
+ "rule_output": prefix + rule_output + suffix,
786
+ "english_flags": english_flags[:MAX_CANDIDATES],
787
+ "dict_flags": dict_flags[:MAX_CANDIDATES],
788
+ "prefix": prefix,
789
+ "suffix": suffix,
790
+ })
791
+
792
+ # Build right-side stable context (rule outputs for future words)
793
+ stable_right: List[str] = []
794
+ for info in word_infos:
795
+ eng_cands = [
796
+ c for c, e in zip(info["candidates"], info["english_flags"]) if e
797
+ ]
798
+ stable_right.append(
799
+ eng_cands[0] if eng_cands else info["rule_output"]
800
+ )
801
+
802
+ # ── Phase 2: greedy word-by-word with dynamic left context ───
803
+ selected_words: List[str] = []
804
+ trace_logs: List[str] = []
805
+ diagnostics: List[WordDiagnostic] = []
806
+
807
+ for t, info in enumerate(word_infos):
808
+ candidates = info["candidates"]
809
+ eng_flags = info["english_flags"]
810
+ d_flags = info.get("dict_flags", [False] * len(candidates))
811
+ rule_out = info["rule_output"]
812
+ prefix = info.get("prefix", "")
813
+ suffix = info.get("suffix", "")
814
+ total_cands = len(candidates)
815
+
816
+ # ── Common-word shortcut ─────────────────────────────────
817
+ core_lower = words[t].lower().strip()
818
+ if core_lower in COMMON_WORDS:
819
+ override = prefix + COMMON_WORDS[core_lower] + suffix
820
+ selected_words.append(override)
821
+ trace_logs.append(
822
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
823
+ f"`{override}` (common-word override)\n"
824
+ )
825
+ diagnostics.append(WordDiagnostic(
826
+ step_index=t,
827
+ input_word=words[t],
828
+ rule_output=rule_out,
829
+ selected_candidate=override,
830
+ beam_score=0.0,
831
+ candidate_breakdown=[],
832
+ ))
833
+ continue
834
+
835
+ # ── Context-dependent standalone overrides ─────────────���──
836
+ # Words like "eka" that change form depending on whether the
837
+ # previous word was English (e.g., "assignment eka" β†’ ΰΆ‘ΰΆš)
838
+ # or Sinhala / start of sentence ("eka heta" β†’ ΰΆ’ΰΆš).
839
+ if core_lower in CONTEXT_WORDS_STANDALONE:
840
+ prev_word_lower = words[t - 1].lower() if t > 0 else ""
841
+ prev_common_val = COMMON_WORDS.get(prev_word_lower, "")
842
+ prev_is_english = (
843
+ t > 0
844
+ and (
845
+ prev_word_lower in ENGLISH_VOCAB
846
+ or prev_common_val.isascii() and prev_common_val != ""
847
+ )
848
+ )
849
+ if not prev_is_english:
850
+ override = prefix + CONTEXT_WORDS_STANDALONE[core_lower] + suffix
851
+ selected_words.append(override)
852
+ trace_logs.append(
853
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
854
+ f"`{override}` (standalone override)\n"
855
+ )
856
+ diagnostics.append(WordDiagnostic(
857
+ step_index=t,
858
+ input_word=words[t],
859
+ rule_output=rule_out,
860
+ selected_candidate=override,
861
+ beam_score=0.0,
862
+ candidate_breakdown=[],
863
+ ))
864
+ continue
865
+
866
+ # ── English-word shortcut ────────────────────────────────
867
+ if (
868
+ len(core_lower) >= MIN_ENGLISH_LEN
869
+ and core_lower in ENGLISH_VOCAB
870
+ ):
871
+ selected_words.append(words[t])
872
+ trace_logs.append(
873
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
874
+ f"`{words[t]}` (English preserved)\n"
875
+ )
876
+ diagnostics.append(WordDiagnostic(
877
+ step_index=t,
878
+ input_word=words[t],
879
+ rule_output=rule_out,
880
+ selected_candidate=words[t],
881
+ beam_score=0.0,
882
+ candidate_breakdown=[],
883
+ ))
884
+ continue
885
+
886
+ # Dynamic left context = actual selected outputs so far
887
+ left_ctx = " ".join(selected_words) if selected_words else ""
888
+ # Right context = rule-based stable context for future words
889
+ right_ctx = " ".join(stable_right[t + 1:]) if t + 1 < len(words) else ""
890
+
891
+ # Score all candidates for this position in one batch
892
+ batch_left = [left_ctx] * total_cands
893
+ batch_right = [right_ctx] * total_cands
894
+
895
+ mlm_scores = self._batch_mlm_score(batch_left, batch_right, candidates)
896
+
897
+ # ── Min-max normalise MLM to [0, 1] ─────────────────────
898
+ # Raw log-probs span a wide range (e.g. βˆ’5 to βˆ’25) and can
899
+ # drown out fidelity. Per-position normalisation makes the
900
+ # two signals weight-comparable.
901
+ mlm_min = min(mlm_scores)
902
+ mlm_max = max(mlm_scores)
903
+ mlm_range = mlm_max - mlm_min
904
+ if mlm_range > 1e-9:
905
+ mlm_scores = [(m - mlm_min) / mlm_range for m in mlm_scores]
906
+ else:
907
+ mlm_scores = [1.0] * len(mlm_scores)
908
+
909
+ # MLM floor for English code-switching
910
+ best_nonenglish_mlm = -1e9
911
+ for i, mlm in enumerate(mlm_scores):
912
+ is_eng = eng_flags[i] if i < len(eng_flags) else False
913
+ if not is_eng and mlm > best_nonenglish_mlm:
914
+ best_nonenglish_mlm = mlm
915
+
916
+ # Score & select best candidate
917
+ step_log = f"**Step {t + 1}: `{words[t]}`** &nbsp;(rule β†’ `{rule_out}`)\n\n"
918
+ best_scored: Optional[ScoredCandidate] = None
919
+ candidate_breakdown: List[ScoredCandidate] = []
920
+
921
+ for i, mlm in enumerate(mlm_scores):
922
+ cand = candidates[i]
923
+ is_eng = eng_flags[i] if i < len(eng_flags) else False
924
+ is_dict = d_flags[i] if i < len(d_flags) else False
925
+
926
+ effective_mlm = mlm
927
+ if is_eng and cand.lower() == words[t].lower():
928
+ effective_mlm = max(mlm, best_nonenglish_mlm)
929
+
930
+ scored = self.scorer.score(
931
+ mlm_score=effective_mlm,
932
+ candidate=cand,
933
+ rule_output=rule_out,
934
+ rank=i,
935
+ total_candidates=total_cands,
936
+ is_english=is_eng,
937
+ original_input=words[t],
938
+ is_from_dict=is_dict,
939
+ )
940
+ candidate_breakdown.append(scored)
941
+
942
+ if best_scored is None or scored.combined_score > best_scored.combined_score:
943
+ best_scored = scored
944
+
945
+ if mlm > -25.0:
946
+ eng_tag = " πŸ”€" if is_eng else ""
947
+ step_log += (
948
+ f"- `{cand}`{eng_tag} &nbsp; "
949
+ f"MLM={scored.mlm_score:.2f} &nbsp; "
950
+ f"Fid={scored.fidelity_score:.2f} &nbsp; "
951
+ f"Rank={scored.rank_score:.2f} β†’ "
952
+ f"**{scored.combined_score:.2f}**\n"
953
+ )
954
+
955
+ trace_logs.append(step_log)
956
+
957
+ selected = best_scored.text if best_scored else rule_out
958
+ selected_words.append(selected)
959
+
960
+ candidate_breakdown.sort(key=lambda s: s.combined_score, reverse=True)
961
+ diagnostics.append(WordDiagnostic(
962
+ step_index=t,
963
+ input_word=words[t],
964
+ rule_output=rule_out,
965
+ selected_candidate=selected,
966
+ beam_score=best_scored.combined_score if best_scored else 0.0,
967
+ candidate_breakdown=candidate_breakdown,
968
+ ))
969
+
970
+ result = " ".join(selected_words)
971
+ return result, trace_logs, diagnostics
972
+
973
+ def decode_with_diagnostics(
974
+ self,
975
+ sentence: str,
976
+ beam_width: int = DEFAULT_BEAM_WIDTH,
977
+ ) -> Tuple[str, List[str], List[WordDiagnostic]]:
978
+ """
979
+ Decode sentence and return detailed per-word diagnostics.
980
+
981
+ Returns:
982
+ result – best transliterated sentence
983
+ trace_logs – markdown logs used by Streamlit UI
984
+ diagnostics – structured scores and selected candidates per step
985
+ """
986
+ words = sentence.split()
987
+ if not words:
988
+ return "", [], []
989
 
990
  # ── Phase 1: candidate generation ────────────────────────────
991
  word_infos: List[dict] = []
 
1004
  })
1005
  continue
1006
 
 
1007
  rule_output = self.adapter.get_rule_output(core)
1008
+ cands = self.adapter.get_candidates(core, rule_output)
1009
 
1010
  # Track which candidates are dictionary-validated
1011
  dict_entries: Set[str] = set()
 
1061
  # ── Phase 2: beam search with data-driven scoring ────────────
1062
  beam: List[Tuple[List[str], float]] = [([], 0.0)]
1063
  trace_logs: List[str] = []
1064
+ diagnostics: List[WordDiagnostic] = []
1065
 
1066
  for t, info in enumerate(word_infos):
1067
  candidates = info["candidates"]
1068
  eng_flags = info["english_flags"]
1069
  d_flags = info.get("dict_flags", [False] * len(candidates))
1070
  rule_out = info["rule_output"]
1071
+ prefix = info.get("prefix", "")
1072
+ suffix = info.get("suffix", "")
1073
  total_cands = len(candidates)
1074
 
1075
+ # ── Common-word shortcut ─────────────────────────────────
1076
+ core_lower = words[t].lower().strip()
1077
+ if core_lower in COMMON_WORDS:
1078
+ override = prefix + COMMON_WORDS[core_lower] + suffix
1079
+ # Extend every beam path with the override
1080
+ next_beam_cw = [(path + [override], sc) for path, sc in beam]
1081
+ beam = next_beam_cw[:beam_width]
1082
+ trace_logs.append(
1083
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
1084
+ f"`{override}` (common-word override)\n"
1085
+ )
1086
+ diagnostics.append(WordDiagnostic(
1087
+ step_index=t,
1088
+ input_word=words[t],
1089
+ rule_output=rule_out,
1090
+ selected_candidate=override,
1091
+ beam_score=beam[0][1] if beam else 0.0,
1092
+ candidate_breakdown=[],
1093
+ ))
1094
+ continue
1095
+
1096
+ # ── Context-dependent standalone overrides ────────────────
1097
+ if core_lower in CONTEXT_WORDS_STANDALONE:
1098
+ prev_word_lower = words[t - 1].lower() if t > 0 else ""
1099
+ prev_common_val = COMMON_WORDS.get(prev_word_lower, "")
1100
+ prev_is_english = (
1101
+ t > 0
1102
+ and (
1103
+ prev_word_lower in ENGLISH_VOCAB
1104
+ or prev_common_val.isascii() and prev_common_val != ""
1105
+ )
1106
+ )
1107
+ if not prev_is_english:
1108
+ override = prefix + CONTEXT_WORDS_STANDALONE[core_lower] + suffix
1109
+ next_beam_ctx = [(path + [override], sc) for path, sc in beam]
1110
+ beam = next_beam_ctx[:beam_width]
1111
+ trace_logs.append(
1112
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
1113
+ f"`{override}` (standalone override)\n"
1114
+ )
1115
+ diagnostics.append(WordDiagnostic(
1116
+ step_index=t,
1117
+ input_word=words[t],
1118
+ rule_output=rule_out,
1119
+ selected_candidate=override,
1120
+ beam_score=beam[0][1] if beam else 0.0,
1121
+ candidate_breakdown=[],
1122
+ ))
1123
+ continue
1124
+
1125
+ # ── English-word shortcut ────────────────────────────────
1126
+ if (
1127
+ len(core_lower) >= MIN_ENGLISH_LEN
1128
+ and core_lower in ENGLISH_VOCAB
1129
+ ):
1130
+ eng_word = words[t]
1131
+ next_beam_eng = [(path + [eng_word], sc) for path, sc in beam]
1132
+ beam = next_beam_eng[:beam_width]
1133
+ trace_logs.append(
1134
+ f"**Step {t + 1}: `{words[t]}`** &nbsp;β†’ "
1135
+ f"`{eng_word}` (English preserved)\n"
1136
+ )
1137
+ diagnostics.append(WordDiagnostic(
1138
+ step_index=t,
1139
+ input_word=words[t],
1140
+ rule_output=rule_out,
1141
+ selected_candidate=eng_word,
1142
+ beam_score=beam[0][1] if beam else 0.0,
1143
+ candidate_breakdown=[],
1144
+ ))
1145
+ continue
1146
+
1147
  # Build left/right context pairs for multi-mask MLM scoring
1148
  batch_left: List[str] = []
1149
  batch_right: List[str] = []
 
1163
 
1164
  mlm_scores = self._batch_mlm_score(batch_left, batch_right, batch_tgt)
1165
 
1166
+ # ── Min-max normalise MLM to [0, 1] ─────────────────────
1167
+ mlm_min = min(mlm_scores) if mlm_scores else 0
1168
+ mlm_max = max(mlm_scores) if mlm_scores else 0
1169
+ mlm_range = mlm_max - mlm_min
1170
+ if mlm_range > 1e-9:
1171
+ mlm_scores = [(m - mlm_min) / mlm_range for m in mlm_scores]
1172
+ else:
1173
+ mlm_scores = [1.0] * len(mlm_scores)
1174
+
1175
  # ── MLM floor for English code-switching ─────────────────
1176
  # XLM-R is not calibrated for Singlish code-mixing: English
1177
  # tokens in Sinhala context receive disproportionately low
 
1190
 
1191
  # ── Score & trace ────────────────────────────────────────
1192
  next_beam: List[Tuple[List[str], float]] = []
1193
+ all_step_scores: List[Tuple[int, ScoredCandidate, float]] = []
1194
  step_log = f"**Step {t + 1}: `{words[t]}`** &nbsp;(rule β†’ `{rule_out}`)\n\n"
1195
 
1196
  for i, mlm in enumerate(mlm_scores):
 
1219
 
1220
  new_total = orig_score + scored.combined_score
1221
  next_beam.append((orig_path + [cand], new_total))
1222
+ all_step_scores.append((p_idx, scored, new_total))
1223
 
1224
  # Trace log (skip very low scores to reduce noise)
1225
  if mlm > -25.0:
 
1236
 
1237
  beam = sorted(next_beam, key=lambda x: x[1], reverse=True)[:beam_width]
1238
 
1239
+ # Capture diagnostics from the root beam path (p_idx=0) so each
1240
+ # step has a stable and comparable candidate distribution.
1241
+ root_scores = [item for item in all_step_scores if item[0] == 0]
1242
+ root_scores_sorted = sorted(root_scores, key=lambda x: x[2], reverse=True)
1243
+
1244
+ selected = beam[0][0][t] if beam and beam[0][0] else ""
1245
+ selected_total = beam[0][1] if beam else float("-inf")
1246
+ candidate_breakdown = [item[1] for item in root_scores_sorted]
1247
+
1248
+ diagnostics.append(WordDiagnostic(
1249
+ step_index=t,
1250
+ input_word=words[t],
1251
+ rule_output=rule_out,
1252
+ selected_candidate=selected,
1253
+ beam_score=selected_total,
1254
+ candidate_breakdown=candidate_breakdown,
1255
+ ))
1256
+
1257
  result = " ".join(beam[0][0]) if beam else ""
1258
+ return result, trace_logs, diagnostics