Kalana committed on
Commit
a31933e
·
1 Parent(s): f911765

Add Sinhala script input detection — passthrough unchanged text (FR8)

Browse files
Files changed (1) hide show
  1. core/decoder.py +73 -0
core/decoder.py CHANGED
@@ -3,6 +3,7 @@ Beam search and greedy decoders for Singlish → Sinhala transliteration.
3
  """
4
 
5
  import math
 
6
  import torch
7
  import pickle
8
  import logging
@@ -22,6 +23,14 @@ from core.dictionary import DictionaryAdapter
22
 
23
  logger = logging.getLogger(__name__)
24
 
 
 
 
 
 
 
 
 
25
 
26
  class BeamSearchDecoder:
27
  """
@@ -210,6 +219,20 @@ class BeamSearchDecoder:
210
  "dict_flags": [False],
211
  "prefix": prefix,
212
  "suffix": suffix,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  })
214
  continue
215
 
@@ -242,6 +265,7 @@ class BeamSearchDecoder:
242
  "dict_flags": dict_flags[:MAX_CANDIDATES],
243
  "prefix": prefix,
244
  "suffix": suffix,
 
245
  })
246
 
247
  # Build right-side stable context (rule outputs for future words)
@@ -268,6 +292,23 @@ class BeamSearchDecoder:
268
  suffix = info.get("suffix", "")
269
  total_cands = len(candidates)
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  # ── Common-word shortcut ─────────────────────────────────
272
  core_lower = words[t].lower().strip()
273
  if core_lower in COMMON_WORDS:
@@ -464,6 +505,19 @@ class BeamSearchDecoder:
464
  "english_flags": [False],
465
  "prefix": prefix,
466
  "suffix": suffix,
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  })
468
  continue
469
 
@@ -495,6 +549,7 @@ class BeamSearchDecoder:
495
  "dict_flags": dict_flags[:MAX_CANDIDATES],
496
  "prefix": prefix,
497
  "suffix": suffix,
 
498
  })
499
 
500
  # Build stable context (fixed for all beam paths)
@@ -521,6 +576,24 @@ class BeamSearchDecoder:
521
  suffix = info.get("suffix", "")
522
  total_cands = len(candidates)
523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  # ── Common-word shortcut ─────────────────────────────────
525
  core_lower = words[t].lower().strip()
526
  if core_lower in COMMON_WORDS:
 
3
  """
4
 
5
  import math
6
+ import re
7
  import torch
8
  import pickle
9
  import logging
 
23
 
24
  logger = logging.getLogger(__name__)
25
 
26
+ # Sinhala Unicode block: U+0D80 – U+0DFF
27
+ _SINHALA_RE = re.compile(r"[\u0D80-\u0DFF]")
28
+
29
+
30
+ def _is_sinhala(text: str) -> bool:
31
+ """Return True if the text already contains Sinhala script characters."""
32
+ return bool(_SINHALA_RE.search(text))
33
+
34
 
35
  class BeamSearchDecoder:
36
  """
 
219
  "dict_flags": [False],
220
  "prefix": prefix,
221
  "suffix": suffix,
222
+ "sinhala_passthrough": False,
223
+ })
224
+ continue
225
+
226
+ # Already-Sinhala text: pass through unchanged
227
+ if _is_sinhala(core):
228
+ word_infos.append({
229
+ "candidates": [raw],
230
+ "rule_output": raw,
231
+ "english_flags": [False],
232
+ "dict_flags": [False],
233
+ "prefix": prefix,
234
+ "suffix": suffix,
235
+ "sinhala_passthrough": True,
236
  })
237
  continue
238
 
 
265
  "dict_flags": dict_flags[:MAX_CANDIDATES],
266
  "prefix": prefix,
267
  "suffix": suffix,
268
+ "sinhala_passthrough": False,
269
  })
270
 
271
  # Build right-side stable context (rule outputs for future words)
 
292
  suffix = info.get("suffix", "")
293
  total_cands = len(candidates)
294
 
295
+ # ── Sinhala passthrough ────────────────────────────────────
296
+ if info.get("sinhala_passthrough"):
297
+ selected_words.append(words[t])
298
+ trace_logs.append(
299
+ f"**Step {t + 1}: `{words[t]}`**  → "
300
+ f"`{words[t]}` (Sinhala passthrough)\n"
301
+ )
302
+ diagnostics.append(WordDiagnostic(
303
+ step_index=t,
304
+ input_word=words[t],
305
+ rule_output=rule_out,
306
+ selected_candidate=words[t],
307
+ beam_score=0.0,
308
+ candidate_breakdown=[],
309
+ ))
310
+ continue
311
+
312
  # ── Common-word shortcut ─────────────────────────────────
313
  core_lower = words[t].lower().strip()
314
  if core_lower in COMMON_WORDS:
 
505
  "english_flags": [False],
506
  "prefix": prefix,
507
  "suffix": suffix,
508
+ "sinhala_passthrough": False,
509
+ })
510
+ continue
511
+
512
+ # Already-Sinhala text: pass through unchanged
513
+ if _is_sinhala(core):
514
+ word_infos.append({
515
+ "candidates": [raw],
516
+ "rule_output": raw,
517
+ "english_flags": [False],
518
+ "prefix": prefix,
519
+ "suffix": suffix,
520
+ "sinhala_passthrough": True,
521
  })
522
  continue
523
 
 
549
  "dict_flags": dict_flags[:MAX_CANDIDATES],
550
  "prefix": prefix,
551
  "suffix": suffix,
552
+ "sinhala_passthrough": False,
553
  })
554
 
555
  # Build stable context (fixed for all beam paths)
 
576
  suffix = info.get("suffix", "")
577
  total_cands = len(candidates)
578
 
579
+ # ── Sinhala passthrough ────────────────────────────────────
580
+ if info.get("sinhala_passthrough"):
581
+ next_beam_si = [(path + [words[t]], sc) for path, sc in beam]
582
+ beam = next_beam_si[:beam_width]
583
+ trace_logs.append(
584
+ f"**Step {t + 1}: `{words[t]}`**  → "
585
+ f"`{words[t]}` (Sinhala passthrough)\n"
586
+ )
587
+ diagnostics.append(WordDiagnostic(
588
+ step_index=t,
589
+ input_word=words[t],
590
+ rule_output=rule_out,
591
+ selected_candidate=words[t],
592
+ beam_score=beam[0][1] if beam else 0.0,
593
+ candidate_breakdown=[],
594
+ ))
595
+ continue
596
+
597
  # ── Common-word shortcut ─────────────────────────────────
598
  core_lower = words[t].lower().strip()
599
  if core_lower in COMMON_WORDS: