Add Sinhala script input detection — pass through already-Sinhala text unchanged (FR8)
Browse files- core/decoder.py +73 -0
core/decoder.py
CHANGED
|
@@ -3,6 +3,7 @@ Beam search and greedy decoders for Singlish → Sinhala transliteration.
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import math
|
|
|
|
| 6 |
import torch
|
| 7 |
import pickle
|
| 8 |
import logging
|
|
@@ -22,6 +23,14 @@ from core.dictionary import DictionaryAdapter
|
|
| 22 |
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
class BeamSearchDecoder:
|
| 27 |
"""
|
|
@@ -210,6 +219,20 @@ class BeamSearchDecoder:
|
|
| 210 |
"dict_flags": [False],
|
| 211 |
"prefix": prefix,
|
| 212 |
"suffix": suffix,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
})
|
| 214 |
continue
|
| 215 |
|
|
@@ -242,6 +265,7 @@ class BeamSearchDecoder:
|
|
| 242 |
"dict_flags": dict_flags[:MAX_CANDIDATES],
|
| 243 |
"prefix": prefix,
|
| 244 |
"suffix": suffix,
|
|
|
|
| 245 |
})
|
| 246 |
|
| 247 |
# Build right-side stable context (rule outputs for future words)
|
|
@@ -268,6 +292,23 @@ class BeamSearchDecoder:
|
|
| 268 |
suffix = info.get("suffix", "")
|
| 269 |
total_cands = len(candidates)
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
# ── Common-word shortcut ─────────────────────────────────
|
| 272 |
core_lower = words[t].lower().strip()
|
| 273 |
if core_lower in COMMON_WORDS:
|
|
@@ -464,6 +505,19 @@ class BeamSearchDecoder:
|
|
| 464 |
"english_flags": [False],
|
| 465 |
"prefix": prefix,
|
| 466 |
"suffix": suffix,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
})
|
| 468 |
continue
|
| 469 |
|
|
@@ -495,6 +549,7 @@ class BeamSearchDecoder:
|
|
| 495 |
"dict_flags": dict_flags[:MAX_CANDIDATES],
|
| 496 |
"prefix": prefix,
|
| 497 |
"suffix": suffix,
|
|
|
|
| 498 |
})
|
| 499 |
|
| 500 |
# Build stable context (fixed for all beam paths)
|
|
@@ -521,6 +576,24 @@ class BeamSearchDecoder:
|
|
| 521 |
suffix = info.get("suffix", "")
|
| 522 |
total_cands = len(candidates)
|
| 523 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
# ── Common-word shortcut ─────────────────────────────────
|
| 525 |
core_lower = words[t].lower().strip()
|
| 526 |
if core_lower in COMMON_WORDS:
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import math
|
| 6 |
+
import re
|
| 7 |
import torch
|
| 8 |
import pickle
|
| 9 |
import logging
|
|
|
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
| 26 |
+
# Sinhala Unicode block: U+0D80 – U+0DFF
|
| 27 |
+
_SINHALA_RE = re.compile(r"[\u0D80-\u0DFF]")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _is_sinhala(text: str) -> bool:
|
| 31 |
+
"""Return True if the text already contains Sinhala script characters."""
|
| 32 |
+
return bool(_SINHALA_RE.search(text))
|
| 33 |
+
|
| 34 |
|
| 35 |
class BeamSearchDecoder:
|
| 36 |
"""
|
|
|
|
| 219 |
"dict_flags": [False],
|
| 220 |
"prefix": prefix,
|
| 221 |
"suffix": suffix,
|
| 222 |
+
"sinhala_passthrough": False,
|
| 223 |
+
})
|
| 224 |
+
continue
|
| 225 |
+
|
| 226 |
+
# Already-Sinhala text: pass through unchanged
|
| 227 |
+
if _is_sinhala(core):
|
| 228 |
+
word_infos.append({
|
| 229 |
+
"candidates": [raw],
|
| 230 |
+
"rule_output": raw,
|
| 231 |
+
"english_flags": [False],
|
| 232 |
+
"dict_flags": [False],
|
| 233 |
+
"prefix": prefix,
|
| 234 |
+
"suffix": suffix,
|
| 235 |
+
"sinhala_passthrough": True,
|
| 236 |
})
|
| 237 |
continue
|
| 238 |
|
|
|
|
| 265 |
"dict_flags": dict_flags[:MAX_CANDIDATES],
|
| 266 |
"prefix": prefix,
|
| 267 |
"suffix": suffix,
|
| 268 |
+
"sinhala_passthrough": False,
|
| 269 |
})
|
| 270 |
|
| 271 |
# Build right-side stable context (rule outputs for future words)
|
|
|
|
| 292 |
suffix = info.get("suffix", "")
|
| 293 |
total_cands = len(candidates)
|
| 294 |
|
| 295 |
+
# ── Sinhala passthrough ────────────────────────────────────
|
| 296 |
+
if info.get("sinhala_passthrough"):
|
| 297 |
+
selected_words.append(words[t])
|
| 298 |
+
trace_logs.append(
|
| 299 |
+
f"**Step {t + 1}: `{words[t]}`** → "
|
| 300 |
+
f"`{words[t]}` (Sinhala passthrough)\n"
|
| 301 |
+
)
|
| 302 |
+
diagnostics.append(WordDiagnostic(
|
| 303 |
+
step_index=t,
|
| 304 |
+
input_word=words[t],
|
| 305 |
+
rule_output=rule_out,
|
| 306 |
+
selected_candidate=words[t],
|
| 307 |
+
beam_score=0.0,
|
| 308 |
+
candidate_breakdown=[],
|
| 309 |
+
))
|
| 310 |
+
continue
|
| 311 |
+
|
| 312 |
# ── Common-word shortcut ─────────────────────────────────
|
| 313 |
core_lower = words[t].lower().strip()
|
| 314 |
if core_lower in COMMON_WORDS:
|
|
|
|
| 505 |
"english_flags": [False],
|
| 506 |
"prefix": prefix,
|
| 507 |
"suffix": suffix,
|
| 508 |
+
"sinhala_passthrough": False,
|
| 509 |
+
})
|
| 510 |
+
continue
|
| 511 |
+
|
| 512 |
+
# Already-Sinhala text: pass through unchanged
|
| 513 |
+
if _is_sinhala(core):
|
| 514 |
+
word_infos.append({
|
| 515 |
+
"candidates": [raw],
|
| 516 |
+
"rule_output": raw,
|
| 517 |
+
"english_flags": [False],
|
| 518 |
+
"prefix": prefix,
|
| 519 |
+
"suffix": suffix,
|
| 520 |
+
"sinhala_passthrough": True,
|
| 521 |
})
|
| 522 |
continue
|
| 523 |
|
|
|
|
| 549 |
"dict_flags": dict_flags[:MAX_CANDIDATES],
|
| 550 |
"prefix": prefix,
|
| 551 |
"suffix": suffix,
|
| 552 |
+
"sinhala_passthrough": False,
|
| 553 |
})
|
| 554 |
|
| 555 |
# Build stable context (fixed for all beam paths)
|
|
|
|
| 576 |
suffix = info.get("suffix", "")
|
| 577 |
total_cands = len(candidates)
|
| 578 |
|
| 579 |
+
# ── Sinhala passthrough ────────────────────────────────────
|
| 580 |
+
if info.get("sinhala_passthrough"):
|
| 581 |
+
next_beam_si = [(path + [words[t]], sc) for path, sc in beam]
|
| 582 |
+
beam = next_beam_si[:beam_width]
|
| 583 |
+
trace_logs.append(
|
| 584 |
+
f"**Step {t + 1}: `{words[t]}`** → "
|
| 585 |
+
f"`{words[t]}` (Sinhala passthrough)\n"
|
| 586 |
+
)
|
| 587 |
+
diagnostics.append(WordDiagnostic(
|
| 588 |
+
step_index=t,
|
| 589 |
+
input_word=words[t],
|
| 590 |
+
rule_output=rule_out,
|
| 591 |
+
selected_candidate=words[t],
|
| 592 |
+
beam_score=beam[0][1] if beam else 0.0,
|
| 593 |
+
candidate_breakdown=[],
|
| 594 |
+
))
|
| 595 |
+
continue
|
| 596 |
+
|
| 597 |
# ── Common-word shortcut ─────────────────────────────────
|
| 598 |
core_lower = words[t].lower().strip()
|
| 599 |
if core_lower in COMMON_WORDS:
|