WWHO / router.py
thekusaldarshana's picture
Seperate Before you Compress
e51bea7
"""
==========================================
Code-Switching Router
==========================================
"""
from __future__ import annotations
import re
from dataclasses import dataclass
import tiktoken
@dataclass
class TextSegment:
text: str
language: str
has_leading_space: bool = False
# ---------------------------------------------------------------------------
# Segmenter
# ---------------------------------------------------------------------------
class CodeSwitchSegmenter:
def __init__(self, language_blocks: dict[str, list[tuple[int, int]]] = None):
"""
language_blocks: maps language name (e.g. 'sinhala') to a list of (start_cp, end_cp) inclusive
"""
self._ranges: list[tuple[int, int, str]] = []
if language_blocks:
for lang, blocks in language_blocks.items():
for start, end in blocks:
self._ranges.append((start, end, lang))
def _get_char_language(self, ch: str) -> Optional[str]:
if ch in ('\u200C', '\u200D'):
return "__joiner__"
cp = ord(ch)
for start, end, lang in self._ranges:
if start <= cp <= end:
return lang
return None
def segment(self, text: str) -> list[TextSegment]:
if not text:
return []
segments: list[TextSegment] = []
n = len(text)
pos = 0
while pos < n:
ch = text[pos]
ch_lang = self._get_char_language(ch)
is_indic_start = (ch_lang is not None)
if not is_indic_start:
# ─── 1. Accumulate Latin block ───
start = pos
while pos < n:
ch2 = text[pos]
lang2 = self._get_char_language(ch2)
if lang2 is not None and lang2 != "__joiner__":
break
pos += 1
latino_only = text[start:pos]
has_ls = False
if pos < n and latino_only.endswith(" "):
latino_only = latino_only[:-1]
has_ls = True
if latino_only:
segments.append(TextSegment(text=latino_only, language="latin"))
if has_ls and pos < n:
indic_start = pos
current_lang = self._get_char_language(text[pos])
if current_lang == "__joiner__" or current_lang is None:
current_lang = "__unknown__"
while pos < n:
c = text[pos]
c_lang = self._get_char_language(c)
if c_lang == "__joiner__":
pos += 1
elif c_lang is not None:
if current_lang == "__unknown__":
current_lang = c_lang
elif c_lang != current_lang:
break
pos += 1
else:
break
segments.append(TextSegment(
text=text[indic_start:pos],
language=current_lang,
has_leading_space=True
))
else:
indic_start = pos
current_lang = ch_lang
while pos < n:
c = text[pos]
c_lang = self._get_char_language(c)
if c_lang == "__joiner__":
pos += 1
elif c_lang is not None:
if c_lang != current_lang:
break
pos += 1
else:
break
segments.append(TextSegment(
text=text[indic_start:pos],
language=current_lang,
has_leading_space=False
))
return segments
# ---------------------------------------------------------------------------
# Self-test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
test_cases = [
# Pure Sinhala
"ශ්‍රී ලංකාව",
# Pure English
"Hello, world!",
# Mixed — English then Sinhala
"The capital is කොළඹ.",
# Mixed — Sinhala then English
"ලංකාව is beautiful.",
# Mixed — Devanagari
"Hello नमस्ते world",
# Code-switching with numbers
"2026 AI සහ machine learning",
# Boundary space edge-case
"GPT-4 ශ්‍රී ලංකා",
# Dense Sinhala
"ආචාර්යවරයාගේ වෛද්‍ය විද්‍යා පර්යේෂණය සාර්ථකයි.",
# Dense Devanagari
"विद्यालय में पढ़ाई होती है।",
# Multi-script sentence
"AI (Artificial Intelligence) සහ देवनागरी text.",
]
language_blocks = {
"sinhala": [(0x0d80, 0x0dff)],
"devanagari": [(0x0900, 0x097f)]
}
seg = CodeSwitchSegmenter(language_blocks)
for text in test_cases:
blocks = seg.segment(text)
print(f"\n Input : {text!r}")
print(f" Blocks : {[(b.text, b.language, b.has_leading_space) for b in blocks]}")