"""
==========================================
Code-Switching Router
==========================================
"""
from __future__ import annotations
import re
from dataclasses import dataclass
import tiktoken
@dataclass
class TextSegment:
    """One contiguous run of text that belongs to a single language.

    Attributes:
        text: The characters of the run. A boundary space that was folded
            into ``has_leading_space`` is not included.
        language: ``"latin"`` for text outside every configured block,
            otherwise the language name given to the segmenter.
        has_leading_space: True when exactly one space directly preceding the
            run was removed from the previous run and is represented by this
            flag instead.
    """

    text: str
    language: str
    has_leading_space: bool = False


# ---------------------------------------------------------------------------
# Segmenter
# ---------------------------------------------------------------------------
class CodeSwitchSegmenter:
    """Split mixed-script text into single-language segments.

    A character whose code point lies inside a configured block is labelled
    with that block's language; every other character is labelled
    ``"latin"``. ZWNJ (U+200C) and ZWJ (U+200D) never cause a language switch:
    they are absorbed into whichever run they occur in.

    When a default-language run ends in a space and is followed by a script
    run, that single space is dropped from the default run and recorded on the
    script segment as ``has_leading_space=True``.
    """

    # Tag reported by _get_char_language for ZWNJ/ZWJ. Internal only: it must
    # never be emitted as the language of a segment.
    _JOINER_TAG = "__joiner__"
    # Language assigned to every character outside the configured blocks.
    _DEFAULT_LANGUAGE = "latin"

    def __init__(
        self,
        language_blocks: dict[str, list[tuple[int, int]]] | None = None,
    ):
        """
        language_blocks: maps language name (e.g. 'sinhala') to a list of
        (start_cp, end_cp) code-point ranges, both ends inclusive. With no
        blocks configured, all text is treated as default-language text.
        """
        # Flattened (start, end, language) triples; scanned in insertion
        # order, so the first matching range wins if ranges overlap.
        self._ranges: list[tuple[int, int, str]] = []
        if language_blocks:
            for lang, blocks in language_blocks.items():
                for start, end in blocks:
                    self._ranges.append((start, end, lang))

    def _get_char_language(self, ch: str) -> str | None:
        """Classify a single character.

        Returns ``"__joiner__"`` for ZWNJ/ZWJ, the configured language name
        when the code point falls inside one of its ranges, else ``None``.
        """
        if ch in ("\u200C", "\u200D"):
            return self._JOINER_TAG
        cp = ord(ch)
        for start, end, lang in self._ranges:
            if start <= cp <= end:
                return lang
        return None

    def _script_of(self, ch: str) -> str | None:
        """Return the configured language of *ch*; None for default text and joiners."""
        tag = self._get_char_language(ch)
        return None if tag == self._JOINER_TAG else tag

    def _default_run_end(self, text: str, start: int) -> int:
        """Return the index just past the default-language run beginning at *start*."""
        pos = start
        n = len(text)
        while pos < n and self._script_of(text[pos]) is None:
            pos += 1
        return pos

    def _script_run_end(self, text: str, start: int, lang: str) -> int:
        """Return the index just past the *lang* run (joiners included) beginning at *start*."""
        pos = start
        n = len(text)
        continues_run = (lang, self._JOINER_TAG)
        while pos < n and self._get_char_language(text[pos]) in continues_run:
            pos += 1
        return pos

    def segment(self, text: str) -> list[TextSegment]:
        """Split *text* into an ordered list of single-language segments.

        Returns an empty list for empty input. Re-joining the segments, with
        one space inserted before each segment whose ``has_leading_space`` is
        True, reproduces *text* exactly.
        """
        if not text:
            return []

        segments: list[TextSegment] = []
        n = len(text)
        pos = 0
        while pos < n:
            leading_space = False

            if self._script_of(text[pos]) is None:
                # Default-language run. A joiner with no script character
                # before it (e.g. at the very start of the text) lands here
                # too, so the joiner tag is never used as a segment language.
                run_end = self._default_run_end(text, pos)
                chunk = text[pos:run_end]
                pos = run_end
                # Only hand the space over when a script run actually follows.
                if pos < n and chunk.endswith(" "):
                    chunk = chunk[:-1]
                    leading_space = True
                if chunk:
                    segments.append(
                        TextSegment(text=chunk, language=self._DEFAULT_LANGUAGE)
                    )
                if pos == n:
                    break

            # text[pos] is now guaranteed to be a configured-script character.
            lang = self._script_of(text[pos])
            run_end = self._script_run_end(text, pos, lang)
            segments.append(
                TextSegment(
                    text=text[pos:run_end],
                    language=lang,
                    has_leading_space=leading_space,
                )
            )
            pos = run_end

        return segments
# ---------------------------------------------------------------------------
# Self-test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Manual smoke test: segment a handful of representative inputs and print
    # the resulting (text, language, has_leading_space) triples for inspection.
    test_cases = [
        # Pure Sinhala
        "ශ්රී ලංකාව",
        # Pure English
        "Hello, world!",
        # Mixed — English then Sinhala
        "The capital is කොළඹ.",
        # Mixed — Sinhala then English
        "ලංකාව is beautiful.",
        # Mixed — Devanagari
        "Hello नमस्ते world",
        # Code-switching with numbers
        "2026 AI සහ machine learning",
        # Boundary space edge-case
        "GPT-4 ශ්රී ලංකා",
        # Dense Sinhala
        "ආචාර්යවරයාගේ වෛද්ය විද්යා පර්යේෂණය සාර්ථකයි.",
        # Dense Devanagari
        "विद्यालय में पढ़ाई होती है।",
        # Multi-script sentence
        "AI (Artificial Intelligence) සහ देवनागरी text.",
    ]
    # Inclusive code-point ranges, keyed by the language label to report.
    language_blocks = {
        "sinhala": [(0x0d80, 0x0dff)],
        "devanagari": [(0x0900, 0x097f)]
    }
    seg = CodeSwitchSegmenter(language_blocks)
    for text in test_cases:
        blocks = seg.segment(text)
        print(f"\n Input : {text!r}")
        print(f" Blocks : {[(b.text, b.language, b.has_leading_space) for b in blocks]}")