| | """ |
| | ========================================== |
| | Code-Switching Router |
| | ========================================== |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | import re |
| | from dataclasses import dataclass |
| |
|
| | import tiktoken |
| |
|
@dataclass
class TextSegment:
    """A contiguous slice of the input text tagged with a language label."""

    # The characters of this run.
    text: str
    # Label of the run: a key of the segmenter's ``language_blocks`` mapping,
    # or "latin" for text that matches none of the configured ranges.
    language: str
    # True when the segmenter removed a single space directly in front of this
    # run (the space is not present in any segment's ``text``).
    has_leading_space: bool = False
| |
|
| |
|
| | |
| | |
| | |
| |
|
class CodeSwitchSegmenter:
    """Split mixed-script text into runs that each belong to one language.

    Characters are classified by code point against the configured Unicode
    ranges. Anything outside every range is treated as generic text and is
    labelled "latin". Zero-width joiners (U+200C / U+200D) never end a run:
    they stay attached to whichever run is currently being scanned.
    """

    # Internal classification for ZWNJ / ZWJ; never used as a segment language.
    _JOINER = "__joiner__"
    # Label given to text that matches none of the configured ranges.
    _DEFAULT_LANGUAGE = "latin"

    def __init__(self, language_blocks: dict[str, list[tuple[int, int]]] | None = None):
        """
        language_blocks: maps language name (e.g. 'sinhala') to a list of
        (start_cp, end_cp) code point ranges, both ends inclusive. ``None`` or
        an empty mapping means every character is treated as generic text.
        """
        # Flattened to (start, end, language) so lookup is a single linear scan;
        # when ranges overlap, the first one listed wins.
        self._ranges: list[tuple[int, int, str]] = [
            (start, end, lang)
            for lang, blocks in (language_blocks or {}).items()
            for start, end in blocks
        ]

    def _get_char_language(self, ch: str) -> str | None:
        """Classify a single character.

        Returns "__joiner__" for ZWNJ/ZWJ, the language name of the first
        configured range containing the character, or None when no range
        matches.
        """
        if ch in ("\u200C", "\u200D"):
            return self._JOINER
        cp = ord(ch)
        for start, end, lang in self._ranges:
            if start <= cp <= end:
                return lang
        return None

    def _script_run_end(self, text: str, start: int, lang: str | None) -> int:
        """Return the index just past the run of ``lang`` beginning at ``start``.

        The run continues over characters of ``lang`` and over joiners, and
        stops at the first unclassified character or character of another
        language.
        """
        pos = start
        n = len(text)
        while pos < n:
            c_lang = self._get_char_language(text[pos])
            if c_lang != lang and c_lang != self._JOINER:
                break
            pos += 1
        return pos

    def segment(self, text: str) -> list[TextSegment]:
        """Split ``text`` into consecutive language-tagged segments.

        Rules:
          * A run of characters from one configured language becomes one
            segment labelled with that language.
          * Everything else (including joiners that are not inside a script
            run) is collected into "latin" segments.
          * When a "latin" run ends in a space and is immediately followed by
            a script run, that one space is dropped from the "latin" text and
            the script segment gets ``has_leading_space=True``. If nothing is
            left of the "latin" run after dropping the space, no "latin"
            segment is emitted.

        Returns an empty list for empty input.
        """
        if not text:
            return []

        segments: list[TextSegment] = []
        n = len(text)
        pos = 0

        while pos < n:
            lang = self._get_char_language(text[pos])

            # A joiner cannot open a script run: it has no language of its own,
            # so it is handled by the generic branch below instead of leaking
            # the internal "__joiner__" marker as a segment language.
            if lang is not None and lang != self._JOINER:
                end = self._script_run_end(text, pos, lang)
                segments.append(TextSegment(
                    text=text[pos:end],
                    language=lang,
                    has_leading_space=False,
                ))
                pos = end
                continue

            # Generic run: advance until the first character of a configured
            # language (joiners are swallowed along the way).
            start = pos
            while pos < n and self._get_char_language(text[pos]) in (None, self._JOINER):
                pos += 1
            filler = text[start:pos]

            # Only strip the space when a script run actually follows.
            leading_space = pos < n and filler.endswith(" ")
            if leading_space:
                filler = filler[:-1]

            if filler:
                segments.append(TextSegment(text=filler, language=self._DEFAULT_LANGUAGE))

            if leading_space:
                # The scan above stopped before the end, so text[pos] is
                # guaranteed to belong to a configured language here.
                lang = self._get_char_language(text[pos])
                end = self._script_run_end(text, pos, lang)
                segments.append(TextSegment(
                    text=text[pos:end],
                    language=lang,
                    has_leading_space=True,
                ))
                pos = end
            # Without a stripped space the following script run is picked up
            # by the next loop iteration with has_leading_space=False.

        return segments
| |
|
| | |
| | |
| | |
| |
|
if __name__ == "__main__":
    # Demo: segment a handful of mixed-script sentences and print the result.

    # Unicode blocks recognised by the demo segmenter.
    language_blocks = {
        "sinhala": [(0x0d80, 0x0dff)],
        "devanagari": [(0x0900, 0x097f)]
    }
    seg = CodeSwitchSegmenter(language_blocks)

    test_cases = [
        # Sinhala only, two words
        "ශ්රී ලංකාව",
        # Latin only
        "Hello, world!",
        # Latin sentence ending in a Sinhala word
        "The capital is කොළඹ.",
        # Sinhala word followed by Latin
        "ලංකාව is beautiful.",
        # Devanagari embedded in Latin
        "Hello नमस्ते world",
        # Digits, Latin and Sinhala
        "2026 AI සහ machine learning",
        # Latin with punctuation/digits followed by Sinhala
        "GPT-4 ශ්රී ලංකා",
        # Longer Sinhala sentence
        "ආචාර්යවරයාගේ වෛද්ය විද්යා පර්යේෂණය සාර්ථකයි.",
        # Devanagari sentence
        "विद्यालय में पढ़ाई होती है।",
        # Latin, Sinhala and Devanagari together
        "AI (Artificial Intelligence) සහ देवनागरी text.",
    ]

    for text in test_cases:
        blocks = seg.segment(text)
        print(f"\n Input : {text!r}")
        print(f" Blocks : {[(b.text, b.language, b.has_leading_space) for b in blocks]}")
| |
|