File size: 5,749 Bytes
fa9c240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e51bea7
 
fa9c240
 
 
 
 
 
 
e59ea28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa9c240
 
 
 
 
 
 
 
 
 
e59ea28
fa9c240
e59ea28
fa9c240
 
 
 
 
 
e59ea28
 
e51bea7
fa9c240
 
e59ea28
fa9c240
 
e59ea28
 
fa9c240
 
e59ea28
 
fa9c240
 
 
e59ea28
 
e51bea7
fa9c240
 
 
e59ea28
 
fa9c240
e59ea28
 
e51bea7
e59ea28
fa9c240
 
 
 
 
 
 
e59ea28
fa9c240
 
 
 
e59ea28
fa9c240
 
 
e59ea28
 
fa9c240
e59ea28
 
fa9c240
 
 
 
 
 
 
e59ea28
fa9c240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e59ea28
 
 
 
 
 
fa9c240
 
 
e59ea28
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
==========================================
Code-Switching Router
==========================================
"""

from __future__ import annotations

import re
from dataclasses import dataclass

import tiktoken

@dataclass
class TextSegment:
    """A contiguous slice of input text tagged with a single language label."""

    # The characters belonging to this segment.
    text: str
    # Label assigned by CodeSwitchSegmenter.segment, e.g. "latin" or one of
    # the names supplied in the segmenter's language_blocks mapping.
    language: str
    # Set to True by the segmenter when the single space that preceded this
    # segment was stripped from the end of the previous "latin" segment.
    has_leading_space: bool = False


# ---------------------------------------------------------------------------
# Segmenter
# ---------------------------------------------------------------------------

class CodeSwitchSegmenter:
    """Split mixed-script text into consecutive single-language segments.

    Characters whose code point falls inside one of the configured blocks
    are labelled with that block's language name.  Every other character
    (ASCII letters, digits, punctuation, whitespace, unconfigured scripts)
    is grouped into runs labelled ``"latin"``.
    """

    # Internal marker returned for ZWNJ / ZWJ.  Joiners never start or end a
    # script run on their own and this marker is never used as a segment
    # language.
    _JOINER = "__joiner__"

    def __init__(self, language_blocks: dict[str, list[tuple[int, int]]] | None = None):
        """
        language_blocks: maps language name (e.g. 'sinhala') to a list of
        (start_cp, end_cp) inclusive code point ranges.  ``None`` or an empty
        mapping configures no scripts, so all text is treated as "latin".
        """
        # Flattened (start, end, language) triples, scanned in insertion order.
        self._ranges: list[tuple[int, int, str]] = []
        if language_blocks:
            for lang, blocks in language_blocks.items():
                for start, end in blocks:
                    self._ranges.append((start, end, lang))

    def _get_char_language(self, ch: str) -> str | None:
        """Classify a single character.

        Returns the joiner marker for U+200C / U+200D, the language name of
        the first configured range containing the character, or ``None``
        when the character belongs to no configured block.
        """
        if ch in ('\u200C', '\u200D'):
            return self._JOINER
        cp = ord(ch)
        for start, end, lang in self._ranges:
            if start <= cp <= end:
                return lang
        return None

    def _is_script_language(self, lang: str | None) -> bool:
        """Return True when *lang* names a configured script (not None / joiner)."""
        return lang is not None and lang != self._JOINER

    def _scan_latin_run(self, text: str, pos: int) -> int:
        """Return the index of the first configured-script character at or after *pos*.

        Joiners do not terminate the run; ``len(text)`` is returned when no
        script character follows.
        """
        n = len(text)
        while pos < n and not self._is_script_language(self._get_char_language(text[pos])):
            pos += 1
        return pos

    def _scan_script_run(self, text: str, pos: int, lang: str) -> int:
        """Return the index just past the run of *lang* characters starting at *pos*.

        Joiners inside (or trailing) the run are kept with it; the run stops
        at the first character of another script or of no configured script.
        """
        n = len(text)
        while pos < n:
            c_lang = self._get_char_language(text[pos])
            if c_lang != self._JOINER and c_lang != lang:
                break
            pos += 1
        return pos

    def segment(self, text: str) -> list[TextSegment]:
        """Split *text* into an ordered list of TextSegment objects.

        Rules:
        * A run of characters from one configured script becomes a segment
          labelled with that script's language name.
        * Everything else becomes a ``"latin"`` segment.  A joiner that is
          not inside a script run (including one at the very start of the
          text) stays in the surrounding ``"latin"`` run.
        * When a ``"latin"`` run ends with a space directly before a script
          run, that one space is removed from the ``"latin"`` segment and the
          script segment is flagged ``has_leading_space=True``.  A run that
          consisted only of that space produces no ``"latin"`` segment.

        Returns an empty list for empty input.
        """
        if not text:
            return []

        segments: list[TextSegment] = []
        n = len(text)
        pos = 0

        while pos < n:
            lang = self._get_char_language(text[pos])

            if self._is_script_language(lang):
                # Script run that directly follows another script run or
                # non-space Latin text, or that opens the input.
                end = self._scan_script_run(text, pos, lang)
                segments.append(TextSegment(
                    text=text[pos:end],
                    language=lang,
                    has_leading_space=False
                ))
                pos = end
                continue

            # Latin run: the first character is non-script (or a joiner), so
            # the scan always advances by at least one character.
            start = pos
            pos = self._scan_latin_run(text, pos)
            latin_text = text[start:pos]

            # Hand the boundary space over to the following script segment.
            space_moved = pos < n and latin_text.endswith(" ")
            if space_moved:
                latin_text = latin_text[:-1]

            if latin_text:
                segments.append(TextSegment(text=latin_text, language="latin"))

            if space_moved:
                # The Latin scan stopped at pos, so text[pos] is guaranteed to
                # be a configured-script character here.
                script_lang = self._get_char_language(text[pos])
                end = self._scan_script_run(text, pos, script_lang)
                segments.append(TextSegment(
                    text=text[pos:end],
                    language=script_lang,
                    has_leading_space=True
                ))
                pos = end

        return segments

# ---------------------------------------------------------------------------
# Self-test
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Demo driver: segment a handful of sample strings and print the result.
    # Only the Sinhala and Devanagari Unicode blocks are configured here.
    language_blocks = {
        "sinhala": [(0x0d80, 0x0dff)],
        "devanagari": [(0x0900, 0x097f)]
    }
    segmenter = CodeSwitchSegmenter(language_blocks)

    samples = (
        "ශ්‍රී ලංකාව",                                      # Sinhala only
        "Hello, world!",                                    # English only
        "The capital is කොළඹ.",                             # English -> Sinhala
        "ලංකාව is beautiful.",                              # Sinhala -> English
        "Hello नमस्ते world",                                # English around Devanagari
        "2026 AI සහ machine learning",                      # digits in the mix
        "GPT-4 ශ්‍රී ලංකා",                                 # space at the script boundary
        "ආචාර්යවරයාගේ වෛද්‍ය විද්‍යා පර්යේෂණය සාර්ථකයි.",  # long Sinhala sentence
        "विद्यालय में पढ़ाई होती है।",                         # long Devanagari sentence
        "AI (Artificial Intelligence) සහ देवनागरी text.",    # three scripts at once
    )

    for text in samples:
        summary = []
        for piece in segmenter.segment(text):
            summary.append((piece.text, piece.language, piece.has_leading_space))
        print(f"\n  Input  : {text!r}")
        print(f"  Blocks : {summary}")