File size: 3,870 Bytes
8d7564e
 
975e2cd
 
8d7564e
1c1c7e0
8d7564e
f140aab
975e2cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d7564e
975e2cd
 
 
170531c
975e2cd
 
8d7564e
975e2cd
 
 
 
 
 
 
 
 
8d7564e
 
975e2cd
 
 
 
8d7564e
 
 
 
975e2cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d116b47
eb69d70
 
975e2cd
eb69d70
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import re

from transformers import pipeline


# Model identifier handed to the transformers pipeline factory below.
SEGMENTATION_MODEL_NAME = "chuuhtetnaing/myanmar-text-segmentation-model"

# Built once at import time so that segment() reuses a single pipeline instance.
# aggregation_strategy="simple" is the documented replacement for the deprecated
# grouped_entities=True flag; results still expose "entity_group" and "word".
classifier = pipeline(
    "token-classification",
    model=SEGMENTATION_MODEL_NAME,
    aggregation_strategy="simple",
)


def reconstruct(tokens, labels):
    """
    Join tokens into a single string using their B/I labels.

    A token labelled 'B' is preceded by one space unless it is the very first
    token; every other token is glued directly onto the preceding text.
    Tokens and labels are paired positionally; surplus items in the longer
    sequence are ignored.
    """

    def _pieces():
        for position, (piece, tag) in enumerate(zip(tokens, labels)):
            # position == 0 means nothing has been emitted yet, so no separator.
            if position and tag == "B":
                yield " "
            yield piece

    return "".join(_pieces())


def has_myanmar(text):
    """Return True when *text* contains at least one character in U+1000-U+109F."""
    found = re.search(r"[\u1000-\u109F]", text)
    return found is not None


def has_latin(text):
    """Return True when *text* contains at least one ASCII letter or digit."""
    found = re.search(r"[a-zA-Z0-9]", text)
    return found is not None


# Single-character script classifiers, compiled once and reused on every call.
_MYANMAR_CHAR_RE = re.compile(r"[\u1000-\u109F]")
_LATIN_CHAR_RE = re.compile(r"[a-zA-Z0-9]")
_OPENING_BRACKETS = frozenset("([{<")
_CLOSING_BRACKETS = frozenset(")]}>")


def _char_script(char):
    """Return "myanmar", "latin", or None (symbol) for a single character."""
    if _MYANMAR_CHAR_RE.match(char):
        return "myanmar"
    if _LATIN_CHAR_RE.match(char):
        return "latin"
    return None


def _nearest_script(char_scripts, indices):
    """Return the first non-None script met while walking *indices*, else None."""
    return next((char_scripts[j] for j in indices if char_scripts[j] is not None), None)


def _symbol_script(char, index, char_scripts):
    """Choose the script that the symbol at *index* is grouped with, or None."""
    following = range(index + 1, len(char_scripts))
    preceding = range(index - 1, -1, -1)
    if char in _OPENING_BRACKETS:
        return _nearest_script(char_scripts, following)
    if char in _CLOSING_BRACKETS:
        return _nearest_script(char_scripts, preceding)
    # Any other symbol prefers the letter after it and falls back to the one before.
    script = _nearest_script(char_scripts, following)
    if script is None:
        script = _nearest_script(char_scripts, preceding)
    return script


def split_myanmar_latin(chunk):
    """
    Split chunk at Myanmar/Latin boundaries.
    - Opening brackets attach to NEXT letter's script
    - Closing brackets attach to PREVIOUS letter's script
    - Other symbols attach to NEXT letter's script, falling back to the
      PREVIOUS letter's script when no letter follows

    A chunk that does not contain both scripts is returned unchanged as a
    one-element list. Symbols that end up without a script (an opening
    bracket with no letter after it, a closing bracket with no letter before
    it) stay in whichever part is being built, so joining the returned parts
    always reproduces the original chunk.
    """
    if not (_MYANMAR_CHAR_RE.search(chunk) and _LATIN_CHAR_RE.search(chunk)):
        return [chunk]

    # First pass: raw script of every character (None marks a symbol).
    char_scripts = [_char_script(char) for char in chunk]

    # Second pass: letters keep their script, symbols borrow a neighbour's.
    assigned_scripts = [
        script if script is not None else _symbol_script(char, i, char_scripts)
        for i, (char, script) in enumerate(zip(chunk, char_scripts))
    ]

    # Third pass: group consecutive characters that share a script.
    result = []
    current = []
    current_script = None
    for char, script in zip(chunk, assigned_scripts):
        if current_script is None or script is None or script == current_script:
            # Accumulate rather than overwrite, so leading script-less symbols
            # (e.g. a closing bracket at the very start) are not lost.
            current.append(char)
            if current_script is None:
                current_script = script
        else:
            result.append("".join(current))
            current = [char]
            current_script = script

    if current:
        result.append("".join(current))

    return result


def preprocess(text):
    """
    Break text into script-homogeneous tokens.

    The text is split on whitespace and each resulting chunk is further split
    by split_myanmar_latin; the parts are returned as one flat list in their
    original order.
    """
    return [piece for chunk in text.split() for piece in split_myanmar_latin(chunk)]


def segment(text):
    """
    Segment text into space-separated words using the module-level classifier.

    Each group returned by the classifier is read through its "entity_group"
    and "word" keys: a "B" group starts a new word, and any other group is
    appended to the word currently being built.

    Returns the words joined by single spaces (an empty string when the
    classifier returns no groups).
    """
    words = []

    for item in classifier(text):
        # Also start a new word when nothing has been collected yet, so that a
        # leading non-"B" group cannot index into an empty list.
        if item["entity_group"] == "B" or not words:
            words.append(item["word"])
        else:  # 'I' - append to previous word
            words[-1] += item["word"]

    return " ".join(words)