|
|
import re |
|
|
|
|
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
# Model identifier handed to the token-classification pipeline below.
SEGMENTATION_MODEL_NAME = "chuuhtetnaing/myanmar-text-segmentation-model"

# NOTE: the pipeline is built at import time, so importing this module loads
# the model. `aggregation_strategy="simple"` is the documented replacement for
# the deprecated `grouped_entities=True` flag and groups adjacent tokens that
# share a label into one item with an "entity_group" and a "word" field.
classifier = pipeline(
    "token-classification",
    model=SEGMENTATION_MODEL_NAME,
    aggregation_strategy="simple",
)
|
|
|
|
|
|
|
|
def reconstruct(tokens, labels):
    """Join tokens into one string, starting a new word at every "B" label.

    Tokens and labels are consumed pairwise (extra items on the longer side
    are ignored). A single space is inserted in front of each token labelled
    "B", except when it is the very first token; every other token is glued
    directly onto the text built so far.
    """
    pieces = []
    for position, (token, label) in enumerate(zip(tokens, labels)):
        begins_word = label == "B"
        # position > 0 means at least one token has already been emitted.
        if begins_word and position > 0:
            pieces.append(" ")
        pieces.append(token)
    return "".join(pieces)
|
|
|
|
|
|
|
|
def has_myanmar(text):
    """Return True if *text* contains a character in U+1000..U+109F."""
    found = re.search(r"[\u1000-\u109F]", text)
    return found is not None
|
|
|
|
|
|
|
|
def has_latin(text):
    """Return True if *text* contains an ASCII letter or an ASCII digit."""
    found = re.search(r"[a-zA-Z0-9]", text)
    return found is not None
|
|
|
|
|
|
|
|
# Character classes used to tag each character; they mirror the patterns used
# by has_myanmar() and has_latin().
_MYANMAR_CHAR = re.compile(r"[\u1000-\u109F]")
_LATIN_CHAR = re.compile(r"[a-zA-Z0-9]")
_OPENING_BRACKETS = frozenset("([{<")
_CLOSING_BRACKETS = frozenset(")]}>")


def _script_of(char):
    """Return "myanmar", "latin", or None for a single character."""
    if _MYANMAR_CHAR.match(char):
        return "myanmar"
    if _LATIN_CHAR.match(char):
        return "latin"
    return None


def split_myanmar_latin(chunk):
    """Split *chunk* wherever the text switches between Myanmar and Latin.

    Characters in U+1000..U+109F count as Myanmar; ASCII letters and digits
    count as Latin. Every other character is a symbol and is attached to a
    neighbouring run:

    - Opening brackets ``([{<`` attach to the NEXT letter's script.
    - Closing brackets ``)]}>`` attach to the PREVIOUS letter's script.
    - Other symbols attach to the NEXT letter's script, falling back to the
      PREVIOUS letter's script when no letter follows.

    A bracket with no letter on its preferred side (a closing bracket at the
    very start, an opening bracket at the very end) has no script of its own
    and simply stays in whatever run surrounds it.

    Returns a list of substrings whose concatenation is exactly *chunk*. If
    the chunk does not contain both scripts it is returned unsplit as
    ``[chunk]``.
    """
    scripts = [_script_of(char) for char in chunk]
    if "myanmar" not in scripts or "latin" not in scripts:
        return [chunk]

    size = len(chunk)

    # script_before[i]: script of the nearest letter strictly left of i.
    script_before = [None] * size
    seen = None
    for index, script in enumerate(scripts):
        script_before[index] = seen
        if script is not None:
            seen = script

    # script_after[i]: script of the nearest letter strictly right of i.
    script_after = [None] * size
    seen = None
    for index in range(size - 1, -1, -1):
        script_after[index] = seen
        if scripts[index] is not None:
            seen = scripts[index]

    assigned = []
    for char, script, before, after in zip(
        chunk, scripts, script_before, script_after
    ):
        if script is not None:
            assigned.append(script)
        elif char in _OPENING_BRACKETS:
            assigned.append(after)
        elif char in _CLOSING_BRACKETS:
            assigned.append(before)
        else:
            assigned.append(after if after is not None else before)

    # Cut the chunk at each change of assigned script. Characters without a
    # script never start a new run, so leading unattached brackets are kept
    # with the first run instead of being dropped.
    parts = []
    run_start = 0
    run_script = None
    for index, script in enumerate(assigned):
        if script is None:
            continue
        if run_script is None:
            run_script = script
        elif script != run_script:
            parts.append(chunk[run_start:index])
            run_start = index
            run_script = script
    parts.append(chunk[run_start:])

    return parts
|
|
|
|
|
|
|
|
def preprocess(text):
    """Tokenize *text* on whitespace, then split mixed-script chunks.

    Each whitespace-separated chunk is passed through
    split_myanmar_latin(), and the resulting pieces are returned as one flat
    list in their original order.
    """
    return [
        piece
        for chunk in text.split()
        for piece in split_myanmar_latin(chunk)
    ]
|
|
|
|
|
|
|
|
def segment(text):
    """Segment *text* with the module-level classifier and return the result.

    The classifier output is read as a sequence of items, each with an
    "entity_group" label and a "word" string. An item labelled "B" starts a
    new segment; any other item is appended to the current segment. Segments
    are joined with single spaces.

    If the first item is not labelled "B" there is no segment to extend yet,
    so it opens the first segment instead of raising IndexError.
    """
    segments = []
    for item in classifier(text):
        if item["entity_group"] == "B" or not segments:
            segments.append(item["word"])
        else:
            segments[-1] += item["word"]

    return " ".join(segments)
|
|
|