# Page header residue: chuuhtetnaing, commit f140aab ("fixed the typo").
import re
from transformers import pipeline
# Model identifier handed to the transformers pipeline below.
SEGMENTATION_MODEL_NAME = "chuuhtetnaing/myanmar-text-segmentation-model"
# Token-classification pipeline, constructed once when this module is imported
# and reused by segment(). grouped_entities=True requests grouped output, which
# segment() reads through the "entity_group" and "word" keys.
# NOTE(review): newer transformers releases deprecate `grouped_entities` in
# favour of `aggregation_strategy` - confirm against the pinned version.
classifier = pipeline("token-classification", model=SEGMENTATION_MODEL_NAME, grouped_entities=True)
def reconstruct(tokens, labels):
    """
    Join tokens into a single string using their B/I labels.

    A token labelled "B" begins a new word, so it is preceded by a space
    unless it is the very first token emitted. Tokens carrying any other
    label are glued directly onto the text before them. Tokens and labels
    are paired positionally; extra items in the longer sequence are ignored.
    """
    pieces = []
    for tok, tag in zip(tokens, labels):
        starts_new_word = tag == "B" and bool(pieces)
        pieces.append(" " + tok if starts_new_word else tok)
    return "".join(pieces)
def has_myanmar(text):
    """Return True if text contains at least one character in U+1000-U+109F."""
    found = re.search(r"[\u1000-\u109F]", text)
    return found is not None
def has_latin(text):
    """Return True if text contains at least one ASCII letter or digit."""
    found = re.search(r"[a-zA-Z0-9]", text)
    return found is not None
# Single-character classifiers; the ranges mirror has_myanmar() / has_latin().
_MYANMAR_CHAR = re.compile(r"[\u1000-\u109F]")
_LATIN_CHAR = re.compile(r"[a-zA-Z0-9]")
_OPENING_BRACKETS = frozenset("([{<")
_CLOSING_BRACKETS = frozenset(")]}>")


def _char_script(char):
    """Return "myanmar", "latin", or None (a symbol) for a single character."""
    if _MYANMAR_CHAR.match(char):
        return "myanmar"
    if _LATIN_CHAR.match(char):
        return "latin"
    return None


def _nearest_letter_scripts(scripts):
    """Return (previous, following): the nearest letter script on each side.

    previous[i] is the script of the closest letter strictly before index i,
    following[i] that of the closest letter strictly after it; either entry is
    None when there is no letter on that side.
    """
    count = len(scripts)
    previous = [None] * count
    following = [None] * count

    last_seen = None
    for i, script in enumerate(scripts):
        previous[i] = last_seen
        if script is not None:
            last_seen = script

    last_seen = None
    for i in range(count - 1, -1, -1):
        following[i] = last_seen
        if scripts[i] is not None:
            last_seen = scripts[i]

    return previous, following


def split_myanmar_latin(chunk):  # noqa: C901
    """
    Split chunk at Myanmar/Latin boundaries.

    - Opening brackets attach to NEXT letter's script
    - Closing brackets attach to PREVIOUS letter's script
    - Other symbols attach to NEXT letter's script, falling back to PREVIOUS

    A chunk that does not contain both Myanmar and Latin characters is
    returned unchanged as a one-element list. Symbols that cannot be attached
    to any letter (a closing bracket with no letter before it, an opening
    bracket with no letter after it) stay in the adjacent part, so joining
    the returned parts always reproduces chunk exactly.

    Returns a list of strings.
    """
    # First pass: script of every character (None marks a symbol).
    scripts = [_char_script(char) for char in chunk]
    if "myanmar" not in scripts or "latin" not in scripts:
        return [chunk]

    # Second pass: give each symbol the script of the appropriate neighbour.
    # The neighbour tables are built in two linear sweeps instead of
    # rescanning the chunk for every symbol.
    previous, following = _nearest_letter_scripts(scripts)
    assigned = []
    for i, (char, script) in enumerate(zip(chunk, scripts)):
        if script is not None:
            assigned.append(script)
        elif char in _OPENING_BRACKETS:
            assigned.append(following[i])
        elif char in _CLOSING_BRACKETS:
            assigned.append(previous[i])
        else:
            fallback = previous[i]
            assigned.append(following[i] if following[i] is not None else fallback)

    # Third pass: group consecutive characters that share a script. A part is
    # closed only when two *known* scripts differ; characters with no script
    # are accumulated rather than overwritten, so leading unattached symbols
    # are no longer lost.
    parts = []
    current = []
    current_script = None
    for char, script in zip(chunk, assigned):
        if None not in (script, current_script) and script != current_script:
            parts.append("".join(current))
            current = []
        current.append(char)
        if script is not None:
            current_script = script
    if current:
        parts.append("".join(current))
    return parts
def preprocess(text):
    """
    Tokenize text into whitespace-separated chunks split at script boundaries.

    The text is split on whitespace, every chunk is passed through
    split_myanmar_latin(), and the resulting parts are returned as one flat
    list in their original order.
    """
    return [
        piece
        for chunk in text.split()
        for piece in split_myanmar_latin(chunk)
    ]
def segment(text):
    """
    Segment text into space-separated words using the module-level classifier.

    Each item returned by the classifier is read through its "entity_group"
    and "word" keys. An item whose group is "B" starts a new word; any other
    item is appended to the word before it. If the very first item is not
    labelled "B" there is no previous word to extend, so it starts a new word
    instead of raising IndexError.

    Returns the words joined by single spaces ("" when the classifier yields
    no items).
    """
    words = []
    for item in classifier(text):
        if item["entity_group"] == "B" or not words:
            words.append(item["word"])
        else:  # 'I' - append to previous word
            words[-1] += item["word"]
    return " ".join(words)