"""Helpers for segmenting Myanmar text with a token-classification model.

The module loads a Hugging Face token-classification pipeline at import time
and exposes small utilities to split mixed Myanmar/Latin chunks, rebuild text
from B/I labelled tokens, and run the segmentation model on raw text.
"""

import re

from transformers import pipeline

SEGMENTATION_MODEL_NAME = "chuuhtetnaing/myanmar-text-segmentation-model"

# ``aggregation_strategy="simple"`` is the non-deprecated spelling of the old
# ``grouped_entities=True`` flag; both group adjacent tokens of the same entity.
classifier = pipeline(
    "token-classification",
    model=SEGMENTATION_MODEL_NAME,
    aggregation_strategy="simple",
)

# Compiled once so the per-character checks below do not re-resolve patterns.
_MYANMAR_RE = re.compile(r"[\u1000-\u109F]")
_LATIN_RE = re.compile(r"[a-zA-Z0-9]")

_OPENING_BRACKETS = frozenset("([{<")
_CLOSING_BRACKETS = frozenset(")]}>")


def reconstruct(tokens, labels):
    """Join *tokens* into one string, inserting a space before each "B" token.

    Tokens and labels are paired with ``zip``, so extra items in the longer
    sequence are ignored. No space is emitted before the very first token.

    Args:
        tokens: Iterable of string tokens.
        labels: Iterable of labels; only the value ``"B"`` is treated specially.

    Returns:
        The concatenated string.
    """
    pieces = []
    for token, label in zip(tokens, labels):
        # A "B" label starts a new word, except when nothing was emitted yet.
        if label == "B" and pieces:
            pieces.append(" ")
        pieces.append(token)
    return "".join(pieces)


def has_myanmar(text):
    """Return True if *text* contains a character in U+1000..U+109F."""
    return bool(_MYANMAR_RE.search(text))


def has_latin(text):
    """Return True if *text* contains an ASCII letter or digit."""
    return bool(_LATIN_RE.search(text))


def _script_of(char):
    """Classify one character as "myanmar", "latin", or None (symbol)."""
    if _MYANMAR_RE.match(char):
        return "myanmar"
    if _LATIN_RE.match(char):
        return "latin"
    return None


def _nearest_script(char_scripts, start, step):
    """Return the first non-None script scanning from *start* by *step*.

    ``step`` is +1 to look forward or -1 to look backward. Returns None when
    no letter exists in that direction.
    """
    stop = len(char_scripts) if step > 0 else -1
    for j in range(start, stop, step):
        if char_scripts[j] is not None:
            return char_scripts[j]
    return None


def split_myanmar_latin(chunk):  # noqa: C901
    """Split *chunk* at Myanmar/Latin script boundaries.

    Symbols (anything that is neither Myanmar nor ASCII alphanumeric) are
    attached to a neighbouring script:

    - Opening brackets attach to the NEXT letter's script.
    - Closing brackets attach to the PREVIOUS letter's script.
    - Other symbols attach to the NEXT letter's script, falling back to the
      PREVIOUS letter's script.

    A symbol that cannot be attached by these rules (for example a closing
    bracket at the very start) stays with the group it is adjacent to, so no
    character of *chunk* is ever dropped.

    Args:
        chunk: A string, typically one whitespace-delimited piece of text.

    Returns:
        A list of strings whose concatenation equals *chunk*. If *chunk* does
        not contain both scripts, the list is ``[chunk]``.
    """
    if not (has_myanmar(chunk) and has_latin(chunk)):
        return [chunk]

    # First pass: determine the script of each character (None for symbols).
    char_scripts = [_script_of(char) for char in chunk]

    # Second pass: give each symbol the script of the appropriate neighbour.
    assigned_scripts = list(char_scripts)
    for i, (char, script) in enumerate(zip(chunk, char_scripts)):
        if script is not None:
            continue
        if char in _OPENING_BRACKETS:
            assigned_scripts[i] = _nearest_script(char_scripts, i + 1, 1)
        elif char in _CLOSING_BRACKETS:
            assigned_scripts[i] = _nearest_script(char_scripts, i - 1, -1)
        else:
            assigned_scripts[i] = _nearest_script(
                char_scripts, i + 1, 1
            ) or _nearest_script(char_scripts, i - 1, -1)

    # Third pass: group consecutive characters that share a script.
    result = []
    current = []
    current_script = None
    for char, script in zip(chunk, assigned_scripts):
        if current_script is None or script is None or script == current_script:
            # Accumulate rather than overwrite: leading unassigned symbols
            # must be kept and merged into the first real group.
            current.append(char)
            if current_script is None:
                current_script = script
        else:
            result.append("".join(current))
            current = [char]
            current_script = script
    if current:
        result.append("".join(current))
    return result


def preprocess(text):
    """Split *text* on whitespace, then split each chunk at script boundaries.

    Args:
        text: Input string.

    Returns:
        A flat list of string tokens in their original order.
    """
    return [part for chunk in text.split() for part in split_myanmar_latin(chunk)]


def segment(text):
    """Run the segmentation model on *text* and return space-separated words.

    Each item returned by ``classifier`` is read through its ``"entity_group"``
    and ``"word"`` keys. A ``"B"`` group starts a new word; any other group is
    appended to the previous word.

    Args:
        text: Input string passed directly to the pipeline.

    Returns:
        The segmented words joined by single spaces (``""`` when the
        pipeline yields no items).
    """
    words = []
    for item in classifier(text):
        # Also start a new word when nothing has been collected yet, so a
        # leading non-"B" item cannot index into an empty list.
        if item["entity_group"] == "B" or not words:
            words.append(item["word"])
        else:
            # 'I' - append to previous word
            words[-1] += item["word"]
    return " ".join(words)