File size: 3,870 Bytes
8d7564e 975e2cd 8d7564e 1c1c7e0 8d7564e f140aab 975e2cd 8d7564e 975e2cd 170531c 975e2cd 8d7564e 975e2cd 8d7564e 975e2cd 8d7564e 975e2cd d116b47 eb69d70 975e2cd eb69d70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import re
from transformers import pipeline
# Hugging Face Hub id of the token-classification checkpoint used by `segment`.
SEGMENTATION_MODEL_NAME = "chuuhtetnaing/myanmar-text-segmentation-model"

# Shared pipeline instance, built once when this module is imported.
# `aggregation_strategy="simple"` is the documented replacement for the
# deprecated `grouped_entities=True` flag: adjacent tokens with the same label
# are merged into one item carrying "entity_group" and "word" keys.
classifier = pipeline(
    "token-classification",
    model=SEGMENTATION_MODEL_NAME,
    aggregation_strategy="simple",
)
def reconstruct(tokens, labels):
    """
    Join tokens into one string, guided by their B/I labels.

    A token labelled "B" starts a new word, so a single space is emitted in
    front of it unless it is the very first token. Every other label glues
    the token directly onto the preceding text. Tokens and labels are paired
    positionally; surplus items in the longer sequence are ignored.
    """
    pieces = []
    for position, (piece, tag) in enumerate(zip(tokens, labels)):
        starts_new_word = tag == "B" and position > 0
        pieces.append(" " + piece if starts_new_word else piece)
    return "".join(pieces)
def has_myanmar(text):
    """Return True if text contains any character in U+1000..U+109F."""
    found = re.search(r"[\u1000-\u109F]", text)
    return found is not None
def has_latin(text):
    """Return True if text contains any ASCII letter or digit."""
    found = re.search(r"[a-zA-Z0-9]", text)
    return found is not None
def _char_script(char):
    """Classify one character as "myanmar", "latin", or None (a symbol)."""
    if "\u1000" <= char <= "\u109f":
        return "myanmar"
    if char.isascii() and char.isalnum():
        return "latin"
    return None


def _resolve_symbol_scripts(chunk, scripts):
    """Return per-character scripts with each symbol given a neighbour's script."""
    opening_brackets = set("([{<")
    closing_brackets = set(")]}>")

    # Script of the nearest letter strictly before each position.
    prev_letter = []
    seen = None
    for script in scripts:
        prev_letter.append(seen)
        if script is not None:
            seen = script

    # Script of the nearest letter strictly after each position.
    next_letter = [None] * len(scripts)
    seen = None
    for index in range(len(scripts) - 1, -1, -1):
        next_letter[index] = seen
        if scripts[index] is not None:
            seen = scripts[index]

    resolved = []
    for index, (char, script) in enumerate(zip(chunk, scripts)):
        if script is not None:
            resolved.append(script)
        elif char in opening_brackets:
            resolved.append(next_letter[index])
        elif char in closing_brackets:
            resolved.append(prev_letter[index])
        else:
            # Other symbols prefer the following letter, else the preceding one.
            resolved.append(next_letter[index] or prev_letter[index])
    return resolved


def split_myanmar_latin(chunk):
    """
    Split chunk at Myanmar/Latin boundaries.

    - Opening brackets attach to the NEXT letter's script
    - Closing brackets attach to the PREVIOUS letter's script
    - Other symbols attach to the NEXT letter's script, falling back to the
      PREVIOUS letter when nothing follows
    - A bracket with no letter on its attaching side stays unassigned and is
      kept in whichever group it is adjacent to; no character is dropped

    Returns a list of substrings whose concatenation equals chunk. A chunk
    that does not contain both Myanmar (U+1000..U+109F) and ASCII
    alphanumeric characters is returned unchanged as a one-element list.
    """
    scripts = [_char_script(char) for char in chunk]
    if "myanmar" not in scripts or "latin" not in scripts:
        return [chunk]

    resolved = _resolve_symbol_scripts(chunk, scripts)

    parts = []
    current = []
    current_script = None
    for char, script in zip(chunk, resolved):
        if script is None or current_script is None or script == current_script:
            # Accumulate rather than overwrite: a group made only of
            # unassigned leading symbols must be kept, not discarded.
            current.append(char)
            if current_script is None:
                current_script = script
        else:
            parts.append("".join(current))
            current = [char]
            current_script = script
    if current:
        parts.append("".join(current))
    return parts
def preprocess(text):
    """
    Tokenize text on whitespace, then split mixed-script chunks.

    Each whitespace-delimited chunk is passed through split_myanmar_latin and
    the resulting pieces are returned as one flat list, in original order.
    """
    return [
        piece
        for chunk in text.split()
        for piece in split_myanmar_latin(chunk)
    ]
def segment(text):
    """
    Segment text into space-separated words using the module-level classifier.

    The classifier is called on text and is expected to yield items exposing
    "entity_group" and "word" keys. An item whose group is "B" starts a new
    word; any other item is appended to the word before it.

    Returns the words joined by single spaces; an empty string when the
    classifier yields nothing.
    """
    words = []
    for item in classifier(text):
        piece = item["word"]
        if item["entity_group"] == "B" or not words:
            # `not words` covers a leading non-"B" group: there is no previous
            # word to extend, so start one instead of indexing an empty list.
            words.append(piece)
        else:
            words[-1] += piece
    return " ".join(words)
|