Commit ·
eb69d70
1
Parent(s): dce85e5
Add option for preserving English text
Browse files
- app.py +16 -20
- segmentation.py +11 -7
app.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
-
from segmentation import
|
| 4 |
|
| 5 |
|
| 6 |
-
def
|
| 7 |
-
"""Segment Myanmar text."""
|
| 8 |
tokens = preprocess(text)
|
| 9 |
|
| 10 |
segmented_texts = []
|
|
@@ -19,21 +19,11 @@ def segment_text_heuristics_supplement_way(text: str) -> str:
|
|
| 19 |
return result
|
| 20 |
|
| 21 |
|
| 22 |
-
def
|
| 23 |
-
"""
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
for item in tokens:
|
| 29 |
-
if item["entity_group"] == "B":
|
| 30 |
-
segmented_words.append(item["word"])
|
| 31 |
-
else: # 'I' - append to previous word
|
| 32 |
-
segmented_words[-1] += item["word"]
|
| 33 |
-
|
| 34 |
-
result = " ".join(segmented_words)
|
| 35 |
-
|
| 36 |
-
return result
|
| 37 |
|
| 38 |
|
| 39 |
css = """
|
|
@@ -69,6 +59,12 @@ with gr.Blocks(css=css) as demo:
|
|
| 69 |
input_text = gr.Textbox(label="Input Text", placeholder="Enter Myanmar text here...", lines=8)
|
| 70 |
output_text = gr.Textbox(label="Segmented Text", lines=8)
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
run_button = gr.Button("Segment", variant="primary")
|
| 73 |
|
| 74 |
gr.Examples(
|
|
@@ -92,8 +88,8 @@ with gr.Blocks(css=css) as demo:
|
|
| 92 |
inputs=input_text,
|
| 93 |
)
|
| 94 |
|
| 95 |
-
run_button.click(fn=
|
| 96 |
-
input_text.submit(fn=
|
| 97 |
|
| 98 |
if __name__ == "__main__":
|
| 99 |
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
+
from segmentation import has_myanmar, preprocess, segment
|
| 4 |
|
| 5 |
|
| 6 |
+
def segment_myanmar_only(text: str) -> str:
|
| 7 |
+
"""Segment only Myanmar text, preserving English/Latin characters as-is."""
|
| 8 |
tokens = preprocess(text)
|
| 9 |
|
| 10 |
segmented_texts = []
|
|
|
|
| 19 |
return result
|
| 20 |
|
| 21 |
|
| 22 |
+
def process_text(text: str, should_preserve_english: bool) -> str:
    """Route ``text`` to the segmenter chosen by the checkbox state.

    Args:
        text: Raw input text to segment.
        should_preserve_english: When true, ``segment_myanmar_only`` handles
            the text; otherwise the plain ``segment`` function does.

    Returns:
        Whatever string the selected segmenter returns.
    """
    # Pick the handler first, then call it once with the untouched input.
    handler = segment_myanmar_only if should_preserve_english else segment
    return handler(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
css = """
|
|
|
|
| 59 |
input_text = gr.Textbox(label="Input Text", placeholder="Enter Myanmar text here...", lines=8)
|
| 60 |
output_text = gr.Textbox(label="Segmented Text", lines=8)
|
| 61 |
|
| 62 |
+
preserve_english = gr.Checkbox(
|
| 63 |
+
label="Preserve English text",
|
| 64 |
+
value=False,
|
| 65 |
+
info="Only segment Myanmar text and add spaces between English/Myanmar boundaries",
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
run_button = gr.Button("Segment", variant="primary")
|
| 69 |
|
| 70 |
gr.Examples(
|
|
|
|
| 88 |
inputs=input_text,
|
| 89 |
)
|
| 90 |
|
| 91 |
+
run_button.click(fn=process_text, inputs=[input_text, preserve_english], outputs=output_text)
|
| 92 |
+
input_text.submit(fn=process_text, inputs=[input_text, preserve_english], outputs=output_text)
|
| 93 |
|
| 94 |
if __name__ == "__main__":
|
| 95 |
demo.launch()
|
segmentation.py
CHANGED
|
@@ -114,11 +114,15 @@ def preprocess(text):
|
|
| 114 |
|
| 115 |
|
| 116 |
def segment(text):
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
entities = [r["entity"] for r in classifier_result]
|
| 120 |
-
result = reconstruct(words, entities)
|
| 121 |
-
result = result.replace("▁", " ")
|
| 122 |
-
result = re.sub(r"\s+", " ", result).strip()
|
| 123 |
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
|
| 116 |
def segment(text):
    """Segment ``text`` into space-separated words using the classifier output.

    Each item returned by ``classifier(text)`` is read for its
    ``"entity_group"`` and ``"word"`` fields. An item tagged ``"B"`` starts a
    new word; any other tag glues the item's text onto the word currently
    being built.

    NOTE(review): ``classifier`` is defined elsewhere in this module —
    presumably a token-classification pipeline with grouped entities; confirm.

    Args:
        text: Input string passed to the classifier unchanged.

    Returns:
        The words joined by single spaces; an empty string when the
        classifier yields no items.
    """
    words = []

    for item in classifier(text):
        # Start a new word on "B", and also when nothing has been collected
        # yet: a leading non-"B" item has no previous word to extend, and
        # indexing words[-1] on an empty list would raise IndexError.
        if item["entity_group"] == "B" or not words:
            words.append(item["word"])
        else:  # 'I' - append to previous word
            words[-1] += item["word"]

    return " ".join(words)
|