chuuhtetnaing committed on
Commit
eb69d70
·
1 Parent(s): dce85e5

Add option for preserving English text

Browse files
Files changed (2) hide show
  1. app.py +16 -20
  2. segmentation.py +11 -7
app.py CHANGED
@@ -1,10 +1,10 @@
1
  import gradio as gr
2
 
3
- from segmentation import classifier, has_myanmar, preprocess, segment
4
 
5
 
6
- def segment_text_heuristics_supplement_way(text: str) -> str:
7
- """Segment Myanmar text."""
8
  tokens = preprocess(text)
9
 
10
  segmented_texts = []
@@ -19,21 +19,11 @@ def segment_text_heuristics_supplement_way(text: str) -> str:
19
  return result
20
 
21
 
22
def segment_text(text: str) -> str:
    """Segment Myanmar text into space-separated words.

    Runs ``classifier`` on ``text`` and rebuilds words from the returned
    groups: a group tagged ``"B"`` starts a new word, any other group is
    appended to the word before it.

    NOTE(review): ``classifier`` is imported from ``segmentation``; it is
    presumably a token-classification pipeline with aggregation enabled
    (each item exposes ``"entity_group"`` and ``"word"``) -- confirm there.

    Args:
        text: Raw input text.

    Returns:
        The rebuilt words joined by single spaces; an empty string when the
        classifier yields no groups.
    """
    segmented_words = []

    for item in classifier(text):
        # A continuation group with nothing before it (the very first group
        # is not tagged "B") has no previous word to extend; start a new
        # word instead of indexing into an empty list.
        if item["entity_group"] == "B" or not segmented_words:
            segmented_words.append(item["word"])
        else:  # 'I' - append to previous word
            segmented_words[-1] += item["word"]

    return " ".join(segmented_words)
37
 
38
 
39
  css = """
@@ -69,6 +59,12 @@ with gr.Blocks(css=css) as demo:
69
  input_text = gr.Textbox(label="Input Text", placeholder="Enter Myanmar text here...", lines=8)
70
  output_text = gr.Textbox(label="Segmented Text", lines=8)
71
 
 
 
 
 
 
 
72
  run_button = gr.Button("Segment", variant="primary")
73
 
74
  gr.Examples(
@@ -92,8 +88,8 @@ with gr.Blocks(css=css) as demo:
92
  inputs=input_text,
93
  )
94
 
95
- run_button.click(fn=segment_text, inputs=input_text, outputs=output_text)
96
- input_text.submit(fn=segment_text, inputs=input_text, outputs=output_text)
97
 
98
  if __name__ == "__main__":
99
  demo.launch()
 
1
  import gradio as gr
2
 
3
+ from segmentation import has_myanmar, preprocess, segment
4
 
5
 
6
+ def segment_myanmar_only(text: str) -> str:
7
+ """Segment only Myanmar text, preserving English/Latin characters as-is."""
8
  tokens = preprocess(text)
9
 
10
  segmented_texts = []
 
19
  return result
20
 
21
 
22
def process_text(text: str, should_preserve_english: bool) -> str:
    """Dispatch ``text`` to the segmenter matching the selected mode.

    Args:
        text: Input text to segment.
        should_preserve_english: When truthy, ``segment_myanmar_only`` is
            used; otherwise ``segment`` is used.

    Returns:
        Whatever the chosen segmenter returns for ``text``.
    """
    # Only the selected name is evaluated, exactly like the if/return form.
    segmenter = segment_myanmar_only if should_preserve_english else segment
    return segmenter(text)
 
 
 
 
 
 
 
 
 
 
27
 
28
 
29
  css = """
 
59
  input_text = gr.Textbox(label="Input Text", placeholder="Enter Myanmar text here...", lines=8)
60
  output_text = gr.Textbox(label="Segmented Text", lines=8)
61
 
62
+ preserve_english = gr.Checkbox(
63
+ label="Preserve English text",
64
+ value=False,
65
+ info="Only segment Myanmar text and add spaces between English/Myanmar boundaries",
66
+ )
67
+
68
  run_button = gr.Button("Segment", variant="primary")
69
 
70
  gr.Examples(
 
88
  inputs=input_text,
89
  )
90
 
91
+ run_button.click(fn=process_text, inputs=[input_text, preserve_english], outputs=output_text)
92
+ input_text.submit(fn=process_text, inputs=[input_text, preserve_english], outputs=output_text)
93
 
94
  if __name__ == "__main__":
95
  demo.launch()
segmentation.py CHANGED
@@ -114,11 +114,15 @@ def preprocess(text):
114
 
115
 
116
  def segment(text):
117
- classifier_result = classifier(text)
118
- words = [r["word"] for r in classifier_result]
119
- entities = [r["entity"] for r in classifier_result]
120
- result = reconstruct(words, entities)
121
- result = result.replace("▁", " ")
122
- result = re.sub(r"\s+", " ", result).strip()
123
 
124
- return result
 
 
 
 
 
 
 
 
 
114
 
115
 
116
  def segment(text):
117
+ tokens = classifier(text)
118
+ segmented_text = []
 
 
 
 
119
 
120
+ for item in tokens:
121
+ if item["entity_group"] == "B":
122
+ segmented_text.append(item["word"])
123
+ else: # 'I' - append to previous word
124
+ segmented_text[-1] += item["word"]
125
+
126
+ segmented_text = " ".join(segmented_text)
127
+
128
+ return segmented_text