# Gradio demo app for MorphSeg (canonical morpheme segmentation).
import gradio as gr
import os
import json
from morphseg import MorphemeSegmenter
# --- Global Cache for Models ---
# Maps a language code to its loaded MorphemeSegmenter instance.
# Models are loaded lazily (only when a user requests a specific language)
# to prevent the app from timing out during startup.
LOADED_MODELS = {}
# Display name offered in the language dropdown -> language code that is
# passed to MorphemeSegmenter(lang=...).
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "Russian": "ru",
    "French": "fr",
    "Italian": "it",
    "Czech": "cs",
    "Hungarian": "hu",
    "Mongolian": "mn",
    "Latin": "la",
}
# Sample (language, text) pairs for the examples widget. Every example uses
# "+" as the morpheme delimiter, so it is added once below.
_EXAMPLE_TEXTS = (
    ("English", "The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization"),
    ("Spanish", "desafortunadamente reescribieron rápidamente"),
    ("Russian", "неизбежность переработки неисправима"),
    ("French", "incompréhensible prétraitement irréversiblement"),
    ("Italian", "incredibilmente preprocessarono inevitabilmente"),
    ("Czech", "nepochopitelně přepracování nevratně"),
    ("Hungarian", "visszafordíthatatlan átdolgozást végrehajtottak"),
    ("Mongolian", "боломжгүй дахин боловсруулах ажиллагаа"),
    ("Latin", "philosophica sapientia irreprehensibilis et compositionis cūrātissimē perfectae"),
)
# Rows are [language, text, delimiter], matching the inputs of gr.Examples.
EXAMPLES = [[language, text, "+"] for language, text in _EXAMPLE_TEXTS]
def get_segmenter(lang_code):
    """Return the segmenter for ``lang_code``, loading it on first use.

    Loaded models are stored in the module-level ``LOADED_MODELS`` dict and
    reused on later calls for the same language.

    Args:
        lang_code: Language code passed to ``MorphemeSegmenter(lang=...)``.

    Returns:
        The cached ``MorphemeSegmenter`` instance for ``lang_code``.

    Raises:
        gr.Error: If constructing the segmenter fails. The original exception
            is attached as ``__cause__``.
    """
    if lang_code not in LOADED_MODELS:
        print(f"Loading model for {lang_code}...")
        try:
            LOADED_MODELS[lang_code] = MorphemeSegmenter(lang=lang_code)
        except Exception as e:
            # Chain the underlying exception so its traceback is not lost
            # when the failure is re-raised as a Gradio error.
            raise gr.Error(f"Failed to load model for {lang_code}: {e}") from e
    return LOADED_MODELS[lang_code]
def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
    """Segment typed or uploaded text and return it for display and download.

    Args:
        language_name: Display name of the language; must be a key of
            ``LANGUAGES``.
        text_input: Text typed into the UI. Used only when no file is given.
        file_input: Uploaded file, or ``None``. May be a path string or an
            object whose ``name`` attribute holds the path.
        delimiter: String handed to the segmenter as the morpheme delimiter.
        output_format: ``"String"`` for delimited text; any other value makes
            the segmenter return a list, which is shown as indented JSON.

    Returns:
        A ``(display_text, download_path)`` tuple. When the input is missing,
        unreadable, or segmentation fails, ``display_text`` is a message and
        ``download_path`` is ``None``.

    Raises:
        gr.Error: Propagated from ``get_segmenter`` when the model cannot be
            loaded.
    """
    import tempfile  # local import: only needed to create the download file

    # 1. Determine Input Source (an uploaded file takes precedence over text)
    if file_input is not None:
        # Accept both a plain path and a file wrapper exposing ``.name``.
        upload_path = getattr(file_input, "name", file_input)
        try:
            with open(upload_path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            return "Error: File must be a text file (UTF-8).", None
        except OSError as e:
            # Missing/unreadable upload: report it like other input errors.
            return f"Error: Could not read the uploaded file: {e}", None
    else:
        content = text_input

    if not content or not content.strip():
        return "Please enter text or upload a file.", None

    # 2. Get Language Code and Model
    lang_code = LANGUAGES.get(language_name)
    if not lang_code:
        return "Error: Invalid language selection.", None
    segmenter = get_segmenter(lang_code)

    # 3. Determine Output Format
    is_output_string = (output_format == "String")

    # 4. Run Segmentation
    try:
        result = segmenter.segment(content, output_string=is_output_string, delimiter=delimiter)
    except Exception as e:
        return f"Error during segmentation: {e}", None

    # 5. Format Output for Display and File Generation
    if is_output_string:
        display_output = result
    else:
        # List output is pretty-printed as JSON; ensure_ascii=False keeps
        # non-Latin scripts readable instead of \u escapes.
        display_output = json.dumps(result, ensure_ascii=False, indent=2)

    # 6. Create Downloadable File
    # Write into a fresh temporary directory so overlapping requests cannot
    # overwrite each other's output and the app does not need a writable
    # working directory. The file name itself is unchanged (.json for list
    # output, .txt otherwise).
    output_filename = "segmented_output.txt" if is_output_string else "segmented_output.json"
    output_path = os.path.join(tempfile.mkdtemp(prefix="morphseg_"), output_filename)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(display_output)
    return display_output, output_path
# --- Gradio UI Construction ---
# Page structure: intro text, a row with an input column and an output
# column, the click handler wiring, clickable examples, and a credits footer.
with gr.Blocks(title="MorphSeg Demo") as demo:
    gr.Markdown(
        """
# 🧩 MorphSeg: Canonical Morpheme Segmentation
**MorphSeg** provides linguistically aware segmentation. Unlike standard tokenizers (BPE) which split words based on frequency statistics,
MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).
*Select a language, enter text, and see the morphemes!*
"""
    )

    with gr.Row():
        # First column: language choice, input source, options, submit.
        with gr.Column(scale=1):
            lang_dropdown = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
                info="Select the language of your text.",
            )

            # The text is either typed directly or uploaded as a file.
            with gr.Tabs():
                with gr.TabItem("📝 Text Input"):
                    txt_input = gr.Textbox(
                        lines=5,
                        placeholder="Type word or sentence here...",
                        value="The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization",
                        label="Input Text",
                    )
                # NOTE(review): the "mb" prefix looks like a garbled icon —
                # confirm the intended tab label.
                with gr.TabItem("mb File Upload"):
                    # NOTE(review): the label mentions only .txt while .csv
                    # and .tsv are accepted too — confirm which is intended.
                    file_input = gr.File(
                        label="Upload Text File (.txt)",
                        file_types=[".txt", ".csv", ".tsv"],
                    )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                delimiter_input = gr.Textbox(
                    value="+",
                    label="Morpheme Delimiter",
                    info="The string used to separate morphemes (e.g., '+', '|', ' @@').",
                )
                format_radio = gr.Radio(
                    choices=["String", "List"],
                    value="String",
                    label="Output Format",
                    info="String returns text with delimiters. List returns a Python list structure.",
                )

            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")

        # Second column: segmented text plus the downloadable result file.
        with gr.Column(scale=1):
            output_area = gr.Textbox(
                label="Segmented Output",
                lines=10,
                show_label=True,
            )
            download_btn = gr.File(label="Download Result")

    # Event Listeners: the input order must match the parameter order of
    # process_segmentation, and the outputs match its returned pair.
    submit_btn.click(
        fn=process_segmentation,
        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
        outputs=[output_area, download_btn],
    )

    gr.Markdown("### Examples")
    # Each EXAMPLES row fills language, text and delimiter, in that order.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[lang_dropdown, txt_input, delimiter_input],
        label="Click on an example to populate:",
    )

    gr.Markdown(
        """
---
Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) | Based on the work of [Girrbach (2022)](https://aclanthology.org/2022.sigmorphon-1.13/) submitted to the [SIGMORPHON 2022 Shared Task on Morpheme Segmentation](https://arxiv.org/abs/2206.07615)
"""
    )
if __name__ == "__main__":
    # Start the app only when this file is run as a script.
    # NOTE(review): passing ``theme`` to launch() presumably relies on a
    # Gradio release whose launch() accepts it — confirm the pinned version.
    demo.launch(theme=gr.themes.Soft())