Spaces:

Morphological-Segmentation
/

Morpheme_Segmentation_Demo

Sleeping

App Files Files Community

Donald Winkelman committed on Nov 21, 2025

Commit

a3a328d

1 Parent(s): 8dc7983

init

Browse files

Files changed (5) hide show

.idea/.gitignore +8 -0
.idea/discord.xml +7 -0
.idea/material_theme_project_new.xml +12 -0
app.py +191 -0
requrements.txt +9 -0

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

.idea/discord.xml ADDED Viewed

	@@ -0,0 +1,7 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DiscordProjectSettings">
+    <option name="show" value="ASK" />
+    <option name="description" value="" />
+  </component>
+</project>

.idea/material_theme_project_new.xml ADDED Viewed

	@@ -0,0 +1,12 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="MaterialThemeProjectNewConfig">
+    <option name="metadata">
+      <MTProjectMetadataState>
+        <option name="migrated" value="true" />
+        <option name="pristineConfig" value="false" />
+        <option name="userId" value="-23be7a9f:18f78ab48c6:-7ffe" />
+      </MTProjectMetadataState>
+    </option>
+  </component>
+</project>

app.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import gradio as gr
+import os
+import json
+from morphseg import MorphemeSegmenter
+# --- Global Cache for Models ---
+# We load models lazily (only when a user requests a specific language)
+# to prevent the app from timing out during startup.
+# Keys are language codes; values are the MorphemeSegmenter instances
+# created by get_segmenter().
+LOADED_MODELS = {}
+# Maps the language names shown in the dropdown to the language codes
+# passed to MorphemeSegmenter(lang=...).
+LANGUAGES = {
+    "English": "en",
+    "Spanish": "es",
+    "Russian": "ru",
+    "French": "fr",
+    "Italian": "it",
+    "Czech": "cs",
+    "Hungarian": "hu",
+    "Mongolian": "mn",
+    "Latin": "la"
+}
+# Rows for gr.Examples: [language name, input text, morpheme delimiter],
+# in the same order as the components the examples populate.
+EXAMPLES = [
+    ["English", "Segment words into their smallest units of meaning!", " @@"],
+    ["Spanish", "desafortunadamente reescribieron rápidamente", " @@"],
+    ["Russian", "неизбежность переработки неисправима", " @@"],
+    ["French", "incompréhensible prétraitement irréversiblement", " @@"],
+    ["Italian", "incredibilmente preprocessarono inevitabilmente", " @@"],
+    ["Czech", "nepochopitelně přepracování nevratně", " @@"],
+    ["Hungarian", "visszafordíthatatlan átdolgozást végrehajtottak", " @@"],
+    ["Mongolian", "боломжгүй дахин боловсруулах ажиллагаа", " @@"],
+    ["Latin", "philosophica sapientia irreprehensibilis et compositionis cūrātissimē perfectae", " @@"]
+]
+def get_segmenter(lang_code):
+    """Retrieves a model from cache or loads it if not present."""
+    if lang_code not in LOADED_MODELS:
+        print(f"Loading model for {lang_code}...")
+        try:
+            LOADED_MODELS[lang_code] = MorphemeSegmenter(lang=lang_code)
+        except Exception as e:
+            raise gr.Error(f"Failed to load model for {lang_code}: {str(e)}")
+    return LOADED_MODELS[lang_code]
+def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
+    """Main processing function for the Gradio interface."""
+    # 1. Determine Input Source
+    content = ""
+    if file_input is not None:
+        try:
+            with open(file_input.name, 'r', encoding='utf-8') as f:
+                content = f.read()
+        except UnicodeDecodeError:
+            return "Error: File must be a text file (UTF-8).", None
+    else:
+        content = text_input
+    if not content or content.strip() == "":
+        return "Please enter text or upload a file.", None
+    # 2. Get Language Code and Model
+    lang_code = LANGUAGES.get(language_name)
+    if not lang_code:
+        return "Error: Invalid language selection.", None
+    segmenter = get_segmenter(lang_code)
+    # 3. Determine Output Format
+    is_output_string = (output_format == "String")
+    # 4. Run Segmentation
+    # Note: The library segment() method handles the empty string check internally
+    try:
+        result = segmenter.segment(content, output_string=is_output_string, delimiter=delimiter)
+    except Exception as e:
+        return f"Error during segmentation: {str(e)}", None
+    # 5. Format Output for Display and File Generation
+    display_output = ""
+    if is_output_string:
+        display_output = result
+    else:
+        # If list, pretty print it as JSON strings for readability
+        # If the input was a single sentence, it's a list of lists.
+        # If massive text, it's a large list of lists.
+        display_output = json.dumps(result, ensure_ascii=False, indent=2)
+    # 6. Create Downloadable File
+    output_filename = "segmented_output.txt"
+    # If it's JSON/List, save as .json, otherwise .txt
+    if not is_output_string:
+        output_filename = "segmented_output.json"
+    with open(output_filename, "w", encoding="utf-8") as f:
+        f.write(display_output)
+    return display_output, output_filename
+# --- Gradio UI Construction ---
+with gr.Blocks(title="MorphSeg Demo", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🧩 MorphSeg: Canonical Morpheme Segmentation
+        **MorphSeg** provides linguistically aware segmentation. Unlike standard tokenizers (BPE) which split words based on frequency statistics,
+        MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).
+        *Select a language, enter text, and see the morphemes!*
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Controls
+            lang_dropdown = gr.Dropdown(
+                choices=list(LANGUAGES.keys()),
+                value="English",
+                label="Language",
+                info="Select the language of your text."
+            )
+            with gr.Tabs():
+                with gr.TabItem("📝 Text Input"):
+                    txt_input = gr.Textbox(
+                        lines=5,
+                        placeholder="Type word or sentence here...",
+                        label="Input Text"
+                    )
+                with gr.TabItem("mb File Upload"):
+                    file_input = gr.File(
+                        label="Upload Text File (.txt)",
+                        file_types=[".txt", ".csv", ".tsv"]
+                    )
+            with gr.Accordion("⚙️ Advanced Options", open=False):
+                delimiter_input = gr.Textbox(
+                    value=" @@",
+                    label="Morpheme Delimiter",
+                    info="The string used to separate morphemes (e.g., ' @@' or '-')."
+                )
+                format_radio = gr.Radio(
+                    choices=["String", "List"],
+                    value="String",
+                    label="Output Format",
+                    info="String returns text with delimiters. List returns a Python list structure."
+                )
+            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            # Outputs
+            output_area = gr.TextArea(
+                label="Segmented Output",
+                show_copy_button=True,
+                lines=10
+            )
+            download_btn = gr.File(label="Download Result")
+    # Event Listeners
+    submit_btn.click(
+        fn=process_segmentation,
+        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
+        outputs=[output_area, download_btn]
+    )
+    # Clear file input if text input is used, and vice versa (optional UX polish)
+    txt_input.change(lambda: None, None, file_input)
+    file_input.change(lambda: None, None, txt_input)
+    gr.Markdown("### Examples")
+    gr.Examples(
+        examples=EXAMPLES,
+        inputs=[lang_dropdown, txt_input, delimiter_input],
+        label="Click on an example to populate:"
+    )
+    gr.Markdown(
+        """
+        ---
+        Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) | Based on SIGMORPHON 2022 Research
+        """
+    )
+# Start the Gradio server only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    demo.launch()

requrements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+gradio
+morphseg
+torch
+pandas
+regex
+unicodedata2
+rich
+safetensors
+huggingface_hub