Spaces:

Morphological-Segmentation
/

Morpheme_Segmentation_Demo

Sleeping

File size: 6,894 Bytes

import json
import os
import tempfile

import gradio as gr
from morphseg import MorphemeSegmenter

# --- Lazily populated model cache ---
# Maps a language code (e.g. "en") to its loaded segmenter. Entries are added
# the first time a language is requested instead of at import time, so that
# app startup does not time out while loading models.
LOADED_MODELS = {}

# Display name offered in the UI -> language code handed to the segmenter.
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "Russian": "ru",
    "French": "fr",
    "Italian": "it",
    "Czech": "cs",
    "Hungarian": "hu",
    "Mongolian": "mn",
    "Latin": "la",
}

# One row per language: [language display name, sample text, delimiter].
EXAMPLES = [
    ["English", "The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization", "+"],
    ["Spanish", "desafortunadamente reescribieron rápidamente", "+"],
    ["Russian", "неизбежность переработки неисправима", "+"],
    ["French", "incompréhensible prétraitement irréversiblement", "+"],
    ["Italian", "incredibilmente preprocessarono inevitabilmente", "+"],
    ["Czech", "nepochopitelně přepracování nevratně", "+"],
    ["Hungarian", "visszafordíthatatlan átdolgozást végrehajtottak", "+"],
    ["Mongolian", "боломжгүй дахин боловсруулах ажиллагаа", "+"],
    ["Latin", "philosophica sapientia irreprehensibilis et compositionis cūrātissimē perfectae", "+"],
]


def get_segmenter(lang_code):
    """Return the cached segmenter for ``lang_code``, loading it on first use.

    Args:
        lang_code: Language code passed to ``MorphemeSegmenter(lang=...)``;
            it is also the key under which the model is cached in
            ``LOADED_MODELS``.

    Returns:
        The segmenter instance stored in ``LOADED_MODELS`` for ``lang_code``.

    Raises:
        gr.Error: If constructing the segmenter fails. The original exception
            is attached as the cause.
    """
    if lang_code not in LOADED_MODELS:
        print(f"Loading model for {lang_code}...")
        try:
            LOADED_MODELS[lang_code] = MorphemeSegmenter(lang=lang_code)
        except Exception as e:
            # Chain explicitly so the underlying failure stays attached to the
            # user-facing error instead of appearing as an unrelated exception
            # raised "during handling" of the first one.
            raise gr.Error(f"Failed to load model for {lang_code}: {e}") from e
    return LOADED_MODELS[lang_code]


def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
    """Segment the supplied text and produce display text plus a download file.

    Args:
        language_name: Display name of the language; looked up in
            ``LANGUAGES`` to obtain the language code.
        text_input: Text typed by the user. Only used when ``file_input`` is
            ``None``.
        file_input: Uploaded file or ``None``. May be a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing its path as
            ``.name``. The file is read as UTF-8.
        delimiter: String forwarded to the segmenter as the morpheme
            delimiter.
        output_format: ``"String"`` requests delimited text; any other value
            requests the list structure, which is rendered as indented JSON.

    Returns:
        A ``(display_text, file_path)`` tuple. On a handled failure (no
        input, unreadable file, unknown language, segmentation error)
        ``display_text`` is a message and ``file_path`` is ``None``.
        Otherwise ``file_path`` points to ``segmented_output.txt`` or
        ``segmented_output.json`` inside a freshly created temporary
        directory.

    Raises:
        gr.Error: Propagated from ``get_segmenter`` when the model cannot be
            loaded.
    """

    # 1. Determine Input Source
    if file_input is not None:
        # Accept either a plain path or an object carrying its path in
        # `.name`, so the handler does not depend on which of the two forms
        # the upload component hands over.
        if isinstance(file_input, (str, os.PathLike)):
            input_path = file_input
        else:
            input_path = file_input.name
        try:
            with open(input_path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            return "Error: File must be a text file (UTF-8).", None
        except OSError:
            # Missing/unreadable upload: report it instead of crashing.
            return "Error: Could not read the uploaded file.", None
    else:
        content = text_input

    if not content or not content.strip():
        return "Please enter text or upload a file.", None

    # 2. Get Language Code and Model
    lang_code = LANGUAGES.get(language_name)
    if not lang_code:
        return "Error: Invalid language selection.", None

    segmenter = get_segmenter(lang_code)

    # 3. Determine Output Format
    is_output_string = (output_format == "String")

    # 4. Run Segmentation
    try:
        result = segmenter.segment(content, output_string=is_output_string, delimiter=delimiter)
    except Exception as e:
        return f"Error during segmentation: {e}", None

    # 5. Format Output for Display and File Generation
    if is_output_string:
        display_output = result
    else:
        # Pretty-print the list structure as JSON; ensure_ascii=False keeps
        # non-Latin scripts readable instead of \u-escaped.
        display_output = json.dumps(result, ensure_ascii=False, indent=2)

    # 6. Create Downloadable File
    # Each request writes into its own temporary directory. A single fixed
    # path in the working directory would be shared by all requests, so one
    # user's result could be overwritten by another's before it is served.
    # The basename is kept stable so the download keeps a friendly name.
    extension = ".txt" if is_output_string else ".json"
    output_dir = tempfile.mkdtemp(prefix="morphseg_")
    output_path = os.path.join(output_dir, f"segmented_output{extension}")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(display_output)

    return display_output, output_path


# --- Gradio UI Construction ---

# Builds the whole page: an intro, a left column of inputs/options, a right
# column of outputs, the click wiring, clickable examples and a footer.
with gr.Blocks(title="MorphSeg Demo") as demo:
    gr.Markdown(
        """
        # 🧩 MorphSeg: Canonical Morpheme Segmentation

        **MorphSeg** provides linguistically aware segmentation. Unlike standard tokenizers (BPE) which split words based on frequency statistics,
        MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).

        *Select a language, enter text, and see the morphemes!*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Controls
            lang_dropdown = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
                info="Select the language of your text."
            )

            # Two alternative input sources; the handler prefers the uploaded
            # file whenever one is present.
            with gr.Tabs():
                with gr.TabItem("📝 Text Input"):
                    txt_input = gr.Textbox(
                        lines=5,
                        placeholder="Type word or sentence here...",
                        value="The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization",
                        label="Input Text"
                    )
                with gr.TabItem("📂 File Upload"):
                    file_input = gr.File(
                        # Label lists every extension allowed by file_types.
                        label="Upload Text File (.txt, .csv, .tsv)",
                        file_types=[".txt", ".csv", ".tsv"]
                    )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                delimiter_input = gr.Textbox(
                    value="+",
                    label="Morpheme Delimiter",
                    info="The string used to separate morphemes (e.g., '+', '|', ' @@')."
                )
                format_radio = gr.Radio(
                    choices=["String", "List"],
                    value="String",
                    label="Output Format",
                    info="String returns text with delimiters. List returns a Python list structure."
                )

            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Outputs
            output_area = gr.Textbox(
                label="Segmented Output",
                lines=10,
                show_label=True
            )
            download_btn = gr.File(label="Download Result")

    # Event Listeners
    # Input order must match the parameter order of process_segmentation;
    # its two return values fill the text area and the download component.
    submit_btn.click(
        fn=process_segmentation,
        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
        outputs=[output_area, download_btn]
    )

    gr.Markdown("### Examples")
    # Each EXAMPLES row supplies exactly these three inputs; the file upload
    # and output format are not part of an example.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[lang_dropdown, txt_input, delimiter_input],
        label="Click on an example to populate:"
    )

    gr.Markdown(
        """
        ---
        Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) | Based on the work of [Girrbach (2022)](https://aclanthology.org/2022.sigmorphon-1.13/) submitted to the [SIGMORPHON 2022 Shared Task on Morpheme Segmentation](https://arxiv.org/abs/2206.07615)
        """
    )

if __name__ == "__main__":
    # Start the server only when this file is executed as a script.
    # NOTE(review): `theme` is passed to launch() here rather than to
    # gr.Blocks(); this needs a Gradio version whose launch() accepts it —
    # confirm against the pinned Gradio version.
    soft_theme = gr.themes.Soft()
    demo.launch(theme=soft_theme)