File size: 6,894 Bytes
a3a328d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0df199d
23d2473
 
 
 
 
 
 
 
a3a328d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1fd46f
a3a328d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0154f98
a3a328d
 
 
 
 
 
 
 
 
 
23d2473
a3a328d
23d2473
a3a328d
 
 
 
 
 
 
 
 
 
 
 
8c41812
a3a328d
5f1bd05
 
a3a328d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15eca27
a3a328d
 
 
 
5f1bd05
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import json
import os
import tempfile

import gradio as gr
from morphseg import MorphemeSegmenter

# --- Global Cache for Models ---
# Maps a language code to its loaded segmenter. Entries are added lazily
# (the first time a language is requested) so that startup does not have to
# load every model up front and risk timing out.
LOADED_MODELS = {}

# UI display name -> language code passed to MorphemeSegmenter.
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "Russian": "ru",
    "French": "fr",
    "Italian": "it",
    "Czech": "cs",
    "Hungarian": "hu",
    "Mongolian": "mn",
    "Latin": "la",
}

# One (language name, sample sentence) pair per supported language.
_EXAMPLE_SENTENCES = (
    ("English", "The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization"),
    ("Spanish", "desafortunadamente reescribieron rápidamente"),
    ("Russian", "неизбежность переработки неисправима"),
    ("French", "incompréhensible prétraitement irréversiblement"),
    ("Italian", "incredibilmente preprocessarono inevitabilmente"),
    ("Czech", "nepochopitelně přepracování nevratně"),
    ("Hungarian", "visszafordíthatatlan átdolgozást végrehajtottak"),
    ("Mongolian", "боломжгүй дахин боловсруулах ажиллагаа"),
    ("Latin", "philosophica sapientia irreprehensibilis et compositionis cūrātissimē perfectae"),
)

# Example rows in the form [language name, input text, morpheme delimiter];
# every example uses "+" as the delimiter.
EXAMPLES = [[language, sentence, "+"] for language, sentence in _EXAMPLE_SENTENCES]


def get_segmenter(lang_code):
    """Return the segmenter for ``lang_code``, creating and caching it on first use.

    Raises:
        gr.Error: If constructing the ``MorphemeSegmenter`` fails for any reason.
    """
    # Fast path: this language has already been loaded.
    if lang_code in LOADED_MODELS:
        return LOADED_MODELS[lang_code]

    print(f"Loading model for {lang_code}...")
    try:
        model = MorphemeSegmenter(lang=lang_code)
    except Exception as e:
        # Surface the failure to the user through Gradio's error mechanism.
        raise gr.Error(f"Failed to load model for {lang_code}: {str(e)}")

    # Only successfully constructed models are stored in the cache.
    LOADED_MODELS[lang_code] = model
    return model


def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
    """Segment typed text or an uploaded file and prepare display/download outputs.

    Args:
        language_name: Display name of the language; must be a key of ``LANGUAGES``.
        text_input: Text typed by the user. Used only when no file is uploaded.
        file_input: The uploaded file, given either as a path string or as an
            object exposing the path through ``.name``; ``None`` if nothing
            was uploaded.
        delimiter: String passed to the segmenter to separate morphemes.
        output_format: ``"String"`` requests delimited text; any other value
            requests the list structure, which is rendered as JSON.

    Returns:
        A ``(display_text, download_path)`` tuple. When validation, file
        reading or segmentation fails, ``display_text`` carries the message
        and ``download_path`` is ``None``.

    Raises:
        gr.Error: Propagated from ``get_segmenter`` when the model cannot be loaded.
    """

    # 1. Determine Input Source (an uploaded file takes precedence over typed text)
    if file_input is not None:
        # Accept a plain path string as well as an object carrying the path in
        # `.name`, so the handler works with either form of upload value.
        file_path = getattr(file_input, "name", file_input)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            return "Error: File must be a text file (UTF-8).", None
        except OSError as e:
            # Missing/unreadable upload: report it like the other input errors
            # instead of letting the handler crash.
            return f"Error: Could not read the uploaded file: {e}", None
    else:
        content = text_input

    if not content or not content.strip():
        return "Please enter text or upload a file.", None

    # 2. Get Language Code and Model
    lang_code = LANGUAGES.get(language_name)
    if not lang_code:
        return "Error: Invalid language selection.", None

    segmenter = get_segmenter(lang_code)

    # 3. Determine Output Format
    is_output_string = output_format == "String"

    # 4. Run Segmentation
    try:
        result = segmenter.segment(content, output_string=is_output_string, delimiter=delimiter)
    except Exception as e:
        return f"Error during segmentation: {str(e)}", None

    # 5. Format Output for Display and File Generation
    if is_output_string:
        display_output = result
    else:
        # Pretty-print the list structure as JSON, keeping non-ASCII text readable.
        display_output = json.dumps(result, ensure_ascii=False, indent=2)

    # 6. Create Downloadable File
    # Each request writes into its own fresh temporary directory so that
    # simultaneous requests cannot overwrite one another's result, while the
    # file itself keeps a stable, friendly name (.txt for text, .json for lists).
    # NOTE: these directories are not removed here; they rely on normal
    # temp-directory cleanup.
    extension = "txt" if is_output_string else "json"
    output_dir = tempfile.mkdtemp(prefix="morphseg_")
    output_filename = os.path.join(output_dir, f"segmented_output.{extension}")

    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(display_output)

    return display_output, output_filename


# --- Gradio UI Construction ---
# Layout: a header, then two columns (inputs and options on the left, results
# on the right), followed by clickable examples and a footer with credits.

with gr.Blocks(title="MorphSeg Demo") as demo:
    gr.Markdown(
        """
        # 🧩 MorphSeg: Canonical Morpheme Segmentation

        **MorphSeg** provides linguistically aware segmentation. Unlike standard tokenizers (BPE) which split words based on frequency statistics, 
        MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).

        *Select a language, enter text, and see the morphemes!*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Controls
            lang_dropdown = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
                info="Select the language of your text."
            )

            # Two alternative input sources; when a file is uploaded,
            # process_segmentation uses it instead of the typed text.
            with gr.Tabs():
                with gr.TabItem("📝 Text Input"):
                    txt_input = gr.Textbox(
                        lines=5,
                        placeholder="Type word or sentence here...",
                        value="The unbelievably disagreeable preprocessor unsuccessfully reprocessed the unquestionably irreversible decontextualization",
                        label="Input Text"
                    )
                with gr.TabItem("📂 File Upload"):
                    file_input = gr.File(
                        # Label lists the same extensions that file_types accepts.
                        label="Upload Text File (.txt, .csv, .tsv)",
                        file_types=[".txt", ".csv", ".tsv"]
                    )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                delimiter_input = gr.Textbox(
                    value="+",
                    label="Morpheme Delimiter",
                    info="The string used to separate morphemes (e.g., '+', '|', ' @@')."
                )
                format_radio = gr.Radio(
                    choices=["String", "List"],
                    value="String",
                    label="Output Format",
                    info="String returns text with delimiters. List returns a Python list structure."
                )

            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Outputs
            output_area = gr.Textbox(
                label="Segmented Output",
                lines=10,
                show_label=True
            )
            download_btn = gr.File(label="Download Result")

    # Event Listeners
    # The input order must match process_segmentation's parameter order, and
    # the two outputs receive its (display text, file path) return values.
    submit_btn.click(
        fn=process_segmentation,
        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
        outputs=[output_area, download_btn]
    )

    gr.Markdown("### Examples")
    # Each example row supplies language, text and delimiter, in that order.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[lang_dropdown, txt_input, delimiter_input],
        label="Click on an example to populate:"
    )

    gr.Markdown(
        """
        ---
        Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) | Based on the work of [Girrbach (2022)](https://aclanthology.org/2022.sigmorphon-1.13/) submitted to the [SIGMORPHON 2022 Shared Task on Morpheme Segmentation](https://arxiv.org/abs/2206.07615)
        """
    )

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    soft_theme = gr.themes.Soft()
    demo.launch(theme=soft_theme)