Donald Winkelman committed on
Commit
a3a328d
·
1 Parent(s): 8dc7983
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/discord.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="DiscordProjectSettings">
4
+ <option name="show" value="ASK" />
5
+ <option name="description" value="" />
6
+ </component>
7
+ </project>
.idea/material_theme_project_new.xml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="MaterialThemeProjectNewConfig">
4
+ <option name="metadata">
5
+ <MTProjectMetadataState>
6
+ <option name="migrated" value="true" />
7
+ <option name="pristineConfig" value="false" />
8
+ <option name="userId" value="-23be7a9f:18f78ab48c6:-7ffe" />
9
+ </MTProjectMetadataState>
10
+ </option>
11
+ </component>
12
+ </project>
app.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ from morphseg import MorphemeSegmenter
5
+
6
# --- Global Cache for Models ---
# Segmenters are created lazily, the first time a language is requested,
# so that application startup is not slowed down (or timed out) by loading
# every model up front. Keys are language codes.
LOADED_MODELS = {}

# UI display name -> language code handed to the segmenter.
# Insertion order is the order shown in the language dropdown.
LANGUAGES = dict(
    English="en",
    Spanish="es",
    Russian="ru",
    French="fr",
    Italian="it",
    Czech="cs",
    Hungarian="hu",
    Mongolian="mn",
    Latin="la",
)

# One sample sentence per language, in the same order as LANGUAGES.
_EXAMPLE_SENTENCES = (
    ("English", "Segment words into their smallest units of meaning!"),
    ("Spanish", "desafortunadamente reescribieron rápidamente"),
    ("Russian", "неизбежность переработки неисправима"),
    ("French", "incompréhensible prétraitement irréversiblement"),
    ("Italian", "incredibilmente preprocessarono inevitabilmente"),
    ("Czech", "nepochopitelně přepracování nevratně"),
    ("Hungarian", "visszafordíthatatlan átdolgozást végrehajtottak"),
    ("Mongolian", "боломжгүй дахин боловсруулах ажиллагаа"),
    ("Latin", "philosophica sapientia irreprehensibilis et compositionis cūrātissimē perfectae"),
)

# Rows of [language name, input text, delimiter]; every example uses the
# same " @@" delimiter.
EXAMPLES = [[language, sentence, " @@"] for language, sentence in _EXAMPLE_SENTENCES]
34
+
35
+
36
def get_segmenter(lang_code):
    """Return the segmenter for ``lang_code``, loading and caching it on first use.

    Args:
        lang_code: Language code passed to ``MorphemeSegmenter(lang=...)`` and
            used as the key in ``LOADED_MODELS``.

    Returns:
        The cached ``MorphemeSegmenter`` instance for ``lang_code``.

    Raises:
        gr.Error: If constructing the segmenter raises any exception; the
            original message is included in the error text.
    """
    # Fast path: this language has already been loaded.
    if lang_code in LOADED_MODELS:
        return LOADED_MODELS[lang_code]

    print(f"Loading model for {lang_code}...")
    try:
        model = MorphemeSegmenter(lang=lang_code)
    except Exception as e:
        # Surface the failure through Gradio's own error type.
        raise gr.Error(f"Failed to load model for {lang_code}: {str(e)}")

    # Only successfully constructed models are cached.
    LOADED_MODELS[lang_code] = model
    return model
45
+
46
+
47
def process_segmentation(language_name, text_input, file_input, delimiter, output_format):
    """Segment typed or uploaded text into morphemes for the Gradio interface.

    Args:
        language_name: Display name of the language; must be a key of
            ``LANGUAGES``.
        text_input: Text typed into the textbox. Ignored when ``file_input``
            is provided.
        file_input: The uploaded file, or ``None``. May be a path string or
            an object whose ``name`` attribute holds the path.
        delimiter: String placed between morphemes; forwarded unchanged to
            ``segmenter.segment``.
        output_format: ``"String"`` for delimited text; any other value
            requests the list form, which is rendered as indented JSON.

    Returns:
        A ``(display_text, download_path)`` tuple. On any handled error
        ``display_text`` is a human-readable message and ``download_path``
        is ``None``.

    Raises:
        gr.Error: Propagated from ``get_segmenter`` when the model for the
            selected language cannot be loaded.
    """
    # Local import: only this function needs it, for the per-request output
    # directory created below.
    import tempfile

    # 1. Determine input source (an uploaded file takes precedence).
    if file_input is not None:
        # Accept both a plain path string and a file-like wrapper exposing
        # the path as ``.name``.
        file_path = getattr(file_input, "name", file_input)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            return "Error: File must be a text file (UTF-8).", None
        except OSError as e:
            return f"Error: Could not read the uploaded file: {e}", None
    else:
        content = text_input

    if not content or not content.strip():
        return "Please enter text or upload a file.", None

    # 2. Get language code and model
    lang_code = LANGUAGES.get(language_name)
    if not lang_code:
        return "Error: Invalid language selection.", None

    segmenter = get_segmenter(lang_code)

    # 3. Determine output format
    is_output_string = (output_format == "String")

    # 4. Run segmentation
    try:
        result = segmenter.segment(content, output_string=is_output_string, delimiter=delimiter)
    except Exception as e:
        return f"Error during segmentation: {str(e)}", None

    # 5. Format output for display and file generation.
    # The list form is pretty-printed as JSON so nested lists stay readable.
    if is_output_string:
        display_output = result
    else:
        display_output = json.dumps(result, ensure_ascii=False, indent=2)

    # 6. Create the downloadable file.
    # Each request gets its own temporary directory, so simultaneous requests
    # cannot overwrite one another's result while the download keeps a
    # stable, friendly filename.
    output_filename = "segmented_output.txt" if is_output_string else "segmented_output.json"
    try:
        output_dir = tempfile.mkdtemp(prefix="morphseg_")
        output_path = os.path.join(output_dir, output_filename)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(display_output)
    except OSError:
        # The segmentation itself succeeded; still show it even though the
        # download file could not be written.
        return display_output, None

    return display_output, output_path
102
+
103
+
104
# --- Gradio UI Construction ---
# Layout: a two-column row (controls on the left, results on the right),
# followed by clickable examples and a footer.

with gr.Blocks(title="MorphSeg Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🧩 MorphSeg: Canonical Morpheme Segmentation

        **MorphSeg** provides linguistically aware segmentation. Unlike standard tokenizers (BPE) which split words based on frequency statistics,
        MorphSeg splits words into their true morphological roots and affixes (Canonical Segmentation).

        *Select a language, enter text, and see the morphemes!*
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Controls
            lang_dropdown = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language",
                info="Select the language of your text."
            )

            with gr.Tabs():
                with gr.TabItem("📝 Text Input"):
                    txt_input = gr.Textbox(
                        lines=5,
                        placeholder="Type word or sentence here...",
                        label="Input Text"
                    )
                with gr.TabItem("📁 File Upload"):
                    file_input = gr.File(
                        label="Upload Text File (.txt)",
                        file_types=[".txt", ".csv", ".tsv"]
                    )

            with gr.Accordion("⚙️ Advanced Options", open=False):
                delimiter_input = gr.Textbox(
                    value=" @@",
                    label="Morpheme Delimiter",
                    info="The string used to separate morphemes (e.g., ' @@' or '-')."
                )
                format_radio = gr.Radio(
                    choices=["String", "List"],
                    value="String",
                    label="Output Format",
                    info="String returns text with delimiters. List returns a Python list structure."
                )

            submit_btn = gr.Button("🔍 Segment", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Outputs
            output_area = gr.TextArea(
                label="Segmented Output",
                show_copy_button=True,
                lines=10
            )
            download_btn = gr.File(label="Download Result")

    # Event Listeners
    submit_btn.click(
        fn=process_segmentation,
        inputs=[lang_dropdown, txt_input, file_input, delimiter_input, format_radio],
        outputs=[output_area, download_btn]
    )

    # Keep the two input sources mutually exclusive (optional UX polish).
    # `.change` also fires when a value is updated programmatically, so an
    # unconditional "clear the other input" on both sides makes the handlers
    # undo each other: an upload clears the text, which in turn clears the
    # upload. Each handler therefore clears the other input only when its own
    # input has just received a value, and otherwise returns a no-op update.
    txt_input.change(
        lambda text: None if text else gr.update(),
        txt_input,
        file_input
    )
    file_input.change(
        lambda uploaded: None if uploaded is not None else gr.update(),
        file_input,
        txt_input
    )

    gr.Markdown("### Examples")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[lang_dropdown, txt_input, delimiter_input],
        label="Click on an example to populate:"
    )

    gr.Markdown(
        """
        ---
        Built with [MorphSeg](https://github.com/TheWelcomer/MorphSeg) | Based on SIGMORPHON 2022 Research
        """
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ morphseg
3
+ torch
4
+ pandas
5
+ regex
6
+ unicodedata2
7
+ rich
8
+ safetensors
9
+ huggingface_hub