sbompolas committed on
Commit
a600f21
·
verified ·
1 Parent(s): db0d4ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -249
app.py CHANGED
@@ -1,249 +1,75 @@
1
- import gradio as gr
2
- import stanza
3
- import io
4
- import pandas as pd
5
- from typing import List, Dict, Any
6
- import re
7
-
8
- # Download and initialize Stanza models for multiple languages
9
- # This will be done when the space starts
10
- def initialize_stanza_models():
11
- """Initialize Stanza models for common languages"""
12
- languages = ['en', 'es', 'fr', 'de', 'zh', 'ru', 'ar']
13
- models = {}
14
-
15
- for lang in languages:
16
- try:
17
- # Download model if not present
18
- stanza.download(lang, verbose=False)
19
- # Initialize pipeline
20
- models[lang] = stanza.Pipeline(lang, processors='tokenize,pos,lemma,depparse', verbose=False)
21
- print(f"Loaded {lang} model successfully")
22
- except Exception as e:
23
- print(f"Failed to load {lang} model: {e}")
24
-
25
- return models
26
-
27
- # Global variable to store models
28
- STANZA_MODELS = {}
29
-
30
- def parse_text_with_stanza(text: str, language: str) -> str:
31
- """Parse text using Stanza and return CoNLL-U format"""
32
- if language not in STANZA_MODELS:
33
- return f"Error: Language '{language}' not available. Available languages: {list(STANZA_MODELS.keys())}"
34
-
35
- try:
36
- # Process the text
37
- doc = STANZA_MODELS[language](text)
38
-
39
- # Convert to CoNLL-U format
40
- conllu_output = doc.to_conllu()
41
- return conllu_output
42
-
43
- except Exception as e:
44
- return f"Error processing text: {str(e)}"
45
-
46
- def conllu_to_dataframe(conllu_text: str) -> pd.DataFrame:
47
- """Convert CoNLL-U text to pandas DataFrame for visualization"""
48
- lines = conllu_text.strip().split('\n')
49
- data = []
50
-
51
- for line in lines:
52
- # Skip comments and empty lines
53
- if line.startswith('#') or not line.strip():
54
- continue
55
-
56
- # Parse CoNLL-U format
57
- parts = line.split('\t')
58
- if len(parts) >= 10:
59
- data.append({
60
- 'ID': parts[0],
61
- 'FORM': parts[1],
62
- 'LEMMA': parts[2],
63
- 'UPOS': parts[3],
64
- 'XPOS': parts[4],
65
- 'FEATS': parts[5],
66
- 'HEAD': parts[6],
67
- 'DEPREL': parts[7],
68
- 'DEPS': parts[8],
69
- 'MISC': parts[9]
70
- })
71
-
72
- return pd.DataFrame(data)
73
-
74
- def create_dependency_visualization(df: pd.DataFrame) -> str:
75
- """Create a simple text-based dependency visualization"""
76
- if df.empty:
77
- return "No data to visualize"
78
-
79
- viz_lines = []
80
- viz_lines.append("Dependency Parse Visualization:")
81
- viz_lines.append("=" * 50)
82
-
83
- for _, row in df.iterrows():
84
- word = row['FORM']
85
- pos = row['UPOS']
86
- deprel = row['DEPREL']
87
- head_id = row['HEAD']
88
-
89
- # Find the head word
90
- if head_id != '0': # Not root
91
- try:
92
- head_idx = int(head_id) - 1
93
- if head_idx < len(df):
94
- head_word = df.iloc[head_idx]['FORM']
95
- viz_lines.append(f"{word} ({pos}) --{deprel}--> {head_word}")
96
- else:
97
- viz_lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
98
- except (ValueError, IndexError):
99
- viz_lines.append(f"{word} ({pos}) --{deprel}--> [ERROR]")
100
- else:
101
- viz_lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
102
-
103
- return "\n".join(viz_lines)
104
-
105
- def process_text(text: str, language: str):
106
- """Main processing function that returns all outputs"""
107
- if not text.strip():
108
- return "Please enter some text to parse.", "", "No data to display"
109
-
110
- # Parse with Stanza
111
- conllu_output = parse_text_with_stanza(text, language)
112
-
113
- if conllu_output.startswith("Error"):
114
- return conllu_output, "", "Error in parsing"
115
-
116
- # Convert to DataFrame
117
- try:
118
- df = conllu_to_dataframe(conllu_output)
119
-
120
- # Create visualization
121
- visualization = create_dependency_visualization(df)
122
-
123
- return conllu_output, df, visualization
124
-
125
- except Exception as e:
126
- return conllu_output, "", f"Error creating visualization: {str(e)}"
127
-
128
- # Initialize models (this will run when the space starts)
129
- print("Initializing Stanza models...")
130
- STANZA_MODELS = initialize_stanza_models()
131
-
132
- # Create Gradio interface
133
- def create_gradio_app():
134
- with gr.Blocks(title="Stanza Parser with CoNLL-U Viewer", theme=gr.themes.Soft()) as app:
135
- gr.Markdown("""
136
- # Stanza Parser with CoNLL-U Viewer
137
-
138
- This tool uses Stanford's Stanza library to parse sentences and provides:
139
- - **CoNLL-U Format Output**: Standard linguistic annotation format
140
- - **Interactive Table**: Browse parsed tokens with linguistic features
141
- - **Dependency Visualization**: Text-based dependency structure display
142
-
143
- Enter your text below and select a language to get started!
144
- """)
145
-
146
- with gr.Row():
147
- with gr.Column(scale=2):
148
- text_input = gr.Textbox(
149
- label="Input Text",
150
- placeholder="Enter the text you want to parse...",
151
- lines=4,
152
- value="The quick brown fox jumps over the lazy dog."
153
- )
154
-
155
- language_dropdown = gr.Dropdown(
156
- choices=list(STANZA_MODELS.keys()),
157
- label="Language",
158
- value="en" if "en" in STANZA_MODELS else list(STANZA_MODELS.keys())[0],
159
- info="Select the language of your input text"
160
- )
161
-
162
- parse_button = gr.Button("Parse Text", variant="primary")
163
-
164
- with gr.Row():
165
- with gr.Column():
166
- gr.Markdown("### CoNLL-U Output")
167
- conllu_output = gr.Textbox(
168
- label="CoNLL-U Format",
169
- lines=10,
170
- max_lines=20,
171
- show_copy_button=True,
172
- info="Raw CoNLL-U format output - you can copy this for use in other tools"
173
- )
174
-
175
- with gr.Row():
176
- with gr.Column():
177
- gr.Markdown("### Parsed Data Table")
178
- data_table = gr.Dataframe(
179
- label="Token Analysis",
180
- interactive=False,
181
- wrap=True
182
- )
183
-
184
- with gr.Row():
185
- with gr.Column():
186
- gr.Markdown("### Dependency Structure")
187
- dependency_viz = gr.Textbox(
188
- label="Dependency Relationships",
189
- lines=8,
190
- max_lines=15,
191
- show_copy_button=True,
192
- info="Text-based visualization of dependency relationships"
193
- )
194
-
195
- # Event handling
196
- parse_button.click(
197
- fn=process_text,
198
- inputs=[text_input, language_dropdown],
199
- outputs=[conllu_output, data_table, dependency_viz]
200
- )
201
-
202
- # Also trigger on Enter in text input
203
- text_input.submit(
204
- fn=process_text,
205
- inputs=[text_input, language_dropdown],
206
- outputs=[conllu_output, data_table, dependency_viz]
207
- )
208
-
209
- # Add examples
210
- gr.Markdown("### Example Texts")
211
- examples = [
212
- ["The quick brown fox jumps over the lazy dog.", "en"],
213
- ["El gato está en la mesa.", "es"],
214
- ["Le chat est sur la table.", "fr"],
215
- ["Die Katze ist auf dem Tisch.", "de"],
216
- ]
217
-
218
- gr.Examples(
219
- examples=examples,
220
- inputs=[text_input, language_dropdown],
221
- outputs=[conllu_output, data_table, dependency_viz],
222
- fn=process_text,
223
- cache_examples=False
224
- )
225
-
226
- gr.Markdown("""
227
- ### About CoNLL-U Format
228
-
229
- The CoNLL-U format includes these fields for each token:
230
- - **ID**: Token index
231
- - **FORM**: Word form or punctuation symbol
232
- - **LEMMA**: Lemma or stem of word form
233
- - **UPOS**: Universal part-of-speech tag
234
- - **XPOS**: Language-specific part-of-speech tag
235
- - **FEATS**: Morphological features
236
- - **HEAD**: Head of the current word
237
- - **DEPREL**: Dependency relation to the head
238
- - **DEPS**: Enhanced dependency graph
239
- - **MISC**: Miscellaneous annotations
240
-
241
- For more information about Stanza, visit: https://stanfordnlp.github.io/stanza/
242
- """)
243
-
244
- return app
245
-
246
- # Create and launch the app
247
- if __name__ == "__main__":
248
- app = create_gradio_app()
249
- app.launch()
 
1
+ ---
2
+ title: Stanza Parser with CoNLL-U Viewer
3
+ emoji: 🔍
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Stanza Parser with CoNLL-U Viewer
14
+
15
+ A comprehensive linguistic analysis tool, powered by Stanford's Stanza library, that parses sentences and presents the results in multiple output formats.
16
+
17
+ ## Features
18
+
19
+ - **Multi-language Support**: Parse text in English, Spanish, French, German, Chinese, Russian, and Arabic
20
+ - **CoNLL-U Output**: Get standard linguistic annotation format output
21
+ - **Interactive Data Table**: Browse parsed tokens with all linguistic features
22
+ - **Dependency Visualization**: Text-based visualization of dependency relationships
23
+ - **Copy-friendly Output**: Easy to copy results for use in other tools
24
+
25
+ ## What is CoNLL-U?
26
+
27
+ CoNLL-U is a standard format for representing linguistic annotations that includes:
28
+
29
+ - **Tokenization**: Word and sentence boundaries
30
+ - **Part-of-Speech Tagging**: Universal and language-specific POS tags
31
+ - **Lemmatization**: Base forms of words
32
+ - **Morphological Features**: Grammatical attributes
33
+ - **Dependency Parsing**: Syntactic relationships between words
34
+
35
+ ## How to Use
36
+
37
+ 1. Enter your text in the input box
38
+ 2. Select the appropriate language
39
+ 3. Click "Parse Text" or press Shift+Enter (the input box is multi-line, so plain Enter inserts a new line)
40
+ 4. View results in three formats:
41
+ - Raw CoNLL-U format (copy-paste ready)
42
+ - Interactive data table
43
+ - Dependency structure visualization
44
+
45
+ ## Example Output
46
+
47
+ For the sentence "The cat sits on the mat", you'll get:
48
+
49
+ - **CoNLL-U format**: Standard 10-column format with all linguistic features
50
+ - **Data table**: Interactive view of each token's properties
51
+ - **Dependencies**: "cat --nsubj--> sits", "mat --obl--> sits", etc.
52
+
53
+ ## Use Cases
54
+
55
+ - **Linguistic Research**: Analyze sentence structure and grammatical relationships
56
+ - **NLP Development**: Generate training data or test parsing models
57
+ - **Educational**: Learn about syntactic analysis and dependency grammar
58
+ - **Text Processing**: Prepare annotated data for downstream tasks
59
+
60
+ ## Technical Details
61
+
62
+ This space uses:
63
+ - **Stanza**: Stanford's multilingual NLP toolkit
64
+ - **Gradio**: For the interactive web interface
65
+ - **Pandas**: For data table visualization
66
+
67
+ The models are automatically downloaded and cached when the space starts up.
68
+
69
+ ## Supported Languages
70
+
71
+ Currently supports: English (en), Spanish (es), French (fr), German (de), Chinese (zh), Russian (ru), Arabic (ar)
72
+
73
+ ---
74
+
75
+ *Powered by Stanford Stanza - https://stanfordnlp.github.io/stanza/*