Update app.py
Browse files
app.py
CHANGED
|
@@ -1,249 +1,75 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
if df.empty:
|
| 77 |
-
return "No data to visualize"
|
| 78 |
-
|
| 79 |
-
viz_lines = []
|
| 80 |
-
viz_lines.append("Dependency Parse Visualization:")
|
| 81 |
-
viz_lines.append("=" * 50)
|
| 82 |
-
|
| 83 |
-
for _, row in df.iterrows():
|
| 84 |
-
word = row['FORM']
|
| 85 |
-
pos = row['UPOS']
|
| 86 |
-
deprel = row['DEPREL']
|
| 87 |
-
head_id = row['HEAD']
|
| 88 |
-
|
| 89 |
-
# Find the head word
|
| 90 |
-
if head_id != '0': # Not root
|
| 91 |
-
try:
|
| 92 |
-
head_idx = int(head_id) - 1
|
| 93 |
-
if head_idx < len(df):
|
| 94 |
-
head_word = df.iloc[head_idx]['FORM']
|
| 95 |
-
viz_lines.append(f"{word} ({pos}) --{deprel}--> {head_word}")
|
| 96 |
-
else:
|
| 97 |
-
viz_lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
|
| 98 |
-
except (ValueError, IndexError):
|
| 99 |
-
viz_lines.append(f"{word} ({pos}) --{deprel}--> [ERROR]")
|
| 100 |
-
else:
|
| 101 |
-
viz_lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
|
| 102 |
-
|
| 103 |
-
return "\n".join(viz_lines)
|
| 104 |
-
|
| 105 |
-
def process_text(text: str, language: str):
    """Parse *text* and produce the three values shown in the UI.

    Args:
        text: Raw user input. ``None``, empty and whitespace-only input are
            all treated as "nothing to parse".
        language: Language code forwarded to ``parse_text_with_stanza``.

    Returns:
        A 3-tuple ``(conllu, table, visualization)``:

        * ``conllu`` - the CoNLL-U text, or a user-facing message when there
          is nothing to parse or parsing failed.
        * ``table`` - the value returned by ``conllu_to_dataframe`` on
          success, otherwise ``None``.
        * ``visualization`` - the text returned by
          ``create_dependency_visualization``, or a short status message.
    """
    # The table slot is rendered by a gr.Dataframe, which interprets a plain
    # string as the path of a CSV file to load.  Returning "" there makes the
    # UI raise instead of showing the message, so every non-success path
    # returns None (rendered as an empty table) for that slot.
    if not text or not text.strip():
        return "Please enter some text to parse.", None, "No data to display"

    # Parse with Stanza; the helper reports failures as text starting with
    # "Error" rather than raising.
    conllu_output = parse_text_with_stanza(text, language)
    if conllu_output.startswith("Error"):
        return conllu_output, None, "Error in parsing"

    # This is the UI boundary: a failure while building the table or the
    # visualization must not discard the CoNLL-U text that was already
    # produced, so it is reported in the third slot instead of propagating.
    try:
        df = conllu_to_dataframe(conllu_output)
        visualization = create_dependency_visualization(df)
    except Exception as e:
        return conllu_output, None, f"Error creating visualization: {e}"

    return conllu_output, df, visualization
|
| 127 |
-
|
| 128 |
-
# Runs at import time (i.e. when the Space starts): build the module-level
# model registry that the UI reads its language choices from.
print("Initializing Stanza models...")
STANZA_MODELS = initialize_stanza_models()
|
| 131 |
-
|
| 132 |
-
# Create Gradio interface
|
| 133 |
-
def create_gradio_app():
    """Build the Gradio Blocks UI for the Stanza parser.

    Layout, top to bottom: intro text, the input column (text box, language
    dropdown, parse button), the CoNLL-U text output, the token table, the
    dependency text output, example inputs, and a CoNLL-U field reference.

    Language choices come from the keys of the module-level ``STANZA_MODELS``
    mapping.  Both the button click and the text box submit event call
    ``process_text`` with the same inputs and outputs.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    available_languages = list(STANZA_MODELS)
    if "en" in available_languages:
        default_language = "en"
    elif available_languages:
        default_language = available_languages[0]
    else:
        # No model was loaded: indexing [0] would raise IndexError and stop
        # the whole app from starting, so start with nothing selected.
        default_language = None

    # Only offer examples whose language is actually selectable; an example
    # that sets the dropdown to a value outside its choices cannot be parsed.
    example_rows = [
        [sentence, lang]
        for sentence, lang in (
            ("The quick brown fox jumps over the lazy dog.", "en"),
            ("El gato está en la mesa.", "es"),
            ("Le chat est sur la table.", "fr"),
            ("Die Katze ist auf dem Tisch.", "de"),
        )
        if lang in available_languages
    ]

    with gr.Blocks(
        title="Stanza Parser with CoNLL-U Viewer",
        theme=gr.themes.Soft(),
    ) as app:
        gr.Markdown("""
        # Stanza Parser with CoNLL-U Viewer

        This tool uses Stanford's Stanza library to parse sentences and provides:
        - **CoNLL-U Format Output**: Standard linguistic annotation format
        - **Interactive Table**: Browse parsed tokens with linguistic features
        - **Dependency Visualization**: Text-based dependency structure display

        Enter your text below and select a language to get started!
        """)

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter the text you want to parse...",
                    lines=4,
                    value="The quick brown fox jumps over the lazy dog.",
                )

                language_dropdown = gr.Dropdown(
                    choices=available_languages,
                    label="Language",
                    value=default_language,
                    info="Select the language of your input text",
                )

                parse_button = gr.Button("Parse Text", variant="primary")

        with gr.Row():
            with gr.Column():
                gr.Markdown("### CoNLL-U Output")
                conllu_output = gr.Textbox(
                    label="CoNLL-U Format",
                    lines=10,
                    max_lines=20,
                    show_copy_button=True,
                    info="Raw CoNLL-U format output - you can copy this for use in other tools",
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Parsed Data Table")
                data_table = gr.Dataframe(
                    label="Token Analysis",
                    interactive=False,
                    wrap=True,
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Dependency Structure")
                dependency_viz = gr.Textbox(
                    label="Dependency Relationships",
                    lines=8,
                    max_lines=15,
                    show_copy_button=True,
                    info="Text-based visualization of dependency relationships",
                )

        # One wiring spec shared by every trigger so they cannot drift apart.
        parse_inputs = [text_input, language_dropdown]
        parse_outputs = [conllu_output, data_table, dependency_viz]

        parse_button.click(
            fn=process_text,
            inputs=parse_inputs,
            outputs=parse_outputs,
        )

        # Keyboard submit from the text box (Shift+Enter, since the box is
        # multi-line) runs the same parse as the button.
        text_input.submit(
            fn=process_text,
            inputs=parse_inputs,
            outputs=parse_outputs,
        )

        if example_rows:
            gr.Markdown("### Example Texts")
            gr.Examples(
                examples=example_rows,
                inputs=parse_inputs,
                outputs=parse_outputs,
                fn=process_text,
                cache_examples=False,
            )

        gr.Markdown("""
        ### About CoNLL-U Format

        The CoNLL-U format includes these fields for each token:
        - **ID**: Token index
        - **FORM**: Word form or punctuation symbol
        - **LEMMA**: Lemma or stem of word form
        - **UPOS**: Universal part-of-speech tag
        - **XPOS**: Language-specific part-of-speech tag
        - **FEATS**: Morphological features
        - **HEAD**: Head of the current word
        - **DEPREL**: Dependency relation to the head
        - **DEPS**: Enhanced dependency graph
        - **MISC**: Miscellaneous annotations

        For more information about Stanza, visit: https://stanfordnlp.github.io/stanza/
        """)

    return app
|
| 245 |
-
|
| 246 |
-
# Script entry point: build the UI and start serving it.  Importing this
# module from elsewhere does not launch anything.
if __name__ == "__main__":
    app = create_gradio_app()
    app.launch()
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Stanza Parser with CoNLL-U Viewer
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Stanza Parser with CoNLL-U Viewer
|
| 14 |
+
|
| 15 |
+
A comprehensive linguistic analysis tool powered by Stanford's Stanza library that provides sentence parsing with multiple output formats.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- **Multi-language Support**: Parse text in English, Spanish, French, German, Chinese, Russian, and Arabic
|
| 20 |
+
- **CoNLL-U Output**: Get standard linguistic annotation format output
|
| 21 |
+
- **Interactive Data Table**: Browse parsed tokens with all linguistic features
|
| 22 |
+
- **Dependency Visualization**: Text-based visualization of dependency relationships
|
| 23 |
+
- **Copy-friendly Output**: Easy to copy results for use in other tools
|
| 24 |
+
|
| 25 |
+
## What is CoNLL-U?
|
| 26 |
+
|
| 27 |
+
CoNLL-U is a standard format for representing linguistic annotations that includes:
|
| 28 |
+
|
| 29 |
+
- **Tokenization**: Word and sentence boundaries
|
| 30 |
+
- **Part-of-Speech Tagging**: Universal and language-specific POS tags
|
| 31 |
+
- **Lemmatization**: Base forms of words
|
| 32 |
+
- **Morphological Features**: Grammatical attributes
|
| 33 |
+
- **Dependency Parsing**: Syntactic relationships between words
|
| 34 |
+
|
| 35 |
+
## How to Use
|
| 36 |
+
|
| 37 |
+
1. Enter your text in the input box
|
| 38 |
+
2. Select the appropriate language
|
| 39 |
+
3. Click "Parse Text" or press Shift+Enter (the input box is multi-line, so plain Enter inserts a new line)
|
| 40 |
+
4. View results in three formats:
|
| 41 |
+
- Raw CoNLL-U format (copy-paste ready)
|
| 42 |
+
- Interactive data table
|
| 43 |
+
- Dependency structure visualization
|
| 44 |
+
|
| 45 |
+
## Example Output
|
| 46 |
+
|
| 47 |
+
For the sentence "The cat sits on the mat", you'll get:
|
| 48 |
+
|
| 49 |
+
- **CoNLL-U format**: Standard 10-column format with all linguistic features
|
| 50 |
+
- **Data table**: Interactive view of each token's properties
|
| 51 |
+
- **Dependencies**: "cat --nsubj--> sits", "mat --obl--> sits", etc.
|
| 52 |
+
|
| 53 |
+
## Use Cases
|
| 54 |
+
|
| 55 |
+
- **Linguistic Research**: Analyze sentence structure and grammatical relationships
|
| 56 |
+
- **NLP Development**: Generate training data or test parsing models
|
| 57 |
+
- **Educational**: Learn about syntactic analysis and dependency grammar
|
| 58 |
+
- **Text Processing**: Prepare annotated data for downstream tasks
|
| 59 |
+
|
| 60 |
+
## Technical Details
|
| 61 |
+
|
| 62 |
+
This space uses:
|
| 63 |
+
- **Stanza**: Stanford's multilingual NLP toolkit
|
| 64 |
+
- **Gradio**: For the interactive web interface
|
| 65 |
+
- **Pandas**: For data table visualization
|
| 66 |
+
|
| 67 |
+
The models are automatically downloaded and cached when the space starts up.
|
| 68 |
+
|
| 69 |
+
## Supported Languages
|
| 70 |
+
|
| 71 |
+
Currently supports: English (en), Spanish (es), French (fr), German (de), Chinese (zh), Russian (ru), Arabic (ar)
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
*Powered by Stanford Stanza - https://stanfordnlp.github.io/stanza/*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|