Spaces:

sbompolas
/

Lesbian-morphosyntactic-parsing

Sleeping

App Files Files Community

sbompolas committed on Jun 28, 2025

Commit

a600f21

verified ·

1 Parent(s): db0d4ec

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -249

app.py CHANGED Viewed

@@ -1,249 +1,75 @@
-import gradio as gr
-import stanza
-import io
-import pandas as pd
-from typing import List, Dict, Any
-import re
-# Download and initialize Stanza models for multiple languages
-# This will be done when the space starts
-def initialize_stanza_models():
-    """Initialize Stanza models for common languages"""
-    languages = ['en', 'es', 'fr', 'de', 'zh', 'ru', 'ar']
-    models = {}
-    for lang in languages:
-        try:
-            # Download model if not present
-            stanza.download(lang, verbose=False)
-            # Initialize pipeline
-            models[lang] = stanza.Pipeline(lang, processors='tokenize,pos,lemma,depparse', verbose=False)
-            print(f"Loaded {lang} model successfully")
-        except Exception as e:
-            print(f"Failed to load {lang} model: {e}")
-    return models
-# Global variable to store models
-STANZA_MODELS = {}
-def parse_text_with_stanza(text: str, language: str) -> str:
-    """Parse text using Stanza and return CoNLL-U format"""
-    if language not in STANZA_MODELS:
-        return f"Error: Language '{language}' not available. Available languages: {list(STANZA_MODELS.keys())}"
-    try:
-        # Process the text
-        doc = STANZA_MODELS[language](text)
-        # Convert to CoNLL-U format
-        conllu_output = doc.to_conllu()
-        return conllu_output
-    except Exception as e:
-        return f"Error processing text: {str(e)}"
-def conllu_to_dataframe(conllu_text: str) -> pd.DataFrame:
-    """Convert CoNLL-U text to pandas DataFrame for visualization"""
-    lines = conllu_text.strip().split('\n')
-    data = []
-    for line in lines:
-        # Skip comments and empty lines
-        if line.startswith('#') or not line.strip():
-            continue
-        # Parse CoNLL-U format
-        parts = line.split('\t')
-        if len(parts) >= 10:
-            data.append({
-                'ID': parts[0],
-                'FORM': parts[1],
-                'LEMMA': parts[2],
-                'UPOS': parts[3],
-                'XPOS': parts[4],
-                'FEATS': parts[5],
-                'HEAD': parts[6],
-                'DEPREL': parts[7],
-                'DEPS': parts[8],
-                'MISC': parts[9]
-            })
-    return pd.DataFrame(data)
-def create_dependency_visualization(df: pd.DataFrame) -> str:
-    """Create a simple text-based dependency visualization"""
-    if df.empty:
-        return "No data to visualize"
-    viz_lines = []
-    viz_lines.append("Dependency Parse Visualization:")
-    viz_lines.append("=" * 50)
-    for _, row in df.iterrows():
-        word = row['FORM']
-        pos = row['UPOS']
-        deprel = row['DEPREL']
-        head_id = row['HEAD']
-        # Find the head word
-        if head_id != '0':  # Not root
-            try:
-                head_idx = int(head_id) - 1
-                if head_idx < len(df):
-                    head_word = df.iloc[head_idx]['FORM']
-                    viz_lines.append(f"{word} ({pos}) --{deprel}--> {head_word}")
-                else:
-                    viz_lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
-            except (ValueError, IndexError):
-                viz_lines.append(f"{word} ({pos}) --{deprel}--> [ERROR]")
-        else:
-            viz_lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
-    return "\n".join(viz_lines)
-def process_text(text: str, language: str):
-    """Main processing function that returns all outputs"""
-    if not text.strip():
-        return "Please enter some text to parse.", "", "No data to display"
-    # Parse with Stanza
-    conllu_output = parse_text_with_stanza(text, language)
-    if conllu_output.startswith("Error"):
-        return conllu_output, "", "Error in parsing"
-    # Convert to DataFrame
-    try:
-        df = conllu_to_dataframe(conllu_output)
-        # Create visualization
-        visualization = create_dependency_visualization(df)
-        return conllu_output, df, visualization
-    except Exception as e:
-        return conllu_output, "", f"Error creating visualization: {str(e)}"
-# Initialize models (this will run when the space starts)
-print("Initializing Stanza models...")
-STANZA_MODELS = initialize_stanza_models()
-# Create Gradio interface
-def create_gradio_app():
-    with gr.Blocks(title="Stanza Parser with CoNLL-U Viewer", theme=gr.themes.Soft()) as app:
-        gr.Markdown("""
-        # Stanza Parser with CoNLL-U Viewer
-        This tool uses Stanford's Stanza library to parse sentences and provides:
-        - **CoNLL-U Format Output**: Standard linguistic annotation format
-        - **Interactive Table**: Browse parsed tokens with linguistic features
-        - **Dependency Visualization**: Text-based dependency structure display
-        Enter your text below and select a language to get started!
-        """)
-        with gr.Row():
-            with gr.Column(scale=2):
-                text_input = gr.Textbox(
-                    label="Input Text",
-                    placeholder="Enter the text you want to parse...",
-                    lines=4,
-                    value="The quick brown fox jumps over the lazy dog."
-                )
-                language_dropdown = gr.Dropdown(
-                    choices=list(STANZA_MODELS.keys()),
-                    label="Language",
-                    value="en" if "en" in STANZA_MODELS else list(STANZA_MODELS.keys())[0],
-                    info="Select the language of your input text"
-                )
-                parse_button = gr.Button("Parse Text", variant="primary")
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("### CoNLL-U Output")
-                conllu_output = gr.Textbox(
-                    label="CoNLL-U Format",
-                    lines=10,
-                    max_lines=20,
-                    show_copy_button=True,
-                    info="Raw CoNLL-U format output - you can copy this for use in other tools"
-                )
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("### Parsed Data Table")
-                data_table = gr.Dataframe(
-                    label="Token Analysis",
-                    interactive=False,
-                    wrap=True
-                )
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown("### Dependency Structure")
-                dependency_viz = gr.Textbox(
-                    label="Dependency Relationships",
-                    lines=8,
-                    max_lines=15,
-                    show_copy_button=True,
-                    info="Text-based visualization of dependency relationships"
-                )
-        # Event handling
-        parse_button.click(
-            fn=process_text,
-            inputs=[text_input, language_dropdown],
-            outputs=[conllu_output, data_table, dependency_viz]
-        )
-        # Also trigger on Enter in text input
-        text_input.submit(
-            fn=process_text,
-            inputs=[text_input, language_dropdown],
-            outputs=[conllu_output, data_table, dependency_viz]
-        )
-        # Add examples
-        gr.Markdown("### Example Texts")
-        examples = [
-            ["The quick brown fox jumps over the lazy dog.", "en"],
-            ["El gato está en la mesa.", "es"],
-            ["Le chat est sur la table.", "fr"],
-            ["Die Katze ist auf dem Tisch.", "de"],
-        ]
-        gr.Examples(
-            examples=examples,
-            inputs=[text_input, language_dropdown],
-            outputs=[conllu_output, data_table, dependency_viz],
-            fn=process_text,
-            cache_examples=False
-        )
-        gr.Markdown("""
-        ### About CoNLL-U Format
-        The CoNLL-U format includes these fields for each token:
-        - **ID**: Token index
-        - **FORM**: Word form or punctuation symbol
-        - **LEMMA**: Lemma or stem of word form
-        - **UPOS**: Universal part-of-speech tag
-        - **XPOS**: Language-specific part-of-speech tag
-        - **FEATS**: Morphological features
-        - **HEAD**: Head of the current word
-        - **DEPREL**: Dependency relation to the head
-        - **DEPS**: Enhanced dependency graph
-        - **MISC**: Miscellaneous annotations
-        For more information about Stanza, visit: https://stanfordnlp.github.io/stanza/
-        """)
-    return app
-# Create and launch the app
-if __name__ == "__main__":
-    app = create_gradio_app()
-    app.launch()

+---
+title: Stanza Parser with CoNLL-U Viewer
+emoji: 🔍
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
+# Stanza Parser with CoNLL-U Viewer
+A comprehensive linguistic analysis tool powered by Stanford's Stanza library that provides sentence parsing with multiple output formats.
+## Features
+- **Multi-language Support**: Parse text in English, Spanish, French, German, Chinese, Russian, and Arabic
+- **CoNLL-U Output**: Get standard linguistic annotation format output
+- **Interactive Data Table**: Browse parsed tokens with all linguistic features
+- **Dependency Visualization**: Text-based visualization of dependency relationships
+- **Copy-friendly Output**: Easy to copy results for use in other tools
+## What is CoNLL-U?
+CoNLL-U is a standard format for representing linguistic annotations that includes:
+- **Tokenization**: Word and sentence boundaries
+- **Part-of-Speech Tagging**: Universal and language-specific POS tags
+- **Lemmatization**: Base forms of words
+- **Morphological Features**: Grammatical attributes
+- **Dependency Parsing**: Syntactic relationships between words
+## How to Use
+1. Enter your text in the input box
+2. Select the appropriate language
+3. Click "Parse Text" or press Shift+Enter (plain Enter inserts a new line in the multi-line input box)
+4. View results in three formats:
+   - Raw CoNLL-U format (copy-paste ready)
+   - Interactive data table
+   - Dependency structure visualization
+## Example Output
+For the sentence "The cat sits on the mat", you'll get:
+- **CoNLL-U format**: Standard 10-column format with all linguistic features
+- **Data table**: Interactive view of each token's properties
+- **Dependencies**: "cat --nsubj--> sits", "mat --obl--> sits", etc.
+## Use Cases
+- **Linguistic Research**: Analyze sentence structure and grammatical relationships
+- **NLP Development**: Generate training data or test parsing models
+- **Educational**: Learn about syntactic analysis and dependency grammar
+- **Text Processing**: Prepare annotated data for downstream tasks
+## Technical Details
+This space uses:
+- **Stanza**: Stanford's multilingual NLP toolkit
+- **Gradio**: For the interactive web interface
+- **Pandas**: For data table visualization
+The models are automatically downloaded and cached when the space starts up.
+## Supported Languages
+Currently supports: English (en), Spanish (es), French (fr), German (de), Chinese (zh), Russian (ru), Arabic (ar)
+---
+*Powered by Stanford Stanza - https://stanfordnlp.github.io/stanza/*