Spaces:

sbompolas
/

Lesbian-morphosyntactic-parsing

Sleeping

App Files Files Community

sbompolas committed on Jun 30, 2025

Commit

8c61d1b

verified ·

1 Parent(s): be6e194

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -177

app.py CHANGED Viewed

@@ -3,92 +3,122 @@ import stanza
 import pandas as pd
 import sys
 import traceback
-from huggingface_hub import hf_hub_download
 import os
-# Global variable to store the model
 LESBIAN_GREEK_MODEL = None
-def download_and_load_model():
-    """Download and load the Lesbian Greek model from Hugging Face"""
     global LESBIAN_GREEK_MODEL
     try:
-        print("Downloading Lesbian Greek model components from Hugging Face...")
-        # Download model files
-        model_files = {
-            'tokenizer': 'tokenizer.pt',
-            'lemmatizer': 'lemmatizer.pt',
-            'pos': 'pos.pt',
-            'depparse': 'depparse.pt'
-        }
-        model_paths = {}
-        for component, filename in model_files.items():
-            try:
-                print(f"Downloading {filename}...")
-                model_path = hf_hub_download(
-                    repo_id="sbompolas/Lesbian-Greek-Morphosyntactic-Model",
-                    filename=filename,
-                    cache_dir="./model_cache"
-                )
-                model_paths[component] = model_path
-                print(f"Successfully downloaded {filename}")
-            except Exception as e:
-                print(f"Failed to download {filename}: {e}")
-                return False
-        # Initialize the Stanza pipeline with downloaded models
-        print("Initializing Stanza pipeline with custom models...")
-        LESBIAN_GREEK_MODEL = stanza.Pipeline(
-            lang='grc',  # Ancient Greek language code
-            use_gpu=False,
-            processors='tokenize,lemma,pos,depparse',
-            tokenize_model_path=model_paths['tokenizer'],
-            lemma_model_path=model_paths['lemmatizer'],
-            pos_model_path=model_paths['pos'],
-            depparse_model_path=model_paths['depparse'],
-            tokenize_pretokenized=False,
-            verbose=True
-        )
-        print("Successfully loaded Lesbian Greek Stanza pipeline!")
-        return True
-    except Exception as e:
-        print(f"Failed to download/load model: {e}")
-        traceback.print_exc()
-        # Fallback: try to initialize with direct URLs
-        try:
-            print("Trying direct URL approach...")
-            tokenize_model_path = 'https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/tokenizer.pt'
-            lemma_model_path = 'https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/lemmatizer.pt'
-            pos_model_path = 'https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/pos.pt'
-            depparse_model_path = 'https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/depparse.pt'
-            LESBIAN_GREEK_MODEL = stanza.Pipeline(
-                lang='grc',
-                use_gpu=False,
-                processors='tokenize,lemma,pos,depparse',
-                tokenize_model_path=tokenize_model_path,
-                lemma_model_path=lemma_model_path,
-                pos_model_path=pos_model_path,
-                depparse_model_path=depparse_model_path,
-                tokenize_pretokenized=False,
-                verbose=True
-            )
-            print("Successfully loaded with direct URLs!")
-            return True
-        except Exception as e2:
-            print(f"Direct URL approach also failed: {e2}")
-            return False
 def stanza_doc_to_conllu(doc) -> str:
-    """Convert Stanza Document to CoNLL-U format"""
     conllu_lines = []
     for sent_idx, sentence in enumerate(doc.sentences):
@@ -117,8 +147,8 @@ def stanza_doc_to_conllu(doc) -> str:
     return "\n".join(conllu_lines)
-def parse_text_with_model(text: str) -> str:
-    """Parse Lesbian Greek text using Stanza and return CoNLL-U format"""
     global LESBIAN_GREEK_MODEL
     if LESBIAN_GREEK_MODEL is None:
@@ -128,11 +158,11 @@ def parse_text_with_model(text: str) -> str:
         return "Error: Please enter some text to parse."
     try:
-        print(f"Processing text: {text[:50]}...")
         # Process the text
         doc = LESBIAN_GREEK_MODEL(text)
-        # Convert to CoNLL-U format
         conllu_output = stanza_doc_to_conllu(doc)
         print("CoNLL-U conversion successful!")
         return conllu_output
@@ -187,15 +217,13 @@ def create_dependency_visualization(df: pd.DataFrame) -> str:
     try:
         viz_lines = []
         viz_lines.append("Dependency Parse Visualization:")
-        viz_lines.append("=" * 60)
         for _, row in df.iterrows():
             word = row['FORM']
             pos = row['UPOS']
-            lemma = row['LEMMA']
             deprel = row['DEPREL']
             head_id = row['HEAD']
-            feats = row['FEATS']
             # Find the head word
             if head_id != '0':  # Not root
@@ -203,47 +231,28 @@ def create_dependency_visualization(df: pd.DataFrame) -> str:
                     head_idx = int(head_id) - 1
                     if 0 <= head_idx < len(df):
                         head_word = df.iloc[head_idx]['FORM']
-                        viz_lines.append(f"{word} [{lemma}] ({pos}) --{deprel}--> {head_word}")
                     else:
-                        viz_lines.append(f"{word} [{lemma}] ({pos}) --{deprel}--> [OUT_OF_RANGE]")
                 except (ValueError, IndexError):
-                    viz_lines.append(f"{word} [{lemma}] ({pos}) --{deprel}--> [ERROR]")
             else:
-                viz_lines.append(f"{word} [{lemma}] ({pos}) --{deprel}--> ROOT")
-            # Add morphological features if available
-            if feats != "_" and feats.strip():
-                viz_lines.append(f"    Morphology: {feats}")
-                viz_lines.append("")  # Add blank line for readability
         return "\n".join(viz_lines)
     except Exception as e:
         return f"Error creating visualization: {str(e)}"
-def get_model_info():
-    """Get information about the loaded model"""
-    global LESBIAN_GREEK_MODEL
-    if LESBIAN_GREEK_MODEL is None:
-        return "Model not loaded"
-    try:
-        info = []
-        info.append(f"Language: {LESBIAN_GREEK_MODEL.lang}")
-        info.append(f"Processors: {', '.join(LESBIAN_GREEK_MODEL.processors.keys())}")
-        return " | ".join(info)
-    except:
-        return "Model loaded successfully"
 def process_text(text: str):
     """Main processing function that returns all outputs"""
     if not text.strip():
         empty_df = pd.DataFrame()
         return "Please enter some Lesbian Greek text to parse.", empty_df, "No data to display"
-    # Parse with Stanza
     print(f"Starting to process: {text[:30]}...")
-    conllu_output = parse_text_with_model(text)
     if conllu_output.startswith("Error"):
         empty_df = pd.DataFrame()
@@ -267,12 +276,13 @@ def process_text(text: str):
         empty_df = pd.DataFrame()
         return conllu_output, empty_df, error_msg
-# Initialize model
 print("Initializing Lesbian Greek Stanza model...")
-model_loaded = download_and_load_model()
 if not model_loaded:
-    print("WARNING: Could not load Lesbian Greek model! The app will still run but parsing may not work.")
 # Create Gradio interface
 def create_gradio_app():
@@ -280,36 +290,55 @@ def create_gradio_app():
         gr.Markdown("""
         # Lesbian Greek Morphosyntactic Parser
-        This tool uses the Lesbian Greek Morphosyntactic Model from Hugging Face to parse Ancient Greek text (specifically Lesbian dialect) and provides:
-        - **CoNLL-U Format Output**: Standard linguistic annotation format
-        - **Interactive Table**: Browse parsed tokens with linguistic features
-        - **Dependency Visualization**: Text-based dependency structure display
-        - **Morphological Analysis**: Detailed morphological features for each token
-        **Model**: [sbompolas/Lesbian-Greek-Morphosyntactic-Model](https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model)
         Enter your Lesbian Greek text below to get started!
         """)
-        # Model status
-        with gr.Row():
-            model_status = gr.Textbox(
-                label="Model Status",
-                value=get_model_info(),
-                interactive=False,
-                max_lines=1
-            )
         with gr.Row():
             with gr.Column():
                 text_input = gr.Textbox(
                     label="Lesbian Greek Text Input",
-                    placeholder="Enter the Lesbian Greek text you want to analyze...",
                     lines=4,
-                    value="τὰν δὲ πάντα ταῦτα"
                 )
-                parse_button = gr.Button("Parse Text", variant="primary", size="lg")
         with gr.Row():
             with gr.Column():
@@ -319,7 +348,7 @@ def create_gradio_app():
                     lines=10,
                     max_lines=20,
                     show_copy_button=True,
-                    info="Raw CoNLL-U format output - you can copy this for use in other tools"
                 )
         with gr.Row():
@@ -333,77 +362,67 @@ def create_gradio_app():
         with gr.Row():
             with gr.Column():
-                gr.Markdown("### Dependency Structure & Morphology")
                 dependency_viz = gr.Textbox(
                     label="Dependency Relationships",
-                    lines=12,
-                    max_lines=25,
                     show_copy_button=True,
-                    info="Text-based visualization of dependency relationships with morphological features"
                 )
         # Event handling
-        parse_button.click(
-            fn=process_text,
-            inputs=[text_input],
-            outputs=[conllu_output, data_table, dependency_viz]
-        )
-        # Also trigger on Enter in text input
-        text_input.submit(
-            fn=process_text,
-            inputs=[text_input],
-            outputs=[conllu_output, data_table, dependency_viz]
-        )
-        # Add example texts
-        gr.Markdown("### Example Lesbian Greek Texts")
-        examples = [
-            ["τὰν δὲ πάντα ταῦτα"],
-            ["ἔρος δηὖτέ μ᾽ ὀ λυσιμέλης δόνει"],
-            ["παῖς τάδ᾽ ἀμφὶ πάντα"],
-            ["κάλλιστον μὲν ἐγὼ φαίμι"],
-            ["οὐδ᾽ ἴα μάλα λάμπρα"],
-            ["ἀλλὰ τίς σ᾽, ὦ Ψάπφ᾽, ἀδικήει;"],
-        ]
-        gr.Examples(
-            examples=examples,
-            inputs=[text_input],
-            outputs=[conllu_output, data_table, dependency_viz],
-            fn=process_text,
-            cache_examples=False
-        )
-        gr.Markdown("""
-        ### About This Tool
-        This parser is specifically designed for Ancient Greek texts in the Lesbian dialect (associated with Sappho and other poets from Lesbos).
-        The model provides detailed morphosyntactic analysis including:
-        **CoNLL-U Format Fields:**
         - **ID**: Token index
         - **FORM**: Word form or punctuation symbol
         - **LEMMA**: Lemma or stem of word form
         - **UPOS**: Universal part-of-speech tag
         - **XPOS**: Language-specific part-of-speech tag
-        - **FEATS**: Morphological features (case, number, gender, tense, etc.)
         - **HEAD**: Head of the current word
         - **DEPREL**: Dependency relation to the head
         - **DEPS**: Enhanced dependency graph
         - **MISC**: Miscellaneous annotations
-        **About the Model:**
-        - Built with Stanford Stanza framework
-        - Trained specifically on Lesbian Greek texts
-        - Provides accurate morphological analysis for this dialect
-        - Includes syntactic parsing with dependency relations
-        - Components: tokenizer, lemmatizer, POS tagger, dependency parser
-        **Credits:**
-        - Model: [sbompolas/Lesbian-Greek-Morphosyntactic-Model](https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model)
-        - Built with Stanford Stanza framework
-        - Interface created with Gradio
         """)
     return app

 import pandas as pd
 import sys
 import traceback
 import os
+import tempfile
+import requests
+from pathlib import Path
+# Global variable to store the Lesbian Greek model
 LESBIAN_GREEK_MODEL = None
def download_model_file(url, filename, timeout=60):
    """Stream the content at ``url`` into the local file ``filename``.

    The body is first written to ``<filename>.part`` and only moved to
    ``filename`` once the whole response has been received, so an
    interrupted transfer never leaves a truncated file at the final path.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Destination path (string) of the downloaded file.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection fails instead of hanging.

    Returns:
        True when the file was fully written, False on any failure (the
        error is printed, not raised).
    """
    partial_path = f"{filename}.part"
    try:
        print(f"Downloading {filename}...")
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the finished download under its final name in one step.
        os.replace(partial_path, filename)
        print(f"Successfully downloaded {filename}")
        return True
    except Exception as e:
        print(f"Failed to download {filename}: {e}")
        # Best-effort cleanup: the partial file may not exist at all if the
        # request failed before anything was written.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False
def setup_lesbian_greek_models(models_dir="./lesbian_greek_models"):
    """Make sure the four Lesbian Greek model files are present locally.

    Each file that is missing (or empty) in ``models_dir`` is fetched with
    ``download_model_file``; files already present are reused.

    Args:
        models_dir: Directory that holds the downloaded ``.pt`` files.
            Defaults to ``./lesbian_greek_models``.

    Returns:
        ``(True, Path)`` with the models directory on success, or
        ``(False, str)`` with an error message naming the file whose
        download failed.
    """
    models_dir = Path(models_dir)
    models_dir.mkdir(parents=True, exist_ok=True)

    # Local file name -> file name in the Hugging Face repository.
    base_url = "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main"
    remote_names = {
        "tokenizer.pt": "el_test_tokenizer.pt",
        "lemmatizer.pt": "el_test_nocharlm_lemmatizer.pt",
        "pos.pt": "el_test_transformer_tagger.pt",
        "depparse.pt": "el_test_transformer_parser.pt",
    }

    for local_name, remote_name in remote_names.items():
        local_path = models_dir / local_name
        # An empty file cannot be a usable model, so only a non-empty file
        # counts as already downloaded.
        if local_path.exists() and local_path.stat().st_size > 0:
            continue
        if not download_model_file(f"{base_url}/{remote_name}", str(local_path)):
            return False, f"Failed to download {local_name}"

    return True, models_dir
def initialize_lesbian_greek_model():
    """Create the Stanza pipeline from the custom Lesbian Greek models.

    Obtains the model files via ``setup_lesbian_greek_models``, copies them
    into ``./stanza_resources/el/<processor>/lesbian_greek.pt`` and builds a
    ``stanza.Pipeline`` that points at those copies. On success the pipeline
    is stored in the module-level ``LESBIAN_GREEK_MODEL``.

    Returns:
        ``(True, "Model loaded successfully")`` on success, otherwise
        ``(False, error_message)``. Exceptions are caught, printed with a
        traceback, and reported through the message instead of propagating.
    """
    global LESBIAN_GREEK_MODEL
    import shutil

    try:
        print("Setting up Lesbian Greek models...")
        success, models_dir = setup_lesbian_greek_models()
        if not success:
            return False, models_dir  # models_dir contains error message in this case

        print("Creating Stanza pipeline with custom Lesbian Greek models...")
        stanza_dir = Path("./stanza_resources")
        lang_dir = stanza_dir / "el"
        os.environ['STANZA_RESOURCES_DIR'] = str(stanza_dir)

        # Stanza processor name -> downloaded model file.
        processors = {
            "tokenize": "tokenizer.pt",
            "lemma": "lemmatizer.pt",
            "pos": "pos.pt",
            "depparse": "depparse.pt",
        }

        model_paths = {}
        for processor, model_file in processors.items():
            src_path = models_dir / model_file
            if not src_path.exists():
                # Report the real cause here rather than letting the
                # pipeline fail later on a path that was never created.
                return False, f"Model file missing after setup: {src_path}"
            proc_dir = lang_dir / processor
            proc_dir.mkdir(parents=True, exist_ok=True)
            dst_path = proc_dir / "lesbian_greek.pt"
            shutil.copy2(str(src_path), str(dst_path))
            print(f"Copied {model_file} to {dst_path}")
            model_paths[f"{processor}_model_path"] = str(dst_path)

        config = {
            'processors': 'tokenize,pos,lemma,depparse',
            'lang': 'el',
            # NOTE(review): the environment variable above is set after
            # stanza has been imported, which may be too late for it to take
            # effect; the directory is therefore also passed explicitly.
            # Confirm the parameter name against the installed Stanza version.
            'dir': str(stanza_dir),
            **model_paths,
            'use_gpu': False,
            'verbose': True,
        }

        print("Initializing Lesbian Greek pipeline...")
        LESBIAN_GREEK_MODEL = stanza.Pipeline(**config)
        print("Lesbian Greek model loaded successfully!")
        return True, "Model loaded successfully"
    except Exception as e:
        error_msg = f"Failed to load Lesbian Greek model: {e}"
        print(error_msg)
        traceback.print_exc()
        return False, error_msg
 def stanza_doc_to_conllu(doc) -> str:
+    """Convert Stanza Document to CoNLL-U format manually"""
     conllu_lines = []
     for sent_idx, sentence in enumerate(doc.sentences):
     return "\n".join(conllu_lines)
+def parse_text_with_lesbian_greek(text: str) -> str:
+    """Parse Lesbian Greek text using custom Stanza models and return CoNLL-U format"""
     global LESBIAN_GREEK_MODEL
     if LESBIAN_GREEK_MODEL is None:
         return "Error: Please enter some text to parse."
     try:
+        print(f"Processing Lesbian Greek text: {text[:50]}...")
         # Process the text
         doc = LESBIAN_GREEK_MODEL(text)
+        # Convert to CoNLL-U format manually
         conllu_output = stanza_doc_to_conllu(doc)
         print("CoNLL-U conversion successful!")
         return conllu_output
     try:
         viz_lines = []
         viz_lines.append("Dependency Parse Visualization:")
+        viz_lines.append("=" * 50)
         for _, row in df.iterrows():
             word = row['FORM']
             pos = row['UPOS']
             deprel = row['DEPREL']
             head_id = row['HEAD']
             # Find the head word
             if head_id != '0':  # Not root
                     head_idx = int(head_id) - 1
                     if 0 <= head_idx < len(df):
                         head_word = df.iloc[head_idx]['FORM']
+                        viz_lines.append(f"{word} ({pos}) --{deprel}--> {head_word}")
                     else:
+                        viz_lines.append(f"{word} ({pos}) --{deprel}--> [OUT_OF_RANGE]")
                 except (ValueError, IndexError):
+                    viz_lines.append(f"{word} ({pos}) --{deprel}--> [ERROR]")
             else:
+                viz_lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
         return "\n".join(viz_lines)
     except Exception as e:
         return f"Error creating visualization: {str(e)}"
 def process_text(text: str):
     """Main processing function that returns all outputs"""
     if not text.strip():
         empty_df = pd.DataFrame()
         return "Please enter some Lesbian Greek text to parse.", empty_df, "No data to display"
+    # Parse with custom Lesbian Greek model
     print(f"Starting to process: {text[:30]}...")
+    conllu_output = parse_text_with_lesbian_greek(text)
     if conllu_output.startswith("Error"):
         empty_df = pd.DataFrame()
         empty_df = pd.DataFrame()
         return conllu_output, empty_df, error_msg
+# Initialize Lesbian Greek model
 print("Initializing Lesbian Greek Stanza model...")
+model_loaded, status_message = initialize_lesbian_greek_model()
 if not model_loaded:
+    print(f"CRITICAL ERROR: {status_message}")
+    # Don't exit, allow app to start and show error message
 # Create Gradio interface
 def create_gradio_app():
         gr.Markdown("""
         # Lesbian Greek Morphosyntactic Parser
+        This tool uses custom Stanza models trained specifically for the **Lesbian dialect of Greek**
+        (spoken on the island of Lesbos). The models provide:
+        - **Tokenization**: Splits text into tokens
+        - **POS Tagging**: Part-of-speech classification
+        - **Lemmatization**: Base form identification
+        - **Dependency Parsing**: Syntactic relationship analysis
+        - **CoNLL-U Output**: Standard linguistic annotation format
+        ## About the Models
+        These models were trained on a curated treebank of 540 sentences from both oral and written
+        sources collected from various villages of Lesbos, including Agra, Chidira, Eressos,
+        Pterounta, Mesotopos, and Parakoila.
+        **Citation**: Bompolas, S., Markantonatou, S., Ralli, A., & Anastasopoulos, A. (2025).
+        Crossing Dialectal Boundaries: Building a Treebank for the Dialect of Lesbos through
+        Knowledge Transfer from Standard Modern Greek.
         Enter your Lesbian Greek text below to get started!
         """)
+        if not model_loaded:
+            gr.Markdown(f"""
+            ⚠️ **Model Loading Error**: {status_message}
+            The Lesbian Greek models could not be loaded. This may be due to:
+            - Network issues downloading the models
+            - Insufficient memory or storage
+            - Model compatibility issues
+            Please try refreshing the page or contact the developers.
+            """)
         with gr.Row():
             with gr.Column():
                 text_input = gr.Textbox(
                     label="Lesbian Greek Text Input",
+                    placeholder="Εισάγετε το κείμενο στη Λεσβιακή διάλεκτο...",
                     lines=4,
+                    value="Τα παιδιά πάντ στο κήπ." if model_loaded else ""
                 )
+                parse_button = gr.Button(
+                    "Parse Lesbian Greek Text",
+                    variant="primary",
+                    size="lg",
+                    interactive=model_loaded
+                )
         with gr.Row():
             with gr.Column():
                     lines=10,
                     max_lines=20,
                     show_copy_button=True,
+                    info="Raw CoNLL-U format output optimized for Lesbian Greek dialect"
                 )
         with gr.Row():
         with gr.Row():
             with gr.Column():
+                gr.Markdown("### Dependency Structure")
                 dependency_viz = gr.Textbox(
                     label="Dependency Relationships",
+                    lines=8,
+                    max_lines=15,
                     show_copy_button=True,
+                    info="Text-based visualization of syntactic dependencies"
                 )
         # Event handling
+        if model_loaded:
+            parse_button.click(
+                fn=process_text,
+                inputs=[text_input],
+                outputs=[conllu_output, data_table, dependency_viz]
+            )
+            # Also trigger on Enter in text input
+            text_input.submit(
+                fn=process_text,
+                inputs=[text_input],
+                outputs=[conllu_output, data_table, dependency_viz]
+            )
+        # Add Lesbian Greek examples (if available)
+        if model_loaded:
+            gr.Markdown("### Example Lesbian Greek Texts")
+            examples = [
+                ["Τα παιδιά πάντ στο κήπ."],
+                ["Η γάτα κάθεται στο τραπέζ."],
+                ["Ο ήλιος λάμπει στον ουρανό."],
+                ["Η θάλασσα είναι γαλάζια και όμορφη."],
+            ]
+            gr.Examples(
+                examples=examples,
+                inputs=[text_input],
+                outputs=[conllu_output, data_table, dependency_viz],
+                fn=process_text,
+                cache_examples=False
+            )
+        gr.Markdown("""
+        ### About CoNLL-U Format
+        The CoNLL-U format includes these fields for each token:
         - **ID**: Token index
         - **FORM**: Word form or punctuation symbol
         - **LEMMA**: Lemma or stem of word form
         - **UPOS**: Universal part-of-speech tag
         - **XPOS**: Language-specific part-of-speech tag
+        - **FEATS**: Morphological features
         - **HEAD**: Head of the current word
         - **DEPREL**: Dependency relation to the head
         - **DEPS**: Enhanced dependency graph
         - **MISC**: Miscellaneous annotations
+        ### Resources
+        - [Lesbian Greek Models on Hugging Face](https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model)
+        - [UD_Greek-Lesbian Treebank](https://github.com/UniversalDependencies/UD_Greek-Lesbian)
+        - [Stanza Documentation](https://stanfordnlp.github.io/stanza/)
         """)
     return app