Spaces:

sbompolas
/

Lesbian-morphosyntactic-parsing

Sleeping

App Files Files Community

sbompolas committed on Jul 1, 2025

Commit

96e10be

verified ·

1 Parent(s): 9f3a059

Update app.py

Browse files

Files changed (1) hide show

app.py +873 -130

app.py CHANGED Viewed

@@ -9,173 +9,916 @@ import requests
 from pathlib import Path
 import json
-STANZA_MODELS = {}
-MODEL_CONFIGS = {
-    "Lesbian-only": {
-        "files": {
-            "tokenizer.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/tokenizer.pt",
-            "lemmatizer.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/lemmatizer.pt",
-            "pos.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/pos.pt",
-            "depparse.pt": "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main/depparse.pt"
-        }
-    },
-    "Lesbian-synthetic-data": {
-        "files": {
-            "tokenizer.pt": "https://huggingface.co/sbompolas/NGUD-Lesbian-Morphosyntactic-Model/resolve/main/tokenizer.pt",
-            "lemmatizer.pt": "https://huggingface.co/sbompolas/NGUD-Lesbian-Morphosyntactic-Model/resolve/main/lemmatizer.pt",
-            "pos.pt": "https://huggingface.co/sbompolas/NGUD-Lesbian-Morphosyntactic-Model/resolve/main/pos.pt",
-            "depparse.pt": "https://huggingface.co/sbompolas/NGUD-Lesbian-Morphosyntactic-Model/resolve/main/depparse.pt"
-        }
-    }
-}
 def download_model_file(url, filename):
     try:
         response = requests.get(url, stream=True)
         response.raise_for_status()
         with open(filename, 'wb') as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
         return True
     except Exception as e:
         print(f"Failed to download {filename}: {e}")
         return False
-def setup_model(model_key):
-    models_dir = Path(f"./models_{model_key}")
     models_dir.mkdir(exist_ok=True)
-    model_files = MODEL_CONFIGS[model_key]['files']
-    for fname, url in model_files.items():
-        local_path = models_dir / fname
         if not local_path.exists():
             if not download_model_file(url, str(local_path)):
-                return False, f"Failed to download {fname}"
     return True, models_dir
-def load_model(model_key):
-    if model_key in STANZA_MODELS:
-        return STANZA_MODELS[model_key]
-    success, models_dir = setup_model(model_key)
-    if not success:
-        return None
-    model_paths = {
-        'tokenize_model_path': str(models_dir / "tokenizer.pt"),
-        'pos_model_path': str(models_dir / "pos.pt"),
-        'lemma_model_path': str(models_dir / "lemmatizer.pt"),
-        'depparse_model_path': str(models_dir / "depparse.pt")
-    }
-    config = {
-        'processors': 'tokenize,pos,lemma,depparse',
-        'lang': 'el',
-        'use_gpu': False,
-        'verbose': False,
-        'download_method': None,
-        **model_paths
-    }
-    model = stanza.Pipeline(**config)
-    STANZA_MODELS[model_key] = model
-    return model
-def stanza_doc_to_conllu(doc):
     conllu_lines = []
     for sent_idx, sentence in enumerate(doc.sentences):
         conllu_lines.append(f"# sent_id = {sent_idx + 1}")
         conllu_lines.append(f"# text = {sentence.text}")
         for word in sentence.words:
             fields = [
-                str(word.id), word.text, word.lemma or "_", word.upos or "_",
-                word.xpos or "_", word.feats or "_", str(word.head or 0),
-                word.deprel or "_", "_", "_"
             ]
             conllu_lines.append("\t".join(fields))
         conllu_lines.append("")
     return "\n".join(conllu_lines)
-def conllu_to_dataframe(conllu_text):
-    lines = conllu_text.strip().split('\n')
-    data = []
-    sentence_dfs = []
-    current = []
-    for line in lines:
-        if line.startswith('# text ='):
-            if current:
-                sentence_dfs.append(pd.DataFrame(current))
-                current = []
-        elif not line.startswith('#') and line.strip():
             parts = line.split('\t')
             if len(parts) >= 10:
-                current.append({
-                    'ID': parts[0], 'FORM': parts[1], 'LEMMA': parts[2], 'UPOS': parts[3],
-                    'XPOS': parts[4], 'FEATS': parts[5], 'HEAD': parts[6],
-                    'DEPREL': parts[7], 'DEPS': parts[8], 'MISC': parts[9]
                 })
-    if current:
-        sentence_dfs.append(pd.DataFrame(current))
-    return sentence_dfs
-def create_simple_tree(df):
-    lines = []
-    for _, row in df.iterrows():
-        head = "ROOT" if row['HEAD'] == '0' else df[df['ID'] == row['HEAD']]['FORM'].values[0]
-        lines.append(f"{row['FORM']} ({row['UPOS']}) --{row['DEPREL']}--> {head}")
-    return "\n".join(lines)
-def parse_text(text, model_key):
     if not text.strip():
-        return "Please enter text.", [], "", ""
-    model = load_model(model_key)
-    if model is None:
-        return f"Error loading model: {model_key}", [], "", ""
     try:
-        doc = model(text)
-        conllu = stanza_doc_to_conllu(doc)
-        dfs = conllu_to_dataframe(conllu)
-        trees = [create_simple_tree(df) for df in dfs]
-        return conllu, dfs, trees[0], f"Sentence 1 of {len(dfs)}"
     except Exception as e:
-        return f"Error: {str(e)}", [], "", ""
-def update_sentence(index, dfs):
-    if not dfs:
-        return "", ""
-    df = dfs[index]
-    tree = create_simple_tree(df)
-    return tree, f"Sentence {index+1} of {len(dfs)}"
 def create_gradio_app():
-    with gr.Blocks() as demo:
-        gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
-        model_choice = gr.Dropdown(label="Select Model", choices=list(MODEL_CONFIGS.keys()), value="Lesbian-only")
-        text_input = gr.Textbox(label="Input Text", lines=3, value="Τα παιδιά πάντ στο κήπ.")
-        parse_btn = gr.Button("Parse")
-        conllu_box = gr.Textbox(label="CoNLL-U Output", lines=10)
-        tree_output = gr.Textbox(label="Dependency Tree", lines=10)
-        sentence_label = gr.Textbox(label="Sentence Info", interactive=False)
-        sentence_slider = gr.Slider(minimum=0, maximum=0, step=1, label="Sentence Index")
-        state_dfs = gr.State([])
-        parse_btn.click(
-            fn=parse_text,
-            inputs=[text_input, model_choice],
-            outputs=[conllu_box, state_dfs, tree_output, sentence_label]
-        ).then(
-            fn=lambda dfs: gr.update(maximum=len(dfs)-1, value=0) if dfs else gr.update(visible=False),
-            inputs=state_dfs,
-            outputs=sentence_slider
-        )
-        sentence_slider.change(
-            fn=update_sentence,
-            inputs=[sentence_slider, state_dfs],
-            outputs=[tree_output, sentence_label]
-        )
-    return demo
-if __name__ == '__main__':
     app = create_gradio_app()
-    app.launch()

 from pathlib import Path
 import json
+# Global variable to store the Lesbian Greek model
+LESBIAN_GREEK_MODEL = None
 def download_model_file(url, filename):
+    """Download a model file from Hugging Face"""
     try:
+        print(f"Downloading {filename}...")
         response = requests.get(url, stream=True)
         response.raise_for_status()
         with open(filename, 'wb') as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
+        print(f"Successfully downloaded {filename}")
         return True
     except Exception as e:
         print(f"Failed to download {filename}: {e}")
         return False
def setup_lesbian_greek_models():
    """Download the Lesbian Greek model files that are not present yet.

    Files are stored in ``./lesbian_greek_models``. A file that exists but is
    empty is treated as missing and fetched again, because an empty file can
    only be the leftover of a failed download.

    Returns:
        ``(True, models_dir)`` when all four model files are available, or
        ``(False, error_message)`` naming the first file that failed.
    """
    models_dir = Path("./lesbian_greek_models")
    models_dir.mkdir(parents=True, exist_ok=True)

    # All files live in the same Hugging Face repository.
    base_url = "https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model/resolve/main"
    model_names = ("tokenizer.pt", "lemmatizer.pt", "pos.pt", "depparse.pt")

    for local_name in model_names:
        local_path = models_dir / local_name
        if local_path.exists() and local_path.stat().st_size > 0:
            continue  # already downloaded
        if not download_model_file(f"{base_url}/{local_name}", str(local_path)):
            return False, f"Failed to download {local_name}"

    return True, models_dir
def _custom_model_paths(models_dir):
    """Return the ``*_model_path`` pipeline arguments for the custom models."""
    return {
        'tokenize_model_path': str(models_dir / "tokenizer.pt"),
        'pos_model_path': str(models_dir / "pos.pt"),
        'lemma_model_path': str(models_dir / "lemmatizer.pt"),
        'depparse_model_path': str(models_dir / "depparse.pt"),
    }


def _load_with_temp_resources(models_dir):
    """Strategy 1: explicit model paths plus a stub resources.json."""
    # Create a minimal temporary resources file to satisfy Stanza's requirements
    temp_stanza_dir = Path("./temp_stanza")
    temp_stanza_dir.mkdir(exist_ok=True)
    temp_resources = {
        "el": {
            "lang_name": "Greek",
            "packages": {
                "default": {
                    "processors": {
                        "tokenize": "default.pt",
                        "pos": "default.pt",
                        "lemma": "default.pt",
                        "depparse": "default.pt"
                    }
                }
            },
            "default_packages": {
                "tokenize": "default",
                "pos": "default",
                "lemma": "default",
                "depparse": "default"
            }
        }
    }
    with open(temp_stanza_dir / "resources.json", 'w', encoding='utf-8') as f:
        json.dump(temp_resources, f, indent=2)
    os.environ['STANZA_RESOURCES_DIR'] = str(temp_stanza_dir)

    model_paths = _custom_model_paths(models_dir)
    print("Trying direct model paths with temp resources...")
    for key, path in model_paths.items():
        print(f"  {key}: {path}")
        if not Path(path).exists():
            raise FileNotFoundError(f"Model file not found: {path}")

    pipeline = stanza.Pipeline(
        processors='tokenize,pos,lemma,depparse',
        lang='el',
        use_gpu=False,
        verbose=False,
        download_method=None,
        **model_paths
    )
    print("SUCCESS: Direct model loading with temp resources worked!")
    return pipeline


def _load_with_package_layout(models_dir):
    """Strategy 2: copy the models into a resources tree as package 'lesbian'."""
    import shutil  # only needed by this strategy

    print("Trying improved package approach...")
    stanza_dir = Path("./stanza_resources")
    stanza_dir.mkdir(exist_ok=True)
    os.environ['STANZA_RESOURCES_DIR'] = str(stanza_dir)
    lang_dir = stanza_dir / "el"
    lang_dir.mkdir(exist_ok=True)

    # Processor name -> downloaded file; each is copied to el/<processor>/lesbian.pt
    model_files = {
        "tokenize": "tokenizer.pt",
        "pos": "pos.pt",
        "lemma": "lemmatizer.pt",
        "depparse": "depparse.pt"
    }
    for proc, model_name in model_files.items():
        proc_dir = lang_dir / proc
        proc_dir.mkdir(exist_ok=True)
        src_file = models_dir / model_name
        dst_file = proc_dir / "lesbian.pt"
        if src_file.exists():
            shutil.copy2(str(src_file), str(dst_file))
            print(f"Copied {model_name} to {dst_file}")

    resources_json = {
        "el": {
            "lang_name": "Greek",
            "packages": {
                "lesbian": {
                    "processors": {
                        "tokenize": "lesbian.pt",
                        "pos": "lesbian.pt",
                        "lemma": "lesbian.pt",
                        "depparse": "lesbian.pt"
                    }
                }
            },
            "default_packages": {
                "tokenize": "lesbian",
                "pos": "lesbian",
                "lemma": "lesbian",
                "depparse": "lesbian"
            }
        }
    }
    resources_path = stanza_dir / "resources.json"
    with open(resources_path, 'w', encoding='utf-8') as f:
        json.dump(resources_json, f, indent=2)
    print(f"Created improved resources.json at {resources_path}")

    pipeline = stanza.Pipeline(
        processors='tokenize,pos,lemma,depparse',
        lang='el',
        package='lesbian',
        use_gpu=False,
        verbose=False,
        download_method=None,
        dir=str(stanza_dir)
    )
    print("SUCCESS: Package approach worked!")
    return pipeline


def _load_with_direct_paths(models_dir):
    """Strategy 3: explicit model paths with the resources override removed."""
    print("Trying direct paths without any package system...")
    # Reset environment to avoid conflicts with the earlier strategies
    os.environ.pop('STANZA_RESOURCES_DIR', None)
    pipeline = stanza.Pipeline(
        processors='tokenize,pos,lemma,depparse',
        lang='el',
        use_gpu=False,
        verbose=False,
        **_custom_model_paths(models_dir)
    )
    print("SUCCESS: Direct paths without package system worked!")
    return pipeline


def _load_default_greek():
    """Strategy 4: fall back to Stanza's default Greek pipeline."""
    print("Falling back to default Greek models...")
    return stanza.Pipeline(
        lang='el',
        processors='tokenize,pos,lemma,depparse',
        use_gpu=False,
        verbose=False
    )


def initialize_lesbian_greek_model():
    """Initialize the global Stanza pipeline for Lesbian Greek.

    Downloads the custom models if needed, then tries the loading strategies
    in order and stores the first pipeline that can be built in
    ``LESBIAN_GREEK_MODEL``:

    1. explicit model paths with a stub ``resources.json``;
    2. a resources tree exposing the models as package ``lesbian``;
    3. explicit model paths without any resources override;
    4. the default Greek models (not dialect specific).

    Returns:
        ``(True, status_message)`` when a pipeline was loaded, otherwise
        ``(False, error_message)``.
    """
    global LESBIAN_GREEK_MODEL
    try:
        print("Setting up Lesbian Greek models...")
        success, models_dir = setup_lesbian_greek_models()
        if not success:
            return False, models_dir  # models_dir contains error message in this case

        print("Initializing Lesbian Greek pipeline with custom models...")
        print("Attempting bypass of Stanza resource system...")

        custom_status = "✅ Custom Lesbian Greek models loaded successfully"
        # (loader, status returned on success, log prefix used on failure)
        strategies = [
            (lambda: _load_with_temp_resources(models_dir), custom_status,
             "Direct approach with temp resources failed"),
            (lambda: _load_with_package_layout(models_dir), custom_status,
             "Package approach failed"),
            (lambda: _load_with_direct_paths(models_dir), custom_status,
             "Direct paths approach failed"),
            (_load_default_greek,
             "⚠️ Using default Greek models (not Lesbian dialect specific)",
             "Even fallback failed"),
        ]

        last_error = None
        for load, status, failure_label in strategies:
            try:
                LESBIAN_GREEK_MODEL = load()
                return True, status
            except Exception as err:
                # A failing strategy is expected; log it and try the next one.
                print(f"{failure_label}: {err}")
                last_error = err
        return False, f"All approaches failed. Last error: {str(last_error)}"
    except Exception as e:
        error_msg = f"Failed to initialize: {e}"
        print(error_msg)
        traceback.print_exc()
        return False, error_msg
def stanza_doc_to_conllu(doc) -> str:
    """Convert a Stanza Document to CoNLL-U text.

    Each sentence is written as a ``# sent_id`` comment, a ``# text`` comment
    and one 10-column, tab-separated line per word, followed by a blank line.
    Missing annotations are written as ``_`` (``0`` for a missing head); the
    DEPS and MISC columns are always ``_``.

    Args:
        doc: Object exposing ``sentences``; each sentence exposes ``text`` and
            ``words``, each word ``id``, ``text``, ``lemma``, ``upos``,
            ``xpos``, ``feats``, ``head`` and ``deprel``.

    Returns:
        The CoNLL-U representation, or ``""`` when there are no sentences.
    """
    conllu_lines = []
    for sent_idx, sentence in enumerate(doc.sentences):
        conllu_lines.append(f"# sent_id = {sent_idx + 1}")
        # A comment must stay on one line: fold line breaks inside the
        # sentence text into spaces, otherwise the remainder would be emitted
        # as a stray non-comment line.
        text_one_line = " ".join((sentence.text or "").splitlines())
        conllu_lines.append(f"# text = {text_one_line}")

        for word in sentence.words:
            # CoNLL-U format: ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
            fields = [
                str(word.id),
                word.text,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                str(word.head or 0),
                word.deprel or "_",
                "_",  # DEPS (enhanced dependencies)
                "_",  # MISC
            ]
            conllu_lines.append("\t".join(fields))

        # Empty line between sentences
        conllu_lines.append("")
    return "\n".join(conllu_lines)
def parse_text_with_lesbian_greek(text: str) -> str:
    """Parse text with the loaded pipeline and return CoNLL-U output.

    Args:
        text: Raw input text; ``None``, empty and whitespace-only input are
            rejected with an error message instead of being parsed.

    Returns:
        The CoNLL-U text on success, otherwise a message starting with
        ``"Error"`` (model not loaded, empty input, or processing failure).
    """
    # Read-only access to the module-level pipeline; no ``global`` needed.
    if LESBIAN_GREEK_MODEL is None:
        return "Error: Lesbian Greek model not loaded. Please try refreshing the page."

    if not text or not text.strip():
        return "Error: Please enter some text to parse."

    try:
        print(f"Processing Lesbian Greek text: {text[:50]}...")
        doc = LESBIAN_GREEK_MODEL(text)
        conllu_output = stanza_doc_to_conllu(doc)
        print("CoNLL-U conversion successful!")
        return conllu_output
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the app.
        error_msg = f"Error processing text: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return error_msg
def conllu_to_dataframe(conllu_text: str) -> pd.DataFrame:
    """Convert CoNLL-U text to a pandas DataFrame with one row per token line.

    Comment lines (starting with ``#``) and blank lines are skipped, as are
    lines with fewer than ten tab-separated fields. All values are kept as
    strings.

    Args:
        conllu_text: CoNLL-U text, or an error message starting with
            ``"Error"`` as produced by the parsing step.

    Returns:
        A DataFrame with the columns ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD,
        DEPREL, DEPS and MISC; an empty DataFrame for missing input, error
        messages, or when no token line is found.
    """
    columns = ('ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
               'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC')

    if not conllu_text or conllu_text.startswith("Error"):
        return pd.DataFrame()

    try:
        rows = []
        for raw_line in conllu_text.strip().split('\n'):
            # Tolerate Windows line endings so "\r" does not leak into MISC.
            line = raw_line.rstrip('\r')
            if line.startswith('#') or not line.strip():
                continue
            parts = line.split('\t')
            if len(parts) >= 10:
                # zip stops after the ten known columns; extras are ignored.
                rows.append(dict(zip(columns, parts)))
        return pd.DataFrame(rows)
    except Exception as e:
        print(f"Error creating dataframe: {e}")
        return pd.DataFrame()
def create_dependency_tree_svg(df: pd.DataFrame) -> str:
    """Render the parsed tokens as dependency-tree HTML/SVG.

    The rows are split into sentences wherever the token ID restarts at 1.
    One sentence is rendered directly; several sentences are rendered with
    the navigation wrapper.

    Args:
        df: Token table with at least an ``ID`` column holding integer-like
            strings; the remaining CoNLL-U columns are consumed by the
            renderers.

    Returns:
        An HTML/SVG string, or a ``<p>`` message when there is nothing to
        draw or rendering failed.
    """
    if df.empty:
        return "<p>No data to visualize</p>"

    try:
        sentences = []
        current_sentence = []
        # Plain dict records avoid building a Series per row.
        for row in df.to_dict('records'):
            if int(row['ID']) == 1 and current_sentence:  # new sentence starts
                sentences.append(current_sentence)
                current_sentence = []
            current_sentence.append(row)
        # df is non-empty here, so the last sentence always holds a row.
        sentences.append(current_sentence)

        if len(sentences) > 1:
            return create_multi_sentence_svg(sentences)
        return create_single_sentence_svg(sentences[0])
    except Exception as e:
        import html  # local import: only needed on the error path

        # The exception text may echo input (e.g. a bad ID), so escape it.
        return f"<p>Error creating visualization: {html.escape(str(e))}</p>"
def create_multi_sentence_svg(sentences):
    """Create HTML that shows one sentence SVG at a time with navigation.

    Every sentence is rendered with ``create_single_sentence_svg``. The first
    SVG is embedded directly; all SVGs are also stored as JavaScript string
    literals so the Previous/Next buttons can swap the displayed one.

    Args:
        sentences: Non-empty list of sentences, each a list of token records.

    Returns:
        An HTML fragment containing the container markup and an inline
        ``<script>`` with the navigation logic.
    """
    total = len(sentences)
    sentence_svgs = [
        create_single_sentence_svg(sentence_data, sentence_num=i + 1, total_sentences=total)
        for i, sentence_data in enumerate(sentences)
    ]

    # json.dumps yields a valid JavaScript string literal (quotes, backslashes,
    # control and non-ASCII characters escaped). "</" is additionally written
    # as "<\/" so markup inside an SVG can never close the inline <script>.
    js_literals = [json.dumps(svg).replace("</", "<\\/") for svg in sentence_svgs]
    js_array = ', '.join(js_literals)

    container_id = "sentence_container"

    nav_html = f"""
    <div id="{container_id}" style="border: 1px solid #ddd; padding: 10px; background: white;">
        <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; padding: 5px; background: #f8f9fa; border-radius: 5px;">
            <button onclick="previousSentence()" id="prevBtn" style="padding: 5px 10px; background: #007bff; color: white; border: none; border-radius: 3px; cursor: pointer;">← Previous</button>
            <span id="sentenceCounter" style="font-weight: bold; color: #333;">Sentence 1 of {total}</span>
            <button onclick="nextSentence()" id="nextBtn" style="padding: 5px 10px; background: #007bff; color: white; border: none; border-radius: 3px; cursor: pointer;">Next →</button>
        </div>
        <div id="sentenceDisplay">
            {sentence_svgs[0]}
        </div>
    </div>
    <script>
    (function() {{
        let currentSentence = 0;
        const sentences = [{js_array}];
        const totalSentences = {total};
        function updateDisplay() {{
            document.getElementById('sentenceDisplay').innerHTML = sentences[currentSentence];
            document.getElementById('sentenceCounter').textContent = 'Sentence ' + (currentSentence + 1) + ' of ' + totalSentences;
            const prevBtn = document.getElementById('prevBtn');
            const nextBtn = document.getElementById('nextBtn');
            if (prevBtn) {{
                prevBtn.disabled = currentSentence === 0;
                prevBtn.style.opacity = currentSentence === 0 ? '0.5' : '1';
            }}
            if (nextBtn) {{
                nextBtn.disabled = currentSentence === totalSentences - 1;
                nextBtn.style.opacity = currentSentence === totalSentences - 1 ? '0.5' : '1';
            }}
        }}
        window.nextSentence = function() {{
            if (currentSentence < totalSentences - 1) {{
                currentSentence++;
                updateDisplay();
            }}
        }};
        window.previousSentence = function() {{
            if (currentSentence > 0) {{
                currentSentence--;
                updateDisplay();
            }}
        }};
        // Initialize
        updateDisplay();
    }})();
    </script>
    """
    return nav_html
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Create an SVG dependency tree for one sentence.

    Words are laid out left to right by token ID. Each non-root word gets a
    curved arc to its head, labelled with the relation; the root gets a
    vertical line labelled ROOT. Below each word the UPOS, lemma (when it
    differs from the form), XPOS and every ``key=value`` feature are listed.
    All text taken from the data is HTML-escaped before being placed in the
    markup.

    Args:
        sentence_data: List of token records (or a DataFrame) with the columns
            ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD and DEPREL as strings.
        sentence_num: Accepted for interface compatibility; not used here.
        total_sentences: Accepted for interface compatibility; not used here.

    Returns:
        The SVG markup, or a ``<p>`` error message if rendering failed.
    """
    from html import escape  # local import keeps the block self-contained

    def safe(value):
        """Escape a data value for use as SVG text content."""
        return escape(str(value))

    try:
        if isinstance(sentence_data, list):
            df = pd.DataFrame(sentence_data)
        else:
            df = sentence_data
        rows = df.to_dict('records')

        # Spacing and canvas dimensions
        word_count = len(df)
        base_word_width = 100
        min_spacing = 30
        word_spacing = max(base_word_width, (word_count * base_word_width + min_spacing * (word_count - 1)) / word_count)
        width = max(800, word_count * word_spacing + 100)
        # NOTE(review): the height is fixed, so very tall arc stacks or long
        # feature lists can extend beyond the canvas.
        height = 500

        # Text positioning
        word_y = height - 120
        pos_y = word_y + 20
        features_start_y = pos_y + 15

        # One colour per known relation; unknown relations are drawn in black
        deprel_colors = {
            'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
            'amod': '#8e44ad', 'nmod': '#16a085', 'case': '#34495e', 'punct': '#7f8c8d',
            'cc': '#d35400', 'conj': '#2c3e50', 'cop': '#e74c3c', 'mark': '#9b59b6',
            'csubj': '#3498db', 'xcomp': '#1abc9c', 'ccomp': '#f39c12', 'advcl': '#e91e63',
            'advmod': '#9c27b0', 'obl': '#795548', 'iobj': '#607d8b', 'fixed': '#ff5722',
            'aux': '#ff9800', 'acl': '#4caf50', 'appos': '#673ab7', 'compound': '#009688'
        }

        svg_parts = [
            f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg" style="background: white; border: 1px solid #eee;">',
            '<defs>'
        ]

        # One arrowhead marker per relation colour
        for deprel, color in deprel_colors.items():
            marker_id = f"arrow_{deprel}"
            svg_parts.append(
                f'<marker id="{marker_id}" markerWidth="4" markerHeight="4" '
                f'markerUnits="userSpaceOnUse" orient="auto" refX="3.5" refY="2">'
                f'<path d="M0,0 L4,2 L0,4 Z" fill="{color}"/>'
                f'</marker>'
            )

        svg_parts.append('</defs>')
        svg_parts.append('<g>')

        # Horizontal position of every word, keyed by token ID
        word_positions = {}
        for row in rows:
            word_id = int(row['ID'])
            word_positions[word_id] = 50 + (word_id - 1) * word_spacing

        # Dependency arcs; overlapping spans are pushed to higher levels
        used_spans = []
        for row in rows:
            word_id = int(row['ID'])
            head_id = int(row['HEAD'])
            deprel = row['DEPREL']
            color = deprel_colors.get(deprel, '#000000')

            if head_id == 0:  # root: vertical line with a ROOT label
                word_x = word_positions[word_id]
                svg_parts.append(
                    f'<line x1="{word_x}" y1="{word_y - 15}" x2="{word_x}" y2="50" '
                    f'stroke="{color}" stroke-width="1.5"/>'
                )
                root_label_y = (word_y - 15 + 50) / 2
                svg_parts.append(
                    f'<rect x="{word_x - 15}" y="{root_label_y - 8}" '
                    f'width="30" height="14" fill="white" stroke="{color}" stroke-width="1" rx="2"/>'
                )
                svg_parts.append(
                    f'<text x="{word_x}" y="{root_label_y + 2}" text-anchor="middle" fill="{color}" '
                    f'font-family="Arial, sans-serif" font-size="8" font-weight="bold">ROOT</text>'
                )
                continue

            if head_id not in word_positions:
                continue  # head outside this sentence: nothing to draw

            word_x = word_positions[word_id]
            head_x = word_positions[head_id]

            # Find the lowest level where this span overlaps no earlier arc
            span_start = min(word_id, head_id)
            span_end = max(word_id, head_id)
            level = 0
            conflict_found = True
            while conflict_found:
                conflict_found = False
                for existing_span, existing_level in used_spans:
                    if existing_level == level:
                        if not (span_end < existing_span[0] or span_start > existing_span[1]):
                            conflict_found = True
                            level += 1
                            break
            used_spans.append(((span_start, span_end), level))

            # Arc geometry: quadratic curve, higher for longer spans and levels
            span_distance = abs(head_x - word_x)
            base_height = min(40 + span_distance * 0.15, 100)
            arc_height = base_height + level * 35
            marker_id = f"arrow_{deprel}" if deprel in deprel_colors else "arrow_root"
            mid_x = (word_x + head_x) / 2
            control_y = word_y - arc_height
            path = f'M {word_x} {word_y - 15} Q {mid_x} {control_y} {head_x} {word_y - 15}'
            svg_parts.append(
                f'<path d="{path}" stroke="{color}" stroke-width="1.5" '
                f'fill="none" marker-end="url(#{marker_id})"/>'
            )

            # Relation label on a white box at the curve's midpoint (t = 0.5)
            arc_mid_x = 0.25 * word_x + 0.5 * mid_x + 0.25 * head_x
            arc_mid_y = 0.25 * (word_y - 15) + 0.5 * control_y + 0.25 * (word_y - 15)
            label_width = len(deprel) * 6 + 8
            svg_parts.append(
                f'<rect x="{arc_mid_x - label_width/2}" y="{arc_mid_y - 8}" '
                f'width="{label_width}" height="14" fill="white" stroke="{color}" stroke-width="1" rx="2"/>'
            )
            svg_parts.append(
                f'<text x="{arc_mid_x}" y="{arc_mid_y + 2}" text-anchor="middle" fill="{color}" '
                f'font-family="Arial, sans-serif" font-size="8" font-weight="bold">{safe(deprel)}</text>'
            )

        # Words and their morphological annotations
        for row in rows:
            word = row['FORM']
            pos = row['UPOS']
            lemma = row['LEMMA']
            feats = row['FEATS']
            xpos = row['XPOS']
            word_x = word_positions[int(row['ID'])]

            svg_parts.append(
                f'<text x="{word_x}" y="{word_y}" text-anchor="middle" fill="#000000" '
                f'font-family="Arial, sans-serif" font-size="13" font-weight="bold">{safe(word)}</text>'
            )

            annotations = []
            if pos and pos != '_':
                annotations.append(f"upos={pos}")
            if lemma and lemma != '_' and lemma != word:
                annotations.append(f"lemma={lemma}")
            if xpos and xpos != '_':
                annotations.append(f"xpos={xpos}")
            if feats and feats != '_':
                # Keep every well-formed key=value feature
                annotations.extend(feat for feat in feats.split('|') if '=' in feat)

            for i, annotation in enumerate(annotations):
                y_pos = features_start_y + i * 12
                svg_parts.append(
                    f'<text x="{word_x}" y="{y_pos}" text-anchor="middle" '
                    f'fill="#666666" font-family="Arial, sans-serif" font-size="7">{safe(annotation)}</text>'
                )

        svg_parts.append('</g>')
        svg_parts.append('</svg>')
        return ''.join(svg_parts)

    except Exception as e:
        return f"<p>Error creating single sentence visualization: {safe(e)}</p>"
def create_dependency_visualization(df: pd.DataFrame) -> str:
    """Render each token's dependency edge as one line of plain text.

    Every row of *df* becomes ``FORM (UPOS) --DEPREL--> TARGET`` where TARGET
    is the head's FORM, ``ROOT`` for head 0, ``[OUT_OF_RANGE]`` when no token
    with that ID exists in the same sentence, or ``[ERROR]`` when HEAD is not
    an integer.

    Heads are resolved by token ID inside the sentence the row belongs to,
    not by row position, so a table holding several sentences (whose IDs
    restart at 1) still points each token at the right word.

    Args:
        df: Token table with ``FORM``, ``UPOS``, ``DEPREL`` and ``HEAD``
            columns. An ``ID`` column is used when present; without it rows
            are numbered 1..n and treated as a single sentence.

    Returns:
        The report as a newline-joined string, ``"No data to visualize"`` for
        an empty table, or an ``"Error creating visualization: ..."`` message
        if building the report fails.
    """
    if df.empty:
        return "No data to visualize"
    try:
        has_ids = 'ID' in df.columns
        rows = [row for _, row in df.iterrows()]

        # First pass: split rows into sentences and index FORM by token ID.
        sentence_forms = [{}]  # one {token id -> FORM} map per sentence
        row_sentence = []      # sentence index of each row, parallel to rows
        previous_id = None
        for position, row in enumerate(rows, start=1):
            if has_ids:
                try:
                    token_id = int(row['ID'])
                except (TypeError, ValueError):
                    # Non-integer IDs (e.g. a range such as "1-2") cannot be
                    # a head target; keep the row but do not index it.
                    token_id = None
            else:
                token_id = position
            if token_id is not None:
                # IDs that stop increasing mark the start of a new sentence.
                if previous_id is not None and token_id <= previous_id:
                    sentence_forms.append({})
                sentence_forms[-1][token_id] = row['FORM']
                previous_id = token_id
            row_sentence.append(len(sentence_forms) - 1)

        # Second pass: emit one line per token.
        viz_lines = ["Dependency Parse Visualization:", "=" * 50]
        for sentence_index, row in zip(row_sentence, rows):
            edge = f"{row['FORM']} ({row['UPOS']}) --{row['DEPREL']}--> "
            try:
                head_id = int(row['HEAD'])
            except (TypeError, ValueError):
                viz_lines.append(edge + "[ERROR]")
                continue
            if head_id == 0:  # works for both '0' and integer 0
                target = "ROOT"
            else:
                target = sentence_forms[sentence_index].get(head_id, "[OUT_OF_RANGE]")
            viz_lines.append(f"{edge}{target}")
        return "\n".join(viz_lines)
    except Exception as e:
        return f"Error creating visualization: {str(e)}"
def process_text(text: str):
    """Parse *text* and build every output shown in the interface.

    Args:
        text: Raw user input. ``None``, empty and whitespace-only values are
            all treated as "nothing entered".

    Returns:
        A 4-tuple ``(conllu, dataframe, text_visualization, html)``:
        the CoNLL-U string (or a prompt / error message), the token table
        (empty on any failure), the plain-text dependency listing (or a
        status message), and an HTML fragment with the tree (or a status
        message wrapped in ``<p>``).
    """
    import html  # local import: only needed to escape error text below

    # Guard against None as well as blank strings before calling .strip().
    if not text or not text.strip():
        return (
            "Please enter some Lesbian Greek text to parse.",
            pd.DataFrame(),
            "No data to display",
            "<p>No data to visualize</p>",
        )

    # Parse with custom Lesbian Greek model
    print(f"Starting to process: {text[:30]}...")
    conllu_output = parse_text_with_lesbian_greek(text)
    # A result beginning with "Error" is treated as a parser failure message.
    if conllu_output.startswith("Error"):
        return conllu_output, pd.DataFrame(), "Error in parsing", "<p>Error in parsing</p>"

    # Convert to DataFrame and create visualizations
    try:
        df = conllu_to_dataframe(conllu_output)
        if df.empty:
            return conllu_output, df, "No tokens found", "<p>No tokens found</p>"
        text_visualization = create_dependency_visualization(df)
        svg_visualization = create_dependency_tree_svg(df)
        return conllu_output, df, text_visualization, svg_visualization
    except Exception as e:
        # UI boundary: report the failure instead of crashing the handler.
        error_msg = f"Error creating outputs: {str(e)}"
        print(error_msg)
        # The message may echo user-supplied text, so escape it for HTML.
        return conllu_output, pd.DataFrame(), error_msg, f"<p>{html.escape(error_msg)}</p>"
# Initialize Lesbian Greek model
# Runs once when the module is imported. The two names bound here are module
# globals that create_gradio_app() reads while building the interface.
print("Initializing Lesbian Greek Stanza model...")
model_loaded, status_message = initialize_lesbian_greek_model()
print(f"Model initialization result: {model_loaded}")
print(f"Status: {status_message}")
# Create Gradio interface
def create_gradio_app():
    """Assemble the Gradio Blocks interface for the parser and return it.

    Reads the module-level ``model_loaded`` and ``status_message``: the status
    banner, the input defaults and the ``interactive`` flags depend on them,
    and the parse handlers and examples are only registered when the model
    loaded.

    Returns:
        The ``gr.Blocks`` application, not yet launched.
    """
    # Input defaults differ depending on whether parsing is possible at all.
    if model_loaded:
        input_placeholder = "Εισάγετε το κείμενο στη Λεσβιακή διάλεκτο..."
        initial_text = "Τα παιδιά πάντ στο κήπ."
    else:
        input_placeholder = "Models not loaded - please refresh page"
        initial_text = ""

    with gr.Blocks(title="Lesbian Greek Morphosyntactic Parser", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # Lesbian Greek Morphosyntactic Parser
        This tool uses custom Stanza models trained specifically for the **Lesbian dialect of Greek**
        (spoken on the island of Lesbos). The models provide:
        - **Tokenization**: Splits text into tokens
        - **POS Tagging**: Part-of-speech classification
        - **Lemmatization**: Base form identification
        - **Dependency Parsing**: Syntactic relationship analysis
        - **CoNLL-U Output**: Standard linguistic annotation format
        ## About the Models
        These models were trained on a curated treebank of 540 sentences from both oral and written
        sources collected from various villages of Lesbos, including Agra, Chidira, Eressos,
        Pterounta, Mesotopos, and Parakoila.
        **Citation**: Bompolas, S., Markantonatou, S., Ralli, A., & Anastasopoulos, A. (2025).
        Crossing Dialectal Boundaries: Building a Treebank for the Dialect of Lesbos through
        Knowledge Transfer from Standard Modern Greek.
        Enter your Lesbian Greek text below to get started!
        """)

        # Model status banner: failure, fallback to default Greek, or success.
        if not model_loaded:
            gr.Markdown(f"""
            ❌ **Model Loading Error**: {status_message}
            The models could not be loaded. This may be due to:
            - Network issues downloading the models
            - Missing dependencies (transformers library)
            - Insufficient memory or storage
            - Model compatibility issues
            Please try refreshing the page or contact the developers.
            """)
        elif "default Greek" in status_message:
            gr.Markdown(f"""
                ⚠️ **Model Status**: {status_message}
                The custom Lesbian Greek models could not be loaded, so default Greek models are being used.
                Results may not be optimized for the Lesbian dialect.
                """)
        else:
            gr.Markdown(f"""
                ✅ **Model Status**: {status_message}
                Custom Lesbian Greek models loaded successfully!
                """)

        # Input area
        with gr.Row(), gr.Column():
            text_input = gr.Textbox(
                label="Lesbian Greek Text Input",
                placeholder=input_placeholder,
                lines=4,
                value=initial_text,
                interactive=model_loaded
            )
            parse_button = gr.Button(
                "Parse Lesbian Greek Text",
                variant="primary",
                size="lg",
                interactive=model_loaded
            )

        # Output area, one row per result view
        with gr.Row(), gr.Column():
            gr.Markdown("### Interactive Dependency Tree")
            tree_html = gr.HTML(
                label="Visual Dependency Tree",
                value="<p>Enter text and click parse to see the dependency tree visualization</p>"
            )
        with gr.Row(), gr.Column():
            gr.Markdown("### CoNLL-U Output")
            conllu_box = gr.Textbox(
                label="CoNLL-U Format",
                lines=10,
                max_lines=20,
                show_copy_button=True,
                info="Raw CoNLL-U format output optimized for Lesbian Greek dialect"
            )
        with gr.Row(), gr.Column():
            gr.Markdown("### Parsed Data Table")
            token_table = gr.Dataframe(
                label="Token Analysis",
                interactive=False,
                wrap=True
            )
        with gr.Row(), gr.Column():
            gr.Markdown("### Text-based Dependency Structure")
            relations_box = gr.Textbox(
                label="Dependency Relationships",
                lines=8,
                max_lines=15,
                show_copy_button=True,
                info="Text-based visualization of syntactic dependencies"
            )

        def handler_wiring():
            # Fresh lists on every call; order matches process_text's 4-tuple.
            return {
                "fn": process_text,
                "inputs": [text_input],
                "outputs": [conllu_box, token_table, relations_box, tree_html],
            }

        if model_loaded:
            # Parse on button click and on Enter in the text box
            parse_button.click(**handler_wiring())
            text_input.submit(**handler_wiring())

            # Clickable sample sentences
            gr.Markdown("### Example Lesbian Greek Texts")
            sample_rows = [
                ["Τα παιδιά πάντ στο κήπ."],
                ["Η γάτα κάθεται στο τραπέζ."],
                ["Ο ήλιος λάμπει στον ουρανό."],
                ["Η θάλασσα είναι γαλάζια και όμορφη."],
            ]
            gr.Examples(examples=sample_rows, cache_examples=False, **handler_wiring())

        gr.Markdown("""
        ### Visualization Legend
        The **Interactive Dependency Tree** shows:
        - **Words** in bold at the bottom with their position numbers
        - **POS tags** in gray below each word
        - **Dependency arcs** as curved lines with arrows pointing to heads
        - **Dependency relations** labeled on the arcs
        - **Color coding** for different dependency types:
          - Red: ROOT relations
          - Blue: Subject relations (nsubj)
          - Green: Object relations (obj)
          - Orange: Determiners (det)
          - Purple: Adjective modifiers (amod)
          - And more...
        ### About CoNLL-U Format
        The CoNLL-U format includes these fields for each token:
        - **ID**: Token index
        - **FORM**: Word form or punctuation symbol
        - **LEMMA**: Lemma or stem of word form
        - **UPOS**: Universal part-of-speech tag
        - **XPOS**: Language-specific part-of-speech tag
        - **FEATS**: Morphological features
        - **HEAD**: Head of the current word
        - **DEPREL**: Dependency relation to the head
        - **DEPS**: Enhanced dependency graph
        - **MISC**: Miscellaneous annotations
        ### Resources
        - [Lesbian Greek Models on Hugging Face](https://huggingface.co/sbompolas/Lesbian-Greek-Morphosyntactic-Model)
        - [UD_Greek-Lesbian Treebank](https://github.com/UniversalDependencies/UD_Greek-Lesbian)
        - [Stanza Documentation](https://stanfordnlp.github.io/stanza/)
        """)
    return app
# Create and launch the app
# Guarded so that importing this module does not build or launch the UI.
if __name__ == "__main__":
    print("Creating Gradio app...")
    app = create_gradio_app()
    print("Launching app...")
    app.launch()