sbompolas commited on
Commit
fb5b190
·
verified ·
1 Parent(s): fe64a8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -217
app.py CHANGED
@@ -1,321 +1,237 @@
1
  import gradio as gr
2
  import stanza
3
  import pandas as pd
4
- import sys
5
  import traceback
6
- import os
7
- import tempfile
8
  import requests
9
  from pathlib import Path
10
  import json
 
11
 
12
  # Global variables to store the pipelines
13
  LESBIAN_MODELS = {}
14
  MODEL_VARIANTS = {
15
- "Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
16
- "Lesbian-synthetic-data": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model"
17
  }
18
 
19
-
20
  def download_model_file(url, filename):
21
  """Download a model file from Hugging Face"""
22
  try:
23
- print(f"Downloading {filename}...")
24
  response = requests.get(url, stream=True)
25
  response.raise_for_status()
26
  with open(filename, 'wb') as f:
27
  for chunk in response.iter_content(chunk_size=8192):
28
  f.write(chunk)
29
- print(f"Successfully downloaded {filename}")
30
  return True
31
  except Exception as e:
32
- print(f"Failed to download {filename}: {e}")
33
  return False
34
 
35
-
36
  def initialize_lesbian_greek_model():
37
- """Download and initialize both Lesbian-only and Lesbian-synthetic-data models"""
38
  try:
39
- print("Initializing both Lesbian-only and Lesbian-synthetic-data pipelines...")
40
  base_dir = Path("./models")
41
  base_dir.mkdir(exist_ok=True)
42
  for variant_name, repo in MODEL_VARIANTS.items():
43
- print(f"\n→ Setting up '{variant_name}' from repo {repo}...")
44
- out_dir = base_dir / variant_name
45
  out_dir.mkdir(parents=True, exist_ok=True)
46
 
47
- # Model files to fetch
48
- model_files = {
49
- "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
50
- "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
51
- "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
52
- "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
53
  }
54
-
55
- # Download files if missing
56
- for fname, url in model_files.items():
57
- tgt = out_dir / fname
58
  if not tgt.exists():
59
  if not download_model_file(url, str(tgt)):
60
- return False, f"Failed to download {fname} for {variant_name}"
61
 
62
- # Build the stanza pipeline directly
63
  config = {
64
  'processors': 'tokenize,pos,lemma,depparse',
65
  'lang': 'el',
66
  'use_gpu': False,
67
  'verbose': False,
68
- 'tokenize_model_path': str(out_dir / "tokenizer.pt"),
69
- 'pos_model_path': str(out_dir / "pos.pt"),
70
- 'lemma_model_path': str(out_dir / "lemmatizer.pt"),
71
- 'depparse_model_path': str(out_dir / "depparse.pt")
72
  }
73
-
74
  try:
75
- MODEL = stanza.Pipeline(**config)
76
- LESBIAN_MODELS[variant_name] = MODEL
77
- print(f"Loaded pipeline for {variant_name}")
78
  except Exception as e:
79
- print(f"⚠️ Could not load {variant_name}: {e}")
80
  return False, f"Pipeline init error for {variant_name}: {e}"
81
 
82
- return True, "Both variants loaded successfully"
83
  except Exception as e:
84
- print(f"Initialization failure: {e}")
85
  traceback.print_exc()
86
  return False, str(e)
87
 
88
-
89
  def stanza_doc_to_conllu(doc) -> str:
90
  """Convert Stanza Document to CoNLL-U format"""
91
- conllu_lines = []
92
- for sent_idx, sentence in enumerate(doc.sentences):
93
- conllu_lines.append(f"# sent_id = {sent_idx+1}")
94
- conllu_lines.append(f"# text = {sentence.text}")
95
- for word in sentence.words:
96
  fields = [
97
- str(word.id),
98
- word.text,
99
- word.lemma or "_",
100
- word.upos or "_",
101
- word.xpos or "_",
102
- word.feats or "_",
103
- str(word.head) if word.head is not None else "0",
104
- word.deprel or "_",
105
  "_",
106
  "_"
107
  ]
108
- conllu_lines.append("\t".join(fields))
109
- conllu_lines.append("")
110
- return "\n".join(conllu_lines)
111
-
112
-
113
- def parse_and_conllu(text: str, variant: str) -> str:
114
- """Parse text with chosen variant and return CoNLL-U"""
115
- pipeline = LESBIAN_MODELS.get(variant)
116
- if not pipeline:
117
- return f"Error: model {variant} not loaded."
118
-
119
- if not text.strip():
120
- return "Error: Please enter some text to parse."
121
-
122
- try:
123
- doc = pipeline(text)
124
- return stanza_doc_to_conllu(doc)
125
- except Exception as e:
126
- return f"Error processing text: {e}"
127
-
128
 
129
  def conllu_to_dataframe(conllu_text: str) -> pd.DataFrame:
130
  """Convert CoNLL-U text to pandas DataFrame"""
131
- if conllu_text.startswith("Error"):
132
  return pd.DataFrame()
133
-
134
- data = []
135
  for line in conllu_text.splitlines():
136
  if not line or line.startswith("#"):
137
  continue
138
  parts = line.split("\t")
139
  if len(parts) >= 10:
140
- data.append({
141
  'ID': parts[0], 'FORM': parts[1], 'LEMMA': parts[2],
142
  'UPOS': parts[3], 'XPOS': parts[4], 'FEATS': parts[5],
143
  'HEAD': parts[6], 'DEPREL': parts[7], 'DEPS': parts[8], 'MISC': parts[9]
144
  })
145
- return pd.DataFrame(data)
146
-
147
 
148
  def create_dependency_visualization(df: pd.DataFrame) -> str:
149
  """Simple text-based dependency display"""
150
  if df.empty:
151
  return "No data to visualize"
152
-
153
- lines = ["Dependency Parse Visualization:", "-"*40]
154
- for _, row in df.iterrows():
155
- word, pos, deprel, head = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
156
- if head != '0':
157
  try:
158
- head_idx = int(head)-1
159
- head_word = df.iloc[head_idx]['FORM']
160
- lines.append(f"{word} ({pos}) --{deprel}--> {head_word}")
161
  except:
162
- lines.append(f"{word} ({pos}) --{deprel}--> [ERROR]")
 
163
  else:
164
- lines.append(f"{word} ({pos}) --{deprel}--> ROOT")
165
- return "\n".join(lines)
166
-
167
-
168
- def create_dependency_tree_svg(df: pd.DataFrame) -> str:
169
- """SVG-based dependency tree with simple slider navigation"""
170
- if df.empty:
171
- return "<p>No data to visualize</p>"
172
-
173
- # group into sentences
174
- sentences, current = [], []
175
- for idx, row in df.iterrows():
176
- wid = int(row['ID'])
177
- if wid == 1 and current:
178
- sentences.append(current)
179
- current = []
180
- current.append(row.to_dict())
181
- if current:
182
- sentences.append(current)
183
-
184
- # render each as an SVG slide
185
- slides = []
186
- for sent in sentences:
187
- svg = create_single_sentence_svg(sent)
188
- slides.append(svg)
189
-
190
- # wrap slides in divs with show/hide logic
191
- slide_divs = "\n".join(
192
- f'<div class="slide" style="display:{"block" if i==0 else "none"}">{svg}</div>'
193
- for i, svg in enumerate(slides)
194
- )
195
-
196
- return f"""
197
- <div id="slider">
198
- {slide_divs}
199
- <button id="prevBtn">← Prev</button>
200
- <button id="nextBtn">Next →</button>
201
- </div>
202
- <script>
203
- (function() {{
204
- const slides = document.querySelectorAll('#slider .slide');
205
- let idx = 0;
206
- document.getElementById('prevBtn').onclick = () => {{
207
- slides[idx].style.display = 'none';
208
- idx = (idx - 1 + slides.length) % slides.length;
209
- slides[idx].style.display = 'block';
210
- }};
211
- document.getElementById('nextBtn').onclick = () => {{
212
- slides[idx].style.display = 'none';
213
- idx = (idx + 1) % slides.length;
214
- slides[idx].style.display = 'block';
215
- }};
216
- }})();
217
- </script>
218
- """
219
-
220
 
221
  def create_single_sentence_svg(sentence_data):
222
- """Generate detailed SVG for one sentence (kept as in original code)"""
223
- # For brevity, I'm reusing your original single-sentence SVG builder.
224
- # Paste the full create_single_sentence_svg implementation here unchanged.
225
- # ...
226
- return "<svg><!-- your detailed SVG here --></svg>"
227
-
228
 
229
  def process_text(text, variant):
230
- """Main entry: returns four outputs for Gradio"""
231
  if not text.strip():
232
- empty_df = pd.DataFrame()
233
- return "Please enter some Lesbian Greek text to parse.", empty_df, "", "<p>No data</p>"
234
 
235
- conllu = parse_and_conllu(text, variant)
236
- if conllu.startswith("Error"):
237
- return conllu, pd.DataFrame(), "", "<p>Error</p>"
238
 
239
- df = conllu_to_dataframe(conllu)
240
- if df.empty:
241
- return conllu, df, "", "<p>No tokens found</p>"
 
242
 
 
 
243
  text_viz = create_dependency_visualization(df)
244
- svg_viz = create_dependency_tree_svg(df)
245
- return conllu, df, text_viz, svg_viz
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
- # Initialize models at startup
249
- print("Initializing pipelines...")
250
- loaded, status_msg = initialize_lesbian_greek_model()
251
- print(f"Loaded={loaded}, Status={status_msg}")
 
 
 
 
 
252
 
 
 
 
253
 
254
  def create_gradio_app():
255
- with gr.Blocks(title="Lesbian Greek Morphosyntactic Parser", theme=gr.themes.Soft()) as app:
256
- gr.Markdown("""
257
- # Lesbian Greek Morphosyntactic Parser
258
-
259
- Pick a model variant, enter your text, and see tokenization, POS, lemmas, and dependencies.
260
- """)
261
 
262
- # show status
263
  if loaded:
264
- gr.Markdown(f"✅ Models loaded: {', '.join(MODEL_VARIANTS.keys())}")
265
  else:
266
- gr.Markdown(f"❌ Loading error: {status_msg}")
267
-
268
- with gr.Row():
269
- with gr.Column():
270
- text_input = gr.Textbox(
271
- label="Lesbian Greek Text Input",
272
- placeholder="Enter Lesbian Greek here...",
273
- lines=4
274
- )
275
- parse_button = gr.Button("Parse Text", variant="primary")
276
-
277
- with gr.Column():
278
- model_selector = gr.Radio(
279
- choices=list(MODEL_VARIANTS.keys()),
280
- value="Lesbian-only",
281
- label="Choose Variant"
282
- )
283
-
284
- with gr.Row():
285
- with gr.Column():
286
- gr.Markdown("### Dependency Tree")
287
- dependency_tree_viz = gr.HTML("<p>Parse to see tree</p>")
288
-
289
- with gr.Row():
290
- with gr.Column():
291
- gr.Markdown("### CoNLL-U Output")
292
- conllu_output = gr.Textbox(lines=10, show_copy_button=True)
293
 
294
  with gr.Row():
295
  with gr.Column():
296
- gr.Markdown("### Token Table")
297
- data_table = gr.Dataframe(interactive=False)
298
-
299
- with gr.Row():
300
  with gr.Column():
301
- gr.Markdown("### Text-based Dependencies")
302
- dependency_viz = gr.Textbox(lines=8, show_copy_button=True)
303
-
304
- if loaded:
305
- parse_button.click(
306
- fn=process_text,
307
- inputs=[text_input, model_selector],
308
- outputs=[conllu_output, data_table, dependency_viz, dependency_tree_viz]
309
- )
310
- text_input.submit(
311
- fn=process_text,
312
- inputs=[text_input, model_selector],
313
- outputs=[conllu_output, data_table, dependency_viz, dependency_tree_viz]
314
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
  return app
317
 
318
-
319
  if __name__ == "__main__":
320
  app = create_gradio_app()
321
  app.launch()
 
1
  import gradio as gr
2
  import stanza
3
  import pandas as pd
 
4
  import traceback
 
 
5
  import requests
6
  from pathlib import Path
7
  import json
8
+ import os
9
 
10
# Global variables to store the pipelines.
# LESBIAN_MODELS maps a variant display name -> its loaded stanza.Pipeline;
# it is filled in by initialize_lesbian_greek_model() at startup.
LESBIAN_MODELS = {}
# Variant display name -> Hugging Face repo id the .pt model files live in.
MODEL_VARIANTS = {
    "Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
    "Lesbian-synthetic-data": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model"
}
16
 
 
17
def download_model_file(url, filename):
    """Download a single model file from Hugging Face to *filename*.

    Streams the response in 8 KiB chunks so multi-hundred-MB .pt files are
    never held in memory at once.

    Args:
        url: direct ``resolve/main`` URL of the file on Hugging Face.
        filename: local path to write the file to.

    Returns:
        True on success, False on any failure (network error, HTTP error,
        or write error) — callers treat a False as "variant unavailable".
    """
    try:
        # timeout guards startup against a hung connection blocking forever;
        # without it requests.get can wait indefinitely.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    except Exception as e:
        # include the target filename so failures are diagnosable
        # (the previous message printed a scrubbed "(unknown)" placeholder)
        print(f"Download failed {filename}: {e}")
        return False
29
 
 
30
def initialize_lesbian_greek_model():
    """Download and initialize both model variants.

    For each entry in MODEL_VARIANTS: ensure a local cache directory under
    ./models/<variant>, download the four processor .pt files if missing,
    then build a stanza Pipeline from those local paths and register it in
    the global LESBIAN_MODELS dict.

    Returns:
        (True, message) when every variant loaded,
        (False, message) on the first download or pipeline failure.
    """
    try:
        base_dir = Path("./models")
        base_dir.mkdir(exist_ok=True)
        for variant_name, repo in MODEL_VARIANTS.items():
            # one cache directory per variant, e.g. ./models/Lesbian-only
            out_dir = base_dir/variant_name
            out_dir.mkdir(parents=True, exist_ok=True)

            # four model files, one per stanza processor
            files = {
                "tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
                "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
                "pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
                "depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
            }
            # download — files cached by a previous run are skipped
            for fn, url in files.items():
                tgt = out_dir/fn
                if not tgt.exists():
                    if not download_model_file(url, str(tgt)):
                        # abort on first failure; partial variants are not usable
                        return False, f"Failed download {fn} for {variant_name}"

            # build stanza pipeline from the explicit local model paths
            # (bypasses stanza's own model manager/download logic)
            config = {
                'processors': 'tokenize,pos,lemma,depparse',
                'lang': 'el',
                'use_gpu': False,
                'verbose': False,
                'tokenize_model_path': str(out_dir/"tokenizer.pt"),
                'pos_model_path': str(out_dir/"pos.pt"),
                'lemma_model_path': str(out_dir/"lemmatizer.pt"),
                'depparse_model_path': str(out_dir/"depparse.pt")
            }
            try:
                pipe = stanza.Pipeline(**config)
                # register the pipeline for lookup by process_text()
                LESBIAN_MODELS[variant_name] = pipe
                print(f"Loaded {variant_name}")
            except Exception as e:
                return False, f"Pipeline init error for {variant_name}: {e}"

        return True, "Models loaded"
    except Exception as e:
        # unexpected failure (e.g. filesystem): log traceback, report message
        traceback.print_exc()
        return False, str(e)
75
 
 
76
def stanza_doc_to_conllu(doc) -> str:
    """Serialize a Stanza Document into CoNLL-U text.

    Each sentence is emitted as a ``# sent_id`` / ``# text`` comment pair
    followed by one 10-column tab-separated row per word; sentences are
    separated by a blank line. Missing annotations become ``_`` and a
    missing head becomes ``0`` (root convention).
    """
    out = []
    sid = 0
    for sentence in doc.sentences:
        sid += 1
        out.append(f"# sent_id = {sid}")
        out.append(f"# text = {sentence.text}")
        for word in sentence.words:
            # CoNLL-U encodes the root's head as 0
            head = "0" if word.head is None else str(word.head)
            row = "\t".join([
                str(word.id),
                word.text,
                word.lemma or "_",
                word.upos or "_",
                word.xpos or "_",
                word.feats or "_",
                head,
                word.deprel or "_",
                "_",   # DEPS: enhanced dependencies not produced
                "_",   # MISC
            ])
            out.append(row)
        out.append("")  # blank separator line after each sentence
    return "\n".join(out)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
def conllu_to_dataframe(conllu_text: str) -> pd.DataFrame:
    """Tabulate CoNLL-U token lines into a pandas DataFrame.

    Comment lines (``#``), blank lines, and malformed rows with fewer than
    ten tab-separated fields are skipped. Error strings produced upstream
    (anything starting with "Error") and empty input yield an empty frame.
    """
    if not conllu_text or conllu_text.startswith("Error"):
        return pd.DataFrame()
    columns = ('ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS',
               'HEAD', 'DEPREL', 'DEPS', 'MISC')
    records = []
    for raw in conllu_text.splitlines():
        if not raw or raw.startswith("#"):
            continue
        cells = raw.split("\t")
        if len(cells) >= 10:
            # zip truncates any surplus cells beyond the ten named columns
            records.append(dict(zip(columns, cells)))
    return pd.DataFrame(records)
 
115
 
116
def create_dependency_visualization(df: pd.DataFrame) -> str:
    """Render a plain-text arc list: ``word (POS) --deprel--> head``.

    Args:
        df: token table as produced by conllu_to_dataframe (ID restarts at
            '1' for each sentence, HEAD is a 1-based index *within* the
            sentence, '0' for the root).

    Returns:
        One line per token under a fixed header, or "No data to visualize"
        for an empty frame.
    """
    if df.empty:
        return "No data to visualize"
    lines = ["Dependency Parse Visualization:", "-" * 40]
    sent_start = 0  # positional index of the current sentence's first token
    for pos in range(len(df)):
        row = df.iloc[pos]
        # BUG FIX: heads are sentence-local 1-based indices, but the old code
        # indexed the whole frame, so every sentence after the first resolved
        # its heads against sentence one. Track each sentence's start offset.
        if str(row['ID']) == '1':
            sent_start = pos
        word, upos, deprel, head = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
        if str(head) == '0':
            lines.append(f"{word} ({upos}) --{deprel}--> ROOT")
            continue
        try:
            head_word = df.iloc[sent_start + int(head) - 1]['FORM']
        except (ValueError, IndexError, KeyError):
            # malformed or out-of-range head — keep rendering the rest
            head_word = "[ERROR]"
        lines.append(f"{word} ({upos}) --{deprel}--> {head_word}")
    return "\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
def create_single_sentence_svg(sentence_data):
    """Detailed SVG builder unchanged—paste your original implementation here."""
    # NOTE(review): this is a placeholder stub — the real SVG renderer was
    # elided in this revision. Until it is restored, every sentence renders
    # as the comment-only <svg> below, so the tree panel shows nothing useful.
    # `sentence_data` is a list of per-word dicts (ID/FORM/LEMMA/UPOS/XPOS/
    # FEATS/HEAD/DEPREL) as built by process_text().
    # ... your create_single_sentence_svg code ...
    return "<svg><!-- your SVG here --></svg>"
 
 
 
137
 
138
def process_text(text, variant):
    """Parse *text* with the chosen model variant and build all UI outputs.

    Args:
        text: raw input text.
        variant: key into LESBIAN_MODELS / MODEL_VARIANTS.

    Returns six values, matching the Gradio outputs wired in
    create_gradio_app():
        CoNLL-U string, token DataFrame, text dependency visualization,
        dropdown update (sentence choices), per-sentence word dicts (state),
        and the SVG for the first sentence.
    """
    # BUG FIX: a bare list returned to a gr.Dropdown output sets its *value*,
    # not its option list, so the sentence selector never got populated.
    # gr.update(choices=...) is required to (re)populate the options.
    empty_dd = gr.update(choices=[], value=None)

    if not text.strip():
        return "Please enter text.", pd.DataFrame(), "", empty_dd, [], "<p>No data</p>"

    pipe = LESBIAN_MODELS.get(variant)
    if not pipe:
        return f"Error: model {variant} not loaded.", pd.DataFrame(), "", empty_dd, [], "<p>Error</p>"

    try:
        doc = pipe(text)
    except Exception as e:
        return f"Error parsing: {e}", pd.DataFrame(), "", empty_dd, [], "<p>Error</p>"

    conllu = stanza_doc_to_conllu(doc)
    df = conllu_to_dataframe(conllu)
    text_viz = create_dependency_visualization(df)

    # build per-sentence word data for the SVG viewer (kept in gr.State)
    sentences = []
    for sent in doc.sentences:
        sentences.append([
            {
                'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
                'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
                'FEATS': w.feats or "_", 'HEAD': w.head or 0, 'DEPREL': w.deprel or "_"
            }
            for w in sent.words
        ])

    # dropdown choices ("1", "2", ...) and the initial SVG (first sentence)
    choices = [str(i + 1) for i in range(len(sentences))]
    init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No sentences</p>"
    dd = gr.update(choices=choices, value=choices[0] if choices else None)

    return conllu, df, text_viz, dd, sentences, init_svg
174
 
175
def update_svg(sel, sentences):
    """Return the SVG for the sentence selected in the dropdown.

    Args:
        sel: dropdown string, 1-based sentence number (may be None before
            any parse has populated the dropdown).
        sentences: gr.State list of per-sentence word dicts.

    Returns:
        SVG markup for the chosen sentence, or an error paragraph for any
        invalid selection.
    """
    # Only guard the int() conversion: the old bare `except: pass` also
    # swallowed every bug inside the SVG renderer, making them invisible.
    try:
        idx = int(sel) - 1
    except (TypeError, ValueError):
        return "<p>Invalid selection</p>"
    if 0 <= idx < len(sentences):
        return create_single_sentence_svg(sentences[idx])
    return "<p>Invalid selection</p>"
184
 
185
# Initialize both pipelines once at import time so the app can report
# status immediately; `loaded`/`status` are read by create_gradio_app()
# to decide whether to wire the parse events or show the error banner.
loaded, status = initialize_lesbian_greek_model()
print(f"Models loaded={loaded}, status={status}")
188
 
189
def create_gradio_app():
    """Build and return the Gradio Blocks UI.

    Reads the module globals `loaded`/`status` (set at import time) to show
    a status banner. Component creation order below determines the layout.
    """
    with gr.Blocks(title="Lesbian Greek Parser") as app:
        gr.Markdown("# Lesbian Greek Morphosyntactic Parser")

        # status banner reflecting startup model loading
        if loaded:
            gr.Markdown(f"✅ Loaded variants: {', '.join(MODEL_VARIANTS.keys())}")
        else:
            gr.Markdown(f"❌ Loading error: {status}")

        # input row: text + parse button on the left, variant picker on the right
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Text", lines=4,
                                        placeholder="Εισάγετε κείμενο...")
                parse_btn = gr.Button("Parse")
            with gr.Column():
                model_sel = gr.Radio(choices=list(MODEL_VARIANTS.keys()),
                                     value="Lesbian-only", label="Model Variant")

        # sentence selector & state (state holds per-sentence word dicts)
        sentence_dropdown = gr.Dropdown(label="Sentence", choices=[])
        sentences_state = gr.State([])

        # outputs
        conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
        table_out = gr.Dataframe(label="Tokens")
        text_viz_out = gr.Textbox(label="Dependencies", lines=8, show_copy_button=True)
        svg_out = gr.HTML("<p>No data</p>")

        # wire up parse event — output order must match process_text's return
        parse_btn.click(
            fn=process_text,
            inputs=[text_input, model_sel],
            outputs=[
                conllu_out, table_out, text_viz_out,
                sentence_dropdown, sentences_state, svg_out
            ]
        )
        # on sentence change, re-render only the SVG panel
        sentence_dropdown.change(
            fn=update_svg,
            inputs=[sentence_dropdown, sentences_state],
            outputs=svg_out
        )

    return app
234
 
 
235
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    app = create_gradio_app()
    app.launch()