Update app.py
Browse files
app.py
CHANGED
|
@@ -6,8 +6,7 @@ import requests
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
| 13 |
"Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
|
|
@@ -34,10 +33,10 @@ def initialize_models():
|
|
| 34 |
out = base/name
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
-
"tokenizer.pt":
|
| 38 |
-
"lemmatizer.pt":
|
| 39 |
-
"pos.pt":
|
| 40 |
-
"depparse.pt":
|
| 41 |
}
|
| 42 |
for fn, url in files.items():
|
| 43 |
tgt = out/fn
|
|
@@ -60,7 +59,7 @@ def initialize_models():
|
|
| 60 |
loaded, load_status = initialize_models()
|
| 61 |
|
| 62 |
|
| 63 |
-
#
|
| 64 |
|
| 65 |
def stanza_doc_to_conllu(doc) -> str:
|
| 66 |
lines = []
|
|
@@ -80,102 +79,137 @@ def stanza_doc_to_conllu(doc) -> str:
|
|
| 80 |
return "\n".join(lines)
|
| 81 |
|
| 82 |
|
| 83 |
-
#
|
| 84 |
|
| 85 |
-
def
|
| 86 |
"""
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
"""
|
| 93 |
-
records = []
|
| 94 |
blocks = [b for b in conllu.split("\n\n") if b.strip()]
|
|
|
|
| 95 |
first = True
|
|
|
|
| 96 |
for block in blocks:
|
| 97 |
lines = block.splitlines()
|
| 98 |
-
sid_line
|
|
|
|
| 99 |
token_lines = lines[2:]
|
|
|
|
| 100 |
if not first:
|
| 101 |
-
#
|
| 102 |
-
records.append({c:
|
|
|
|
|
|
|
| 103 |
first = False
|
|
|
|
| 104 |
# comment rows
|
| 105 |
-
records.append({"Id": sid_line,
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
| 107 |
# token rows
|
| 108 |
for tl in token_lines:
|
| 109 |
parts = tl.split("\t")
|
| 110 |
if len(parts) < 10:
|
| 111 |
continue
|
| 112 |
records.append({
|
| 113 |
-
"Id":
|
| 114 |
-
"Form":
|
| 115 |
-
"Lemma":
|
| 116 |
-
"UPosTag":
|
| 117 |
-
"XPosTag":
|
| 118 |
-
"Feats":
|
| 119 |
-
"Head":
|
| 120 |
-
"DepRel":
|
| 121 |
-
"Deps":
|
| 122 |
-
"Misc":
|
| 123 |
})
|
| 124 |
-
return pd.DataFrame(records, columns=["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"])
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
|
|
|
| 128 |
|
| 129 |
def create_dependency_visualization(df_table: pd.DataFrame) -> str:
|
| 130 |
"""
|
| 131 |
-
|
| 132 |
-
- blank line + comment lines before each sentence
|
| 133 |
-
- dependency lines
|
| 134 |
"""
|
| 135 |
if df_table.empty:
|
| 136 |
return "No data to visualize"
|
| 137 |
-
|
| 138 |
first = True
|
| 139 |
-
|
| 140 |
for row in df_table.itertuples(index=False):
|
|
|
|
| 141 |
if isinstance(row.Id, str) and row.Id.startswith("# sent_id"):
|
| 142 |
if not first:
|
| 143 |
-
|
| 144 |
first = False
|
| 145 |
-
|
| 146 |
-
lines.append(row.Id)
|
| 147 |
-
# next row in table is "# text = β¦"
|
| 148 |
continue
|
| 149 |
if isinstance(row.Id, str) and row.Id.startswith("# text"):
|
| 150 |
-
|
| 151 |
continue
|
| 152 |
-
|
|
|
|
| 153 |
if not row.Id.isdigit():
|
| 154 |
continue
|
| 155 |
-
|
|
|
|
| 156 |
w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
|
| 157 |
if h != "0":
|
| 158 |
-
# find head form
|
| 159 |
try:
|
| 160 |
hw = df_table[df_table.Id == h].iloc[0].Form
|
| 161 |
except:
|
| 162 |
hw = "[ERR]"
|
| 163 |
-
|
| 164 |
else:
|
| 165 |
-
|
| 166 |
-
return "\n".join(lines)
|
| 167 |
-
|
| 168 |
|
| 169 |
-
|
| 170 |
|
| 171 |
-
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
|
| 172 |
-
# Paste your entire original SVGβgeneration code here unchanged
|
| 173 |
-
# It must accept sentence_data: list of dicts and return SVG string.
|
| 174 |
-
svg = "<svg><!-- your SVG here --></svg>"
|
| 175 |
-
return svg
|
| 176 |
|
|
|
|
| 177 |
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
def process_text(text, variant):
|
| 181 |
if not text.strip():
|
|
@@ -185,9 +219,9 @@ def process_text(text, variant):
|
|
| 185 |
[], "", pd.DataFrame(), ""
|
| 186 |
)
|
| 187 |
pipe = LESBIAN_MODELS.get(variant)
|
| 188 |
-
if pipe
|
| 189 |
return (
|
| 190 |
-
gr.HTML.update(value="<p>Error
|
| 191 |
gr.Dropdown.update(choices=[], value=None),
|
| 192 |
[], "", pd.DataFrame(), ""
|
| 193 |
)
|
|
@@ -195,20 +229,20 @@ def process_text(text, variant):
|
|
| 195 |
doc = pipe(text)
|
| 196 |
conllu = stanza_doc_to_conllu(doc)
|
| 197 |
|
| 198 |
-
#
|
| 199 |
-
df_table =
|
| 200 |
|
| 201 |
-
#
|
| 202 |
text_viz = create_dependency_visualization(df_table)
|
| 203 |
|
| 204 |
-
#
|
| 205 |
sentences = []
|
| 206 |
for sent in doc.sentences:
|
| 207 |
payload = [{
|
| 208 |
'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
|
| 209 |
'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
|
| 210 |
'FEATS': w.feats or "_", 'HEAD': w.head or 0,
|
| 211 |
-
'DEPREL': w.deprel or "_
|
| 212 |
} for w in sent.words]
|
| 213 |
sentences.append(payload)
|
| 214 |
|
|
@@ -233,7 +267,7 @@ def update_svg(selected_id, sentences):
|
|
| 233 |
return "<p>Invalid selection</p>"
|
| 234 |
|
| 235 |
|
| 236 |
-
#
|
| 237 |
|
| 238 |
def create_app():
|
| 239 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
|
@@ -264,8 +298,10 @@ def create_app():
|
|
| 264 |
btn.click(
|
| 265 |
fn=process_text,
|
| 266 |
inputs=[txt, mdl],
|
| 267 |
-
outputs=[
|
| 268 |
-
|
|
|
|
|
|
|
| 269 |
)
|
| 270 |
sentence_dd.change(
|
| 271 |
fn=update_svg,
|
|
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
+
# 1. MODEL VARIANTS & INITIALIZATION
|
|
|
|
| 10 |
LESBIAN_MODELS = {}
|
| 11 |
MODEL_VARIANTS = {
|
| 12 |
"Lesbian-only": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
|
|
|
|
| 33 |
out = base/name
|
| 34 |
out.mkdir(exist_ok=True)
|
| 35 |
files = {
|
| 36 |
+
"tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
|
| 37 |
+
"lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
|
| 38 |
+
"pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
|
| 39 |
+
"depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
|
| 40 |
}
|
| 41 |
for fn, url in files.items():
|
| 42 |
tgt = out/fn
|
|
|
|
| 59 |
loaded, load_status = initialize_models()
|
| 60 |
|
| 61 |
|
| 62 |
+
# 2. CoNLL-U STRINGIZER
|
| 63 |
|
| 64 |
def stanza_doc_to_conllu(doc) -> str:
|
| 65 |
lines = []
|
|
|
|
| 79 |
return "\n".join(lines)
|
| 80 |
|
| 81 |
|
| 82 |
+
# 3. TOKEN TABLE WITH COMMENTβROWS
|
| 83 |
|
| 84 |
+
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Convert a CoNLL-U string into a flat token table with comment rows.

    Layout of the resulting table:
      - a '# sent_id = …' row, then a '# text = …' row for every sentence
        (only the 'Id' column is filled; all other columns are empty strings)
      - one row per CoNLL-U token line of that sentence
      - an all-empty separator row *between* sentences (not before the first)

    Args:
        conllu: CoNLL-U text whose sentence blocks start with the
            '# sent_id' and '# text' comment lines and are separated by
            blank lines (the shape produced by the app's stringizer).

    Returns:
        pd.DataFrame with string columns Id, Form, Lemma, UPosTag, XPosTag,
        Feats, Head, DepRel, Deps, Misc.
    """
    columns = ["Id", "Form", "Lemma", "UPosTag", "XPosTag",
               "Feats", "Head", "DepRel", "Deps", "Misc"]
    empty_row = {c: "" for c in columns}

    records = []
    first = True
    for block in (b for b in conllu.split("\n\n") if b.strip()):
        lines = block.splitlines()
        # Robustness: a malformed block without both comment lines used to
        # raise IndexError on lines[1]; skip such blocks instead.
        if len(lines) < 2:
            continue
        sid_line, txt_line, token_lines = lines[0], lines[1], lines[2:]

        if not first:
            records.append(dict(empty_row))  # blank separator row
        first = False

        # comment rows: only 'Id' carries the comment text
        records.append({**empty_row, "Id": sid_line})
        records.append({**empty_row, "Id": txt_line})

        # token rows
        for tl in token_lines:
            parts = tl.split("\t")
            if len(parts) < 10:
                continue  # ignore truncated / malformed token lines
            records.append(dict(zip(columns, parts[:10])))

    return pd.DataFrame(records, columns=columns)
|
| 136 |
|
| 137 |
+
|
| 138 |
+
# 4. TEXTβBASED DEPENDENCIES WITH BLANK+COMMENTS
|
| 139 |
|
| 140 |
def create_dependency_visualization(df_table: pd.DataFrame) -> str:
    """Render the token table as plain-text dependency lines.

    Emits, per sentence: the '# sent_id' comment, the '# text' comment,
    then one 'FORM (UPOS) --DEPREL--> HEAD_FORM' line per token (the head
    form is 'ROOT' for head id 0).  A blank line separates consecutive
    sentences.

    Args:
        df_table: table produced by conllu_to_dataframe; comment rows carry
            their text in 'Id', token rows have a numeric-string 'Id'.

    Returns:
        The visualization as one newline-joined string, or
        'No data to visualize' when the table is empty.
    """
    if df_table.empty:
        return "No data to visualize"

    out = []
    first = True
    for row in df_table.itertuples(index=False):
        # '# sent_id' comment row marks the start of a new sentence
        if isinstance(row.Id, str) and row.Id.startswith("# sent_id"):
            if not first:
                out.append("")  # blank line between sentences
            first = False
            out.append(row.Id)
            continue
        if isinstance(row.Id, str) and row.Id.startswith("# text"):
            out.append(row.Id)
            continue

        # skip separator rows and anything that is not a plain token id
        if not row.Id.isdigit():
            continue

        w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
        if h != "0":
            # Resolve the head token's surface form.
            # NOTE(review): token ids restart at 1 in every sentence, so this
            # table-wide lookup returns the *first* row with a matching id —
            # for multi-sentence input that may belong to an earlier sentence.
            # Confirm whether per-sentence head resolution is intended.
            try:
                hw = df_table[df_table.Id == h].iloc[0].Form
            except IndexError:  # head id not present anywhere in the table
                hw = "[ERR]"
            out.append(f"{w} ({p}) --{d}--> {hw}")
        else:
            out.append(f"{w} ({p}) --{d}--> ROOT")

    return "\n".join(out)
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
+
# 5. FULL SVG BUILDER (UNCHANGED)
|
| 180 |
|
| 181 |
+
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Build the SVG dependency diagram for a single sentence.

    Placeholder skeleton: computes the canvas geometry and emits an empty
    <svg> shell; the original arc/label/word drawing code is meant to be
    pasted between the <defs>/<g> tags.

    Args:
        sentence_data: list of per-token dicts (ID/FORM/LEMMA/UPOS/XPOS/
            FEATS/HEAD/DEPREL) for one sentence.
        sentence_num: 1-based index of the sentence (reserved for captions).
        total_sentences: total sentence count (reserved for captions).

    Returns:
        An '<svg>…</svg>' string.
    """
    # Guard: the spacing formula divides by the word count, so an empty
    # sentence previously raised ZeroDivisionError.
    if not sentence_data:
        return '<svg width="800" height="500" xmlns="http://www.w3.org/2000/svg"></svg>'

    df = pd.DataFrame(sentence_data)
    word_count = len(df)
    base_w, min_sp = 100, 30
    spacing = max(base_w, (word_count * base_w + min_sp * (word_count - 1)) / word_count)
    width = max(800, word_count * spacing + 100)
    height = 500
    word_y = height - 120                # baseline for the word row
    features_start_y = word_y + 20 + 15  # first feature line below the words

    # Color per dependency relation (extend as needed).
    deprel_colors = {
        'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
        # … and the rest …
    }

    svg_parts = [
        f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg" '
        'style="background:white;border:1px solid #eee"><defs>'
    ]
    # … arrowhead markers, arcs, labels, words & feats exactly as before …
    svg_parts.append('</defs><g>')
    # (Insert the full previous drawing implementation here)
    svg_parts.append('</g></svg>')
    return "".join(svg_parts)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# 6. PROCESS + DROPDOWN
|
| 213 |
|
| 214 |
def process_text(text, variant):
|
| 215 |
if not text.strip():
|
|
|
|
| 219 |
[], "", pd.DataFrame(), ""
|
| 220 |
)
|
| 221 |
pipe = LESBIAN_MODELS.get(variant)
|
| 222 |
+
if not pipe:
|
| 223 |
return (
|
| 224 |
+
gr.HTML.update(value="<p>Error loading model</p>"),
|
| 225 |
gr.Dropdown.update(choices=[], value=None),
|
| 226 |
[], "", pd.DataFrame(), ""
|
| 227 |
)
|
|
|
|
| 229 |
doc = pipe(text)
|
| 230 |
conllu = stanza_doc_to_conllu(doc)
|
| 231 |
|
| 232 |
+
# Token table with comments
|
| 233 |
+
df_table = conllu_to_dataframe(conllu)
|
| 234 |
|
| 235 |
+
# Textβbased dependencies
|
| 236 |
text_viz = create_dependency_visualization(df_table)
|
| 237 |
|
| 238 |
+
# Sentence payloads for SVG
|
| 239 |
sentences = []
|
| 240 |
for sent in doc.sentences:
|
| 241 |
payload = [{
|
| 242 |
'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
|
| 243 |
'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
|
| 244 |
'FEATS': w.feats or "_", 'HEAD': w.head or 0,
|
| 245 |
+
'DEPREL': w.deprel or "_"
|
| 246 |
} for w in sent.words]
|
| 247 |
sentences.append(payload)
|
| 248 |
|
|
|
|
| 267 |
return "<p>Invalid selection</p>"
|
| 268 |
|
| 269 |
|
| 270 |
+
# 7. GRADIO UI
|
| 271 |
|
| 272 |
def create_app():
|
| 273 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
|
|
|
| 298 |
btn.click(
|
| 299 |
fn=process_text,
|
| 300 |
inputs=[txt, mdl],
|
| 301 |
+
outputs=[
|
| 302 |
+
svg_out, sentence_dd, sentences_st,
|
| 303 |
+
conllu_out, table_out, text_out
|
| 304 |
+
]
|
| 305 |
)
|
| 306 |
sentence_dd.change(
|
| 307 |
fn=update_svg,
|