Spaces:

sbompolas
/

Lesbian-morphosyntactic-parsing

Sleeping

App Files Files Community

sbompolas committed on Jul 1, 2025

Commit

08f1dd3

verified ·

1 Parent(s): 383a058

Update app.py

Browse files

Files changed (1) hide show

app.py +226 -34

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ import requests
 import traceback
 from pathlib import Path
 LESBIAN_MODELS = {}
 MODEL_VARIANTS = {
     "Lesbian-only":           "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
@@ -59,6 +61,9 @@ def initialize_models():
 loaded, load_status = initialize_models()
 def stanza_doc_to_conllu(doc) -> str:
     lines = []
     for sid, sent in enumerate(doc.sentences, 1):
@@ -78,22 +83,37 @@ def stanza_doc_to_conllu(doc) -> str:
 def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
     rows = []
-    for L in conllu.splitlines():
-        if not L:
             if rows and rows[-1] != {}:
                 rows.append({})
             continue
-        if L.startswith("#"):
-            if "=" in L:
-                key, val = L[2:].split("=", 1)
-                rows.append({'ID': f"# {key.strip()} =", 'FORM': val.strip()})
             continue
-        parts = L.split("\t")
         if len(parts) >= 10:
             rows.append({
-                'ID': parts[0], 'FORM': parts[1], 'LEMMA': parts[2],
-                'UPOS': parts[3], 'XPOS': parts[4], 'FEATS': parts[5],
-                'HEAD': parts[6], 'DEPREL': parts[7], 'DEPS': parts[8], 'MISC': parts[9]
             })
     return pd.DataFrame(rows)
@@ -101,14 +121,19 @@ def create_dependency_visualization(df: pd.DataFrame) -> str:
     if df.empty:
         return "No data to visualize"
     viz = []
-    for i, row in df.iterrows():
-        if pd.isna(row["ID"]):
             continue
-        if isinstance(row["ID"], str) and row["ID"].startswith("#"):
             if viz:
-                viz.append("")
-            viz.append(f"{row['ID']} {row['FORM']}")
             continue
         w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
         if h != '0':
             try:
@@ -120,8 +145,159 @@ def create_dependency_visualization(df: pd.DataFrame) -> str:
             viz.append(f"{w} ({p}) --{d}--> ROOT")
     return "\n".join(viz)
-# Keep your create_single_sentence_svg as-is; it already includes annotation rendering
-# Be sure ann = [...] block includes: upos, lemma, and all feats, which it does in your version
 def process_text(text, variant):
     if not text.strip():
@@ -146,12 +322,16 @@ def process_text(text, variant):
     sentences = []
     for sent in doc.sentences:
-        payload = [ {
-            'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
-            'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
-            'FEATS': w.feats or "_", 'HEAD': w.head or 0,
             'DEPREL': w.deprel or "_"
-        } for w in sent.words ]
         sentences.append(payload)
     sent_ids = [str(i+1) for i in range(len(sentences))]
@@ -169,11 +349,14 @@ def process_text(text, variant):
 def update_svg(selected_id, sentences):
     try:
-        idx = int(selected_id)-1
         return create_single_sentence_svg(sentences[idx])
     except:
         return "<p>Invalid selection</p>"
 def create_app():
     with gr.Blocks(title="Lesbian Greek Parser") as app:
         gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
@@ -183,11 +366,16 @@ def create_app():
         with gr.Row():
             with gr.Column():
-                txt = gr.Textbox(label="Input Text", lines=4,
-                                 placeholder="Εισάγετε κείμενο…")
-                mdl = gr.Radio(choices=list(MODEL_VARIANTS.keys()),
-                               value="Lesbian-only",
-                               label="Model Variant")
                 btn = gr.Button("Parse", variant="primary")
         with gr.Row():
@@ -198,13 +386,17 @@ def create_app():
         with gr.Row():
             with gr.Column():
-                conllu_out = gr.Textbox(label="CoNLL-U",
-                                        lines=10,
-                                        show_copy_button=True)
                 table_out  = gr.Dataframe(label="Token Table")
-                text_out   = gr.Textbox(label="Text-based Dependencies",
-                                        lines=8,
-                                        show_copy_button=True)
         btn.click(
             fn=process_text,

 import traceback
 from pathlib import Path
+# 1. MODEL VARIANTS & INITIALIZATION
 LESBIAN_MODELS = {}
 MODEL_VARIANTS = {
     "Lesbian-only":           "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
 loaded, load_status = initialize_models()
+# 2. CONLL-U / DATAFRAME / TEXT‐VIZ
 def stanza_doc_to_conllu(doc) -> str:
     lines = []
     for sid, sent in enumerate(doc.sentences, 1):
 def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
     """Convert CoNLL-U text into a DataFrame for tabular display.

     Each token line (10 or more tab-separated fields) becomes a row with the
     columns ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS and MISC.
     Each ``# key = value`` comment becomes a header row whose ID is
     ``"# key ="`` and whose FORM is the value. A blank line adds one empty
     row (all-NaN in the frame) as a separator between sentences.

     Args:
         conllu: CoNLL-U formatted text; may be empty.

     Returns:
         A DataFrame with one row per kept line. Comments without ``=`` and
         token lines with fewer than 10 fields are dropped.
     """
     columns = (
         'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS',
         'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
     )
     rows = []
     for line in conllu.splitlines():
         # blank or whitespace-only line → separator between sentences
         if not line.strip():
             # never stack separators, and never start the table with one
             if rows and rows[-1] != {}:
                 rows.append({})
             continue
         # comment line → sentence header, e.g. "# sent_id = 2"
         if line.startswith("#"):
             if "=" in line:
                 # strip the marker itself rather than a fixed two characters,
                 # so "#sent_id = 2" (no space) keeps its full key
                 key, val = line.lstrip("#").split("=", 1)
                 rows.append({
                     'ID':   f"# {key.strip()} =",
                     'FORM': val.strip()
                 })
             continue
         parts = line.split("\t")
         if len(parts) >= 10:
             # zip stops at the ten named columns; extra fields are ignored
             rows.append(dict(zip(columns, parts)))
     return pd.DataFrame(rows)
     if df.empty:
         return "No data to visualize"
     viz = []
+    for _, row in df.iterrows():
+        rid = row.get("ID")
+        # skip outright blank-rows skeleton
+        if pd.isna(rid):
             continue
+        # sentence header
+        if isinstance(rid, str) and rid.startswith("#"):
             if viz:
+                viz.append("")  # blank line before new sentence
+            viz.append(f"{rid} {row.get('FORM')}")
             continue
         w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
         if h != '0':
             try:
             viz.append(f"{w} ({p}) --{d}--> ROOT")
     return "\n".join(viz)
+# 3. FULL SVG BUILDER
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Render one parsed sentence as an inline SVG dependency diagram.

    Words are laid out left to right by their ID; each non-root word gets a
    curved arc to its head with a boxed relation label, the root gets a
    vertical line with a ROOT box, and UPOS / lemma / XPOS / features are
    listed under each word.

    Args:
        sentence_data: A list of per-word dicts or a DataFrame, with the
            keys/columns ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD and DEPREL.
        sentence_num: Not used by the renderer; kept for call compatibility.
        total_sentences: Not used by the renderer; kept for call compatibility.

    Returns:
        The SVG markup as one string, or a ``<p>Error creating SVG: …</p>``
        fragment when the sentence is empty or rendering fails.
    """
    from html import escape  # local import keeps this block self-contained

    try:
        df = pd.DataFrame(sentence_data) if isinstance(sentence_data, list) else sentence_data
        word_count = len(df)
        if word_count == 0:
            # explicit message instead of the ZeroDivisionError text below
            return "<p>Error creating SVG: empty sentence</p>"
        rows = df.to_dict("records")

        # ---- layout constants -------------------------------------------
        base_word_width = 100
        min_spacing = 30
        word_spacing = max(
            base_word_width,
            (word_count * base_word_width + min_spacing * (word_count - 1)) / word_count
        )
        width = max(800, word_count * word_spacing + 100)
        base_height = 500
        word_y = base_height - 120
        pos_y = word_y + 20
        features_start_y = pos_y + 15
        ann_line_height = 12
        default_color = '#000'
        deprel_colors = {
            'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
            'amod': '#8e44ad', 'nmod': '#16a085', 'case': '#34495e', 'punct': '#7f8c8d',
            'cc': '#d35400', 'conj': '#2c3e50', 'cop': '#e74c3c', 'mark': '#9b59b6',
            'csubj': '#3498db', 'xcomp': '#1abc9c', 'ccomp': '#f39c12', 'advcl': '#e91e63',
            'advmod': '#9c27b0', 'obl': '#795548', 'iobj': '#607d8b', 'fixed': '#ff5722',
            'aux': '#ff9800', 'acl': '#4caf50', 'appos': '#673ab7', 'compound': '#009688'
        }

        # ---- x positions and per-word annotation lists ---------------------
        word_positions = {}
        annotations = {}
        for row in rows:
            wid = int(row['ID'])
            word_positions[wid] = 50 + (wid - 1) * word_spacing
            word = row['FORM']
            pos = row['UPOS']
            lemma = row['LEMMA']
            feats = row['FEATS']
            xpos = row['XPOS']
            ann = []
            if pos and pos != '_':
                ann.append(f"upos={pos}")
            if lemma and lemma not in ('_', word):
                ann.append(f"lemma={lemma}")
            if xpos and xpos != '_':
                ann.append(f"xpos={xpos}")
            if feats and feats not in ('', '_'):
                ann.extend(fpair for fpair in feats.split('|') if '=' in fpair)
            annotations[wid] = ann

        # Grow the canvas downwards when a word has more annotation lines than
        # fit under the fixed base height; word_y stays put so arcs are unchanged.
        tallest = max(len(ann) for ann in annotations.values())
        height = max(base_height, features_start_y + tallest * ann_line_height)

        svg = [
            f'<svg width="{width}" height="{height}" xmlns="http://www.w3.org/2000/svg" '
            'style="background: white; border: 1px solid #eee;">',
            '<defs>'
        ]
        # One arrowhead per known relation, plus a fallback for every relation
        # that has no colour of its own (e.g. subtypes like "nsubj:pass").
        for rel, color in {**deprel_colors, 'default': default_color}.items():
            svg.append(
                f'<marker id="arrow_{rel}" markerWidth="4" markerHeight="4" '
                'markerUnits="userSpaceOnUse" orient="auto" refX="3.5" refY="2">'
                f'<path d="M0,0 L4,2 L0,4 Z" fill="{color}"/>'
                '</marker>'
            )
        svg.append('</defs><g>')

        # ---- arcs / root lines ----------------------------------------------
        used_spans = []
        for row in rows:
            wid = int(row['ID'])
            hid = int(row['HEAD'])
            rel = row['DEPREL']
            x1 = word_positions[wid]
            col = deprel_colors.get(rel, default_color)
            if hid == 0:
                # root: vertical line from the word up to the top, with a ROOT box
                svg.append(
                    f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
                    f'stroke="{col}" stroke-width="1.5"/>'
                )
                mid = (word_y-15 + 50) / 2
                svg.append(
                    f'<rect x="{x1-15}" y="{mid-8}" width="30" height="14" '
                    f'fill="white" stroke="{col}" rx="2"/>'
                )
                svg.append(
                    f'<text x="{x1}" y="{mid+2}" text-anchor="middle" '
                    f'fill="{col}" font-family="Arial" font-size="8" font-weight="bold">ROOT</text>'
                )
                continue

            x2 = word_positions.get(hid, x1)
            span = (min(wid, hid), max(wid, hid))
            # pick the lowest level on which this span overlaps no earlier arc
            lvl = 0
            conflict = True
            while conflict:
                conflict = False
                for (es, el), used_lvl in used_spans:
                    if used_lvl == lvl and not (span[1] < es or span[0] > el):
                        lvl += 1
                        conflict = True
                        break
            used_spans.append((span, lvl))

            dist = abs(x2 - x1)
            base_h = min(40 + dist * 0.15, 100)
            arc_h = base_h + lvl * 35
            midx = (x1 + x2) / 2
            cty = word_y - arc_h
            marker = rel if rel in deprel_colors else 'default'
            svg.append(
                f'<path d="M {x1} {word_y-15} Q {midx} {cty} {x2} {word_y-15}" '
                f'stroke="{col}" stroke-width="1.5" fill="none" marker-end="url(#arrow_{marker})"/>'
            )
            # label box centred on the curve's midpoint (quadratic Bézier at t=0.5)
            amx = 0.25*x1 + 0.5*midx + 0.25*x2
            amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
            lw = len(rel)*6 + 8
            svg.append(
                f'<rect x="{amx-lw/2}" y="{amy-8}" width="{lw}" height="14" '
                f'fill="white" stroke="{col}" rx="2"/>'
            )
            svg.append(
                f'<text x="{amx}" y="{amy+2}" text-anchor="middle" '
                f'fill="{col}" font-family="Arial" font-size="8" font-weight="bold">'
                f'{escape(rel)}</text>'
            )

        # ---- words + annotations --------------------------------------------
        for row in rows:
            wid = int(row['ID'])
            x = word_positions[wid]
            # all text comes from user input / model output, so escape it
            svg.append(
                f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                'font-family="Arial" font-size="13" font-weight="bold">'
                f'{escape(str(row["FORM"]))}</text>'
            )
            for i, a in enumerate(annotations[wid]):
                y0 = features_start_y + i * ann_line_height
                svg.append(
                    f'<text x="{x}" y="{y0}" text-anchor="middle" '
                    f'font-family="Arial" font-size="7" fill="#666">{escape(a)}</text>'
                )
        svg.append('</g></svg>')
        return "".join(svg)
    except Exception as e:
        # UI boundary: report the failure in place of the diagram
        return f"<p>Error creating SVG: {escape(str(e))}</p>"
+# 4. PROCESS & DROPDOWN-UPDATES
 def process_text(text, variant):
     if not text.strip():
     sentences = []
     for sent in doc.sentences:
+        payload = [{
+            'ID':     w.id,
+            'FORM':   w.text,
+            'LEMMA':  w.lemma or "_",
+            'UPOS':   w.upos or "_",
+            'XPOS':   w.xpos or "_",
+            'FEATS':  w.feats or "_",
+            'HEAD':   w.head or 0,
             'DEPREL': w.deprel or "_"
+        } for w in sent.words]
         sentences.append(payload)
     sent_ids = [str(i+1) for i in range(len(sentences))]
 def update_svg(selected_id, sentences):
     """Return the SVG diagram for the sentence chosen in the UI.

     Args:
         selected_id: 1-based sentence number, as a string or int.
         sentences: Sequence of per-sentence payloads accepted by
             ``create_single_sentence_svg``.

     Returns:
         The rendered markup, or ``"<p>Invalid selection</p>"`` when the id is
         not a number, is out of range, or rendering raises.
     """
     try:
         idx = int(selected_id) - 1
         # "0" or a negative id would otherwise wrap around via negative
         # indexing and silently show the wrong sentence
         if idx < 0:
             return "<p>Invalid selection</p>"
         return create_single_sentence_svg(sentences[idx])
     except Exception:
         # still best-effort, but no longer traps KeyboardInterrupt/SystemExit
         return "<p>Invalid selection</p>"
+# 5. BUILD GRADIO UI
 def create_app():
     with gr.Blocks(title="Lesbian Greek Parser") as app:
         gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
         with gr.Row():
             with gr.Column():
+                txt = gr.Textbox(
+                    label="Input Text",
+                    lines=4,
+                    placeholder="Εισάγετε κείμενο…"
+                )
+                mdl = gr.Radio(
+                    choices=list(MODEL_VARIANTS.keys()),
+                    value="Lesbian-only",
+                    label="Model Variant"
+                )
                 btn = gr.Button("Parse", variant="primary")
         with gr.Row():
         with gr.Row():
             with gr.Column():
+                conllu_out = gr.Textbox(
+                    label="CoNLL-U",
+                    lines=10,
+                    show_copy_button=True
+                )
                 table_out  = gr.Dataframe(label="Token Table")
+                text_out   = gr.Textbox(
+                    label="Text-based Dependencies",
+                    lines=8,
+                    show_copy_button=True
+                )
         btn.click(
             fn=process_text,