Spaces:

sbompolas
/

Lesbian-morphosyntactic-parsing

Sleeping

App Files Files Community

sbompolas committed on Jul 1, 2025

Commit

9b63c71

verified ·

1 Parent(s): 961e051

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -118

app.py CHANGED Viewed

@@ -6,15 +6,19 @@ import requests
 import traceback
 from pathlib import Path
 # 1. MODEL VARIANTS & INITIALIZATION
 MODEL_VARIANTS = {
-    "Lesbian-only (UD_Greek-Lesbian)": "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
-    "Lesbian-augmented (UD_Greek-NGUD+Lesbian)": "sbompolas/NGUD-Lesbian-Morphosyntactic-Model",
-    "Standard Modern Greek (UD_Greek-GUD)": "viv/UD_Greek-GUD/tree/main/models"
 }
-def download_model_file(url, filename):
     try:
         resp = requests.get(url, stream=True)
         resp.raise_for_status()
@@ -23,26 +27,33 @@ def download_model_file(url, filename):
                 f.write(chunk)
         return True
     except Exception as e:
-        print(f"Download failed {filename}: {e}")
         return False
 def initialize_models():
     try:
         base = Path("./models")
         base.mkdir(exist_ok=True)
-        for name, repo in MODEL_VARIANTS.items():
-            out = base / name
             out.mkdir(exist_ok=True)
             files = {
-                "tokenizer.pt":  f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
-                "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
-                "pos.pt":        f"https://huggingface.co/{repo}/resolve/main/pos.pt",
-                "depparse.pt":   f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
             }
             for fn, url in files.items():
                 tgt = out / fn
                 if not tgt.exists() and not download_model_file(url, str(tgt)):
-                    return False, f"Failed to download {fn} for {name}"
             cfg = {
                 'processors': 'tokenize,pos,lemma,depparse',
                 'lang': 'el',
@@ -51,9 +62,10 @@ def initialize_models():
                 'tokenize_model_path': str(out/"tokenizer.pt"),
                 'pos_model_path':      str(out/"pos.pt"),
                 'lemma_model_path':    str(out/"lemmatizer.pt"),
-                'depparse_model_path': str(out/"depparse.pt")
             }
-            LESBIAN_MODELS[name] = stanza.Pipeline(**cfg)
         return True, "Models loaded"
     except Exception as e:
         traceback.print_exc()
@@ -62,7 +74,9 @@ def initialize_models():
 loaded, load_status = initialize_models()
-# 2. CONLL-U / DATAFRAME
 def stanza_doc_to_conllu(doc) -> str:
     lines = []
@@ -71,22 +85,20 @@ def stanza_doc_to_conllu(doc) -> str:
         lines.append(f"# text = {sent.text}")
         for w in sent.words:
             fields = [
-                str(w.id), w.text,
-                w.lemma or "_", w.upos or "_",
                 w.xpos or "_", w.feats or "_",
-                str(w.head) if w.head is not None else "0",
                 w.deprel or "_", "_", "_"
             ]
             lines.append("\t".join(fields))
-        lines.append("")  # blank line after each sentence
     return "\n".join(lines)
 def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
     cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
     rows = []
     for line in conllu.splitlines():
-        if not line:
-            # empty separator row
             rows.append({c: "" for c in cols})
             continue
         if line.startswith("#"):
@@ -103,7 +115,9 @@ def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
     return pd.DataFrame(rows, columns=cols).fillna("")
-# 3. FULL SVG BUILDER (crop top + bottom padding, arrows at start)
 def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
     try:
@@ -112,16 +126,17 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
         base_w, min_sp = 100, 30
         spacing = max(base_w, (n*base_w + (n-1)*min_sp)/n)
         width = max(800, n*spacing + 100)
-        orig_height = 500
-        crop_top   = 30  # px to remove from top
-        bottom_pad = 30  # px to add at bottom
-        height     = orig_height - crop_top + bottom_pad
-        word_y   = height - 120
-        feats_y  = word_y + 35
         colors = {
-            'root':'#000000','nsubj':'#2980b9','obj':'#27ae60','det':'#e67e22',
             'amod':'#8e44ad','nmod':'#16a085','case':'#34495e','punct':'#7f8c8d',
             'cc':'#d35400','conj':'#2c3e50','cop':'#e74c3c','mark':'#9b59b6',
             'csubj':'#3498db','xcomp':'#1abc9c','ccomp':'#f39c12','advcl':'#e91e63',
@@ -130,8 +145,10 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
         }
         svg = [
-            f'<svg width="{width}" height="{height}" viewBox="0 {crop_top} {width} {orig_height}" '
-            'xmlns="http://www.w3.org/2000/svg" style="background:white;border:1px solid #eee;"><defs>'
         ]
         for rel, c in colors.items():
             svg.append(
@@ -139,98 +156,81 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
                 'markerUnits="userSpaceOnUse" orient="auto-start-reverse" refX="3.5" refY="2">'
                 f'<path d="M0,0 L4,2 L0,4Z" fill="{c}"/></marker>'
             )
-        svg.append('</defs><g>')
-        # compute x positions
         xpos = {
-            int(r['ID']): 50 + (int(r['ID']) - 1) * spacing
-            for _, r in df.iterrows() if str(r['ID']).isdigit()
         }
         used_spans = []
         for _, r in df.iterrows():
-            if not str(r['ID']).isdigit():
-                continue
             i, h = int(r['ID']), int(r['HEAD'])
             rel, c = r['DEPREL'], colors.get(r['DEPREL'], '#000')
             x1 = xpos[i]
             if h == 0:
-                # ROOT line
-                svg.append(
-                    f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
-                    f'stroke="{c}" stroke-width="1.5"/>'
-                )
-                mid = (word_y-15 + 50) / 2
-                svg.append(
-                    f'<rect x="{x1-15}" y="{mid-8}" width="30" height="14" '
-                    f'fill="white" stroke="{c}" rx="2"/>'
-                )
-                svg.append(
-                    f'<text x="{x1}" y="{mid+2}" text-anchor="middle" '
-                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">ROOT</text>'
-                )
             else:
                 x2 = xpos.get(h, x1)
-                span = (min(i, h), max(i, h))
-                lvl = 0
                 conflict = True
                 while conflict:
                     conflict = False
-                    for (es, el), used_lvl in used_spans:
-                        if used_lvl == lvl and not (span[1] < es or span[0] > el):
                             lvl += 1
                             conflict = True
                             break
-                used_spans.append((span, lvl))
-                dist = abs(x2 - x1)
-                arc_h = min(40 + dist * 0.15, 100) + lvl * 35
-                midx, cty = (x1 + x2) / 2, word_y - arc_h
-                path_d = f'M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}'
                 svg.append(
-                    f'<path d="{path_d}" stroke="{c}" fill="none" stroke-width="1.5" '
                     f'marker-start="url(#m_{rel})"/>'
                 )
                 amx = 0.25*x1 + 0.5*midx + 0.25*x2
-                amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
-                lw = len(rel)*6 + 8
-                svg.append(
-                    f'<rect x="{amx-lw/2}" y="{amy-8}" width="{lw}" height="14" '
-                    f'fill="white" stroke="{c}" rx="2"/>'
-                )
-                svg.append(
-                    f'<text x="{amx}" y="{amy+2}" text-anchor="middle" '
-                    f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">{rel}</text>'
-                )
-        # draw tokens + annotations
         for _, r in df.iterrows():
-            if not str(r['ID']).isdigit():
-                continue
             x = xpos[int(r['ID'])]
-            svg.append(
-                f'<text x="{x}" y="{word_y}" text-anchor="middle" '
-                f'font-family="Arial" font-size="13" font-weight="bold">{r["FORM"]}</text>'
-            )
             ann = []
-            if r['UPOS'] and r['UPOS'] != '_': ann.append(f"upos={r['UPOS']}")
-            if r['LEMMA'] not in ('_', r['FORM']): ann.append(f"lemma={r['LEMMA']}")
-            if r['FEATS'] and r['FEATS'] not in ('', '_'):
-                for f in r['FEATS'].split('|'):
-                    if '=' in f:
-                        ann.append(f)
-            for i, a in enumerate(ann):
-                svg.append(
-                    f'<text x="{x}" y="{feats_y + i*12}" text-anchor="middle" '
-                    f'font-family="Arial" font-size="7" fill="#666">{a}</text>'
-                )
-        svg.append('</g></svg>')
         return "".join(svg)
     except Exception as e:
         return f"<p>Error creating SVG: {e}</p>"
-# 4. PROCESS & DROPDOWN-UPDATES
 def process_text(text, variant):
     if not text.strip():
@@ -239,33 +239,34 @@ def process_text(text, variant):
             gr.Dropdown.update(choices=[], value=None),
             [], "", pd.DataFrame()
         )
-    pipe = LESBIAN_MODELS.get(variant)
     if pipe is None:
         return (
             gr.HTML.update(value="<p>Error: model not loaded</p>"),
             gr.Dropdown.update(choices=[], value=None),
             [], "", pd.DataFrame()
         )
-    doc = pipe(text)
     conllu = stanza_doc_to_conllu(doc)
-    df = conllu_to_dataframe(conllu)
     sentences = []
     for sent in doc.sentences:
         payload = [{
-            'ID':     w.id,
-            'FORM':   w.text,
-            'LEMMA':  w.lemma or "_",
-            'UPOS':   w.upos or "_",
-            'XPOS':   w.xpos or "_",
-            'FEATS':  w.feats or "_",
-            'HEAD':   w.head or 0,
-            'DEPREL': w.deprel or "_"
         } for w in sent.words]
         sentences.append(payload)
     sent_ids = [str(i+1) for i in range(len(sentences))]
-    dd_upd = update(choices=sent_ids, value=sent_ids[0] if sent_ids else None)
     init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No data</p>"
     return init_svg, dd_upd, sentences, conllu, df
@@ -278,38 +279,41 @@ def update_svg(selected_id, sentences):
         return "<p>Invalid selection</p>"
-# 5. BUILD GRADIO UI
 def create_app():
     with gr.Blocks(title="Lesbian Greek Parser") as app:
         gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
         if not loaded:
             gr.Markdown(f"❌ Load error: {load_status}")
         with gr.Row():
             with gr.Column():
-                txt = gr.Textbox(label="Input Text", lines=4, placeholder="Εισάγετε κείμενο…")
-                mdl = gr.Radio(choices=list(MODEL_VARIANTS.keys()), value="Lesbian-only", label="Model Variant")
-                btn = gr.Button("Parse", variant="primary")
         with gr.Row():
             with gr.Column():
-                svg_out      = gr.HTML("<p>No visualization</p>")
-                sentence_dd  = gr.Dropdown(label="Choose sentence", choices=[])
-                sentences_st = gr.State([])
         with gr.Row():
             with gr.Column():
                 conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
                 table_out  = gr.Dataframe(label="Token Table")
-        btn.click(
-            fn=process_text,
-            inputs=[txt, mdl],
-            outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out]
-        )
-        sentence_dd.change(fn=update_svg, inputs=[sentence_dd, sentences_st], outputs=svg_out)
     return app

 import traceback
 from pathlib import Path
+# -----------------------------------------------------------------------------
 # 1. MODEL VARIANTS & INITIALIZATION
+# -----------------------------------------------------------------------------
 MODEL_VARIANTS = {
+    "Lesbian-only (UD_Greek-Lesbian)":             "sbompolas/Lesbian-Greek-Morphosyntactic-Model",
+    "Lesbian-augmented (UD_Greek-NGUD+Lesbian)":   "sbompolas/NGUD-Lesbian-Morphosyntactic-Model",
+    "Standard Modern Greek (UD_Greek-GUD)":        "viv/UD_Greek-GUD"
 }
+MODELS = {}
+def download_model_file(url: str, filename: str) -> bool:
     try:
         resp = requests.get(url, stream=True)
         resp.raise_for_status()
                 f.write(chunk)
         return True
     except Exception as e:
+        print(f"⛔ Download failed {filename}: {e}")
         return False
 def initialize_models():
     try:
         base = Path("./models")
         base.mkdir(exist_ok=True)
+        for label, repo in MODEL_VARIANTS.items():
+            out = base / label
             out.mkdir(exist_ok=True)
+            # GUD stores its .pt files under "models/" subfolder
+            is_gud = repo.endswith("UD_Greek-GUD")
+            subdir = "models" if is_gud else ""
             files = {
+                "tokenizer.pt":  f"https://huggingface.co/{repo}/resolve/main/{subdir}/tokenizer.pt",
+                "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/{subdir}/lemmatizer.pt",
+                "pos.pt":        f"https://huggingface.co/{repo}/resolve/main/{subdir}/pos.pt",
+                "depparse.pt":   f"https://huggingface.co/{repo}/resolve/main/{subdir}/depparse.pt",
             }
             for fn, url in files.items():
                 tgt = out / fn
                 if not tgt.exists() and not download_model_file(url, str(tgt)):
+                    return False, f"Failed to download {fn} for {label}"
             cfg = {
                 'processors': 'tokenize,pos,lemma,depparse',
                 'lang': 'el',
                 'tokenize_model_path': str(out/"tokenizer.pt"),
                 'pos_model_path':      str(out/"pos.pt"),
                 'lemma_model_path':    str(out/"lemmatizer.pt"),
+                'depparse_model_path': str(out/"depparse.pt"),
             }
+            MODELS[label] = stanza.Pipeline(**cfg)
         return True, "Models loaded"
     except Exception as e:
         traceback.print_exc()
 loaded, load_status = initialize_models()
+# -----------------------------------------------------------------------------
+# 2. CONLL-U OUTPUT & TOKEN TABLE
+# -----------------------------------------------------------------------------
 def stanza_doc_to_conllu(doc) -> str:
     lines = []
         lines.append(f"# text = {sent.text}")
         for w in sent.words:
             fields = [
+                str(w.id), w.text, w.lemma or "_", w.upos or "_",
                 w.xpos or "_", w.feats or "_",
+                str(w.head if w.head is not None else 0),
                 w.deprel or "_", "_", "_"
             ]
             lines.append("\t".join(fields))
+        lines.append("")  # separator
     return "\n".join(lines)
 def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
     cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
     rows = []
     for line in conllu.splitlines():
+        if not line.strip():
             rows.append({c: "" for c in cols})
             continue
         if line.startswith("#"):
     return pd.DataFrame(rows, columns=cols).fillna("")
+# -----------------------------------------------------------------------------
+# 3. SVG BUILDER (arrows at start, crop top + pad bottom)
+# -----------------------------------------------------------------------------
 def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
     try:
         base_w, min_sp = 100, 30
         spacing = max(base_w, (n*base_w + (n-1)*min_sp)/n)
         width = max(800, n*spacing + 100)
+        orig_h   = 500
+        crop_top = 30
+        pad_bot  = 30
+        height   = orig_h - crop_top + pad_bot
+        word_y  = height - 120
+        feats_y = word_y + 35
         colors = {
+            'root':'#000', 'nsubj':'#2980b9', 'obj':'#27ae60', 'det':'#e67e22',
             'amod':'#8e44ad','nmod':'#16a085','case':'#34495e','punct':'#7f8c8d',
             'cc':'#d35400','conj':'#2c3e50','cop':'#e74c3c','mark':'#9b59b6',
             'csubj':'#3498db','xcomp':'#1abc9c','ccomp':'#f39c12','advcl':'#e91e63',
         }
         svg = [
+            f'<svg width="{width}" height="{height}" '
+            f'viewBox="0 {crop_top} {width} {orig_h}" '
+            'xmlns="http://www.w3.org/2000/svg" '
+            'style="background:white;border:1px solid #eee;"><defs>'
         ]
         for rel, c in colors.items():
             svg.append(
                 'markerUnits="userSpaceOnUse" orient="auto-start-reverse" refX="3.5" refY="2">'
                 f'<path d="M0,0 L4,2 L0,4Z" fill="{c}"/></marker>'
             )
+        svg.append("</defs><g>")
         xpos = {
+            int(r['ID']): 50 + (int(r['ID'])-1)*spacing
+            for _,r in df.iterrows() if str(r['ID']).isdigit()
         }
         used_spans = []
         for _, r in df.iterrows():
+            if not str(r['ID']).isdigit(): continue
             i, h = int(r['ID']), int(r['HEAD'])
             rel, c = r['DEPREL'], colors.get(r['DEPREL'], '#000')
             x1 = xpos[i]
             if h == 0:
+                svg.append(f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
+                           f'stroke="{c}" stroke-width="1.5"/>')
+                mid = (word_y-15 + 50)/2
+                svg.append(f'<rect x="{x1-15}" y="{mid-8}" width="30" height="14" '
+                           f'fill="white" stroke="{c}" rx="2"/>')
+                svg.append(f'<text x="{x1}" y="{mid+2}" text-anchor="middle" '
+                           f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">ROOT</text>')
             else:
                 x2 = xpos.get(h, x1)
+                span = (min(i,h), max(i,h))
+                lvl  = 0
                 conflict = True
                 while conflict:
                     conflict = False
+                    for (es,el), lvl_used in used_spans:
+                        if lvl_used==lvl and not (span[1]<es or span[0]>el):
                             lvl += 1
                             conflict = True
                             break
+                used_spans.append((span,lvl))
+                dist  = abs(x2-x1)
+                arc_h = min(40+dist*0.15,100)+lvl*35
+                midx, cty = (x1+x2)/2, word_y-arc_h
+                path = f'M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}'
                 svg.append(
+                    f'<path d="{path}" stroke="{c}" fill="none" stroke-width="1.5" '
                     f'marker-start="url(#m_{rel})"/>'
                 )
                 amx = 0.25*x1 + 0.5*midx + 0.25*x2
+                amy = 0.25*(word_y-15)+0.5*cty+0.25*(word_y-15)
+                lw  = len(rel)*6 + 8
+                svg.append(f'<rect x="{amx-lw/2}" y="{amy-8}" width="{lw}" height="14" '
+                           f'fill="white" stroke="{c}" rx="2"/>')
+                svg.append(f'<text x="{amx}" y="{amy+2}" text-anchor="middle" '
+                           f'fill="{c}" font-family="Arial" font-size="8" font-weight="bold">{rel}</text>')
         for _, r in df.iterrows():
+            if not str(r['ID']).isdigit(): continue
             x = xpos[int(r['ID'])]
+            svg.append(f'<text x="{x}" y="{word_y}" text-anchor="middle" '
+                       f'font-family="Arial" font-size="13" font-weight="bold">{r["FORM"]}</text>')
             ann = []
+            if r['UPOS'] and r['UPOS']!="_": ann.append(f"upos={r['UPOS']}")
+            if r['LEMMA'] not in ("_",r["FORM"]): ann.append(f"lemma={r['LEMMA']}")
+            if r['FEATS'] and r['FEATS']!="_":
+                for f in r['FEATS'].split("|"):
+                    if "=" in f: ann.append(f)
+            for i,a in enumerate(ann):
+                svg.append(f'<text x="{x}" y="{feats_y+i*12}" text-anchor="middle" '
+                           f'font-family="Arial" font-size="7" fill="#666">{a}</text>')
+        svg.append("</g></svg>")
         return "".join(svg)
     except Exception as e:
         return f"<p>Error creating SVG: {e}</p>"
+# -----------------------------------------------------------------------------
+# 4. PROCESS & DROPDOWNS
+# -----------------------------------------------------------------------------
 def process_text(text, variant):
     if not text.strip():
             gr.Dropdown.update(choices=[], value=None),
             [], "", pd.DataFrame()
         )
+    pipe = MODELS.get(variant)
     if pipe is None:
         return (
             gr.HTML.update(value="<p>Error: model not loaded</p>"),
             gr.Dropdown.update(choices=[], value=None),
             [], "", pd.DataFrame()
         )
+    doc    = pipe(text)
     conllu = stanza_doc_to_conllu(doc)
+    df     = conllu_to_dataframe(conllu)
     sentences = []
     for sent in doc.sentences:
         payload = [{
+            'ID':    w.id,
+            'FORM':  w.text,
+            'LEMMA': w.lemma or "_",
+            'UPOS':  w.upos or "_",
+            'XPOS':  w.xpos or "_",
+            'FEATS': w.feats or "_",
+            'HEAD':  w.head or 0,
+            'DEPREL':w.deprel or "_"
         } for w in sent.words]
         sentences.append(payload)
     sent_ids = [str(i+1) for i in range(len(sentences))]
+    dd_upd   = update(choices=sent_ids, value=sent_ids[0] if sent_ids else None)
     init_svg = create_single_sentence_svg(sentences[0]) if sentences else "<p>No data</p>"
     return init_svg, dd_upd, sentences, conllu, df
         return "<p>Invalid selection</p>"
+# -----------------------------------------------------------------------------
+# 5. BUILD GRADIO APP
+# -----------------------------------------------------------------------------
 def create_app():
     with gr.Blocks(title="Lesbian Greek Parser") as app:
         gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
         if not loaded:
             gr.Markdown(f"❌ Load error: {load_status}")
         with gr.Row():
             with gr.Column():
+                txt  = gr.Textbox(label="Input Text", lines=4, placeholder="Εισάγετε κείμενο…")
+                mdl  = gr.Radio(choices=list(MODEL_VARIANTS.keys()),
+                                value=list(MODEL_VARIANTS.keys())[0],
+                                label="Model Variant")
+                btn  = gr.Button("Parse", variant="primary")
         with gr.Row():
             with gr.Column():
+                svg_out     = gr.HTML("<p>No visualization</p>")
+                sentence_dd = gr.Dropdown(label="Choose sentence", choices=[])
+                state_sents = gr.State([])
         with gr.Row():
             with gr.Column():
                 conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
                 table_out  = gr.Dataframe(label="Token Table")
+        btn.click(fn=process_text,
+                  inputs=[txt, mdl],
+                  outputs=[svg_out, sentence_dd, state_sents, conllu_out, table_out])
+        sentence_dd.change(fn=update_svg,
+                           inputs=[sentence_dd, state_sents],
+                           outputs=svg_out)
     return app