Spaces:

sbompolas
/

Lesbian-morphosyntactic-parsing

Sleeping

App Files Files Community

sbompolas committed on Jul 1, 2025

Commit

f73ba75

verified ·

1 Parent(s): 78b8555

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -104

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import requests
 import traceback
 from pathlib import Path
-# ─── 1. MODEL VARIANTS & INITIALIZATION ──────────────────────────────────────
 LESBIAN_MODELS = {}
 MODEL_VARIANTS = {
@@ -34,20 +34,18 @@ def initialize_models():
             out = models_dir/name
             out.mkdir(exist_ok=True)
             files = {
-                "tokenizer.pt":    f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
-                "lemmatizer.pt":   f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
-                "pos.pt":          f"https://huggingface.co/{repo}/resolve/main/pos.pt",
-                "depparse.pt":     f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
             }
-            for fn, url in files.items():
                 tgt = out/fn
                 if not tgt.exists() and not download_model_file(url, str(tgt)):
                     return False, f"Failed to download {fn} for {name}"
             cfg = {
                 'processors': 'tokenize,pos,lemma,depparse',
-                'lang': 'el',
-                'use_gpu': False,
-                'verbose': False,
                 'tokenize_model_path': str(out/"tokenizer.pt"),
                 'pos_model_path':      str(out/"pos.pt"),
                 'lemma_model_path':    str(out/"lemmatizer.pt"),
@@ -62,95 +60,139 @@ def initialize_models():
 loaded, load_status = initialize_models()
-# ─── 2. CONLL-U / TOKEN TABLE / TEXT-BASED VIZ ────────────────────────────────
 def stanza_doc_to_conllu(doc) -> str:
     lines = []
-    for sid, sent in enumerate(doc.sentences, 1):
         lines.append(f"# sent_id = {sid}")
         lines.append(f"# text = {sent.text}")
         for w in sent.words:
             fields = [
-                str(w.id),
-                w.text,
-                w.lemma or "_",
-                w.upos or "_",
-                w.xpos or "_",
-                w.feats or "_",
                 str(w.head) if w.head is not None else "0",
-                w.deprel or "_",
-                "_",
-                "_"
             ]
             lines.append("\t".join(fields))
-        lines.append("")
     return "\n".join(lines)
 def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
-    """Return only the 10 token columns, renamed per spec."""
-    rows = []
-    for line in conllu.splitlines():
-        if not line or line.startswith("#"):
-            continue
-        parts = line.split("\t")
-        if len(parts) < 10:
-            continue
-        rows.append({
-            "Id":       parts[0],
-            "Form":     parts[1],
-            "Lemma":    parts[2],
-            "UPosTag":  parts[3],
-            "XPosTag":  parts[4],
-            "Feats":    parts[5],
-            "Head":     parts[6],
-            "DepRel":   parts[7],
-            "Deps":     parts[8],
-            "Misc":     parts[9],
-        })
-    return pd.DataFrame(rows, columns=[
-        "Id","Form","Lemma","UPosTag","XPosTag",
-        "Feats","Head","DepRel","Deps","Misc"
     ])
-def create_dependency_visualization(df: pd.DataFrame) -> str:
     """
-    Blank line before every new sentence (ID == '1'), except the first.
     """
-    if df.empty:
-        return "No data to visualize"
-    lines = []
     first = True
-    for row in df.itertuples(index=False):
-        if row.Id == "1":
-            if not first:
-                lines.append("")            # blank line between sentences
-            first = False
-        w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
-        if h != "0":
-            try:
-                hw = df[df.Id == h].iloc[0].Form
-            except:
-                hw = "[ERR]"
-            lines.append(f"{w} ({p}) --{d}--> {hw}")
-        else:
-            lines.append(f"{w} ({p}) --{d}--> ROOT")
-    return "\n".join(lines)
-# ─── 3. FULL SVG BUILDER ─────────────────────────────────────────────────────
 def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
     """
-    Paste your original create_single_sentence_svg code here unchanged.
-    It expects sentence_data: List[Dict] and returns <svg>…</svg>.
     """
-    # … your full SVG builder …
     return "<svg><!-- your SVG here --></svg>"
-# ─── 4. PROCESS & DROPDOWN-UPDATES ───────────────────────────────────────────
 def process_text(text, variant):
     if not text.strip():
@@ -167,12 +209,17 @@ def process_text(text, variant):
             [], "", pd.DataFrame(), ""
         )
-    doc = pipe(text)
     conllu = stanza_doc_to_conllu(doc)
     df     = conllu_to_dataframe(conllu)
-    text_v = create_dependency_visualization(df)
-    # build sentence‐level payload for SVG
     sentences = []
     for sent in doc.sentences:
         payload = [{
@@ -198,13 +245,13 @@ def process_text(text, variant):
 def update_svg(selected_id, sentences):
     try:
-        idx = int(selected_id) - 1
         return create_single_sentence_svg(sentences[idx])
     except:
         return "<p>Invalid selection</p>"
-# ─── 5. BUILD THE GRADIO INTERFACE ──────────────────────────────────────────
 def create_app():
     with gr.Blocks(title="Lesbian Greek Parser") as app:
@@ -215,16 +262,10 @@ def create_app():
         with gr.Row():
             with gr.Column():
-                txt = gr.Textbox(
-                    label="Input Text",
-                    lines=4,
-                    placeholder="Εισάγετε κείμενο…"
-                )
-                mdl = gr.Radio(
-                    choices=list(MODEL_VARIANTS.keys()),
-                    value="Lesbian-only",
-                    label="Model Variant"
-                )
                 btn = gr.Button("Parse", variant="primary")
         with gr.Row():
@@ -235,29 +276,14 @@ def create_app():
         with gr.Row():
             with gr.Column():
-                conllu_out = gr.Textbox(
-                    label="CoNLL-U",
-                    lines=10,
-                    show_copy_button=True
-                )
                 table_out  = gr.Dataframe(label="Token Table")
-                text_out   = gr.Textbox(
-                    label="Text-based Dependencies",
-                    lines=8,
-                    show_copy_button=True
-                )
         btn.click(
             fn=process_text,
             inputs=[txt, mdl],
-            outputs=[
-                svg_out,
-                sentence_dd,
-                sentences_st,
-                conllu_out,
-                table_out,
-                text_out
-            ]
         )
         sentence_dd.change(
             fn=update_svg,

 import traceback
 from pathlib import Path
+# 1. MODEL VARIANTS & INITIALIZATION
 LESBIAN_MODELS = {}
 MODEL_VARIANTS = {
             out = models_dir/name
             out.mkdir(exist_ok=True)
             files = {
+                "tokenizer.pt":  f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
+                "lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
+                "pos.pt":        f"https://huggingface.co/{repo}/resolve/main/pos.pt",
+                "depparse.pt":   f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
             }
+            for fn,url in files.items():
                 tgt = out/fn
                 if not tgt.exists() and not download_model_file(url, str(tgt)):
                     return False, f"Failed to download {fn} for {name}"
             cfg = {
                 'processors': 'tokenize,pos,lemma,depparse',
+                'lang': 'el', 'use_gpu': False, 'verbose': False,
                 'tokenize_model_path': str(out/"tokenizer.pt"),
                 'pos_model_path':      str(out/"pos.pt"),
                 'lemma_model_path':    str(out/"lemmatizer.pt"),
 loaded, load_status = initialize_models()
# 2. CONLL-U TO COARSE OUTPUT
def stanza_doc_to_conllu(doc) -> str:
    """Serialize a parsed document into CoNLL-U-style text.

    For every sentence in ``doc.sentences`` this emits a ``# sent_id`` comment
    (1-based running number), a ``# text`` comment, one tab-separated
    10-column line per entry of ``sent.words`` and a terminating empty line.

    Args:
        doc: object exposing ``sentences``; each sentence exposes ``text`` and
            ``words``; each word exposes ``id``, ``text``, ``lemma``, ``upos``,
            ``xpos``, ``feats``, ``head`` and ``deprel``.

    Returns:
        The joined text. Missing (falsy) lemma/upos/xpos/feats/deprel values
        are written as ``_``, a ``None`` head is written as ``0``, and the
        DEPS and MISC columns are always ``_``.
    """
    lines = []
    for sid, sent in enumerate(doc.sentences, 1):
        # A CoNLL-U comment must stay on one physical line. The raw sentence
        # text may contain line breaks or tabs, which would otherwise spill
        # into the token area and confuse line-based consumers.
        flat_text = " ".join((sent.text or "").split())
        lines.append(f"# sent_id = {sid}")
        lines.append(f"# text = {flat_text}")
        for w in sent.words:
            fields = [
                str(w.id),
                w.text,
                w.lemma or "_",
                w.upos or "_",
                w.xpos or "_",
                w.feats or "_",
                str(w.head) if w.head is not None else "0",
                w.deprel or "_",
                "_",  # DEPS is not produced here
                "_",  # MISC is not produced here
            ]
            lines.append("\t".join(fields))
        lines.append("")  # blank line after each sentence
    return "\n".join(lines)
# 3. TOKEN TABLE: insert comment rows + empty rows per sentence
def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
    """Turn CoNLL-U text into a display table, one row per relevant line.

    Sentence blocks are separated by blank lines. For each block:
      - an all-empty separator row is inserted before it (except the first);
      - every comment line (starting with ``#``) becomes a row whose ``Id``
        cell holds the whole comment and whose other cells are empty;
      - every token line with at least 10 tab-separated fields becomes a
        row built from its first 10 fields; shorter lines are ignored.

    Lines are classified by their content rather than by position, so blocks
    with zero, one, or more than two comment lines are handled without
    raising or mislabelling token lines.

    Args:
        conllu: CoNLL-U formatted text (may be empty).

    Returns:
        DataFrame with the columns Id, Form, Lemma, UPosTag, XPosTag, Feats,
        Head, DepRel, Deps, Misc; empty when no usable line is present.
    """
    columns = [
        "Id", "Form", "Lemma", "UPosTag", "XPosTag",
        "Feats", "Head", "DepRel", "Deps", "Misc",
    ]

    def make_row(first_cell=""):
        # Row with every cell blank except (optionally) the Id column.
        row = dict.fromkeys(columns, "")
        row["Id"] = first_cell
        return row

    records = []
    inside_block = False   # True while consuming lines of one sentence
    emitted_block = False  # True once any sentence block has been started
    for line in conllu.splitlines():
        if not line.strip():
            inside_block = False  # blank line closes the current sentence
            continue
        if not inside_block:
            if emitted_block:
                records.append(make_row())  # visual gap between sentences
            inside_block = True
            emitted_block = True
        if line.startswith("#"):
            records.append(make_row(line))
            continue
        parts = line.split("\t")
        if len(parts) < 10:
            continue
        # zip() stops after the 10 known columns, ignoring surplus fields.
        records.append(dict(zip(columns, parts)))
    return pd.DataFrame(records, columns=columns)
# 4. TEXT-BASED DEPENDENCIES: blank + comment per sentence
def create_dependency_visualization(conllu: str) -> str:
    """Render CoNLL-U text as one ``form (UPOS) --deprel--> head`` line per token.

    Sentence blocks are separated by blank lines. For each block the output
    contains an empty separator line (except before the first block), the
    block's comment lines (those starting with ``#``) and then one line per
    token that has at least 8 tab-separated fields.

    The head is shown as ``ROOT`` when the HEAD column is ``0``, as the head
    token's form when the id exists in the same sentence, and as ``[ERR]``
    when the id cannot be resolved, so a broken parse is not mistaken for a
    root attachment.

    Args:
        conllu: CoNLL-U formatted text (may be empty).

    Returns:
        The rendered lines joined with ``\\n`` (empty string for empty input).
    """
    # Group non-blank lines into sentence blocks.
    blocks = []
    current = []
    for line in conllu.splitlines():
        if line.strip():
            current.append(line)
        elif current:
            blocks.append(current)
            current = []
    if current:
        blocks.append(current)

    out = []
    for index, block in enumerate(blocks):
        if index:
            out.append("")  # blank line separator between sentences
        out.extend(ln for ln in block if ln.startswith("#"))
        token_fields = [ln.split("\t") for ln in block if not ln.startswith("#")]
        # id -> form lookup, scoped to this sentence only
        id2form = {p[0]: p[1] for p in token_fields if len(p) >= 2}
        for p in token_fields:
            if len(p) < 8:
                continue
            form, upos, head, deprel = p[1], p[3], p[6], p[7]
            if head == "0":
                target = "ROOT"
            else:
                target = id2form.get(head, "[ERR]")
            out.append(f"{form} ({upos}) --{deprel}--> {target}")
    return "\n".join(out)
# 5. SVG BUILDER (unchanged)
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
    """Placeholder for the per-sentence SVG renderer.

    The real SVG-generation code is meant to be pasted in here; per the
    original note it takes ``sentence_data`` as a list of dicts and returns an
    ``<svg>…</svg>`` string. As written, all three arguments are ignored and a
    fixed stub string is returned.
    """
    # Stub output until the real builder is dropped in.
    placeholder_svg = "<svg><!-- your SVG here --></svg>"
    return placeholder_svg
+# 6. PROCESS + DROPDOWN
 def process_text(text, variant):
     if not text.strip():
             [], "", pd.DataFrame(), ""
         )
+    # parse
+    doc    = pipe(text)
     conllu = stanza_doc_to_conllu(doc)
+    # build token table
     df     = conllu_to_dataframe(conllu)
+    # text-based deps
+    text_v = create_dependency_visualization(conllu)
+    # prepare sentence payloads for SVG
     sentences = []
     for sent in doc.sentences:
         payload = [{
 def update_svg(selected_id, sentences):
     """Return the SVG for the sentence picked by its 1-based number.

     Args:
         selected_id: sentence number (anything ``int()`` accepts); ``1``
             refers to ``sentences[0]``.
         sentences: sequence of per-sentence payloads handed to
             ``create_single_sentence_svg``; may be empty or ``None``.

     Returns:
         The renderer's output, or ``"<p>Invalid selection</p>"`` when the
         selection cannot be parsed, is out of range, or rendering fails.
     """
     import traceback

     invalid = "<p>Invalid selection</p>"
     try:
         idx = int(selected_id) - 1
         # Reject 0 and negatives explicitly: Python would otherwise index
         # from the end and silently show the wrong sentence.
         in_range = 0 <= idx < len(sentences)
     except (TypeError, ValueError):
         return invalid
     if not in_range:
         return invalid
     try:
         return create_single_sentence_svg(sentences[idx])
     except Exception:
         # Keep the UI responsive, but leave a trace of the real failure
         # instead of swallowing it silently.
         traceback.print_exc()
         return invalid
+# 7. GRADIO UI
 def create_app():
     with gr.Blocks(title="Lesbian Greek Parser") as app:
         with gr.Row():
             with gr.Column():
+                txt = gr.Textbox(label="Input Text", lines=4,
+                                 placeholder="Εισάγετε κείμενο…")
+                mdl = gr.Radio(choices=list(MODEL_VARIANTS.keys()),
+                               value="Lesbian-only", label="Model Variant")
                 btn = gr.Button("Parse", variant="primary")
         with gr.Row():
         with gr.Row():
             with gr.Column():
+                conllu_out = gr.Textbox(label="CoNLL-U", lines=10, show_copy_button=True)
                 table_out  = gr.Dataframe(label="Token Table")
+                text_out   = gr.Textbox(label="Text-based Dependencies", lines=8, show_copy_button=True)
         btn.click(
             fn=process_text,
             inputs=[txt, mdl],
+            outputs=[svg_out, sentence_dd, sentences_st, conllu_out, table_out, text_out]
         )
         sentence_dd.change(
             fn=update_svg,