Update app.py
Browse files
app.py
CHANGED
|
@@ -6,7 +6,7 @@ import requests
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
-
# 1. MODEL VARIANTS & INITIALIZATION
|
| 10 |
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
|
@@ -28,18 +28,18 @@ def download_model_file(url, filename):
|
|
| 28 |
|
| 29 |
def initialize_models():
|
| 30 |
try:
|
| 31 |
-
|
| 32 |
-
|
| 33 |
for name, repo in MODEL_VARIANTS.items():
|
| 34 |
-
out =
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
-
"tokenizer.pt":
|
| 38 |
-
"lemmatizer.pt":
|
| 39 |
-
"pos.pt":
|
| 40 |
-
"depparse.pt":
|
| 41 |
}
|
| 42 |
-
for fn,url in files.items():
|
| 43 |
tgt = out/fn
|
| 44 |
if not tgt.exists() and not download_model_file(url, str(tgt)):
|
| 45 |
return False, f"Failed to download {fn} for {name}"
|
|
@@ -60,11 +60,11 @@ def initialize_models():
|
|
| 60 |
loaded, load_status = initialize_models()
|
| 61 |
|
| 62 |
|
| 63 |
-
# 2.
|
| 64 |
|
| 65 |
def stanza_doc_to_conllu(doc) -> str:
|
| 66 |
lines = []
|
| 67 |
-
for sid, sent in enumerate(doc.sentences,1):
|
| 68 |
lines.append(f"# sent_id = {sid}")
|
| 69 |
lines.append(f"# text = {sent.text}")
|
| 70 |
for w in sent.words:
|
|
@@ -73,126 +73,109 @@ def stanza_doc_to_conllu(doc) -> str:
|
|
| 73 |
w.lemma or "_", w.upos or "_",
|
| 74 |
w.xpos or "_", w.feats or "_",
|
| 75 |
str(w.head) if w.head is not None else "0",
|
| 76 |
-
w.deprel or "_","_","_"
|
| 77 |
]
|
| 78 |
lines.append("\t".join(fields))
|
| 79 |
lines.append("") # blank line after each sentence
|
| 80 |
return "\n".join(lines)
|
| 81 |
|
| 82 |
|
| 83 |
-
# 3. TOKEN TABLE
|
| 84 |
|
| 85 |
-
def
|
| 86 |
"""
|
| 87 |
-
|
| 88 |
-
- (
|
| 89 |
-
-
|
| 90 |
-
-
|
| 91 |
-
|
| 92 |
"""
|
| 93 |
-
blocks = [b for b in conllu.split("\n\n") if b.strip()]
|
| 94 |
records = []
|
|
|
|
| 95 |
first = True
|
| 96 |
-
|
| 97 |
for block in blocks:
|
| 98 |
lines = block.splitlines()
|
| 99 |
-
sid_line
|
| 100 |
-
text_line = lines[1]
|
| 101 |
token_lines = lines[2:]
|
| 102 |
-
|
| 103 |
if not first:
|
| 104 |
# blank row
|
| 105 |
-
records.append({c:"" for c in
|
| 106 |
-
["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"]
|
| 107 |
-
})
|
| 108 |
first = False
|
| 109 |
-
|
| 110 |
# comment rows
|
| 111 |
-
records.append({"Id": sid_line,
|
| 112 |
-
|
| 113 |
-
records.append({"Id": text_line, "Form":"", "Lemma":"", "UPosTag":"",
|
| 114 |
-
"XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
|
| 115 |
-
|
| 116 |
# token rows
|
| 117 |
for tl in token_lines:
|
| 118 |
parts = tl.split("\t")
|
| 119 |
if len(parts) < 10:
|
| 120 |
continue
|
| 121 |
records.append({
|
| 122 |
-
"Id":
|
| 123 |
-
"Form":
|
| 124 |
-
"Lemma":
|
| 125 |
"UPosTag": parts[3],
|
| 126 |
"XPosTag": parts[4],
|
| 127 |
-
"Feats":
|
| 128 |
-
"Head":
|
| 129 |
-
"DepRel":
|
| 130 |
-
"Deps":
|
| 131 |
-
"Misc":
|
| 132 |
})
|
|
|
|
| 133 |
|
| 134 |
-
return pd.DataFrame(records, columns=[
|
| 135 |
-
"Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"
|
| 136 |
-
])
|
| 137 |
|
|
|
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
def create_dependency_visualization(conllu: str) -> str:
|
| 142 |
"""
|
| 143 |
-
|
| 144 |
-
- blank line
|
| 145 |
-
-
|
| 146 |
-
- parse lines
|
| 147 |
"""
|
| 148 |
-
|
| 149 |
-
|
|
|
|
| 150 |
first = True
|
| 151 |
-
|
| 152 |
-
for
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
#
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
w
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
out.append(f"{w} ({upos}) --{deprel}--> ROOT")
|
| 180 |
-
|
| 181 |
-
return "\n".join(out)
|
| 182 |
|
| 183 |
|
| 184 |
-
# 5. SVG BUILDER (unchanged)
|
| 185 |
|
| 186 |
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
# … your SVG builder from the attached file …
|
| 192 |
-
return "<svg><!-- your SVG here --></svg>"
|
| 193 |
|
| 194 |
|
| 195 |
-
# 6. PROCESS
|
| 196 |
|
| 197 |
def process_text(text, variant):
|
| 198 |
if not text.strip():
|
|
@@ -209,24 +192,23 @@ def process_text(text, variant):
|
|
| 209 |
[], "", pd.DataFrame(), ""
|
| 210 |
)
|
| 211 |
|
| 212 |
-
# parse
|
| 213 |
doc = pipe(text)
|
| 214 |
conllu = stanza_doc_to_conllu(doc)
|
| 215 |
|
| 216 |
-
#
|
| 217 |
-
|
| 218 |
|
| 219 |
-
# text-based
|
| 220 |
-
|
| 221 |
|
| 222 |
-
# prepare
|
| 223 |
sentences = []
|
| 224 |
for sent in doc.sentences:
|
| 225 |
payload = [{
|
| 226 |
'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
|
| 227 |
'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
|
| 228 |
'FEATS': w.feats or "_", 'HEAD': w.head or 0,
|
| 229 |
-
'DEPREL': w.deprel or "_
|
| 230 |
} for w in sent.words]
|
| 231 |
sentences.append(payload)
|
| 232 |
|
|
@@ -239,24 +221,23 @@ def process_text(text, variant):
|
|
| 239 |
dd_upd,
|
| 240 |
sentences,
|
| 241 |
conllu,
|
| 242 |
-
|
| 243 |
-
|
| 244 |
)
|
| 245 |
|
| 246 |
def update_svg(selected_id, sentences):
|
| 247 |
try:
|
| 248 |
-
idx = int(selected_id)-1
|
| 249 |
return create_single_sentence_svg(sentences[idx])
|
| 250 |
except:
|
| 251 |
return "<p>Invalid selection</p>"
|
| 252 |
|
| 253 |
|
| 254 |
-
# 7. GRADIO UI
|
| 255 |
|
| 256 |
def create_app():
|
| 257 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
| 258 |
gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
|
| 259 |
-
|
| 260 |
if not loaded:
|
| 261 |
gr.Markdown(f"❌ Load error: {load_status}")
|
| 262 |
|
|
@@ -283,7 +264,8 @@ def create_app():
|
|
| 283 |
btn.click(
|
| 284 |
fn=process_text,
|
| 285 |
inputs=[txt, mdl],
|
| 286 |
-
outputs=[svg_out, sentence_dd, sentences_st,
|
|
|
|
| 287 |
)
|
| 288 |
sentence_dd.change(
|
| 289 |
fn=update_svg,
|
|
|
|
| 6 |
import traceback
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
+
# ─── 1. MODEL VARIANTS & INITIALIZATION ──────────────────────────────────
|
| 10 |
|
| 11 |
LESBIAN_MODELS = {}
|
| 12 |
MODEL_VARIANTS = {
|
|
|
|
| 28 |
|
| 29 |
def initialize_models():
|
| 30 |
try:
|
| 31 |
+
base = Path("./models")
|
| 32 |
+
base.mkdir(exist_ok=True)
|
| 33 |
for name, repo in MODEL_VARIANTS.items():
|
| 34 |
+
out = base/name
|
| 35 |
out.mkdir(exist_ok=True)
|
| 36 |
files = {
|
| 37 |
+
"tokenizer.pt": f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
|
| 38 |
+
"lemmatizer.pt": f"https://huggingface.co/{repo}/resolve/main/lemmatizer.pt",
|
| 39 |
+
"pos.pt": f"https://huggingface.co/{repo}/resolve/main/pos.pt",
|
| 40 |
+
"depparse.pt": f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
|
| 41 |
}
|
| 42 |
+
for fn, url in files.items():
|
| 43 |
tgt = out/fn
|
| 44 |
if not tgt.exists() and not download_model_file(url, str(tgt)):
|
| 45 |
return False, f"Failed to download {fn} for {name}"
|
|
|
|
| 60 |
loaded, load_status = initialize_models()
|
| 61 |
|
| 62 |
|
| 63 |
+
# ─── 2. CoNLL-U STRINGIZER ───────────────────────────────────────────────
|
| 64 |
|
| 65 |
def stanza_doc_to_conllu(doc) -> str:
|
| 66 |
lines = []
|
| 67 |
+
for sid, sent in enumerate(doc.sentences, 1):
|
| 68 |
lines.append(f"# sent_id = {sid}")
|
| 69 |
lines.append(f"# text = {sent.text}")
|
| 70 |
for w in sent.words:
|
|
|
|
| 73 |
w.lemma or "_", w.upos or "_",
|
| 74 |
w.xpos or "_", w.feats or "_",
|
| 75 |
str(w.head) if w.head is not None else "0",
|
| 76 |
+
w.deprel or "_", "_", "_"
|
| 77 |
]
|
| 78 |
lines.append("\t".join(fields))
|
| 79 |
lines.append("") # blank line after each sentence
|
| 80 |
return "\n".join(lines)
|
| 81 |
|
| 82 |
|
| 83 |
+
# ─── 3. TOKEN TABLE with comment-rows ────────────────────────────────────
|
| 84 |
|
| 85 |
+
def conllu_to_dataframe_table(conllu: str) -> pd.DataFrame:
|
| 86 |
"""
|
| 87 |
+
Insert:
|
| 88 |
+
- blank row (except first)
|
| 89 |
+
- # sent_id = β¦
|
| 90 |
+
- # text = β¦
|
| 91 |
+
before each sentence's tokens.
|
| 92 |
"""
|
|
|
|
| 93 |
records = []
|
| 94 |
+
blocks = [b for b in conllu.split("\n\n") if b.strip()]
|
| 95 |
first = True
|
|
|
|
| 96 |
for block in blocks:
|
| 97 |
lines = block.splitlines()
|
| 98 |
+
sid_line, txt_line = lines[0], lines[1]
|
|
|
|
| 99 |
token_lines = lines[2:]
|
|
|
|
| 100 |
if not first:
|
| 101 |
# blank row
|
| 102 |
+
records.append({c: "" for c in ["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"]})
|
|
|
|
|
|
|
| 103 |
first = False
|
|
|
|
| 104 |
# comment rows
|
| 105 |
+
records.append({"Id": sid_line, "Form":"", "Lemma":"","UPosTag":"","XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
|
| 106 |
+
records.append({"Id": txt_line, "Form":"", "Lemma":"","UPosTag":"","XPosTag":"","Feats":"","Head":"","DepRel":"","Deps":"","Misc":""})
|
|
|
|
|
|
|
|
|
|
| 107 |
# token rows
|
| 108 |
for tl in token_lines:
|
| 109 |
parts = tl.split("\t")
|
| 110 |
if len(parts) < 10:
|
| 111 |
continue
|
| 112 |
records.append({
|
| 113 |
+
"Id": parts[0],
|
| 114 |
+
"Form": parts[1],
|
| 115 |
+
"Lemma": parts[2],
|
| 116 |
"UPosTag": parts[3],
|
| 117 |
"XPosTag": parts[4],
|
| 118 |
+
"Feats": parts[5],
|
| 119 |
+
"Head": parts[6],
|
| 120 |
+
"DepRel": parts[7],
|
| 121 |
+
"Deps": parts[8],
|
| 122 |
+
"Misc": parts[9]
|
| 123 |
})
|
| 124 |
+
return pd.DataFrame(records, columns=["Id","Form","Lemma","UPosTag","XPosTag","Feats","Head","DepRel","Deps","Misc"])
|
| 125 |
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
+
# ─── 4. TEXT-BASED DEPENDENCIES with blank + comments ────────────────────
|
| 128 |
|
| 129 |
+
def create_dependency_visualization(df_table: pd.DataFrame) -> str:
|
|
|
|
|
|
|
| 130 |
"""
|
| 131 |
+
Walk token-table rows, emitting:
|
| 132 |
+
- blank line + comment lines before each sentence
|
| 133 |
+
- dependency lines
|
|
|
|
| 134 |
"""
|
| 135 |
+
if df_table.empty:
|
| 136 |
+
return "No data to visualize"
|
| 137 |
+
lines = []
|
| 138 |
first = True
|
| 139 |
+
# We detect new sentence by comment rows in Id column starting with '# sent_id'
|
| 140 |
+
for row in df_table.itertuples(index=False):
|
| 141 |
+
if isinstance(row.Id, str) and row.Id.startswith("# sent_id"):
|
| 142 |
+
if not first:
|
| 143 |
+
lines.append("") # blank separator
|
| 144 |
+
first = False
|
| 145 |
+
# emit sent_id and text
|
| 146 |
+
lines.append(row.Id)
|
| 147 |
+
# next row in table is "# text = β¦"
|
| 148 |
+
continue
|
| 149 |
+
if isinstance(row.Id, str) and row.Id.startswith("# text"):
|
| 150 |
+
lines.append(row.Id)
|
| 151 |
+
continue
|
| 152 |
+
# skip blank/comment rows
|
| 153 |
+
if not row.Id.isdigit():
|
| 154 |
+
continue
|
| 155 |
+
# actual token row
|
| 156 |
+
w, p, d, h = row.Form, row.UPosTag, row.DepRel, row.Head
|
| 157 |
+
if h != "0":
|
| 158 |
+
# find head form
|
| 159 |
+
try:
|
| 160 |
+
hw = df_table[df_table.Id == h].iloc[0].Form
|
| 161 |
+
except:
|
| 162 |
+
hw = "[ERR]"
|
| 163 |
+
lines.append(f"{w} ({p}) --{d}--> {hw}")
|
| 164 |
+
else:
|
| 165 |
+
lines.append(f"{w} ({p}) --{d}--> ROOT")
|
| 166 |
+
return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
|
| 169 |
+
# ─── 5. SVG BUILDER (unchanged) ──────────────────────────────────────────
|
| 170 |
|
| 171 |
def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1):
|
| 172 |
+
# Paste your entire original SVG-generation code here unchanged
|
| 173 |
+
# It must accept sentence_data: list of dicts and return SVG string.
|
| 174 |
+
svg = "<svg><!-- your SVG here --></svg>"
|
| 175 |
+
return svg
|
|
|
|
|
|
|
| 176 |
|
| 177 |
|
| 178 |
+
# ─── 6. PROCESS & DROPDOWN LOGIC ─────────────────────────────────────────
|
| 179 |
|
| 180 |
def process_text(text, variant):
|
| 181 |
if not text.strip():
|
|
|
|
| 192 |
[], "", pd.DataFrame(), ""
|
| 193 |
)
|
| 194 |
|
|
|
|
| 195 |
doc = pipe(text)
|
| 196 |
conllu = stanza_doc_to_conllu(doc)
|
| 197 |
|
| 198 |
+
# token table with comments
|
| 199 |
+
df_table = conllu_to_dataframe_table(conllu)
|
| 200 |
|
| 201 |
+
# text-based dependencies
|
| 202 |
+
text_viz = create_dependency_visualization(df_table)
|
| 203 |
|
| 204 |
+
# prepare for SVG dropdown
|
| 205 |
sentences = []
|
| 206 |
for sent in doc.sentences:
|
| 207 |
payload = [{
|
| 208 |
'ID': w.id, 'FORM': w.text, 'LEMMA': w.lemma or "_",
|
| 209 |
'UPOS': w.upos or "_", 'XPOS': w.xpos or "_",
|
| 210 |
'FEATS': w.feats or "_", 'HEAD': w.head or 0,
|
| 211 |
+
'DEPREL': w.deprel or "_"
|
| 212 |
} for w in sent.words]
|
| 213 |
sentences.append(payload)
|
| 214 |
|
|
|
|
| 221 |
dd_upd,
|
| 222 |
sentences,
|
| 223 |
conllu,
|
| 224 |
+
df_table,
|
| 225 |
+
text_viz
|
| 226 |
)
|
| 227 |
|
| 228 |
def update_svg(selected_id, sentences):
|
| 229 |
try:
|
| 230 |
+
idx = int(selected_id) - 1
|
| 231 |
return create_single_sentence_svg(sentences[idx])
|
| 232 |
except:
|
| 233 |
return "<p>Invalid selection</p>"
|
| 234 |
|
| 235 |
|
| 236 |
+
# ─── 7. GRADIO UI ────────────────────────────────────────────────────────
|
| 237 |
|
| 238 |
def create_app():
|
| 239 |
with gr.Blocks(title="Lesbian Greek Parser") as app:
|
| 240 |
gr.Markdown("# Lesbian Greek Morphosyntactic Parser")
|
|
|
|
| 241 |
if not loaded:
|
| 242 |
gr.Markdown(f"❌ Load error: {load_status}")
|
| 243 |
|
|
|
|
| 264 |
btn.click(
|
| 265 |
fn=process_text,
|
| 266 |
inputs=[txt, mdl],
|
| 267 |
+
outputs=[svg_out, sentence_dd, sentences_st,
|
| 268 |
+
conllu_out, table_out, text_out]
|
| 269 |
)
|
| 270 |
sentence_dd.change(
|
| 271 |
fn=update_svg,
|