Spaces:

sbompolas
/

Lesbian-morphosyntactic-parsing

Sleeping

App Files Community

sbompolas committed on Jul 1, 2025

Commit

4670ce3

verified ·

1 Parent(s): 08f1dd3

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -65

app.py CHANGED Viewed

@@ -31,7 +31,7 @@ def initialize_models():
         base = Path("./models")
         base.mkdir(exist_ok=True)
         for name, repo in MODEL_VARIANTS.items():
-            out = base/name
             out.mkdir(exist_ok=True)
             files = {
                 "tokenizer.pt":  f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
@@ -40,7 +40,7 @@ def initialize_models():
                 "depparse.pt":   f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
             }
             for fn, url in files.items():
-                tgt = out/fn
                 if not tgt.exists() and not download_model_file(url, str(tgt)):
                     return False, f"Failed to download {fn} for {name}"
             cfg = {
@@ -78,29 +78,26 @@ def stanza_doc_to_conllu(doc) -> str:
                 w.deprel or "_", "_", "_"
             ]
             lines.append("\t".join(fields))
-        lines.append("")
     return "\n".join(lines)
 def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
     rows = []
     for line in conllu.splitlines():
-        # empty line → separator between sentences
         if not line:
-            if rows and rows[-1] != {}:
-                rows.append({})
             continue
-        # comment line → sentence header
         if line.startswith("#"):
-            # ex: "# sent_id = 2"
             if "=" in line:
                 key, val = line[2:].split("=", 1)
-                rows.append({
-                    'ID':   f"# {key.strip()} =",
-                    'FORM': val.strip()
-                })
             continue
         parts = line.split("\t")
         if len(parts) >= 10:
             rows.append({
@@ -115,25 +112,22 @@ def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
                 'DEPS':   parts[8],
                 'MISC':   parts[9]
             })
-    return pd.DataFrame(rows)
 def create_dependency_visualization(df: pd.DataFrame) -> str:
     if df.empty:
         return "No data to visualize"
     viz = []
     for _, row in df.iterrows():
-        rid = row.get("ID")
-        # skip outright blank-rows skeleton
-        if pd.isna(rid):
-            continue
-        # sentence header
         if isinstance(rid, str) and rid.startswith("#"):
-            if viz:
-                viz.append("")  # blank line before new sentence
-            viz.append(f"{rid} {row.get('FORM')}")
             continue
         w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
         if h != '0':
             try:
@@ -161,10 +155,8 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
         width = max(800, word_count * word_spacing + 100)
         height = 500
         word_y = height - 120
-        pos_y = word_y + 20
-        features_start_y = pos_y + 15
         deprel_colors = {
             'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
@@ -189,24 +181,22 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
             )
         svg.append('</defs><g>')
-        # calculate x positions
-        word_positions = {}
-        for idx, row in df.iterrows():
-            wid = int(row['ID'])
-            word_positions[wid] = 50 + (wid - 1) * word_spacing
         # draw arcs/lines
         used_spans = []
         for _, row in df.iterrows():
-            wid = int(row['ID'])
-            hid = int(row['HEAD']) if row['HEAD'] != '0' else 0
             rel = row['DEPREL']
             x1 = word_positions[wid]
             col = deprel_colors.get(rel, '#000')
             if hid == 0:
-                # root arrow
                 svg.append(
                     f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
                     f'stroke="{col}" stroke-width="1.5"/>'
@@ -233,17 +223,13 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
                             conflict = True
                             break
                 used_spans.append((span, lvl))
                 dist = abs(x2 - x1)
-                base_h = min(40 + dist * 0.15, 100)
-                arc_h = base_h + lvl * 35
-                midx = (x1 + x2) / 2
-                cty = word_y - arc_h
                 svg.append(
-                    f'<path d="M {x1} {word_y-15} Q {midx} {cty} {x2} {word_y-15}" '
                     f'stroke="{col}" stroke-width="1.5" fill="none" marker-end="url(#arrow_{rel})"/>'
                 )
-                # label box
                 amx = 0.25*x1 + 0.5*midx + 0.25*x2
                 amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
                 lw = len(rel)*6 + 8
@@ -258,31 +244,21 @@ def create_single_sentence_svg(sentence_data, sentence_num=1, total_sentences=1)
         # draw words + annotations
         for _, row in df.iterrows():
-            wid = int(row['ID'])
-            x = word_positions[wid]
-            word  = row['FORM']
-            pos   = row['UPOS']
-            lemma = row['LEMMA']
-            feats = row['FEATS']
-            xpos  = row['XPOS']
-            # word text
             svg.append(
                 f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                 'font-family="Arial" font-size="13" font-weight="bold">'
-                f'{word}</text>'
             )
-            # annotations underneath
             ann = []
-            if pos   and pos   != '_': ann.append(f"upos={pos}")
-            if lemma and lemma not in ('_', word): ann.append(f"lemma={lemma}")
-            if xpos  and xpos  != '_': ann.append(f"xpos={xpos}")
-            if feats and feats not in ('', '_'):
-                for fpair in feats.split('|'):
                     if '=' in fpair:
                         ann.append(fpair)
             for i, a in enumerate(ann):
                 y0 = features_start_y + i*12
                 svg.append(
@@ -315,7 +291,7 @@ def process_text(text, variant):
             [], "", pd.DataFrame(), ""
         )
-    doc = pipe(text)
     conllu = stanza_doc_to_conllu(doc)
     df     = conllu_to_dataframe(conllu)
     text_v = create_dependency_visualization(df)
@@ -394,7 +370,7 @@ def create_app():
                 table_out  = gr.Dataframe(label="Token Table")
                 text_out   = gr.Textbox(
                     label="Text-based Dependencies",
-                    lines=8,
                     show_copy_button=True
                 )

         base = Path("./models")
         base.mkdir(exist_ok=True)
         for name, repo in MODEL_VARIANTS.items():
+            out = base / name
             out.mkdir(exist_ok=True)
             files = {
                 "tokenizer.pt":  f"https://huggingface.co/{repo}/resolve/main/tokenizer.pt",
                 "depparse.pt":   f"https://huggingface.co/{repo}/resolve/main/depparse.pt",
             }
             for fn, url in files.items():
+                tgt = out / fn
                 if not tgt.exists() and not download_model_file(url, str(tgt)):
                     return False, f"Failed to download {fn} for {name}"
             cfg = {
                 w.deprel or "_", "_", "_"
             ]
             lines.append("\t".join(fields))
+        lines.append("")  # blank line after each sentence
     return "\n".join(lines)
 def conllu_to_dataframe(conllu: str) -> pd.DataFrame:
+    cols = ['ID','FORM','LEMMA','UPOS','XPOS','FEATS','HEAD','DEPREL','DEPS','MISC']
     rows = []
     for line in conllu.splitlines():
         if not line:
+            # empty row between sentences
+            rows.append({c: "" for c in cols})
             continue
         if line.startswith("#"):
             if "=" in line:
                 key, val = line[2:].split("=", 1)
+                key, val = key.strip(), val.strip()
+                if key == "sent_id":
+                    rows.append({'ID': f"# sent_id = {val}", 'FORM': ""})
+                elif key == "text":
+                    rows.append({'ID': f"# text = {val}", 'FORM': ""})
             continue
         parts = line.split("\t")
         if len(parts) >= 10:
             rows.append({
                 'DEPS':   parts[8],
                 'MISC':   parts[9]
             })
+    df = pd.DataFrame(rows, columns=cols)
+    return df.fillna("")  # replace NaN with empty strings
 def create_dependency_visualization(df: pd.DataFrame) -> str:
     if df.empty:
         return "No data to visualize"
     viz = []
     for _, row in df.iterrows():
+        rid = row['ID']
+        # sentence header rows
         if isinstance(rid, str) and rid.startswith("#"):
+            # only before each new sent_id do we add a blank line
+            if rid.startswith("# sent_id") and viz:
+                viz.append("")
+            viz.append(rid)
             continue
         w, p, d, h = row['FORM'], row['UPOS'], row['DEPREL'], row['HEAD']
         if h != '0':
             try:
         width = max(800, word_count * word_spacing + 100)
         height = 500
         word_y = height - 120
+        features_start_y = word_y + 35  # space for UPOS + lemma
         deprel_colors = {
             'root': '#000000', 'nsubj': '#2980b9', 'obj': '#27ae60', 'det': '#e67e22',
             )
         svg.append('</defs><g>')
+        # x positions
+        word_positions = {
+            int(r['ID']): 50 + (int(r['ID']) - 1) * word_spacing
+            for _, r in df.iterrows() if str(r['ID']).isdigit()
+        }
         # draw arcs/lines
         used_spans = []
         for _, row in df.iterrows():
+            if not str(row['ID']).isdigit():
+                continue
+            wid, hid = int(row['ID']), int(row['HEAD'])
             rel = row['DEPREL']
             x1 = word_positions[wid]
             col = deprel_colors.get(rel, '#000')
             if hid == 0:
                 svg.append(
                     f'<line x1="{x1}" y1="{word_y-15}" x2="{x1}" y2="50" '
                     f'stroke="{col}" stroke-width="1.5"/>'
                             conflict = True
                             break
                 used_spans.append((span, lvl))
                 dist = abs(x2 - x1)
+                arc_h = min(40 + dist * 0.15, 100) + lvl * 35
+                midx, cty = (x1 + x2) / 2, word_y - arc_h
                 svg.append(
+                    f'<path d="M{x1} {word_y-15} Q{midx} {cty} {x2} {word_y-15}" '
                     f'stroke="{col}" stroke-width="1.5" fill="none" marker-end="url(#arrow_{rel})"/>'
                 )
                 amx = 0.25*x1 + 0.5*midx + 0.25*x2
                 amy = 0.25*(word_y-15) + 0.5*cty + 0.25*(word_y-15)
                 lw = len(rel)*6 + 8
         # draw words + annotations
         for _, row in df.iterrows():
+            if not str(row['ID']).isdigit():
+                continue
+            x = word_positions[int(row['ID'])]
             svg.append(
                 f'<text x="{x}" y="{word_y}" text-anchor="middle" '
                 'font-family="Arial" font-size="13" font-weight="bold">'
+                f'{row["FORM"]}</text>'
             )
             ann = []
+            if row['UPOS'] and row['UPOS'] != '_':       ann.append(f"upos={row['UPOS']}")
+            if row['LEMMA'] and row['LEMMA'] not in ('_', row['FORM']): ann.append(f"lemma={row['LEMMA']}")
+            if row['FEATS'] and row['FEATS'] not in ('', '_'):
+                for fpair in row['FEATS'].split('|'):
                     if '=' in fpair:
                         ann.append(fpair)
             for i, a in enumerate(ann):
                 y0 = features_start_y + i*12
                 svg.append(
             [], "", pd.DataFrame(), ""
         )
+    doc    = pipe(text)
     conllu = stanza_doc_to_conllu(doc)
     df     = conllu_to_dataframe(conllu)
     text_v = create_dependency_visualization(df)
                 table_out  = gr.Dataframe(label="Token Table")
                 text_out   = gr.Textbox(
                     label="Text-based Dependencies",
+                    lines=10,
                     show_copy_button=True
                 )