Spaces:

Setur
/

Marka

Running

App Files Community

unijoh committed on Jan 21

Commit

2d19454

verified ·

1 Parent(s): 2ad5b1a

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -20

app.py CHANGED Viewed

@@ -7,6 +7,16 @@ import numpy as np
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 # ----------------------------
 # Config
 # ----------------------------
@@ -15,6 +25,8 @@ TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
 LABELS_FILEPATH = "tag_labels.json"
 HF_TOKEN = os.getenv("BRAGD")
 if not HF_TOKEN:
     raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
 if not os.path.exists(LABELS_FILEPATH):
@@ -63,7 +75,7 @@ CSS = """
   color:#0b1b19 !important;
 }
-/* Dark mode: make the INACTIVE buttons match what you had before (darker, readable) */
 @media (prefers-color-scheme: dark){
   #lang_fo_off, #lang_en_off{
     background:#2a3b38 !important;
@@ -77,7 +89,7 @@ CSS = """
   }
 }
-/* Minimal layout so the language buttons stay hard-right like before */
 #results_hdr{
   display:flex !important;
   align-items:center !important;
@@ -96,7 +108,8 @@ CSS = """
   min-width:120px !important;
   flex:0 0 auto !important;
 }
-/* Remove the big Gradio panel/frame around the textbox (keep textarea normal) */
 #input_col,
 #input_col > div,
 #input_col .gr-block,
@@ -111,11 +124,53 @@ CSS = """
 """
 # ----------------------------
-# Tokenization
 # ----------------------------
 def simp_tok(sentence: str):
     return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
 # ----------------------------
 # CSV mapping
 # ----------------------------
@@ -249,10 +304,9 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
     tag = vector_to_tag(vec)
     wc = wc_code(vec)
-    # --- ADDED: compute mood_code and skip flag for infinitive/imperative verbs ---
     mood_code = group_code(vec, "mood") if wc == "V" else ""
     skip_empty_verb_feats = (wc == "V" and mood_code in {"I", "M"})  # navnháttur or boðsháttur
-    # --- end added ---
     if tag == "DGd":
         return "fyriseting" if lang=="fo" else "preposition"
@@ -280,15 +334,14 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
         if not c:
             continue
-        # --- ADDED: skip only the generic "no" codes for verbs in infinitive/imperative ---
         if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
             continue
-        # --- end added ---
         if wc in {"P","C"} and g == "subcategory":
             continue
         if (wc, g, c) in HIDE_IN_ANALYSIS:
             continue
         lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
         if lbl and lbl not in parts:
             parts.append(lbl)
@@ -358,15 +411,29 @@ def build_overview(lang: str) -> str:
         lines.append("")
     return "\n".join(lines).strip()
 def run_model(sentence: str):
     s = (sentence or "").strip()
     if not s:
         return []
     tokens = simp_tok(s)
     if not tokens:
         return []
-    enc = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=128,
-                    padding="max_length", truncation=True, return_attention_mask=True, return_tensors="pt")
     input_ids = enc["input_ids"].to(device)
     attention_mask = enc["attention_mask"].to(device)
     word_ids = enc.word_ids(batch_index=0)
@@ -388,15 +455,25 @@ def run_model(sentence: str):
     rows, vec_i, seen = [], 0, set()
     for i,wid in enumerate(word_ids):
-        if wid is None or begin[i]!=1 or wid in seen:
             continue
         seen.add(wid)
         word = tokens[wid] if wid < len(tokens) else "<UNK>"
         vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
         rows.append({"word": word, "vec": vec.int().tolist()})
         vec_i += 1
     return rows
 def render(rows_state, lang: str):
     lang = "fo" if lang=="fo" else "en"
     df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
@@ -411,6 +488,9 @@ def render(rows_state, lang: str):
         out_mean.append([r["word"], tag, expanded_text(vec, lang)])
     return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
 with gr.Blocks(css=CSS, title="Marka") as demo:
     with gr.Row(equal_height=True):
         with gr.Column(scale=2, elem_id="input_col"):
@@ -429,6 +509,7 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
     results_hdr = gr.Row(elem_id="results_hdr", visible=True)
     with results_hdr:
         results_title = gr.Markdown("### Úrslit / Results")
         with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
             btn_lang_fo_on  = gr.Button("Føroyskt", variant="primary",   elem_id="lang_fo_on",  visible=False)
             btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
@@ -454,8 +535,8 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
     with overview_acc:
         overview_md = gr.Markdown(build_overview("fo"))
-    def on_tag(sentence, lang_current):
-        rows = run_model(sentence)
         df_main, df_mean, overview = render(rows, lang_current)
         show_fo = (lang_current == "fo")
@@ -466,11 +547,11 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
             gr.update(value=df_main, visible=True),
             gr.update(value=df_mean),
             gr.update(value=overview),
-            gr.update(visible=True),   # expanded_acc
-            gr.update(visible=show_fo),
-            gr.update(visible=not show_fo),
-            gr.update(visible=show_en),
-            gr.update(visible=not show_en),
             lang_current,
         )
@@ -500,11 +581,14 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
     btn.click(
         on_tag,
         inputs=[inp, lang_state],
-        outputs=[state, out_df, out_mean_df, overview_md, expanded_acc,
-                 btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state],
         queue=False,
     )
     btn_lang_fo_on.click(
         on_set_fo,
         inputs=[state],

 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForTokenClassification
+# --- FO-Tokenizer (sentence splitting) ---
+try:
+    import fotokenizer
+    from fotokenizer import tokenize, TOK
+except Exception as e:
+    raise RuntimeError(
+        "fotokenizer is not installed. Add it to requirements.txt (see below). "
+        f"Original error: {e}"
+    )
 # ----------------------------
 # Config
 # ----------------------------
 LABELS_FILEPATH = "tag_labels.json"
 HF_TOKEN = os.getenv("BRAGD")
+MAX_LENGTH = 256  # <-- changed from 128 to 256
 if not HF_TOKEN:
     raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
 if not os.path.exists(LABELS_FILEPATH):
   color:#0b1b19 !important;
 }
+/* Dark mode: make the INACTIVE buttons darker but readable */
 @media (prefers-color-scheme: dark){
   #lang_fo_off, #lang_en_off{
     background:#2a3b38 !important;
   }
 }
+/* Minimal layout so the language buttons stay hard-right */
 #results_hdr{
   display:flex !important;
   align-items:center !important;
   min-width:120px !important;
   flex:0 0 auto !important;
 }
+/* Remove the big Gradio panel/frame around the textbox column (keep textarea normal) */
 #input_col,
 #input_col > div,
 #input_col .gr-block,
 """
 # ----------------------------
+# Tokenization helpers
 # ----------------------------
 def simp_tok(sentence: str):
+    # simple word/punct split; whitespace ignored
     return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
+def normalize_token_text(s: str) -> str:
+    # Collapse each run of CR/LF characters into a single space so the text stays on one line
+    return re.sub(r"[\r\n]+", " ", s or "")
+def split_sentences_fotokenizer(text: str):
+    """
+    Uses fotokenizer BEGIN_SENT / END_SENT markers to split into sentence strings.
+    """
+    text = text or ""
+    sentences = []
+    buf = []
+    toks = tokenize(text)
+    for t in toks:
+        if not getattr(t, "txt", ""):
+            # marker tokens: use TOK.descr[t.kind]
+            kind = TOK.descr[t.kind].replace(" ", "_")
+            if kind == "BEGIN_SENT":
+                # start a new sentence buffer
+                buf = []
+            elif kind == "END_SENT":
+                s = "".join(buf).strip()
+                if s:
+                    sentences.append(s)
+                buf = []
+            continue
+        buf.append(normalize_token_text(t.txt))
+    # flush tail if tokenizer didn't end with END_SENT
+    tail = "".join(buf).strip()
+    if tail:
+        sentences.append(tail)
+    # If for some reason no markers exist, fall back to whole text
+    if not sentences and text.strip():
+        sentences = [text.strip()]
+    return sentences
 # ----------------------------
 # CSV mapping
 # ----------------------------
     tag = vector_to_tag(vec)
     wc = wc_code(vec)
+    # Skip listing "no number/tense/person" for infinitive/imperative verbs
     mood_code = group_code(vec, "mood") if wc == "V" else ""
     skip_empty_verb_feats = (wc == "V" and mood_code in {"I", "M"})  # navnháttur or boðsháttur
     if tag == "DGd":
         return "fyriseting" if lang=="fo" else "preposition"
         if not c:
             continue
         if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
             continue
         if wc in {"P","C"} and g == "subcategory":
             continue
         if (wc, g, c) in HIDE_IN_ANALYSIS:
             continue
         lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
         if lbl and lbl not in parts:
             parts.append(lbl)
         lines.append("")
     return "\n".join(lines).strip()
+# ----------------------------
+# Model inference (single sentence)
+# ----------------------------
 def run_model(sentence: str):
     s = (sentence or "").strip()
     if not s:
         return []
     tokens = simp_tok(s)
     if not tokens:
         return []
+    enc = tokenizer(
+        tokens,
+        is_split_into_words=True,
+        add_special_tokens=True,
+        max_length=MAX_LENGTH,
+        padding="max_length",
+        truncation=True,
+        return_attention_mask=True,
+        return_tensors="pt"
+    )
     input_ids = enc["input_ids"].to(device)
     attention_mask = enc["attention_mask"].to(device)
     word_ids = enc.word_ids(batch_index=0)
     rows, vec_i, seen = [], 0, set()
     for i,wid in enumerate(word_ids):
+        if wid is None or begin[i] != 1 or wid in seen:
             continue
         seen.add(wid)
         word = tokens[wid] if wid < len(tokens) else "<UNK>"
         vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
         rows.append({"word": word, "vec": vec.int().tolist()})
         vec_i += 1
     return rows
+# ----------------------------
+# Model inference (multi-sentence via fotokenizer)
+# ----------------------------
+def run_model_multisentence(text: str):
+    all_rows = []
+    for sent in split_sentences_fotokenizer(text):
+        all_rows.extend(run_model(sent))
+    return all_rows
 def render(rows_state, lang: str):
     lang = "fo" if lang=="fo" else "en"
     df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
         out_mean.append([r["word"], tag, expanded_text(vec, lang)])
     return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
+# ----------------------------
+# UI
+# ----------------------------
 with gr.Blocks(css=CSS, title="Marka") as demo:
     with gr.Row(equal_height=True):
         with gr.Column(scale=2, elem_id="input_col"):
     results_hdr = gr.Row(elem_id="results_hdr", visible=True)
     with results_hdr:
         results_title = gr.Markdown("### Úrslit / Results")
+        # IMPORTANT: keep row always present; hide/show buttons only (prevents duplication)
         with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
             btn_lang_fo_on  = gr.Button("Føroyskt", variant="primary",   elem_id="lang_fo_on",  visible=False)
             btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
     with overview_acc:
         overview_md = gr.Markdown(build_overview("fo"))
+    def on_tag(text, lang_current):
+        rows = run_model_multisentence(text)
         df_main, df_mean, overview = render(rows, lang_current)
         show_fo = (lang_current == "fo")
             gr.update(value=df_main, visible=True),
             gr.update(value=df_mean),
             gr.update(value=overview),
+            gr.update(visible=True),           # expanded_acc
+            gr.update(visible=show_fo),        # fo_on
+            gr.update(visible=not show_fo),    # fo_off
+            gr.update(visible=show_en),        # en_on
+            gr.update(visible=not show_en),    # en_off
             lang_current,
         )
     btn.click(
         on_tag,
         inputs=[inp, lang_state],
+        outputs=[
+            state, out_df, out_mean_df, overview_md, expanded_acc,
+            btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state
+        ],
         queue=False,
     )
+    # Language switch: re-render existing rows (does NOT rerun the model)
     btn_lang_fo_on.click(
         on_set_fo,
         inputs=[state],