unijoh committed on
Commit
7c4ea3b
·
verified ·
1 Parent(s): 2d19454

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -92
app.py CHANGED
@@ -1,4 +1,6 @@
1
  import os, re, string, json
 
 
2
  from collections import defaultdict
3
 
4
  import gradio as gr
@@ -7,15 +9,44 @@ import numpy as np
7
  import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
10
- # --- FO-Tokenizer (sentence splitting) ---
 
 
 
11
  try:
12
- import fotokenizer
13
- from fotokenizer import tokenize, TOK
14
- except Exception as e:
15
- raise RuntimeError(
16
- "fotokenizer is not installed. Add it to requirements.txt (see below). "
17
- f"Original error: {e}"
18
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # ----------------------------
21
  # Config
@@ -25,7 +56,7 @@ TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
25
  LABELS_FILEPATH = "tag_labels.json"
26
  HF_TOKEN = os.getenv("BRAGD")
27
 
28
- MAX_LENGTH = 256 # <-- changed from 128 to 256
29
 
30
  if not HF_TOKEN:
31
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
@@ -75,7 +106,7 @@ CSS = """
75
  color:#0b1b19 !important;
76
  }
77
 
78
- /* Dark mode: make the INACTIVE buttons darker but readable */
79
  @media (prefers-color-scheme: dark){
80
  #lang_fo_off, #lang_en_off{
81
  background:#2a3b38 !important;
@@ -89,7 +120,7 @@ CSS = """
89
  }
90
  }
91
 
92
- /* Minimal layout so the language buttons stay hard-right */
93
  #results_hdr{
94
  display:flex !important;
95
  align-items:center !important;
@@ -108,8 +139,7 @@ CSS = """
108
  min-width:120px !important;
109
  flex:0 0 auto !important;
110
  }
111
-
112
- /* Remove the big Gradio panel/frame around the textbox column (keep textarea normal) */
113
  #input_col,
114
  #input_col > div,
115
  #input_col .gr-block,
@@ -124,52 +154,73 @@ CSS = """
124
  """
125
 
126
  # ----------------------------
127
- # Tokenization helpers
128
  # ----------------------------
129
  def simp_tok(sentence: str):
130
- # simple word/punct split; whitespace ignored
131
  return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
132
 
133
- def normalize_token_text(s: str) -> str:
134
- # normalize newlines to spaces (same spirit as your TEI script)
135
- return re.sub(r"[\r\n]+", " ", s or "")
136
 
137
- def split_sentences_fotokenizer(text: str):
138
- """
139
- Uses fotokenizer BEGIN_SENT / END_SENT markers to split into sentence strings.
 
 
 
 
140
  """
141
- text = text or ""
142
- sentences = []
143
- buf = []
144
-
145
- toks = tokenize(text)
146
-
147
- for t in toks:
148
- if not getattr(t, "txt", ""):
149
- # marker tokens: use TOK.descr[t.kind]
150
- kind = TOK.descr[t.kind].replace(" ", "_")
151
- if kind == "BEGIN_SENT":
152
- # start a new sentence buffer
153
- buf = []
154
- elif kind == "END_SENT":
155
- s = "".join(buf).strip()
156
- if s:
157
- sentences.append(s)
158
- buf = []
159
- continue
160
 
161
- buf.append(normalize_token_text(t.txt))
 
 
 
 
 
 
 
 
162
 
163
- # flush tail if tokenizer didn't end with END_SENT
164
- tail = "".join(buf).strip()
165
- if tail:
166
- sentences.append(tail)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- # If for some reason no markers exist, fall back to whole text
169
- if not sentences and text.strip():
170
- sentences = [text.strip()]
171
 
172
- return sentences
 
 
 
 
 
173
 
174
  # ----------------------------
175
  # CSV mapping
@@ -264,6 +315,16 @@ model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN
264
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
265
  model.to(device); model.eval()
266
 
 
 
 
 
 
 
 
 
 
 
267
  if hasattr(model, "config") and hasattr(model.config, "num_labels") and model.config.num_labels != VEC_LEN:
268
  raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
269
 
@@ -304,9 +365,10 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
304
  tag = vector_to_tag(vec)
305
  wc = wc_code(vec)
306
 
307
- # Skip listing "no number/tense/person" for infinitive/imperative verbs
308
  mood_code = group_code(vec, "mood") if wc == "V" else ""
309
  skip_empty_verb_feats = (wc == "V" and mood_code in {"I", "M"}) # navnháttur or boðsháttur
 
310
 
311
  if tag == "DGd":
312
  return "fyriseting" if lang=="fo" else "preposition"
@@ -334,14 +396,15 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
334
  if not c:
335
  continue
336
 
 
337
  if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
338
  continue
 
339
 
340
  if wc in {"P","C"} and g == "subcategory":
341
  continue
342
  if (wc, g, c) in HIDE_IN_ANALYSIS:
343
  continue
344
-
345
  lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
346
  if lbl and lbl not in parts:
347
  parts.append(lbl)
@@ -411,29 +474,15 @@ def build_overview(lang: str) -> str:
411
  lines.append("")
412
  return "\n".join(lines).strip()
413
 
414
- # ----------------------------
415
- # Model inference (single sentence)
416
- # ----------------------------
417
  def run_model(sentence: str):
418
  s = (sentence or "").strip()
419
  if not s:
420
  return []
421
-
422
  tokens = simp_tok(s)
423
  if not tokens:
424
  return []
425
-
426
- enc = tokenizer(
427
- tokens,
428
- is_split_into_words=True,
429
- add_special_tokens=True,
430
- max_length=MAX_LENGTH,
431
- padding="max_length",
432
- truncation=True,
433
- return_attention_mask=True,
434
- return_tensors="pt"
435
- )
436
-
437
  input_ids = enc["input_ids"].to(device)
438
  attention_mask = enc["attention_mask"].to(device)
439
  word_ids = enc.word_ids(batch_index=0)
@@ -455,25 +504,15 @@ def run_model(sentence: str):
455
 
456
  rows, vec_i, seen = [], 0, set()
457
  for i,wid in enumerate(word_ids):
458
- if wid is None or begin[i] != 1 or wid in seen:
459
  continue
460
  seen.add(wid)
461
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
462
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
463
  rows.append({"word": word, "vec": vec.int().tolist()})
464
  vec_i += 1
465
-
466
  return rows
467
 
468
- # ----------------------------
469
- # Model inference (multi-sentence via fotokenizer)
470
- # ----------------------------
471
- def run_model_multisentence(text: str):
472
- all_rows = []
473
- for sent in split_sentences_fotokenizer(text):
474
- all_rows.extend(run_model(sent))
475
- return all_rows
476
-
477
  def render(rows_state, lang: str):
478
  lang = "fo" if lang=="fo" else "en"
479
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
@@ -488,9 +527,6 @@ def render(rows_state, lang: str):
488
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
489
  return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
490
 
491
- # ----------------------------
492
- # UI
493
- # ----------------------------
494
  with gr.Blocks(css=CSS, title="Marka") as demo:
495
  with gr.Row(equal_height=True):
496
  with gr.Column(scale=2, elem_id="input_col"):
@@ -509,7 +545,6 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
509
  results_hdr = gr.Row(elem_id="results_hdr", visible=True)
510
  with results_hdr:
511
  results_title = gr.Markdown("### Úrslit / Results")
512
- # IMPORTANT: keep row always present; hide/show buttons only (prevents duplication)
513
  with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
514
  btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=False)
515
  btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
@@ -547,11 +582,11 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
547
  gr.update(value=df_main, visible=True),
548
  gr.update(value=df_mean),
549
  gr.update(value=overview),
550
- gr.update(visible=True), # expanded_acc
551
- gr.update(visible=show_fo), # fo_on
552
- gr.update(visible=not show_fo), # fo_off
553
- gr.update(visible=show_en), # en_on
554
- gr.update(visible=not show_en), # en_off
555
  lang_current,
556
  )
557
 
@@ -581,14 +616,11 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
581
  btn.click(
582
  on_tag,
583
  inputs=[inp, lang_state],
584
- outputs=[
585
- state, out_df, out_mean_df, overview_md, expanded_acc,
586
- btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state
587
- ],
588
  queue=False,
589
  )
590
 
591
- # Language switch: re-render existing rows (does NOT rerun the model)
592
  btn_lang_fo_on.click(
593
  on_set_fo,
594
  inputs=[state],
 
1
  import os, re, string, json
2
+ import inspect
3
+ import importlib.resources as importlib_resources
4
  from collections import defaultdict
5
 
6
  import gradio as gr
 
9
  import pandas as pd
10
  from transformers import AutoTokenizer, AutoModelForTokenClassification
11
 
12
+ # ----------------------------
13
+ # Optional: FO-Tokenizer (fotokenizer) for sentence splitting
14
+ # ----------------------------
15
+ _HAS_FOTOKENIZER = False
16
  try:
17
+ import fotokenizer # noqa: F401
18
+ from fotokenizer import tokenize as fo_tokenize
19
+ from fotokenizer import TOK as FO_TOK
20
+ import fotokenizer.abbrev as fo_abbrev
21
+ _HAS_FOTOKENIZER = True
22
+ except Exception:
23
+ _HAS_FOTOKENIZER = False
24
+
25
def _patch_fotokenizer_for_py313() -> None:
    """Make fotokenizer's resource loading work on Python 3.13.

    fotokenizer calls importlib.resources.open_text(package=..., resource=...);
    Python 3.13's open_text no longer accepts the `package=` keyword, so this
    installs a small compatibility wrapper into fotokenizer.abbrev's namespace.
    No-op when fotokenizer is absent or when open_text still takes `package=`.
    """
    if not _HAS_FOTOKENIZER:
        return
    try:
        params = inspect.signature(importlib_resources.open_text).parameters
        if "package" in params:
            # Legacy keyword form still supported; nothing to patch.
            return

        def _open_text_compat(*args, **kwargs):
            # Translate the removed keyword form into positional arguments.
            if "package" in kwargs:
                pkg = kwargs.pop("package")
                res = kwargs.pop("resource")
                encoding = kwargs.pop("encoding", "utf-8")
                errors = kwargs.pop("errors", "strict")
                return importlib_resources.open_text(pkg, res, encoding=encoding, errors=errors)
            return importlib_resources.open_text(*args, **kwargs)

        # Patch the reference fotokenizer.abbrev imported into its own namespace.
        # NOTE(review): only fotokenizer.abbrev is patched — confirm no other
        # fotokenizer module imports open_text directly.
        fo_abbrev.open_text = _open_text_compat  # type: ignore[attr-defined]
    except Exception:
        # Best-effort: on failure we simply fall back to naive splitting later.
        pass
48
+
49
+ _patch_fotokenizer_for_py313()
50
 
51
  # ----------------------------
52
  # Config
 
56
  LABELS_FILEPATH = "tag_labels.json"
57
  HF_TOKEN = os.getenv("BRAGD")
58
 
59
+ TARGET_MAX_TOKENS = 256 # We will cap this to the model's max if needed.
60
 
61
  if not HF_TOKEN:
62
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
 
106
  color:#0b1b19 !important;
107
  }
108
 
109
+ /* Dark mode: make the INACTIVE buttons match what you had before (darker, readable) */
110
  @media (prefers-color-scheme: dark){
111
  #lang_fo_off, #lang_en_off{
112
  background:#2a3b38 !important;
 
120
  }
121
  }
122
 
123
+ /* Minimal layout so the language buttons stay hard-right like before */
124
  #results_hdr{
125
  display:flex !important;
126
  align-items:center !important;
 
139
  min-width:120px !important;
140
  flex:0 0 auto !important;
141
  }
142
+ /* Remove the big Gradio panel/frame around the textbox (keep textarea normal) */
 
143
  #input_col,
144
  #input_col > div,
145
  #input_col .gr-block,
 
154
  """
155
 
156
  # ----------------------------
157
+ # Tokenization
158
  # ----------------------------
159
def simp_tok(sentence: str):
    """Split *sentence* into word tokens and single punctuation characters.

    Whitespace is discarded; each punctuation mark becomes its own token.
    """
    pattern = r"\w+|[" + re.escape(string.punctuation) + "]"
    return re.findall(pattern, sentence)
161
 
 
 
 
162
 
163
+ # ----------------------------
164
+ # Sentence splitting
165
+ # ----------------------------
166
def split_sentences(text: str):
    """Break *text* into sentence strings.

    Prefers FO-Tokenizer (its BEGIN_SENT / END_SENT marker tokens) when the
    package imported successfully; otherwise falls back to a naive regex
    split on sentence-final punctuation.
    """
    s = (text or "").strip()
    if not s:
        return []

    if _HAS_FOTOKENIZER:
        try:
            sents, cur = [], []

            def _flush():
                # Emit the buffered sentence (if non-empty) and reset the buffer.
                joined = "".join(cur).strip()
                if joined:
                    sents.append(joined)
                cur.clear()

            for tok in fo_tokenize(s):
                if tok.txt:
                    # Real token text; fold newlines into spaces.
                    # NOTE(review): tokens are joined without separators — this
                    # assumes tok.txt carries its own spacing; confirm against
                    # fotokenizer output.
                    cur.append(re.sub(r"[\r\n]+", " ", tok.txt))
                    continue

                # Descriptor-only token (e.g. sentence boundary markers).
                descr = FO_TOK.descr.get(tok.kind, "").replace(" ", "_")
                if descr == "BEGIN_SENT":
                    if cur:
                        _flush()
                elif descr == "END_SENT":
                    _flush()
                # Any other descriptor-only token is ignored.

            # Flush a trailing sentence that never saw an END_SENT marker.
            if cur:
                _flush()

            # No markers seen at all -> treat the whole input as one sentence.
            return sents or [s]
        except Exception:
            # Fall through to the regex splitter below.
            pass

    # Fallback: split after end punctuation followed by whitespace.
    chunks = re.split(r"(?<=[.!?])\s+", s)
    return [c.strip() for c in chunks if c.strip()]
216
 
 
 
 
217
 
218
def run_model_multisentence(text: str):
    """Tag *text* sentence-by-sentence and return the concatenated row list."""
    return [row for sent in split_sentences(text) for row in run_model(sent)]
224
 
225
  # ----------------------------
226
  # CSV mapping
 
315
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
316
  model.to(device); model.eval()
317
 
318
# Effective sequence length: start from the requested target, then cap it by
# any sane limit the model config or tokenizer advertises.
MAX_TOKENS = int(TARGET_MAX_TOKENS)
_model_max = getattr(getattr(model, "config", None), "max_position_embeddings", None)
_tok_max = getattr(tokenizer, "model_max_length", None)

# Tokenizers without a real limit advertise a huge placeholder (e.g. 1e30);
# only honour limits that look like genuine positive sequence lengths.
for _limit in (_model_max, _tok_max):
    if isinstance(_limit, int) and 0 < _limit < 100000:
        MAX_TOKENS = min(MAX_TOKENS, _limit)
327
+
328
  if hasattr(model, "config") and hasattr(model.config, "num_labels") and model.config.num_labels != VEC_LEN:
329
  raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
330
 
 
365
  tag = vector_to_tag(vec)
366
  wc = wc_code(vec)
367
 
368
+ # --- ADDED: compute mood_code and skip flag for infinitive/imperative verbs ---
369
  mood_code = group_code(vec, "mood") if wc == "V" else ""
370
  skip_empty_verb_feats = (wc == "V" and mood_code in {"I", "M"}) # navnháttur or boðsháttur
371
+ # --- end added ---
372
 
373
  if tag == "DGd":
374
  return "fyriseting" if lang=="fo" else "preposition"
 
396
  if not c:
397
  continue
398
 
399
+ # --- ADDED: skip only the generic "no" codes for verbs in infinitive/imperative ---
400
  if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
401
  continue
402
+ # --- end added ---
403
 
404
  if wc in {"P","C"} and g == "subcategory":
405
  continue
406
  if (wc, g, c) in HIDE_IN_ANALYSIS:
407
  continue
 
408
  lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
409
  if lbl and lbl not in parts:
410
  parts.append(lbl)
 
474
  lines.append("")
475
  return "\n".join(lines).strip()
476
 
 
 
 
477
  def run_model(sentence: str):
478
  s = (sentence or "").strip()
479
  if not s:
480
  return []
 
481
  tokens = simp_tok(s)
482
  if not tokens:
483
  return []
484
+ enc = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=MAX_TOKENS,
485
+ padding="max_length", truncation=True, return_attention_mask=True, return_tensors="pt")
 
 
 
 
 
 
 
 
 
 
486
  input_ids = enc["input_ids"].to(device)
487
  attention_mask = enc["attention_mask"].to(device)
488
  word_ids = enc.word_ids(batch_index=0)
 
504
 
505
  rows, vec_i, seen = [], 0, set()
506
  for i,wid in enumerate(word_ids):
507
+ if wid is None or begin[i]!=1 or wid in seen:
508
  continue
509
  seen.add(wid)
510
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
511
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
512
  rows.append({"word": word, "vec": vec.int().tolist()})
513
  vec_i += 1
 
514
  return rows
515
 
 
 
 
 
 
 
 
 
 
516
  def render(rows_state, lang: str):
517
  lang = "fo" if lang=="fo" else "en"
518
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
 
527
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
528
  return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
529
 
 
 
 
530
  with gr.Blocks(css=CSS, title="Marka") as demo:
531
  with gr.Row(equal_height=True):
532
  with gr.Column(scale=2, elem_id="input_col"):
 
545
  results_hdr = gr.Row(elem_id="results_hdr", visible=True)
546
  with results_hdr:
547
  results_title = gr.Markdown("### Úrslit / Results")
 
548
  with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
549
  btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=False)
550
  btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
 
582
  gr.update(value=df_main, visible=True),
583
  gr.update(value=df_mean),
584
  gr.update(value=overview),
585
+ gr.update(visible=True), # expanded_acc
586
+ gr.update(visible=show_fo),
587
+ gr.update(visible=not show_fo),
588
+ gr.update(visible=show_en),
589
+ gr.update(visible=not show_en),
590
  lang_current,
591
  )
592
 
 
616
  btn.click(
617
  on_tag,
618
  inputs=[inp, lang_state],
619
+ outputs=[state, out_df, out_mean_df, overview_md, expanded_acc,
620
+ btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state],
 
 
621
  queue=False,
622
  )
623
 
 
624
  btn_lang_fo_on.click(
625
  on_set_fo,
626
  inputs=[state],