Spaces:

Setur
/

Marka

Sleeping

App Files Files Community

unijoh committed on Jan 20

Commit

a583cec

verified ·

1 Parent(s): e334dd4

Update app.py

Browse files

Files changed (1) hide show

app.py +285 -58

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import os
 import re
 import string
 import gradio as gr
 import torch
@@ -12,27 +14,28 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification
 # Config
 # ----------------------------
 MODEL_ID = "Setur/BRAGD"
-TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"  # must be present in the Space repo
-HF_TOKEN = os.getenv("BRAGD")  # Space secret name
 if not HF_TOKEN:
     raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
-# Match UPDATED demo.py intervals
 INTERVALS = (
-    (15, 29),  # Subcategories (D,B,E,I,P,Q,N,G,R,X,S,C,O,T,s)
-    (30, 33),  # Gender (M,F,N,g)
-    (34, 36),  # Number (S,P,n)
-    (37, 41),  # Case (N,A,D,G,c)
-    (42, 43),  # Article/No-Article (Article,a)
-    (44, 45),  # Proper/Not Proper Noun (Proper,r)
-    (46, 50),  # Degree (P,C,S,A,d)
-    (51, 53),  # Declension (S,W,e)
-    (54, 60),  # Mood (I,M,N,S,P,E,U)
-    (61, 63),  # Voice (A,M,v)
-    (64, 66),  # Tense (P,A,t)
-    (67, 70),  # Person (1,2,3,p)
-    (71, 72),  # Definiteness (D,I)
 )
 # ----------------------------
@@ -46,21 +49,29 @@ model.to(device)
 model.eval()
 # ----------------------------
-# Tag mapping + dict_intervals
 # ----------------------------
 def load_tag_mappings(tags_filepath: str):
     tags_df = pd.read_csv(tags_filepath)
-    # Map: Original Tag -> feature vector, and feature vector -> Original Tag
-    tag_to_features = {row["Original Tag"]: row[1:].values.astype(int) for _, row in tags_df.iterrows()}
-    features_to_tag = {tuple(row[1:].values.astype(int)): row["Original Tag"] for _, row in tags_df.iterrows()}
-    vec_len = len(tags_df.columns) - 1
-    return tag_to_features, features_to_tag, vec_len
-tag_to_features, features_to_tag, VEC_LEN = load_tag_mappings(TAGS_FILEPATH)
-# Safety check: if this fails, you uploaded the wrong CSV for the model
 if hasattr(model, "config") and hasattr(model.config, "num_labels"):
     if model.config.num_labels != VEC_LEN:
         raise RuntimeError(
@@ -69,12 +80,17 @@ if hasattr(model, "config") and hasattr(model.config, "num_labels"):
             "You likely uploaded the wrong tag mapping CSV."
         )
 def process_tag_features(tag_to_features: dict, intervals):
-    """Compute allowed intervals per POS (dict_intervals) like your updated demo.py."""
     list_of_tags = list(tag_to_features.values())
     unique_arrays = [np.array(tpl) for tpl in set(tuple(arr) for arr in list_of_tags)]
-    # Collect all feature vectors for each POS class (0..14)
     word_type_masks = {}
     for wt in range(15):
         word_type_masks[wt] = [arr for arr in unique_arrays if arr[wt] == 1]
@@ -97,27 +113,100 @@ def process_tag_features(tag_to_features: dict, intervals):
     return dict_intervals
 DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
-def vector_to_tag(vec: torch.Tensor) -> str:
-    return features_to_tag.get(tuple(vec.int().tolist()), "Unknown Tag")
 # ----------------------------
-# Tokenization (match updated demo.py)
 # ----------------------------
 def simp_tok(sentence: str):
-    """Tokenize into words and punctuation (regex), matching your updated demo.py."""
     return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
 # ----------------------------
-# Decoding (match updated demo.py logic)
 # ----------------------------
 def predict_vectors(logits: torch.Tensor, attention_mask: torch.Tensor, begin_tokens, dict_intervals, vec_len: int):
-    """
-    Decode one feature-vector per word:
-    - pick POS (0..14)
-    - then pick subclasses only in allowed intervals for that POS
-    """
     softmax = torch.nn.Softmax(dim=0)
     vectors = []
@@ -135,7 +224,7 @@ def predict_vectors(logits: torch.Tensor, attention_mask: torch.Tensor, begin_to
         wt = torch.argmax(probs).item()
         vec[wt] = 1
-        # Allowed feature groups for this POS
         for (a, b) in dict_intervals.get(wt, []):
             seg = pred_logits[a : b + 1]
             probs = softmax(seg)
@@ -146,14 +235,95 @@ def predict_vectors(logits: torch.Tensor, attention_mask: torch.Tensor, begin_to
     return vectors
-def tag_sentence(sentence: str, max_len: int = 128):
-    sentence = sentence.strip()
     if not sentence:
-        return ""
     tokens = simp_tok(sentence)
     if not tokens:
-        return ""
     enc = tokenizer(
         tokens,
@@ -170,7 +340,6 @@ def tag_sentence(sentence: str, max_len: int = 128):
     attention_mask = enc["attention_mask"].to(device)
     word_ids = enc.word_ids(batch_index=0)
-    # begin token mask: first subtoken per word
     begin_tokens = []
     last = None
     for wid in word_ids:
@@ -184,12 +353,11 @@ def tag_sentence(sentence: str, max_len: int = 128):
     with torch.no_grad():
         out = model(input_ids=input_ids, attention_mask=attention_mask)
-        logits = out.logits[0]  # [seq_len, num_labels]
     vectors = predict_vectors(logits, attention_mask[0], begin_tokens, DICT_INTERVALS, VEC_LEN)
-    # Map vectors back to tokens (one vector per original word)
-    lines = []
     vec_i = 0
     seen_word_ids = set()
@@ -203,25 +371,84 @@ def tag_sentence(sentence: str, max_len: int = 128):
         seen_word_ids.add(wid)
         word = tokens[wid] if wid < len(tokens) else "<UNK>"
-        tag = vector_to_tag(vectors[vec_i]) if vec_i < len(vectors) else "Unknown Tag"
-        lines.append(f"{word}\t{tag}")
         vec_i += 1
     return "\n".join(lines)
 # ----------------------------
 # Gradio UI
 # ----------------------------
-demo = gr.Interface(
-    fn=tag_sentence,
-    inputs=gr.Textbox(lines=2, label="Setningur"),
-    outputs=gr.Textbox(lines=12, label="Orð\\tMark"),
-    title="BRAGD-markarin",
-    description=(
-        "Skriv ein setning og fá hann markaðan. "
-        "Model: Setur/BRAGD. "
-    ),
-)
 if __name__ == "__main__":
-    demo.launch()

 import os
 import re
 import string
+import json
+from collections import defaultdict
 import gradio as gr
 import torch
 # Config
 # ----------------------------
 MODEL_ID = "Setur/BRAGD"
+TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"          # must be in the Space repo
+LABELS_FILEPATH = "tag_labels.json"                 # add this file to the Space repo
+HF_TOKEN = os.getenv("BRAGD")  # Space secret name
 if not HF_TOKEN:
     raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
+# Match your UPDATED demo.py intervals
 INTERVALS = (
+    (15, 29),  # Subcategories
+    (30, 33),  # Gender
+    (34, 36),  # Number
+    (37, 41),  # Case
+    (42, 43),  # Article/No-Article
+    (44, 45),  # Proper/Not Proper
+    (46, 50),  # Degree
+    (51, 53),  # Declension
+    (54, 60),  # Mood
+    (61, 63),  # Voice
+    (64, 66),  # Tense
+    (67, 70),  # Person
+    (71, 72),  # Definiteness
 )
 # ----------------------------
 model.eval()
 # ----------------------------
+# Tag mapping (CSV)
 # ----------------------------
 def load_tag_mappings(tags_filepath: str):
     tags_df = pd.read_csv(tags_filepath)
+    feature_cols = list(tags_df.columns[1:])
+    tag_to_features = {
+        row["Original Tag"]: row[1:].values.astype(int)
+        for _, row in tags_df.iterrows()
+    }
+    features_to_tag = {
+        tuple(row[1:].values.astype(int)): row["Original Tag"]
+        for _, row in tags_df.iterrows()
+    }
+    vec_len = len(feature_cols)
+    return tag_to_features, features_to_tag, vec_len, feature_cols
+tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
+# Safety check
 if hasattr(model, "config") and hasattr(model.config, "num_labels"):
     if model.config.num_labels != VEC_LEN:
         raise RuntimeError(
             "You likely uploaded the wrong tag mapping CSV."
         )
def vector_to_tag(vec: torch.Tensor) -> str:
    """Return the original tag for a feature vector, or "Unknown Tag".

    The tensor is cast to integers and turned into a tuple so it can be
    used as a key into ``features_to_tag``.
    """
    lookup_key = tuple(vec.int().tolist())
    return features_to_tag.get(lookup_key, "Unknown Tag")
+# ----------------------------
+# Compute allowed intervals per POS
+# ----------------------------
 def process_tag_features(tag_to_features: dict, intervals):
     list_of_tags = list(tag_to_features.values())
     unique_arrays = [np.array(tpl) for tpl in set(tuple(arr) for arr in list_of_tags)]
     word_type_masks = {}
     for wt in range(15):
         word_type_masks[wt] = [arr for arr in unique_arrays if arr[wt] == 1]
     return dict_intervals
 DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
+# ----------------------------
+# Load bilingual labels
+# ----------------------------
def load_labels(path: str):
    """Read the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
# The label file is optional: without it the app still runs and simply shows
# codes without descriptions. Only the failures we expect are handled here
# (file missing/unreadable, invalid JSON or encoding, wrong top-level type);
# anything else is a real bug and should surface.
try:
    LABELS = load_labels(LABELS_FILEPATH)
    if not isinstance(LABELS, dict):
        raise ValueError("top-level JSON value must be an object")
except (OSError, ValueError) as exc:
    print(f"Warning: could not load labels from {LABELS_FILEPATH!r}: {exc}")
    LABELS = {"fo": {"global": {}, "by_wc": {}}, "en": {"global": {}, "by_wc": {}}}
def label_for(lang: str, group: str, wc_code: str, code: str) -> str:
    """Look up the label for ``code`` in ``group``.

    A word-class-specific entry (``by_wc[group][wc_code][code]``) wins over
    the global one (``global[group][code]``). Any language other than
    "fo" or "en" is treated as "fo". Returns "" when nothing matches.
    """
    if lang not in ("fo", "en"):
        lang = "fo"

    lang_tables = LABELS.get(lang, {})
    per_class = lang_tables.get("by_wc", {})
    shared = lang_tables.get("global", {})

    # Word-class-specific lookup; skipped entirely when no word class is known.
    if wc_code and group in per_class:
        class_tables = per_class[group]
        if wc_code in class_tables and code in class_tables[wc_code]:
            return class_tables[wc_code][code]

    # Global lookup.
    if group in shared:
        group_table = shared[group]
        if code in group_table:
            return group_table[code]

    return ""
+# ----------------------------
+# Feature column groups (from CSV headers)
+# ----------------------------
+def _group_from_colname(col: str):
+    if col == "Article":
+        return ("article", "A")
+    if col == "Proper Noun":
+        return ("proper", "P")
+    if col.startswith("Not-Proper-Noun "):
+        return ("proper", col.split()[-1])  # usually r
+    if col.startswith("No-Article "):
+        return ("article", col.split()[-1])  # usually a
+    prefixes = [
+        ("Word Class ", "word_class"),
+        ("Subcategory ", "subcategory"),
+        ("No-Subcategory ", "subcategory"),
+        ("Gender ", "gender"),
+        ("No-Gender ", "gender"),
+        ("Number ", "number"),
+        ("No-Number ", "number"),
+        ("Case ", "case"),
+        ("No-Case ", "case"),
+        ("Degree ", "degree"),
+        ("No-Degree ", "degree"),
+        ("Declension ", "declension"),
+        ("No-Declension ", "declension"),
+        ("Mood ", "mood"),
+        ("Voice ", "voice"),
+        ("No-Voice ", "voice"),
+        ("Tense ", "tense"),
+        ("No-Tense ", "tense"),
+        ("Person ", "person"),
+        ("No-Person ", "person"),
+        ("Definite ", "definiteness"),
+        ("Indefinite ", "definiteness"),
+    ]
+    for p, g in prefixes:
+        if col.startswith(p):
+            code = col.split()[-1]
+            return (g, code)
+    return (None, None)
# Index the feature columns by group so a vector position can be turned back
# into a (group, code) pair: group -> list[(idx, code)], in CSV column order.
GROUPS = defaultdict(list)
for i, col in enumerate(FEATURE_COLS):
    g, code = _group_from_colname(col)
    if not g:
        continue  # column does not belong to any recognised group
    GROUPS[g].append((i, code))
 # ----------------------------
+# Tokenization
 # ----------------------------
 def simp_tok(sentence: str):
     return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
 # ----------------------------
+# Decoding
 # ----------------------------
 def predict_vectors(logits: torch.Tensor, attention_mask: torch.Tensor, begin_tokens, dict_intervals, vec_len: int):
     softmax = torch.nn.Softmax(dim=0)
     vectors = []
         wt = torch.argmax(probs).item()
         vec[wt] = 1
+        # Allowed feature groups
         for (a, b) in dict_intervals.get(wt, []):
             seg = pred_logits[a : b + 1]
             probs = softmax(seg)
     return vectors
# Feature groups in the order they appear in the description (after word class).
_DESCRIBE_ORDER = (
    "subcategory",
    "gender",
    "number",
    "case",
    "article",
    "proper",
    "degree",
    "declension",
    "mood",
    "voice",
    "tense",
    "person",
    "definiteness",
)

# Built-in labels used when the label file has no entry for a code.
# Defined once here rather than rebuilt for every group of every word.
_FALLBACK_LABELS = {
    "en": {
        "definiteness": {"D": "definite", "I": "indefinite"},
        "article": {"A": "with suffixed definite article", "a": "no definite suffix"},
        "proper": {"P": "proper noun", "r": "not proper noun"},
        "gender": {"g": "no gender"},
        "number": {"n": "no number"},
        "case": {"c": "no case"},
        "degree": {"d": "no degree"},
        "declension": {"e": "no declension"},
        "voice": {"v": "no voice"},
        "tense": {"t": "no tense"},
        "person": {"p": "no person"},
        "subcategory": {"s": "no subcategory"},
    },
    "fo": {
        "definiteness": {"D": "bundið", "I": "óbundið"},
        "article": {"A": "við bundnum eftirlið", "a": "uttan bundið eftirlið"},
        "proper": {"P": "sernavn", "r": "ikki sernavn"},
        "gender": {"g": "einki kyn"},
        "number": {"n": "einki tal"},
        "case": {"c": "einki fall"},
        "degree": {"d": "einki stig"},
        "declension": {"e": "eingin bending"},
        "voice": {"v": "eingin søgn"},
        "tense": {"t": "eingin tíð"},
        "person": {"p": "eingin persónur"},
        "subcategory": {"s": "eingin undirflokkur"},
    },
}


def _first_active_code(vec, group):
    """Return the code of the first column of ``group`` set to 1 in ``vec``, else None."""
    for idx, code in GROUPS.get(group, []):
        if int(vec[idx].item()) == 1:
            return code
    return None


def describe_vector(vec: torch.Tensor, lang: str) -> str:
    """Render a feature vector as a "; "-separated list of "code – label" parts.

    The word class comes first, followed by the active code of each group in
    a fixed order. Labels come from ``label_for``; when it returns nothing, a
    built-in fallback table is consulted (English for ``lang == "en"``,
    Faroese otherwise). A code without any label is shown on its own.

    Args:
        vec: Feature vector indexed like the CSV feature columns.
        lang: Language code for the labels.

    Returns:
        The description string; empty when no known column is active.
    """
    wc_code = _first_active_code(vec, "word_class") or ""

    parts = []
    if wc_code:
        wc_label = label_for(lang, "word_class", wc_code, wc_code)
        parts.append(f"{wc_code} – {wc_label}" if wc_label else wc_code)

    fallback = _FALLBACK_LABELS["en" if lang == "en" else "fo"]
    for group in _DESCRIBE_ORDER:
        chosen = _first_active_code(vec, group)
        if not chosen:
            continue
        # Keep the output meaningful even if the label file lacks this code.
        lbl = label_for(lang, group, wc_code, chosen) or fallback.get(group, {}).get(chosen, "")
        parts.append(f"{chosen} – {lbl}" if lbl else chosen)

    return "; ".join(parts)
+def tag_sentence(sentence: str, lang: str = "fo", max_len: int = 128):
+    sentence = (sentence or "").strip()
     if not sentence:
+        return pd.DataFrame(columns=["Word", "Tag", "Meaning"]), ""
     tokens = simp_tok(sentence)
     if not tokens:
+        return pd.DataFrame(columns=["Word", "Tag", "Meaning"]), ""
     enc = tokenizer(
         tokens,
     attention_mask = enc["attention_mask"].to(device)
     word_ids = enc.word_ids(batch_index=0)
     begin_tokens = []
     last = None
     for wid in word_ids:
     with torch.no_grad():
         out = model(input_ids=input_ids, attention_mask=attention_mask)
+        logits = out.logits[0]
     vectors = predict_vectors(logits, attention_mask[0], begin_tokens, DICT_INTERVALS, VEC_LEN)
+    rows = []
     vec_i = 0
     seen_word_ids = set()
         seen_word_ids.add(wid)
         word = tokens[wid] if wid < len(tokens) else "<UNK>"
+        vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
+        tag = vector_to_tag(vec)
+        meaning = describe_vector(vec, lang)
+        rows.append([word, tag, meaning])
         vec_i += 1
+    df = pd.DataFrame(rows, columns=["Word", "Tag", "Meaning"])
+    tsv = "\n".join([f"{w}\t{t}\t{m}" for w, t, m in rows])
+    return df, tsv
def build_legend(lang: str):
    """Build the Markdown legend listing the word-class codes and their labels.

    Any language other than "fo" or "en" falls back to "fo". When no
    word-class labels are available, a single bullet explains that the label
    file is missing.
    """
    if lang not in ("fo", "en"):
        lang = "fo"

    # (title, hint, word-class heading, "labels missing" notice) per language.
    texts = {
        "en": (
            "### Legend (what the codes mean)",
            "- Tip: hover/copy from the TSV box if you want to paste into spreadsheets or docs.",
            "#### Word classes",
            "(No label file loaded — add tag_labels.json to the repo root.)",
        ),
        "fo": (
            "### Markingaryvirlit (hvat kóðurnar merkja)",
            "- Tips: tú kanst copy/paste úr TSV-kassanum inn í skjøl ella rokniskjøl.",
            "#### Orðaflokkar",
            "(Eingin label-fíla er innlisin — legg tag_labels.json í rótina á repo.)",
        ),
    }
    title, hint, wc_title, missing = texts[lang]

    wc_map = LABELS.get(lang, {}).get("global", {}).get("word_class", {})

    lines = [title, hint, "", wc_title]
    if not wc_map:
        lines.append(f"- {missing}")
    else:
        lines.extend(f"- **{code}**: {wc_map[code]}" for code in sorted(wc_map))
    return "\n".join(lines)
 # ----------------------------
 # Gradio UI
 # ----------------------------
theme = gr.themes.Soft()

with gr.Blocks(theme=theme, title="BRAGD-markarin") as demo:
    # Page header.
    gr.Markdown(
        "## BRAGD-markarin\n"
        "Skriv ein setning og fá hann markaðan.\n\n"
        "**Model:** `Setur/BRAGD`"
    )

    # Language selector for the labels and the legend.
    with gr.Row():
        lang = gr.Dropdown(
            choices=[("Føroyskt", "fo"), ("English", "en")],
            value="fo",
            label="Mál / Language",
        )

    # Input sentence and trigger button.
    inp = gr.Textbox(lines=3, label="Setningur / Sentence", placeholder="Skriv her…")
    btn = gr.Button("Marka / Tag", variant="primary")

    # Results: a table plus a plain TSV copy for pasting elsewhere.
    out_df = gr.Dataframe(
        headers=["Word", "Tag", "Meaning"],
        wrap=True,
        interactive=False,
        label="Úrslit / Results",
    )
    out_tsv = gr.Textbox(lines=10, label="Copy/paste (TSV)", interactive=False)

    # Collapsible legend, initially rendered in Faroese.
    with gr.Accordion("Markingaryvirlit / Legend", open=False):
        legend_md = gr.Markdown(build_legend("fo"))

    def _run(sentence, lang_choice):
        # Tag the sentence, then refresh the legend in the same language.
        return (*tag_sentence(sentence, lang_choice), build_legend(lang_choice))

    btn.click(fn=_run, inputs=[inp, lang], outputs=[out_df, out_tsv, legend_md])
    # Switching language re-renders the legend right away.
    lang.change(fn=lambda l: build_legend(l), inputs=[lang], outputs=[legend_md])

if __name__ == "__main__":
    demo.launch()