Spaces:

Setur
/

Marka

Running

App Files Files Community

unijoh committed on Jan 15

Commit

fb93f14

verified ·

1 Parent(s): af92582

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -55

app.py CHANGED Viewed

@@ -1,40 +1,28 @@
-print("RUNNING APP.PY VERSION: 2026-01-15 16:12 FIXED")
 import os
 import gradio as gr
 import torch
 import numpy as np
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 MODEL_ID = "Setur/BRAGD"
-TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
-HF_TOKEN = os.getenv("BRAGD")
 if not HF_TOKEN:
-    raise RuntimeError("Missing BRAGD token secret.")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
-model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-model.eval()
-def load_tag_mappings(tags_filepath):
-    tags_df = pd.read_csv(tags_filepath)
-    features_to_tag = {
-        tuple(row[1:].values.astype(int)): row["Original Tag"]
-        for _, row in tags_df.iterrows()
-    }
-    vec_len = len(tags_df.columns) - 1
-    return features_to_tag, vec_len
-features_to_tag, VEC_LEN = load_tag_mappings(TAGS_FILEPATH)
-# Use the SAME intervals as your demo.py (keep these consistent!)
-intervals = (
-    (15, 29),  # Subcategories (D,B,E,I,P,Q,N,G,R, X, S,C,O,T,s)
     (30, 33),  # Gender (M,F,N,g)
     (34, 36),  # Number (S,P,n)
     (37, 41),  # Case (N,A,D,G,c)
@@ -49,77 +37,194 @@ intervals = (
     (71, 72),  # Definiteness (D,I)
 )
-def vector_to_tag(vec):
     return features_to_tag.get(tuple(vec.int().tolist()), "Unknown Tag")
-def tag_sentence(sentence: str):
     sentence = sentence.strip()
     if not sentence:
         return ""
-    tokens = sentence.split()
     enc = tokenizer(
         tokens,
         is_split_into_words=True,
         add_special_tokens=True,
-        max_length=128,
         padding="max_length",
         truncation=True,
         return_attention_mask=True,
-        return_tensors="pt"
     )
     input_ids = enc["input_ids"].to(device)
     attention_mask = enc["attention_mask"].to(device)
     word_ids = enc.word_ids(batch_index=0)
-    # begin token mask
-    begin = []
     last = None
     for wid in word_ids:
         if wid is None:
-            begin.append(0)
         elif wid != last:
-            begin.append(1)
         else:
-            begin.append(0)
         last = wid
     with torch.no_grad():
         out = model(input_ids=input_ids, attention_mask=attention_mask)
         logits = out.logits[0]  # [seq_len, num_labels]
-    lines = []
-    for i in range(logits.shape[0]):
-        if attention_mask[0, i].item() != 1 or begin[i] != 1:
-            continue
-        pred = logits[i]
-        vec = torch.zeros(VEC_LEN, device=logits.device)
-        # Word type in [0..14]
-        wt = torch.argmax(pred[0:15]).item()
-        vec[wt] = 1
-        # Interval decoding
-        for a, b in intervals:
-            seg = pred[a:b+1]
-            k = torch.argmax(seg).item()
-            vec[a + k] = 1
-        wid = word_ids[i]
-        word = tokens[wid] if wid is not None and wid < len(tokens) else "<UNK>"
-        lines.append(f"{word}\t{vector_to_tag(vec)}")
     return "\n".join(lines)
 demo = gr.Interface(
     fn=tag_sentence,
     inputs=gr.Textbox(lines=2, label="Setningur"),
     outputs=gr.Textbox(lines=12, label="Orð\\tMark"),
-    title="BRAGD-markarin"
 )
 if __name__ == "__main__":
-    demo.launch()

+print("RUNNING APP.PY VERSION: 2026-01-15 16:20 DICT_INTERVALS + REGEX TOK")
 import os
+import re
+import string
 import gradio as gr
 import torch
 import numpy as np
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForTokenClassification
+# ----------------------------
+# Config
+# ----------------------------
 MODEL_ID = "Setur/BRAGD"
+TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"  # must be present in the Space repo
+HF_TOKEN = os.getenv("BRAGD")  # Space secret name
 if not HF_TOKEN:
+    raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
+# Match UPDATED demo.py intervals
+INTERVALS = (
+    (15, 29),  # Subcategories (D,B,E,I,P,Q,N,G,R,X,S,C,O,T,s)
     (30, 33),  # Gender (M,F,N,g)
     (34, 36),  # Number (S,P,n)
     (37, 41),  # Case (N,A,D,G,c)
     (71, 72),  # Definiteness (D,I)
 )
+# ----------------------------
+# Load model + tokenizer
+# ----------------------------
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
+model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+model.eval()
+# ----------------------------
+# Tag mapping + dict_intervals
+# ----------------------------
def load_tag_mappings(tags_filepath: str):
    """Load the tag table and build both lookup directions in one pass.

    The CSV must contain an ``"Original Tag"`` column; every column after the
    first one is read as an integer feature flag.

    Args:
        tags_filepath: Path to the tag-mapping CSV file.

    Returns:
        A ``(tag_to_features, features_to_tag, vec_len)`` tuple:
        ``tag_to_features`` maps a tag name to its 1-D integer numpy vector,
        ``features_to_tag`` maps the same vector (as a tuple of ints) back to
        the tag name, and ``vec_len`` is the number of columns minus one.

    Raises:
        KeyError: If the CSV has no ``"Original Tag"`` column.
        ValueError: If a feature cell is missing or cannot be cast to int.
    """
    tags_df = pd.read_csv(tags_filepath)
    tag_names = tags_df["Original Tag"].tolist()

    feature_frame = tags_df.iloc[:, 1:]
    # A float column holding NaN would be cast to garbage by astype(int)
    # instead of failing, so reject missing cells explicitly.
    if feature_frame.isna().to_numpy().any():
        raise ValueError(f"{tags_filepath} contains missing feature values.")
    # One vectorized conversion instead of two iterrows() passes.
    feature_matrix = feature_frame.to_numpy().astype(int)

    tag_to_features = {}
    features_to_tag = {}
    for tag, row in zip(tag_names, feature_matrix):
        tag_to_features[tag] = row
        features_to_tag[tuple(row.tolist())] = tag

    vec_len = len(tags_df.columns) - 1
    return tag_to_features, features_to_tag, vec_len
+tag_to_features, features_to_tag, VEC_LEN = load_tag_mappings(TAGS_FILEPATH)
+# Safety check: if this fails, you uploaded the wrong CSV for the model
+if hasattr(model, "config") and hasattr(model.config, "num_labels"):
+    if model.config.num_labels != VEC_LEN:
+        raise RuntimeError(
+            f"Label size mismatch: model has num_labels={model.config.num_labels}, "
+            f"but {TAGS_FILEPATH} implies {VEC_LEN}. "
+            "You likely uploaded the wrong tag mapping CSV."
+        )
def process_tag_features(tag_to_features: dict, intervals, num_word_types: int = 15):
    """Compute, for each POS class, which feature intervals are ever used.

    An interval ``(a, b)`` (inclusive on both ends) is allowed for POS class
    ``wt`` when at least one distinct tag vector with ``vec[wt] == 1`` has a
    non-zero sum over positions ``a..b``.

    Args:
        tag_to_features: Mapping of tag name to its feature vector.
        intervals: Iterable of ``(start, end)`` inclusive index pairs.
        num_word_types: Number of leading vector positions that encode the
            POS class. Defaults to 15, the value previously hard-coded here.

    Returns:
        A dict mapping every class index in ``range(num_word_types)`` to the
        list of allowed intervals (an empty list when no tag uses the class).
    """
    # Deduplicate vectors so each distinct tag shape is counted once.
    unique_rows = {tuple(arr) for arr in tag_to_features.values()}
    if not unique_rows:
        return {wt: [] for wt in range(num_word_types)}

    matrix = np.array(list(unique_rows))

    dict_intervals = {}
    for wt in range(num_word_types):
        rows_for_class = matrix[matrix[:, wt] == 1]
        if rows_for_class.shape[0] == 0:
            dict_intervals[wt] = []
            continue
        column_totals = rows_for_class.sum(axis=0)
        dict_intervals[wt] = [
            interval
            for interval in intervals
            if np.sum(column_totals[interval[0] : interval[1] + 1]) != 0
        ]
    return dict_intervals
+DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
def vector_to_tag(vec: torch.Tensor) -> str:
    """Return the tag name registered for a decoded feature vector.

    The tensor is cast to integers and looked up, as a tuple, in the
    module-level ``features_to_tag`` table. Vectors with no entry yield the
    literal ``"Unknown Tag"``.
    """
    lookup_key = tuple(vec.int().tolist())
    if lookup_key in features_to_tag:
        return features_to_tag[lookup_key]
    return "Unknown Tag"
+# ----------------------------
+# Tokenization (match updated demo.py)
+# ----------------------------
# Compiled once at import time: a run of word characters, or one single
# character from string.punctuation (ASCII punctuation only).
_TOKEN_PATTERN = re.compile(r"\w+|[" + re.escape(string.punctuation) + "]")


def simp_tok(sentence: str) -> list:
    """Split a sentence into word tokens and single punctuation tokens.

    Args:
        sentence: Raw input text.

    Returns:
        The tokens in order of appearance. Whitespace is discarded.

    NOTE(review): characters that are neither word characters nor in
    ``string.punctuation`` (for example ``«``, ``»`` or ``–``) are dropped
    silently. That is kept as-is so tokenization stays unchanged; confirm it
    is intended.
    """
    return _TOKEN_PATTERN.findall(sentence)
+# ----------------------------
+# Decoding (match updated demo.py logic)
+# ----------------------------
def predict_vectors(
    logits: torch.Tensor,
    attention_mask: torch.Tensor,
    begin_tokens,
    dict_intervals,
    vec_len: int,
    num_word_types: int = 15,
):
    """Decode one feature vector per word from per-token logits.

    Only positions that are attended (``attention_mask == 1``) and marked as
    the first sub-token of a word (``begin_tokens == 1``) are decoded. For
    each such position the POS class is the argmax over the first
    ``num_word_types`` logits; then, for every interval allowed for that
    class, the argmax inside the interval is switched on.

    Args:
        logits: 2-D tensor indexed as ``[token, label]``.
        attention_mask: 1-D tensor with one entry per token.
        begin_tokens: Sequence of 0/1 flags, one per token.
        dict_intervals: Mapping of POS class index to a list of inclusive
            ``(start, end)`` intervals; classes without an entry get none.
        vec_len: Length of each output vector.
        num_word_types: Number of leading logits that encode the POS class.
            Defaults to 15, the value previously hard-coded here.

    Returns:
        A list of 1-D tensors of length ``vec_len`` on ``logits.device``, one
        per decoded word, in token order.
    """
    # Read the mask once instead of calling .item() for every token.
    mask_values = attention_mask.tolist()

    vectors = []
    for idx in range(logits.shape[0]):
        if mask_values[idx] != 1 or begin_tokens[idx] != 1:
            continue

        pred_logits = logits[idx]
        vec = torch.zeros(vec_len, device=logits.device)

        # Softmax is monotonic, so argmax over raw logits picks the same
        # class without the extra work or float-rounding ties.
        wt = int(torch.argmax(pred_logits[:num_word_types]))
        vec[wt] = 1

        # Only feature groups that exist for this POS class are decoded.
        for a, b in dict_intervals.get(wt, []):
            k = int(torch.argmax(pred_logits[a : b + 1]))
            vec[a + k] = 1

        vectors.append(vec)
    return vectors
def tag_sentence(sentence: str, max_len: int = 128):
    """Tag every word of a sentence and return ``word<TAB>tag`` lines.

    The text is stripped, tokenized with ``simp_tok``, encoded with the
    module-level tokenizer (padded and truncated to ``max_len``), run through
    the module-level model, decoded with ``predict_vectors`` and mapped back
    to tag names with ``vector_to_tag``.

    Args:
        sentence: Raw input text.
        max_len: Sequence length passed to the tokenizer as ``max_length``.

    Returns:
        One line per tagged word joined by newlines, or an empty string when
        the input contains no tokens.
    """
    text = sentence.strip()
    words = simp_tok(text) if text else []
    if not words:
        return ""

    encoded = tokenizer(
        words,
        is_split_into_words=True,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    ids_on_device = encoded["input_ids"].to(device)
    mask_on_device = encoded["attention_mask"].to(device)
    word_ids = encoded.word_ids(batch_index=0)

    # A position starts a word when it has a word id that differs from the
    # word id of the position right before it.
    previous_ids = [None] + list(word_ids[:-1])
    begin_tokens = [
        1 if (wid is not None and wid != prev) else 0
        for wid, prev in zip(word_ids, previous_ids)
    ]

    with torch.no_grad():
        result = model(input_ids=ids_on_device, attention_mask=mask_on_device)
        logits = result.logits[0]  # [seq_len, num_labels]

    vectors = predict_vectors(logits, mask_on_device[0], begin_tokens, DICT_INTERVALS, VEC_LEN)

    # Pair decoded vectors with words in order: the n-th emitted line uses
    # the n-th vector.
    rows = []
    emitted_word_ids = set()
    for wid, is_begin in zip(word_ids, begin_tokens):
        if wid is None or is_begin != 1 or wid in emitted_word_ids:
            continue
        emitted_word_ids.add(wid)
        position = len(rows)
        word = words[wid] if wid < len(words) else "<UNK>"
        if position < len(vectors):
            tag = vector_to_tag(vectors[position])
        else:
            tag = "Unknown Tag"
        rows.append(f"{word}\t{tag}")
    return "\n".join(rows)
+# ----------------------------
+# Gradio UI
+# ----------------------------
 demo = gr.Interface(
     fn=tag_sentence,
     inputs=gr.Textbox(lines=2, label="Setningur"),
     outputs=gr.Textbox(lines=12, label="Orð\\tMark"),
+    title="BRAGD-markarin",
+    description=(
+        "Skriv ein setning og fá mark (POS/morfologi). "
+        "Model: Setur/BRAGD. "
+        "Um alt verður 'Unknown Tag', er tags-fílan ofta skeiv (skeivt CSV) ella labels samsvara ikki."
+    ),
 )
 if __name__ == "__main__":
+    demo.launch()