unijoh committed on
Commit
293c12b
·
verified ·
1 Parent(s): 98ba790

Upload 3 files

Browse files
Files changed (3) hide show
  1. Sosialurin-BRAGD_tags.csv +0 -0
  2. app.py +118 -0
  3. requirements.txt +5 -0
Sosialurin-BRAGD_tags.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ import pandas as pd
5
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
6
+
7
# Hub repo id of the fine-tuned token-classification model.
# NOTE(review): placeholder — must be replaced with the real repo id
# before this Space can start.
MODEL_ID = "YOUR_USERNAME/YOUR_MODEL_REPO"

# Tag-definition table shipped alongside this app. This commit uploads
# "Sosialurin-BRAGD_tags.csv"; the previous value
# "Sosialurin-GOLD_tags.csv" pointed at a file that does not exist in
# this repo and would raise FileNotFoundError at startup.
TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

# Inference only: use the GPU when one is available and switch the model
# to eval mode (disables dropout etc.).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
16
+
17
def load_tag_mappings(tags_filepath):
    """Read the tag table CSV and build the tag-decoding lookup.

    The first CSV column holds the human-readable tag ("Original Tag");
    every remaining column is one binary feature. Returns a pair:
    (dict mapping feature-value tuple -> tag string, feature-vector length).
    """
    table = pd.read_csv(tags_filepath)
    mapping = {}
    for _, record in table.iterrows():
        # Key on the feature columns only — everything after the tag column.
        key = tuple(int(v) for v in record.iloc[1:])
        mapping[key] = record["Original Tag"]
    return mapping, len(table.columns) - 1
25
+
26
# Module-level lookup used by vector_to_tag; VEC_LEN is the width of the
# binary feature vector (number of CSV columns minus the tag column).
features_to_tag, VEC_LEN = load_tag_mappings(TAGS_FILEPATH)

# Use the SAME intervals as your demo.py (keep these consistent!)
# Each (start, end) pair is an INCLUSIVE index range inside the model's
# per-token logit vector. During decoding (see tag_sentence) exactly one
# position per interval — the argmax — is set to 1. Positions 0..14 (the
# word-type group) are handled separately and are not listed here.
# NOTE(review): these boundaries must match the training-time feature
# layout of the CSV — confirm against demo.py if either changes.
intervals = (
    (15, 28),
    (29, 32),
    (33, 35),
    (36, 40),
    (41, 42),
    (43, 44),
    (45, 49),
    (50, 52),
    (53, 58),
    (59, 61),
    (62, 64),
    (65, 68),
    (69, 70),
)
44
+
45
def vector_to_tag(vec):
    """Look up the tag string for a decoded 0/1 feature tensor.

    Returns "Unknown Tag" when the vector matches no row of the tag table.
    """
    key = tuple(vec.int().tolist())
    return features_to_tag.get(key, "Unknown Tag")
47
+
48
def tag_sentence(sentence: str):
    """POS-tag a sentence and return one "word<TAB>tag" line per word.

    The sentence is split on whitespace, each word is tagged using the
    prediction at its first sub-token, and unmatched feature vectors map
    to "Unknown Tag" (via vector_to_tag). Empty input yields "".
    """
    text = sentence.strip()
    if not text:
        return ""

    words = text.split()

    # NOTE(review): sentences longer than 128 sub-tokens are silently
    # truncated — confirm this is acceptable for the demo.
    encoding = tokenizer(
        words,
        is_split_into_words=True,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    ids = encoding["input_ids"].to(device)
    mask = encoding["attention_mask"].to(device)
    word_ids = encoding.word_ids(batch_index=0)

    # Mark the first sub-token of every word. Special tokens carry a
    # word id of None and are never marked.
    is_begin = []
    previous = None
    for wid in word_ids:
        is_begin.append(1 if wid is not None and wid != previous else 0)
        previous = wid

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask).logits[0]  # [seq_len, num_labels]

    output_lines = []
    for pos, scores in enumerate(logits):
        # Keep only real (non-padding) positions that begin a word.
        if mask[0, pos].item() != 1 or is_begin[pos] != 1:
            continue

        vec = torch.zeros(VEC_LEN, device=logits.device)

        # Word-type group: one-hot over positions 0..14.
        vec[torch.argmax(scores[0:15]).item()] = 1

        # Feature groups: one winner per inclusive interval.
        for lo, hi in intervals:
            offset = torch.argmax(scores[lo:hi + 1]).item()
            vec[lo + offset] = 1

        wid = word_ids[pos]
        word = words[wid] if wid is not None and wid < len(words) else "<UNK>"
        output_lines.append(f"{word}\t{vector_to_tag(vec)}")

    return "\n".join(output_lines)
109
+
110
# Minimal Gradio UI: one text input, one multi-line text output showing
# the per-word tag lines produced by tag_sentence.
demo = gr.Interface(
    fn=tag_sentence,
    inputs=gr.Textbox(lines=2, label="Sentence"),
    outputs=gr.Textbox(lines=12, label="Token\\tTag"),
    title="Faroese POS Tagger (Demo)"
)

# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ pandas
4
+ numpy
5
+ gradio