Spaces:

Setur
/

Marka

Running

App Files Community

unijoh committed on Jan 20

Commit

22e1960

verified ·

1 Parent(s): 13dbad2

Upload 2 files

Browse files

Files changed (2) hide show

app.py +291 -369
tag_labels.json +25 -21

app.py CHANGED Viewed

@@ -1,12 +1,10 @@
-import os
-import re
-import string
-import json
 from collections import defaultdict
 import gradio as gr
 import torch
 import numpy as np
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 # ----------------------------
@@ -15,7 +13,7 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification
 MODEL_ID = "Setur/BRAGD"
 TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"   # must match model labels
 LABELS_FILEPATH = "tag_labels.json"           # add to repo root (FO+EN labels)
-HF_TOKEN = os.getenv("BRAGD")                 # Space secret name
 if not HF_TOKEN:
     raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
@@ -28,267 +26,178 @@ INTERVALS = (
     (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
 )
-GROUP_ORDER = [
-    "subcategory", "gender", "number", "case", "article", "proper",
-    "degree", "declension", "mood", "voice", "tense", "person", "definiteness"
-]
-# You said subcategory B doesn't exist and will be deleted from the CSV
 HIDE_CODES = {"subcategory": {"B"}}
 UI = {
-    "fo": {
-        "title": "BRAGD-markarin",
-        "inst": "Skriv ein setning og fá hann markaðan.",
-        "model": "Model:",
-        "word": "Orð",
-        "tag": "Mark",
-        "analysis": "Útgreining",
-        "results": "Úrslit",
-        "expanded": "Útgreinað marking",
-        "legend": "Markingaryvirlit",
-        "lang": "Mál",
-    },
-    "en": {
-        "title": "BRAGD tagger",
-        "inst": "Type a sentence and get it tagged.",
-        "model": "Model:",
-        "word": "Word",
-        "tag": "Tag",
-        "analysis": "Analysis",
-        "results": "Results",
-        "expanded": "Expanded tags",
-        "legend": "Tag legend",
-        "lang": "Language",
-    },
 }
-# Theme color: #89AFA9 (+ close shades)
-CSS = r"""
 :root{
   --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
   --primary-100:#E1ECEA; --primary-200:#C6DAD6;
 }
-.gr-button-primary{
-  background:var(--primary-500)!important;
-  border-color:var(--primary-600)!important;
-  color:#0b1b19!important;
-  padding: 8px 14px !important;
-  font-size: 14px !important;
 }
-.gr-button-primary:hover{ background:var(--primary-600)!important; }
 a{ color:var(--primary-700)!important; }
-/* tighten overall vertical spacing a bit */
-.gradio-container .prose{ margin: 0 !important; }
-#header_md h2, #header_md p { margin: 0.2rem 0 !important; }
-/* language dropdown: small, no big box */
-#lang_dd { max-width: 160px; }
-#lang_dd .wrap { padding-top: 0 !important; }
-/* results table */
-table.bragd {
-  width: 100%;
-  border-collapse: separate;
-  border-spacing: 0;
-  border: 1px solid rgba(0,0,0,0.08);
-  border-radius: 12px;
-  overflow: hidden;
 }
-table.bragd thead th{
-  text-align: left;
-  font-weight: 600;
-  background: rgba(137,175,169,0.20);
-  padding: 10px 12px;
-  border-bottom: 1px solid rgba(0,0,0,0.08);
-  font-size: 13px;
 }
-table.bragd tbody td{
-  padding: 10px 12px;
-  border-bottom: 1px solid rgba(0,0,0,0.06);
-  vertical-align: top;
-  font-size: 14px;
 }
-table.bragd tbody tr:last-child td{ border-bottom: none; }
-td.wordcol, td.tagcol { white-space: nowrap; }
-td.tagcol { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; }
-td.analysiscol { white-space: normal; }
-/* Make Orð/Word column fit content */
-td.wordcol { width: 1%; }
-td.tagcol { min-width: 8ch; width: 1%; }
-/* Expanded tags table a touch smaller */
-table.bragd.small tbody td, table.bragd.small thead th { font-size: 13px; }
-/* header row for results + language picker */
-#results_header .prose h3 { margin: 0.2rem 0 !important; }
 """
 # ----------------------------
-# Utilities
 # ----------------------------
 def simp_tok(sentence: str):
     return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
-def load_tag_mappings(tags_filepath: str):
-    import pandas as pd  # local import keeps cold-start slightly lighter
-    tags_df = pd.read_csv(tags_filepath)
-    feature_cols = list(tags_df.columns[1:])
-    tag_to_features = {row["Original Tag"]: row[1:].values.astype(int) for _, row in tags_df.iterrows()}
-    features_to_tag = {tuple(row[1:].values.astype(int)): row["Original Tag"] for _, row in tags_df.iterrows()}
     return tag_to_features, features_to_tag, len(feature_cols), feature_cols
 def group_from_col(col: str):
-    if col == "Article":
-        return ("article", "A")
-    if col.startswith("No-Article "):
-        return ("article", col.split()[-1])
-    if col == "Proper Noun":
-        return ("proper", "P")
-    if col.startswith("Not-Proper-Noun "):
-        return ("proper", col.split()[-1])
     prefixes = [
-        ("Word Class ", "word_class"),
-        ("Subcategory ", "subcategory"), ("No-Subcategory ", "subcategory"),
-        ("Gender ", "gender"), ("No-Gender ", "gender"),
-        ("Number ", "number"), ("No-Number ", "number"),
-        ("Case ", "case"), ("No-Case ", "case"),
-        ("Degree ", "degree"), ("No-Degree ", "degree"),
-        ("Declension ", "declension"), ("No-Declension ", "declension"),
-        ("Mood ", "mood"),
-        ("Voice ", "voice"), ("No-Voice ", "voice"),
-        ("Tense ", "tense"), ("No-Tense ", "tense"),
-        ("Person ", "person"), ("No-Person ", "person"),
-        ("Definite ", "definiteness"), ("Indefinite ", "definiteness"),
     ]
-    for p, g in prefixes:
         if col.startswith(p):
             return (g, col.split()[-1])
-    return (None, None)
 def process_tag_features(tag_to_features: dict, intervals):
-    # Compute allowed intervals per POS (like demo.py)
-    list_of_tags = list(tag_to_features.values())
-    unique_arrays = [np.array(tpl) for tpl in set(tuple(arr) for arr in list_of_tags)]
-    word_type_masks = {wt: [arr for arr in unique_arrays if arr[wt] == 1] for wt in range(15)}
-    dict_intervals = {}
-    for wt in range(15):
-        labels = word_type_masks[wt]
         if not labels:
-            dict_intervals[wt] = []
             continue
         sum_labels = np.sum(np.array(labels), axis=0)
-        allowed = [interval for interval in intervals if np.sum(sum_labels[interval[0]:interval[1] + 1]) != 0]
-        dict_intervals[wt] = allowed
-    return dict_intervals
-def predict_vectors(logits: torch.Tensor, attention_mask: torch.Tensor, begin_tokens, dict_intervals, vec_len: int):
     softmax = torch.nn.Softmax(dim=0)
     vectors = []
     for idx in range(len(logits)):
-        if attention_mask[idx].item() != 1:
-            continue
-        if begin_tokens[idx] != 1:
             continue
-        pred_logits = logits[idx]
         vec = torch.zeros(vec_len, device=logits.device)
-        # POS
-        probs = softmax(pred_logits[0:15])
-        wt = torch.argmax(probs).item()
-        vec[wt] = 1
-        # feature groups
-        for (a, b) in dict_intervals.get(wt, []):
-            seg = pred_logits[a:b + 1]
-            probs = softmax(seg)
-            k = torch.argmax(probs).item()
-            vec[a + k] = 1
         vectors.append(vec)
     return vectors
-def clean_label(s: str) -> str:
-    s = (s or "").strip()
-    s = re.sub(r"\s+", " ", s)
-    return s.strip(" -;:,")
-def html_escape(s: str) -> str:
-    return (
-        (s or "")
-        .replace("&", "&amp;")
-        .replace("<", "&lt;")
-        .replace(">", "&gt;")
-        .replace('"', "&quot;")
-    )
-def rows_to_table_html(headers, rows, small=False):
-    cls = "bragd small" if small else "bragd"
-    thead = "".join(f"<th>{html_escape(h)}</th>" for h in headers)
-    body = []
-    for r in rows:
-        body.append(
-            "<tr>"
-            f"<td class='wordcol'>{html_escape(r[0])}</td>"
-            f"<td class='tagcol'>{html_escape(r[1])}</td>"
-            f"<td class='analysiscol'>{html_escape(r[2])}</td>"
-            "</tr>"
-        )
-    tbody = "".join(body) if body else "<tr><td class='wordcol'></td><td class='tagcol'></td><td class='analysiscol'></td></tr>"
-    return f"<table class='{cls}'><thead><tr>{thead}</tr></thead><tbody>{tbody}</tbody></table>"
 # ----------------------------
-# Load labels (FO+EN)
 # ----------------------------
 with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
     LABELS = json.load(f)
-def label_for(lang: str, group: str, wc_code: str, code: str) -> str:
-    lang = "fo" if lang == "fo" else "en"
     by_wc = LABELS.get(lang, {}).get("by_word_class", {})
     glob = LABELS.get(lang, {}).get("global", {})
-    if wc_code and wc_code in by_wc and code in by_wc[wc_code].get(group, {}):
-        return by_wc[wc_code][group][code]
     return glob.get(group, {}).get(code, "")
 # ----------------------------
-# Load mapping CSV + model
 # ----------------------------
 tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-model.eval()
 if hasattr(model, "config") and hasattr(model.config, "num_labels"):
     if model.config.num_labels != VEC_LEN:
-        raise RuntimeError(
-            f"Label size mismatch: model has num_labels={model.config.num_labels}, "
-            f"but {TAGS_FILEPATH} implies {VEC_LEN}. You likely uploaded the wrong CSV."
-        )
 DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
-# Build group lookup from CSV feature columns
-GROUPS = defaultdict(list)  # group -> list[(idx, code, colname)]
-for i, col in enumerate(FEATURE_COLS):
-    g, code = group_from_col(col)
     if g and code not in HIDE_CODES.get(g, set()):
         GROUPS[g].append((i, code, col))
@@ -296,99 +205,95 @@ def vector_to_tag(vec: torch.Tensor) -> str:
     return features_to_tag.get(tuple(vec.int().tolist()), "Unknown Tag")
 def wc_code(vec: torch.Tensor) -> str:
-    for idx, code, _ in GROUPS["word_class"]:
-        if int(vec[idx].item()) == 1:
             return code
     return ""
 def group_code(vec: torch.Tensor, group: str) -> str:
     hidden = HIDE_CODES.get(group, set())
-    for idx, code, _ in GROUPS.get(group, []):
         if code in hidden:
             continue
-        if int(vec[idx].item()) == 1:
             return code
     return ""
 # ----------------------------
-# Presentation logic
 # ----------------------------
-HIDE_IN_ANALYSIS_FO = {"stýrir falli", "stýrir ikki falli"}
-HIDE_IN_ANALYSIS_EN = {"governs case", "does not govern case"}
 def analysis_text(vec: torch.Tensor, lang: str) -> str:
     """
     Útgreining / Analysis:
-    - only human text (no codes)
-    - skip "stýrir falli" / "stýrir ikki falli"
-    - DGd becomes ONLY "fyriseting"/"preposition"
-    - pronouns and conjunctions start from subcategory (no duplicated base label)
     """
-    lang = "fo" if lang == "fo" else "en"
-    raw_tag = vector_to_tag(vec)
     wc = wc_code(vec)
-    # DGd override: ONLY fyriseting / preposition
-    if raw_tag == "DGd":
-        return "fyriseting" if lang == "fo" else "preposition"
-    # Determine whether to include base word-class label first
-    include_wc = True
-    if wc == "P":  # pronouns: start from subcategory label
-        include_wc = False
-    if wc == "C":  # conjunctions: prefer the subcategory phrase
-        include_wc = False
-    labels = []
-    if include_wc:
-        wc_lbl = clean_label(label_for(lang, "word_class", wc, wc) or wc)
-        if wc_lbl:
-            labels.append(wc_lbl)
-    # Add groups in stable order
     for g in GROUP_ORDER:
         c = group_code(vec, g)
         if not c:
             continue
-        lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
-        if not lbl:
             continue
-        if lang == "fo" and lbl in HIDE_IN_ANALYSIS_FO:
-            continue
-        if lang == "en" and lbl.lower() in HIDE_IN_ANALYSIS_EN:
-            continue
-        # for conjunctions: ensure the first visible label is the subcategory phrase
-        if wc == "C" and g == "subcategory":
-            labels.insert(0, lbl)
             continue
-        labels.append(lbl)
-    # Fallback if we removed wc label for pronouns/conjunctions and subcategory missing
-    if not labels:
-        wc_lbl = clean_label(label_for(lang, "word_class", wc, wc) or wc)
-        if wc_lbl:
-            labels = [wc_lbl]
-    # Deduplicate while preserving order
-    dedup = []
-    seen = set()
-    for x in labels:
-        if x not in seen:
-            dedup.append(x)
-            seen.add(x)
-    return ", ".join(dedup)
 def expanded_text(vec: torch.Tensor, lang: str) -> str:
     """
     Útgreinað marking / Expanded tags:
-    includes code + label per group (useful for debugging).
     """
-    lang = "fo" if lang == "fo" else "en"
     wc = wc_code(vec)
     parts = []
@@ -404,79 +309,63 @@ def expanded_text(vec: torch.Tensor, lang: str) -> str:
     return "; ".join([p for p in parts if p])
-def build_legend(lang: str) -> str:
-    """
-    Elaborate legend:
-    Under each word class, show all letter codes that appear in the CURRENT CSV.
-    """
-    lang = "fo" if lang == "fo" else "en"
-    # Build codes-by-wc from the CSV mapping vectors
     codes = defaultdict(lambda: defaultdict(set))  # wc -> group -> set(code)
     for arr in tag_to_features.values():
         arr = np.array(arr)
         wc = None
-        for idx, code, _ in GROUPS["word_class"]:
-            if arr[idx] == 1:
                 wc = code
                 break
         if not wc:
             continue
         for g in GROUP_ORDER:
-            for idx, code, _ in GROUPS.get(g, []):
-                if code in HIDE_CODES.get(g, set()):
                     continue
-                if arr[idx] == 1:
                     codes[wc][g].add(code)
-    title = f"### {UI[lang]['legend']}"
     lines = [title, ""]
-    group_names = {
-        "fo": {
-            "subcategory": "Undirflokkur",
-            "gender": "Kyn",
-            "number": "Tal",
-            "case": "Fall",
-            "article": "Bundni/óbundni",
-            "proper": "Sernavn",
-            "degree": "Stig",
-            "declension": "Bending",
-            "mood": "Háttur",
-            "voice": "Søgn",
-            "tense": "Tíð",
-            "person": "Persónur",
-            "definiteness": "Bundni/óbundni",
-        },
-        "en": {
-            "subcategory": "Subcategory",
-            "gender": "Gender",
-            "number": "Number",
-            "case": "Case",
-            "article": "Definiteness (suffix)",
-            "proper": "Proper noun",
-            "degree": "Degree",
-            "declension": "Declension",
-            "mood": "Mood",
-            "voice": "Voice",
-            "tense": "Tense",
-            "person": "Person",
-            "definiteness": "Definiteness",
-        },
-    }[lang]
-    for wc in sorted(codes.keys()):
         wcl = label_for(lang, "word_class", wc, wc) or ""
         lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
         for g in GROUP_ORDER:
-            cs = sorted(codes[wc].get(g, set()))
             if not cs:
                 continue
-            lines.append(f"**{group_names.get(g, g)}**")
             for c in cs:
                 lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
                 lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
@@ -487,13 +376,12 @@ def build_legend(lang: str) -> str:
     return "\n".join(lines).strip()
 # ----------------------------
-# Model run + state
 # ----------------------------
 def run_model(sentence: str):
     s = (sentence or "").strip()
     if not s:
         return []
     tokens = simp_tok(s)
     if not tokens:
         return []
@@ -513,118 +401,152 @@ def run_model(sentence: str):
     attention_mask = enc["attention_mask"].to(device)
     word_ids = enc.word_ids(batch_index=0)
-    # begin token mask: first subtoken per word
-    begin_tokens = []
     last = None
     for wid in word_ids:
         if wid is None:
-            begin_tokens.append(0)
         elif wid != last:
-            begin_tokens.append(1)
         else:
-            begin_tokens.append(0)
         last = wid
     with torch.no_grad():
-        out = model(input_ids=input_ids, attention_mask=attention_mask)
-        logits = out.logits[0]
-    vectors = predict_vectors(logits, attention_mask[0], begin_tokens, DICT_INTERVALS, VEC_LEN)
     rows = []
     vec_i = 0
-    seen_word_ids = set()
-    for i, wid in enumerate(word_ids):
-        if wid is None:
-            continue
-        if begin_tokens[i] != 1:
-            continue
-        if wid in seen_word_ids:
             continue
-        seen_word_ids.add(wid)
         word = tokens[wid] if wid < len(tokens) else "<UNK>"
         vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
         rows.append({"word": word, "vec": vec.int().tolist()})
         vec_i += 1
     return rows
-def render(rows_state, lang_choice: str):
-    lang = "fo" if lang_choice == "fo" else "en"
-    headers_main = [f"{UI[lang]['word']}", f"{UI[lang]['tag']}", f"{UI[lang]['analysis']}"]
-    headers_exp = [f"{UI[lang]['word']}", f"{UI[lang]['tag']}", f"{UI[lang]['expanded']}"]
-    main_rows = []
-    exp_rows = []
-    for r in (rows_state or []):
         vec = torch.tensor(r["vec"])
         tag = vector_to_tag(vec)
-        main_rows.append([r["word"], tag, analysis_text(vec, lang)])
-        exp_rows.append([r["word"], tag, expanded_text(vec, lang)])
-    main_html = rows_to_table_html(headers_main, main_rows, small=False)
-    exp_html = rows_to_table_html(headers_exp, exp_rows, small=True)
-    legend_md = build_legend(lang)
-    return main_html, exp_html, legend_md
 # ----------------------------
-# Gradio UI (compact + user-friendly)
 # ----------------------------
 theme = gr.themes.Soft()
 with gr.Blocks(theme=theme, css=CSS, title="BRAGD-markarin") as demo:
     with gr.Row(equal_height=True):
-        with gr.Column(scale=2, min_width=240):
             gr.Markdown(
-                f"## {UI['fo']['title']}\n"
-                f"{UI['fo']['inst']}\n\n"
-                f"**{UI['fo']['model']}** `{MODEL_ID}`",
-                elem_id="header_md"
             )
-        with gr.Column(scale=5, min_width=420):
-            inp = gr.Textbox(lines=5, label=None, placeholder="Skriv her… / Type here…")
-            btn = gr.Button("Marka / Tag", variant="primary")
-    # Results header row with language picker on the far right
-    with gr.Row(equal_height=True, elem_id="results_header"):
-        with gr.Column(scale=5):
-            res_title = gr.Markdown(f"### {UI['fo']['results']} / {UI['en']['results']}")
-        with gr.Column(scale=1, min_width=170):
-            lang = gr.Dropdown(
-                choices=[("Føroyskt", "fo"), ("English", "en")],
-                value="fo",
-                label=None,
-                interactive=True,
-                filterable=False,
-                container=False,
-                elem_id="lang_dd",
             )
     state = gr.State([])
-    out_main = gr.HTML()
     with gr.Accordion("Útgreinað marking / Expanded tags", open=False):
-        out_expanded = gr.HTML()
-    with gr.Accordion("Markingaryvirlit / Tag legend", open=False):
-        out_legend = gr.Markdown(build_legend("fo"))
     def on_tag(sentence, lang_choice):
         rows = run_model(sentence)
-        main_html, exp_html, legend_md = render(rows, lang_choice)
-        return rows, main_html, exp_html, legend_md
     def on_lang(rows, lang_choice):
-        main_html, exp_html, legend_md = render(rows, lang_choice)
-        return main_html, exp_html, legend_md
-    btn.click(on_tag, inputs=[inp, lang], outputs=[state, out_main, out_expanded, out_legend])
-    lang.change(on_lang, inputs=[state, lang], outputs=[out_main, out_expanded, out_legend])
 if __name__ == "__main__":
     demo.launch()

+import os, re, string, json
 from collections import defaultdict
 import gradio as gr
 import torch
 import numpy as np
+import pandas as pd
 from transformers import AutoTokenizer, AutoModelForTokenClassification
 # ----------------------------
 MODEL_ID = "Setur/BRAGD"
 TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"   # must match model labels
 LABELS_FILEPATH = "tag_labels.json"           # add to repo root (FO+EN labels)
+HF_TOKEN = os.getenv("BRAGD")                 # Space secret
 if not HF_TOKEN:
     raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
     (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
 )
+GROUP_ORDER = ["subcategory","gender","number","case","article","proper","degree","declension","mood","voice","tense","person","definiteness"]
+# You said Subcategory B doesn't exist and will be deleted from the CSV:
 HIDE_CODES = {"subcategory": {"B"}}
+# ----------------------------
+# UI text
+# ----------------------------
 UI = {
+    "fo": {"w":"Orð", "t":"Mark", "s":"Útgreining", "m":"Útgreinað marking"},
+    "en": {"w":"Word","t":"Tag", "s":"Analysis", "m":"Expanded tags"},
 }
+MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
+# Theme color: #89AFA9 (+ close shades) + system font
+CSS = """
 :root{
   --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
   --primary-100:#E1ECEA; --primary-200:#C6DAD6;
 }
+body, .gradio-container, .prose, .markdown, textarea, input, select, button, table{
+  font-family:-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, "Noto Sans", sans-serif !important;
 }
+.gr-button-primary, button.primary, .primary{
+  background:var(--primary-500)!important; border-color:var(--primary-600)!important; color:#0b1b19!important;
+}
+.gr-button-primary:hover, button.primary:hover, .primary:hover{ background:var(--primary-600)!important; }
 a{ color:var(--primary-700)!important; }
+/* Dataframe column wrapping: keep Orð + Mark on one line */
+.gr-dataframe table td:nth-child(1),
+.gr-dataframe table th:nth-child(1){
+  white-space: nowrap !important;
+  width: 18% !important;
 }
+.gr-dataframe table td:nth-child(2),
+.gr-dataframe table th:nth-child(2){
+  white-space: nowrap !important;
+  width: 18% !important;
+  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
 }
+.gr-dataframe table td:nth-child(3),
+.gr-dataframe table th:nth-child(3){
+  white-space: normal !important;
+  width: 64% !important;
 }
+/* Make the language dropdown compact */
+#lang_dd { max-width: 170px; }
+/* Slightly smaller primary button */
+.gr-button-primary{ padding: 0.35rem 0.85rem !important; font-size: 0.95rem !important; }
 """
 # ----------------------------
+# Tokenization
 # ----------------------------
 def simp_tok(sentence: str):
     return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
+# ----------------------------
+# CSV mapping
+# ----------------------------
+def load_tag_mappings(path: str):
+    df = pd.read_csv(path)
+    feature_cols = list(df.columns[1:])
+    tag_to_features = {row["Original Tag"]: row[1:].values.astype(int) for _, row in df.iterrows()}
+    features_to_tag = {tuple(row[1:].values.astype(int)): row["Original Tag"] for _, row in df.iterrows()}
     return tag_to_features, features_to_tag, len(feature_cols), feature_cols
 def group_from_col(col: str):
+    if col == "Article": return ("article","A")
+    if col.startswith("No-Article "): return ("article", col.split()[-1])
+    if col == "Proper Noun": return ("proper","P")
+    if col.startswith("Not-Proper-Noun "): return ("proper", col.split()[-1])
     prefixes = [
+        ("Word Class ","word_class"),
+        ("Subcategory ","subcategory"), ("No-Subcategory ","subcategory"),
+        ("Gender ","gender"), ("No-Gender ","gender"),
+        ("Number ","number"), ("No-Number ","number"),
+        ("Case ","case"), ("No-Case ","case"),
+        ("Degree ","degree"), ("No-Degree ","degree"),
+        ("Declension ","declension"), ("No-Declension ","declension"),
+        ("Mood ","mood"),
+        ("Voice ","voice"), ("No-Voice ","voice"),
+        ("Tense ","tense"), ("No-Tense ","tense"),
+        ("Person ","person"), ("No-Person ","person"),
+        ("Definite ","definiteness"), ("Indefinite ","definiteness"),
     ]
+    for p,g in prefixes:
         if col.startswith(p):
             return (g, col.split()[-1])
+    return (None,None)
+# ----------------------------
+# Decode helpers (your logic)
+# ----------------------------
 def process_tag_features(tag_to_features: dict, intervals):
+    arrs = [np.array(tpl) for tpl in set(tuple(a) for a in tag_to_features.values())]
+    wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
+    out = {}
+    for wt,labels in wt_masks.items():
         if not labels:
+            out[wt]=[]
             continue
         sum_labels = np.sum(np.array(labels), axis=0)
+        out[wt] = [iv for iv in intervals if np.sum(sum_labels[iv[0]:iv[1]+1]) != 0]
+    return out
+def predict_vectors(logits, attention_mask, begin_tokens, dict_intervals, vec_len):
     softmax = torch.nn.Softmax(dim=0)
     vectors = []
     for idx in range(len(logits)):
+        if attention_mask[idx].item()!=1 or begin_tokens[idx]!=1:
             continue
+        pred = logits[idx]
         vec = torch.zeros(vec_len, device=logits.device)
+        wt = torch.argmax(softmax(pred[0:15])).item()
+        vec[wt]=1
+        for (a,b) in dict_intervals.get(wt, []):
+            seg = pred[a:b+1]
+            k = torch.argmax(softmax(seg)).item()
+            vec[a+k]=1
         vectors.append(vec)
     return vectors
 # ----------------------------
+# Load labels (FO/EN)
 # ----------------------------
 with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
     LABELS = json.load(f)
+def label_for(lang: str, group: str, wc: str, code: str) -> str:
+    lang = "fo" if lang=="fo" else "en"
     by_wc = LABELS.get(lang, {}).get("by_word_class", {})
     glob = LABELS.get(lang, {}).get("global", {})
+    if wc and wc in by_wc and code in by_wc[wc].get(group, {}):
+        return by_wc[wc][group][code]
     return glob.get(group, {}).get(code, "")
+def clean_label(s: str) -> str:
+    s = (s or "").strip()
+    s = re.sub(r"\s+", " ", s)
+    s = s.strip(" -;,:")
+    return s
 # ----------------------------
+# Load model + mapping
 # ----------------------------
 tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device); model.eval()
 if hasattr(model, "config") and hasattr(model.config, "num_labels"):
     if model.config.num_labels != VEC_LEN:
+        raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
 DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
+# Build GROUPS from CSV headers
+GROUPS = defaultdict(list)  # group -> [(idx, code, colname)]
+for i,col in enumerate(FEATURE_COLS):
+    g,code = group_from_col(col)
     if g and code not in HIDE_CODES.get(g, set()):
         GROUPS[g].append((i, code, col))
     return features_to_tag.get(tuple(vec.int().tolist()), "Unknown Tag")
 def wc_code(vec: torch.Tensor) -> str:
+    for idx,code,_ in GROUPS["word_class"]:
+        if int(vec[idx].item())==1:
             return code
     return ""
 def group_code(vec: torch.Tensor, group: str) -> str:
     hidden = HIDE_CODES.get(group, set())
+    for idx,code,_ in GROUPS.get(group, []):
         if code in hidden:
             continue
+        if int(vec[idx].item())==1:
             return code
     return ""
 # ----------------------------
+# Display rules
 # ----------------------------
+HIDE_IN_ANALYSIS = {
+    # Word class D: hide "stýrir falli" / "stýrir ikki falli" in Analysis
+    ("D", "subcategory", "G"),
+    ("D", "subcategory", "N"),
+}
+VOICE_ANALYSIS = {
+    "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
+    "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
+}
 def analysis_text(vec: torch.Tensor, lang: str) -> str:
     """
     Útgreining / Analysis:
+    - plain words (no letters/hyphens)
+    - pronouns: start at subcategory, not word class
+    - DGd: show only fyriseting/preposition
+    - supine: show only supine + voice (drop verb/number/tense/person etc.)
     """
+    lang = "fo" if lang=="fo" else "en"
+    tag = vector_to_tag(vec)
     wc = wc_code(vec)
+    # DGd override
+    if tag == "DGd":
+        return "fyriseting" if lang=="fo" else "preposition"
+    mood = group_code(vec, "mood")
+    if mood == "U":  # luttøkuháttur / supine
+        sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang=="fo" else "supine")
+        vcode = group_code(vec, "voice") or "v"
+        vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
+        return f"{clean_label(sup)}, {clean_label(vlabel)}"
+    parts = []
+    # Pronouns + conjunctions: subcategory already carries the head noun (fornavn / sambindingarorð)
+    if wc in {"P","C"}:
+        subc = group_code(vec, "subcategory")
+        subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
+        if subl:
+            parts.append(subl)
+    else:
+        wcl = clean_label(label_for(lang, "word_class", wc, wc) or wc)
+        if wcl:
+            parts.append(wcl)
     for g in GROUP_ORDER:
         c = group_code(vec, g)
         if not c:
             continue
+        if wc in {"P","C"} and g == "subcategory":
+            continue  # already added
+        if (wc, g, c) in HIDE_IN_ANALYSIS:
             continue
+        lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c) or ""
+        lbl = clean_label(lbl)
+        if not lbl:
             continue
+        if lbl not in parts:
+            parts.append(lbl)
+    return ", ".join(parts)
 def expanded_text(vec: torch.Tensor, lang: str) -> str:
     """
     Útgreinað marking / Expanded tags:
+    codes + labels (useful for debugging and linguists)
     """
+    lang = "fo" if lang=="fo" else "en"
     wc = wc_code(vec)
     parts = []
     return "; ".join([p for p in parts if p])
+def compute_codes_by_wc():
     codes = defaultdict(lambda: defaultdict(set))  # wc -> group -> set(code)
     for arr in tag_to_features.values():
         arr = np.array(arr)
         wc = None
+        for idx,code,_ in GROUPS["word_class"]:
+            if arr[idx]==1:
                 wc = code
                 break
         if not wc:
             continue
         for g in GROUP_ORDER:
+            hidden = HIDE_CODES.get(g, set())
+            for idx,code,_ in GROUPS.get(g, []):
+                if code in hidden:
                     continue
+                if arr[idx]==1:
                     codes[wc][g].add(code)
+    return codes
+CODES_BY_WC = compute_codes_by_wc()
def build_overview(lang: str) -> str:
    """Build the Markdown tag overview for the chosen UI language.

    For every word class in ``CODES_BY_WC`` a ``####`` heading is emitted,
    followed by one bold heading per feature group (in ``GROUP_ORDER``) and a
    bullet for each letter code recorded for that word class and group.

    Args:
        lang: ``"fo"`` for Faroese; any other value is treated as English.

    Returns:
        The overview as a single Markdown string, stripped of surrounding
        whitespace.
    """
    lang = "fo" if lang == "fo" else "en"
    title = "### Markingaryvirlit" if lang == "fo" else "### Tag Overview"

    # Group headings for the selected language. Picked once here instead of
    # rebuilding both language tables for every (word class, group) pair.
    group_names = {
        "fo": {
            "subcategory": "Undirflokkur", "gender": "Kyn", "number": "Tal", "case": "Fall",
            "article": "Bundni/óbundni", "proper": "Sernavn / felagsnavn", "degree": "Stig",
            "declension": "Bending", "mood": "Háttur", "voice": "Søgn", "tense": "Tíð",
            "person": "Persónur", "definiteness": "Bundni/óbundni",
        },
        "en": {
            "subcategory": "Subcategory", "gender": "Gender", "number": "Number", "case": "Case",
            "article": "Definiteness", "proper": "Proper/common noun", "degree": "Degree",
            "declension": "Declension", "mood": "Mood", "voice": "Voice", "tense": "Tense",
            "person": "Person", "definiteness": "Definiteness",
        },
    }[lang]

    lines = [title, ""]
    for wc in sorted(CODES_BY_WC):
        wcl = label_for(lang, "word_class", wc, wc) or ""
        lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
        for g in GROUP_ORDER:
            # .get() keeps the lookup from creating empty entries in the table.
            cs = sorted(CODES_BY_WC[wc].get(g, set()))
            if not cs:
                continue
            # Unknown groups fall back to their raw key as the heading.
            lines.append(f"**{group_names.get(g, g)}**")
            for c in cs:
                # Prefer the word-class-specific label, then the generic one.
                lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
                lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
    return "\n".join(lines).strip()
 # ----------------------------
+# Inference
 # ----------------------------
 def run_model(sentence: str):
     # Tag one sentence: returns a list of {"word": str, "vec": list[int]}
     # rows, one per word, or [] when the input is empty or yields no tokens.
     s = (sentence or "").strip()
     if not s:
         return []
     tokens = simp_tok(s)
     if not tokens:
         return []
     # NOTE(review): `enc` and `input_ids` are used below but are not assigned
     # in the lines visible here; presumably the tokenizer call that produces
     # them belongs at this point - confirm against the full file.
     attention_mask = enc["attention_mask"].to(device)
     word_ids = enc.word_ids(batch_index=0)
     # begin[i] is 1 where sub-token i starts a new word, and 0 for positions
     # without a word id (None) and for continuation pieces of the same word.
+    begin = []
     last = None
     for wid in word_ids:
         if wid is None:
+            begin.append(0)
         elif wid != last:
+            begin.append(1)
         else:
+            begin.append(0)
         last = wid
     with torch.no_grad():
         # [0] selects the first (only) item of the batch.
+        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits[0]
     # presumably one feature vector per word-initial position - verify in
     # predict_vectors, which is defined elsewhere.
+    vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)
     rows = []
     vec_i = 0
     # `seen` ensures each word id produces at most one row.
+    seen = set()
+    for i,wid in enumerate(word_ids):
+        if wid is None or begin[i]!=1 or wid in seen:
             continue
+        seen.add(wid)
         # Fallbacks: "<UNK>" when the word id is outside `tokens`, and an
         # all-zero vector when `vectors` has fewer entries than words.
         word = tokens[wid] if wid < len(tokens) else "<UNK>"
         vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
         # Store the vector as a plain list of ints rather than a tensor.
         rows.append({"word": word, "vec": vec.int().tolist()})
         vec_i += 1
     return rows
def render(rows_state, lang: str):
    """Turn stored tagging rows into the two result tables plus the overview.

    Args:
        rows_state: List of row dicts with ``"word"`` and ``"vec"`` keys; may
            be empty or ``None``.
        lang: ``"fo"`` for Faroese; any other value is treated as English.

    Returns:
        A 3-tuple ``(main_table, expanded_table, overview_markdown)``. Both
        tables are pandas DataFrames whose three column headers come from
        ``UI[lang]``; with no rows they are empty but keep their headers.
    """
    lang = "fo" if lang == "fo" else "en"
    labels = UI[lang]
    df_cols = [labels["w"], labels["t"], labels["s"]]
    dfm_cols = [labels["w"], labels["t"], labels["m"]]

    # Nothing to show yet: header-only tables.
    if not rows_state:
        return (
            pd.DataFrame(columns=df_cols),
            pd.DataFrame(columns=dfm_cols),
            build_overview(lang),
        )

    main_rows = []
    expanded_rows = []
    for row in rows_state:
        word = row["word"]
        features = torch.tensor(row["vec"])
        tag = vector_to_tag(features)
        main_rows.append([word, tag, analysis_text(features, lang)])
        expanded_rows.append([word, tag, expanded_text(features, lang)])

    main_table = pd.DataFrame(main_rows, columns=df_cols)
    expanded_table = pd.DataFrame(expanded_rows, columns=dfm_cols)
    return main_table, expanded_table, build_overview(lang)
 # ----------------------------
+# Gradio UI
 # ----------------------------
 theme = gr.themes.Soft()
 with gr.Blocks(theme=theme, css=CSS, title="BRAGD-markarin") as demo:
+    # Compact header: info left, input right
     with gr.Row(equal_height=True):
+        with gr.Column(scale=1, min_width=280):
             gr.Markdown(
+                "### BRAGD-markarin\n"
+                "Skriv ein setning og fá hann markaðan.\n\n"
+                f"**Myndil / Model:** [{MODEL_ID}]({MODEL_LINK})"
             )
+        with gr.Column(scale=2):
+            inp = gr.Textbox(
+                lines=5,
+                placeholder="Skriva her ... / Type here ...",
+                show_label=False,
             )
+            btn = gr.Button("Marka / Tag", variant="primary")
     # Holds the rows returned by run_model so that a language switch can
     # re-render them without running the model again (see on_lang).
     state = gr.State([])
+    # Results header row (components hide until first run)
+    with gr.Row():
+        results_title = gr.Markdown("### Úrslit / Results", visible=False)
+        lang = gr.Dropdown(
+            choices=[("Føroyskt","fo"), ("English","en")],
+            value="fo",
+            show_label=False,
+            filterable=False,
+            elem_id="lang_dd",
+            visible=False,
+        )
     # Main results table; starts hidden with the Faroese column headers.
+    out_df = gr.Dataframe(
+        value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["s"]]),
+        wrap=True,
+        interactive=False,
+        show_label=False,
+        row_count=(0, "fixed"),
+        col_count=(3, "fixed"),
+        visible=False,
+    )
     with gr.Accordion("Útgreinað marking / Expanded tags", open=False):
+        out_mean_df = gr.Dataframe(
+            value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["m"]]),
+            wrap=True,
+            interactive=False,
+            show_label=False,
+            row_count=(0, "fixed"),
+            col_count=(3, "fixed"),
+            visible=False,
+        )
+    with gr.Accordion("Markingaryvirlit / Tag Overview", open=False):
+        overview_md = gr.Markdown("", visible=False)
     # Button handler: runs the model, renders the results and makes the
     # hidden result components visible. The returned tuple must stay in the
     # same order as the `outputs` list passed to btn.click below.
     def on_tag(sentence, lang_choice):
         rows = run_model(sentence)
+        df_main, df_mean, overview = render(rows, lang_choice)
+        return (
+            rows,
+            gr.update(value=df_main, visible=True),
+            gr.update(value=df_mean, visible=True),
+            gr.update(value=overview, visible=True),
+            gr.update(visible=True),   # results_title
+            gr.update(visible=True),   # lang
+        )
     # Language handler: re-renders the stored rows in the newly selected
     # language; visibility is left unchanged and the model is not rerun.
     def on_lang(rows, lang_choice):
+        df_main, df_mean, overview = render(rows, lang_choice)
+        return (
+            gr.update(value=df_main),
+            gr.update(value=df_mean),
+            gr.update(value=overview),
+        )
+    btn.click(
+        on_tag,
+        inputs=[inp, lang],
+        outputs=[state, out_df, out_mean_df, overview_md, results_title, lang],
+        queue=False,
+    )
+    lang.change(
+        on_lang,
+        inputs=[state, lang],
+        outputs=[out_df, out_mean_df, overview_md],
+        queue=False,
+    )
 # Launch the app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()

tag_labels.json CHANGED Viewed

@@ -32,14 +32,16 @@
         "a": "indefinite"
       },
       "proper": {
-        "r": "not proper noun",
         "P": "proper noun"
       },
       "degree": {
         "d": "no degree"
       },
       "declension": {
-        "e": "no declension"
       },
       "subcategory": {
         "s": "no subcategory"
@@ -82,7 +84,7 @@
           "G": "genitive"
         },
         "article": {
-          "A": "definite"
         },
         "proper": {
           "P": "Proper Noun"
@@ -123,9 +125,9 @@
           "A": "absolute superlative"
         },
         "declension": {
-          "S": "strong declension",
-          "W": "weak declension",
-          "e": "no declension"
         },
         "gender": {
           "M": "masculine",
@@ -204,7 +206,7 @@
       },
       "V": {
         "word_class": {
-          "V": "verb"
         },
         "mood": {
           "I": "infinitive",
@@ -233,16 +235,16 @@
       },
       "L": {
         "word_class": {
-          "L": "past participle"
         },
         "voice": {
           "A": "active",
           "M": "mediopassive"
         },
         "declension": {
-          "S": "strong declension",
-          "W": "weak declension",
-          "e": "no declension"
         },
         "gender": {
           "M": "masculine",
@@ -361,14 +363,16 @@
         "a": "óbundið"
       },
       "proper": {
-        "r": "ikki sernavn",
         "P": "sernavn"
       },
       "degree": {
         "d": "eingin stigbending"
       },
       "declension": {
-        "e": "eingin sterk/veik bending"
       },
       "subcategory": {
         "s": "eingin undirflokkur"
@@ -452,8 +456,8 @@
           "A": "absolutt hástig"
         },
         "declension": {
-          "S": "sterk bending",
-          "W": "veik bending",
           "e": "eingin sterk/veik bending"
         },
         "gender": {
@@ -490,9 +494,9 @@
           "N": "hvørkikyn"
         },
         "person": {
-          "1": "1. persónur",
-          "2": "2. persónur",
-          "3": "3. persónur"
         },
         "number": {
           "S": "eintal",
@@ -573,8 +577,8 @@
           "M": "miðalsøgn"
         },
         "declension": {
-          "S": "sterk bending",
-          "W": "veik bending",
           "e": "eingin sterk/veik bending"
         },
         "gender": {
@@ -648,7 +652,7 @@
           "K": "teknseting"
         },
         "subcategory": {
-          "E": "endi av setningi",
           "C": "komma",
           "Q": "gásareyga",
           "O": "annað"

         "a": "indefinite"
       },
       "proper": {
+        "r": "common noun",
         "P": "proper noun"
       },
       "degree": {
         "d": "no degree"
       },
       "declension": {
+        "e": "no declension",
+        "S": "strong declension",
+        "W": "weak declension"
       },
       "subcategory": {
         "s": "no subcategory"
           "G": "genitive"
         },
         "article": {
+          "A": "with suffixed definite article"
         },
         "proper": {
           "P": "Proper Noun"
           "A": "absolute superlative"
         },
         "declension": {
+          "S": "strong",
+          "W": "weak",
+          "e": "no-declension"
         },
         "gender": {
           "M": "masculine",
       },
       "V": {
         "word_class": {
+          "V": "verb (except for participle)"
         },
         "mood": {
           "I": "infinitive",
       },
       "L": {
         "word_class": {
+          "L": "participle"
         },
         "voice": {
           "A": "active",
           "M": "mediopassive"
         },
         "declension": {
+          "S": "strong",
+          "W": "weak",
+          "e": "no-declension"
         },
         "gender": {
           "M": "masculine",
         "a": "óbundið"
       },
       "proper": {
+        "r": "felagsnavn",
         "P": "sernavn"
       },
       "degree": {
         "d": "eingin stigbending"
       },
       "declension": {
+        "e": "eingin sterk/veik bending",
+        "S": "sterk bending",
+        "W": "veik bending"
       },
       "subcategory": {
         "s": "eingin undirflokkur"
           "A": "absolutt hástig"
         },
         "declension": {
+          "S": "sterk",
+          "W": "veik",
           "e": "eingin sterk/veik bending"
         },
         "gender": {
           "N": "hvørkikyn"
         },
         "person": {
+          "1": "fyrsti persónur",
+          "2": "annar persónur",
+          "3": "triði persónur"
         },
         "number": {
           "S": "eintal",
           "M": "miðalsøgn"
         },
         "declension": {
+          "S": "sterk",
+          "W": "veik",
           "e": "eingin sterk/veik bending"
         },
         "gender": {
           "K": "teknseting"
         },
         "subcategory": {
+          "E": "setningsendi",
           "C": "komma",
           "Q": "gásareyga",
           "O": "annað"