unijoh committed on
Commit
21053de
·
verified ·
1 Parent(s): 6e15cef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -310
app.py CHANGED
@@ -11,43 +11,46 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification
11
  # Config
12
  # ----------------------------
13
  MODEL_ID = "Setur/BRAGD"
14
- TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
15
- LABELS_FILEPATH = "tag_labels.json"
16
- HF_TOKEN = os.getenv("BRAGD")
17
 
18
  if not HF_TOKEN:
19
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
20
  if not os.path.exists(LABELS_FILEPATH):
21
  raise RuntimeError(f"Missing {LABELS_FILEPATH}. Add it to the Space repo root.")
 
 
22
 
 
23
  INTERVALS = (
24
  (15, 29), (30, 33), (34, 36), (37, 41), (42, 43), (44, 45), (46, 50),
25
  (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
26
  )
27
 
28
- GROUP_ORDER = ["subcategory","gender","number","case","article","proper","degree","declension","mood","voice","tense","person","definiteness"]
29
- HIDE_CODES = {"subcategory": {"B"}} # Subcategory B to be removed
 
 
30
 
31
- UI = {
32
- "fo": {"w":"Orð", "t":"Mark", "s":"Útgreining", "m":"Útgreinað marking"},
33
- "en": {"w":"Word","t":"Tag", "s":"Analysis", "m":"Expanded tags"},
34
- }
35
 
36
  MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
37
 
38
- CSS = """:root{
 
 
 
 
39
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
40
  --primary-100:#E1ECEA; --primary-200:#C6DAD6;
41
- --page-bg:#f7f7f8;
42
  }
43
 
44
- /* Page background */
45
- html, body, .gradio-container{
46
- background: var(--page-bg) !important;
47
- }
48
  body, .gradio-container, .prose, .markdown, textarea, input, select, button, table{
49
  font-family:-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, "Noto Sans", sans-serif !important;
50
  }
 
51
  a{ color:var(--primary-700)!important; }
52
 
53
  /* Primary button (Marka/Tag) */
@@ -56,232 +59,93 @@ a{ color:var(--primary-700)!important; }
56
  border-color:var(--primary-600)!important;
57
  color:#0b1b19!important;
58
  }
59
- .gr-button-primary:hover, button.primary:hover, .primary:hover{ background:var(--primary-600)!important; }
60
- .gr-button-primary{ padding:0.35rem 0.85rem!important; font-size:0.95rem!important; }
61
-
62
- /* --- Keep the textbox exactly as-is: wrapper blends with page, textarea stays white --- */
63
- #input_col, #input_col *{
64
- background: transparent !important;
65
- }
66
- #input_col .gr-block, #input_col .gr-panel, #input_col .gr-box, #input_col .gr-group, #input_col .gr-form{
67
- background: transparent !important;
68
- box-shadow:none !important;
69
- border:0 !important;
70
- }
71
- #input_box, #input_box > div, #input_box .wrap, #input_box .container{
72
- background: transparent !important;
73
- box-shadow:none !important;
74
- border:0 !important;
75
- }
76
- #input_box textarea{
77
- background:#ffffff !important;
78
  }
79
 
80
- /* Dataframe columns: keep Orð + Mark single-line */
81
- .gr-dataframe table td:nth-child(1), .gr-dataframe table th:nth-child(1){
82
- white-space: nowrap !important; width: 18% !important;
 
 
83
  }
84
- .gr-dataframe table td:nth-child(2), .gr-dataframe table th:nth-child(2){
85
- white-space: nowrap !important; width: 18% !important;
 
 
86
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
87
  }
88
- .gr-dataframe table td:nth-child(3), .gr-dataframe table th:nth-child(3){
89
- white-space: normal !important; width: 64% !important;
 
 
90
  }
91
 
92
- /* Selected = match Marka/Tag exactly */
93
- /* Hover = subtle */
94
- /* Keep selected button color on hover; only lighten UNSELECTED on hover */
95
- /* Push language buttons fully to the right */
96
- #results_hdr > .gr-markdown{
97
- flex:1 1 auto !important;
98
- }
99
- /* Results header row: two-column layout, title left, toggle hard-right */
100
  #results_hdr{
101
- display:grid !important;
102
- grid-template-columns: 1fr auto !important;
103
- align-items:center !important;
104
- gap:12px !important;
105
- padding:0 !important;
106
- margin:0 !important;
107
- background:transparent !important;
108
- box-shadow:none !important;
109
- border:0 !important;
110
  }
111
- #results_hdr > .gr-column:first-child{ justify-self:start !important; }
112
- #results_hdr > .gr-column:last-child{ justify-self:end !important; }
113
 
114
- /* Language toggle (gr.Radio): style the LABEL as the button (robust across Gradio DOM variants) */
115
  .lang_toggle{
116
  background: transparent !important;
117
- justify-self:end !important;
118
- }
119
- .lang_toggle fieldset{
120
- border:0!important;
121
- padding:0!important;
122
- margin:0!important;
123
- background:transparent!important;
124
  }
125
  .lang_toggle .wrap{
126
- display:flex!important;
127
- gap:10px!important;
128
- background:transparent!important;
129
- padding:0!important;
130
- margin:0!important;
131
- }
132
- .lang_toggle input{
133
- display:none!important;
134
- }
135
-
136
- /* Kill any default Gradio "pill" styling inside */
137
- .lang_toggle label *{
138
- background:transparent!important;
139
- box-shadow:none!important;
140
- border:0!important;
141
- }
142
-
143
- /* The actual button */
144
- .lang_toggle label{
145
- display:inline-flex !important;
146
- align-items:center !important;
147
- justify-content:center !important;
148
- cursor:pointer !important;
149
- user-select:none !important;
150
-
151
- padding:0.35rem 0.85rem !important;
152
- font-size:0.95rem !important;
153
- border-radius:10px !important;
154
-
155
- border:1px solid var(--primary-600) !important;
156
- background: var(--primary-200) !important; /* inactive: lighter than #89AFA9 */
157
- color:#0b1b19 !important; /* black-ish */
158
- }
159
-
160
- /* Active/selected */
161
- .lang_toggle label:has(input:checked){
162
- background: #89AFA9 !important;
163
- border-color: var(--primary-600) !important;
164
- color:#0b1b19 !important;
165
- }
166
-
167
- /* Hover: show #89AFA9 (inactive becomes active color on hover) */
168
- .lang_toggle label:hover{
169
- background:#89AFA9 !important;
170
- border-color: var(--primary-600) !important;
171
- color:#0b1b19 !important;
172
  }
173
 
174
-
175
- /* Remove Gradio's default label styling completely */
176
- .lang_toggle label{
177
- background:transparent!important;
178
- border:0!important;
179
- padding:0!important;
180
- margin:0!important;
181
- box-shadow:none!important;
182
  }
183
 
184
- /* Single visible button layer */
185
  .lang_toggle label span{
186
  all: unset;
187
  display:inline-block;
188
  cursor:pointer;
189
  user-select:none;
190
- padding:0.35rem 0.85rem;
191
  font-size:0.95rem;
 
192
  border-radius:10px;
193
  border:1px solid var(--primary-600);
194
- background: transparent; /* same as page */
195
  color:#0b1b19;
196
- box-shadow:none!important;
197
  }
198
 
199
- /* Selected state (robust selectors) */
200
- .lang_toggle input:checked ~ span,
201
- .lang_toggle label:has(input:checked) span{
202
  background:var(--primary-500)!important;
203
  border-color:var(--primary-600)!important;
204
  color:#0b1b19!important;
205
  }
206
 
207
- /* Hover: only unselected gets light background */
208
- .lang_toggle label:hover input:not(:checked) ~ span,
209
- .lang_toggle label:hover:not(:has(input:checked)) span{
210
- background:var(--primary-200)!important;
211
- }
212
- /* --- Language buttons (robust: 4 real buttons, show/hide to indicate active) --- */
213
- #results_hdr{
214
- display:grid !important;
215
- grid-template-columns: 1fr auto !important;
216
- align-items:center !important;
217
- gap:12px !important;
218
- padding:0 !important;
219
- margin:0 !important;
220
- background:transparent !important;
221
- box-shadow:none !important;
222
- border:0 !important;
223
- }
224
- #lang_buttons{
225
- display:flex !important;
226
- gap:10px !important;
227
- justify-content:flex-end !important;
228
- align-items:center !important;
229
- flex-wrap:nowrap !important;
230
- }
231
- #lang_buttons .gr-button, #lang_buttons button{
232
- padding:0.35rem 0.85rem !important;
233
- font-size:0.95rem !important;
234
- border-radius:10px !important;
235
- }
236
-
237
- /* Inactive: lighter than #89AFA9, black text */
238
- #lang_fo_off, #lang_en_off{
239
- background:var(--primary-200) !important;
240
- border-color:var(--primary-600) !important;
241
- color:#0b1b19 !important;
242
- }
243
- /* Hover inactive -> active color (#89AFA9) */
244
- #lang_fo_off:hover, #lang_en_off:hover{
245
- background:var(--primary-500) !important;
246
- border-color:var(--primary-600) !important;
247
- color:#0b1b19 !important;
248
- }
249
- /* Active: ensure black text */
250
- #lang_fo_on, #lang_en_on{
251
- color:#0b1b19 !important;
252
- }
253
-
254
- /* Keep header transparent, but DON'T nuke button backgrounds */
255
- #results_hdr, #results_hdr > div{
256
- background:transparent !important;
257
- box-shadow:none !important;
258
- border:0 !important;
259
- }
260
-
261
- /* Prevent Gradio from stacking/stretching language buttons */
262
- #lang_buttons .gr-button, #lang_buttons button{
263
- width:auto !important;
264
- min-width:120px !important;
265
- flex:0 0 auto !important;
266
  }
 
267
 
268
- /* Language button colors */
269
- #lang_buttons .gr-button-primary, #lang_buttons button.primary{
270
- background:#89AFA9 !important;
271
- border-color:#6F9992 !important;
272
- color:#0b1b19 !important;
273
- }
274
- #lang_buttons .gr-button-secondary, #lang_buttons button.secondary{
275
- background:#C6DAD6 !important; /* light green */
276
- border-color:#6F9992 !important;
277
- color:#0b1b19 !important;
278
- }
279
- #lang_buttons .gr-button-secondary:hover, #lang_buttons button.secondary:hover{
280
- background:#89AFA9 !important;
281
- border-color:#6F9992 !important;
282
- color:#0b1b19 !important;
283
  }
284
- """
285
 
286
  # ----------------------------
287
  # Tokenization
@@ -324,6 +188,9 @@ def group_from_col(col: str):
324
  return (g, col.split()[-1])
325
  return (None,None)
326
 
 
 
 
327
  def process_tag_features(tag_to_features: dict, intervals):
328
  arrs = [np.array(tpl) for tpl in set(tuple(a) for a in tag_to_features.values())]
329
  wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
@@ -342,19 +209,23 @@ def predict_vectors(logits, attention_mask, begin_tokens, dict_intervals, vec_le
342
  for idx in range(len(logits)):
343
  if attention_mask[idx].item()!=1 or begin_tokens[idx]!=1:
344
  continue
 
345
  pred = logits[idx]
346
  vec = torch.zeros(vec_len, device=logits.device)
 
347
  wt = torch.argmax(softmax(pred[0:15])).item()
348
  vec[wt]=1
 
349
  for (a,b) in dict_intervals.get(wt, []):
350
  seg = pred[a:b+1]
351
  k = torch.argmax(softmax(seg)).item()
352
  vec[a+k]=1
 
353
  vectors.append(vec)
354
  return vectors
355
 
356
  # ----------------------------
357
- # Load labels
358
  # ----------------------------
359
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
360
  LABELS = json.load(f)
@@ -370,7 +241,7 @@ def label_for(lang: str, group: str, wc: str, code: str) -> str:
370
  def clean_label(s: str) -> str:
371
  s = (s or "").strip()
372
  s = re.sub(r"\s+", " ", s)
373
- return s.strip(" -;,:").strip()
374
 
375
  # ----------------------------
376
  # Load model + mapping
@@ -379,15 +250,18 @@ tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS
379
 
380
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
381
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
 
382
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
383
  model.to(device); model.eval()
384
 
385
- if hasattr(model, "config") and hasattr(model.config, "num_labels") and model.config.num_labels != VEC_LEN:
386
- raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
 
387
 
388
  DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
389
 
390
- GROUPS = defaultdict(list)
 
391
  for i,col in enumerate(FEATURE_COLS):
392
  g,code = group_from_col(col)
393
  if g and code not in HIDE_CODES.get(g, set()):
@@ -411,7 +285,14 @@ def group_code(vec: torch.Tensor, group: str) -> str:
411
  return code
412
  return ""
413
 
414
- HIDE_IN_ANALYSIS = {("D","subcategory","G"), ("D","subcategory","N")}
 
 
 
 
 
 
 
415
  VOICE_ANALYSIS = {
416
  "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
417
  "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
@@ -422,17 +303,20 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
422
  tag = vector_to_tag(vec)
423
  wc = wc_code(vec)
424
 
 
425
  if tag == "DGd":
426
  return "fyriseting" if lang=="fo" else "preposition"
427
 
428
  mood = group_code(vec, "mood")
429
- if mood == "U":
430
  sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang=="fo" else "supine")
431
  vcode = group_code(vec, "voice") or "v"
432
  vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
433
  return f"{clean_label(sup)}, {clean_label(vlabel)}"
434
 
435
  parts = []
 
 
436
  if wc in {"P","C"}:
437
  subc = group_code(vec, "subcategory")
438
  subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
@@ -451,8 +335,12 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
451
  continue
452
  if (wc, g, c) in HIDE_IN_ANALYSIS:
453
  continue
454
- lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
455
- if lbl and lbl not in parts:
 
 
 
 
456
  parts.append(lbl)
457
 
458
  return ", ".join(parts)
@@ -461,20 +349,24 @@ def expanded_text(vec: torch.Tensor, lang: str) -> str:
461
  lang = "fo" if lang=="fo" else "en"
462
  wc = wc_code(vec)
463
  parts = []
 
464
  wc_lbl = label_for(lang, "word_class", wc, wc)
465
  parts.append(f"{wc} – {wc_lbl}" if wc_lbl else wc)
 
466
  for g in GROUP_ORDER:
467
  c = group_code(vec, g)
468
  if not c:
469
  continue
470
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
471
  parts.append(f"{c} – {lbl}" if lbl else c)
 
472
  return "; ".join([p for p in parts if p])
473
 
474
  def compute_codes_by_wc():
475
- codes = defaultdict(lambda: defaultdict(set))
476
  for arr in tag_to_features.values():
477
  arr = np.array(arr)
 
478
  wc = None
479
  for idx,code,_ in GROUPS["word_class"]:
480
  if arr[idx]==1:
@@ -482,6 +374,7 @@ def compute_codes_by_wc():
482
  break
483
  if not wc:
484
  continue
 
485
  for g in GROUP_ORDER:
486
  hidden = HIDE_CODES.get(g, set())
487
  for idx,code,_ in GROUPS.get(g, []):
@@ -489,6 +382,7 @@ def compute_codes_by_wc():
489
  continue
490
  if arr[idx]==1:
491
  codes[wc][g].add(code)
 
492
  return codes
493
 
494
  CODES_BY_WC = compute_codes_by_wc()
@@ -497,29 +391,44 @@ def build_overview(lang: str) -> str:
497
  lang = "fo" if lang=="fo" else "en"
498
  title = "### Markayvirlit" if lang=="fo" else "### Tag Overview"
499
  lines = [title, ""]
 
500
  for wc in sorted(CODES_BY_WC.keys()):
501
  wcl = label_for(lang, "word_class", wc, wc) or ""
502
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
 
503
  for g in GROUP_ORDER:
504
  cs = sorted(CODES_BY_WC[wc].get(g, set()))
505
  if not cs:
506
  continue
 
507
  group_name = {
508
- "fo": {"subcategory":"Undirflokkur","gender":"Kyn","number":"Tal","case":"Fall","article":"Bundni/óbundni",
509
- "proper":"Sernavn / felagsnavn","degree":"Stig","declension":"Bending","mood":"Háttur","voice":"Søgn",
510
- "tense":"Tíð","person":"Persónur","definiteness":"Bundni/óbundni"},
511
- "en": {"subcategory":"Subcategory","gender":"Gender","number":"Number","case":"Case","article":"Definiteness",
512
- "proper":"Proper/common noun","degree":"Degree","declension":"Declension","mood":"Mood","voice":"Voice",
513
- "tense":"Tense","person":"Person","definiteness":"Definiteness"},
 
 
 
 
 
 
514
  }[lang].get(g, g)
 
515
  lines.append(f"**{group_name}**")
516
  for c in cs:
517
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
518
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
519
  lines.append("")
 
520
  lines.append("")
 
521
  return "\n".join(lines).strip()
522
 
 
 
 
523
  def run_model(sentence: str):
524
  s = (sentence or "").strip()
525
  if not s:
@@ -527,13 +436,24 @@ def run_model(sentence: str):
527
  tokens = simp_tok(s)
528
  if not tokens:
529
  return []
530
- enc = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=128,
531
- padding="max_length", truncation=True, return_attention_mask=True, return_tensors="pt")
 
 
 
 
 
 
 
 
 
 
532
  input_ids = enc["input_ids"].to(device)
533
  attention_mask = enc["attention_mask"].to(device)
534
  word_ids = enc.word_ids(batch_index=0)
535
 
536
- begin, last = [], None
 
537
  for wid in word_ids:
538
  if wid is None:
539
  begin.append(0)
@@ -548,7 +468,9 @@ def run_model(sentence: str):
548
 
549
  vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)
550
 
551
- rows, vec_i, seen = [], 0, set()
 
 
552
  for i,wid in enumerate(word_ids):
553
  if wid is None or begin[i]!=1 or wid in seen:
554
  continue
@@ -563,143 +485,117 @@ def render(rows_state, lang: str):
563
  lang = "fo" if lang=="fo" else "en"
564
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
565
  dfm_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["m"]]
 
566
  if not rows_state:
567
- return (pd.DataFrame(columns=df_cols), pd.DataFrame(columns=dfm_cols), build_overview(lang))
 
 
 
568
  out_main, out_mean = [], []
569
  for r in rows_state:
570
  vec = torch.tensor(r["vec"])
571
  tag = vector_to_tag(vec)
572
  out_main.append([r["word"], tag, analysis_text(vec, lang)])
573
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
574
- return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
575
 
 
 
 
 
 
 
 
 
 
576
  theme = gr.themes.Soft()
577
 
578
- with gr.Blocks(theme=theme, css=CSS, title="Marka") as demo:
 
579
  with gr.Row(equal_height=True):
580
- with gr.Column(scale=2, elem_id="input_col"):
581
- inp = gr.Textbox(lines=6, placeholder="Skriva her ... / Type here ...", show_label=False, elem_id="input_box")
 
 
 
 
582
  with gr.Column(scale=1, min_width=320):
583
  gr.Markdown(
584
- "## Marka\n"
585
  "Skriv ein setning í kassan og fá hann markaðan.\n\n"
586
  f"Myndil / Model: [{MODEL_ID}]({MODEL_LINK})"
587
  )
588
  btn = gr.Button("Marka / Tag", variant="primary")
589
 
590
  state = gr.State([])
591
- lang_state = gr.State("fo")
592
-
593
- # Hide results header + toggle until Tag
594
- results_hdr = gr.Row(elem_id="results_hdr", visible=True)
595
- with results_hdr:
596
- results_title = gr.Markdown("### Úrslit / Results")
597
- with gr.Row(elem_id="lang_buttons"):
598
- btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=True)
599
- btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
600
- btn_lang_en_on = gr.Button("English", variant="primary", elem_id="lang_en_on", visible=False)
601
- btn_lang_en_off = gr.Button("English", variant="secondary", elem_id="lang_en_off", visible=True)
602
 
603
  out_df = gr.Dataframe(
604
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["s"]]),
605
- wrap=True, interactive=False, show_label=False,
606
- row_count=(0, "fixed"), col_count=(3, "fixed"),
 
 
 
607
  visible=False,
608
  )
609
 
610
- expanded_acc = gr.Accordion("Útgreinað marking / Expanded tags", open=False, visible=False)
611
- with expanded_acc:
612
  out_mean_df = gr.Dataframe(
613
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["m"]]),
614
- wrap=True, interactive=False, show_label=False,
615
- row_count=(0, "fixed"), col_count=(3, "fixed"),
 
 
 
 
616
  )
617
 
618
- overview_acc = gr.Accordion("Markayvirlit / Tag Overview", open=False, visible=True)
619
- with overview_acc:
620
- overview_md = gr.Markdown(build_overview("fo"))
621
 
622
- def on_tag(sentence, lang_current):
623
  rows = run_model(sentence)
624
- df_main, df_mean, overview = render(rows, lang_current)
625
-
626
- show_fo = (lang_current == "fo")
627
- show_en = (lang_current == "en")
628
-
629
  return (
630
  rows,
631
  gr.update(value=df_main, visible=True),
632
  gr.update(value=df_mean),
633
  gr.update(value=overview),
634
  gr.update(visible=True), # expanded_acc
635
- # results_hdr is always visible now
636
- gr.update(visible=show_fo), # fo_on
637
- gr.update(visible=not show_fo), # fo_off
638
- gr.update(visible=show_en), # en_on
639
- gr.update(visible=not show_en), # en_off
640
- lang_current,
641
  )
642
 
643
- def on_set_lang(rows, lang_value):
644
- df_main, df_mean, overview = render(rows, lang_value)
645
-
646
- show_fo = (lang_value == "fo")
647
- show_en = (lang_value == "en")
648
-
649
  return (
650
- lang_value,
651
  gr.update(value=df_main),
652
  gr.update(value=df_mean),
653
  gr.update(value=overview),
654
- gr.update(visible=show_fo),
655
- gr.update(visible=not show_fo),
656
- gr.update(visible=show_en),
657
- gr.update(visible=not show_en),
658
  )
659
 
660
- def on_set_fo(rows):
661
- return on_set_lang(rows, "fo")
662
-
663
- def on_set_en(rows):
664
- return on_set_lang(rows, "en")
665
-
666
  btn.click(
667
  on_tag,
668
- inputs=[inp, lang_state],
669
- outputs=[state, out_df, out_mean_df, overview_md, expanded_acc,
670
- btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state],
671
  queue=False,
672
  )
673
 
674
- # Language switch (does NOT rerun the model; just re-renders existing rows)
675
- btn_lang_fo_on.click(
676
- on_set_fo,
677
- inputs=[state],
678
- outputs=[lang_state, out_df, out_mean_df, overview_md,
679
- btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off],
680
- queue=False,
681
- )
682
- btn_lang_fo_off.click(
683
- on_set_fo,
684
- inputs=[state],
685
- outputs=[lang_state, out_df, out_mean_df, overview_md,
686
- btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off],
687
- queue=False,
688
- )
689
- btn_lang_en_on.click(
690
- on_set_en,
691
- inputs=[state],
692
- outputs=[lang_state, out_df, out_mean_df, overview_md,
693
- btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off],
694
- queue=False,
695
- )
696
- btn_lang_en_off.click(
697
- on_set_en,
698
- inputs=[state],
699
- outputs=[lang_state, out_df, out_mean_df, overview_md,
700
- btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off],
701
  queue=False,
702
  )
703
 
704
  if __name__ == "__main__":
705
- demo.launch()
 
11
  # Config
12
  # ----------------------------
13
  MODEL_ID = "Setur/BRAGD"
14
+ TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must match model labels
15
+ LABELS_FILEPATH = "tag_labels.json" # add to repo root (FO+EN labels)
16
+ HF_TOKEN = os.getenv("BRAGD") # Space secret
17
 
18
  if not HF_TOKEN:
19
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
20
  if not os.path.exists(LABELS_FILEPATH):
21
  raise RuntimeError(f"Missing {LABELS_FILEPATH}. Add it to the Space repo root.")
22
+ if not os.path.exists(TAGS_FILEPATH):
23
+ raise RuntimeError(f"Missing {TAGS_FILEPATH}. Add it to the Space repo root.")
24
 
25
+ # These intervals must match the ones used in demo.py
26
  INTERVALS = (
27
  (15, 29), (30, 33), (34, 36), (37, 41), (42, 43), (44, 45), (46, 50),
28
  (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
29
  )
30
 
31
+ GROUP_ORDER = [
32
+ "subcategory","gender","number","case","article","proper",
33
+ "degree","declension","mood","voice","tense","person","definiteness"
34
+ ]
35
 
36
+ # Subcategory B does not exist and will be deleted from the CSV, so its code is hidden:
37
+ HIDE_CODES = {"subcategory": {"B"}}
 
 
38
 
39
  MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
40
 
41
+ # ----------------------------
42
+ # Theme + CSS
43
+ # ----------------------------
44
+ CSS = """
45
+ :root{
46
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
47
  --primary-100:#E1ECEA; --primary-200:#C6DAD6;
 
48
  }
49
 
 
 
 
 
50
  body, .gradio-container, .prose, .markdown, textarea, input, select, button, table{
51
  font-family:-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, "Noto Sans", sans-serif !important;
52
  }
53
+
54
  a{ color:var(--primary-700)!important; }
55
 
56
  /* Primary button (Marka/Tag) */
 
59
  border-color:var(--primary-600)!important;
60
  color:#0b1b19!important;
61
  }
62
+ .gr-button-primary:hover, button.primary:hover, .primary:hover{
63
+ background:var(--primary-600)!important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  }
65
 
66
+ /* Keep Orð + Mark on one line; allow Útgreining to wrap */
67
+ .gr-dataframe table td:nth-child(1),
68
+ .gr-dataframe table th:nth-child(1){
69
+ white-space: nowrap !important;
70
+ width: 18% !important;
71
  }
72
+ .gr-dataframe table td:nth-child(2),
73
+ .gr-dataframe table th:nth-child(2){
74
+ white-space: nowrap !important;
75
+ width: 18% !important;
76
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
77
  }
78
+ .gr-dataframe table td:nth-child(3),
79
+ .gr-dataframe table th:nth-child(3){
80
+ white-space: normal !important;
81
+ width: 64% !important;
82
  }
83
 
84
+ /* Results header layout: title left, language buttons right */
 
 
 
 
 
 
 
85
  #results_hdr{
86
+ display:grid;
87
+ grid-template-columns:1fr auto;
88
+ align-items:center;
89
+ gap:16px;
90
+ margin-top:10px;
 
 
 
 
91
  }
 
 
92
 
93
+ /* Remove any “box” background around the language selector */
94
  .lang_toggle{
95
  background: transparent !important;
96
+ border: none !important;
97
+ box-shadow: none !important;
98
+ padding: 0 !important;
99
+ margin: 0 !important;
 
 
 
100
  }
101
  .lang_toggle .wrap{
102
+ display:flex;
103
+ justify-content:flex-end;
104
+ gap:10px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  }
106
 
107
+ /* Hide native radio circles */
108
+ .lang_toggle input[type="radio"]{
109
+ display:none !important;
 
 
 
 
 
110
  }
111
 
112
+ /* Button-like labels */
113
  .lang_toggle label span{
114
  all: unset;
115
  display:inline-block;
116
  cursor:pointer;
117
  user-select:none;
118
+ padding:0.35rem 0.90rem;
119
  font-size:0.95rem;
120
+ font-weight:600;
121
  border-radius:10px;
122
  border:1px solid var(--primary-600);
123
+ background: var(--primary-200); /* inactive */
124
  color:#0b1b19;
 
125
  }
126
 
127
+ /* Selected */
128
+ .lang_toggle input:checked + span{
 
129
  background:var(--primary-500)!important;
130
  border-color:var(--primary-600)!important;
131
  color:#0b1b19!important;
132
  }
133
 
134
+ /* Hover */
135
+ .lang_toggle label:hover span{
136
+ background:var(--primary-500)!important;
137
+ border-color:var(--primary-600)!important;
138
+ color:#0b1b19!important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  }
140
+ """
141
 
142
+ # ----------------------------
143
+ # UI text
144
+ # ----------------------------
145
+ UI = {
146
+ "fo": {"w":"Orð", "t":"Mark", "s":"Útgreining", "m":"Útgreinað marking"},
147
+ "en": {"w":"Word","t":"Tag", "s":"Analysis", "m":"Expanded tags"},
 
 
 
 
 
 
 
 
 
148
  }
 
149
 
150
  # ----------------------------
151
  # Tokenization
 
188
  return (g, col.split()[-1])
189
  return (None,None)
190
 
191
+ # ----------------------------
192
+ # Decode helpers (your logic)
193
+ # ----------------------------
194
  def process_tag_features(tag_to_features: dict, intervals):
195
  arrs = [np.array(tpl) for tpl in set(tuple(a) for a in tag_to_features.values())]
196
  wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
 
209
  for idx in range(len(logits)):
210
  if attention_mask[idx].item()!=1 or begin_tokens[idx]!=1:
211
  continue
212
+
213
  pred = logits[idx]
214
  vec = torch.zeros(vec_len, device=logits.device)
215
+
216
  wt = torch.argmax(softmax(pred[0:15])).item()
217
  vec[wt]=1
218
+
219
  for (a,b) in dict_intervals.get(wt, []):
220
  seg = pred[a:b+1]
221
  k = torch.argmax(softmax(seg)).item()
222
  vec[a+k]=1
223
+
224
  vectors.append(vec)
225
  return vectors
226
 
227
  # ----------------------------
228
+ # Load labels (FO/EN)
229
  # ----------------------------
230
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
231
  LABELS = json.load(f)
 
241
  def clean_label(s: str) -> str:
242
  s = (s or "").strip()
243
  s = re.sub(r"\s+", " ", s)
244
+ return s.strip(" -;,:")
245
 
246
  # ----------------------------
247
  # Load model + mapping
 
250
 
251
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
252
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
253
+
254
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
255
  model.to(device); model.eval()
256
 
257
+ if hasattr(model, "config") and hasattr(model.config, "num_labels"):
258
+ if model.config.num_labels != VEC_LEN:
259
+ raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
260
 
261
  DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
262
 
263
+ # Build GROUPS from CSV headers
264
+ GROUPS = defaultdict(list) # group -> [(idx, code, colname)]
265
  for i,col in enumerate(FEATURE_COLS):
266
  g,code = group_from_col(col)
267
  if g and code not in HIDE_CODES.get(g, set()):
 
285
  return code
286
  return ""
287
 
288
+ # ----------------------------
289
+ # Display rules
290
+ # ----------------------------
291
+ HIDE_IN_ANALYSIS = {
292
+ ("D", "subcategory", "G"),
293
+ ("D", "subcategory", "N"),
294
+ }
295
+
296
  VOICE_ANALYSIS = {
297
  "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
298
  "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
 
303
  tag = vector_to_tag(vec)
304
  wc = wc_code(vec)
305
 
306
+ # DGd override
307
  if tag == "DGd":
308
  return "fyriseting" if lang=="fo" else "preposition"
309
 
310
  mood = group_code(vec, "mood")
311
+ if mood == "U": # luttøkuháttur / supine
312
  sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang=="fo" else "supine")
313
  vcode = group_code(vec, "voice") or "v"
314
  vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
315
  return f"{clean_label(sup)}, {clean_label(vlabel)}"
316
 
317
  parts = []
318
+
319
+ # Pronouns + conjunctions: start with subcategory
320
  if wc in {"P","C"}:
321
  subc = group_code(vec, "subcategory")
322
  subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
 
335
  continue
336
  if (wc, g, c) in HIDE_IN_ANALYSIS:
337
  continue
338
+
339
+ lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c) or ""
340
+ lbl = clean_label(lbl)
341
+ if not lbl:
342
+ continue
343
+ if lbl not in parts:
344
  parts.append(lbl)
345
 
346
  return ", ".join(parts)
 
349
  lang = "fo" if lang=="fo" else "en"
350
  wc = wc_code(vec)
351
  parts = []
352
+
353
  wc_lbl = label_for(lang, "word_class", wc, wc)
354
  parts.append(f"{wc} – {wc_lbl}" if wc_lbl else wc)
355
+
356
  for g in GROUP_ORDER:
357
  c = group_code(vec, g)
358
  if not c:
359
  continue
360
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
361
  parts.append(f"{c} – {lbl}" if lbl else c)
362
+
363
  return "; ".join([p for p in parts if p])
364
 
365
  def compute_codes_by_wc():
366
+ codes = defaultdict(lambda: defaultdict(set)) # wc -> group -> set(code)
367
  for arr in tag_to_features.values():
368
  arr = np.array(arr)
369
+
370
  wc = None
371
  for idx,code,_ in GROUPS["word_class"]:
372
  if arr[idx]==1:
 
374
  break
375
  if not wc:
376
  continue
377
+
378
  for g in GROUP_ORDER:
379
  hidden = HIDE_CODES.get(g, set())
380
  for idx,code,_ in GROUPS.get(g, []):
 
382
  continue
383
  if arr[idx]==1:
384
  codes[wc][g].add(code)
385
+
386
  return codes
387
 
388
  CODES_BY_WC = compute_codes_by_wc()
 
391
  lang = "fo" if lang=="fo" else "en"
392
  title = "### Markayvirlit" if lang=="fo" else "### Tag Overview"
393
  lines = [title, ""]
394
+
395
  for wc in sorted(CODES_BY_WC.keys()):
396
  wcl = label_for(lang, "word_class", wc, wc) or ""
397
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
398
+
399
  for g in GROUP_ORDER:
400
  cs = sorted(CODES_BY_WC[wc].get(g, set()))
401
  if not cs:
402
  continue
403
+
404
  group_name = {
405
+ "fo": {
406
+ "subcategory":"Undirflokkur", "gender":"Kyn", "number":"Tal", "case":"Fall",
407
+ "article":"Bundni/óbundni", "proper":"Sernavn / felagsnavn", "degree":"Stig",
408
+ "declension":"Bending", "mood":"Háttur", "voice":"Søgn", "tense":"Tíð",
409
+ "person":"Persónur", "definiteness":"Bundni/óbundni",
410
+ },
411
+ "en": {
412
+ "subcategory":"Subcategory", "gender":"Gender", "number":"Number", "case":"Case",
413
+ "article":"Definiteness", "proper":"Proper/common noun", "degree":"Degree",
414
+ "declension":"Declension", "mood":"Mood", "voice":"Voice", "tense":"Tense",
415
+ "person":"Person", "definiteness":"Definiteness",
416
+ }
417
  }[lang].get(g, g)
418
+
419
  lines.append(f"**{group_name}**")
420
  for c in cs:
421
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
422
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
423
  lines.append("")
424
+
425
  lines.append("")
426
+
427
  return "\n".join(lines).strip()
428
 
429
+ # ----------------------------
430
+ # Inference
431
+ # ----------------------------
432
  def run_model(sentence: str):
433
  s = (sentence or "").strip()
434
  if not s:
 
436
  tokens = simp_tok(s)
437
  if not tokens:
438
  return []
439
+
440
+ enc = tokenizer(
441
+ tokens,
442
+ is_split_into_words=True,
443
+ add_special_tokens=True,
444
+ max_length=128,
445
+ padding="max_length",
446
+ truncation=True,
447
+ return_attention_mask=True,
448
+ return_tensors="pt",
449
+ )
450
+
451
  input_ids = enc["input_ids"].to(device)
452
  attention_mask = enc["attention_mask"].to(device)
453
  word_ids = enc.word_ids(batch_index=0)
454
 
455
+ begin = []
456
+ last = None
457
  for wid in word_ids:
458
  if wid is None:
459
  begin.append(0)
 
468
 
469
  vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)
470
 
471
+ rows = []
472
+ vec_i = 0
473
+ seen = set()
474
  for i,wid in enumerate(word_ids):
475
  if wid is None or begin[i]!=1 or wid in seen:
476
  continue
 
485
  lang = "fo" if lang=="fo" else "en"
486
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
487
  dfm_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["m"]]
488
+
489
  if not rows_state:
490
+ empty_main = pd.DataFrame(columns=df_cols)
491
+ empty_mean = pd.DataFrame(columns=dfm_cols)
492
+ return empty_main, empty_mean, build_overview(lang)
493
+
494
  out_main, out_mean = [], []
495
  for r in rows_state:
496
  vec = torch.tensor(r["vec"])
497
  tag = vector_to_tag(vec)
498
  out_main.append([r["word"], tag, analysis_text(vec, lang)])
499
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
 
500
 
501
+ return (
502
+ pd.DataFrame(out_main, columns=df_cols),
503
+ pd.DataFrame(out_mean, columns=dfm_cols),
504
+ build_overview(lang),
505
+ )
506
+
507
+ # ----------------------------
508
+ # Gradio UI
509
+ # ----------------------------
510
theme = gr.themes.Soft()

# Build the Gradio interface. Components are created in display order, so the
# statement order below is also the on-page layout order.
# NOTE(review): nesting was reconstructed from a view with flattened
# indentation -- confirm that the button lives in the right-hand column and
# that the callbacks are defined inside the Blocks context.
with gr.Blocks(theme=theme, title="Marka") as demo:
    # Top: textbox LEFT, info+button RIGHT (DO NOT CHANGE TEXTBOX)
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            inp = gr.Textbox(
                lines=5,
                placeholder="Skriva her ... / Type here ...",
                show_label=False,
            )
        with gr.Column(scale=1, min_width=320):
            gr.Markdown(
                "## Marka\n\n"
                "Skriv ein setning í kassan og fá hann markaðan.\n\n"
                f"Myndil / Model: [{MODEL_ID}]({MODEL_LINK})"
            )
            btn = gr.Button("Marka / Tag", variant="primary")

    # Rows produced by the last tagging run; lets a language switch re-render
    # the tables without calling the model again.
    state = gr.State([])

    # Results header (hidden until first run)
    with gr.Row(elem_id="results_hdr", visible=False) as results_hdr:
        gr.Markdown("### Úrslit / Results")
        lang = gr.Radio(
            choices=[("Føroyskt","fo"), ("English","en")],
            value="fo",
            show_label=False,
            elem_classes=["lang_toggle"],
        )

    # Main results table: word, tag, analysis. Hidden until the first run.
    out_df = gr.Dataframe(
        value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["s"]]),
        wrap=True,
        interactive=False,
        show_label=False,
        row_count=(0, "fixed"),
        col_count=(3, "fixed"),
        visible=False,
    )

    # Expanded per-feature table; the accordion itself is hidden until the
    # first run, the table inside it is always visible.
    with gr.Accordion("Útgreinað marking / Expanded tags", open=False, visible=False) as expanded_acc:
        out_mean_df = gr.Dataframe(
            value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["m"]]),
            wrap=True,
            interactive=False,
            show_label=False,
            row_count=(0, "fixed"),
            col_count=(3, "fixed"),
            visible=True,
        )

    # Tag overview (Markayvirlit) is always visible
    with gr.Accordion("Markayvirlit / Tag Overview", open=False, visible=True):
        overview_md = gr.Markdown(build_overview("fo"), visible=True)

    def on_tag(sentence, lang_choice):
        """Tag the sentence and reveal the result widgets.

        Runs ``run_model`` on ``sentence``, renders the rows with ``render``
        for ``lang_choice`` and returns, in the order of the ``btn.click``
        outputs: the raw rows (stored in ``state``), the main table (made
        visible), the expanded table, the overview markdown, and visibility
        updates for the expanded accordion and the results header.
        """
        rows = run_model(sentence)
        df_main, df_mean, overview = render(rows, lang_choice)
        return (
            rows,
            gr.update(value=df_main, visible=True),
            gr.update(value=df_mean),
            gr.update(value=overview),
            gr.update(visible=True),  # expanded_acc
            gr.update(visible=True),  # results_hdr
        )

    def on_lang(rows, lang_choice):
        """Re-render the stored rows in the newly selected language.

        Returns value updates for the main table, the expanded table and the
        overview markdown; visibility is left untouched.
        """
        df_main, df_mean, overview = render(rows, lang_choice)
        return (
            gr.update(value=df_main),
            gr.update(value=df_mean),
            gr.update(value=overview),
        )

    btn.click(
        on_tag,
        inputs=[inp, lang],
        outputs=[state, out_df, out_mean_df, overview_md, expanded_acc, results_hdr],
        queue=False,
    )

    lang.change(
        on_lang,
        inputs=[state, lang],
        outputs=[out_df, out_mean_df, overview_md],
        queue=False,
    )

if __name__ == "__main__":
    demo.launch(css=CSS)