unijoh committed on
Commit
5844053
·
verified ·
1 Parent(s): 21053de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +310 -206
app.py CHANGED
@@ -11,46 +11,43 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification
11
  # Config
12
  # ----------------------------
13
  MODEL_ID = "Setur/BRAGD"
14
- TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must match model labels
15
- LABELS_FILEPATH = "tag_labels.json" # add to repo root (FO+EN labels)
16
- HF_TOKEN = os.getenv("BRAGD") # Space secret
17
 
18
  if not HF_TOKEN:
19
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
20
  if not os.path.exists(LABELS_FILEPATH):
21
  raise RuntimeError(f"Missing {LABELS_FILEPATH}. Add it to the Space repo root.")
22
- if not os.path.exists(TAGS_FILEPATH):
23
- raise RuntimeError(f"Missing {TAGS_FILEPATH}. Add it to the Space repo root.")
24
 
25
- # Match your demo.py intervals
26
  INTERVALS = (
27
  (15, 29), (30, 33), (34, 36), (37, 41), (42, 43), (44, 45), (46, 50),
28
  (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
29
  )
30
 
31
- GROUP_ORDER = [
32
- "subcategory","gender","number","case","article","proper",
33
- "degree","declension","mood","voice","tense","person","definiteness"
34
- ]
35
 
36
- # You said Subcategory B doesn't exist and will be deleted from the CSV:
37
- HIDE_CODES = {"subcategory": {"B"}}
 
 
38
 
39
  MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
40
 
41
- # ----------------------------
42
- # Theme + CSS
43
- # ----------------------------
44
- CSS = """
45
- :root{
46
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
47
  --primary-100:#E1ECEA; --primary-200:#C6DAD6;
 
48
  }
49
 
 
 
 
 
50
  body, .gradio-container, .prose, .markdown, textarea, input, select, button, table{
51
  font-family:-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, "Noto Sans", sans-serif !important;
52
  }
53
-
54
  a{ color:var(--primary-700)!important; }
55
 
56
  /* Primary button (Marka/Tag) */
@@ -59,94 +56,233 @@ a{ color:var(--primary-700)!important; }
59
  border-color:var(--primary-600)!important;
60
  color:#0b1b19!important;
61
  }
62
- .gr-button-primary:hover, button.primary:hover, .primary:hover{
63
- background:var(--primary-600)!important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  }
65
 
66
- /* Keep Orð + Mark on one line; allow Útgreining to wrap */
67
- .gr-dataframe table td:nth-child(1),
68
- .gr-dataframe table th:nth-child(1){
69
- white-space: nowrap !important;
70
- width: 18% !important;
71
  }
72
- .gr-dataframe table td:nth-child(2),
73
- .gr-dataframe table th:nth-child(2){
74
- white-space: nowrap !important;
75
- width: 18% !important;
76
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
77
  }
78
- .gr-dataframe table td:nth-child(3),
79
- .gr-dataframe table th:nth-child(3){
80
- white-space: normal !important;
81
- width: 64% !important;
82
  }
83
 
84
- /* Results header layout: title left, language buttons right */
 
 
 
 
 
 
 
85
  #results_hdr{
86
- display:grid;
87
- grid-template-columns:1fr auto;
88
- align-items:center;
89
- gap:16px;
90
- margin-top:10px;
 
 
 
 
91
  }
 
 
92
 
93
- /* Remove any “box” background around the language selector */
94
  .lang_toggle{
95
  background: transparent !important;
96
- border: none !important;
97
- box-shadow: none !important;
98
- padding: 0 !important;
99
- margin: 0 !important;
 
 
 
100
  }
101
  .lang_toggle .wrap{
102
- display:flex;
103
- justify-content:flex-end;
104
- gap:10px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  }
106
 
107
- /* Hide native radio circles */
108
- .lang_toggle input[type="radio"]{
109
- display:none !important;
 
 
 
 
 
110
  }
111
 
112
- /* Button-like labels */
113
  .lang_toggle label span{
114
  all: unset;
115
  display:inline-block;
116
  cursor:pointer;
117
  user-select:none;
118
- padding:0.35rem 0.90rem;
119
  font-size:0.95rem;
120
- font-weight:600;
121
  border-radius:10px;
122
  border:1px solid var(--primary-600);
123
- background: var(--primary-200); /* inactive */
124
  color:#0b1b19;
 
125
  }
126
 
127
- /* Selected */
128
- .lang_toggle input:checked + span{
 
129
  background:var(--primary-500)!important;
130
  border-color:var(--primary-600)!important;
131
  color:#0b1b19!important;
132
  }
133
 
134
- /* Hover */
135
- .lang_toggle label:hover span{
136
- background:var(--primary-500)!important;
137
- border-color:var(--primary-600)!important;
138
- color:#0b1b19!important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  }
140
- """
141
 
142
- # ----------------------------
143
- # UI text
144
- # ----------------------------
145
- UI = {
146
- "fo": {"w":"Orð", "t":"Mark", "s":"Útgreining", "m":"Útgreinað marking"},
147
- "en": {"w":"Word","t":"Tag", "s":"Analysis", "m":"Expanded tags"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  }
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  # ----------------------------
151
  # Tokenization
152
  # ----------------------------
@@ -188,9 +324,6 @@ def group_from_col(col: str):
188
  return (g, col.split()[-1])
189
  return (None,None)
190
 
191
- # ----------------------------
192
- # Decode helpers (your logic)
193
- # ----------------------------
194
  def process_tag_features(tag_to_features: dict, intervals):
195
  arrs = [np.array(tpl) for tpl in set(tuple(a) for a in tag_to_features.values())]
196
  wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
@@ -209,23 +342,19 @@ def predict_vectors(logits, attention_mask, begin_tokens, dict_intervals, vec_le
209
  for idx in range(len(logits)):
210
  if attention_mask[idx].item()!=1 or begin_tokens[idx]!=1:
211
  continue
212
-
213
  pred = logits[idx]
214
  vec = torch.zeros(vec_len, device=logits.device)
215
-
216
  wt = torch.argmax(softmax(pred[0:15])).item()
217
  vec[wt]=1
218
-
219
  for (a,b) in dict_intervals.get(wt, []):
220
  seg = pred[a:b+1]
221
  k = torch.argmax(softmax(seg)).item()
222
  vec[a+k]=1
223
-
224
  vectors.append(vec)
225
  return vectors
226
 
227
  # ----------------------------
228
- # Load labels (FO/EN)
229
  # ----------------------------
230
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
231
  LABELS = json.load(f)
@@ -241,7 +370,7 @@ def label_for(lang: str, group: str, wc: str, code: str) -> str:
241
  def clean_label(s: str) -> str:
242
  s = (s or "").strip()
243
  s = re.sub(r"\s+", " ", s)
244
- return s.strip(" -;,:")
245
 
246
  # ----------------------------
247
  # Load model + mapping
@@ -250,18 +379,15 @@ tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS
250
 
251
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
252
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
253
-
254
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
255
  model.to(device); model.eval()
256
 
257
- if hasattr(model, "config") and hasattr(model.config, "num_labels"):
258
- if model.config.num_labels != VEC_LEN:
259
- raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
260
 
261
  DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
262
 
263
- # Build GROUPS from CSV headers
264
- GROUPS = defaultdict(list) # group -> [(idx, code, colname)]
265
  for i,col in enumerate(FEATURE_COLS):
266
  g,code = group_from_col(col)
267
  if g and code not in HIDE_CODES.get(g, set()):
@@ -285,14 +411,7 @@ def group_code(vec: torch.Tensor, group: str) -> str:
285
  return code
286
  return ""
287
 
288
- # ----------------------------
289
- # Display rules
290
- # ----------------------------
291
- HIDE_IN_ANALYSIS = {
292
- ("D", "subcategory", "G"),
293
- ("D", "subcategory", "N"),
294
- }
295
-
296
  VOICE_ANALYSIS = {
297
  "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
298
  "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
@@ -303,20 +422,17 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
303
  tag = vector_to_tag(vec)
304
  wc = wc_code(vec)
305
 
306
- # DGd override
307
  if tag == "DGd":
308
  return "fyriseting" if lang=="fo" else "preposition"
309
 
310
  mood = group_code(vec, "mood")
311
- if mood == "U": # luttøkuháttur / supine
312
  sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang=="fo" else "supine")
313
  vcode = group_code(vec, "voice") or "v"
314
  vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
315
  return f"{clean_label(sup)}, {clean_label(vlabel)}"
316
 
317
  parts = []
318
-
319
- # Pronouns + conjunctions: start with subcategory
320
  if wc in {"P","C"}:
321
  subc = group_code(vec, "subcategory")
322
  subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
@@ -335,12 +451,8 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
335
  continue
336
  if (wc, g, c) in HIDE_IN_ANALYSIS:
337
  continue
338
-
339
- lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c) or ""
340
- lbl = clean_label(lbl)
341
- if not lbl:
342
- continue
343
- if lbl not in parts:
344
  parts.append(lbl)
345
 
346
  return ", ".join(parts)
@@ -349,24 +461,20 @@ def expanded_text(vec: torch.Tensor, lang: str) -> str:
349
  lang = "fo" if lang=="fo" else "en"
350
  wc = wc_code(vec)
351
  parts = []
352
-
353
  wc_lbl = label_for(lang, "word_class", wc, wc)
354
  parts.append(f"{wc} – {wc_lbl}" if wc_lbl else wc)
355
-
356
  for g in GROUP_ORDER:
357
  c = group_code(vec, g)
358
  if not c:
359
  continue
360
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
361
  parts.append(f"{c} – {lbl}" if lbl else c)
362
-
363
  return "; ".join([p for p in parts if p])
364
 
365
  def compute_codes_by_wc():
366
- codes = defaultdict(lambda: defaultdict(set)) # wc -> group -> set(code)
367
  for arr in tag_to_features.values():
368
  arr = np.array(arr)
369
-
370
  wc = None
371
  for idx,code,_ in GROUPS["word_class"]:
372
  if arr[idx]==1:
@@ -374,7 +482,6 @@ def compute_codes_by_wc():
374
  break
375
  if not wc:
376
  continue
377
-
378
  for g in GROUP_ORDER:
379
  hidden = HIDE_CODES.get(g, set())
380
  for idx,code,_ in GROUPS.get(g, []):
@@ -382,7 +489,6 @@ def compute_codes_by_wc():
382
  continue
383
  if arr[idx]==1:
384
  codes[wc][g].add(code)
385
-
386
  return codes
387
 
388
  CODES_BY_WC = compute_codes_by_wc()
@@ -391,44 +497,29 @@ def build_overview(lang: str) -> str:
391
  lang = "fo" if lang=="fo" else "en"
392
  title = "### Markayvirlit" if lang=="fo" else "### Tag Overview"
393
  lines = [title, ""]
394
-
395
  for wc in sorted(CODES_BY_WC.keys()):
396
  wcl = label_for(lang, "word_class", wc, wc) or ""
397
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
398
-
399
  for g in GROUP_ORDER:
400
  cs = sorted(CODES_BY_WC[wc].get(g, set()))
401
  if not cs:
402
  continue
403
-
404
  group_name = {
405
- "fo": {
406
- "subcategory":"Undirflokkur", "gender":"Kyn", "number":"Tal", "case":"Fall",
407
- "article":"Bundni/óbundni", "proper":"Sernavn / felagsnavn", "degree":"Stig",
408
- "declension":"Bending", "mood":"Háttur", "voice":"Søgn", "tense":"Tíð",
409
- "person":"Persónur", "definiteness":"Bundni/óbundni",
410
- },
411
- "en": {
412
- "subcategory":"Subcategory", "gender":"Gender", "number":"Number", "case":"Case",
413
- "article":"Definiteness", "proper":"Proper/common noun", "degree":"Degree",
414
- "declension":"Declension", "mood":"Mood", "voice":"Voice", "tense":"Tense",
415
- "person":"Person", "definiteness":"Definiteness",
416
- }
417
  }[lang].get(g, g)
418
-
419
  lines.append(f"**{group_name}**")
420
  for c in cs:
421
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
422
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
423
  lines.append("")
424
-
425
  lines.append("")
426
-
427
  return "\n".join(lines).strip()
428
 
429
- # ----------------------------
430
- # Inference
431
- # ----------------------------
432
  def run_model(sentence: str):
433
  s = (sentence or "").strip()
434
  if not s:
@@ -436,24 +527,13 @@ def run_model(sentence: str):
436
  tokens = simp_tok(s)
437
  if not tokens:
438
  return []
439
-
440
- enc = tokenizer(
441
- tokens,
442
- is_split_into_words=True,
443
- add_special_tokens=True,
444
- max_length=128,
445
- padding="max_length",
446
- truncation=True,
447
- return_attention_mask=True,
448
- return_tensors="pt",
449
- )
450
-
451
  input_ids = enc["input_ids"].to(device)
452
  attention_mask = enc["attention_mask"].to(device)
453
  word_ids = enc.word_ids(batch_index=0)
454
 
455
- begin = []
456
- last = None
457
  for wid in word_ids:
458
  if wid is None:
459
  begin.append(0)
@@ -468,9 +548,7 @@ def run_model(sentence: str):
468
 
469
  vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)
470
 
471
- rows = []
472
- vec_i = 0
473
- seen = set()
474
  for i,wid in enumerate(word_ids):
475
  if wid is None or begin[i]!=1 or wid in seen:
476
  continue
@@ -485,117 +563,143 @@ def render(rows_state, lang: str):
485
  lang = "fo" if lang=="fo" else "en"
486
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
487
  dfm_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["m"]]
488
-
489
  if not rows_state:
490
- empty_main = pd.DataFrame(columns=df_cols)
491
- empty_mean = pd.DataFrame(columns=dfm_cols)
492
- return empty_main, empty_mean, build_overview(lang)
493
-
494
  out_main, out_mean = [], []
495
  for r in rows_state:
496
  vec = torch.tensor(r["vec"])
497
  tag = vector_to_tag(vec)
498
  out_main.append([r["word"], tag, analysis_text(vec, lang)])
499
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
 
500
 
501
- return (
502
- pd.DataFrame(out_main, columns=df_cols),
503
- pd.DataFrame(out_mean, columns=dfm_cols),
504
- build_overview(lang),
505
- )
506
-
507
- # ----------------------------
508
- # Gradio UI
509
- # ----------------------------
510
  theme = gr.themes.Soft()
511
 
512
- with gr.Blocks(theme=theme, title="Marka") as demo:
513
- # Top: textbox LEFT, info+button RIGHT (DO NOT CHANGE TEXTBOX)
514
  with gr.Row(equal_height=True):
515
- with gr.Column(scale=2):
516
- inp = gr.Textbox(
517
- lines=5,
518
- placeholder="Skriva her ... / Type here ...",
519
- show_label=False,
520
- )
521
  with gr.Column(scale=1, min_width=320):
522
  gr.Markdown(
523
- "## Marka\n\n"
524
  "Skriv ein setning í kassan og fá hann markaðan.\n\n"
525
  f"Myndil / Model: [{MODEL_ID}]({MODEL_LINK})"
526
  )
527
  btn = gr.Button("Marka / Tag", variant="primary")
528
 
529
  state = gr.State([])
530
-
531
- # Results header (hidden until first run)
532
- with gr.Row(elem_id="results_hdr", visible=False) as results_hdr:
533
- gr.Markdown("### Úrslit / Results")
534
- lang = gr.Radio(
535
- choices=[("Føroyskt","fo"), ("English","en")],
536
- value="fo",
537
- show_label=False,
538
- elem_classes=["lang_toggle"],
539
- )
 
540
 
541
  out_df = gr.Dataframe(
542
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["s"]]),
543
- wrap=True,
544
- interactive=False,
545
- show_label=False,
546
- row_count=(0, "fixed"),
547
- col_count=(3, "fixed"),
548
  visible=False,
549
  )
550
 
551
- with gr.Accordion("Útgreinað marking / Expanded tags", open=False, visible=False) as expanded_acc:
 
552
  out_mean_df = gr.Dataframe(
553
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["m"]]),
554
- wrap=True,
555
- interactive=False,
556
- show_label=False,
557
- row_count=(0, "fixed"),
558
- col_count=(3, "fixed"),
559
- visible=True,
560
  )
561
 
562
- # Markayvirlit always visible
563
- with gr.Accordion("Markayvirlit / Tag Overview", open=False, visible=True):
564
- overview_md = gr.Markdown(build_overview("fo"), visible=True)
565
 
566
- def on_tag(sentence, lang_choice):
567
  rows = run_model(sentence)
568
- df_main, df_mean, overview = render(rows, lang_choice)
 
 
 
 
569
  return (
570
  rows,
571
  gr.update(value=df_main, visible=True),
572
  gr.update(value=df_mean),
573
  gr.update(value=overview),
574
  gr.update(visible=True), # expanded_acc
575
- gr.update(visible=True), # results_hdr
 
 
 
 
 
576
  )
577
 
578
- def on_lang(rows, lang_choice):
579
- df_main, df_mean, overview = render(rows, lang_choice)
 
 
 
 
580
  return (
 
581
  gr.update(value=df_main),
582
  gr.update(value=df_mean),
583
  gr.update(value=overview),
 
 
 
 
584
  )
585
 
 
 
 
 
 
 
586
  btn.click(
587
  on_tag,
588
- inputs=[inp, lang],
589
- outputs=[state, out_df, out_mean_df, overview_md, expanded_acc, results_hdr],
 
590
  queue=False,
591
  )
592
 
593
- lang.change(
594
- on_lang,
595
- inputs=[state, lang],
596
- outputs=[out_df, out_mean_df, overview_md],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
  queue=False,
598
  )
599
 
600
  if __name__ == "__main__":
601
- demo.launch(css=CSS)
 
11
  # Config
12
  # ----------------------------
13
  MODEL_ID = "Setur/BRAGD"
14
+ TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
15
+ LABELS_FILEPATH = "tag_labels.json"
16
+ HF_TOKEN = os.getenv("BRAGD")
17
 
18
  if not HF_TOKEN:
19
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
20
  if not os.path.exists(LABELS_FILEPATH):
21
  raise RuntimeError(f"Missing {LABELS_FILEPATH}. Add it to the Space repo root.")
 
 
22
 
 
23
  INTERVALS = (
24
  (15, 29), (30, 33), (34, 36), (37, 41), (42, 43), (44, 45), (46, 50),
25
  (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
26
  )
27
 
28
+ GROUP_ORDER = ["subcategory","gender","number","case","article","proper","degree","declension","mood","voice","tense","person","definiteness"]
29
+ HIDE_CODES = {"subcategory": {"B"}} # Subcategory B to be removed
 
 
30
 
31
+ UI = {
32
+ "fo": {"w":"Orð", "t":"Mark", "s":"Útgreining", "m":"Útgreinað marking"},
33
+ "en": {"w":"Word","t":"Tag", "s":"Analysis", "m":"Expanded tags"},
34
+ }
35
 
36
  MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
37
 
38
+ CSS = """:root{
 
 
 
 
39
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
40
  --primary-100:#E1ECEA; --primary-200:#C6DAD6;
41
+ --page-bg:#f7f7f8;
42
  }
43
 
44
+ /* Page background */
45
+ html, body, .gradio-container{
46
+ background: var(--page-bg) !important;
47
+ }
48
  body, .gradio-container, .prose, .markdown, textarea, input, select, button, table{
49
  font-family:-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, "Noto Sans", sans-serif !important;
50
  }
 
51
  a{ color:var(--primary-700)!important; }
52
 
53
  /* Primary button (Marka/Tag) */
 
56
  border-color:var(--primary-600)!important;
57
  color:#0b1b19!important;
58
  }
59
+ .gr-button-primary:hover, button.primary:hover, .primary:hover{ background:var(--primary-600)!important; }
60
+ .gr-button-primary{ padding:0.35rem 0.85rem!important; font-size:0.95rem!important; }
61
+
62
+ /* --- Keep the textbox exactly as-is: wrapper blends with page, textarea stays white --- */
63
+ #input_col, #input_col *{
64
+ background: transparent !important;
65
+ }
66
+ #input_col .gr-block, #input_col .gr-panel, #input_col .gr-box, #input_col .gr-group, #input_col .gr-form{
67
+ background: transparent !important;
68
+ box-shadow:none !important;
69
+ border:0 !important;
70
+ }
71
+ #input_box, #input_box > div, #input_box .wrap, #input_box .container{
72
+ background: transparent !important;
73
+ box-shadow:none !important;
74
+ border:0 !important;
75
+ }
76
+ #input_box textarea{
77
+ background:#ffffff !important;
78
  }
79
 
80
+ /* Dataframe columns: keep Orð + Mark single-line */
81
+ .gr-dataframe table td:nth-child(1), .gr-dataframe table th:nth-child(1){
82
+ white-space: nowrap !important; width: 18% !important;
 
 
83
  }
84
+ .gr-dataframe table td:nth-child(2), .gr-dataframe table th:nth-child(2){
85
+ white-space: nowrap !important; width: 18% !important;
 
 
86
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
87
  }
88
+ .gr-dataframe table td:nth-child(3), .gr-dataframe table th:nth-child(3){
89
+ white-space: normal !important; width: 64% !important;
 
 
90
  }
91
 
92
+ /* Selected = match Marka/Tag exactly */
93
+ /* Hover = subtle */
94
+ /* Keep selected button color on hover; only lighten UNSELECTED on hover */
95
+ /* Push language buttons fully to the right */
96
+ #results_hdr > .gr-markdown{
97
+ flex:1 1 auto !important;
98
+ }
99
+ /* Results header row: two-column layout, title left, toggle hard-right */
100
  #results_hdr{
101
+ display:grid !important;
102
+ grid-template-columns: 1fr auto !important;
103
+ align-items:center !important;
104
+ gap:12px !important;
105
+ padding:0 !important;
106
+ margin:0 !important;
107
+ background:transparent !important;
108
+ box-shadow:none !important;
109
+ border:0 !important;
110
  }
111
+ #results_hdr > .gr-column:first-child{ justify-self:start !important; }
112
+ #results_hdr > .gr-column:last-child{ justify-self:end !important; }
113
 
114
+ /* Language toggle (gr.Radio): style the LABEL as the button (robust across Gradio DOM variants) */
115
  .lang_toggle{
116
  background: transparent !important;
117
+ justify-self:end !important;
118
+ }
119
+ .lang_toggle fieldset{
120
+ border:0!important;
121
+ padding:0!important;
122
+ margin:0!important;
123
+ background:transparent!important;
124
  }
125
  .lang_toggle .wrap{
126
+ display:flex!important;
127
+ gap:10px!important;
128
+ background:transparent!important;
129
+ padding:0!important;
130
+ margin:0!important;
131
+ }
132
+ .lang_toggle input{
133
+ display:none!important;
134
+ }
135
+
136
+ /* Kill any default Gradio "pill" styling inside */
137
+ .lang_toggle label *{
138
+ background:transparent!important;
139
+ box-shadow:none!important;
140
+ border:0!important;
141
+ }
142
+
143
+ /* The actual button */
144
+ .lang_toggle label{
145
+ display:inline-flex !important;
146
+ align-items:center !important;
147
+ justify-content:center !important;
148
+ cursor:pointer !important;
149
+ user-select:none !important;
150
+
151
+ padding:0.35rem 0.85rem !important;
152
+ font-size:0.95rem !important;
153
+ border-radius:10px !important;
154
+
155
+ border:1px solid var(--primary-600) !important;
156
+ background: var(--primary-200) !important; /* inactive: lighter than #89AFA9 */
157
+ color:#0b1b19 !important; /* black-ish */
158
+ }
159
+
160
+ /* Active/selected */
161
+ .lang_toggle label:has(input:checked){
162
+ background: #89AFA9 !important;
163
+ border-color: var(--primary-600) !important;
164
+ color:#0b1b19 !important;
165
+ }
166
+
167
+ /* Hover: show #89AFA9 (inactive becomes active color on hover) */
168
+ .lang_toggle label:hover{
169
+ background:#89AFA9 !important;
170
+ border-color: var(--primary-600) !important;
171
+ color:#0b1b19 !important;
172
  }
173
 
174
+
175
+ /* Remove Gradio's default label styling completely */
176
+ .lang_toggle label{
177
+ background:transparent!important;
178
+ border:0!important;
179
+ padding:0!important;
180
+ margin:0!important;
181
+ box-shadow:none!important;
182
  }
183
 
184
+ /* Single visible button layer */
185
  .lang_toggle label span{
186
  all: unset;
187
  display:inline-block;
188
  cursor:pointer;
189
  user-select:none;
190
+ padding:0.35rem 0.85rem;
191
  font-size:0.95rem;
 
192
  border-radius:10px;
193
  border:1px solid var(--primary-600);
194
+ background: transparent; /* same as page */
195
  color:#0b1b19;
196
+ box-shadow:none!important;
197
  }
198
 
199
+ /* Selected state (robust selectors) */
200
+ .lang_toggle input:checked ~ span,
201
+ .lang_toggle label:has(input:checked) span{
202
  background:var(--primary-500)!important;
203
  border-color:var(--primary-600)!important;
204
  color:#0b1b19!important;
205
  }
206
 
207
+ /* Hover: only unselected gets light background */
208
+ .lang_toggle label:hover input:not(:checked) ~ span,
209
+ .lang_toggle label:hover:not(:has(input:checked)) span{
210
+ background:var(--primary-200)!important;
211
+ }
212
+ /* --- Language buttons (robust: 4 real buttons, show/hide to indicate active) --- */
213
+ #results_hdr{
214
+ display:grid !important;
215
+ grid-template-columns: 1fr auto !important;
216
+ align-items:center !important;
217
+ gap:12px !important;
218
+ padding:0 !important;
219
+ margin:0 !important;
220
+ background:transparent !important;
221
+ box-shadow:none !important;
222
+ border:0 !important;
223
+ }
224
+ #lang_buttons{
225
+ display:flex !important;
226
+ gap:10px !important;
227
+ justify-content:flex-end !important;
228
+ align-items:center !important;
229
+ flex-wrap:nowrap !important;
230
+ }
231
+ #lang_buttons .gr-button, #lang_buttons button{
232
+ padding:0.35rem 0.85rem !important;
233
+ font-size:0.95rem !important;
234
+ border-radius:10px !important;
235
  }
 
236
 
237
+ /* Inactive: lighter than #89AFA9, black text */
238
+ #lang_fo_off, #lang_en_off{
239
+ background:var(--primary-200) !important;
240
+ border-color:var(--primary-600) !important;
241
+ color:#0b1b19 !important;
242
+ }
243
+ /* Hover inactive -> active color (#89AFA9) */
244
+ #lang_fo_off:hover, #lang_en_off:hover{
245
+ background:var(--primary-500) !important;
246
+ border-color:var(--primary-600) !important;
247
+ color:#0b1b19 !important;
248
+ }
249
+ /* Active: ensure black text */
250
+ #lang_fo_on, #lang_en_on{
251
+ color:#0b1b19 !important;
252
+ }
253
+
254
+ /* Keep header transparent, but DON'T nuke button backgrounds */
255
+ #results_hdr, #results_hdr > div{
256
+ background:transparent !important;
257
+ box-shadow:none !important;
258
+ border:0 !important;
259
+ }
260
+
261
+ /* Prevent Gradio from stacking/stretching language buttons */
262
+ #lang_buttons .gr-button, #lang_buttons button{
263
+ width:auto !important;
264
+ min-width:120px !important;
265
+ flex:0 0 auto !important;
266
  }
267
 
268
+ /* Language button colors */
269
+ #lang_buttons .gr-button-primary, #lang_buttons button.primary{
270
+ background:#89AFA9 !important;
271
+ border-color:#6F9992 !important;
272
+ color:#0b1b19 !important;
273
+ }
274
+ #lang_buttons .gr-button-secondary, #lang_buttons button.secondary{
275
+ background:#C6DAD6 !important; /* light green */
276
+ border-color:#6F9992 !important;
277
+ color:#0b1b19 !important;
278
+ }
279
+ #lang_buttons .gr-button-secondary:hover, #lang_buttons button.secondary:hover{
280
+ background:#89AFA9 !important;
281
+ border-color:#6F9992 !important;
282
+ color:#0b1b19 !important;
283
+ }
284
+ """
285
+
286
  # ----------------------------
287
  # Tokenization
288
  # ----------------------------
 
324
  return (g, col.split()[-1])
325
  return (None,None)
326
 
 
 
 
327
  def process_tag_features(tag_to_features: dict, intervals):
328
  arrs = [np.array(tpl) for tpl in set(tuple(a) for a in tag_to_features.values())]
329
  wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
 
342
  for idx in range(len(logits)):
343
  if attention_mask[idx].item()!=1 or begin_tokens[idx]!=1:
344
  continue
 
345
  pred = logits[idx]
346
  vec = torch.zeros(vec_len, device=logits.device)
 
347
  wt = torch.argmax(softmax(pred[0:15])).item()
348
  vec[wt]=1
 
349
  for (a,b) in dict_intervals.get(wt, []):
350
  seg = pred[a:b+1]
351
  k = torch.argmax(softmax(seg)).item()
352
  vec[a+k]=1
 
353
  vectors.append(vec)
354
  return vectors
355
 
356
  # ----------------------------
357
+ # Load labels
358
  # ----------------------------
359
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
360
  LABELS = json.load(f)
 
370
  def clean_label(s: str) -> str:
371
  s = (s or "").strip()
372
  s = re.sub(r"\s+", " ", s)
373
+ return s.strip(" -;,:").strip()
374
 
375
  # ----------------------------
376
  # Load model + mapping
 
379
 
380
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
381
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
 
382
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
383
  model.to(device); model.eval()
384
 
385
+ if hasattr(model, "config") and hasattr(model.config, "num_labels") and model.config.num_labels != VEC_LEN:
386
+ raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
 
387
 
388
  DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
389
 
390
+ GROUPS = defaultdict(list)
 
391
  for i,col in enumerate(FEATURE_COLS):
392
  g,code = group_from_col(col)
393
  if g and code not in HIDE_CODES.get(g, set()):
 
411
  return code
412
  return ""
413
 
414
+ HIDE_IN_ANALYSIS = {("D","subcategory","G"), ("D","subcategory","N")}
 
 
 
 
 
 
 
415
  VOICE_ANALYSIS = {
416
  "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
417
  "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
 
422
  tag = vector_to_tag(vec)
423
  wc = wc_code(vec)
424
 
 
425
  if tag == "DGd":
426
  return "fyriseting" if lang=="fo" else "preposition"
427
 
428
  mood = group_code(vec, "mood")
429
+ if mood == "U":
430
  sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang=="fo" else "supine")
431
  vcode = group_code(vec, "voice") or "v"
432
  vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
433
  return f"{clean_label(sup)}, {clean_label(vlabel)}"
434
 
435
  parts = []
 
 
436
  if wc in {"P","C"}:
437
  subc = group_code(vec, "subcategory")
438
  subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
 
451
  continue
452
  if (wc, g, c) in HIDE_IN_ANALYSIS:
453
  continue
454
+ lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
455
+ if lbl and lbl not in parts:
 
 
 
 
456
  parts.append(lbl)
457
 
458
  return ", ".join(parts)
 
461
  lang = "fo" if lang=="fo" else "en"
462
  wc = wc_code(vec)
463
  parts = []
 
464
  wc_lbl = label_for(lang, "word_class", wc, wc)
465
  parts.append(f"{wc} – {wc_lbl}" if wc_lbl else wc)
 
466
  for g in GROUP_ORDER:
467
  c = group_code(vec, g)
468
  if not c:
469
  continue
470
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
471
  parts.append(f"{c} – {lbl}" if lbl else c)
 
472
  return "; ".join([p for p in parts if p])
473
 
474
  def compute_codes_by_wc():
475
+ codes = defaultdict(lambda: defaultdict(set))
476
  for arr in tag_to_features.values():
477
  arr = np.array(arr)
 
478
  wc = None
479
  for idx,code,_ in GROUPS["word_class"]:
480
  if arr[idx]==1:
 
482
  break
483
  if not wc:
484
  continue
 
485
  for g in GROUP_ORDER:
486
  hidden = HIDE_CODES.get(g, set())
487
  for idx,code,_ in GROUPS.get(g, []):
 
489
  continue
490
  if arr[idx]==1:
491
  codes[wc][g].add(code)
 
492
  return codes
493
 
494
  CODES_BY_WC = compute_codes_by_wc()
 
497
  lang = "fo" if lang=="fo" else "en"
498
  title = "### Markayvirlit" if lang=="fo" else "### Tag Overview"
499
  lines = [title, ""]
 
500
  for wc in sorted(CODES_BY_WC.keys()):
501
  wcl = label_for(lang, "word_class", wc, wc) or ""
502
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
 
503
  for g in GROUP_ORDER:
504
  cs = sorted(CODES_BY_WC[wc].get(g, set()))
505
  if not cs:
506
  continue
 
507
  group_name = {
508
+ "fo": {"subcategory":"Undirflokkur","gender":"Kyn","number":"Tal","case":"Fall","article":"Bundni/óbundni",
509
+ "proper":"Sernavn / felagsnavn","degree":"Stig","declension":"Bending","mood":"Háttur","voice":"Søgn",
510
+ "tense":"Tíð","person":"Persónur","definiteness":"Bundni/óbundni"},
511
+ "en": {"subcategory":"Subcategory","gender":"Gender","number":"Number","case":"Case","article":"Definiteness",
512
+ "proper":"Proper/common noun","degree":"Degree","declension":"Declension","mood":"Mood","voice":"Voice",
513
+ "tense":"Tense","person":"Person","definiteness":"Definiteness"},
 
 
 
 
 
 
514
  }[lang].get(g, g)
 
515
  lines.append(f"**{group_name}**")
516
  for c in cs:
517
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
518
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
519
  lines.append("")
 
520
  lines.append("")
 
521
  return "\n".join(lines).strip()
522
 
 
 
 
523
  def run_model(sentence: str):
524
  s = (sentence or "").strip()
525
  if not s:
 
527
  tokens = simp_tok(s)
528
  if not tokens:
529
  return []
530
+ enc = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=128,
531
+ padding="max_length", truncation=True, return_attention_mask=True, return_tensors="pt")
 
 
 
 
 
 
 
 
 
 
532
  input_ids = enc["input_ids"].to(device)
533
  attention_mask = enc["attention_mask"].to(device)
534
  word_ids = enc.word_ids(batch_index=0)
535
 
536
+ begin, last = [], None
 
537
  for wid in word_ids:
538
  if wid is None:
539
  begin.append(0)
 
548
 
549
  vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)
550
 
551
+ rows, vec_i, seen = [], 0, set()
 
 
552
  for i,wid in enumerate(word_ids):
553
  if wid is None or begin[i]!=1 or wid in seen:
554
  continue
 
563
  lang = "fo" if lang=="fo" else "en"
564
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
565
  dfm_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["m"]]
 
566
  if not rows_state:
567
+ return (pd.DataFrame(columns=df_cols), pd.DataFrame(columns=dfm_cols), build_overview(lang))
 
 
 
568
  out_main, out_mean = [], []
569
  for r in rows_state:
570
  vec = torch.tensor(r["vec"])
571
  tag = vector_to_tag(vec)
572
  out_main.append([r["word"], tag, analysis_text(vec, lang)])
573
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
574
+ return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
575
 
 
 
 
 
 
 
 
 
 
576
  theme = gr.themes.Soft()
577
 
578
+ with gr.Blocks(theme=theme, css=CSS, title="Marka") as demo:
 
579
  with gr.Row(equal_height=True):
580
+ with gr.Column(scale=2, elem_id="input_col"):
581
+ inp = gr.Textbox(lines=6, placeholder="Skriva her ... / Type here ...", show_label=False, elem_id="input_box")
 
 
 
 
582
  with gr.Column(scale=1, min_width=320):
583
  gr.Markdown(
584
+ "## Marka\n"
585
  "Skriv ein setning í kassan og fá hann markaðan.\n\n"
586
  f"Myndil / Model: [{MODEL_ID}]({MODEL_LINK})"
587
  )
588
  btn = gr.Button("Marka / Tag", variant="primary")
589
 
590
  state = gr.State([])
591
+ lang_state = gr.State("fo")
592
+
593
+ # Results header + language toggle (always visible, also before the first Tag)
594
+ results_hdr = gr.Row(elem_id="results_hdr", visible=True)
595
+ with results_hdr:
596
+ results_title = gr.Markdown("### Úrslit / Results")
597
+ with gr.Row(elem_id="lang_buttons"):
598
+ btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=True)
599
+ btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
600
+ btn_lang_en_on = gr.Button("English", variant="primary", elem_id="lang_en_on", visible=False)
601
+ btn_lang_en_off = gr.Button("English", variant="secondary", elem_id="lang_en_off", visible=True)
602
 
603
  out_df = gr.Dataframe(
604
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["s"]]),
605
+ wrap=True, interactive=False, show_label=False,
606
+ row_count=(0, "fixed"), col_count=(3, "fixed"),
 
 
 
607
  visible=False,
608
  )
609
 
610
+ expanded_acc = gr.Accordion("Útgreinað marking / Expanded tags", open=False, visible=False)
611
+ with expanded_acc:
612
  out_mean_df = gr.Dataframe(
613
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["m"]]),
614
+ wrap=True, interactive=False, show_label=False,
615
+ row_count=(0, "fixed"), col_count=(3, "fixed"),
 
 
 
 
616
  )
617
 
618
+ overview_acc = gr.Accordion("Markayvirlit / Tag Overview", open=False, visible=True)
619
+ with overview_acc:
620
+ overview_md = gr.Markdown(build_overview("fo"))
621
 
622
def on_tag(sentence, lang_current):
    """Tag ``sentence`` and build the updates for the Tag button's outputs.

    Args:
        sentence: Text passed straight to ``run_model``.
        lang_current: Language code forwarded to ``render`` and echoed back;
            compared against ``"fo"`` and ``"en"`` to pick the visible
            toggle buttons.

    Returns:
        A 10-tuple, in this order: the rows from ``run_model``, an update
        showing the main table, an update for the expanded table, an update
        for the overview text, an update making the expanded accordion
        visible, four visibility updates for the language buttons
        (fo-on, fo-off, en-on, en-off), and ``lang_current`` unchanged.
    """
    tagged_rows = run_model(sentence)
    main_table, expanded_table, overview_text = render(tagged_rows, lang_current)

    fo_active = lang_current == "fo"
    en_active = lang_current == "en"
    # Each language has an "on" (primary) and an "off" (secondary) button;
    # exactly one of each pair is shown. Order: fo_on, fo_off, en_on, en_off.
    toggle_visibility = (fo_active, not fo_active, en_active, not en_active)

    return (
        tagged_rows,
        gr.update(value=main_table, visible=True),
        gr.update(value=expanded_table),
        gr.update(value=overview_text),
        gr.update(visible=True),  # expanded_acc
        # results_hdr is always visible, so it needs no update here
        *[gr.update(visible=shown) for shown in toggle_visibility],
        lang_current,
    )
642
 
643
def on_set_lang(rows, lang_value):
    """Re-render already tagged ``rows`` in ``lang_value`` without re-tagging.

    Args:
        rows: Row state handed to ``render`` as-is.
        lang_value: Language code forwarded to ``render`` and returned as the
            new language state; compared against ``"fo"`` and ``"en"`` to
            pick the visible toggle buttons.

    Returns:
        An 8-tuple, in this order: ``lang_value``, updates for the main
        table, the expanded table and the overview text, then four
        visibility updates for the language buttons
        (fo-on, fo-off, en-on, en-off).
    """
    main_table, expanded_table, overview_text = render(rows, lang_value)

    fo_active = lang_value == "fo"
    en_active = lang_value == "en"
    # Order matches the button outputs: fo_on, fo_off, en_on, en_off.
    toggle_visibility = (fo_active, not fo_active, en_active, not en_active)

    return (
        lang_value,
        gr.update(value=main_table),
        gr.update(value=expanded_table),
        gr.update(value=overview_text),
        *[gr.update(visible=shown) for shown in toggle_visibility],
    )
659
 
660
def on_set_fo(rows):
    """Switch the display language to ``"fo"`` by delegating to ``on_set_lang``."""
    return on_set_lang(rows, lang_value="fo")
662
+
663
def on_set_en(rows):
    """Switch the display language to ``"en"`` by delegating to ``on_set_lang``."""
    return on_set_lang(rows, lang_value="en")
665
+
666
  btn.click(
667
  on_tag,
668
+ inputs=[inp, lang_state],
669
+ outputs=[state, out_df, out_mean_df, overview_md, expanded_acc,
670
+ btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state],
671
  queue=False,
672
  )
673
 
674
+ # Language switch (does NOT rerun the model; just re-renders existing rows)
675
+ btn_lang_fo_on.click(
676
+ on_set_fo,
677
+ inputs=[state],
678
+ outputs=[lang_state, out_df, out_mean_df, overview_md,
679
+ btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off],
680
+ queue=False,
681
+ )
682
+ btn_lang_fo_off.click(
683
+ on_set_fo,
684
+ inputs=[state],
685
+ outputs=[lang_state, out_df, out_mean_df, overview_md,
686
+ btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off],
687
+ queue=False,
688
+ )
689
+ btn_lang_en_on.click(
690
+ on_set_en,
691
+ inputs=[state],
692
+ outputs=[lang_state, out_df, out_mean_df, overview_md,
693
+ btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off],
694
+ queue=False,
695
+ )
696
+ btn_lang_en_off.click(
697
+ on_set_en,
698
+ inputs=[state],
699
+ outputs=[lang_state, out_df, out_mean_df, overview_md,
700
+ btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off],
701
  queue=False,
702
  )
703
 
704
  if __name__ == "__main__":
705
+ demo.launch()