unijoh committed on
Commit
cc47e8b
·
verified ·
1 Parent(s): d66b86b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -215
app.py CHANGED
@@ -11,29 +11,23 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification
11
  # Config
12
  # ----------------------------
13
  MODEL_ID = "Setur/BRAGD"
14
- TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must match model labels
15
- LABELS_FILEPATH = "tag_labels.json" # add to repo root (FO+EN labels)
16
- HF_TOKEN = os.getenv("BRAGD") # Space secret
17
 
18
  if not HF_TOKEN:
19
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
20
  if not os.path.exists(LABELS_FILEPATH):
21
  raise RuntimeError(f"Missing {LABELS_FILEPATH}. Add it to the Space repo root.")
22
 
23
- # Match your demo.py intervals
24
  INTERVALS = (
25
  (15, 29), (30, 33), (34, 36), (37, 41), (42, 43), (44, 45), (46, 50),
26
  (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
27
  )
28
 
29
  GROUP_ORDER = ["subcategory","gender","number","case","article","proper","degree","declension","mood","voice","tense","person","definiteness"]
 
30
 
31
- # You said Subcategory B doesn't exist and will be deleted from the CSV:
32
- HIDE_CODES = {"subcategory": {"B"}}
33
-
34
- # ----------------------------
35
- # UI text
36
- # ----------------------------
37
  UI = {
38
  "fo": {"w":"Orð", "t":"Mark", "s":"Útgreining", "m":"Útgreinað marking"},
39
  "en": {"w":"Word","t":"Tag", "s":"Analysis", "m":"Expanded tags"},
@@ -41,24 +35,21 @@ UI = {
41
 
42
  MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
43
 
44
- # Theme color: #89AFA9 (+ close shades) + system font
45
  CSS = """
46
  :root{
47
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
48
  --primary-100:#E1ECEA; --primary-200:#C6DAD6;
49
- --page-bg: #f7f7f8;
50
  }
51
 
52
- /* Force a consistent page background (and remove white "cards" where possible) */
53
- html, body{
54
- background: var(--page-bg) !important;
55
- }
56
- .gradio-container{
57
  background: var(--page-bg) !important;
58
  }
59
  body, .gradio-container, .prose, .markdown, textarea, input, select, button, table{
60
  font-family:-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, "Noto Sans", sans-serif !important;
61
  }
 
62
 
63
  /* Primary button */
64
  .gr-button-primary, button.primary, .primary{
@@ -67,105 +58,79 @@ body, .gradio-container, .prose, .markdown, textarea, input, select, button, tab
67
  color:#0b1b19!important;
68
  }
69
  .gr-button-primary:hover, button.primary:hover, .primary:hover{ background:var(--primary-600)!important; }
70
- a{ color:var(--primary-700)!important; }
71
 
72
- /* Remove "card" background around textbox wrapper; keep textarea readable */
 
 
 
 
 
 
 
 
73
  #input_box, #input_box > div, #input_box .wrap, #input_box .container{
74
  background: transparent !important;
75
- box-shadow: none !important;
76
- border: 0 !important;
77
  }
 
78
  #input_box textarea{
79
- background: #ffffff !important;
80
  }
81
 
82
- /* Dataframe column wrapping: keep Orð + Mark on one line */
83
- .gr-dataframe table td:nth-child(1),
84
- .gr-dataframe table th:nth-child(1){
85
- white-space: nowrap !important;
86
- width: 18% !important;
87
  }
88
- .gr-dataframe table td:nth-child(2),
89
- .gr-dataframe table th:nth-child(2){
90
- white-space: nowrap !important;
91
- width: 18% !important;
92
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
93
  }
94
- .gr-dataframe table td:nth-child(3),
95
- .gr-dataframe table th:nth-child(3){
96
- white-space: normal !important;
97
- width: 64% !important;
98
  }
99
 
100
- /* Results header: clean left-title / right-language bar (no white container blocks) */
101
  #results_hdr{
102
  display:flex;
103
  align-items:center;
104
  justify-content:space-between;
105
- gap: 12px;
106
- padding: 0;
107
- background: transparent !important;
108
  }
109
- #results_hdr *{
110
- background: transparent !important;
111
- }
112
- #results_hdr .gr-block,
113
- #results_hdr .gr-form,
114
- #results_hdr .gr-box,
115
- #results_hdr .gr-panel,
116
- #results_hdr .gr-group{
117
- background: transparent !important;
118
- box-shadow: none !important;
119
- border: 0 !important;
120
  }
121
 
122
- /* Language toggle: segmented buttons, selected matches primary button */
123
- .lang_toggle{
124
- display:flex;
125
- justify-content:flex-end;
126
- align-items:center;
127
- }
128
- .lang_toggle fieldset{
129
- border: 0 !important;
130
- padding: 0 !important;
131
- margin: 0 !important;
132
- background: transparent !important;
133
- }
134
- .lang_toggle .wrap{
135
- display:flex !important;
136
- gap: 10px !important;
137
- background: transparent !important;
138
- }
139
- .lang_toggle input{
140
- display:none !important; /* no cursor ever */
141
- }
142
- .lang_toggle label{
143
- cursor:pointer;
144
- padding: 9px 14px;
145
- border-radius: 12px;
146
- border: 1px solid rgba(0,0,0,.14);
147
- background: transparent !important; /* match page background */
148
- user-select:none;
149
- font-size: 0.98rem;
150
- box-shadow: none !important;
151
- }
152
- .lang_toggle label:hover{
153
- border-color: rgba(0,0,0,.22);
154
- }
155
- .lang_toggle input:checked + span{
156
- background: var(--primary-500) !important;
157
- color:#0b1b19 !important;
158
- border-radius: 12px;
159
- padding: 9px 14px;
160
- border: 1px solid var(--primary-600) !important;
161
  display:inline-block;
 
 
 
 
 
 
162
  }
163
 
164
- /* Slightly smaller primary button */
165
- .gr-button-primary{ padding: 0.35rem 0.85rem !important; font-size: 0.95rem !important; }
166
-
167
- /* Make the right-side title "Marka" slightly bigger */
168
- #info_panel h2{ margin-top: 0.2rem; }
 
 
 
 
169
  """
170
 
171
  # ----------------------------
@@ -209,9 +174,6 @@ def group_from_col(col: str):
209
  return (g, col.split()[-1])
210
  return (None,None)
211
 
212
- # ----------------------------
213
- # Decode helpers (your logic)
214
- # ----------------------------
215
  def process_tag_features(tag_to_features: dict, intervals):
216
  arrs = [np.array(tpl) for tpl in set(tuple(a) for a in tag_to_features.values())]
217
  wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
@@ -230,23 +192,19 @@ def predict_vectors(logits, attention_mask, begin_tokens, dict_intervals, vec_le
230
  for idx in range(len(logits)):
231
  if attention_mask[idx].item()!=1 or begin_tokens[idx]!=1:
232
  continue
233
-
234
  pred = logits[idx]
235
  vec = torch.zeros(vec_len, device=logits.device)
236
-
237
  wt = torch.argmax(softmax(pred[0:15])).item()
238
  vec[wt]=1
239
-
240
  for (a,b) in dict_intervals.get(wt, []):
241
  seg = pred[a:b+1]
242
  k = torch.argmax(softmax(seg)).item()
243
  vec[a+k]=1
244
-
245
  vectors.append(vec)
246
  return vectors
247
 
248
  # ----------------------------
249
- # Load labels (FO/EN)
250
  # ----------------------------
251
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
252
  LABELS = json.load(f)
@@ -262,8 +220,7 @@ def label_for(lang: str, group: str, wc: str, code: str) -> str:
262
  def clean_label(s: str) -> str:
263
  s = (s or "").strip()
264
  s = re.sub(r"\s+", " ", s)
265
- s = s.strip(" -;,:")
266
- return s
267
 
268
  # ----------------------------
269
  # Load model + mapping
@@ -275,14 +232,12 @@ model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN
275
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
276
  model.to(device); model.eval()
277
 
278
- if hasattr(model, "config") and hasattr(model.config, "num_labels"):
279
- if model.config.num_labels != VEC_LEN:
280
- raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
281
 
282
  DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
283
 
284
- # Build GROUPS from CSV headers
285
- GROUPS = defaultdict(list) # group -> [(idx, code, colname)]
286
  for i,col in enumerate(FEATURE_COLS):
287
  g,code = group_from_col(col)
288
  if g and code not in HIDE_CODES.get(g, set()):
@@ -306,14 +261,7 @@ def group_code(vec: torch.Tensor, group: str) -> str:
306
  return code
307
  return ""
308
 
309
- # ----------------------------
310
- # Display rules
311
- # ----------------------------
312
- HIDE_IN_ANALYSIS = {
313
- ("D", "subcategory", "G"), # stýrir falli
314
- ("D", "subcategory", "N"), # stýrir ikki falli
315
- }
316
-
317
  VOICE_ANALYSIS = {
318
  "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
319
  "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
@@ -324,20 +272,17 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
324
  tag = vector_to_tag(vec)
325
  wc = wc_code(vec)
326
 
327
- # DGd override
328
  if tag == "DGd":
329
  return "fyriseting" if lang=="fo" else "preposition"
330
 
331
  mood = group_code(vec, "mood")
332
- if mood == "U": # luttøkuháttur / supine
333
  sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang=="fo" else "supine")
334
  vcode = group_code(vec, "voice") or "v"
335
  vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
336
  return f"{clean_label(sup)}, {clean_label(vlabel)}"
337
 
338
  parts = []
339
-
340
- # Pronouns + conjunctions: subcategory already carries the head noun
341
  if wc in {"P","C"}:
342
  subc = group_code(vec, "subcategory")
343
  subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
@@ -356,13 +301,8 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
356
  continue
357
  if (wc, g, c) in HIDE_IN_ANALYSIS:
358
  continue
359
-
360
- lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c) or ""
361
- lbl = clean_label(lbl)
362
- if not lbl:
363
- continue
364
-
365
- if lbl not in parts:
366
  parts.append(lbl)
367
 
368
  return ", ".join(parts)
@@ -371,24 +311,20 @@ def expanded_text(vec: torch.Tensor, lang: str) -> str:
371
  lang = "fo" if lang=="fo" else "en"
372
  wc = wc_code(vec)
373
  parts = []
374
-
375
  wc_lbl = label_for(lang, "word_class", wc, wc)
376
  parts.append(f"{wc} – {wc_lbl}" if wc_lbl else wc)
377
-
378
  for g in GROUP_ORDER:
379
  c = group_code(vec, g)
380
  if not c:
381
  continue
382
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
383
  parts.append(f"{c} – {lbl}" if lbl else c)
384
-
385
  return "; ".join([p for p in parts if p])
386
 
387
  def compute_codes_by_wc():
388
  codes = defaultdict(lambda: defaultdict(set))
389
  for arr in tag_to_features.values():
390
  arr = np.array(arr)
391
-
392
  wc = None
393
  for idx,code,_ in GROUPS["word_class"]:
394
  if arr[idx]==1:
@@ -396,7 +332,6 @@ def compute_codes_by_wc():
396
  break
397
  if not wc:
398
  continue
399
-
400
  for g in GROUP_ORDER:
401
  hidden = HIDE_CODES.get(g, set())
402
  for idx,code,_ in GROUPS.get(g, []):
@@ -404,7 +339,6 @@ def compute_codes_by_wc():
404
  continue
405
  if arr[idx]==1:
406
  codes[wc][g].add(code)
407
-
408
  return codes
409
 
410
  CODES_BY_WC = compute_codes_by_wc()
@@ -413,43 +347,29 @@ def build_overview(lang: str) -> str:
413
  lang = "fo" if lang=="fo" else "en"
414
  title = "### Markayvirlit" if lang=="fo" else "### Tag Overview"
415
  lines = [title, ""]
416
-
417
  for wc in sorted(CODES_BY_WC.keys()):
418
  wcl = label_for(lang, "word_class", wc, wc) or ""
419
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
420
-
421
  for g in GROUP_ORDER:
422
  cs = sorted(CODES_BY_WC[wc].get(g, set()))
423
  if not cs:
424
  continue
425
  group_name = {
426
- "fo": {
427
- "subcategory":"Undirflokkur", "gender":"Kyn", "number":"Tal", "case":"Fall",
428
- "article":"Bundni/óbundni", "proper":"Sernavn / felagsnavn", "degree":"Stig",
429
- "declension":"Bending", "mood":"Háttur", "voice":"Søgn", "tense":"Tíð",
430
- "person":"Persónur", "definiteness":"Bundni/óbundni",
431
- },
432
- "en": {
433
- "subcategory":"Subcategory", "gender":"Gender", "number":"Number", "case":"Case",
434
- "article":"Definiteness", "proper":"Proper/common noun", "degree":"Degree",
435
- "declension":"Declension", "mood":"Mood", "voice":"Voice", "tense":"Tense",
436
- "person":"Person", "definiteness":"Definiteness",
437
- }
438
  }[lang].get(g, g)
439
-
440
  lines.append(f"**{group_name}**")
441
  for c in cs:
442
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
443
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
444
  lines.append("")
445
-
446
  lines.append("")
447
-
448
  return "\n".join(lines).strip()
449
 
450
- # ----------------------------
451
- # Inference
452
- # ----------------------------
453
  def run_model(sentence: str):
454
  s = (sentence or "").strip()
455
  if not s:
@@ -457,24 +377,13 @@ def run_model(sentence: str):
457
  tokens = simp_tok(s)
458
  if not tokens:
459
  return []
460
-
461
- enc = tokenizer(
462
- tokens,
463
- is_split_into_words=True,
464
- add_special_tokens=True,
465
- max_length=128,
466
- padding="max_length",
467
- truncation=True,
468
- return_attention_mask=True,
469
- return_tensors="pt",
470
- )
471
-
472
  input_ids = enc["input_ids"].to(device)
473
  attention_mask = enc["attention_mask"].to(device)
474
  word_ids = enc.word_ids(batch_index=0)
475
 
476
- begin = []
477
- last = None
478
  for wid in word_ids:
479
  if wid is None:
480
  begin.append(0)
@@ -489,9 +398,7 @@ def run_model(sentence: str):
489
 
490
  vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)
491
 
492
- rows = []
493
- vec_i = 0
494
- seen = set()
495
  for i,wid in enumerate(word_ids):
496
  if wid is None or begin[i]!=1 or wid in seen:
497
  continue
@@ -506,41 +413,23 @@ def render(rows_state, lang: str):
506
  lang = "fo" if lang=="fo" else "en"
507
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
508
  dfm_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["m"]]
509
-
510
  if not rows_state:
511
- empty_main = pd.DataFrame(columns=df_cols)
512
- empty_mean = pd.DataFrame(columns=dfm_cols)
513
- return empty_main, empty_mean, build_overview(lang)
514
-
515
  out_main, out_mean = [], []
516
  for r in rows_state:
517
  vec = torch.tensor(r["vec"])
518
  tag = vector_to_tag(vec)
519
  out_main.append([r["word"], tag, analysis_text(vec, lang)])
520
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
 
521
 
522
- return (
523
- pd.DataFrame(out_main, columns=df_cols),
524
- pd.DataFrame(out_mean, columns=dfm_cols),
525
- build_overview(lang),
526
- )
527
-
528
- # ----------------------------
529
- # Gradio UI
530
- # ----------------------------
531
  theme = gr.themes.Soft()
532
 
533
  with gr.Blocks(theme=theme, css=CSS, title="Marka") as demo:
534
- # Layout: textbox left, info right, button under info
535
  with gr.Row(equal_height=True):
536
- with gr.Column(scale=2):
537
- inp = gr.Textbox(
538
- lines=6,
539
- placeholder="Skriva her ... / Type here ...",
540
- show_label=False,
541
- elem_id="input_box",
542
- )
543
- with gr.Column(scale=1, min_width=320, elem_id="info_panel"):
544
  gr.Markdown(
545
  "## Marka\n"
546
  "Skriv ein setning í kassan og fá hann markaðan.\n\n"
@@ -550,9 +439,10 @@ with gr.Blocks(theme=theme, css=CSS, title="Marka") as demo:
550
 
551
  state = gr.State([])
552
 
553
- # Results header row (always visible; table stays hidden until Tag)
554
- with gr.Row(elem_id="results_hdr"):
555
- gr.Markdown("### Úrslit / Results")
 
556
  lang = gr.Radio(
557
  choices=[("Føroyskt","fo"), ("English","en")],
558
  value="fo",
@@ -562,27 +452,19 @@ with gr.Blocks(theme=theme, css=CSS, title="Marka") as demo:
562
 
563
  out_df = gr.Dataframe(
564
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["s"]]),
565
- wrap=True,
566
- interactive=False,
567
- show_label=False,
568
- row_count=(0, "fixed"),
569
- col_count=(3, "fixed"),
570
  visible=False,
571
  )
572
 
573
- # Expanded tags: hidden until tagged
574
  expanded_acc = gr.Accordion("Útgreinað marking / Expanded tags", open=False, visible=False)
575
  with expanded_acc:
576
  out_mean_df = gr.Dataframe(
577
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["m"]]),
578
- wrap=True,
579
- interactive=False,
580
- show_label=False,
581
- row_count=(0, "fixed"),
582
- col_count=(3, "fixed"),
583
  )
584
 
585
- # Markayvirlit: always visible
586
  overview_acc = gr.Accordion("Markayvirlit / Tag Overview", open=False, visible=True)
587
  with overview_acc:
588
  overview_md = gr.Markdown(build_overview("fo"))
@@ -596,20 +478,17 @@ with gr.Blocks(theme=theme, css=CSS, title="Marka") as demo:
596
  gr.update(value=df_mean),
597
  gr.update(value=overview),
598
  gr.update(visible=True), # expanded_acc
 
599
  )
600
 
601
  def on_lang(rows, lang_choice):
602
  df_main, df_mean, overview = render(rows, lang_choice)
603
- return (
604
- gr.update(value=df_main),
605
- gr.update(value=df_mean),
606
- gr.update(value=overview),
607
- )
608
 
609
  btn.click(
610
  on_tag,
611
  inputs=[inp, lang],
612
- outputs=[state, out_df, out_mean_df, overview_md, expanded_acc],
613
  queue=False,
614
  )
615
 
 
11
  # Config
12
  # ----------------------------
13
  MODEL_ID = "Setur/BRAGD"
14
+ TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
15
+ LABELS_FILEPATH = "tag_labels.json"
16
+ HF_TOKEN = os.getenv("BRAGD")
17
 
18
  if not HF_TOKEN:
19
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
20
  if not os.path.exists(LABELS_FILEPATH):
21
  raise RuntimeError(f"Missing {LABELS_FILEPATH}. Add it to the Space repo root.")
22
 
 
23
  INTERVALS = (
24
  (15, 29), (30, 33), (34, 36), (37, 41), (42, 43), (44, 45), (46, 50),
25
  (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
26
  )
27
 
28
  GROUP_ORDER = ["subcategory","gender","number","case","article","proper","degree","declension","mood","voice","tense","person","definiteness"]
29
+ HIDE_CODES = {"subcategory": {"B"}} # Subcategory B to be removed
30
 
 
 
 
 
 
 
31
  UI = {
32
  "fo": {"w":"Orð", "t":"Mark", "s":"Útgreining", "m":"Útgreinað marking"},
33
  "en": {"w":"Word","t":"Tag", "s":"Analysis", "m":"Expanded tags"},
 
35
 
36
  MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
37
 
 
38
  CSS = """
39
  :root{
40
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
41
  --primary-100:#E1ECEA; --primary-200:#C6DAD6;
42
+ --page-bg:#f7f7f8;
43
  }
44
 
45
+ /* Page background */
46
+ html, body, .gradio-container{
 
 
 
47
  background: var(--page-bg) !important;
48
  }
49
  body, .gradio-container, .prose, .markdown, textarea, input, select, button, table{
50
  font-family:-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, "Noto Sans", sans-serif !important;
51
  }
52
+ a{ color:var(--primary-700)!important; }
53
 
54
  /* Primary button */
55
  .gr-button-primary, button.primary, .primary{
 
58
  color:#0b1b19!important;
59
  }
60
  .gr-button-primary:hover, button.primary:hover, .primary:hover{ background:var(--primary-600)!important; }
61
+ .gr-button-primary{ padding:0.35rem 0.85rem!important; font-size:0.95rem!important; }
62
 
63
+ /* --- Make the entire left input area blend with page background --- */
64
+ #input_col, #input_col *{
65
+ background: transparent !important;
66
+ }
67
+ #input_col .gr-block, #input_col .gr-panel, #input_col .gr-box, #input_col .gr-group, #input_col .gr-form{
68
+ background: transparent !important;
69
+ box-shadow:none !important;
70
+ border:0 !important;
71
+ }
72
  #input_box, #input_box > div, #input_box .wrap, #input_box .container{
73
  background: transparent !important;
74
+ box-shadow:none !important;
75
+ border:0 !important;
76
  }
77
+ /* Keep the actual typing area white */
78
  #input_box textarea{
79
+ background:#ffffff !important;
80
  }
81
 
82
+ /* Dataframe columns: keep Orð + Mark single-line */
83
+ .gr-dataframe table td:nth-child(1), .gr-dataframe table th:nth-child(1){
84
+ white-space: nowrap !important; width: 18% !important;
 
 
85
  }
86
+ .gr-dataframe table td:nth-child(2), .gr-dataframe table th:nth-child(2){
87
+ white-space: nowrap !important; width: 18% !important;
 
 
88
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
89
  }
90
+ .gr-dataframe table td:nth-child(3), .gr-dataframe table th:nth-child(3){
91
+ white-space: normal !important; width: 64% !important;
 
 
92
  }
93
 
94
+ /* Results header row: no card backgrounds */
95
  #results_hdr{
96
  display:flex;
97
  align-items:center;
98
  justify-content:space-between;
99
+ gap:12px;
100
+ padding:0;
101
+ background:transparent !important;
102
  }
103
+ #results_hdr .gr-block, #results_hdr .gr-panel, #results_hdr .gr-box, #results_hdr .gr-group, #results_hdr .gr-form{
104
+ background:transparent !important;
105
+ box-shadow:none !important;
106
+ border:0 !important;
 
 
 
 
 
 
 
107
  }
108
 
109
+ /* Language toggle: look like the Marka button */
110
+ .lang_toggle fieldset{ border:0!important; padding:0!important; margin:0!important; background:transparent!important; }
111
+ .lang_toggle .wrap{ display:flex!important; gap:10px!important; background:transparent!important; }
112
+ .lang_toggle input{ display:none!important; }
113
+
114
+ /* Base button style (same geometry as Marka button) */
115
+ .lang_toggle label span{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  display:inline-block;
117
+ padding:0.35rem 0.85rem;
118
+ font-size:0.95rem;
119
+ border-radius:10px;
120
+ border:1px solid var(--primary-600);
121
+ background:transparent;
122
+ color:#0b1b19;
123
  }
124
 
125
+ /* Selected = exactly like Marka button */
126
+ .lang_toggle input:checked + span{
127
+ background:var(--primary-500)!important;
128
+ border-color:var(--primary-600)!important;
129
+ color:#0b1b19!important;
130
+ }
131
+ .lang_toggle label:hover span{
132
+ background:var(--primary-200);
133
+ }
134
  """
135
 
136
  # ----------------------------
 
174
  return (g, col.split()[-1])
175
  return (None,None)
176
 
 
 
 
177
  def process_tag_features(tag_to_features: dict, intervals):
178
  arrs = [np.array(tpl) for tpl in set(tuple(a) for a in tag_to_features.values())]
179
  wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
 
192
  for idx in range(len(logits)):
193
  if attention_mask[idx].item()!=1 or begin_tokens[idx]!=1:
194
  continue
 
195
  pred = logits[idx]
196
  vec = torch.zeros(vec_len, device=logits.device)
 
197
  wt = torch.argmax(softmax(pred[0:15])).item()
198
  vec[wt]=1
 
199
  for (a,b) in dict_intervals.get(wt, []):
200
  seg = pred[a:b+1]
201
  k = torch.argmax(softmax(seg)).item()
202
  vec[a+k]=1
 
203
  vectors.append(vec)
204
  return vectors
205
 
206
  # ----------------------------
207
+ # Load labels
208
  # ----------------------------
209
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
210
  LABELS = json.load(f)
 
def clean_label(s: str) -> str:
    """Normalize a label string for display.

    Collapses every run of whitespace to a single space and trims
    surrounding spaces and the punctuation characters ``-;,:``.

    Args:
        s: Raw label text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned label, or ``""`` when nothing remains.
    """
    s = (s or "").strip()
    s = re.sub(r"\s+", " ", s)
    # The trailing .strip() is kept from the committed version; after the
    # whitespace collapse above it should be a no-op safeguard.
    return s.strip(" -;,:").strip()
 
224
 
225
  # ----------------------------
226
  # Load model + mapping
 
232
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
233
  model.to(device); model.eval()
234
 
235
+ if hasattr(model, "config") and hasattr(model.config, "num_labels") and model.config.num_labels != VEC_LEN:
236
+ raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
 
237
 
238
  DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
239
 
240
+ GROUPS = defaultdict(list)
 
241
  for i,col in enumerate(FEATURE_COLS):
242
  g,code = group_from_col(col)
243
  if g and code not in HIDE_CODES.get(g, set()):
 
261
  return code
262
  return ""
263
 
264
+ HIDE_IN_ANALYSIS = {("D","subcategory","G"), ("D","subcategory","N")}
 
 
 
 
 
 
 
265
  VOICE_ANALYSIS = {
266
  "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
267
  "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
 
272
  tag = vector_to_tag(vec)
273
  wc = wc_code(vec)
274
 
 
275
  if tag == "DGd":
276
  return "fyriseting" if lang=="fo" else "preposition"
277
 
278
  mood = group_code(vec, "mood")
279
+ if mood == "U":
280
  sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang=="fo" else "supine")
281
  vcode = group_code(vec, "voice") or "v"
282
  vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
283
  return f"{clean_label(sup)}, {clean_label(vlabel)}"
284
 
285
  parts = []
 
 
286
  if wc in {"P","C"}:
287
  subc = group_code(vec, "subcategory")
288
  subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
 
301
  continue
302
  if (wc, g, c) in HIDE_IN_ANALYSIS:
303
  continue
304
+ lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
305
+ if lbl and lbl not in parts:
 
 
 
 
 
306
  parts.append(lbl)
307
 
308
  return ", ".join(parts)
 
311
  lang = "fo" if lang=="fo" else "en"
312
  wc = wc_code(vec)
313
  parts = []
 
314
  wc_lbl = label_for(lang, "word_class", wc, wc)
315
  parts.append(f"{wc} – {wc_lbl}" if wc_lbl else wc)
 
316
  for g in GROUP_ORDER:
317
  c = group_code(vec, g)
318
  if not c:
319
  continue
320
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
321
  parts.append(f"{c} – {lbl}" if lbl else c)
 
322
  return "; ".join([p for p in parts if p])
323
 
324
  def compute_codes_by_wc():
325
  codes = defaultdict(lambda: defaultdict(set))
326
  for arr in tag_to_features.values():
327
  arr = np.array(arr)
 
328
  wc = None
329
  for idx,code,_ in GROUPS["word_class"]:
330
  if arr[idx]==1:
 
332
  break
333
  if not wc:
334
  continue
 
335
  for g in GROUP_ORDER:
336
  hidden = HIDE_CODES.get(g, set())
337
  for idx,code,_ in GROUPS.get(g, []):
 
339
  continue
340
  if arr[idx]==1:
341
  codes[wc][g].add(code)
 
342
  return codes
343
 
344
  CODES_BY_WC = compute_codes_by_wc()
 
347
  lang = "fo" if lang=="fo" else "en"
348
  title = "### Markayvirlit" if lang=="fo" else "### Tag Overview"
349
  lines = [title, ""]
 
350
  for wc in sorted(CODES_BY_WC.keys()):
351
  wcl = label_for(lang, "word_class", wc, wc) or ""
352
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
 
353
  for g in GROUP_ORDER:
354
  cs = sorted(CODES_BY_WC[wc].get(g, set()))
355
  if not cs:
356
  continue
357
  group_name = {
358
+ "fo": {"subcategory":"Undirflokkur","gender":"Kyn","number":"Tal","case":"Fall","article":"Bundni/óbundni",
359
+ "proper":"Sernavn / felagsnavn","degree":"Stig","declension":"Bending","mood":"Háttur","voice":"Søgn",
360
+ "tense":"Tíð","person":"Persónur","definiteness":"Bundni/óbundni"},
361
+ "en": {"subcategory":"Subcategory","gender":"Gender","number":"Number","case":"Case","article":"Definiteness",
362
+ "proper":"Proper/common noun","degree":"Degree","declension":"Declension","mood":"Mood","voice":"Voice",
363
+ "tense":"Tense","person":"Person","definiteness":"Definiteness"},
 
 
 
 
 
 
364
  }[lang].get(g, g)
 
365
  lines.append(f"**{group_name}**")
366
  for c in cs:
367
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
368
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
369
  lines.append("")
 
370
  lines.append("")
 
371
  return "\n".join(lines).strip()
372
 
 
 
 
373
  def run_model(sentence: str):
374
  s = (sentence or "").strip()
375
  if not s:
 
377
  tokens = simp_tok(s)
378
  if not tokens:
379
  return []
380
+ enc = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=128,
381
+ padding="max_length", truncation=True, return_attention_mask=True, return_tensors="pt")
 
 
 
 
 
 
 
 
 
 
382
  input_ids = enc["input_ids"].to(device)
383
  attention_mask = enc["attention_mask"].to(device)
384
  word_ids = enc.word_ids(batch_index=0)
385
 
386
+ begin, last = [], None
 
387
  for wid in word_ids:
388
  if wid is None:
389
  begin.append(0)
 
398
 
399
  vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)
400
 
401
+ rows, vec_i, seen = [], 0, set()
 
 
402
  for i,wid in enumerate(word_ids):
403
  if wid is None or begin[i]!=1 or wid in seen:
404
  continue
 
413
  lang = "fo" if lang=="fo" else "en"
414
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
415
  dfm_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["m"]]
 
416
  if not rows_state:
417
+ return (pd.DataFrame(columns=df_cols), pd.DataFrame(columns=dfm_cols), build_overview(lang))
 
 
 
418
  out_main, out_mean = [], []
419
  for r in rows_state:
420
  vec = torch.tensor(r["vec"])
421
  tag = vector_to_tag(vec)
422
  out_main.append([r["word"], tag, analysis_text(vec, lang)])
423
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
424
+ return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
425
 
 
 
 
 
 
 
 
 
 
426
  theme = gr.themes.Soft()
427
 
428
  with gr.Blocks(theme=theme, css=CSS, title="Marka") as demo:
 
429
  with gr.Row(equal_height=True):
430
+ with gr.Column(scale=2, elem_id="input_col"):
431
+ inp = gr.Textbox(lines=6, placeholder="Skriva her ... / Type here ...", show_label=False, elem_id="input_box")
432
+ with gr.Column(scale=1, min_width=320):
 
 
 
 
 
433
  gr.Markdown(
434
  "## Marka\n"
435
  "Skriv ein setning í kassan og fá hann markaðan.\n\n"
 
439
 
440
  state = gr.State([])
441
 
442
+ # Hide results header + toggle until Tag
443
+ results_hdr = gr.Row(elem_id="results_hdr", visible=False)
444
+ with results_hdr:
445
+ results_title = gr.Markdown("### Úrslit / Results")
446
  lang = gr.Radio(
447
  choices=[("Føroyskt","fo"), ("English","en")],
448
  value="fo",
 
452
 
453
  out_df = gr.Dataframe(
454
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["s"]]),
455
+ wrap=True, interactive=False, show_label=False,
456
+ row_count=(0, "fixed"), col_count=(3, "fixed"),
 
 
 
457
  visible=False,
458
  )
459
 
 
460
  expanded_acc = gr.Accordion("Útgreinað marking / Expanded tags", open=False, visible=False)
461
  with expanded_acc:
462
  out_mean_df = gr.Dataframe(
463
  value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["m"]]),
464
+ wrap=True, interactive=False, show_label=False,
465
+ row_count=(0, "fixed"), col_count=(3, "fixed"),
 
 
 
466
  )
467
 
 
468
  overview_acc = gr.Accordion("Markayvirlit / Tag Overview", open=False, visible=True)
469
  with overview_acc:
470
  overview_md = gr.Markdown(build_overview("fo"))
 
478
  gr.update(value=df_mean),
479
  gr.update(value=overview),
480
  gr.update(visible=True), # expanded_acc
481
+ gr.update(visible=True), # results_hdr
482
  )
483
 
def on_lang(rows, lang_choice):
    """Re-render the result tables and the tag overview in the chosen language.

    Passes the stored rows and the selected language to ``render`` and wraps
    each of its three results in a ``gr.update(value=...)``.

    Args:
        rows: The rows currently held in the UI state.
        lang_choice: The selected language value, forwarded to ``render``.

    Returns:
        A 3-tuple of updates, in order: main table, expanded table, overview.
    """
    df_main, df_mean, overview = render(rows, lang_choice)
    return (
        gr.update(value=df_main),
        gr.update(value=df_mean),
        gr.update(value=overview),
    )
 
 
 
 
487
 
488
  btn.click(
489
  on_tag,
490
  inputs=[inp, lang],
491
+ outputs=[state, out_df, out_mean_df, overview_md, expanded_acc, results_hdr],
492
  queue=False,
493
  )
494