unijoh commited on
Commit
22e1960
·
verified ·
1 Parent(s): 13dbad2

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +291 -369
  2. tag_labels.json +25 -21
app.py CHANGED
@@ -1,12 +1,10 @@
1
- import os
2
- import re
3
- import string
4
- import json
5
  from collections import defaultdict
6
 
7
  import gradio as gr
8
  import torch
9
  import numpy as np
 
10
  from transformers import AutoTokenizer, AutoModelForTokenClassification
11
 
12
  # ----------------------------
@@ -15,7 +13,7 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification
15
  MODEL_ID = "Setur/BRAGD"
16
  TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must match model labels
17
  LABELS_FILEPATH = "tag_labels.json" # add to repo root (FO+EN labels)
18
- HF_TOKEN = os.getenv("BRAGD") # Space secret name
19
 
20
  if not HF_TOKEN:
21
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
@@ -28,267 +26,178 @@ INTERVALS = (
28
  (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
29
  )
30
 
31
- GROUP_ORDER = [
32
- "subcategory", "gender", "number", "case", "article", "proper",
33
- "degree", "declension", "mood", "voice", "tense", "person", "definiteness"
34
- ]
35
 
36
- # You said subcategory B doesn't exist and will be deleted from the CSV
37
  HIDE_CODES = {"subcategory": {"B"}}
38
 
 
 
 
39
  UI = {
40
- "fo": {
41
- "title": "BRAGD-markarin",
42
- "inst": "Skriv ein setning og fá hann markaðan.",
43
- "model": "Model:",
44
- "word": "Orð",
45
- "tag": "Mark",
46
- "analysis": "Útgreining",
47
- "results": "Úrslit",
48
- "expanded": "Útgreinað marking",
49
- "legend": "Markingaryvirlit",
50
- "lang": "Mál",
51
- },
52
- "en": {
53
- "title": "BRAGD tagger",
54
- "inst": "Type a sentence and get it tagged.",
55
- "model": "Model:",
56
- "word": "Word",
57
- "tag": "Tag",
58
- "analysis": "Analysis",
59
- "results": "Results",
60
- "expanded": "Expanded tags",
61
- "legend": "Tag legend",
62
- "lang": "Language",
63
- },
64
  }
65
 
66
- # Theme color: #89AFA9 (+ close shades)
67
- CSS = r"""
 
 
68
  :root{
69
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
70
  --primary-100:#E1ECEA; --primary-200:#C6DAD6;
71
  }
72
- .gr-button-primary{
73
- background:var(--primary-500)!important;
74
- border-color:var(--primary-600)!important;
75
- color:#0b1b19!important;
76
- padding: 8px 14px !important;
77
- font-size: 14px !important;
78
  }
79
- .gr-button-primary:hover{ background:var(--primary-600)!important; }
 
 
 
80
  a{ color:var(--primary-700)!important; }
81
 
82
- /* tighten overall vertical spacing a bit */
83
- .gradio-container .prose{ margin: 0 !important; }
84
- #header_md h2, #header_md p { margin: 0.2rem 0 !important; }
85
-
86
- /* language dropdown: small, no big box */
87
- #lang_dd { max-width: 160px; }
88
- #lang_dd .wrap { padding-top: 0 !important; }
89
-
90
- /* results table */
91
- table.bragd {
92
- width: 100%;
93
- border-collapse: separate;
94
- border-spacing: 0;
95
- border: 1px solid rgba(0,0,0,0.08);
96
- border-radius: 12px;
97
- overflow: hidden;
98
  }
99
- table.bragd thead th{
100
- text-align: left;
101
- font-weight: 600;
102
- background: rgba(137,175,169,0.20);
103
- padding: 10px 12px;
104
- border-bottom: 1px solid rgba(0,0,0,0.08);
105
- font-size: 13px;
106
  }
107
- table.bragd tbody td{
108
- padding: 10px 12px;
109
- border-bottom: 1px solid rgba(0,0,0,0.06);
110
- vertical-align: top;
111
- font-size: 14px;
112
  }
113
- table.bragd tbody tr:last-child td{ border-bottom: none; }
114
- td.wordcol, td.tagcol { white-space: nowrap; }
115
- td.tagcol { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; }
116
- td.analysiscol { white-space: normal; }
117
-
118
- /* Make Orð/Word column fit content */
119
- td.wordcol { width: 1%; }
120
- td.tagcol { min-width: 8ch; width: 1%; }
121
 
122
- /* Expanded tags table a touch smaller */
123
- table.bragd.small tbody td, table.bragd.small thead th { font-size: 13px; }
124
 
125
- /* header row for results + language picker */
126
- #results_header .prose h3 { margin: 0.2rem 0 !important; }
127
  """
128
 
129
  # ----------------------------
130
- # Utilities
131
  # ----------------------------
132
  def simp_tok(sentence: str):
133
  return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
134
 
135
- def load_tag_mappings(tags_filepath: str):
136
- import pandas as pd # local import keeps cold-start slightly lighter
137
- tags_df = pd.read_csv(tags_filepath)
138
-
139
- feature_cols = list(tags_df.columns[1:])
140
- tag_to_features = {row["Original Tag"]: row[1:].values.astype(int) for _, row in tags_df.iterrows()}
141
- features_to_tag = {tuple(row[1:].values.astype(int)): row["Original Tag"] for _, row in tags_df.iterrows()}
142
-
143
  return tag_to_features, features_to_tag, len(feature_cols), feature_cols
144
 
145
  def group_from_col(col: str):
146
- if col == "Article":
147
- return ("article", "A")
148
- if col.startswith("No-Article "):
149
- return ("article", col.split()[-1])
150
- if col == "Proper Noun":
151
- return ("proper", "P")
152
- if col.startswith("Not-Proper-Noun "):
153
- return ("proper", col.split()[-1])
154
 
155
  prefixes = [
156
- ("Word Class ", "word_class"),
157
- ("Subcategory ", "subcategory"), ("No-Subcategory ", "subcategory"),
158
- ("Gender ", "gender"), ("No-Gender ", "gender"),
159
- ("Number ", "number"), ("No-Number ", "number"),
160
- ("Case ", "case"), ("No-Case ", "case"),
161
- ("Degree ", "degree"), ("No-Degree ", "degree"),
162
- ("Declension ", "declension"), ("No-Declension ", "declension"),
163
- ("Mood ", "mood"),
164
- ("Voice ", "voice"), ("No-Voice ", "voice"),
165
- ("Tense ", "tense"), ("No-Tense ", "tense"),
166
- ("Person ", "person"), ("No-Person ", "person"),
167
- ("Definite ", "definiteness"), ("Indefinite ", "definiteness"),
168
  ]
169
- for p, g in prefixes:
170
  if col.startswith(p):
171
  return (g, col.split()[-1])
 
172
 
173
- return (None, None)
174
-
 
175
  def process_tag_features(tag_to_features: dict, intervals):
176
- # Compute allowed intervals per POS (like demo.py)
177
- list_of_tags = list(tag_to_features.values())
178
- unique_arrays = [np.array(tpl) for tpl in set(tuple(arr) for arr in list_of_tags)]
179
-
180
- word_type_masks = {wt: [arr for arr in unique_arrays if arr[wt] == 1] for wt in range(15)}
181
- dict_intervals = {}
182
-
183
- for wt in range(15):
184
- labels = word_type_masks[wt]
185
  if not labels:
186
- dict_intervals[wt] = []
187
  continue
188
  sum_labels = np.sum(np.array(labels), axis=0)
189
- allowed = [interval for interval in intervals if np.sum(sum_labels[interval[0]:interval[1] + 1]) != 0]
190
- dict_intervals[wt] = allowed
191
-
192
- return dict_intervals
193
 
194
- def predict_vectors(logits: torch.Tensor, attention_mask: torch.Tensor, begin_tokens, dict_intervals, vec_len: int):
195
  softmax = torch.nn.Softmax(dim=0)
196
  vectors = []
197
-
198
  for idx in range(len(logits)):
199
- if attention_mask[idx].item() != 1:
200
- continue
201
- if begin_tokens[idx] != 1:
202
  continue
203
 
204
- pred_logits = logits[idx]
205
  vec = torch.zeros(vec_len, device=logits.device)
206
 
207
- # POS
208
- probs = softmax(pred_logits[0:15])
209
- wt = torch.argmax(probs).item()
210
- vec[wt] = 1
211
 
212
- # feature groups
213
- for (a, b) in dict_intervals.get(wt, []):
214
- seg = pred_logits[a:b + 1]
215
- probs = softmax(seg)
216
- k = torch.argmax(probs).item()
217
- vec[a + k] = 1
218
 
219
  vectors.append(vec)
220
-
221
  return vectors
222
 
223
- def clean_label(s: str) -> str:
224
- s = (s or "").strip()
225
- s = re.sub(r"\s+", " ", s)
226
- return s.strip(" -;:,")
227
-
228
- def html_escape(s: str) -> str:
229
- return (
230
- (s or "")
231
- .replace("&", "&")
232
- .replace("<", "&lt;")
233
- .replace(">", "&gt;")
234
- .replace('"', "&quot;")
235
- )
236
-
237
- def rows_to_table_html(headers, rows, small=False):
238
- cls = "bragd small" if small else "bragd"
239
- thead = "".join(f"<th>{html_escape(h)}</th>" for h in headers)
240
- body = []
241
- for r in rows:
242
- body.append(
243
- "<tr>"
244
- f"<td class='wordcol'>{html_escape(r[0])}</td>"
245
- f"<td class='tagcol'>{html_escape(r[1])}</td>"
246
- f"<td class='analysiscol'>{html_escape(r[2])}</td>"
247
- "</tr>"
248
- )
249
- tbody = "".join(body) if body else "<tr><td class='wordcol'></td><td class='tagcol'></td><td class='analysiscol'></td></tr>"
250
- return f"<table class='{cls}'><thead><tr>{thead}</tr></thead><tbody>{tbody}</tbody></table>"
251
-
252
  # ----------------------------
253
- # Load labels (FO+EN)
254
  # ----------------------------
255
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
256
  LABELS = json.load(f)
257
 
258
- def label_for(lang: str, group: str, wc_code: str, code: str) -> str:
259
- lang = "fo" if lang == "fo" else "en"
260
  by_wc = LABELS.get(lang, {}).get("by_word_class", {})
261
  glob = LABELS.get(lang, {}).get("global", {})
262
-
263
- if wc_code and wc_code in by_wc and code in by_wc[wc_code].get(group, {}):
264
- return by_wc[wc_code][group][code]
265
  return glob.get(group, {}).get(code, "")
266
 
 
 
 
 
 
 
267
  # ----------------------------
268
- # Load mapping CSV + model
269
  # ----------------------------
270
  tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
271
 
272
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
273
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
274
-
275
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
276
- model.to(device)
277
- model.eval()
278
 
279
  if hasattr(model, "config") and hasattr(model.config, "num_labels"):
280
  if model.config.num_labels != VEC_LEN:
281
- raise RuntimeError(
282
- f"Label size mismatch: model has num_labels={model.config.num_labels}, "
283
- f"but {TAGS_FILEPATH} implies {VEC_LEN}. You likely uploaded the wrong CSV."
284
- )
285
 
286
  DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
287
 
288
- # Build group lookup from CSV feature columns
289
- GROUPS = defaultdict(list) # group -> list[(idx, code, colname)]
290
- for i, col in enumerate(FEATURE_COLS):
291
- g, code = group_from_col(col)
292
  if g and code not in HIDE_CODES.get(g, set()):
293
  GROUPS[g].append((i, code, col))
294
 
@@ -296,99 +205,95 @@ def vector_to_tag(vec: torch.Tensor) -> str:
296
  return features_to_tag.get(tuple(vec.int().tolist()), "Unknown Tag")
297
 
298
  def wc_code(vec: torch.Tensor) -> str:
299
- for idx, code, _ in GROUPS["word_class"]:
300
- if int(vec[idx].item()) == 1:
301
  return code
302
  return ""
303
 
304
  def group_code(vec: torch.Tensor, group: str) -> str:
305
  hidden = HIDE_CODES.get(group, set())
306
- for idx, code, _ in GROUPS.get(group, []):
307
  if code in hidden:
308
  continue
309
- if int(vec[idx].item()) == 1:
310
  return code
311
  return ""
312
 
313
  # ----------------------------
314
- # Presentation logic
315
  # ----------------------------
316
- HIDE_IN_ANALYSIS_FO = {"stýrir falli", "stýrir ikki falli"}
317
- HIDE_IN_ANALYSIS_EN = {"governs case", "does not govern case"}
 
 
 
 
 
 
 
 
318
 
319
  def analysis_text(vec: torch.Tensor, lang: str) -> str:
320
  """
321
  Útgreining / Analysis:
322
- - only human text (no codes)
323
- - skip "stýrir falli" / "stýrir ikki falli"
324
- - DGd becomes ONLY "fyriseting"/"preposition"
325
- - pronouns and conjunctions start from subcategory (no duplicated base label)
326
  """
327
- lang = "fo" if lang == "fo" else "en"
328
- raw_tag = vector_to_tag(vec)
329
  wc = wc_code(vec)
330
 
331
- # DGd override: ONLY fyriseting / preposition
332
- if raw_tag == "DGd":
333
- return "fyriseting" if lang == "fo" else "preposition"
334
 
335
- # Determine whether to include base word-class label first
336
- include_wc = True
337
- if wc == "P": # pronouns: start from subcategory label
338
- include_wc = False
339
- if wc == "C": # conjunctions: prefer the subcategory phrase
340
- include_wc = False
341
 
342
- labels = []
343
 
344
- if include_wc:
345
- wc_lbl = clean_label(label_for(lang, "word_class", wc, wc) or wc)
346
- if wc_lbl:
347
- labels.append(wc_lbl)
 
 
 
 
 
 
348
 
349
- # Add groups in stable order
350
  for g in GROUP_ORDER:
351
  c = group_code(vec, g)
352
  if not c:
353
  continue
354
- lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
355
- if not lbl:
 
356
  continue
357
 
358
- if lang == "fo" and lbl in HIDE_IN_ANALYSIS_FO:
359
- continue
360
- if lang == "en" and lbl.lower() in HIDE_IN_ANALYSIS_EN:
361
- continue
362
-
363
- # for conjunctions: ensure the first visible label is the subcategory phrase
364
- if wc == "C" and g == "subcategory":
365
- labels.insert(0, lbl)
366
  continue
367
 
368
- labels.append(lbl)
369
-
370
- # Fallback if we removed wc label for pronouns/conjunctions and subcategory missing
371
- if not labels:
372
- wc_lbl = clean_label(label_for(lang, "word_class", wc, wc) or wc)
373
- if wc_lbl:
374
- labels = [wc_lbl]
375
-
376
- # Deduplicate while preserving order
377
- dedup = []
378
- seen = set()
379
- for x in labels:
380
- if x not in seen:
381
- dedup.append(x)
382
- seen.add(x)
383
 
384
- return ", ".join(dedup)
385
 
386
  def expanded_text(vec: torch.Tensor, lang: str) -> str:
387
  """
388
  Útgreinað marking / Expanded tags:
389
- includes code + label per group (useful for debugging).
390
  """
391
- lang = "fo" if lang == "fo" else "en"
392
  wc = wc_code(vec)
393
  parts = []
394
 
@@ -404,79 +309,63 @@ def expanded_text(vec: torch.Tensor, lang: str) -> str:
404
 
405
  return "; ".join([p for p in parts if p])
406
 
407
- def build_legend(lang: str) -> str:
408
- """
409
- Elaborate legend:
410
- Under each word class, show all letter codes that appear in the CURRENT CSV.
411
- """
412
- lang = "fo" if lang == "fo" else "en"
413
-
414
- # Build codes-by-wc from the CSV mapping vectors
415
  codes = defaultdict(lambda: defaultdict(set)) # wc -> group -> set(code)
416
  for arr in tag_to_features.values():
417
  arr = np.array(arr)
418
 
419
  wc = None
420
- for idx, code, _ in GROUPS["word_class"]:
421
- if arr[idx] == 1:
422
  wc = code
423
  break
424
  if not wc:
425
  continue
426
 
427
  for g in GROUP_ORDER:
428
- for idx, code, _ in GROUPS.get(g, []):
429
- if code in HIDE_CODES.get(g, set()):
 
430
  continue
431
- if arr[idx] == 1:
432
  codes[wc][g].add(code)
433
 
434
- title = f"### {UI[lang]['legend']}"
 
 
 
 
 
 
 
 
 
435
  lines = [title, ""]
436
 
437
- group_names = {
438
- "fo": {
439
- "subcategory": "Undirflokkur",
440
- "gender": "Kyn",
441
- "number": "Tal",
442
- "case": "Fall",
443
- "article": "Bundni/óbundni",
444
- "proper": "Sernavn",
445
- "degree": "Stig",
446
- "declension": "Bending",
447
- "mood": "Háttur",
448
- "voice": "Søgn",
449
- "tense": "Tíð",
450
- "person": "Persónur",
451
- "definiteness": "Bundni/óbundni",
452
- },
453
- "en": {
454
- "subcategory": "Subcategory",
455
- "gender": "Gender",
456
- "number": "Number",
457
- "case": "Case",
458
- "article": "Definiteness (suffix)",
459
- "proper": "Proper noun",
460
- "degree": "Degree",
461
- "declension": "Declension",
462
- "mood": "Mood",
463
- "voice": "Voice",
464
- "tense": "Tense",
465
- "person": "Person",
466
- "definiteness": "Definiteness",
467
- },
468
- }[lang]
469
-
470
- for wc in sorted(codes.keys()):
471
  wcl = label_for(lang, "word_class", wc, wc) or ""
472
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
473
 
474
  for g in GROUP_ORDER:
475
- cs = sorted(codes[wc].get(g, set()))
476
  if not cs:
477
  continue
478
-
479
- lines.append(f"**{group_names.get(g, g)}**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  for c in cs:
481
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
482
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
@@ -487,13 +376,12 @@ def build_legend(lang: str) -> str:
487
  return "\n".join(lines).strip()
488
 
489
  # ----------------------------
490
- # Model run + state
491
  # ----------------------------
492
  def run_model(sentence: str):
493
  s = (sentence or "").strip()
494
  if not s:
495
  return []
496
-
497
  tokens = simp_tok(s)
498
  if not tokens:
499
  return []
@@ -513,118 +401,152 @@ def run_model(sentence: str):
513
  attention_mask = enc["attention_mask"].to(device)
514
  word_ids = enc.word_ids(batch_index=0)
515
 
516
- # begin token mask: first subtoken per word
517
- begin_tokens = []
518
  last = None
519
  for wid in word_ids:
520
  if wid is None:
521
- begin_tokens.append(0)
522
  elif wid != last:
523
- begin_tokens.append(1)
524
  else:
525
- begin_tokens.append(0)
526
  last = wid
527
 
528
  with torch.no_grad():
529
- out = model(input_ids=input_ids, attention_mask=attention_mask)
530
- logits = out.logits[0]
531
 
532
- vectors = predict_vectors(logits, attention_mask[0], begin_tokens, DICT_INTERVALS, VEC_LEN)
533
 
534
  rows = []
535
  vec_i = 0
536
- seen_word_ids = set()
537
-
538
- for i, wid in enumerate(word_ids):
539
- if wid is None:
540
- continue
541
- if begin_tokens[i] != 1:
542
- continue
543
- if wid in seen_word_ids:
544
  continue
545
-
546
- seen_word_ids.add(wid)
547
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
548
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
549
  rows.append({"word": word, "vec": vec.int().tolist()})
550
  vec_i += 1
551
-
552
  return rows
553
 
554
- def render(rows_state, lang_choice: str):
555
- lang = "fo" if lang_choice == "fo" else "en"
556
-
557
- headers_main = [f"{UI[lang]['word']}", f"{UI[lang]['tag']}", f"{UI[lang]['analysis']}"]
558
- headers_exp = [f"{UI[lang]['word']}", f"{UI[lang]['tag']}", f"{UI[lang]['expanded']}"]
559
 
560
- main_rows = []
561
- exp_rows = []
 
 
562
 
563
- for r in (rows_state or []):
 
564
  vec = torch.tensor(r["vec"])
565
  tag = vector_to_tag(vec)
566
- main_rows.append([r["word"], tag, analysis_text(vec, lang)])
567
- exp_rows.append([r["word"], tag, expanded_text(vec, lang)])
568
 
569
- main_html = rows_to_table_html(headers_main, main_rows, small=False)
570
- exp_html = rows_to_table_html(headers_exp, exp_rows, small=True)
571
- legend_md = build_legend(lang)
572
-
573
- return main_html, exp_html, legend_md
574
 
575
  # ----------------------------
576
- # Gradio UI (compact + user-friendly)
577
  # ----------------------------
578
  theme = gr.themes.Soft()
579
 
580
  with gr.Blocks(theme=theme, css=CSS, title="BRAGD-markarin") as demo:
 
581
  with gr.Row(equal_height=True):
582
- with gr.Column(scale=2, min_width=240):
583
  gr.Markdown(
584
- f"## {UI['fo']['title']}\n"
585
- f"{UI['fo']['inst']}\n\n"
586
- f"**{UI['fo']['model']}** `{MODEL_ID}`",
587
- elem_id="header_md"
588
  )
589
- with gr.Column(scale=5, min_width=420):
590
- inp = gr.Textbox(lines=5, label=None, placeholder="Skriv her… / Type here…")
591
- btn = gr.Button("Marka / Tag", variant="primary")
592
-
593
- # Results header row with language picker on the far right
594
- with gr.Row(equal_height=True, elem_id="results_header"):
595
- with gr.Column(scale=5):
596
- res_title = gr.Markdown(f"### {UI['fo']['results']} / {UI['en']['results']}")
597
- with gr.Column(scale=1, min_width=170):
598
- lang = gr.Dropdown(
599
- choices=[("Føroyskt", "fo"), ("English", "en")],
600
- value="fo",
601
- label=None,
602
- interactive=True,
603
- filterable=False,
604
- container=False,
605
- elem_id="lang_dd",
606
  )
 
607
 
608
  state = gr.State([])
609
 
610
- out_main = gr.HTML()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
  with gr.Accordion("Útgreinað marking / Expanded tags", open=False):
612
- out_expanded = gr.HTML()
 
 
 
 
 
 
 
 
613
 
614
- with gr.Accordion("Markingaryvirlit / Tag legend", open=False):
615
- out_legend = gr.Markdown(build_legend("fo"))
616
 
617
  def on_tag(sentence, lang_choice):
618
  rows = run_model(sentence)
619
- main_html, exp_html, legend_md = render(rows, lang_choice)
620
- return rows, main_html, exp_html, legend_md
 
 
 
 
 
 
 
 
621
 
622
  def on_lang(rows, lang_choice):
623
- main_html, exp_html, legend_md = render(rows, lang_choice)
624
- return main_html, exp_html, legend_md
 
 
 
 
 
 
 
 
 
 
 
625
 
626
- btn.click(on_tag, inputs=[inp, lang], outputs=[state, out_main, out_expanded, out_legend])
627
- lang.change(on_lang, inputs=[state, lang], outputs=[out_main, out_expanded, out_legend])
 
 
 
 
628
 
629
  if __name__ == "__main__":
630
  demo.launch()
 
1
+ import os, re, string, json
 
 
 
2
  from collections import defaultdict
3
 
4
  import gradio as gr
5
  import torch
6
  import numpy as np
7
+ import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
10
  # ----------------------------
 
13
  MODEL_ID = "Setur/BRAGD"
14
  TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must match model labels
15
  LABELS_FILEPATH = "tag_labels.json" # add to repo root (FO+EN labels)
16
+ HF_TOKEN = os.getenv("BRAGD") # Space secret
17
 
18
  if not HF_TOKEN:
19
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
 
26
  (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
27
  )
28
 
29
+ GROUP_ORDER = ["subcategory","gender","number","case","article","proper","degree","declension","mood","voice","tense","person","definiteness"]
 
 
 
30
 
31
+ # You said Subcategory B doesn't exist and will be deleted from the CSV:
32
  HIDE_CODES = {"subcategory": {"B"}}
33
 
34
+ # ----------------------------
35
+ # UI text
36
+ # ----------------------------
37
  UI = {
38
+ "fo": {"w":"Orð", "t":"Mark", "s":"Útgreining", "m":"Útgreinað marking"},
39
+ "en": {"w":"Word","t":"Tag", "s":"Analysis", "m":"Expanded tags"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  }
41
 
42
+ MODEL_LINK = "https://huggingface.co/Setur/BRAGD"
43
+
44
+ # Theme color: #89AFA9 (+ close shades) + system font
45
+ CSS = """
46
  :root{
47
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
48
  --primary-100:#E1ECEA; --primary-200:#C6DAD6;
49
  }
50
+ body, .gradio-container, .prose, .markdown, textarea, input, select, button, table{
51
+ font-family:-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, "Noto Sans", sans-serif !important;
 
 
 
 
52
  }
53
+ .gr-button-primary, button.primary, .primary{
54
+ background:var(--primary-500)!important; border-color:var(--primary-600)!important; color:#0b1b19!important;
55
+ }
56
+ .gr-button-primary:hover, button.primary:hover, .primary:hover{ background:var(--primary-600)!important; }
57
  a{ color:var(--primary-700)!important; }
58
 
59
+ /* Dataframe column wrapping: keep Orð + Mark on one line */
60
+ .gr-dataframe table td:nth-child(1),
61
+ .gr-dataframe table th:nth-child(1){
62
+ white-space: nowrap !important;
63
+ width: 18% !important;
 
 
 
 
 
 
 
 
 
 
 
64
  }
65
+ .gr-dataframe table td:nth-child(2),
66
+ .gr-dataframe table th:nth-child(2){
67
+ white-space: nowrap !important;
68
+ width: 18% !important;
69
+ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace !important;
 
 
70
  }
71
+ .gr-dataframe table td:nth-child(3),
72
+ .gr-dataframe table th:nth-child(3){
73
+ white-space: normal !important;
74
+ width: 64% !important;
 
75
  }
 
 
 
 
 
 
 
 
76
 
77
+ /* Make the language dropdown compact */
78
+ #lang_dd { max-width: 170px; }
79
 
80
+ /* Slightly smaller primary button */
81
+ .gr-button-primary{ padding: 0.35rem 0.85rem !important; font-size: 0.95rem !important; }
82
  """
83
 
84
  # ----------------------------
85
+ # Tokenization
86
  # ----------------------------
87
  def simp_tok(sentence: str):
88
  return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
89
 
90
+ # ----------------------------
91
+ # CSV mapping
92
+ # ----------------------------
93
+ def load_tag_mappings(path: str):
94
+ df = pd.read_csv(path)
95
+ feature_cols = list(df.columns[1:])
96
+ tag_to_features = {row["Original Tag"]: row[1:].values.astype(int) for _, row in df.iterrows()}
97
+ features_to_tag = {tuple(row[1:].values.astype(int)): row["Original Tag"] for _, row in df.iterrows()}
98
  return tag_to_features, features_to_tag, len(feature_cols), feature_cols
99
 
100
  def group_from_col(col: str):
101
+ if col == "Article": return ("article","A")
102
+ if col.startswith("No-Article "): return ("article", col.split()[-1])
103
+ if col == "Proper Noun": return ("proper","P")
104
+ if col.startswith("Not-Proper-Noun "): return ("proper", col.split()[-1])
 
 
 
 
105
 
106
  prefixes = [
107
+ ("Word Class ","word_class"),
108
+ ("Subcategory ","subcategory"), ("No-Subcategory ","subcategory"),
109
+ ("Gender ","gender"), ("No-Gender ","gender"),
110
+ ("Number ","number"), ("No-Number ","number"),
111
+ ("Case ","case"), ("No-Case ","case"),
112
+ ("Degree ","degree"), ("No-Degree ","degree"),
113
+ ("Declension ","declension"), ("No-Declension ","declension"),
114
+ ("Mood ","mood"),
115
+ ("Voice ","voice"), ("No-Voice ","voice"),
116
+ ("Tense ","tense"), ("No-Tense ","tense"),
117
+ ("Person ","person"), ("No-Person ","person"),
118
+ ("Definite ","definiteness"), ("Indefinite ","definiteness"),
119
  ]
120
+ for p,g in prefixes:
121
  if col.startswith(p):
122
  return (g, col.split()[-1])
123
+ return (None,None)
124
 
125
+ # ----------------------------
126
+ # Decode helpers (your logic)
127
+ # ----------------------------
128
  def process_tag_features(tag_to_features: dict, intervals):
129
+ arrs = [np.array(tpl) for tpl in set(tuple(a) for a in tag_to_features.values())]
130
+ wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
131
+ out = {}
132
+ for wt,labels in wt_masks.items():
 
 
 
 
 
133
  if not labels:
134
+ out[wt]=[]
135
  continue
136
  sum_labels = np.sum(np.array(labels), axis=0)
137
+ out[wt] = [iv for iv in intervals if np.sum(sum_labels[iv[0]:iv[1]+1]) != 0]
138
+ return out
 
 
139
 
140
+ def predict_vectors(logits, attention_mask, begin_tokens, dict_intervals, vec_len):
141
  softmax = torch.nn.Softmax(dim=0)
142
  vectors = []
 
143
  for idx in range(len(logits)):
144
+ if attention_mask[idx].item()!=1 or begin_tokens[idx]!=1:
 
 
145
  continue
146
 
147
+ pred = logits[idx]
148
  vec = torch.zeros(vec_len, device=logits.device)
149
 
150
+ wt = torch.argmax(softmax(pred[0:15])).item()
151
+ vec[wt]=1
 
 
152
 
153
+ for (a,b) in dict_intervals.get(wt, []):
154
+ seg = pred[a:b+1]
155
+ k = torch.argmax(softmax(seg)).item()
156
+ vec[a+k]=1
 
 
157
 
158
  vectors.append(vec)
 
159
  return vectors
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  # ----------------------------
162
+ # Load labels (FO/EN)
163
  # ----------------------------
164
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
165
  LABELS = json.load(f)
166
 
167
+ def label_for(lang: str, group: str, wc: str, code: str) -> str:
168
+ lang = "fo" if lang=="fo" else "en"
169
  by_wc = LABELS.get(lang, {}).get("by_word_class", {})
170
  glob = LABELS.get(lang, {}).get("global", {})
171
+ if wc and wc in by_wc and code in by_wc[wc].get(group, {}):
172
+ return by_wc[wc][group][code]
 
173
  return glob.get(group, {}).get(code, "")
174
 
175
+ def clean_label(s: str) -> str:
176
+ s = (s or "").strip()
177
+ s = re.sub(r"\s+", " ", s)
178
+ s = s.strip(" -;,:")
179
+ return s
180
+
181
  # ----------------------------
182
+ # Load model + mapping
183
  # ----------------------------
184
  tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
185
 
186
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
187
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
 
188
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
189
+ model.to(device); model.eval()
 
190
 
191
  if hasattr(model, "config") and hasattr(model.config, "num_labels"):
192
  if model.config.num_labels != VEC_LEN:
193
+ raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
 
 
 
194
 
195
  DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
196
 
197
+ # Build GROUPS from CSV headers
198
+ GROUPS = defaultdict(list) # group -> [(idx, code, colname)]
199
+ for i,col in enumerate(FEATURE_COLS):
200
+ g,code = group_from_col(col)
201
  if g and code not in HIDE_CODES.get(g, set()):
202
  GROUPS[g].append((i, code, col))
203
 
 
205
  return features_to_tag.get(tuple(vec.int().tolist()), "Unknown Tag")
206
 
207
  def wc_code(vec: torch.Tensor) -> str:
208
+ for idx,code,_ in GROUPS["word_class"]:
209
+ if int(vec[idx].item())==1:
210
  return code
211
  return ""
212
 
213
  def group_code(vec: torch.Tensor, group: str) -> str:
214
  hidden = HIDE_CODES.get(group, set())
215
+ for idx,code,_ in GROUPS.get(group, []):
216
  if code in hidden:
217
  continue
218
+ if int(vec[idx].item())==1:
219
  return code
220
  return ""
221
 
222
  # ----------------------------
223
+ # Display rules
224
  # ----------------------------
225
+ HIDE_IN_ANALYSIS = {
226
+ # Word class D: hide "stýrir falli" / "stýrir ikki falli" in Analysis
227
+ ("D", "subcategory", "G"),
228
+ ("D", "subcategory", "N"),
229
+ }
230
+
231
+ VOICE_ANALYSIS = {
232
+ "fo": {"A": "gerðsøgn", "M": "miðalsøgn", "v": "orð luttøkuháttur"},
233
+ "en": {"A": "active voice", "M": "middle voice", "v": "supine form"},
234
+ }
235
 
236
  def analysis_text(vec: torch.Tensor, lang: str) -> str:
237
  """
238
  Útgreining / Analysis:
239
+ - plain words (no letters/hyphens)
240
+ - pronouns: start at subcategory, not word class
241
+ - DGd: show only fyriseting/preposition
242
+ - supine: show only supine + voice (drop verb/number/tense/person etc.)
243
  """
244
+ lang = "fo" if lang=="fo" else "en"
245
+ tag = vector_to_tag(vec)
246
  wc = wc_code(vec)
247
 
248
+ # DGd override
249
+ if tag == "DGd":
250
+ return "fyriseting" if lang=="fo" else "preposition"
251
 
252
+ mood = group_code(vec, "mood")
253
+ if mood == "U": # luttøkuháttur / supine
254
+ sup = label_for(lang, "mood", wc, "U") or ("luttøkuháttur" if lang=="fo" else "supine")
255
+ vcode = group_code(vec, "voice") or "v"
256
+ vlabel = VOICE_ANALYSIS[lang].get(vcode, VOICE_ANALYSIS[lang]["v"])
257
+ return f"{clean_label(sup)}, {clean_label(vlabel)}"
258
 
259
+ parts = []
260
 
261
+ # Pronouns + conjunctions: subcategory already carries the head noun (fornavn / sambindingarorð)
262
+ if wc in {"P","C"}:
263
+ subc = group_code(vec, "subcategory")
264
+ subl = clean_label(label_for(lang, "subcategory", wc, subc) or "")
265
+ if subl:
266
+ parts.append(subl)
267
+ else:
268
+ wcl = clean_label(label_for(lang, "word_class", wc, wc) or wc)
269
+ if wcl:
270
+ parts.append(wcl)
271
 
 
272
  for g in GROUP_ORDER:
273
  c = group_code(vec, g)
274
  if not c:
275
  continue
276
+ if wc in {"P","C"} and g == "subcategory":
277
+ continue # already added
278
+ if (wc, g, c) in HIDE_IN_ANALYSIS:
279
  continue
280
 
281
+ lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c) or ""
282
+ lbl = clean_label(lbl)
283
+ if not lbl:
 
 
 
 
 
284
  continue
285
 
286
+ if lbl not in parts:
287
+ parts.append(lbl)
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
+ return ", ".join(parts)
290
 
291
  def expanded_text(vec: torch.Tensor, lang: str) -> str:
292
  """
293
  Útgreinað marking / Expanded tags:
294
+ codes + labels (useful for debugging and linguists)
295
  """
296
+ lang = "fo" if lang=="fo" else "en"
297
  wc = wc_code(vec)
298
  parts = []
299
 
 
309
 
310
  return "; ".join([p for p in parts if p])
311
 
312
def compute_codes_by_wc():
    """Collect, for every word class, the letter codes actually used
    in each feature group across all tag vectors in the current CSV.

    Returns:
        defaultdict: wc -> group -> set of codes. Codes listed in
        HIDE_CODES for a group are excluded from the result.
    """
    used = defaultdict(lambda: defaultdict(set))  # wc -> group -> {codes}

    for features in tag_to_features.values():
        vec = np.array(features)

        # The word-class code is the first hot position in the word_class group.
        word_class = next(
            (code for idx, code, _ in GROUPS["word_class"] if vec[idx] == 1),
            None,
        )
        if not word_class:
            continue

        for group in GROUP_ORDER:
            hidden = HIDE_CODES.get(group, set())
            for idx, code, _ in GROUPS.get(group, []):
                # Record only codes that are both active and not hidden.
                if vec[idx] == 1 and code not in hidden:
                    used[word_class][group].add(code)

    return used


# Precomputed once at import time: the tag inventory never changes at runtime.
CODES_BY_WC = compute_codes_by_wc()
+ def build_overview(lang: str) -> str:
338
+ """
339
+ Overview under each word class with the letter codes actually used in the CURRENT CSV.
340
+ """
341
+ lang = "fo" if lang=="fo" else "en"
342
+ title = "### Markingaryvirlit" if lang=="fo" else "### Tag Overview"
343
  lines = [title, ""]
344
 
345
+ for wc in sorted(CODES_BY_WC.keys()):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  wcl = label_for(lang, "word_class", wc, wc) or ""
347
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
348
 
349
  for g in GROUP_ORDER:
350
+ cs = sorted(CODES_BY_WC[wc].get(g, set()))
351
  if not cs:
352
  continue
353
+ group_name = {
354
+ "fo": {
355
+ "subcategory":"Undirflokkur", "gender":"Kyn", "number":"Tal", "case":"Fall",
356
+ "article":"Bundni/óbundni", "proper":"Sernavn / felagsnavn", "degree":"Stig",
357
+ "declension":"Bending", "mood":"Háttur", "voice":"Søgn", "tense":"Tíð",
358
+ "person":"Persónur", "definiteness":"Bundni/óbundni",
359
+ },
360
+ "en": {
361
+ "subcategory":"Subcategory", "gender":"Gender", "number":"Number", "case":"Case",
362
+ "article":"Definiteness", "proper":"Proper/common noun", "degree":"Degree",
363
+ "declension":"Declension", "mood":"Mood", "voice":"Voice", "tense":"Tense",
364
+ "person":"Person", "definiteness":"Definiteness",
365
+ }
366
+ }[lang].get(g, g)
367
+
368
+ lines.append(f"**{group_name}**")
369
  for c in cs:
370
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
371
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
 
376
  return "\n".join(lines).strip()
377
 
378
  # ----------------------------
379
+ # Inference
380
  # ----------------------------
381
  def run_model(sentence: str):
382
  s = (sentence or "").strip()
383
  if not s:
384
  return []
 
385
  tokens = simp_tok(s)
386
  if not tokens:
387
  return []
 
401
  attention_mask = enc["attention_mask"].to(device)
402
  word_ids = enc.word_ids(batch_index=0)
403
 
404
+ begin = []
 
405
  last = None
406
  for wid in word_ids:
407
  if wid is None:
408
+ begin.append(0)
409
  elif wid != last:
410
+ begin.append(1)
411
  else:
412
+ begin.append(0)
413
  last = wid
414
 
415
  with torch.no_grad():
416
+ logits = model(input_ids=input_ids, attention_mask=attention_mask).logits[0]
 
417
 
418
+ vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)
419
 
420
  rows = []
421
  vec_i = 0
422
+ seen = set()
423
+ for i,wid in enumerate(word_ids):
424
+ if wid is None or begin[i]!=1 or wid in seen:
 
 
 
 
 
425
  continue
426
+ seen.add(wid)
 
427
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
428
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
429
  rows.append({"word": word, "vec": vec.int().tolist()})
430
  vec_i += 1
 
431
  return rows
432
 
433
def render(rows_state, lang: str):
    """Build the two result tables plus the overview markdown.

    Args:
        rows_state: list of {"word": str, "vec": list[int]} rows produced
            by run_model; may be empty before the first tagging run.
        lang: "fo" or "en"; any other value falls back to "en".

    Returns:
        (main results DataFrame, expanded-tags DataFrame, overview markdown).
    """
    lang = "fo" if lang == "fo" else "en"
    # NOTE(review): assumes UI[lang] defines the short keys "w", "t", "s", "m"
    # — confirm against the UI dict at the top of the file.
    main_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
    mean_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["m"]]

    if not rows_state:
        # Nothing tagged yet: empty tables, but the overview is still built.
        return (
            pd.DataFrame(columns=main_cols),
            pd.DataFrame(columns=mean_cols),
            build_overview(lang),
        )

    main_rows, mean_rows = [], []
    for row in rows_state:
        feature_vec = torch.tensor(row["vec"])
        tag = vector_to_tag(feature_vec)
        main_rows.append([row["word"], tag, analysis_text(feature_vec, lang)])
        mean_rows.append([row["word"], tag, expanded_text(feature_vec, lang)])

    main_df = pd.DataFrame(main_rows, columns=main_cols)
    mean_df = pd.DataFrame(mean_rows, columns=mean_cols)
    return main_df, mean_df, build_overview(lang)
 
456
# ----------------------------
# Gradio UI
# ----------------------------
theme = gr.themes.Soft()

with gr.Blocks(theme=theme, css=CSS, title="BRAGD-markarin") as demo:
    # Compact header: model info on the left, input box on the right.
    with gr.Row(equal_height=True):
        with gr.Column(scale=1, min_width=280):
            gr.Markdown(
                "### BRAGD-markarin\n"
                "Skriv ein setning og fá hann markaðan.\n\n"
                f"**Myndil / Model:** [{MODEL_ID}]({MODEL_LINK})"
            )
        with gr.Column(scale=2):
            inp = gr.Textbox(
                lines=5,
                placeholder="Skriva her ... / Type here ...",
                show_label=False,
            )
            btn = gr.Button("Marka / Tag", variant="primary")

    # Tagged rows are kept in session state so a language switch can
    # re-render the tables without re-running the model.
    state = gr.State([])

    # Results header row; everything stays hidden until the first run.
    with gr.Row():
        results_title = gr.Markdown("### Úrslit / Results", visible=False)
        lang = gr.Dropdown(
            choices=[("Føroyskt", "fo"), ("English", "en")],
            value="fo",
            show_label=False,
            filterable=False,
            elem_id="lang_dd",
            visible=False,
        )

    out_df = gr.Dataframe(
        value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["s"]]),
        wrap=True,
        interactive=False,
        show_label=False,
        row_count=(0, "fixed"),
        col_count=(3, "fixed"),
        visible=False,
    )

    with gr.Accordion("Útgreinað marking / Expanded tags", open=False):
        out_mean_df = gr.Dataframe(
            value=pd.DataFrame(columns=[UI["fo"]["w"], UI["fo"]["t"], UI["fo"]["m"]]),
            wrap=True,
            interactive=False,
            show_label=False,
            row_count=(0, "fixed"),
            col_count=(3, "fixed"),
            visible=False,
        )

    with gr.Accordion("Markingaryvirlit / Tag Overview", open=False):
        overview_md = gr.Markdown("", visible=False)

    def on_tag(sentence, lang_choice):
        """Run the tagger, fill the tables, and reveal the result widgets."""
        rows = run_model(sentence)
        df_main, df_mean, overview = render(rows, lang_choice)
        return (
            rows,
            gr.update(value=df_main, visible=True),
            gr.update(value=df_mean, visible=True),
            gr.update(value=overview, visible=True),
            gr.update(visible=True),  # results_title
            gr.update(visible=True),  # lang dropdown
        )

    def on_lang(rows, lang_choice):
        """Re-render the already-tagged rows in the newly chosen language."""
        df_main, df_mean, overview = render(rows, lang_choice)
        return (
            gr.update(value=df_main),
            gr.update(value=df_mean),
            gr.update(value=overview),
        )

    btn.click(
        on_tag,
        inputs=[inp, lang],
        outputs=[state, out_df, out_mean_df, overview_md, results_title, lang],
        queue=False,
    )

    lang.change(
        on_lang,
        inputs=[state, lang],
        outputs=[out_df, out_mean_df, overview_md],
        queue=False,
    )

if __name__ == "__main__":
    demo.launch()
tag_labels.json CHANGED
@@ -32,14 +32,16 @@
32
  "a": "indefinite"
33
  },
34
  "proper": {
35
- "r": "not proper noun",
36
  "P": "proper noun"
37
  },
38
  "degree": {
39
  "d": "no degree"
40
  },
41
  "declension": {
42
- "e": "no declension"
 
 
43
  },
44
  "subcategory": {
45
  "s": "no subcategory"
@@ -82,7 +84,7 @@
82
  "G": "genitive"
83
  },
84
  "article": {
85
- "A": "definite"
86
  },
87
  "proper": {
88
  "P": "Proper Noun"
@@ -123,9 +125,9 @@
123
  "A": "absolute superlative"
124
  },
125
  "declension": {
126
- "S": "strong declension",
127
- "W": "weak declension",
128
- "e": "no declension"
129
  },
130
  "gender": {
131
  "M": "masculine",
@@ -204,7 +206,7 @@
204
  },
205
  "V": {
206
  "word_class": {
207
- "V": "verb"
208
  },
209
  "mood": {
210
  "I": "infinitive",
@@ -233,16 +235,16 @@
233
  },
234
  "L": {
235
  "word_class": {
236
- "L": "past participle"
237
  },
238
  "voice": {
239
  "A": "active",
240
  "M": "mediopassive"
241
  },
242
  "declension": {
243
- "S": "strong declension",
244
- "W": "weak declension",
245
- "e": "no declension"
246
  },
247
  "gender": {
248
  "M": "masculine",
@@ -361,14 +363,16 @@
361
  "a": "óbundið"
362
  },
363
  "proper": {
364
- "r": "ikki sernavn",
365
  "P": "sernavn"
366
  },
367
  "degree": {
368
  "d": "eingin stigbending"
369
  },
370
  "declension": {
371
- "e": "eingin sterk/veik bending"
 
 
372
  },
373
  "subcategory": {
374
  "s": "eingin undirflokkur"
@@ -452,8 +456,8 @@
452
  "A": "absolutt hástig"
453
  },
454
  "declension": {
455
- "S": "sterk bending",
456
- "W": "veik bending",
457
  "e": "eingin sterk/veik bending"
458
  },
459
  "gender": {
@@ -490,9 +494,9 @@
490
  "N": "hvørkikyn"
491
  },
492
  "person": {
493
- "1": "1. persónur",
494
- "2": "2. persónur",
495
- "3": "3. persónur"
496
  },
497
  "number": {
498
  "S": "eintal",
@@ -573,8 +577,8 @@
573
  "M": "miðalsøgn"
574
  },
575
  "declension": {
576
- "S": "sterk bending",
577
- "W": "veik bending",
578
  "e": "eingin sterk/veik bending"
579
  },
580
  "gender": {
@@ -648,7 +652,7 @@
648
  "K": "teknseting"
649
  },
650
  "subcategory": {
651
- "E": "endi av setningi",
652
  "C": "komma",
653
  "Q": "gásareyga",
654
  "O": "annað"
 
32
  "a": "indefinite"
33
  },
34
  "proper": {
35
+ "r": "common noun",
36
  "P": "proper noun"
37
  },
38
  "degree": {
39
  "d": "no degree"
40
  },
41
  "declension": {
42
+ "e": "no declension",
43
+ "S": "strong declension",
44
+ "W": "weak declension"
45
  },
46
  "subcategory": {
47
  "s": "no subcategory"
 
84
  "G": "genitive"
85
  },
86
  "article": {
87
+ "A": "with suffixed definite article"
88
  },
89
  "proper": {
90
  "P": "Proper Noun"
 
125
  "A": "absolute superlative"
126
  },
127
  "declension": {
128
+ "S": "strong",
129
+ "W": "weak",
130
+ "e": "no-declension"
131
  },
132
  "gender": {
133
  "M": "masculine",
 
206
  },
207
  "V": {
208
  "word_class": {
209
+ "V": "verb (except for participle)"
210
  },
211
  "mood": {
212
  "I": "infinitive",
 
235
  },
236
  "L": {
237
  "word_class": {
238
+ "L": "participle"
239
  },
240
  "voice": {
241
  "A": "active",
242
  "M": "mediopassive"
243
  },
244
  "declension": {
245
+ "S": "strong",
246
+ "W": "weak",
247
+ "e": "no-declension"
248
  },
249
  "gender": {
250
  "M": "masculine",
 
363
  "a": "óbundið"
364
  },
365
  "proper": {
366
+ "r": "felagsnavn",
367
  "P": "sernavn"
368
  },
369
  "degree": {
370
  "d": "eingin stigbending"
371
  },
372
  "declension": {
373
+ "e": "eingin sterk/veik bending",
374
+ "S": "sterk bending",
375
+ "W": "veik bending"
376
  },
377
  "subcategory": {
378
  "s": "eingin undirflokkur"
 
456
  "A": "absolutt hástig"
457
  },
458
  "declension": {
459
+ "S": "sterk",
460
+ "W": "veik",
461
  "e": "eingin sterk/veik bending"
462
  },
463
  "gender": {
 
494
  "N": "hvørkikyn"
495
  },
496
  "person": {
497
+ "1": "fyrsti persónur",
498
+ "2": "annar persónur",
499
+ "3": "triði persónur"
500
  },
501
  "number": {
502
  "S": "eintal",
 
577
  "M": "miðalsøgn"
578
  },
579
  "declension": {
580
+ "S": "sterk",
581
+ "W": "veik",
582
  "e": "eingin sterk/veik bending"
583
  },
584
  "gender": {
 
652
  "K": "teknseting"
653
  },
654
  "subcategory": {
655
+ "E": "setningsendi",
656
  "C": "komma",
657
  "Q": "gásareyga",
658
  "O": "annað"