unijoh committed on
Commit
958c273
·
verified ·
1 Parent(s): 4db9339

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +256 -377
  2. tag_labels.json +457 -460
app.py CHANGED
@@ -1,7 +1,4 @@
1
- import os
2
- import re
3
- import string
4
- import json
5
  from collections import defaultdict
6
 
7
  import gradio as gr
@@ -10,445 +7,327 @@ import numpy as np
10
  import pandas as pd
11
  from transformers import AutoTokenizer, AutoModelForTokenClassification
12
 
13
- # ----------------------------
14
- # Config
15
- # ----------------------------
16
  MODEL_ID = "Setur/BRAGD"
17
- TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must be in the Space repo
18
- LABELS_FILEPATH = "tag_labels.json" # add this file to the Space repo
 
19
 
20
- HF_TOKEN = os.getenv("BRAGD") # Space secret name
21
  if not HF_TOKEN:
22
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
 
 
23
 
24
- # Match your UPDATED demo.py intervals
25
  INTERVALS = (
26
- (15, 29), # Subcategories
27
- (30, 33), # Gender
28
- (34, 36), # Number
29
- (37, 41), # Case
30
- (42, 43), # Article/No-Article
31
- (44, 45), # Proper/Not Proper
32
- (46, 50), # Degree
33
- (51, 53), # Declension
34
- (54, 60), # Mood
35
- (61, 63), # Voice
36
- (64, 66), # Tense
37
- (67, 70), # Person
38
- (71, 72), # Definiteness
39
  )
40
 
41
- # ----------------------------
42
- # Load model + tokenizer
43
- # ----------------------------
44
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
45
- model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
46
-
47
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
48
- model.to(device)
49
- model.eval()
50
-
51
- # ----------------------------
52
- # Tag mapping (CSV)
53
- # ----------------------------
54
- def load_tag_mappings(tags_filepath: str):
55
- tags_df = pd.read_csv(tags_filepath)
56
-
57
- feature_cols = list(tags_df.columns[1:])
58
-
59
- tag_to_features = {
60
- row["Original Tag"]: row[1:].values.astype(int)
61
- for _, row in tags_df.iterrows()
62
- }
63
- features_to_tag = {
64
- tuple(row[1:].values.astype(int)): row["Original Tag"]
65
- for _, row in tags_df.iterrows()
66
- }
67
-
68
- vec_len = len(feature_cols)
69
- return tag_to_features, features_to_tag, vec_len, feature_cols
70
-
71
-
72
- tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
73
-
74
- # Safety check
75
- if hasattr(model, "config") and hasattr(model.config, "num_labels"):
76
- if model.config.num_labels != VEC_LEN:
77
- raise RuntimeError(
78
- f"Label size mismatch: model has num_labels={model.config.num_labels}, "
79
- f"but {TAGS_FILEPATH} implies {VEC_LEN}. "
80
- "You likely uploaded the wrong tag mapping CSV."
81
- )
82
-
83
-
84
- def vector_to_tag(vec: torch.Tensor) -> str:
85
- return features_to_tag.get(tuple(vec.int().tolist()), "Unknown Tag")
86
-
87
- # ----------------------------
88
- # Compute allowed intervals per POS
89
- # ----------------------------
90
- def process_tag_features(tag_to_features: dict, intervals):
91
- list_of_tags = list(tag_to_features.values())
92
- unique_arrays = [np.array(tpl) for tpl in set(tuple(arr) for arr in list_of_tags)]
93
-
94
- word_type_masks = {}
95
- for wt in range(15):
96
- word_type_masks[wt] = [arr for arr in unique_arrays if arr[wt] == 1]
97
-
98
- dict_intervals = {}
99
- for wt in range(15):
100
- labels = word_type_masks[wt]
101
- if len(labels) == 0:
102
- dict_intervals[wt] = []
103
- continue
104
-
105
- sum_labels = np.sum(np.array(labels), axis=0)
106
-
107
- allowed = [
108
- interval
109
- for interval in intervals
110
- if np.sum(sum_labels[interval[0] : interval[1] + 1]) != 0
111
- ]
112
- dict_intervals[wt] = allowed
113
-
114
- return dict_intervals
115
-
116
-
117
- DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
118
-
119
- # ----------------------------
120
- # Load bilingual labels
121
- # ----------------------------
122
- def load_labels(path: str):
123
- with open(path, "r", encoding="utf-8") as f:
124
- return json.load(f)
125
-
126
-
127
- try:
128
- LABELS = load_labels(LABELS_FILEPATH)
129
- except Exception:
130
- LABELS = {"fo": {"global": {}, "by_wc": {}}, "en": {"global": {}, "by_wc": {}}}
131
-
132
-
133
- def label_for(lang: str, group: str, wc_code: str, code: str) -> str:
134
- """Word-class-specific first, then global. Always safe to return ""."""
135
- lang = lang if lang in ("fo", "en") else "fo"
136
- d = LABELS.get(lang, {})
137
- by_wc = d.get("by_wc", {})
138
- glob = d.get("global", {})
139
-
140
- if wc_code and group in by_wc and wc_code in by_wc[group] and code in by_wc[group][wc_code]:
141
- return by_wc[group][wc_code][code]
142
-
143
- if group in glob and code in glob[group]:
144
- return glob[group][code]
145
-
146
- return ""
147
-
148
- # ----------------------------
149
- # Feature column groups (from CSV headers)
150
- # ----------------------------
151
- def _group_from_colname(col: str):
152
- if col == "Article":
153
- return ("article", "A")
154
- if col == "Proper Noun":
155
- return ("proper", "P")
156
- if col.startswith("Not-Proper-Noun "):
157
- return ("proper", col.split()[-1]) # usually r
158
- if col.startswith("No-Article "):
159
- return ("article", col.split()[-1]) # usually a
160
 
161
  prefixes = [
162
- ("Word Class ", "word_class"),
163
- ("Subcategory ", "subcategory"),
164
- ("No-Subcategory ", "subcategory"),
165
- ("Gender ", "gender"),
166
- ("No-Gender ", "gender"),
167
- ("Number ", "number"),
168
- ("No-Number ", "number"),
169
- ("Case ", "case"),
170
- ("No-Case ", "case"),
171
- ("Degree ", "degree"),
172
- ("No-Degree ", "degree"),
173
- ("Declension ", "declension"),
174
- ("No-Declension ", "declension"),
175
- ("Mood ", "mood"),
176
- ("Voice ", "voice"),
177
- ("No-Voice ", "voice"),
178
- ("Tense ", "tense"),
179
- ("No-Tense ", "tense"),
180
- ("Person ", "person"),
181
- ("No-Person ", "person"),
182
- ("Definite ", "definiteness"),
183
- ("Indefinite ", "definiteness"),
184
  ]
185
-
186
- for p, g in prefixes:
187
  if col.startswith(p):
188
- code = col.split()[-1]
189
- return (g, code)
190
 
191
- return (None, None)
192
-
193
-
194
- GROUPS = defaultdict(list) # group -> list[(idx, code)]
195
- for i, col in enumerate(FEATURE_COLS):
196
- g, code = _group_from_colname(col)
197
- if g:
198
- GROUPS[g].append((i, code))
199
-
200
- # ----------------------------
201
- # Tokenization
202
- # ----------------------------
203
- def simp_tok(sentence: str):
204
- return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
205
 
206
- # ----------------------------
207
- # Decoding
208
- # ----------------------------
209
- def predict_vectors(logits: torch.Tensor, attention_mask: torch.Tensor, begin_tokens, dict_intervals, vec_len: int):
210
  softmax = torch.nn.Softmax(dim=0)
211
  vectors = []
212
-
213
  for idx in range(len(logits)):
214
- if attention_mask[idx].item() != 1:
215
- continue
216
- if begin_tokens[idx] != 1:
217
  continue
218
 
219
- pred_logits = logits[idx]
220
  vec = torch.zeros(vec_len, device=logits.device)
221
 
222
- # POS
223
- probs = softmax(pred_logits[0:15])
224
- wt = torch.argmax(probs).item()
225
- vec[wt] = 1
226
 
227
- # Allowed feature groups
228
- for (a, b) in dict_intervals.get(wt, []):
229
- seg = pred_logits[a : b + 1]
230
- probs = softmax(seg)
231
- k = torch.argmax(probs).item()
232
- vec[a + k] = 1
233
 
234
  vectors.append(vec)
235
-
236
  return vectors
237
 
 
 
 
238
 
239
- def describe_vector(vec: torch.Tensor, lang: str) -> str:
240
- # word class code
241
- wc_code = ""
242
- for idx, code in GROUPS.get("word_class", []):
243
- if int(vec[idx].item()) == 1:
244
- wc_code = code
245
- break
246
 
247
- parts = []
 
248
 
249
- wc_label = label_for(lang, "word_class", wc_code, wc_code)
250
- if wc_code:
251
- parts.append(f"{wc_code} – {wc_label}" if wc_label else wc_code)
252
-
253
- order = [
254
- "subcategory",
255
- "gender",
256
- "number",
257
- "case",
258
- "article",
259
- "proper",
260
- "degree",
261
- "declension",
262
- "mood",
263
- "voice",
264
- "tense",
265
- "person",
266
- "definiteness",
267
- ]
268
 
269
- for g in order:
270
- chosen = None
271
- for idx, code in GROUPS.get(g, []):
272
- if int(vec[idx].item()) == 1:
273
- chosen = code
274
- break
275
- if not chosen:
276
- continue
277
 
278
- lbl = label_for(lang, g, wc_code, chosen)
279
-
280
- # Always keep this correct even if labels are missing
281
- if not lbl:
282
- if lang == "en":
283
- FALLBACK = {
284
- "definiteness": {"D": "definite", "I": "indefinite"},
285
- "article": {"A": "with suffixed definite article", "a": "no definite suffix"},
286
- "proper": {"P": "proper noun", "r": "not proper noun"},
287
- "gender": {"g": "no gender"},
288
- "number": {"n": "no number"},
289
- "case": {"c": "no case"},
290
- "degree": {"d": "no degree"},
291
- "declension": {"e": "no declension"},
292
- "voice": {"v": "no voice"},
293
- "tense": {"t": "no tense"},
294
- "person": {"p": "no person"},
295
- "subcategory": {"s": "no subcategory"},
296
- }
297
- else:
298
- FALLBACK = {
299
- "definiteness": {"D": "bundið", "I": "óbundið"},
300
- "article": {"A": "við bundnum eftirlið", "a": "uttan bundið eftirlið"},
301
- "proper": {"P": "sernavn", "r": "ikki sernavn"},
302
- "gender": {"g": "einki kyn"},
303
- "number": {"n": "einki tal"},
304
- "case": {"c": "einki fall"},
305
- "degree": {"d": "einki stig"},
306
- "declension": {"e": "eingin bending"},
307
- "voice": {"v": "eingin søgn"},
308
- "tense": {"t": "eingin tíð"},
309
- "person": {"p": "eingin persónur"},
310
- "subcategory": {"s": "eingin undirflokkur"},
311
- }
312
- lbl = FALLBACK.get(g, {}).get(chosen, "")
313
-
314
- parts.append(f"{chosen} – {lbl}" if lbl else chosen)
315
 
316
- return "; ".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- def tag_sentence(sentence: str, lang: str = "fo", max_len: int = 128):
320
- sentence = (sentence or "").strip()
321
- if not sentence:
322
- return pd.DataFrame(columns=["Word", "Tag", "Meaning"]), ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
324
- tokens = simp_tok(sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  if not tokens:
326
- return pd.DataFrame(columns=["Word", "Tag", "Meaning"]), ""
327
-
328
- enc = tokenizer(
329
- tokens,
330
- is_split_into_words=True,
331
- add_special_tokens=True,
332
- max_length=max_len,
333
- padding="max_length",
334
- truncation=True,
335
- return_attention_mask=True,
336
- return_tensors="pt",
337
- )
338
 
339
  input_ids = enc["input_ids"].to(device)
340
  attention_mask = enc["attention_mask"].to(device)
341
  word_ids = enc.word_ids(batch_index=0)
342
 
343
- begin_tokens = []
344
  last = None
345
  for wid in word_ids:
346
- if wid is None:
347
- begin_tokens.append(0)
348
- elif wid != last:
349
- begin_tokens.append(1)
350
- else:
351
- begin_tokens.append(0)
352
  last = wid
353
 
354
  with torch.no_grad():
355
- out = model(input_ids=input_ids, attention_mask=attention_mask)
356
- logits = out.logits[0]
357
 
358
- vectors = predict_vectors(logits, attention_mask[0], begin_tokens, DICT_INTERVALS, VEC_LEN)
359
 
360
  rows = []
361
  vec_i = 0
362
- seen_word_ids = set()
363
-
364
- for i, wid in enumerate(word_ids):
365
- if wid is None:
366
- continue
367
- if begin_tokens[i] != 1:
368
  continue
369
- if wid in seen_word_ids:
370
- continue
371
-
372
- seen_word_ids.add(wid)
373
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
374
-
375
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
376
- tag = vector_to_tag(vec)
377
- meaning = describe_vector(vec, lang)
378
-
379
- rows.append([word, tag, meaning])
380
  vec_i += 1
 
381
 
382
- df = pd.DataFrame(rows, columns=["Word", "Tag", "Meaning"])
383
- tsv = "\n".join([f"{w}\t{t}\t{m}" for w, t, m in rows])
384
- return df, tsv
385
-
386
-
387
- def build_legend(lang: str):
388
- lang = lang if lang in ("fo", "en") else "fo"
389
-
390
- if lang == "en":
391
- title = "### Legend (what the codes mean)"
392
- hint = "- Tip: hover/copy from the TSV box if you want to paste into spreadsheets or docs."
393
- wc_title = "#### Word classes"
394
- missing = "(No label file loaded — add tag_labels.json to the repo root.)"
395
- else:
396
- title = "### Markingaryvirlit (hvat kóðurnar merkja)"
397
- hint = "- Tips: tú kanst copy/paste úr TSV-kassanum inn í skjøl ella rokniskjøl."
398
- wc_title = "#### Orðaflokkar"
399
- missing = "(Eingin label-fíla er innlisin — legg tag_labels.json í rótina á repo.)"
400
-
401
- wc_map = LABELS.get(lang, {}).get("global", {}).get("word_class", {})
402
 
403
- lines = [title, hint, "", wc_title]
404
- if wc_map:
405
- for code in sorted(wc_map.keys()):
406
- lines.append(f"- **{code}**: {wc_map[code]}")
407
- else:
408
- lines.append(f"- {missing}")
409
 
410
- return "\n".join(lines)
411
-
412
-
413
- # ----------------------------
414
- # Gradio UI
415
- # ----------------------------
416
  theme = gr.themes.Soft()
417
 
418
- with gr.Blocks(theme=theme, title="BRAGD-markarin") as demo:
419
- gr.Markdown(
420
- "## BRAGD-markarin\n"
421
- "Skriv ein setning og hann markaðan.\n\n"
422
- "**Model:** `Setur/BRAGD`"
423
- )
424
-
425
- with gr.Row():
426
- lang = gr.Dropdown(
427
- choices=[("Føroyskt", "fo"), ("English", "en")],
428
- value="fo",
429
- label="Mál / Language",
430
- )
431
-
432
- inp = gr.Textbox(lines=3, label="Setningur / Sentence", placeholder="Skriv her…")
433
  btn = gr.Button("Marka / Tag", variant="primary")
434
 
435
- out_df = gr.Dataframe(
436
- headers=["Word", "Tag", "Meaning"],
437
- wrap=True,
438
- interactive=False,
439
- label="Úrslit / Results",
440
- )
441
- out_tsv = gr.Textbox(lines=10, label="Copy/paste (TSV)", interactive=False)
442
 
443
  with gr.Accordion("Markingaryvirlit / Legend", open=False):
444
  legend_md = gr.Markdown(build_legend("fo"))
445
 
446
- def _run(sentence, lang_choice):
447
- df, tsv = tag_sentence(sentence, lang_choice)
448
- return df, tsv, build_legend(lang_choice)
 
 
 
 
 
449
 
450
- btn.click(_run, inputs=[inp, lang], outputs=[out_df, out_tsv, legend_md])
451
- lang.change(lambda l: build_legend(l), inputs=[lang], outputs=[legend_md])
452
 
453
  if __name__ == "__main__":
454
  demo.launch()
 
1
+ import os, re, string, json
 
 
 
2
  from collections import defaultdict
3
 
4
  import gradio as gr
 
7
  import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
 
 
 
10
  MODEL_ID = "Setur/BRAGD"
11
+ TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must match model labels
12
+ LABELS_FILEPATH = "tag_labels.json" # add to repo root (FO+EN labels)
13
+ HF_TOKEN = os.getenv("BRAGD") # Space secret
14
 
 
15
  if not HF_TOKEN:
16
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
17
+ if not os.path.exists(LABELS_FILEPATH):
18
+ raise RuntimeError(f"Missing {LABELS_FILEPATH}. Add it to the Space repo root.")
19
 
20
+ # Match your demo.py intervals
21
  INTERVALS = (
22
+ (15, 29), (30, 33), (34, 36), (37, 41), (42, 43), (44, 45), (46, 50),
23
+ (51, 53), (54, 60), (61, 63), (64, 66), (67, 70), (71, 72)
 
 
 
 
 
 
 
 
 
 
 
24
  )
25
 
26
+ GROUP_ORDER = ["subcategory","gender","number","case","article","proper","degree","declension","mood","voice","tense","person","definiteness"]
27
+
28
+ # You said Subcategory B doesn't exist and will be deleted from the CSV:
29
+ HIDE_CODES = {"subcategory": {"B"}}
30
+
31
+ GROUP_TITLES = {
32
+ "en": {"subcategory":"Subcategory","gender":"Gender","number":"Number","case":"Case","article":"Article suffix","proper":"Proper noun",
33
+ "degree":"Degree","declension":"Declension","mood":"Mood","voice":"Voice","tense":"Tense","person":"Person","definiteness":"Definiteness"},
34
+ "fo": {"subcategory":"Undirflokkur","gender":"Kyn","number":"Tal","case":"Fall","article":"Bundið eftirlið","proper":"Sernavn",
35
+ "degree":"Stig","declension":"Bending","mood":"Háttur","voice":"Søgn","tense":"Tíð","person":"Persónur","definiteness":"Bundni/óbundni"},
36
+ }
37
+
38
+ UI = {
39
+ "fo": {"w":"Orð","t":"Mark","s":"Vís sum","m":"Merking","def":"bundið","ind":"óbundið"},
40
+ "en": {"w":"Word","t":"Tag","s":"Show as","m":"Meaning","def":"definite","ind":"indefinite"},
41
+ }
42
+
43
+ CSS = """
44
+ :root{
45
+ --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
46
+ --primary-100:#E1ECEA; --primary-200:#C6DAD6;
47
+ }
48
+ .gr-button-primary, button.primary, .primary{
49
+ background:var(--primary-500)!important; border-color:var(--primary-600)!important; color:#0b1b19!important;
50
+ }
51
+ .gr-button-primary:hover, button.primary:hover, .primary:hover{ background:var(--primary-600)!important; }
52
+ a{ color:var(--primary-700)!important; }
53
+ """
54
+
55
def simp_tok(s: str):
    """Split text into a flat list of word and punctuation tokens.

    A token is either a run of word characters or a single character from
    ``string.punctuation``; every other character (whitespace included) is
    skipped by the pattern.
    """
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    token_pattern = r"\w+|" + punctuation_class
    return re.findall(token_pattern, s)
57
+
58
def load_tag_mappings(path: str):
    """Read the tag CSV and build lookups between tags and feature vectors.

    The tag name is read from the ``"Original Tag"`` column; every column
    after the first one is treated as a feature and cast to ``int``.

    Returns:
        A 4-tuple ``(tag_to_features, features_to_tag, vec_len, feature_cols)``:
        tag -> integer numpy array, tuple(feature values) -> tag, the number
        of feature columns, and the list of feature column names.
    """
    table = pd.read_csv(path)
    feature_cols = list(table.columns[1:])

    tag_to_features = {}
    features_to_tag = {}
    for _, record in table.iterrows():
        tag = record["Original Tag"]
        features = record.iloc[1:].values.astype(int)
        tag_to_features[tag] = features
        # When two tags share a vector, the later row wins.
        features_to_tag[tuple(features)] = tag

    return tag_to_features, features_to_tag, len(feature_cols), feature_cols
64
+
65
def group_from_col(col: str):
    """Map a CSV feature column name to a ``(group, code)`` pair.

    Two column names are matched exactly (``"Article"`` and
    ``"Proper Noun"``) and carry fixed codes. All other columns are matched
    by prefix, and the code is the last whitespace-separated word of the
    column name. Returns ``(None, None)`` for unrecognised columns.
    """
    exact_matches = {
        "Article": ("article", "A"),
        "Proper Noun": ("proper", "P"),
    }
    if col in exact_matches:
        return exact_matches[col]

    prefix_groups = (
        ("No-Article ", "article"),
        ("Not-Proper-Noun ", "proper"),
        ("Word Class ", "word_class"),
        ("Subcategory ", "subcategory"),
        ("No-Subcategory ", "subcategory"),
        ("Gender ", "gender"),
        ("No-Gender ", "gender"),
        ("Number ", "number"),
        ("No-Number ", "number"),
        ("Case ", "case"),
        ("No-Case ", "case"),
        ("Degree ", "degree"),
        ("No-Degree ", "degree"),
        ("Declension ", "declension"),
        ("No-Declension ", "declension"),
        ("Mood ", "mood"),
        ("Voice ", "voice"),
        ("No-Voice ", "voice"),
        ("Tense ", "tense"),
        ("No-Tense ", "tense"),
        ("Person ", "person"),
        ("No-Person ", "person"),
        ("Definite ", "definiteness"),
        ("Indefinite ", "definiteness"),
    )
    group = next(
        (name for prefix, name in prefix_groups if col.startswith(prefix)),
        None,
    )
    if group is None:
        return (None, None)
    return (group, col.split()[-1])
89
 
90
def process_tag_features(tag_to_features: dict, intervals):
    """Work out which feature intervals are used by each word type.

    For each of the first 15 vector positions (one per word type), collect
    the distinct feature vectors that have a 1 at that position. An interval
    ``(start, end)`` (inclusive) is kept for the word type when at least one
    of those vectors has a non-zero entry inside it.

    Returns:
        dict mapping word-type index (0..14) to a list of the interval
        objects taken from ``intervals``; empty when no vector uses that type.
    """
    distinct_vectors = {tuple(features) for features in tag_to_features.values()}

    allowed = {}
    for word_type in range(15):
        members = [np.array(v) for v in distinct_vectors if v[word_type] == 1]
        if len(members) == 0:
            allowed[word_type] = []
            continue
        # Column-wise totals: non-zero wherever any member vector is set.
        totals = np.array(members).sum(axis=0)
        allowed[word_type] = [
            span for span in intervals
            if totals[span[0]:span[1] + 1].sum() != 0
        ]
    return allowed
 
 
 
 
 
99
 
100
def predict_vectors(logits, attention_mask, begin_tokens, dict_intervals, vec_len):
    """Decode per-token logits into one-hot style feature vectors.

    Only positions whose attention mask is 1 and whose ``begin_tokens`` flag
    is 1 produce a vector. For each such position the highest-scoring entry
    among the first 15 logits selects the word type; then, for every
    inclusive interval listed for that word type in ``dict_intervals``, the
    highest-scoring entry inside the interval is switched on.

    Returns:
        list of 1-D tensors of length ``vec_len`` on ``logits.device``, in
        the order of the selected positions.
    """
    vectors = []
    for position, token_logits in enumerate(logits):
        # Short-circuit on the mask first, exactly like the flag check order
        # callers rely on when begin_tokens covers only real positions.
        if not (attention_mask[position].item() == 1 and begin_tokens[position] == 1):
            continue

        vec = torch.zeros(vec_len, device=logits.device)

        word_type = torch.argmax(torch.softmax(token_logits[0:15], dim=0)).item()
        vec[word_type] = 1

        for start, end in dict_intervals.get(word_type, []):
            segment = token_logits[start:end + 1]
            offset = torch.argmax(torch.softmax(segment, dim=0)).item()
            vec[start + offset] = 1

        vectors.append(vec)
    return vectors
120
 
121
+ # Load labels (extracted from your XLSX)
122
+ with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
123
+ LABELS = json.load(f)
124
 
125
def label_for(lang: str, group: str, wc: str, code: str) -> str:
    """Look up the human-readable label for a code.

    Any ``lang`` other than ``"fo"`` is treated as ``"en"``. A label stored
    under ``by_word_class`` for the given word class ``wc`` takes priority;
    otherwise the ``global`` table is used. Returns ``""`` when no label is
    found.
    """
    language = "en" if lang != "fo" else "fo"
    per_language = LABELS.get(language, {})

    if wc:
        specific = per_language.get("by_word_class", {})
        if wc in specific and code in specific[wc].get(group, {}):
            return specific[wc][group][code]

    return per_language.get("global", {}).get(group, {}).get(code, "")
132
 
133
+ # Load CSV mappings (authoritative)
134
+ tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
135
 
136
+ # Load model
137
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
138
+ model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
139
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
140
+ model.to(device); model.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
+ if hasattr(model, "config") and hasattr(model.config, "num_labels"):
143
+ if model.config.num_labels != VEC_LEN:
144
+ raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
145
+
146
+ DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
 
 
 
147
 
148
+ # Build GROUPS from CSV headers
149
+ GROUPS = defaultdict(list) # group -> [(idx, code, colname)]
150
+ for i,col in enumerate(FEATURE_COLS):
151
+ g,code = group_from_col(col)
152
+ if g and code not in HIDE_CODES.get(g, set()):
153
+ GROUPS[g].append((i, code, col))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
def vector_to_tag(vec: torch.Tensor) -> str:
    """Return the tag whose feature vector equals ``vec``, else "Unknown Tag".

    The tensor is cast to integers and turned into a tuple so it can be used
    as a key into ``features_to_tag``.
    """
    key = tuple(vec.int().tolist())
    if key in features_to_tag:
        return features_to_tag[key]
    return "Unknown Tag"
157
+
158
def wc_code(vec: torch.Tensor) -> str:
    """Return the code of the first word-class feature set to 1 in ``vec``.

    Returns ``""`` when no word-class position is set.
    """
    active_codes = (
        code
        for idx, code, _ in GROUPS["word_class"]
        if int(vec[idx].item()) == 1
    )
    return next(active_codes, "")
163
+
164
def group_code(vec: torch.Tensor, group: str) -> str:
    """Return the first active, non-hidden code of ``group`` in ``vec``.

    Codes listed for the group in ``HIDE_CODES`` are ignored. Returns ``""``
    when the group has no active visible code.
    """
    hidden = HIDE_CODES.get(group, set())
    active_codes = (
        code
        for idx, code, _ in GROUPS.get(group, [])
        if code not in hidden and int(vec[idx].item()) == 1
    )
    return next(active_codes, "")
172
 
173
def describe(vec: torch.Tensor, lang: str) -> str:
    """Build a "; "-separated description of every active code in ``vec``.

    The word class comes first, followed by the groups in ``GROUP_ORDER``.
    Each entry is rendered as "code – label", or just the code when no label
    is available.
    """
    def labelled(code, label):
        return f"{code} – {label}" if label else code

    wc = wc_code(vec)
    parts = []
    if wc:
        parts.append(labelled(wc, label_for(lang, "word_class", wc, wc)))

    for group in GROUP_ORDER:
        code = group_code(vec, group)
        if code:
            # Word-class-specific label first, then the lookup without a word class.
            label = label_for(lang, group, wc, code) or label_for(lang, group, "", code)
            parts.append(labelled(code, label))

    return "; ".join(parts)
186
 
187
def show_as(vec: torch.Tensor, lang: str) -> str:
    """Return the short display form for a tagged word.

    Any ``lang`` other than ``"fo"`` is treated as ``"en"``. The raw tag
    "DGd" is always displayed as a preposition. For word class "S" the
    word-class label is followed by the definite/indefinite wording from
    ``UI`` depending on the article code ("A" or "a"). Everything else is
    shown as the word-class label, falling back to the word-class code.
    """
    lang = "en" if lang != "fo" else "fo"
    wc = wc_code(vec)
    wc_lbl = label_for(lang, "word_class", wc, wc) or wc
    raw = vector_to_tag(vec)

    # Fixed override for this one tag.
    if raw == "DGd":
        return {"fo": "Fyriseting", "en": "Preposition"}[lang]

    if wc != "S":
        return wc_lbl

    # Article code -> key of the wording stored in UI[lang].
    ui_key = {"A": "def", "a": "ind"}.get(group_code(vec, "article"))
    if ui_key is None:
        return wc_lbl
    return f"{wc_lbl} — {UI[lang][ui_key]}"
205
+
206
def compute_codes_by_wc():
    """Collect, per word class, the codes that occur in each feature group.

    Walks every feature vector in ``tag_to_features``, finds its word-class
    code via ``GROUPS["word_class"]`` and records every active, non-hidden
    code of each group in ``GROUP_ORDER``.

    Returns:
        nested defaultdict: word-class code -> group -> set of codes. Entries
        are only created when at least one code is recorded.
    """
    codes = defaultdict(lambda: defaultdict(set))  # wc -> group -> set(code)

    for raw_features in tag_to_features.values():
        features = np.array(raw_features)

        wc = next(
            (code for idx, code, _ in GROUPS["word_class"] if features[idx] == 1),
            None,
        )
        if not wc:
            continue

        for group in GROUP_ORDER:
            hidden = HIDE_CODES.get(group, set())
            active = [
                code
                for idx, code, _ in GROUPS.get(group, [])
                if code not in hidden and features[idx] == 1
            ]
            # Touch the nested defaultdict only when there is something to
            # record, so no empty word-class or group entries appear.
            if active:
                codes[wc][group].update(active)

    return codes
228
+
229
+ CODES_BY_WC = compute_codes_by_wc()
230
+
231
def build_legend(lang: str) -> str:
    """Render the Markdown legend of codes for every word class.

    Any ``lang`` other than ``"fo"`` is treated as ``"en"``. For each word
    class in ``CODES_BY_WC`` (sorted) a heading is written, followed by one
    titled bullet list per group in ``GROUP_ORDER`` that has codes.
    """
    lang = "en" if lang != "fo" else "fo"
    lines = ["### Markingaryvirlit / Legend", ""]

    for wc in sorted(CODES_BY_WC):
        heading = label_for(lang, "word_class", wc, wc) or ""
        lines.append(f"#### {wc} — {heading}" if heading else f"#### {wc}")

        for group in GROUP_ORDER:
            group_codes = sorted(CODES_BY_WC[wc].get(group, set()))
            if not group_codes:
                continue
            lines.append(f"**{GROUP_TITLES[lang].get(group, group)}**")
            for code in group_codes:
                label = label_for(lang, group, wc, code) or label_for(lang, group, "", code)
                lines.append(f"- `{code}` — {label}" if label else f"- `{code}`")
            # Blank line closes the bullet list of this group.
            lines.append("")
        # Blank line separates word-class sections.
        lines.append("")

    return "\n".join(lines).strip()
249
+
250
def run_model(sentence: str):
    """Tag a sentence and return one row per word.

    The sentence is split with ``simp_tok``, encoded as pre-split words
    (padded/truncated to 128 positions), run through the model, and decoded
    with ``predict_vectors``.

    Returns:
        list of dicts ``{"word": str, "vec": list[int]}``; an empty list
        when the input is empty or yields no tokens.
    """
    text = (sentence or "").strip()
    tokens = simp_tok(text) if text else []
    if not tokens:
        return []

    enc = tokenizer(
        tokens,
        is_split_into_words=True,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    word_ids = enc.word_ids(batch_index=0)

    # Flag positions whose word id is set and differs from the previous
    # position's word id.
    begin = []
    previous = None
    for wid in word_ids:
        begin.append(1 if wid is not None and wid != previous else 0)
        previous = wid

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits[0]

    vectors = predict_vectors(logits, attention_mask[0], begin, DICT_INTERVALS, VEC_LEN)

    rows = []
    seen = set()
    for wid, is_begin in zip(word_ids, begin):
        if wid is None or is_begin != 1 or wid in seen:
            continue
        seen.add(wid)

        # One row is appended per accepted word, so the row count doubles as
        # the index of the next decoded vector.
        position = len(rows)
        word = tokens[wid] if wid < len(tokens) else "<UNK>"
        if position < len(vectors):
            vec = vectors[position]
        else:
            vec = torch.zeros(VEC_LEN, device=device)
        rows.append({"word": word, "vec": vec.int().tolist()})

    return rows
290
 
291
def render(rows_state, lang: str):
    """Turn stored tagging rows into a results table plus the legend.

    Any ``lang`` other than ``"fo"`` is treated as ``"en"``. Column headers
    come from ``UI[lang]``. Each stored row (``{"word", "vec"}``) becomes
    ``[word, tag, show-as text, description]``.

    Returns:
        ``(DataFrame, legend_markdown)``; the frame is empty (headers only)
        when ``rows_state`` is empty.
    """
    lang = "en" if lang != "fo" else "fo"
    headers = [UI[lang][key] for key in ("w", "t", "s", "m")]

    def as_table_row(row):
        vec = torch.tensor(row["vec"])
        return [row["word"], vector_to_tag(vec), show_as(vec, lang), describe(vec, lang)]

    if rows_state:
        frame = pd.DataFrame([as_table_row(row) for row in rows_state], columns=headers)
    else:
        frame = pd.DataFrame(columns=headers)
    return frame, build_legend(lang)
 
302
 
 
 
 
 
 
 
303
  theme = gr.themes.Soft()
304
 
305
+ with gr.Blocks(theme=theme, css=CSS, title="BRAGD-markarin") as demo:
306
+ gr.Markdown("## BRAGD-markarin\nSkriv ein setning og fá hann markaðan.\n\n**Model:** `Setur/BRAGD`")
307
+
308
+ inp = gr.Textbox(lines=3, label="Setningur / Sentence", placeholder="Skriv her… / Type here…")
 
 
 
 
 
 
 
 
 
 
 
309
  btn = gr.Button("Marka / Tag", variant="primary")
310
 
311
+ state = gr.State([])
312
+ out_df = gr.Dataframe(wrap=True, interactive=False, label="Úrslit / Results")
313
+
314
+ # Under results + can be changed AFTER tagging (no rerun; just re-render)
315
+ lang = gr.Dropdown(choices=[("Føroyskt","fo"), ("English","en")], value="fo", label="Mál / Language")
 
 
316
 
317
  with gr.Accordion("Markingaryvirlit / Legend", open=False):
318
  legend_md = gr.Markdown(build_legend("fo"))
319
 
320
+ def on_tag(sentence, lang_choice):
321
+ rows = run_model(sentence)
322
+ df, legend = render(rows, lang_choice)
323
+ return rows, df, legend
324
+
325
+ def on_lang(rows, lang_choice):
326
+ df, legend = render(rows, lang_choice)
327
+ return df, legend
328
 
329
+ btn.click(on_tag, inputs=[inp, lang], outputs=[state, out_df, legend_md])
330
+ lang.change(on_lang, inputs=[state, lang], outputs=[out_df, legend_md])
331
 
332
  if __name__ == "__main__":
333
  demo.launch()
tag_labels.json CHANGED
@@ -1,610 +1,607 @@
1
  {
2
  "en": {
3
- "by_wc": {
4
- "gender": {
5
- "S": {
6
- "M": "masculine",
7
- "F": "feminine",
8
- "N": "neuter"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  },
10
- "R": {
11
  "M": "masculine",
12
  "F": "feminine",
13
  "N": "neuter"
14
  },
15
- "A": {
16
- "M": "masculine",
17
- "F": "feminine",
18
- "N": "neuter"
19
  },
20
- "P": {
21
- "M": "masculine",
22
- "F": "feminine",
23
- "N": "neuter"
 
24
  },
25
- "N": {
26
- "M": "masculine",
27
- "F": "feminine",
28
- "N": "neuter"
29
  },
30
- "L": {
 
 
 
 
 
 
 
 
31
  "M": "masculine",
32
  "F": "feminine",
33
  "N": "neuter"
34
- }
35
- },
36
- "number": {
37
- "S": {
38
- "S": "singular",
39
- "P": "plural"
40
  },
41
- "R": {
42
  "S": "singular",
43
  "P": "plural"
44
  },
45
- "A": {
46
- "S": "singular",
47
- "P": "plural"
 
 
48
  },
49
- "P": {
50
- "S": "singular",
51
- "P": "plural"
 
 
 
 
 
52
  },
53
- "N": {
54
- "S": "singular",
55
- "P": "plural"
 
 
56
  },
57
- "V": {
58
- "S": "singular",
59
- "P": "plural"
 
60
  },
61
- "L": {
 
 
 
 
 
62
  "S": "singular",
63
  "P": "plural"
64
- }
65
- },
66
- "case": {
67
- "S": {
68
- "N": "nominative",
69
- "A": "accusative",
70
- "D": "dative",
71
- "G": "genitive"
72
- },
73
- "R": {
74
- "N": "nominative",
75
- "A": "accusative",
76
- "D": "dative",
77
- "G": "genitive"
78
  },
79
- "A": {
80
  "N": "nominative",
81
  "A": "accusative",
82
  "D": "dative",
83
  "G": "genitive"
 
 
 
 
 
84
  },
85
- "P": {
86
- "N": "nominative",
87
- "A": "accusative",
88
- "D": "dative",
89
- "G": "genitive"
90
  },
91
- "N": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  "N": "nominative",
93
  "A": "accusative",
94
  "D": "dative",
95
  "G": "genitive"
 
 
 
 
 
 
 
 
 
96
  },
97
- "L": {
 
 
 
 
 
 
 
 
 
98
  "N": "nominative",
99
  "A": "accusative",
100
  "D": "dative",
101
  "G": "genitive"
102
  }
103
  },
104
- "article": {
105
- "S": {
106
- "A": "with suffixed definite article"
107
- }
108
- },
109
- "proper": {
110
- "S": {
111
- "P": "Proper Noun"
112
- }
113
- },
114
- "definiteness": {
115
- "R": {
116
- "I": "indefinite",
117
- "D": "definite"
118
- }
119
- },
120
- "degree": {
121
- "A": {
122
- "P": "positive",
123
- "C": "comparative",
124
- "S": "superlative",
125
- "A": "absolute superlative"
126
  },
127
- "D": {
128
- "C": "comparative",
129
- "S": "superlative",
130
- "A": "absolute superlative"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
  },
133
- "declension": {
134
- "A": {
135
- "S": "strong",
136
- "W": "weak",
137
- "e": "no-declension"
138
  },
139
- "L": {
 
 
 
 
140
  "S": "strong",
141
  "W": "weak",
142
  "e": "no-declension"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  }
144
  },
145
- "subcategory": {
146
- "P": {
147
- "D": "demonstrative",
148
- "E": "possessive",
149
- "I": "indefinite"
150
- },
151
- "N": {
152
- "C": "cardinal number",
153
- "O": "Date and other indeclinable number"
154
  },
155
- "D": {
156
  "N": "does not govern case",
157
  "G": "governs case",
158
  "I": "interjection/exclamation"
159
  },
160
- "C": {
 
 
 
 
 
 
 
 
 
 
161
  "C": "coordinative",
162
  "S": "subordinative",
163
  "I": "infinitive (only \"at\" infinitive)",
164
  "R": "relative"
165
- },
166
- "T": {
167
- "S": "abbreviation",
168
- "T": "short form"
169
- },
170
- "K": {
171
- "E": "End of sentence",
172
- "C": "comma",
173
- "Q": "quotes",
174
- "O": "other"
175
  }
176
  },
177
- "person": {
178
- "P": {
179
- "1": "1st pers",
180
- "2": "2nd pers",
181
- "3": "3rd pers"
 
 
 
 
 
 
 
 
182
  },
183
- "V": {
184
- "1": "1st person",
185
- "2": "2nd person",
186
- "3": "3rd person"
187
  }
188
  },
189
- "mood": {
190
- "V": {
191
- "I": "infinitive",
192
- "M": "imperative",
193
- "N": "indicative",
194
- "S": "subjunctive",
195
- "U": "supine"
196
  }
197
  },
198
- "voice": {
199
- "V": {
200
- "A": "active",
201
- "M": "mediopassive"
202
  },
203
- "L": {
204
- "A": "active",
205
- "M": "mediopassive"
 
 
206
  }
207
  },
208
- "tense": {
209
- "V": {
210
- "P": "present",
211
- "A": "past"
212
  }
213
  }
214
- },
 
 
215
  "global": {
216
  "word_class": {
217
- "S": "substantive",
218
- "R": "article",
219
- "A": "adjective",
220
- "P": "pronoun",
221
- "N": "numeral",
222
- "V": "verb (except for participle)",
223
- "L": "participle",
224
- "D": "adverb",
225
- "C": "conjunction",
226
- "F": "Foreign word",
227
- "X": "Unanalysed word",
228
- "T": "abbreviation",
229
- "W": "e-mail, web address",
230
- "K": "punctuation",
231
- "M": "Symbol"
232
- },
233
- "gender": {
234
- "M": "masculine",
235
- "F": "feminine",
236
- "N": "neuter"
237
- },
238
- "number": {
239
- "S": "singular",
240
- "P": "plural"
241
- },
242
- "case": {
243
- "N": "nominative",
244
- "A": "accusative",
245
- "D": "dative",
246
- "G": "genitive"
247
- },
248
- "article": {
249
- "A": "with suffixed definite article"
250
- },
251
- "proper": {
252
- "P": "Proper Noun"
253
  },
 
 
 
 
 
 
 
 
 
 
 
 
254
  "definiteness": {
255
- "I": "indefinite",
256
- "D": "definite"
257
- },
258
- "degree": {
259
- "P": "positive",
260
- "C": "comparative",
261
- "S": "superlative",
262
- "A": "absolute superlative"
263
- },
264
- "declension": {
265
- "S": "strong",
266
- "W": "weak",
267
- "e": "no-declension"
268
- },
269
- "subcategory": {
270
- "D": "demonstrative",
271
- "E": "possessive",
272
- "I": "indefinite",
273
- "C": "cardinal number",
274
- "O": "Date and other indeclinable number",
275
- "N": "does not govern case",
276
- "G": "governs case",
277
- "S": "subordinative",
278
- "R": "relative",
279
- "T": "short form",
280
- "Q": "quotes"
281
- },
282
- "person": {
283
- "1": "1st pers",
284
- "2": "2nd pers",
285
- "3": "3rd pers"
286
- },
287
- "mood": {
288
- "I": "infinitive",
289
- "M": "imperative",
290
- "N": "indicative",
291
- "S": "subjunctive",
292
- "U": "supine"
293
- },
294
- "voice": {
295
- "A": "active",
296
- "M": "mediopassive"
297
- },
298
- "tense": {
299
- "P": "present",
300
- "A": "past"
301
  }
302
- }
303
- },
304
- "fo": {
305
- "by_wc": {
306
- "gender": {
307
- "S": {
308
- "M": "kallkyn",
309
- "F": "kvennkyn",
310
- "N": "hvørkikyn"
311
- },
312
- "R": {
313
- "M": "kallkyn",
314
- "F": "kvennkyn",
315
- "N": "hvørkikyn"
316
- },
317
- "A": {
318
- "M": "kallkyn",
319
- "F": "kvennkyn",
320
- "N": "hvørkikyn"
321
- },
322
- "P": {
323
- "M": "kallkyn",
324
- "F": "kvennkyn",
325
- "N": "hvørkikyn"
326
- },
327
- "N": {
328
- "M": "kallkyn",
329
- "F": "kvennkyn",
330
- "N": "hvørkikyn"
331
  },
332
- "L": {
333
  "M": "kallkyn",
334
  "F": "kvennkyn",
335
  "N": "hvørkikyn"
336
- }
337
- },
338
- "number": {
339
- "S": {
340
- "S": "eintal",
341
- "P": "fleirtal"
342
  },
343
- "R": {
344
  "S": "eintal",
345
  "P": "fleirtal"
346
  },
347
- "A": {
348
- "S": "eintal",
349
- "P": "fleirtal"
350
- },
351
- "P": {
352
- "S": "eintal",
353
- "P": "fleirtal"
354
- },
355
- "N": {
356
- "S": "eintal",
357
- "P": "fleirtal"
358
- },
359
- "V": {
360
- "S": "eintal",
361
- "P": "fleirtal"
362
- },
363
- "L": {
364
- "S": "eintal",
365
- "P": "fleirtal"
366
- }
367
- },
368
- "case": {
369
- "S": {
370
  "N": "hvørfall",
371
  "A": "hvønnfall",
372
  "D": "hvørjumfall",
373
  "G": "hvørsfall"
374
  },
375
- "R": {
376
- "N": "hvørfall",
377
- "A": "hvønnfall",
378
- "D": "hvørjumfall",
379
- "G": "hvørsfall"
380
  },
381
- "A": {
382
- "N": "hvørfall",
383
- "A": "hvønnfall",
384
- "D": "hvørjumfall",
385
- "G": "hvørsfall"
 
 
386
  },
387
- "P": {
388
- "N": "hvørfall",
389
- "A": "hvønnfall",
390
- "D": "hvørjumfall",
391
- "G": "hvørsfall"
392
  },
393
- "N": {
394
- "N": "hvørfall",
395
- "A": "hvønnfall",
396
- "D": "hvørjumfall",
397
- "G": "hvørsfall"
398
  },
399
- "L": {
400
  "N": "hvørfall",
401
  "A": "hvønnfall",
402
  "D": "hvørjumfall",
403
  "G": "hvørsfall"
404
- }
405
- },
406
- "article": {
407
- "S": {
408
- "A": "bundið"
409
- }
410
- },
411
- "proper": {
412
- "S": {
413
- "P": "sernavn"
414
- }
415
- },
416
- "definiteness": {
417
- "R": {
418
  "I": "óbundið",
419
  "D": "bundið"
420
  }
421
  },
422
- "degree": {
423
- "A": {
 
 
 
424
  "P": "grundstig",
425
  "C": "miðstig",
426
  "S": "hástig",
427
  "A": "absolutt hástig"
428
  },
429
- "D": {
430
- "C": "miðstig",
431
- "S": "hástig",
432
- "A": "absolutt hástig"
433
- }
434
- },
435
- "declension": {
436
- "A": {
437
  "S": "sterk",
438
  "W": "veik",
439
  "e": "eingin-bending"
440
  },
441
- "L": {
442
- "S": "sterk",
443
- "W": "veik",
444
- "e": "eingin-bending"
 
 
 
 
 
 
 
 
 
 
445
  }
446
  },
447
- "subcategory": {
448
- "P": {
 
 
 
449
  "D": "ávísingarfornavn",
450
  "E": "ognarfornavn",
451
  "I": "óbundið fornavn"
452
  },
453
- "N": {
454
- "C": "grundtal",
455
- "O": "dagfesting og onnur óbendandi tøl"
456
- },
457
- "D": {
458
- "N": "stýrir ikki falli",
459
- "G": "stýrir falli",
460
- "I": "miðalvarping"
461
  },
462
- "C": {
463
- "C": "javnskipandi",
464
- "S": "innskipandi",
465
- "I": "navnháttarmerki (bara \"at\")",
466
- "R": "afturbeint fornavn"
467
  },
468
- "T": {
469
- "S": "stytting",
470
- "T": "stytting við punktum"
471
  },
472
- "K": {
473
- "E": "endi av setningi",
474
- "C": "komma",
475
- "Q": "gásareyga",
476
- "O": "annað"
477
  }
478
  },
479
- "person": {
480
- "P": {
481
- "1": "fyrsti persónur",
482
- "2": "annar persónur",
483
- "3": "triði persónur"
484
  },
485
- "V": {
486
- "1": "fyrsti persónur",
487
- "2": "annar persónur",
488
- "3": "triði persónur"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  }
490
  },
491
- "mood": {
492
- "V": {
 
 
 
493
  "I": "navnháttur",
494
  "M": "boðsháttur",
495
  "N": "søguháttur",
496
  "S": "hugsháttur",
497
  "U": "luttøkuháttur"
498
- }
499
- },
500
- "voice": {
501
- "V": {
502
- "A": "gerðsøgn",
503
- "M": "miðalsøgn"
504
  },
505
- "L": {
506
  "A": "gerðsøgn",
507
  "M": "miðalsøgn"
508
- }
509
- },
510
- "tense": {
511
- "V": {
512
  "P": "nútíð",
513
  "A": "tátíð"
514
  },
515
- "L": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  "P": "nútíð",
517
  "A": "tátíð"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
  }
519
- }
520
- },
521
- "global": {
522
- "word_class": {
523
- "S": "navnorð",
524
- "R": "kenniorð",
525
- "A": "lýsingarorð",
526
- "P": "fornavn",
527
- "N": "talorð",
528
- "V": "sagnorð (ikki lýsingarháttur)",
529
- "L": "lýsingarháttur",
530
- "D": "hjáorð",
531
- "C": "sambindingarorð",
532
- "F": "útlendskt orð",
533
- "X": "ómarkað orð",
534
- "T": "stytting",
535
- "W": "teldupostur, heimasíðua",
536
- "K": "teknseting",
537
- "M": "symbol"
538
- },
539
- "gender": {
540
- "M": "kallkyn",
541
- "F": "kvennkyn",
542
- "N": "hvørkikyn"
543
- },
544
- "number": {
545
- "S": "eintal",
546
- "P": "fleirtal"
547
- },
548
- "case": {
549
- "N": "hvørfall",
550
- "A": "hvønnfall",
551
- "D": "hvørjumfall",
552
- "G": "hvørsfall"
553
- },
554
- "article": {
555
- "A": "bundið"
556
- },
557
- "proper": {
558
- "P": "sernavn"
559
  },
560
- "definiteness": {
561
- "I": "óbundið",
562
- "D": "bundið"
 
 
 
 
 
 
 
 
 
 
 
563
  },
564
- "degree": {
565
- "P": "grundstig",
566
- "C": "miðstig",
567
- "S": "hástig",
568
- "A": "absolutt hástig"
 
 
 
 
 
569
  },
570
- "declension": {
571
- "S": "sterk",
572
- "W": "veik",
573
- "e": "eingin-bending"
574
  },
575
- "subcategory": {
576
- "D": "ávísingarfornavn",
577
- "E": "ognarfornavn",
578
- "I": "óbundið fornavn",
579
- "C": "grundtal",
580
- "O": "dagfesting og onnur óbendandi tøl",
581
- "N": "stýrir ikki falli",
582
- "G": "stýrir falli",
583
- "S": "innskipandi",
584
- "R": "afturbeint fornavn",
585
- "T": "stytting við punktum",
586
- "Q": "gásareyga"
587
  },
588
- "person": {
589
- "1": "fyrsti persónur",
590
- "2": "annar persónur",
591
- "3": "triði persónur"
 
 
 
 
592
  },
593
- "mood": {
594
- "I": "navnháttur",
595
- "M": "boðsháttur",
596
- "N": "søguháttur",
597
- "S": "hugsháttur",
598
- "U": "luttøkuháttur"
599
  },
600
- "voice": {
601
- "A": "gerðsøgn",
602
- "M": "miðalsøgn"
 
 
 
 
 
 
 
603
  },
604
- "tense": {
605
- "P": "nútíð",
606
- "A": "tátíð"
 
607
  }
608
  }
 
 
 
 
 
609
  }
610
  }
 
1
  {
2
  "en": {
3
+ "global": {
4
+ "word_class": {
5
+ "S": "substantive",
6
+ "R": "article",
7
+ "A": "adjective",
8
+ "P": "pronoun",
9
+ "N": "numeral",
10
+ "V": "verb (except for participle)",
11
+ "L": "participle",
12
+ "D": "adverb",
13
+ "C": "conjunction",
14
+ "F": "Foreign word",
15
+ "X": "Unanalysed word",
16
+ "T": "abbreviation",
17
+ "W": "e-mail, web address",
18
+ "K": "punctuation",
19
+ "M": "Symbol"
20
+ },
21
+ "gender": {},
22
+ "number": {},
23
+ "case": {},
24
+ "article": {},
25
+ "proper": {},
26
+ "degree": {},
27
+ "declension": {},
28
+ "subcategory": {},
29
+ "person": {},
30
+ "mood": {},
31
+ "voice": {},
32
+ "tense": {},
33
+ "definiteness": {
34
+ "D": "definite",
35
+ "I": "indefinite"
36
+ }
37
+ },
38
+ "by_word_class": {
39
+ "S": {
40
+ "word_class": {
41
+ "S": "substantive"
42
  },
43
+ "gender": {
44
  "M": "masculine",
45
  "F": "feminine",
46
  "N": "neuter"
47
  },
48
+ "number": {
49
+ "S": "singular",
50
+ "P": "plural"
 
51
  },
52
+ "case": {
53
+ "N": "nominative",
54
+ "A": "accusative",
55
+ "D": "dative",
56
+ "G": "genitive"
57
  },
58
+ "article": {
59
+ "A": "with suffixed definite article"
 
 
60
  },
61
+ "proper": {
62
+ "P": "Proper Noun"
63
+ }
64
+ },
65
+ "R": {
66
+ "word_class": {
67
+ "R": "article"
68
+ },
69
+ "gender": {
70
  "M": "masculine",
71
  "F": "feminine",
72
  "N": "neuter"
 
 
 
 
 
 
73
  },
74
+ "number": {
75
  "S": "singular",
76
  "P": "plural"
77
  },
78
+ "case": {
79
+ "N": "nominative",
80
+ "A": "accusative",
81
+ "D": "dative",
82
+ "G": "genitive"
83
  },
84
+ "article": {
85
+ "I": "indefinite",
86
+ "D": "definite"
87
+ }
88
+ },
89
+ "A": {
90
+ "word_class": {
91
+ "A": "adjective"
92
  },
93
+ "degree": {
94
+ "P": "positive",
95
+ "C": "comparative",
96
+ "S": "superlative",
97
+ "A": "absolute superlative"
98
  },
99
+ "declension": {
100
+ "S": "strong",
101
+ "W": "weak",
102
+ "e": "no-declension"
103
  },
104
+ "gender": {
105
+ "M": "masculine",
106
+ "F": "feminine",
107
+ "N": "neuter"
108
+ },
109
+ "number": {
110
  "S": "singular",
111
  "P": "plural"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  },
113
+ "case": {
114
  "N": "nominative",
115
  "A": "accusative",
116
  "D": "dative",
117
  "G": "genitive"
118
+ }
119
+ },
120
+ "P": {
121
+ "word_class": {
122
+ "P": "pronoun"
123
  },
124
+ "subcategory": {
125
+ "D": "demonstrative",
126
+ "E": "possessive",
127
+ "I": "indefinite"
 
128
  },
129
+ "gender": {
130
+ "M": "masculine",
131
+ "F": "feminine",
132
+ "N": "neuter"
133
+ },
134
+ "person": {
135
+ "1": "1st pers",
136
+ "2": "2nd pers",
137
+ "3": "3rd pers"
138
+ },
139
+ "number": {
140
+ "S": "singular",
141
+ "P": "plural"
142
+ },
143
+ "case": {
144
  "N": "nominative",
145
  "A": "accusative",
146
  "D": "dative",
147
  "G": "genitive"
148
+ }
149
+ },
150
+ "N": {
151
+ "word_class": {
152
+ "N": "numeral"
153
+ },
154
+ "subcategory": {
155
+ "C": "cardinal number",
156
+ "O": "Date and other indeclinable number"
157
  },
158
+ "gender": {
159
+ "M": "masculine",
160
+ "F": "feminine",
161
+ "N": "neuter"
162
+ },
163
+ "number": {
164
+ "S": "singular",
165
+ "P": "plural"
166
+ },
167
+ "case": {
168
  "N": "nominative",
169
  "A": "accusative",
170
  "D": "dative",
171
  "G": "genitive"
172
  }
173
  },
174
+ "V": {
175
+ "word_class": {
176
+ "V": "verb (except for participle)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  },
178
+ "mood": {
179
+ "I": "infinitive",
180
+ "M": "imperative",
181
+ "N": "indicative",
182
+ "S": "subjunctive",
183
+ "U": "supine"
184
+ },
185
+ "voice": {
186
+ "A": "active",
187
+ "M": "mediopassive"
188
+ },
189
+ "tense": {
190
+ "P": "present",
191
+ "A": "past"
192
+ },
193
+ "number": {
194
+ "S": "singular",
195
+ "P": "plural"
196
+ },
197
+ "person": {
198
+ "1": "1st person",
199
+ "2": "2nd person",
200
+ "3": "3rd person"
201
  }
202
  },
203
+ "L": {
204
+ "word_class": {
205
+ "L": "participle"
 
 
206
  },
207
+ "voice": {
208
+ "A": "active",
209
+ "M": "mediopassive"
210
+ },
211
+ "declension": {
212
  "S": "strong",
213
  "W": "weak",
214
  "e": "no-declension"
215
+ },
216
+ "gender": {
217
+ "M": "masculine",
218
+ "F": "feminine",
219
+ "N": "neuter"
220
+ },
221
+ "number": {
222
+ "S": "singular",
223
+ "P": "plural"
224
+ },
225
+ "case": {
226
+ "N": "nominative",
227
+ "A": "accusative",
228
+ "D": "dative",
229
+ "G": "genitive"
230
  }
231
  },
232
+ "D": {
233
+ "word_class": {
234
+ "D": "adverb"
 
 
 
 
 
 
235
  },
236
+ "subcategory": {
237
  "N": "does not govern case",
238
  "G": "governs case",
239
  "I": "interjection/exclamation"
240
  },
241
+ "degree": {
242
+ "C": "comparative",
243
+ "S": "superlative",
244
+ "A": "absolute superlative"
245
+ }
246
+ },
247
+ "C": {
248
+ "word_class": {
249
+ "C": "conjunction"
250
+ },
251
+ "subcategory": {
252
  "C": "coordinative",
253
  "S": "subordinative",
254
  "I": "infinitive (only \"at\" infinitive)",
255
  "R": "relative"
 
 
 
 
 
 
 
 
 
 
256
  }
257
  },
258
+ "F": {
259
+ "word_class": {
260
+ "F": "Foreign word"
261
+ }
262
+ },
263
+ "X": {
264
+ "word_class": {
265
+ "X": "Unanalysed word"
266
+ }
267
+ },
268
+ "T": {
269
+ "word_class": {
270
+ "T": "abbreviation"
271
  },
272
+ "subcategory": {
273
+ "S": "abbreviation",
274
+ "T": "short form"
 
275
  }
276
  },
277
+ "W": {
278
+ "word_class": {
279
+ "W": "e-mail, web address"
 
 
 
 
280
  }
281
  },
282
+ "K": {
283
+ "word_class": {
284
+ "K": "punctuation"
 
285
  },
286
+ "subcategory": {
287
+ "E": "End of sentence",
288
+ "C": "comma",
289
+ "Q": "quotes",
290
+ "O": "other"
291
  }
292
  },
293
+ "M": {
294
+ "word_class": {
295
+ "M": "Symbol"
 
296
  }
297
  }
298
+ }
299
+ },
300
+ "fo": {
301
  "global": {
302
  "word_class": {
303
+ "S": "navnorð",
304
+ "R": "kenniorð",
305
+ "A": "lýsingarorð",
306
+ "P": "fornavn",
307
+ "N": "talorð",
308
+ "V": "sagnorð (ikki lýsingarháttur)",
309
+ "L": "lýsingarháttur",
310
+ "D": "hjáorð",
311
+ "C": "sambindingarorð",
312
+ "F": "útlendskt orð",
313
+ "X": "ómarkað orð",
314
+ "T": "stytting",
315
+ "W": "teldupostur, heimasíða",
316
+ "K": "teknseting",
317
+ "M": "symbol"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  },
319
+ "gender": {},
320
+ "number": {},
321
+ "case": {},
322
+ "article": {},
323
+ "proper": {},
324
+ "degree": {},
325
+ "declension": {},
326
+ "subcategory": {},
327
+ "person": {},
328
+ "mood": {},
329
+ "voice": {},
330
+ "tense": {},
331
  "definiteness": {
332
+ "D": "bundið",
333
+ "I": "óbundið"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  }
335
+ },
336
+ "by_word_class": {
337
+ "S": {
338
+ "word_class": {
339
+ "S": "navnorð"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  },
341
+ "gender": {
342
  "M": "kallkyn",
343
  "F": "kvennkyn",
344
  "N": "hvørkikyn"
 
 
 
 
 
 
345
  },
346
+ "number": {
347
  "S": "eintal",
348
  "P": "fleirtal"
349
  },
350
+ "case": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  "N": "hvørfall",
352
  "A": "hvønnfall",
353
  "D": "hvørjumfall",
354
  "G": "hvørsfall"
355
  },
356
+ "article": {
357
+ "A": "bundið"
 
 
 
358
  },
359
+ "proper": {
360
+ "P": "sernavn"
361
+ }
362
+ },
363
+ "R": {
364
+ "word_class": {
365
+ "R": "kenniorð"
366
  },
367
+ "gender": {
368
+ "M": "kallkyn",
369
+ "F": "kvennkyn",
370
+ "N": "hvørkikyn"
 
371
  },
372
+ "number": {
373
+ "S": "eintal",
374
+ "P": "fleirtal"
 
 
375
  },
376
+ "case": {
377
  "N": "hvørfall",
378
  "A": "hvønnfall",
379
  "D": "hvørjumfall",
380
  "G": "hvørsfall"
381
+ },
382
+ "article": {
 
 
 
 
 
 
 
 
 
 
 
 
383
  "I": "óbundið",
384
  "D": "bundið"
385
  }
386
  },
387
+ "A": {
388
+ "word_class": {
389
+ "A": "lýsingarorð"
390
+ },
391
+ "degree": {
392
  "P": "grundstig",
393
  "C": "miðstig",
394
  "S": "hástig",
395
  "A": "absolutt hástig"
396
  },
397
+ "declension": {
 
 
 
 
 
 
 
398
  "S": "sterk",
399
  "W": "veik",
400
  "e": "eingin-bending"
401
  },
402
+ "gender": {
403
+ "M": "kallkyn",
404
+ "F": "kvennkyn",
405
+ "N": "hvørkikyn"
406
+ },
407
+ "number": {
408
+ "S": "eintal",
409
+ "P": "fleirtal"
410
+ },
411
+ "case": {
412
+ "N": "hvørfall",
413
+ "A": "hvønnfall",
414
+ "D": "hvørjumfall",
415
+ "G": "hvørsfall"
416
  }
417
  },
418
+ "P": {
419
+ "word_class": {
420
+ "P": "fornavn"
421
+ },
422
+ "subcategory": {
423
  "D": "ávísingarfornavn",
424
  "E": "ognarfornavn",
425
  "I": "óbundið fornavn"
426
  },
427
+ "gender": {
428
+ "M": "kallkyn",
429
+ "F": "kvennkyn",
430
+ "N": "hvørkikyn"
 
 
 
 
431
  },
432
+ "person": {
433
+ "1": "fyrsti persónur",
434
+ "2": "annar persónur",
435
+ "3": "triði persónur"
 
436
  },
437
+ "number": {
438
+ "S": "eintal",
439
+ "P": "fleirtal"
440
  },
441
+ "case": {
442
+ "N": "hvørfall",
443
+ "A": "hvønnfall",
444
+ "D": "hvørjumfall",
445
+ "G": "hvørsfall"
446
  }
447
  },
448
+ "N": {
449
+ "word_class": {
450
+ "N": "talorð"
 
 
451
  },
452
+ "subcategory": {
453
+ "C": "grundtal",
454
+ "O": "dagfesting og onnur óbendandi tøl"
455
+ },
456
+ "gender": {
457
+ "M": "kallkyn",
458
+ "F": "kvennkyn",
459
+ "N": "hvørkikyn"
460
+ },
461
+ "number": {
462
+ "S": "eintal",
463
+ "P": "fleirtal"
464
+ },
465
+ "case": {
466
+ "N": "hvørfall",
467
+ "A": "hvønnfall",
468
+ "D": "hvørjumfall",
469
+ "G": "hvørsfall"
470
  }
471
  },
472
+ "V": {
473
+ "word_class": {
474
+ "V": "sagnorð (ikki lýsingarháttur)"
475
+ },
476
+ "mood": {
477
  "I": "navnháttur",
478
  "M": "boðsháttur",
479
  "N": "søguháttur",
480
  "S": "hugsháttur",
481
  "U": "luttøkuháttur"
 
 
 
 
 
 
482
  },
483
+ "voice": {
484
  "A": "gerðsøgn",
485
  "M": "miðalsøgn"
486
+ },
487
+ "tense": {
 
 
488
  "P": "nútíð",
489
  "A": "tátíð"
490
  },
491
+ "number": {
492
+ "S": "eintal",
493
+ "P": "fleirtal"
494
+ },
495
+ "person": {
496
+ "1": "fyrsti persónur",
497
+ "2": "annar persónur",
498
+ "3": "triði persónur"
499
+ }
500
+ },
501
+ "L": {
502
+ "word_class": {
503
+ "L": "lýsingarháttur"
504
+ },
505
+ "tense": {
506
  "P": "nútíð",
507
  "A": "tátíð"
508
+ },
509
+ "voice": {
510
+ "A": "gerðsøgn",
511
+ "M": "miðalsøgn"
512
+ },
513
+ "declension": {
514
+ "S": "sterk",
515
+ "W": "veik",
516
+ "e": "eingin-bending"
517
+ },
518
+ "gender": {
519
+ "M": "kallkyn",
520
+ "F": "kvennkyn",
521
+ "N": "hvørkikyn"
522
+ },
523
+ "number": {
524
+ "S": "eintal",
525
+ "P": "fleirtal"
526
+ },
527
+ "case": {
528
+ "N": "hvørfall",
529
+ "A": "hvønnfall",
530
+ "D": "hvørjumfall",
531
+ "G": "hvørsfall"
532
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  },
534
+ "D": {
535
+ "word_class": {
536
+ "D": "hjáorð"
537
+ },
538
+ "subcategory": {
539
+ "N": "stýrir ikki falli",
540
+ "G": "stýrir falli",
541
+ "I": "miðalvarping"
542
+ },
543
+ "degree": {
544
+ "C": "miðstig",
545
+ "S": "hástig",
546
+ "A": "absolutt hástig"
547
+ }
548
  },
549
+ "C": {
550
+ "word_class": {
551
+ "C": "sambindingarorð"
552
+ },
553
+ "subcategory": {
554
+ "C": "javnskipandi",
555
+ "S": "innskipandi",
556
+ "I": "navnháttarmerki (bara \"at\")",
557
+ "R": "afturbeint fornavn"
558
+ }
559
  },
560
+ "F": {
561
+ "word_class": {
562
+ "F": "útlendskt orð"
563
+ }
564
  },
565
+ "X": {
566
+ "word_class": {
567
+ "X": "ómarkað orð"
568
+ }
 
 
 
 
 
 
 
 
569
  },
570
+ "T": {
571
+ "word_class": {
572
+ "T": "stytting"
573
+ },
574
+ "subcategory": {
575
+ "S": "stytting",
576
+ "T": "stytting við punktum"
577
+ }
578
  },
579
+ "W": {
580
+ "word_class": {
581
+ "W": "teldupostur, heimasíða"
582
+ }
 
 
583
  },
584
+ "K": {
585
+ "word_class": {
586
+ "K": "teknseting"
587
+ },
588
+ "subcategory": {
589
+ "E": "endi av setningi",
590
+ "C": "komma",
591
+ "Q": "gásareyga",
592
+ "O": "annað"
593
+ }
594
  },
595
+ "M": {
596
+ "word_class": {
597
+ "M": "symbol"
598
+ }
599
  }
600
  }
601
+ },
602
+ "meta": {
603
+ "source_en_xlsx": "Sosialurin-GOLD tagset.xlsx",
604
+ "source_fo_xlsx": "Sosialurin-GOLD markingaryvirlit.xlsx",
605
+ "notes": "Extracted from XLSX; FO mood U added manually: luttøkuháttur."
606
  }
607
  }