unijoh committed on
Commit
f12d533
·
verified ·
1 Parent(s): 958c273

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -50
app.py CHANGED
@@ -7,6 +7,9 @@ import numpy as np
7
  import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
 
 
 
10
  MODEL_ID = "Setur/BRAGD"
11
  TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must match model labels
12
  LABELS_FILEPATH = "tag_labels.json" # add to repo root (FO+EN labels)
@@ -28,18 +31,12 @@ GROUP_ORDER = ["subcategory","gender","number","case","article","proper","degree
28
  # You said Subcategory B doesn't exist and will be deleted from the CSV:
29
  HIDE_CODES = {"subcategory": {"B"}}
30
 
31
- GROUP_TITLES = {
32
- "en": {"subcategory":"Subcategory","gender":"Gender","number":"Number","case":"Case","article":"Article suffix","proper":"Proper noun",
33
- "degree":"Degree","declension":"Declension","mood":"Mood","voice":"Voice","tense":"Tense","person":"Person","definiteness":"Definiteness"},
34
- "fo": {"subcategory":"Undirflokkur","gender":"Kyn","number":"Tal","case":"Fall","article":"Bundið eftirlið","proper":"Sernavn",
35
- "degree":"Stig","declension":"Bending","mood":"Háttur","voice":"Søgn","tense":"Tíð","person":"Persónur","definiteness":"Bundni/óbundni"},
36
- }
37
-
38
  UI = {
39
- "fo": {"w":"Orð","t":"Mark","s":"Vís sum","m":"Merking","def":"bundið","ind":"óbundið"},
40
- "en": {"w":"Word","t":"Tag","s":"Show as","m":"Meaning","def":"definite","ind":"indefinite"},
41
  }
42
 
 
43
  CSS = """
44
  :root{
45
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
@@ -92,7 +89,9 @@ def process_tag_features(tag_to_features: dict, intervals):
92
  wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
93
  out = {}
94
  for wt,labels in wt_masks.items():
95
- if not labels: out[wt]=[]; continue
 
 
96
  sum_labels = np.sum(np.array(labels), axis=0)
97
  out[wt] = [iv for iv in intervals if np.sum(sum_labels[iv[0]:iv[1]+1]) != 0]
98
  return out
@@ -118,7 +117,9 @@ def predict_vectors(logits, attention_mask, begin_tokens, dict_intervals, vec_le
118
  vectors.append(vec)
119
  return vectors
120
 
 
121
  # Load labels (extracted from your XLSX)
 
122
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
123
  LABELS = json.load(f)
124
 
@@ -130,10 +131,14 @@ def label_for(lang: str, group: str, wc: str, code: str) -> str:
130
  return by_wc[wc][group][code]
131
  return glob.get(group, {}).get(code, "")
132
 
 
133
  # Load CSV mappings (authoritative)
 
134
  tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
135
 
 
136
  # Load model
 
137
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
138
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
139
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -170,38 +175,65 @@ def group_code(vec: torch.Tensor, group: str) -> str:
170
  return code
171
  return ""
172
 
173
- def describe(vec: torch.Tensor, lang: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  wc = wc_code(vec)
175
- parts = []
176
- if wc:
177
- lbl = label_for(lang, "word_class", wc, wc)
178
- parts.append(f"{wc} {lbl}" if lbl else wc)
 
 
 
 
 
179
  for g in GROUP_ORDER:
180
  c = group_code(vec, g)
181
  if not c:
182
  continue
183
- lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
184
- parts.append(f"{c} – {lbl}" if lbl else c)
185
- return "; ".join(parts)
186
 
187
- def show_as(vec: torch.Tensor, lang: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  lang = "fo" if lang=="fo" else "en"
189
  wc = wc_code(vec)
190
- wc_lbl = label_for(lang, "word_class", wc, wc) or wc
191
- raw = vector_to_tag(vec)
192
 
193
- # Exact override you requested:
194
- if raw == "DGd":
195
- return "Fyriseting" if lang=="fo" else "Preposition"
196
 
197
- # S...a. / S...A. mapping (nouns): show definite/indefinite by Article (A/a)
198
- if wc == "S":
199
- art = group_code(vec, "article") # A or a
200
- if art == "A": return f"{wc_lbl} — {UI[lang]['def']}"
201
- if art == "a": return f"{wc_lbl} {UI[lang]['ind']}"
202
- return wc_lbl
203
 
204
- return wc_lbl
205
 
206
  def compute_codes_by_wc():
207
  codes = defaultdict(lambda: defaultdict(set)) # wc -> group -> set(code)
@@ -229,8 +261,15 @@ def compute_codes_by_wc():
229
  CODES_BY_WC = compute_codes_by_wc()
230
 
231
  def build_legend(lang: str) -> str:
 
 
 
 
 
232
  lang = "fo" if lang=="fo" else "en"
233
- lines = ["### Markingaryvirlit / Legend", ""]
 
 
234
  for wc in sorted(CODES_BY_WC.keys()):
235
  wcl = label_for(lang, "word_class", wc, wc) or ""
236
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
@@ -239,12 +278,48 @@ def build_legend(lang: str) -> str:
239
  cs = sorted(CODES_BY_WC[wc].get(g, set()))
240
  if not cs:
241
  continue
242
- lines.append(f"**{GROUP_TITLES[lang].get(g, g)}**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  for c in cs:
244
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
245
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
246
  lines.append("")
 
247
  lines.append("")
 
248
  return "\n".join(lines).strip()
249
 
250
  def run_model(sentence: str):
@@ -255,8 +330,16 @@ def run_model(sentence: str):
255
  if not tokens:
256
  return []
257
 
258
- enc = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=128,
259
- padding="max_length", truncation=True, return_attention_mask=True, return_tensors="pt")
 
 
 
 
 
 
 
 
260
 
261
  input_ids = enc["input_ids"].to(device)
262
  attention_mask = enc["attention_mask"].to(device)
@@ -265,9 +348,12 @@ def run_model(sentence: str):
265
  begin = []
266
  last = None
267
  for wid in word_ids:
268
- if wid is None: begin.append(0)
269
- elif wid != last: begin.append(1)
270
- else: begin.append(0)
 
 
 
271
  last = wid
272
 
273
  with torch.no_grad():
@@ -288,18 +374,27 @@ def run_model(sentence: str):
288
  vec_i += 1
289
  return rows
290
 
291
- def render(rows_state, lang: str):
292
  lang = "fo" if lang=="fo" else "en"
293
- cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"], UI[lang]["m"]]
294
  if not rows_state:
295
- return pd.DataFrame(columns=cols), build_legend(lang)
296
 
297
- out = []
 
298
  for r in rows_state:
299
  vec = torch.tensor(r["vec"])
300
- out.append([r["word"], vector_to_tag(vec), show_as(vec, lang), describe(vec, lang)])
301
- return pd.DataFrame(out, columns=cols), build_legend(lang)
 
302
 
 
 
 
 
 
 
 
303
  theme = gr.themes.Soft()
304
 
305
  with gr.Blocks(theme=theme, css=CSS, title="BRAGD-markarin") as demo:
@@ -309,25 +404,29 @@ with gr.Blocks(theme=theme, css=CSS, title="BRAGD-markarin") as demo:
309
  btn = gr.Button("Marka / Tag", variant="primary")
310
 
311
  state = gr.State([])
 
312
  out_df = gr.Dataframe(wrap=True, interactive=False, label="Úrslit / Results")
313
 
314
  # Under results + can be changed AFTER tagging (no rerun; just re-render)
315
  lang = gr.Dropdown(choices=[("Føroyskt","fo"), ("English","en")], value="fo", label="Mál / Language")
316
 
 
 
 
317
  with gr.Accordion("Markingaryvirlit / Legend", open=False):
318
  legend_md = gr.Markdown(build_legend("fo"))
319
 
320
  def on_tag(sentence, lang_choice):
321
  rows = run_model(sentence)
322
- df, legend = render(rows, lang_choice)
323
- return rows, df, legend
324
 
325
  def on_lang(rows, lang_choice):
326
- df, legend = render(rows, lang_choice)
327
- return df, legend
328
 
329
- btn.click(on_tag, inputs=[inp, lang], outputs=[state, out_df, legend_md])
330
- lang.change(on_lang, inputs=[state, lang], outputs=[out_df, legend_md])
331
 
332
  if __name__ == "__main__":
333
  demo.launch()
 
7
  import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
10
+ # ----------------------------
11
+ # Config
12
+ # ----------------------------
13
  MODEL_ID = "Setur/BRAGD"
14
  TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv" # must match model labels
15
  LABELS_FILEPATH = "tag_labels.json" # add to repo root (FO+EN labels)
 
31
  # You said Subcategory B doesn't exist and will be deleted from the CSV:
32
  HIDE_CODES = {"subcategory": {"B"}}
33
 
 
 
 
 
 
 
 
34
  UI = {
35
+ "fo": {"w":"Orð", "t":"Mark", "s":"Vís sum", "m":"Merking"},
36
+ "en": {"w":"Word","t":"Tag", "s":"Show as", "m":"Meaning"},
37
  }
38
 
39
+ # Theme color: #89AFA9 (+ close shades)
40
  CSS = """
41
  :root{
42
  --primary-500:#89AFA9; --primary-600:#6F9992; --primary-700:#5B7F79;
 
89
  wt_masks = {wt:[a for a in arrs if a[wt]==1] for wt in range(15)}
90
  out = {}
91
  for wt,labels in wt_masks.items():
92
+ if not labels:
93
+ out[wt]=[]
94
+ continue
95
  sum_labels = np.sum(np.array(labels), axis=0)
96
  out[wt] = [iv for iv in intervals if np.sum(sum_labels[iv[0]:iv[1]+1]) != 0]
97
  return out
 
117
  vectors.append(vec)
118
  return vectors
119
 
120
+ # ----------------------------
121
  # Load labels (extracted from your XLSX)
122
+ # ----------------------------
123
  with open(LABELS_FILEPATH, "r", encoding="utf-8") as f:
124
  LABELS = json.load(f)
125
 
 
131
  return by_wc[wc][group][code]
132
  return glob.get(group, {}).get(code, "")
133
 
134
+ # ----------------------------
135
  # Load CSV mappings (authoritative)
136
+ # ----------------------------
137
  tag_to_features, features_to_tag, VEC_LEN, FEATURE_COLS = load_tag_mappings(TAGS_FILEPATH)
138
 
139
+ # ----------------------------
140
  # Load model
141
+ # ----------------------------
142
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
143
  model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
144
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
175
  return code
176
  return ""
177
 
178
def clean_label(s: str) -> str:
    """Normalize a label string for display.

    A falsy value (``None`` or ``""``) becomes ``""``. Runs of whitespace are
    collapsed to a single space, and leading/trailing spaces and separator
    punctuation are removed.

    Besides the ASCII hyphen, the en dash and em dash are stripped as well:
    those are the separators this file uses when joining codes and labels,
    so a label that carries one at its edge would otherwise leak it into the
    letter-free summary.
    """
    # split()/join collapses every whitespace run and trims both ends in one
    # pass, without depending on the `re` module.
    collapsed = " ".join((s or "").split())
    return collapsed.strip(" -–—;,:")
183
+
184
def visible_summary(vec: torch.Tensor, lang: str) -> str:
    """Build the "Show as" text for one tagged word.

    The result is a comma-separated list of human-readable labels only (no
    letter codes): the word-class label first, then the label of every
    feature group in ``GROUP_ORDER`` for which ``group_code`` yields a code.
    Empty labels are skipped and a label is listed only once.

    Any ``lang`` other than ``"fo"`` is treated as ``"en"``.
    """
    lang = "fo" if lang == "fo" else "en"
    raw_tag = vector_to_tag(vec)
    wc = wc_code(vec)

    # Fall back to the bare word-class code when no label is available.
    head = label_for(lang, "word_class", wc, wc) or wc

    # Hard-coded override: the exact tag "DGd" is always shown as a preposition.
    if raw_tag == "DGd":
        head = "Fyriseting" if lang == "fo" else "Preposition"

    seen = [clean_label(head)]

    for group in GROUP_ORDER:
        code = group_code(vec, group)
        if code:
            # Try the word-class-specific lookup first, then the lookup with
            # an empty word class.
            text = clean_label(
                label_for(lang, group, wc, code) or label_for(lang, group, "", code) or ""
            )
            if text and text not in seen:
                seen.append(text)

    return ", ".join(filter(None, seen))
216
+
217
def meaning_detail(vec: torch.Tensor, lang: str) -> str:
    """Build the "Meaning" text for one tagged word.

    Unlike the label-only summary, every entry keeps its code: the result is
    a "; "-separated list of "code – label" pairs (or just the code when no
    label is found), starting with the word class and followed by each
    feature group in ``GROUP_ORDER`` that has a code.

    Any ``lang`` other than ``"fo"`` is treated as ``"en"``.
    """

    def pair(code, label):
        # Show the bare code when there is no label to attach.
        return f"{code} – {label}" if label else code

    lang = "fo" if lang == "fo" else "en"
    wc = wc_code(vec)

    entries = [pair(wc, label_for(lang, "word_class", wc, wc))]

    for group in GROUP_ORDER:
        code = group_code(vec, group)
        if code:
            # Word-class-specific lookup first, then the lookup with an
            # empty word class.
            text = label_for(lang, group, wc, code) or label_for(lang, group, "", code)
            entries.append(pair(code, text))

    return "; ".join(entry for entry in entries if entry)
237
 
238
  def compute_codes_by_wc():
239
  codes = defaultdict(lambda: defaultdict(set)) # wc -> group -> set(code)
 
261
  CODES_BY_WC = compute_codes_by_wc()
262
 
263
  def build_legend(lang: str) -> str:
264
+ """
265
+ Elaborate overview:
266
+ Under each word class, show the letter codes actually used in the CURRENT CSV,
267
+ with labels from tag_labels.json (fallback to code if missing).
268
+ """
269
  lang = "fo" if lang=="fo" else "en"
270
+ title = "### Markingaryvirlit" if lang=="fo" else "### Tag legend"
271
+ lines = [title, ""]
272
+
273
  for wc in sorted(CODES_BY_WC.keys()):
274
  wcl = label_for(lang, "word_class", wc, wc) or ""
275
  lines.append(f"#### {wc} — {wcl}" if wcl else f"#### {wc}")
 
278
  cs = sorted(CODES_BY_WC[wc].get(g, set()))
279
  if not cs:
280
  continue
281
+
282
+ if lang=="fo":
283
+ group_name = {
284
+ "subcategory":"Undirflokkur",
285
+ "gender":"Kyn",
286
+ "number":"Tal",
287
+ "case":"Fall",
288
+ "article":"Bundni/óbundni",
289
+ "proper":"Sernavn",
290
+ "degree":"Stig",
291
+ "declension":"Bending",
292
+ "mood":"Háttur",
293
+ "voice":"Søgn",
294
+ "tense":"Tíð",
295
+ "person":"Persónur",
296
+ "definiteness":"Bundni/óbundni",
297
+ }.get(g, g)
298
+ else:
299
+ group_name = {
300
+ "subcategory":"Subcategory",
301
+ "gender":"Gender",
302
+ "number":"Number",
303
+ "case":"Case",
304
+ "article":"Definite suffix",
305
+ "proper":"Proper noun",
306
+ "degree":"Degree",
307
+ "declension":"Declension",
308
+ "mood":"Mood",
309
+ "voice":"Voice",
310
+ "tense":"Tense",
311
+ "person":"Person",
312
+ "definiteness":"Definiteness",
313
+ }.get(g, g)
314
+
315
+ lines.append(f"**{group_name}**")
316
  for c in cs:
317
  lbl = label_for(lang, g, wc, c) or label_for(lang, g, "", c)
318
  lines.append(f"- `{c}` — {lbl}" if lbl else f"- `{c}`")
319
  lines.append("")
320
+
321
  lines.append("")
322
+
323
  return "\n".join(lines).strip()
324
 
325
  def run_model(sentence: str):
 
330
  if not tokens:
331
  return []
332
 
333
+ enc = tokenizer(
334
+ tokens,
335
+ is_split_into_words=True,
336
+ add_special_tokens=True,
337
+ max_length=128,
338
+ padding="max_length",
339
+ truncation=True,
340
+ return_attention_mask=True,
341
+ return_tensors="pt",
342
+ )
343
 
344
  input_ids = enc["input_ids"].to(device)
345
  attention_mask = enc["attention_mask"].to(device)
 
348
  begin = []
349
  last = None
350
  for wid in word_ids:
351
+ if wid is None:
352
+ begin.append(0)
353
+ elif wid != last:
354
+ begin.append(1)
355
+ else:
356
+ begin.append(0)
357
  last = wid
358
 
359
  with torch.no_grad():
 
374
  vec_i += 1
375
  return rows
376
 
377
def render_main(rows_state, lang: str):
    """Render stored tagging rows into the UI outputs for one language.

    Returns a 3-tuple ``(main_df, legend, meaning_df)``:
    ``main_df`` has the word / tag / "show as" columns, ``legend`` is the
    result of ``build_legend(lang)``, and ``meaning_df`` has the word / tag /
    "meaning" columns. With no rows, both frames are empty but still carry
    the localized column headers.

    Each row is read through its ``"word"`` and ``"vec"`` keys. Any ``lang``
    other than ``"fo"`` is treated as ``"en"``.
    """
    lang = "fo" if lang == "fo" else "en"
    ui = UI[lang]
    cols = [ui["w"], ui["t"], ui["s"]]

    if not rows_state:
        return (
            pd.DataFrame(columns=cols),
            build_legend(lang),
            pd.DataFrame(columns=[ui["w"], ui["t"], ui["m"]]),
        )

    main_rows = []
    mean_rows = []
    for row in rows_state:
        vec = torch.tensor(row["vec"])
        # The tag string is shared by both tables, so compute it once per row.
        tag = vector_to_tag(vec)
        main_rows.append([row["word"], tag, visible_summary(vec, lang)])
        mean_rows.append([row["word"], tag, meaning_detail(vec, lang)])

    df_main = pd.DataFrame(main_rows, columns=cols)
    df_mean = pd.DataFrame(mean_rows, columns=[ui["w"], ui["t"], ui["m"]])
    return df_main, build_legend(lang), df_mean
394
+
395
+ # ----------------------------
396
+ # Gradio UI
397
+ # ----------------------------
398
  theme = gr.themes.Soft()
399
 
400
  with gr.Blocks(theme=theme, css=CSS, title="BRAGD-markarin") as demo:
 
404
  btn = gr.Button("Marka / Tag", variant="primary")
405
 
406
  state = gr.State([])
407
+
408
  out_df = gr.Dataframe(wrap=True, interactive=False, label="Úrslit / Results")
409
 
410
  # Under results + can be changed AFTER tagging (no rerun; just re-render)
411
  lang = gr.Dropdown(choices=[("Føroyskt","fo"), ("English","en")], value="fo", label="Mál / Language")
412
 
413
+ with gr.Accordion("Merking / Meaning", open=False):
414
+ out_mean_df = gr.Dataframe(wrap=True, interactive=False, label="")
415
+
416
  with gr.Accordion("Markingaryvirlit / Legend", open=False):
417
  legend_md = gr.Markdown(build_legend("fo"))
418
 
419
  def on_tag(sentence, lang_choice):
420
  rows = run_model(sentence)
421
+ df_main, legend, df_mean = render_main(rows, lang_choice)
422
+ return rows, df_main, legend, df_mean
423
 
424
  def on_lang(rows, lang_choice):
425
+ df_main, legend, df_mean = render_main(rows, lang_choice)
426
+ return df_main, legend, df_mean
427
 
428
+ btn.click(on_tag, inputs=[inp, lang], outputs=[state, out_df, legend_md, out_mean_df])
429
+ lang.change(on_lang, inputs=[state, lang], outputs=[out_df, legend_md, out_mean_df])
430
 
431
  if __name__ == "__main__":
432
  demo.launch()