unijoh committed on
Commit
2d19454
·
verified ·
1 Parent(s): 2ad5b1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -20
app.py CHANGED
@@ -7,6 +7,16 @@ import numpy as np
7
  import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
 
 
 
 
 
 
 
 
 
 
10
  # ----------------------------
11
  # Config
12
  # ----------------------------
@@ -15,6 +25,8 @@ TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
15
  LABELS_FILEPATH = "tag_labels.json"
16
  HF_TOKEN = os.getenv("BRAGD")
17
 
 
 
18
  if not HF_TOKEN:
19
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
20
  if not os.path.exists(LABELS_FILEPATH):
@@ -63,7 +75,7 @@ CSS = """
63
  color:#0b1b19 !important;
64
  }
65
 
66
- /* Dark mode: make the INACTIVE buttons match what you had before (darker, readable) */
67
  @media (prefers-color-scheme: dark){
68
  #lang_fo_off, #lang_en_off{
69
  background:#2a3b38 !important;
@@ -77,7 +89,7 @@ CSS = """
77
  }
78
  }
79
 
80
- /* Minimal layout so the language buttons stay hard-right like before */
81
  #results_hdr{
82
  display:flex !important;
83
  align-items:center !important;
@@ -96,7 +108,8 @@ CSS = """
96
  min-width:120px !important;
97
  flex:0 0 auto !important;
98
  }
99
- /* Remove the big Gradio panel/frame around the textbox (keep textarea normal) */
 
100
  #input_col,
101
  #input_col > div,
102
  #input_col .gr-block,
@@ -111,11 +124,53 @@ CSS = """
111
  """
112
 
113
  # ----------------------------
114
- # Tokenization
115
  # ----------------------------
116
  def simp_tok(sentence: str):
 
117
  return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  # ----------------------------
120
  # CSV mapping
121
  # ----------------------------
@@ -249,10 +304,9 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
249
  tag = vector_to_tag(vec)
250
  wc = wc_code(vec)
251
 
252
- # --- ADDED: compute mood_code and skip flag for infinitive/imperative verbs ---
253
  mood_code = group_code(vec, "mood") if wc == "V" else ""
254
  skip_empty_verb_feats = (wc == "V" and mood_code in {"I", "M"}) # navnháttur or boðsháttur
255
- # --- end added ---
256
 
257
  if tag == "DGd":
258
  return "fyriseting" if lang=="fo" else "preposition"
@@ -280,15 +334,14 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
280
  if not c:
281
  continue
282
 
283
- # --- ADDED: skip only the generic "no" codes for verbs in infinitive/imperative ---
284
  if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
285
  continue
286
- # --- end added ---
287
 
288
  if wc in {"P","C"} and g == "subcategory":
289
  continue
290
  if (wc, g, c) in HIDE_IN_ANALYSIS:
291
  continue
 
292
  lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
293
  if lbl and lbl not in parts:
294
  parts.append(lbl)
@@ -358,15 +411,29 @@ def build_overview(lang: str) -> str:
358
  lines.append("")
359
  return "\n".join(lines).strip()
360
 
 
 
 
361
  def run_model(sentence: str):
362
  s = (sentence or "").strip()
363
  if not s:
364
  return []
 
365
  tokens = simp_tok(s)
366
  if not tokens:
367
  return []
368
- enc = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=128,
369
- padding="max_length", truncation=True, return_attention_mask=True, return_tensors="pt")
 
 
 
 
 
 
 
 
 
 
370
  input_ids = enc["input_ids"].to(device)
371
  attention_mask = enc["attention_mask"].to(device)
372
  word_ids = enc.word_ids(batch_index=0)
@@ -388,15 +455,25 @@ def run_model(sentence: str):
388
 
389
  rows, vec_i, seen = [], 0, set()
390
  for i,wid in enumerate(word_ids):
391
- if wid is None or begin[i]!=1 or wid in seen:
392
  continue
393
  seen.add(wid)
394
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
395
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
396
  rows.append({"word": word, "vec": vec.int().tolist()})
397
  vec_i += 1
 
398
  return rows
399
 
 
 
 
 
 
 
 
 
 
400
  def render(rows_state, lang: str):
401
  lang = "fo" if lang=="fo" else "en"
402
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
@@ -411,6 +488,9 @@ def render(rows_state, lang: str):
411
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
412
  return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
413
 
 
 
 
414
  with gr.Blocks(css=CSS, title="Marka") as demo:
415
  with gr.Row(equal_height=True):
416
  with gr.Column(scale=2, elem_id="input_col"):
@@ -429,6 +509,7 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
429
  results_hdr = gr.Row(elem_id="results_hdr", visible=True)
430
  with results_hdr:
431
  results_title = gr.Markdown("### Úrslit / Results")
 
432
  with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
433
  btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=False)
434
  btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
@@ -454,8 +535,8 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
454
  with overview_acc:
455
  overview_md = gr.Markdown(build_overview("fo"))
456
 
457
- def on_tag(sentence, lang_current):
458
- rows = run_model(sentence)
459
  df_main, df_mean, overview = render(rows, lang_current)
460
 
461
  show_fo = (lang_current == "fo")
@@ -466,11 +547,11 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
466
  gr.update(value=df_main, visible=True),
467
  gr.update(value=df_mean),
468
  gr.update(value=overview),
469
- gr.update(visible=True), # expanded_acc
470
- gr.update(visible=show_fo),
471
- gr.update(visible=not show_fo),
472
- gr.update(visible=show_en),
473
- gr.update(visible=not show_en),
474
  lang_current,
475
  )
476
 
@@ -500,11 +581,14 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
500
  btn.click(
501
  on_tag,
502
  inputs=[inp, lang_state],
503
- outputs=[state, out_df, out_mean_df, overview_md, expanded_acc,
504
- btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state],
 
 
505
  queue=False,
506
  )
507
 
 
508
  btn_lang_fo_on.click(
509
  on_set_fo,
510
  inputs=[state],
 
7
  import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
10
+ # --- FO-Tokenizer (sentence splitting) ---
11
+ try:
12
+ import fotokenizer
13
+ from fotokenizer import tokenize, TOK
14
+ except Exception as e:
15
+ raise RuntimeError(
16
+ "fotokenizer is not installed. Add it to requirements.txt (see below). "
17
+ f"Original error: {e}"
18
+ )
19
+
20
  # ----------------------------
21
  # Config
22
  # ----------------------------
 
25
  LABELS_FILEPATH = "tag_labels.json"
26
  HF_TOKEN = os.getenv("BRAGD")
27
 
28
+ MAX_LENGTH = 256 # <-- changed from 128 to 256
29
+
30
  if not HF_TOKEN:
31
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
32
  if not os.path.exists(LABELS_FILEPATH):
 
75
  color:#0b1b19 !important;
76
  }
77
 
78
+ /* Dark mode: make the INACTIVE buttons darker but readable */
79
  @media (prefers-color-scheme: dark){
80
  #lang_fo_off, #lang_en_off{
81
  background:#2a3b38 !important;
 
89
  }
90
  }
91
 
92
+ /* Minimal layout so the language buttons stay hard-right */
93
  #results_hdr{
94
  display:flex !important;
95
  align-items:center !important;
 
108
  min-width:120px !important;
109
  flex:0 0 auto !important;
110
  }
111
+
112
+ /* Remove the big Gradio panel/frame around the textbox column (keep textarea normal) */
113
  #input_col,
114
  #input_col > div,
115
  #input_col .gr-block,
 
124
  """
125
 
126
  # ----------------------------
127
+ # Tokenization helpers
128
  # ----------------------------
129
  def simp_tok(sentence: str):
130
+ # simple word/punct split; whitespace ignored
131
  return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
132
 
133
def normalize_token_text(s: str) -> str:
    """Return *s* with every run of CR/LF characters collapsed to one space.

    A falsy argument (``None`` or the empty string) yields ``""``.
    """
    if not s:
        return ""
    return re.sub(r"[\r\n]+", " ", s)
136
+
137
def split_sentences_fotokenizer(text: str):
    """Split *text* into sentence strings using fotokenizer sentence markers.

    Tokens with a non-empty ``txt`` are buffered after newline
    normalisation. Tokens without text are treated as markers and identified
    by their ``TOK.descr`` description: ``BEGIN_SENT`` resets the buffer and
    ``END_SENT`` emits the buffered text as one sentence. Any text left in
    the buffer after the last token is emitted as a final sentence. If
    nothing was emitted but *text* is not blank, the stripped *text* is
    returned as a single sentence.

    Returns a list of non-empty, stripped sentence strings (possibly empty).
    """
    source = text or ""
    collected = []
    pieces = []

    def flush():
        # Emit the buffered pieces as one sentence (if non-blank) and reset.
        # NOTE(review): pieces are joined with no separator, so this relies on
        # each token's txt carrying its own spacing -- confirm against the
        # fotokenizer token format.
        joined = "".join(pieces).strip()
        if joined:
            collected.append(joined)
        pieces.clear()

    for token in tokenize(source):
        token_text = getattr(token, "txt", "")
        if token_text:
            pieces.append(normalize_token_text(token_text))
            continue

        # Marker token: look up its description, e.g. "BEGIN SENT".
        marker = TOK.descr[token.kind].replace(" ", "_")
        if marker == "END_SENT":
            flush()
        elif marker == "BEGIN_SENT":
            # A new sentence starts; anything buffered so far is discarded.
            pieces.clear()

    # Flush the tail in case the tokenizer did not finish with END_SENT.
    flush()

    if collected:
        return collected

    # No sentence was produced: fall back to the whole text if it is non-blank.
    stripped = source.strip()
    return [stripped] if stripped else collected
173
+
174
  # ----------------------------
175
  # CSV mapping
176
  # ----------------------------
 
304
  tag = vector_to_tag(vec)
305
  wc = wc_code(vec)
306
 
307
+ # Skip listing "no number/tense/person" for infinitive/imperative verbs
308
  mood_code = group_code(vec, "mood") if wc == "V" else ""
309
  skip_empty_verb_feats = (wc == "V" and mood_code in {"I", "M"}) # navnháttur or boðsháttur
 
310
 
311
  if tag == "DGd":
312
  return "fyriseting" if lang=="fo" else "preposition"
 
334
  if not c:
335
  continue
336
 
 
337
  if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
338
  continue
 
339
 
340
  if wc in {"P","C"} and g == "subcategory":
341
  continue
342
  if (wc, g, c) in HIDE_IN_ANALYSIS:
343
  continue
344
+
345
  lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
346
  if lbl and lbl not in parts:
347
  parts.append(lbl)
 
411
  lines.append("")
412
  return "\n".join(lines).strip()
413
 
414
+ # ----------------------------
415
+ # Model inference (single sentence)
416
+ # ----------------------------
417
  def run_model(sentence: str):
418
  s = (sentence or "").strip()
419
  if not s:
420
  return []
421
+
422
  tokens = simp_tok(s)
423
  if not tokens:
424
  return []
425
+
426
+ enc = tokenizer(
427
+ tokens,
428
+ is_split_into_words=True,
429
+ add_special_tokens=True,
430
+ max_length=MAX_LENGTH,
431
+ padding="max_length",
432
+ truncation=True,
433
+ return_attention_mask=True,
434
+ return_tensors="pt"
435
+ )
436
+
437
  input_ids = enc["input_ids"].to(device)
438
  attention_mask = enc["attention_mask"].to(device)
439
  word_ids = enc.word_ids(batch_index=0)
 
455
 
456
  rows, vec_i, seen = [], 0, set()
457
  for i,wid in enumerate(word_ids):
458
+ if wid is None or begin[i] != 1 or wid in seen:
459
  continue
460
  seen.add(wid)
461
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
462
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
463
  rows.append({"word": word, "vec": vec.int().tolist()})
464
  vec_i += 1
465
+
466
  return rows
467
 
468
+ # ----------------------------
469
+ # Model inference (multi-sentence via fotokenizer)
470
+ # ----------------------------
471
def run_model_multisentence(text: str):
    """Tag *text* sentence by sentence and return all rows in one flat list.

    The text is split with ``split_sentences_fotokenizer`` and each sentence
    is passed to ``run_model``; the per-sentence rows are concatenated in
    sentence order.
    """
    return [
        row
        for sentence in split_sentences_fotokenizer(text)
        for row in run_model(sentence)
    ]
476
+
477
  def render(rows_state, lang: str):
478
  lang = "fo" if lang=="fo" else "en"
479
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
 
488
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
489
  return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
490
 
491
+ # ----------------------------
492
+ # UI
493
+ # ----------------------------
494
  with gr.Blocks(css=CSS, title="Marka") as demo:
495
  with gr.Row(equal_height=True):
496
  with gr.Column(scale=2, elem_id="input_col"):
 
509
  results_hdr = gr.Row(elem_id="results_hdr", visible=True)
510
  with results_hdr:
511
  results_title = gr.Markdown("### Úrslit / Results")
512
+ # IMPORTANT: keep row always present; hide/show buttons only (prevents duplication)
513
  with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
514
  btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=False)
515
  btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
 
535
  with overview_acc:
536
  overview_md = gr.Markdown(build_overview("fo"))
537
 
538
+ def on_tag(text, lang_current):
539
+ rows = run_model_multisentence(text)
540
  df_main, df_mean, overview = render(rows, lang_current)
541
 
542
  show_fo = (lang_current == "fo")
 
547
  gr.update(value=df_main, visible=True),
548
  gr.update(value=df_mean),
549
  gr.update(value=overview),
550
+ gr.update(visible=True), # expanded_acc
551
+ gr.update(visible=show_fo), # fo_on
552
+ gr.update(visible=not show_fo), # fo_off
553
+ gr.update(visible=show_en), # en_on
554
+ gr.update(visible=not show_en), # en_off
555
  lang_current,
556
  )
557
 
 
581
  btn.click(
582
  on_tag,
583
  inputs=[inp, lang_state],
584
+ outputs=[
585
+ state, out_df, out_mean_df, overview_md, expanded_acc,
586
+ btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state
587
+ ],
588
  queue=False,
589
  )
590
 
591
+ # Language switch: re-render existing rows (does NOT rerun the model)
592
  btn_lang_fo_on.click(
593
  on_set_fo,
594
  inputs=[state],