unijoh committed on
Commit
7c4ea3b
·
verified ·
1 Parent(s): 2d19454

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -92
app.py CHANGED
@@ -1,4 +1,6 @@
1
  import os, re, string, json
 
 
2
  from collections import defaultdict
3
 
4
  import gradio as gr
@@ -7,15 +9,44 @@ import numpy as np
7
  import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
10
- # --- FO-Tokenizer (sentence splitting) ---
 
 
 
11
  try:
12
- import fotokenizer
13
- from fotokenizer import tokenize, TOK
14
- except Exception as e:
15
- raise RuntimeError(
16
- "fotokenizer is not installed. Add it to requirements.txt (see below). "
17
- f"Original error: {e}"
18
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # ----------------------------
21
  # Config
@@ -25,7 +56,7 @@ TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
25
  LABELS_FILEPATH = "tag_labels.json"
26
  HF_TOKEN = os.getenv("BRAGD")
27
 
28
- MAX_LENGTH = 256 # <-- changed from 128 to 256
29
 
30
  if not HF_TOKEN:
31
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
@@ -75,7 +106,7 @@ CSS = """
75
  color:#0b1b19 !important;
76
  }
77
 
78
- /* Dark mode: make the INACTIVE buttons darker but readable */
79
  @media (prefers-color-scheme: dark){
80
  #lang_fo_off, #lang_en_off{
81
  background:#2a3b38 !important;
@@ -89,7 +120,7 @@ CSS = """
89
  }
90
  }
91
 
92
- /* Minimal layout so the language buttons stay hard-right */
93
  #results_hdr{
94
  display:flex !important;
95
  align-items:center !important;
@@ -108,8 +139,7 @@ CSS = """
108
  min-width:120px !important;
109
  flex:0 0 auto !important;
110
  }
111
-
112
- /* Remove the big Gradio panel/frame around the textbox column (keep textarea normal) */
113
  #input_col,
114
  #input_col > div,
115
  #input_col .gr-block,
@@ -124,52 +154,73 @@ CSS = """
124
  """
125
 
126
  # ----------------------------
127
- # Tokenization helpers
128
  # ----------------------------
129
  def simp_tok(sentence: str):
130
- # simple word/punct split; whitespace ignored
131
  return re.findall(r"\w+|[" + re.escape(string.punctuation) + "]", sentence)
132
 
133
- def normalize_token_text(s: str) -> str:
134
- # normalize newlines to spaces (same spirit as your TEI script)
135
- return re.sub(r"[\r\n]+", " ", s or "")
136
 
137
- def split_sentences_fotokenizer(text: str):
138
- """
139
- Uses fotokenizer BEGIN_SENT / END_SENT markers to split into sentence strings.
 
 
 
 
140
  """
141
- text = text or ""
142
- sentences = []
143
- buf = []
144
-
145
- toks = tokenize(text)
146
-
147
- for t in toks:
148
- if not getattr(t, "txt", ""):
149
- # marker tokens: use TOK.descr[t.kind]
150
- kind = TOK.descr[t.kind].replace(" ", "_")
151
- if kind == "BEGIN_SENT":
152
- # start a new sentence buffer
153
- buf = []
154
- elif kind == "END_SENT":
155
- s = "".join(buf).strip()
156
- if s:
157
- sentences.append(s)
158
- buf = []
159
- continue
160
 
161
- buf.append(normalize_token_text(t.txt))
 
 
 
 
 
 
 
 
162
 
163
- # flush tail if tokenizer didn't end with END_SENT
164
- tail = "".join(buf).strip()
165
- if tail:
166
- sentences.append(tail)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- # If for some reason no markers exist, fall back to whole text
169
- if not sentences and text.strip():
170
- sentences = [text.strip()]
171
 
172
- return sentences
 
 
 
 
 
173
 
174
  # ----------------------------
175
  # CSV mapping
@@ -264,6 +315,16 @@ model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN
264
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
265
  model.to(device); model.eval()
266
 
 
 
 
 
 
 
 
 
 
 
267
  if hasattr(model, "config") and hasattr(model.config, "num_labels") and model.config.num_labels != VEC_LEN:
268
  raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
269
 
@@ -304,9 +365,10 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
304
  tag = vector_to_tag(vec)
305
  wc = wc_code(vec)
306
 
307
- # Skip listing "no number/tense/person" for infinitive/imperative verbs
308
  mood_code = group_code(vec, "mood") if wc == "V" else ""
309
  skip_empty_verb_feats = (wc == "V" and mood_code in {"I", "M"}) # navnháttur or boðsháttur
 
310
 
311
  if tag == "DGd":
312
  return "fyriseting" if lang=="fo" else "preposition"
@@ -334,14 +396,15 @@ def analysis_text(vec: torch.Tensor, lang: str) -> str:
334
  if not c:
335
  continue
336
 
 
337
  if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
338
  continue
 
339
 
340
  if wc in {"P","C"} and g == "subcategory":
341
  continue
342
  if (wc, g, c) in HIDE_IN_ANALYSIS:
343
  continue
344
-
345
  lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
346
  if lbl and lbl not in parts:
347
  parts.append(lbl)
@@ -411,29 +474,15 @@ def build_overview(lang: str) -> str:
411
  lines.append("")
412
  return "\n".join(lines).strip()
413
 
414
- # ----------------------------
415
- # Model inference (single sentence)
416
- # ----------------------------
417
  def run_model(sentence: str):
418
  s = (sentence or "").strip()
419
  if not s:
420
  return []
421
-
422
  tokens = simp_tok(s)
423
  if not tokens:
424
  return []
425
-
426
- enc = tokenizer(
427
- tokens,
428
- is_split_into_words=True,
429
- add_special_tokens=True,
430
- max_length=MAX_LENGTH,
431
- padding="max_length",
432
- truncation=True,
433
- return_attention_mask=True,
434
- return_tensors="pt"
435
- )
436
-
437
  input_ids = enc["input_ids"].to(device)
438
  attention_mask = enc["attention_mask"].to(device)
439
  word_ids = enc.word_ids(batch_index=0)
@@ -455,25 +504,15 @@ def run_model(sentence: str):
455
 
456
  rows, vec_i, seen = [], 0, set()
457
  for i,wid in enumerate(word_ids):
458
- if wid is None or begin[i] != 1 or wid in seen:
459
  continue
460
  seen.add(wid)
461
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
462
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
463
  rows.append({"word": word, "vec": vec.int().tolist()})
464
  vec_i += 1
465
-
466
  return rows
467
 
468
- # ----------------------------
469
- # Model inference (multi-sentence via fotokenizer)
470
- # ----------------------------
471
- def run_model_multisentence(text: str):
472
- all_rows = []
473
- for sent in split_sentences_fotokenizer(text):
474
- all_rows.extend(run_model(sent))
475
- return all_rows
476
-
477
  def render(rows_state, lang: str):
478
  lang = "fo" if lang=="fo" else "en"
479
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
@@ -488,9 +527,6 @@ def render(rows_state, lang: str):
488
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
489
  return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
490
 
491
- # ----------------------------
492
- # UI
493
- # ----------------------------
494
  with gr.Blocks(css=CSS, title="Marka") as demo:
495
  with gr.Row(equal_height=True):
496
  with gr.Column(scale=2, elem_id="input_col"):
@@ -509,7 +545,6 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
509
  results_hdr = gr.Row(elem_id="results_hdr", visible=True)
510
  with results_hdr:
511
  results_title = gr.Markdown("### Úrslit / Results")
512
- # IMPORTANT: keep row always present; hide/show buttons only (prevents duplication)
513
  with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
514
  btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=False)
515
  btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
@@ -547,11 +582,11 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
547
  gr.update(value=df_main, visible=True),
548
  gr.update(value=df_mean),
549
  gr.update(value=overview),
550
- gr.update(visible=True), # expanded_acc
551
- gr.update(visible=show_fo), # fo_on
552
- gr.update(visible=not show_fo), # fo_off
553
- gr.update(visible=show_en), # en_on
554
- gr.update(visible=not show_en), # en_off
555
  lang_current,
556
  )
557
 
@@ -581,14 +616,11 @@ with gr.Blocks(css=CSS, title="Marka") as demo:
581
  btn.click(
582
  on_tag,
583
  inputs=[inp, lang_state],
584
- outputs=[
585
- state, out_df, out_mean_df, overview_md, expanded_acc,
586
- btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state
587
- ],
588
  queue=False,
589
  )
590
 
591
- # Language switch: re-render existing rows (does NOT rerun the model)
592
  btn_lang_fo_on.click(
593
  on_set_fo,
594
  inputs=[state],
 
1
  import os, re, string, json
2
+ import inspect
3
+ import importlib.resources as importlib_resources
4
  from collections import defaultdict
5
 
6
  import gradio as gr
 
9
  import pandas as pd
10
  from transformers import AutoTokenizer, AutoModelForTokenClassification
11
 
12
+ # ----------------------------
13
+ # Optional: FO-Tokenizer (fotokenizer) for sentence splitting
14
+ # ----------------------------
15
+ _HAS_FOTOKENIZER = False
16
  try:
17
+ import fotokenizer # noqa: F401
18
+ from fotokenizer import tokenize as fo_tokenize
19
+ from fotokenizer import TOK as FO_TOK
20
+ import fotokenizer.abbrev as fo_abbrev
21
+ _HAS_FOTOKENIZER = True
22
+ except Exception:
23
+ _HAS_FOTOKENIZER = False
24
+
25
def _patch_fotokenizer_for_py313() -> None:
    """Make fotokenizer's resource loading work on Python 3.13.

    fotokenizer calls importlib.resources.open_text(package=..., resource=...);
    Python 3.13's open_text no longer accepts the `package=` keyword, so this
    installs a small compatibility wrapper into fotokenizer.abbrev's namespace.
    No-op when fotokenizer is absent or when open_text still takes `package=`.
    """
    if not _HAS_FOTOKENIZER:
        return
    try:
        params = inspect.signature(importlib_resources.open_text).parameters
        if "package" in params:
            # Legacy keyword form still supported; nothing to patch.
            return

        def _open_text_compat(*args, **kwargs):
            # Translate the removed keyword form into positional arguments.
            if "package" in kwargs:
                pkg = kwargs.pop("package")
                res = kwargs.pop("resource")
                encoding = kwargs.pop("encoding", "utf-8")
                errors = kwargs.pop("errors", "strict")
                return importlib_resources.open_text(pkg, res, encoding=encoding, errors=errors)
            return importlib_resources.open_text(*args, **kwargs)

        # Patch the reference fotokenizer.abbrev imported into its own namespace.
        # NOTE(review): only fotokenizer.abbrev is patched — confirm no other
        # fotokenizer module imports open_text directly.
        fo_abbrev.open_text = _open_text_compat  # type: ignore[attr-defined]
    except Exception:
        # Best-effort: on failure we simply fall back to naive splitting later.
        pass
48
+
49
+ _patch_fotokenizer_for_py313()
50
 
51
  # ----------------------------
52
  # Config
 
56
  LABELS_FILEPATH = "tag_labels.json"
57
  HF_TOKEN = os.getenv("BRAGD")
58
 
59
+ TARGET_MAX_TOKENS = 256 # We will cap this to the model's max if needed.
60
 
61
  if not HF_TOKEN:
62
  raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
 
106
  color:#0b1b19 !important;
107
  }
108
 
109
+ /* Dark mode: make the INACTIVE buttons match what you had before (darker, readable) */
110
  @media (prefers-color-scheme: dark){
111
  #lang_fo_off, #lang_en_off{
112
  background:#2a3b38 !important;
 
120
  }
121
  }
122
 
123
+ /* Minimal layout so the language buttons stay hard-right like before */
124
  #results_hdr{
125
  display:flex !important;
126
  align-items:center !important;
 
139
  min-width:120px !important;
140
  flex:0 0 auto !important;
141
  }
142
+ /* Remove the big Gradio panel/frame around the textbox (keep textarea normal) */
 
143
  #input_col,
144
  #input_col > div,
145
  #input_col .gr-block,
 
154
  """
155
 
156
  # ----------------------------
157
+ # Tokenization
158
  # ----------------------------
159
def simp_tok(sentence: str):
    """Split *sentence* into word tokens and single punctuation characters.

    Whitespace is discarded; each punctuation mark becomes its own token.
    """
    pattern = r"\w+|[" + re.escape(string.punctuation) + "]"
    return re.findall(pattern, sentence)
161
 
 
 
 
162
 
163
+ # ----------------------------
164
+ # Sentence splitting
165
+ # ----------------------------
166
def split_sentences(text: str):
    """Break *text* into sentence strings.

    Prefers FO-Tokenizer (its BEGIN_SENT / END_SENT marker tokens) when the
    package imported successfully; otherwise falls back to a naive regex
    split on sentence-final punctuation.
    """
    s = (text or "").strip()
    if not s:
        return []

    if _HAS_FOTOKENIZER:
        try:
            sents, cur = [], []

            def _flush():
                # Emit the buffered sentence (if non-empty) and reset the buffer.
                joined = "".join(cur).strip()
                if joined:
                    sents.append(joined)
                cur.clear()

            for tok in fo_tokenize(s):
                if tok.txt:
                    # Real token text; fold newlines into spaces.
                    # NOTE(review): tokens are joined without separators — this
                    # assumes tok.txt carries its own spacing; confirm against
                    # fotokenizer output.
                    cur.append(re.sub(r"[\r\n]+", " ", tok.txt))
                    continue

                # Descriptor-only token (e.g. sentence boundary markers).
                descr = FO_TOK.descr.get(tok.kind, "").replace(" ", "_")
                if descr == "BEGIN_SENT":
                    if cur:
                        _flush()
                elif descr == "END_SENT":
                    _flush()
                # Any other descriptor-only token is ignored.

            # Flush a trailing sentence that never saw an END_SENT marker.
            if cur:
                _flush()

            # No markers seen at all -> treat the whole input as one sentence.
            return sents or [s]
        except Exception:
            # Fall through to the regex splitter below.
            pass

    # Fallback: split after end punctuation followed by whitespace.
    chunks = re.split(r"(?<=[.!?])\s+", s)
    return [c.strip() for c in chunks if c.strip()]
216
 
 
 
 
217
 
218
def run_model_multisentence(text: str):
    """Tag *text* sentence-by-sentence and return the concatenated row list."""
    return [row for sent in split_sentences(text) for row in run_model(sent)]
224
 
225
  # ----------------------------
226
  # CSV mapping
 
315
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
316
  model.to(device); model.eval()
317
 
318
# Effective sequence length: start from the requested target, then cap it by
# any sane limit the model config or tokenizer advertises.
MAX_TOKENS = int(TARGET_MAX_TOKENS)
_model_max = getattr(getattr(model, "config", None), "max_position_embeddings", None)
_tok_max = getattr(tokenizer, "model_max_length", None)

# Tokenizers without a real limit advertise a huge placeholder (e.g. 1e30);
# only honour limits that look like genuine positive sequence lengths.
for _limit in (_model_max, _tok_max):
    if isinstance(_limit, int) and 0 < _limit < 100000:
        MAX_TOKENS = min(MAX_TOKENS, _limit)
327
+
328
  if hasattr(model, "config") and hasattr(model.config, "num_labels") and model.config.num_labels != VEC_LEN:
329
  raise RuntimeError(f"Label size mismatch: model={model.config.num_labels}, csv={VEC_LEN}. Wrong CSV?")
330
 
 
365
  tag = vector_to_tag(vec)
366
  wc = wc_code(vec)
367
 
368
+ # --- ADDED: compute mood_code and skip flag for infinitive/imperative verbs ---
369
  mood_code = group_code(vec, "mood") if wc == "V" else ""
370
  skip_empty_verb_feats = (wc == "V" and mood_code in {"I", "M"}) # navnháttur or boðsháttur
371
+ # --- end added ---
372
 
373
  if tag == "DGd":
374
  return "fyriseting" if lang=="fo" else "preposition"
 
396
  if not c:
397
  continue
398
 
399
+ # --- ADDED: skip only the generic "no" codes for verbs in infinitive/imperative ---
400
  if skip_empty_verb_feats and g in {"number", "tense", "person"} and c in {"n", "t", "p"}:
401
  continue
402
+ # --- end added ---
403
 
404
  if wc in {"P","C"} and g == "subcategory":
405
  continue
406
  if (wc, g, c) in HIDE_IN_ANALYSIS:
407
  continue
 
408
  lbl = clean_label(label_for(lang, g, wc, c) or label_for(lang, g, "", c) or "")
409
  if lbl and lbl not in parts:
410
  parts.append(lbl)
 
474
  lines.append("")
475
  return "\n".join(lines).strip()
476
 
 
 
 
477
  def run_model(sentence: str):
478
  s = (sentence or "").strip()
479
  if not s:
480
  return []
 
481
  tokens = simp_tok(s)
482
  if not tokens:
483
  return []
484
+ enc = tokenizer(tokens, is_split_into_words=True, add_special_tokens=True, max_length=MAX_TOKENS,
485
+ padding="max_length", truncation=True, return_attention_mask=True, return_tensors="pt")
 
 
 
 
 
 
 
 
 
 
486
  input_ids = enc["input_ids"].to(device)
487
  attention_mask = enc["attention_mask"].to(device)
488
  word_ids = enc.word_ids(batch_index=0)
 
504
 
505
  rows, vec_i, seen = [], 0, set()
506
  for i,wid in enumerate(word_ids):
507
+ if wid is None or begin[i]!=1 or wid in seen:
508
  continue
509
  seen.add(wid)
510
  word = tokens[wid] if wid < len(tokens) else "<UNK>"
511
  vec = vectors[vec_i] if vec_i < len(vectors) else torch.zeros(VEC_LEN, device=device)
512
  rows.append({"word": word, "vec": vec.int().tolist()})
513
  vec_i += 1
 
514
  return rows
515
 
 
 
 
 
 
 
 
 
 
516
  def render(rows_state, lang: str):
517
  lang = "fo" if lang=="fo" else "en"
518
  df_cols = [UI[lang]["w"], UI[lang]["t"], UI[lang]["s"]]
 
527
  out_mean.append([r["word"], tag, expanded_text(vec, lang)])
528
  return (pd.DataFrame(out_main, columns=df_cols), pd.DataFrame(out_mean, columns=dfm_cols), build_overview(lang))
529
 
 
 
 
530
  with gr.Blocks(css=CSS, title="Marka") as demo:
531
  with gr.Row(equal_height=True):
532
  with gr.Column(scale=2, elem_id="input_col"):
 
545
  results_hdr = gr.Row(elem_id="results_hdr", visible=True)
546
  with results_hdr:
547
  results_title = gr.Markdown("### Úrslit / Results")
 
548
  with gr.Row(elem_id="lang_buttons") as lang_buttons_row:
549
  btn_lang_fo_on = gr.Button("Føroyskt", variant="primary", elem_id="lang_fo_on", visible=False)
550
  btn_lang_fo_off = gr.Button("Føroyskt", variant="secondary", elem_id="lang_fo_off", visible=False)
 
582
  gr.update(value=df_main, visible=True),
583
  gr.update(value=df_mean),
584
  gr.update(value=overview),
585
+ gr.update(visible=True), # expanded_acc
586
+ gr.update(visible=show_fo),
587
+ gr.update(visible=not show_fo),
588
+ gr.update(visible=show_en),
589
+ gr.update(visible=not show_en),
590
  lang_current,
591
  )
592
 
 
616
  btn.click(
617
  on_tag,
618
  inputs=[inp, lang_state],
619
+ outputs=[state, out_df, out_mean_df, overview_md, expanded_acc,
620
+ btn_lang_fo_on, btn_lang_fo_off, btn_lang_en_on, btn_lang_en_off, lang_state],
 
 
621
  queue=False,
622
  )
623
 
 
624
  btn_lang_fo_on.click(
625
  on_set_fo,
626
  inputs=[state],