unijoh committed on
Commit
fb93f14
·
verified ·
1 Parent(s): af92582

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -55
app.py CHANGED
@@ -1,40 +1,28 @@
1
- print("RUNNING APP.PY VERSION: 2026-01-15 16:12 FIXED")
2
 
3
  import os
 
 
 
4
  import gradio as gr
5
  import torch
6
  import numpy as np
7
  import pandas as pd
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification
9
 
 
 
 
10
  MODEL_ID = "Setur/BRAGD"
11
- TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"
12
- HF_TOKEN = os.getenv("BRAGD")
13
 
14
  if not HF_TOKEN:
15
- raise RuntimeError("Missing BRAGD token secret.")
16
-
17
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
18
- model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)
19
 
20
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
- model.to(device)
22
- model.eval()
23
-
24
- def load_tag_mappings(tags_filepath):
25
- tags_df = pd.read_csv(tags_filepath)
26
- features_to_tag = {
27
- tuple(row[1:].values.astype(int)): row["Original Tag"]
28
- for _, row in tags_df.iterrows()
29
- }
30
- vec_len = len(tags_df.columns) - 1
31
- return features_to_tag, vec_len
32
-
33
- features_to_tag, VEC_LEN = load_tag_mappings(TAGS_FILEPATH)
34
-
35
- # Use the SAME intervals as your demo.py (keep these consistent!)
36
- intervals = (
37
- (15, 29), # Subcategories (D,B,E,I,P,Q,N,G,R, X, S,C,O,T,s)
38
  (30, 33), # Gender (M,F,N,g)
39
  (34, 36), # Number (S,P,n)
40
  (37, 41), # Case (N,A,D,G,c)
@@ -49,77 +37,194 @@ intervals = (
49
  (71, 72), # Definiteness (D,I)
50
  )
51
 
52
- def vector_to_tag(vec):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  return features_to_tag.get(tuple(vec.int().tolist()), "Unknown Tag")
54
 
55
- def tag_sentence(sentence: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  sentence = sentence.strip()
57
  if not sentence:
58
  return ""
59
 
60
- tokens = sentence.split()
 
 
61
 
62
  enc = tokenizer(
63
  tokens,
64
  is_split_into_words=True,
65
  add_special_tokens=True,
66
- max_length=128,
67
  padding="max_length",
68
  truncation=True,
69
  return_attention_mask=True,
70
- return_tensors="pt"
71
  )
72
 
73
  input_ids = enc["input_ids"].to(device)
74
  attention_mask = enc["attention_mask"].to(device)
75
  word_ids = enc.word_ids(batch_index=0)
76
 
77
- # begin token mask
78
- begin = []
79
  last = None
80
  for wid in word_ids:
81
  if wid is None:
82
- begin.append(0)
83
  elif wid != last:
84
- begin.append(1)
85
  else:
86
- begin.append(0)
87
  last = wid
88
 
89
  with torch.no_grad():
90
  out = model(input_ids=input_ids, attention_mask=attention_mask)
91
  logits = out.logits[0] # [seq_len, num_labels]
92
 
93
- lines = []
94
- for i in range(logits.shape[0]):
95
- if attention_mask[0, i].item() != 1 or begin[i] != 1:
96
- continue
97
-
98
- pred = logits[i]
99
- vec = torch.zeros(VEC_LEN, device=logits.device)
100
 
101
- # Word type in [0..14]
102
- wt = torch.argmax(pred[0:15]).item()
103
- vec[wt] = 1
 
104
 
105
- # Interval decoding
106
- for a, b in intervals:
107
- seg = pred[a:b+1]
108
- k = torch.argmax(seg).item()
109
- vec[a + k] = 1
 
 
110
 
111
- wid = word_ids[i]
112
- word = tokens[wid] if wid is not None and wid < len(tokens) else "<UNK>"
113
- lines.append(f"{word}\t{vector_to_tag(vec)}")
 
 
114
 
115
  return "\n".join(lines)
116
 
 
 
 
117
  demo = gr.Interface(
118
  fn=tag_sentence,
119
  inputs=gr.Textbox(lines=2, label="Setningur"),
120
  outputs=gr.Textbox(lines=12, label="Orð\\tMark"),
121
- title="BRAGD-markarin"
 
 
 
 
 
122
  )
123
 
124
  if __name__ == "__main__":
125
- demo.launch()
 
1
+ print("RUNNING APP.PY VERSION: 2026-01-15 16:20 DICT_INTERVALS + REGEX TOK")
2
 
3
  import os
4
+ import re
5
+ import string
6
+
7
  import gradio as gr
8
  import torch
9
  import numpy as np
10
  import pandas as pd
11
  from transformers import AutoTokenizer, AutoModelForTokenClassification
12
 
13
# ----------------------------
# Config
# ----------------------------
MODEL_ID = "Setur/BRAGD"  # Hub repo loaded below with the token (presumably gated/private — confirm)
TAGS_FILEPATH = "Sosialurin-BRAGD_tags.csv"  # must be present in the Space repo
HF_TOKEN = os.getenv("BRAGD")  # Space secret name

# Fail fast at import time so the Space shows a clear configuration error
# instead of a later authentication failure from the Hub.
if not HF_TOKEN:
    raise RuntimeError("Missing BRAGD token secret (Space → Settings → Secrets → BRAGD).")
 
 
 
22
 
23
+ # Match UPDATED demo.py intervals
24
+ INTERVALS = (
25
+ (15, 29), # Subcategories (D,B,E,I,P,Q,N,G,R,X,S,C,O,T,s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  (30, 33), # Gender (M,F,N,g)
27
  (34, 36), # Number (S,P,n)
28
  (37, 41), # Case (N,A,D,G,c)
 
37
  (71, 72), # Definiteness (D,I)
38
  )
39
 
40
# ----------------------------
# Load model + tokenizer
# ----------------------------
# Both loads authenticate with the Space secret; performed once at import so
# every request reuses the same in-memory model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, token=HF_TOKEN)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only — disables dropout and other train-mode behavior
49
+
50
+ # ----------------------------
51
+ # Tag mapping + dict_intervals
52
+ # ----------------------------
53
def load_tag_mappings(tags_filepath: str):
    """Load the tag <-> feature-vector mappings from the CSV.

    The CSV's first column is "Original Tag"; every remaining column is one
    binary feature of the morphological tag vector.

    Args:
        tags_filepath: path to the tag-mapping CSV.

    Returns:
        tag_to_features: dict mapping tag string -> numpy int array.
        features_to_tag: dict mapping tuple of ints -> tag string.
        vec_len: number of feature columns (length of each vector).
    """
    tags_df = pd.read_csv(tags_filepath)

    tag_to_features = {}
    features_to_tag = {}
    # Single pass over the rows (the original built each dict with its own
    # iterrows() pass).  .iloc makes the positional slice explicit instead
    # of relying on integer-slice semantics of Series.__getitem__.
    for _, row in tags_df.iterrows():
        features = row.iloc[1:].to_numpy().astype(int)
        tag = row["Original Tag"]
        tag_to_features[tag] = features
        features_to_tag[tuple(features)] = tag

    vec_len = len(tags_df.columns) - 1
    return tag_to_features, features_to_tag, vec_len
62
+
63
# Loaded once at startup; VEC_LEN must match the model's num_labels
# (validated by the safety check that follows).
tag_to_features, features_to_tag, VEC_LEN = load_tag_mappings(TAGS_FILEPATH)
64
+
65
# Safety check: if this fails, you uploaded the wrong CSV for the model —
# the CSV's feature-column count must equal the classifier head's size.
if hasattr(model, "config") and hasattr(model.config, "num_labels"):
    if model.config.num_labels != VEC_LEN:
        raise RuntimeError(
            f"Label size mismatch: model has num_labels={model.config.num_labels}, "
            f"but {TAGS_FILEPATH} implies {VEC_LEN}. "
            "You likely uploaded the wrong tag mapping CSV."
        )
73
+
74
def process_tag_features(tag_to_features: dict, intervals):
    """Derive, per POS class (0..14), which feature intervals are in use.

    Mirrors the dict_intervals computation in demo.py: an interval is
    "allowed" for a POS class if at least one known tag of that class sets
    a feature inside it.

    Args:
        tag_to_features: tag string -> numpy int feature vector.
        intervals: iterable of (start, end) inclusive index pairs.

    Returns:
        dict mapping POS index -> list of allowed (start, end) intervals.
    """
    # Deduplicate the feature vectors before any per-POS work.
    distinct = {tuple(vec) for vec in tag_to_features.values()}
    unique_vectors = [np.array(t) for t in distinct]

    dict_intervals = {}
    for pos in range(15):
        # Every distinct vector whose POS bit is set.
        members = [v for v in unique_vectors if v[pos] == 1]
        if not members:
            dict_intervals[pos] = []
            continue

        # Column-wise totals: a nonzero slice means some tag of this POS
        # uses a feature within that interval.
        totals = np.array(members).sum(axis=0)
        dict_intervals[pos] = [
            (lo, hi) for (lo, hi) in intervals if totals[lo : hi + 1].sum() != 0
        ]

    return dict_intervals
101
+
102
# Precomputed once at startup; consulted by predict_vectors on every request.
DICT_INTERVALS = process_tag_features(tag_to_features, INTERVALS)
103
+
104
def vector_to_tag(vec: torch.Tensor) -> str:
    """Look up the human-readable tag for a decoded binary feature vector.

    Falls back to "Unknown Tag" when the vector matches no CSV entry.
    """
    key = tuple(vec.int().tolist())
    return features_to_tag.get(key, "Unknown Tag")
106
 
107
+ # ----------------------------
108
+ # Tokenization (match updated demo.py)
109
+ # ----------------------------
110
def simp_tok(sentence: str):
    """Split *sentence* into word tokens and single punctuation tokens.

    Matches the tokenizer used in the updated demo.py: runs of word
    characters, or one ASCII punctuation character per token.
    """
    punct_class = "[" + re.escape(string.punctuation) + "]"
    pattern = rf"\w+|{punct_class}"
    return re.findall(pattern, sentence)
113
+
114
+ # ----------------------------
115
+ # Decoding (match updated demo.py logic)
116
+ # ----------------------------
117
def predict_vectors(logits: torch.Tensor, attention_mask: torch.Tensor, begin_tokens, dict_intervals, vec_len: int):
    """Decode one binary feature vector per word-initial token.

    For each position that is both attended and the first subtoken of a
    word: pick the POS class (indices 0..14) by argmax, then set exactly
    one winning feature inside every interval allowed for that POS.

    Args:
        logits: [seq_len, num_labels] raw model outputs for one sentence.
        attention_mask: [seq_len] mask (1 = real token).
        begin_tokens: per-position flags (1 = first subtoken of a word).
        dict_intervals: POS index -> list of (start, end) inclusive intervals.
        vec_len: length of each output feature vector.

    Returns:
        List of tensors (one per word, in sentence order).
    """
    vectors = []

    for idx in range(len(logits)):
        # Only decode attended, word-initial positions.
        if attention_mask[idx].item() != 1 or begin_tokens[idx] != 1:
            continue

        pred_logits = logits[idx]
        vec = torch.zeros(vec_len, device=logits.device)

        # POS class: argmax over raw logits.  The original applied softmax
        # first, but softmax is strictly monotone, so the argmax is
        # identical — dropped as redundant per-token work.
        wt = torch.argmax(pred_logits[0:15]).item()
        vec[wt] = 1

        # Exactly one winner per feature group allowed for this POS.
        for a, b in dict_intervals.get(wt, []):
            k = torch.argmax(pred_logits[a : b + 1]).item()
            vec[a + k] = 1

        vectors.append(vec)

    return vectors
150
+
151
def tag_sentence(sentence: str, max_len: int = 128):
    """Tag one input sentence; return a "word<TAB>tag" line per word.

    Args:
        sentence: raw input text; split with simp_tok into words and
            punctuation before encoding.
        max_len: tokenizer max_length — subtokens beyond it are truncated,
            so very long sentences lose their tail words.

    Returns:
        Newline-joined "word\\ttag" lines, or "" for empty/whitespace input.
    """
    sentence = sentence.strip()
    if not sentence:
        return ""

    tokens = simp_tok(sentence)
    if not tokens:
        return ""

    # Encode pre-split words so subtokens can be mapped back via word_ids().
    enc = tokenizer(
        tokens,
        is_split_into_words=True,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)
    word_ids = enc.word_ids(batch_index=0)

    # Begin-token mask: 1 at the first subtoken of each word.  Predictions
    # are read only at these positions; continuation subtokens and special
    # tokens (word id None) are ignored.
    begin_tokens = []
    last = None
    for wid in word_ids:
        if wid is None:
            begin_tokens.append(0)
        elif wid != last:
            begin_tokens.append(1)
        else:
            begin_tokens.append(0)
        last = wid

    with torch.no_grad():
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = out.logits[0]  # [seq_len, num_labels]

    # One decoded feature vector per word-initial position, in order.
    vectors = predict_vectors(logits, attention_mask[0], begin_tokens, DICT_INTERVALS, VEC_LEN)

    # Map vectors back to tokens (one vector per original word).  vec_i
    # stays aligned with `vectors` because this loop applies the same
    # begin_tokens filter that predict_vectors used.
    # NOTE(review): seen_word_ids looks redundant with the begin_tokens
    # check (word ids should not repeat) — kept as a defensive guard.
    lines = []
    vec_i = 0
    seen_word_ids = set()

    for i, wid in enumerate(word_ids):
        if wid is None:
            continue
        if begin_tokens[i] != 1:
            continue
        if wid in seen_word_ids:
            continue

        seen_word_ids.add(wid)
        word = tokens[wid] if wid < len(tokens) else "<UNK>"
        tag = vector_to_tag(vectors[vec_i]) if vec_i < len(vectors) else "Unknown Tag"
        lines.append(f"{word}\t{tag}")
        vec_i += 1

    return "\n".join(lines)
213
 
214
# ----------------------------
# Gradio UI
# ----------------------------
# Single text-in / text-out interface wrapping tag_sentence.
demo = gr.Interface(
    fn=tag_sentence,
    inputs=gr.Textbox(lines=2, label="Setningur"),
    outputs=gr.Textbox(lines=12, label="Orð\\tMark"),
    title="BRAGD-markarin",
    description=(
        "Skriv ein setning og fá mark (POS/morfologi). "
        "Model: Setur/BRAGD. "
        "Um alt verður 'Unknown Tag', er tags-fílan ofta skeiv (skeivt CSV) ella labels samsvara ikki."
    ),
)

# Script entry point (HF Spaces runs app.py as __main__).
if __name__ == "__main__":
    demo.launch()