inayatarshad committed on
Commit
4a684d0
·
1 Parent(s): 4d21686

Add Urdu toxic lexicon fallback

Browse files
Files changed (1) hide show
  1. app.py +70 -4
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import shutil
2
  import zipfile
3
  from pathlib import Path
@@ -22,6 +23,36 @@ LABELS_PATH = ARTIFACTS_DIR / "label_classes.npy"
22
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
  TEXT_TOKENIZER = None
24
  TEXT_MODEL = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  app.add_middleware(
27
  CORSMiddleware,
@@ -99,6 +130,19 @@ def load_text_model():
99
  return TEXT_TOKENIZER, TEXT_MODEL
100
 
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  @app.on_event("startup")
103
  def startup_event():
104
  ensure_artifacts()
@@ -134,8 +178,16 @@ def predict_text(text: str):
134
  if word_id is None or word_id == previous_word_id:
135
  continue
136
 
137
- label = id2label[int(predictions[token_index])]
138
- confidence = float(probabilities[token_index][predictions[token_index]])
 
 
 
 
 
 
 
 
139
  is_toxic = label in {"B-Toxic", "I-Toxic"}
140
  word_results.append(
141
  {
@@ -143,6 +195,9 @@ def predict_text(text: str):
143
  "toxic": is_toxic,
144
  "bioTag": label,
145
  "confidence": round(confidence, 4),
 
 
 
146
  }
147
  )
148
  previous_word_id = word_id
@@ -159,11 +214,11 @@ def predict_text(text: str):
159
  "confidence": round(float(confidence), 4),
160
  "subLabel": "toxic" if toxic_words else "non-toxic",
161
  "subLabelConfidence": round(float(confidence), 4),
162
- "toxicSpanCount": len(toxic_words),
163
  "transcript": None,
164
  "words": word_results,
165
  "xai": {
166
- "modelExplanation": "XLM-RoBERTa token-classification inference using BIO toxic-span labels.",
167
  "topToxicTokens": [
168
  {
169
  "token": word["text"],
@@ -177,6 +232,17 @@ def predict_text(text: str):
177
  }
178
 
179
 
 
 
 
 
 
 
 
 
 
 
 
180
  def audio_fallback_prediction() -> dict:
181
  return {
182
  "isToxic": False,
 
1
+ import re
2
  import shutil
3
  import zipfile
4
  from pathlib import Path
 
23
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
  TEXT_TOKENIZER = None
25
  TEXT_MODEL = None
26
# Characters that normalize_word() strips from both ends of a word before the
# lexicon lookup: Urdu/Arabic and ASCII punctuation, quotes and brackets.
URDU_PUNCTUATION = "،۔؟!؛:,.!?\"'()[]{}<>«»“”‘’"
# Abuse-word list used by lexicon_match() as a fallback to the model.
# Matching rules (see lexicon_match): a normalized word matches when it equals
# an entry, or when it contains an entry of four or more characters as a
# substring; entries shorter than four characters match only as whole words.
# Words are lower-cased by normalize_word() before lookup, so Latin-script
# entries must be stored in lower case.
TOXIC_LEXICON = {
    "بہنچود",
    # NOTE(review): mixed Arabic/Latin script entry — confirm this is intentional.
    "بhenchod",
    "bhenchod",
    "بنچود",
    "مادرچود",
    "ماںچود",
    "چود",
    "چوتیا",
    "چوتیے",
    "چوتیئے",
    "حرامی",
    "حرامزادہ",
    "حرامزادی",
    "کنجر",
    "کنجری",
    "کمینہ",
    "کمینے",
    "بیوقوف",
    "احمق",
    "گھٹیا",
    "ذلیل",
    "خبیث",
    "بدتمیز",
    "بدتمیزی",
    "کتا",
    "کتے",
    # NOTE(review): possibly meant to be "گدھا" — verify the intended spelling.
    "گدا",
}
56
 
57
  app.add_middleware(
58
  CORSMiddleware,
 
130
  return TEXT_TOKENIZER, TEXT_MODEL
131
 
132
 
133
def normalize_word(word: str) -> str:
    """Return *word* reduced to a canonical form for lexicon lookup.

    The following are removed everywhere in the word: Arabic combining marks
    (U+064B-U+065F, U+0670), tatweel/kashida (U+0640), and zero-width or
    bidirectional format characters (U+200B-U+200F, U+FEFF). The result is
    then trimmed of surrounding whitespace and of any characters listed in
    ``URDU_PUNCTUATION``, lower-cased, and stripped of remaining ASCII spaces.

    Args:
        word: A single whitespace-delimited token of user text.

    Returns:
        The normalized token; an empty string when nothing but ignorable
        characters, whitespace and punctuation was present.
    """
    # Ignorable code points are dropped *before* trimming: they are invisible
    # to the reader, but a trailing ZWNJ or diacritic would otherwise shield
    # edge punctuation from strip(), and an embedded tatweel/ZWNJ would make an
    # otherwise identical word miss the lexicon.
    visible = re.sub(r"[\u0640\u064b-\u065f\u0670\u200b-\u200f\ufeff]", "", word)
    trimmed = visible.strip().strip(URDU_PUNCTUATION).lower()
    return trimmed.replace(" ", "")
137
+
138
+
139
def lexicon_match(word: str) -> bool:
    """Tell whether *word* hits the toxic lexicon after normalization.

    A hit is either an exact match against ``TOXIC_LEXICON`` or the presence,
    anywhere inside the normalized word, of a lexicon term that is at least
    four characters long. Words that normalize to an empty string never match.
    """
    candidate = normalize_word(word)
    if not candidate:
        return False

    # Whole-word hit.
    if candidate in TOXIC_LEXICON:
        return True

    # Substring hit, restricted to longer terms.
    for term in TOXIC_LEXICON:
        if len(term) >= 4 and term in candidate:
            return True
    return False
144
+
145
+
146
  @app.on_event("startup")
147
  def startup_event():
148
  ensure_artifacts()
 
178
  if word_id is None or word_id == previous_word_id:
179
  continue
180
 
181
+ model_label = id2label[int(predictions[token_index])]
182
+ model_confidence = float(probabilities[token_index][predictions[token_index]])
183
+ fallback_toxic = lexicon_match(tokens[word_id])
184
+ label = model_label
185
+ confidence = model_confidence
186
+
187
+ if fallback_toxic and model_label == "O":
188
+ label = "B-Toxic"
189
+ confidence = max(model_confidence, 0.97)
190
+
191
  is_toxic = label in {"B-Toxic", "I-Toxic"}
192
  word_results.append(
193
  {
 
195
  "toxic": is_toxic,
196
  "bioTag": label,
197
  "confidence": round(confidence, 4),
198
+ "modelBioTag": model_label,
199
+ "modelConfidence": round(model_confidence, 4),
200
+ "source": "lexicon+model" if fallback_toxic and model_label == "O" else "model",
201
  }
202
  )
203
  previous_word_id = word_id
 
214
  "confidence": round(float(confidence), 4),
215
  "subLabel": "toxic" if toxic_words else "non-toxic",
216
  "subLabelConfidence": round(float(confidence), 4),
217
+ "toxicSpanCount": count_toxic_spans(word_results),
218
  "transcript": None,
219
  "words": word_results,
220
  "xai": {
221
+ "modelExplanation": "XLM-RoBERTa BIO token classification with a conservative Urdu abuse-word fallback for obvious missed slurs.",
222
  "topToxicTokens": [
223
  {
224
  "token": word["text"],
 
232
  }
233
 
234
 
235
def count_toxic_spans(words: list[dict]) -> int:
    """Count maximal runs of consecutive toxic words.

    Each element of *words* must provide a ``"toxic"`` entry, which is
    interpreted by truthiness. A span begins at every toxic word whose
    predecessor is absent or non-toxic.
    """
    flags = [bool(entry["toxic"]) for entry in words]
    # Pair each flag with the one before it (False ahead of the first word);
    # every non-toxic -> toxic transition opens exactly one span.
    return sum(
        1
        for before, current in zip([False, *flags], flags)
        if current and not before
    )
244
+
245
+
246
  def audio_fallback_prediction() -> dict:
247
  return {
248
  "isToxic": False,