Spaces:

finalyear226
/

urtox-api

Sleeping

App Files Files Community

inayatarshad committed 22 days ago

Commit

4a684d0

1 Parent(s): 4d21686

Add Urdu toxic lexicon fallback

Browse files

Files changed (1) hide show

app.py +70 -4

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import shutil
 import zipfile
 from pathlib import Path
@@ -22,6 +23,36 @@ LABELS_PATH = ARTIFACTS_DIR / "label_classes.npy"
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 TEXT_TOKENIZER = None
 TEXT_MODEL = None
 app.add_middleware(
     CORSMiddleware,
@@ -99,6 +130,19 @@ def load_text_model():
     return TEXT_TOKENIZER, TEXT_MODEL
 @app.on_event("startup")
 def startup_event():
     ensure_artifacts()
@@ -134,8 +178,16 @@ def predict_text(text: str):
         if word_id is None or word_id == previous_word_id:
             continue
-        label = id2label[int(predictions[token_index])]
-        confidence = float(probabilities[token_index][predictions[token_index]])
         is_toxic = label in {"B-Toxic", "I-Toxic"}
         word_results.append(
             {
@@ -143,6 +195,9 @@ def predict_text(text: str):
                 "toxic": is_toxic,
                 "bioTag": label,
                 "confidence": round(confidence, 4),
             }
         )
         previous_word_id = word_id
@@ -159,11 +214,11 @@ def predict_text(text: str):
         "confidence": round(float(confidence), 4),
         "subLabel": "toxic" if toxic_words else "non-toxic",
         "subLabelConfidence": round(float(confidence), 4),
-        "toxicSpanCount": len(toxic_words),
         "transcript": None,
         "words": word_results,
         "xai": {
-            "modelExplanation": "XLM-RoBERTa token-classification inference using BIO toxic-span labels.",
             "topToxicTokens": [
                 {
                     "token": word["text"],
@@ -177,6 +232,17 @@ def predict_text(text: str):
     }
 def audio_fallback_prediction() -> dict:
     return {
         "isToxic": False,

+import re
 import shutil
 import zipfile
 from pathlib import Path
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 TEXT_TOKENIZER = None
 TEXT_MODEL = None
+URDU_PUNCTUATION = "،۔؟!؛:,.!?\"'()[]{}<>«»“”‘’"  # characters normalize_word strips from both ends of a word
+TOXIC_LEXICON = {  # terms lexicon_match looks up; only entries of length >= 4 are also tried as substrings
+    "بہنچود",
+    "بhenchod",  # NOTE(review): mixes Urdu and Latin script; confirm this entry is intentional
+    "bhenchod",
+    "بنچود",
+    "مادرچود",
+    "ماںچود",
+    "چود",  # shorter than 4 characters, so matched only as a whole normalized word
+    "چوتیا",
+    "چوتیے",
+    "چوتیئے",
+    "حرامی",
+    "حرامزادہ",
+    "حرامزادی",
+    "کنجر",
+    "کنجری",
+    "کمینہ",
+    "کمینے",
+    "بیوقوف",
+    "احمق",
+    "گھٹیا",
+    "ذلیل",
+    "خبیث",
+    "بدتمیز",
+    "بدتمیزی",
+    "کتا",  # shorter than 4 characters, so matched only as a whole normalized word
+    "کتے",  # shorter than 4 characters, so matched only as a whole normalized word
+    "گدا",  # NOTE(review): possibly a typo for "گدھا"; confirm the intended term
+}
 app.add_middleware(
     CORSMiddleware,
     return TEXT_TOKENIZER, TEXT_MODEL
def normalize_word(word: str) -> str:
    """Return *word* reduced to a bare form suitable for lexicon lookup.

    Steps, in order:
      1. Remove characters that carry no lexical content: Arabic-script
         combining marks (U+064B-U+065F, U+0670), the tatweel/kashida
         elongation character (U+0640), and zero-width / bidi format
         characters (U+200B-U+200F, U+FEFF).
      2. Trim surrounding whitespace, then any leading/trailing characters
         listed in ``URDU_PUNCTUATION``.
      3. Lower-case (relevant for Latin-script input) and drop ASCII spaces.

    Args:
        word: A single raw token.

    Returns:
        The normalized token; may be the empty string.
    """
    # Ignorable characters go first: a trailing mark or invisible format
    # character would otherwise sit between the word and its punctuation and
    # stop ``str.strip`` from trimming it. A tatweel or zero-width joiner
    # inside a word would likewise defeat both exact and substring lookup.
    cleaned = re.sub(
        r"[\u0640\u064b-\u065f\u0670\u200b-\u200f\ufeff]", "", word
    )
    cleaned = cleaned.strip().strip(URDU_PUNCTUATION).lower()
    return cleaned.replace(" ", "")
def lexicon_match(word: str) -> bool:
    """Report whether *word* hits the toxic lexicon after normalization.

    A hit is either an exact match of the normalized word against
    ``TOXIC_LEXICON``, or any lexicon entry of at least four characters
    occurring anywhere inside the normalized word. A word that normalizes
    to the empty string never matches.

    NOTE(review): the substring rule will also flag longer words that merely
    contain an entry; confirm that is acceptable for the fallback.
    """
    candidate = normalize_word(word)
    if candidate == "":
        return False
    if candidate in TOXIC_LEXICON:
        return True
    # Short entries are excluded here and can only match exactly (above).
    for entry in TOXIC_LEXICON:
        if len(entry) >= 4 and entry in candidate:
            return True
    return False
 @app.on_event("startup")
 def startup_event():
     ensure_artifacts()
         if word_id is None or word_id == previous_word_id:
             continue
+        model_label = id2label[int(predictions[token_index])]
+        model_confidence = float(probabilities[token_index][predictions[token_index]])
+        fallback_toxic = lexicon_match(tokens[word_id])
+        label = model_label
+        confidence = model_confidence
+        if fallback_toxic and model_label == "O":
+            label = "B-Toxic"
+            confidence = max(model_confidence, 0.97)
         is_toxic = label in {"B-Toxic", "I-Toxic"}
         word_results.append(
             {
                 "toxic": is_toxic,
                 "bioTag": label,
                 "confidence": round(confidence, 4),
+                "modelBioTag": model_label,
+                "modelConfidence": round(model_confidence, 4),
+                "source": "lexicon+model" if fallback_toxic and model_label == "O" else "model",
             }
         )
         previous_word_id = word_id
         "confidence": round(float(confidence), 4),
         "subLabel": "toxic" if toxic_words else "non-toxic",
         "subLabelConfidence": round(float(confidence), 4),
+        "toxicSpanCount": count_toxic_spans(word_results),
         "transcript": None,
         "words": word_results,
         "xai": {
+            "modelExplanation": "XLM-RoBERTa BIO token classification with a conservative Urdu abuse-word fallback for obvious missed slurs.",
             "topToxicTokens": [
                 {
                     "token": word["text"],
     }
def count_toxic_spans(words: list[dict]) -> int:
    """Count maximal runs of consecutive toxic words.

    Args:
        words: Word records in order; each must have a ``"toxic"`` key whose
            truthiness marks the word as toxic.

    Returns:
        The number of contiguous toxic runs (0 for an empty sequence).
    """
    flags = [bool(entry["toxic"]) for entry in words]
    # A run starts wherever a toxic word is not preceded by another toxic word.
    return sum(
        1
        for position, flagged in enumerate(flags)
        if flagged and (position == 0 or not flags[position - 1])
    )
 def audio_fallback_prediction() -> dict:
     return {
         "isToxic": False,