Spaces:

MoAmir
/

Arabic-Toxicity-Detection

Sleeping

App Files Files Community

MoAmir committed 28 days ago

Commit

1132f6d

verified ·

1 Parent(s): 863a9f9

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -39

app.py CHANGED Viewed

@@ -3,73 +3,69 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 import torch.nn.functional as F
 import re
-import os
-model_path = "my_arabic_toxicity_model"
-#
-try:
-    print("Loading model from local directory...")
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-    model = AutoModelForSequenceClassification.from_pretrained(model_path)
-except Exception as e:
-    print(f"Error loading local model: {e}")
-    print("Fallback to base model (Not recommended for final output)...")
-    # كود احتياطي لو الملفات مش موجودة
-    tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERTv2")
-    model = AutoModelForSequenceClassification.from_pretrained("UBC-NLP/MARBERTv2", num_labels=5)
-# --- 3.
-id2label = {
-    0: "مسيء / كراهية (Hate)",
-    1: "هجومي (Offensive)",
-    2: "عادي / محايد (Neutral)",
-    3: "إهانة (Insult)",
-    4: "تهديد (Threat)"
-}
-# --- 4.
 def clean_text(text):
     if not text: return ""
-    text = re.sub(r'[\u064B-\u0652]', '', text) # تشكيل
-    text = re.sub(r'[أإآ]', 'ا', text) # توحيد الألف
-    text = re.sub(r'ى', 'ي', text) # توحيد الياء
-    text = re.sub(r'ة', 'ه', text) # تاء مربوطة
-    text = re.sub(r'(.)\1+', r'\1', text) # تطويل
     return text
-# --- 5.
 def classify_text(text):
     if not text: return {}
-    #
     cleaned = clean_text(text)
     inputs = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True, max_length=128)
-    #
     with torch.no_grad():
         logits = model(**inputs).logits
-    #
     probs = F.softmax(logits, dim=-1)[0].numpy()
-    #
     results = {}
     for i, score in enumerate(probs):
-        label = id2label.get(i, f"Class {i}")
         results[label] = float(score)
     return results
-#
 iface = gr.Interface(
     fn=classify_text,
-    inputs=gr.Textbox(label="اكتب النص هنا", placeholder="اكتب جملة باللهجة المصرية..."),
     outputs=gr.Label(label="النتيجة"),
-    title="Arabic Toxicity Detection ",
-    description="نظام ذكي لاكتشاف الكلام المسيء باللهجة المصرية.",
-    examples=[["انت راجل محترم"], ["يا ابن الكلب"], ["دي حاجة تقرف"]]
 )
 iface.launch()

 import torch
 import torch.nn.functional as F
 import re
+import json
# --- 1. Load the model (always from the app's own directory) ---
# Deliberately no try/except fallback here: if the fine-tuned files are
# missing the app should fail at startup instead of silently serving a
# different model.
model_path = "."
print("Loading model from current directory...")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# --- 2. Read the class order from config.json ---
# Taking the id -> label mapping from the config keeps the label order in
# sync with whatever the checkpoint stores.
# The labels are Arabic text, so the file is decoded explicitly as UTF-8
# rather than with the platform's default encoding.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
    id2label = config.get('id2label')

# If the config carries no mapping, fall back to this default order.
# NOTE(review): confirm this order matches the order used at training time.
# Keys are strings because JSON object keys are always strings and the
# lookup elsewhere in this file uses str(index).
if not id2label:
    id2label = {
        "0": "مسيء / كراهية (Hate)",
        "1": "هجومي (Offensive)",
        "2": "عادي / محايد (Neutral)",
        "3": "إهانة (Insult)",
        "4": "تهديد (Threat)",
    }
+# --- 3. دالة التنضيف ---
 def clean_text(text):
     """Normalize raw user text before tokenization.

     Applied in order: remove Arabic diacritics, unify alef variants to
     bare alef, map alef maqsura to yeh and teh marbuta to heh, collapse
     runs of a repeated character to one, drop everything that is not an
     Arabic letter, an Arabic-Indic digit or whitespace, and finally
     squeeze whitespace.

     Args:
         text: The raw input string; falsy values (None, "") are accepted.

     Returns:
         The normalized string, or "" when nothing usable remains.
     """
     if not text:
         return ""
     # Diacritics (tashkeel), U+064B..U+0652.
     text = re.sub(r'[\u064B-\u0652]', '', text)
     # Alef with hamza/madda -> bare alef.
     text = re.sub(r'[أإآ]', 'ا', text)
     # Alef maqsura -> yeh.
     text = re.sub(r'ى', 'ي', text)
     # Teh marbuta -> heh.
     text = re.sub(r'ة', 'ه', text)
     # Elongation: any run of the same character becomes a single one.
     # NOTE(review): this also shortens legitimately doubled letters; kept
     # as is, presumably to match the preprocessing used at training time.
     text = re.sub(r'(.)\1+', r'\1', text)
     # Keep Arabic letters (U+0621..U+064A), Arabic-Indic digits and spaces.
     # NOTE(review): the kept range includes tatweel (U+0640) - confirm
     # that is intended.
     text = re.sub(r'[^\u0621-\u064A\u0660-\u0669\s]', '', text)
     # Removing characters can leave doubled or edge whitespace (or only
     # whitespace); squeeze it so blank results are exactly "".
     return re.sub(r'\s+', ' ', text).strip()
+# --- 4. التنبؤ ---
 def classify_text(text):
     """Classify one piece of text and return per-label probabilities.

     The text is normalized with clean_text, tokenized (truncated to 128
     tokens), run through the model without gradient tracking, and the
     logits are turned into probabilities with a softmax.

     Args:
         text: Raw user input; falsy values yield an empty result.

     Returns:
         A dict mapping label name to probability (float). Empty when the
         input is empty or contains nothing left to classify after
         cleaning.
     """
     if not text:
         return {}
     cleaned = clean_text(text)
     # Input made only of characters the cleaner removes (Latin text,
     # emoji, punctuation) would otherwise be scored as an empty string
     # and produce a meaningless distribution; treat it like empty input.
     if not cleaned or not cleaned.strip():
         return {}
     inputs = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True, max_length=128)
     with torch.no_grad():
         logits = model(**inputs).logits
     # Single input, so take row 0 of the batch.
     probs = F.softmax(logits, dim=-1)[0].tolist()
     # id2label is looked up with string keys; unknown indices get a
     # generic "Class N" name.
     return {
         id2label.get(str(i), f"Class {i}"): float(score)
         for i, score in enumerate(probs)
     }
+# --- 5. الواجهة ---
 # Build the input and output widgets first, then wire them to the
 # classifier and start the app.
 _text_box = gr.Textbox(label="اكتب النص")
 _result_label = gr.Label(label="النتيجة")
 iface = gr.Interface(
     fn=classify_text,
     inputs=_text_box,
     outputs=_result_label,
     title="Arabic Toxicity Detection",
     description="تجربة النظام (يجب أن تكون الملفات pytorch_model.bin و config.json موجودة).",
 )
 iface.launch()