kkAsmaa committed on
Commit
1a9379a
·
verified ·
1 Parent(s): 274b868

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -17
app.py CHANGED
@@ -3,26 +3,17 @@ import re
3
  import os
4
  import torch
5
  from transformers import BertTokenizer, AutoModelForSequenceClassification
6
-
7
- # استدعاء معالج التنظيف الرسمي لـ AraBERT الخاص بكِ
8
  from arabert.preprocess import ArabertPreprocessor
9
 
10
- # معرفات المستودع والمسار الفرعي لأسماء
11
  MODEL_REPO = "kkAsmaa/ChildShield"
12
  MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"
13
  SUB_FOLDER = "ChildShield"
14
-
15
- # سحب المفتاح السري تلقائياً وبأعلى درجات الأمان السيبراني
16
  HF_TOKEN = os.getenv("HF_TOKEN")
17
-
18
  print("🔄 Loading model weights from the secured ChildShield subfolder...")
19
 
20
- # استدعاء المترجم المستقر وتوجيه الموديل بدقة للمجلد الفرعي
21
  tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
22
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO, token=HF_TOKEN, subfolder=SUB_FOLDER)
23
  model.eval()
24
-
25
- # تفعيل المعالج ليتطابق مع داتا كولاب
26
  arabic_prep = ArabertPreprocessor(model_name=MODEL_NAME)
27
 
28
  def clean_obfuscation(text):
@@ -35,7 +26,6 @@ def clean_obfuscation(text):
35
  text = re.sub(r'[^\w\s\.]', ' ', text)
36
  text = re.sub(r'\s+', ' ', text)
37
  return text.strip()
38
-
39
  def full_preprocess(text):
40
  text_no_trickery = clean_obfuscation(text)
41
  final_text = arabic_prep.preprocess(text_no_trickery)
@@ -43,16 +33,13 @@ def full_preprocess(text):
43
 
44
  def predict_safety_api(text):
45
  """
46
- Main prediction core fully optimized with Asmaa's custom window configuration (60/20)
47
- and standardized native padding execution.
48
  """
49
  cleaned_text = full_preprocess(text)
50
 
51
- # 1. تقطيع النص الأولي لأرقام مجهولة الأبعاد بدون حشو مسبق
52
  full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
53
  input_ids = full_encodings['input_ids']
54
 
55
- # تثبيت أبعاد النوافذ الذكية الخاصة بكِ (60/20)
56
  window_size = 60
57
  overlap = 20
58
  windows = []
@@ -70,7 +57,7 @@ def predict_safety_api(text):
70
  highest_unsafe_prob = 0.0
71
 
72
  for win_ids in windows:
73
- # تحويل الأرقام الفرعية للنافذة الحالية إلى نص مجدداً ليمررها التوكنايزر بطريقتكِ الذكية
74
  window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
75
 
76
  inputs = tokenizer(
@@ -86,7 +73,6 @@ def predict_safety_api(text):
86
 
87
  probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
88
 
89
- # 🎯 السطر الفائز المصلح: قراءة الخانة رقم 1 المخصصة لنسبة الخطر بدقة
90
  unsafe_p = float(probs[1])
91
 
92
  if unsafe_p > 0.50:
@@ -99,7 +85,6 @@ def predict_safety_api(text):
99
  safe_p = 1.0 - highest_unsafe_prob
100
  return {"verdict": "SAFE", "block": False, "confidence": f"{safe_p * 100:.2f}%"}
101
 
102
- # بناء واجهة Gradio الاحترافية للمشروع
103
  interface = gr.Interface(
104
  fn=predict_safety_api,
105
  inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),
 
3
  import os
4
  import torch
5
  from transformers import BertTokenizer, AutoModelForSequenceClassification
 
 
6
  from arabert.preprocess import ArabertPreprocessor
7
 
 
8
  MODEL_REPO = "kkAsmaa/ChildShield"
9
  MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"
10
  SUB_FOLDER = "ChildShield"
 
 
11
  HF_TOKEN = os.getenv("HF_TOKEN")
 
12
  print("🔄 Loading model weights from the secured ChildShield subfolder...")
13
 
 
14
  tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO, token=HF_TOKEN, subfolder=SUB_FOLDER)
16
  model.eval()
 
 
17
  arabic_prep = ArabertPreprocessor(model_name=MODEL_NAME)
18
 
19
  def clean_obfuscation(text):
 
26
  text = re.sub(r'[^\w\s\.]', ' ', text)
27
  text = re.sub(r'\s+', ' ', text)
28
  return text.strip()
 
29
  def full_preprocess(text):
30
  text_no_trickery = clean_obfuscation(text)
31
  final_text = arabic_prep.preprocess(text_no_trickery)
 
33
 
34
  def predict_safety_api(text):
35
  """
36
+ Arabic text classification gateway using a custom sliding-window configuration with a 20-token overlap.
 
37
  """
38
  cleaned_text = full_preprocess(text)
39
 
 
40
  full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
41
  input_ids = full_encodings['input_ids']
42
 
 
43
  window_size = 60
44
  overlap = 20
45
  windows = []
 
57
  highest_unsafe_prob = 0.0
58
 
59
  for win_ids in windows:
60
+
61
  window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
62
 
63
  inputs = tokenizer(
 
73
 
74
  probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
75
 
 
76
  unsafe_p = float(probs[1])
77
 
78
  if unsafe_p > 0.50:
 
85
  safe_p = 1.0 - highest_unsafe_prob
86
  return {"verdict": "SAFE", "block": False, "confidence": f"{safe_p * 100:.2f}%"}
87
 
 
88
  interface = gr.Interface(
89
  fn=predict_safety_api,
90
  inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),