kkAsmaa committed on
Commit
e8a997e
·
verified ·
1 Parent(s): ac5b8f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -35
app.py CHANGED
@@ -9,7 +9,7 @@ MODEL_REPO = "kkAsmaa/ChildShield"
9
  MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"
10
  SUB_FOLDER = "ChildShield"
11
  HF_TOKEN = os.getenv("HF_TOKEN")
12
- print("🔄 Loading ChildShield Robust Production Engine...")
13
 
14
  tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO, token=HF_TOKEN, subfolder=SUB_FOLDER)
@@ -26,19 +26,21 @@ def clean_obfuscation(text):
26
  text = re.sub(r'[^\w\s\.]', ' ', text)
27
  text = re.sub(r'\s+', ' ', text)
28
  return text.strip()
29
-
30
  def full_preprocess(text):
31
  text_no_trickery = clean_obfuscation(text)
32
  final_text = arabic_prep.preprocess(text_no_trickery)
33
  return final_text
34
 
35
  def predict_safety_api(text):
36
- """بوابة الفحص الأساسية الشاملة والمقاومة لأخطاء التضارب اللغوي للمتصفحات"""
 
 
 
37
  cleaned_text = full_preprocess(text)
 
38
  full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
39
  input_ids = full_encodings['input_ids']
40
 
41
- total_tokens = len(input_ids)
42
  window_size = 60
43
  overlap = 20
44
  windows = []
@@ -52,61 +54,45 @@ def predict_safety_api(text):
52
  if len(window) > 0: windows.append(window)
53
  if i + window_size >= len(input_ids): break
54
 
55
- total_windows = len(windows)
56
  is_blocked = False
57
  highest_unsafe_prob = 0.0
58
- triggered_sentences = []
59
 
60
  for win_ids in windows:
 
61
  window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
62
- inputs = tokenizer(window_text, return_tensors="pt", truncation=True, padding="max_length", max_length=60)
 
 
 
 
 
 
 
63
 
64
  with torch.no_grad():
65
  outputs = model(**inputs)
66
 
67
  probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
68
- unsafe_p = float(probs[1]) # قراءة معامل الخطر بدقة من الفئة رقم 1
 
69
 
70
  if unsafe_p > 0.50:
71
  is_blocked = True
72
  highest_unsafe_prob = max(highest_unsafe_prob, unsafe_p)
73
- if window_text not in triggered_sentences:
74
- triggered_sentences.append(window_text)
75
-
76
- print("\n📊 --- ChildShield Core Inspection Report ---")
77
- print(f"📥 Received Text Preview: {text[:60]}...")
78
- print(f"🔑 Total Tokens Evaluated: {total_tokens}")
79
- print(f"🪟 Total Windows Processed: {total_windows}")
80
- print(f"🚨 Verdict: {'UNSAFE' if is_blocked else 'SAFE'}")
81
- print("---------------------------------------------\n")
82
 
83
- # 🎯 التحديث الحاسم: إرسال حالة الحظر كنص صريح يفك عقدة الجافا سكربت فوراً
84
  if is_blocked:
85
- return {
86
- "verdict": "UNSAFE",
87
- "block": "true", # نص صريح صغير
88
- "confidence": f"{highest_unsafe_prob * 100:.2f}%",
89
- "evaluated_tokens": total_tokens,
90
- "processed_windows": total_windows,
91
- "triggered_phrases": triggered_sentences
92
- }
93
 
94
  safe_p = 1.0 - highest_unsafe_prob
95
- return {
96
- "verdict": "SAFE",
97
- "block": "false", # نص صريح صغير
98
- "confidence": f"{safe_p * 100:.2f}%",
99
- "evaluated_tokens": total_tokens,
100
- "processed_windows": total_windows,
101
- "triggered_phrases": []
102
- }
103
 
104
  interface = gr.Interface(
105
  fn=predict_safety_api,
106
  inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),
107
  outputs=gr.JSON(label="Guard Response Object"),
108
- title="ChildShield Production API Gate 🛡️"
109
  )
110
 
111
  if __name__ == "__main__":
112
  interface.launch()
 
 
9
  MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"
10
  SUB_FOLDER = "ChildShield"
11
  HF_TOKEN = os.getenv("HF_TOKEN")
12
+ print("🔄 Loading model weights from the secured ChildShield subfolder...")
13
 
14
  tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO, token=HF_TOKEN, subfolder=SUB_FOLDER)
 
26
  text = re.sub(r'[^\w\s\.]', ' ', text)
27
  text = re.sub(r'\s+', ' ', text)
28
  return text.strip()
 
29
  def full_preprocess(text):
30
  text_no_trickery = clean_obfuscation(text)
31
  final_text = arabic_prep.preprocess(text_no_trickery)
32
  return final_text
33
 
34
  def predict_safety_api(text):
35
+ """
36
+ Arabic text classification gateway utilizing a custom sliding window configuration with 20 token overlap.
37
+ """
38
+ print(f"[Incoming text to evaluate]: {text}")
39
  cleaned_text = full_preprocess(text)
40
+
41
  full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
42
  input_ids = full_encodings['input_ids']
43
 
 
44
  window_size = 60
45
  overlap = 20
46
  windows = []
 
54
  if len(window) > 0: windows.append(window)
55
  if i + window_size >= len(input_ids): break
56
 
 
57
  is_blocked = False
58
  highest_unsafe_prob = 0.0
 
59
 
60
  for win_ids in windows:
61
+
62
  window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
63
+
64
+ inputs = tokenizer(
65
+ window_text,
66
+ return_tensors="pt",
67
+ truncation=True,
68
+ padding="max_length",
69
+ max_length=60
70
+ )
71
 
72
  with torch.no_grad():
73
  outputs = model(**inputs)
74
 
75
  probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
76
+
77
+ unsafe_p = float(probs[1])
78
 
79
  if unsafe_p > 0.50:
80
  is_blocked = True
81
  highest_unsafe_prob = max(highest_unsafe_prob, unsafe_p)
 
 
 
 
 
 
 
 
 
82
 
 
83
  if is_blocked:
84
+ return {"verdict": "UNSAFE", "block": True, "confidence": f"{highest_unsafe_prob * 100:.2f}%"}
 
 
 
 
 
 
 
85
 
86
  safe_p = 1.0 - highest_unsafe_prob
87
+ return {"verdict": "SAFE", "block": False, "confidence": f"{safe_p * 100:.2f}%"}
 
 
 
 
 
 
 
88
 
89
  interface = gr.Interface(
90
  fn=predict_safety_api,
91
  inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),
92
  outputs=gr.JSON(label="Guard Response Object"),
93
+ title="ChildShield Production API Gate (Arabic Version)🛡️"
94
  )
95
 
96
  if __name__ == "__main__":
97
  interface.launch()
98
+