kkAsmaa committed on
Commit
1063491
·
verified ·
1 Parent(s): ce94a9b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -19
app.py CHANGED
@@ -9,7 +9,7 @@ MODEL_REPO = "kkAsmaa/ChildShield"
9
  MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"
10
  SUB_FOLDER = "ChildShield"
11
  HF_TOKEN = os.getenv("HF_TOKEN")
12
- print("🔄 Loading model weights from the secured ChildShield subfolder...")
13
 
14
  tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO, token=HF_TOKEN, subfolder=SUB_FOLDER)
@@ -26,21 +26,19 @@ def clean_obfuscation(text):
26
  text = re.sub(r'[^\w\s\.]', ' ', text)
27
  text = re.sub(r'\s+', ' ', text)
28
  return text.strip()
 
29
  def full_preprocess(text):
30
  text_no_trickery = clean_obfuscation(text)
31
  final_text = arabic_prep.preprocess(text_no_trickery)
32
  return final_text
33
 
34
  def predict_safety_api(text):
35
- """
36
- Arabic text classification gateway utilizing a custom sliding window configuration with 20 token overlap.
37
- """
38
- print(f"[Incoming text to evaluate]: {text}")
39
  cleaned_text = full_preprocess(text)
40
-
41
  full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
42
  input_ids = full_encodings['input_ids']
43
 
 
44
  window_size = 60
45
  overlap = 20
46
  windows = []
@@ -54,46 +52,65 @@ def predict_safety_api(text):
54
  if len(window) > 0: windows.append(window)
55
  if i + window_size >= len(input_ids): break
56
 
 
57
  is_blocked = False
58
  highest_unsafe_prob = 0.0
 
59
 
60
  for win_ids in windows:
61
-
62
  window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
63
-
64
- inputs = tokenizer(
65
- window_text,
66
- return_tensors="pt",
67
- truncation=True,
68
- padding="max_length",
69
- max_length=60
70
- )
71
 
72
  with torch.no_grad():
73
  outputs = model(**inputs)
74
 
75
  probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
76
-
77
  unsafe_p = float(probs[1])
78
 
79
  if unsafe_p > 0.50:
80
  is_blocked = True
81
  highest_unsafe_prob = max(highest_unsafe_prob, unsafe_p)
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  if is_blocked:
84
- return {"verdict": "UNSAFE", "block": True, "confidence": f"{highest_unsafe_prob * 100:.2f}%"}
 
 
 
 
 
 
 
85
 
86
  safe_p = 1.0 - highest_unsafe_prob
87
- return {"verdict": "SAFE", "block": False, "confidence": f"{safe_p * 100:.2f}%"}
 
 
 
 
 
 
 
88
 
89
  interface = gr.Interface(
90
  fn=predict_safety_api,
91
  inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),
92
  outputs=gr.JSON(label="Guard Response Object"),
93
- title="ChildShield Production API Gate (Arabic Version)🛡️"
94
  )
95
 
96
  if __name__ == "__main__":
97
  interface.launch()
98
 
99
 
 
 
9
  MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"
10
  SUB_FOLDER = "ChildShield"
11
  HF_TOKEN = os.getenv("HF_TOKEN")
12
+ print("🔄 Loading ChildShield Explainable AI Core...")
13
 
14
  tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO, token=HF_TOKEN, subfolder=SUB_FOLDER)
 
26
  text = re.sub(r'[^\w\s\.]', ' ', text)
27
  text = re.sub(r'\s+', ' ', text)
28
  return text.strip()
29
+
30
  def full_preprocess(text):
31
  text_no_trickery = clean_obfuscation(text)
32
  final_text = arabic_prep.preprocess(text_no_trickery)
33
  return final_text
34
 
35
  def predict_safety_api(text):
36
+ """بوابة الفحص الأساسية الشاملة مع سجلات الرصد الحية"""
 
 
 
37
  cleaned_text = full_preprocess(text)
 
38
  full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
39
  input_ids = full_encodings['input_ids']
40
 
41
+ total_tokens = len(input_ids)
42
  window_size = 60
43
  overlap = 20
44
  windows = []
 
52
  if len(window) > 0: windows.append(window)
53
  if i + window_size >= len(input_ids): break
54
 
55
+ total_windows = len(windows)
56
  is_blocked = False
57
  highest_unsafe_prob = 0.0
58
+ triggered_sentences = []
59
 
60
  for win_ids in windows:
 
61
  window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
62
+ inputs = tokenizer(window_text, return_tensors="pt", truncation=True, padding="max_length", max_length=60)
 
 
 
 
 
 
 
63
 
64
  with torch.no_grad():
65
  outputs = model(**inputs)
66
 
67
  probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
 
68
  unsafe_p = float(probs[1])
69
 
70
  if unsafe_p > 0.50:
71
  is_blocked = True
72
  highest_unsafe_prob = max(highest_unsafe_prob, unsafe_p)
73
+ if window_text not in triggered_sentences:
74
+ triggered_sentences.append(window_text)
75
+
76
+ # 🎯 طباعة التقرير الشامل فوراً داخل شاشة الـ Logs السوداء ليظهر أمام الدكاترة حياً عند اتصال الامتداد
77
+ print("\n📊 --- ChildShield Core Inspection Report ---")
78
+ print(f"📥 Received Text Preview: {text[:60]}...")
79
+ print(f"🔑 Total Tokens Evaluated: {total_tokens}")
80
+ print(f"🪟 Total Windows Processed: {total_windows}")
81
+ print(f"🚨 Verdict: {'UNSAFE (BLOCK)' if is_blocked else 'SAFE (PASS)'}")
82
+ print(f"🛑 Triggered Phrases Captured: {triggered_sentences}")
83
+ print("---------------------------------------------\n")
84
 
85
  if is_blocked:
86
+ return {
87
+ "verdict": "UNSAFE",
88
+ "block": True,
89
+ "confidence": f"{highest_unsafe_prob * 100:.2f}%",
90
+ "evaluated_tokens": total_tokens,
91
+ "processed_windows": total_windows,
92
+ "triggered_phrases": triggered_sentences
93
+ }
94
 
95
  safe_p = 1.0 - highest_unsafe_prob
96
+ return {
97
+ "verdict": "SAFE",
98
+ "block": False,
99
+ "confidence": f"{safe_p * 100:.2f}%",
100
+ "evaluated_tokens": total_tokens,
101
+ "processed_windows": total_windows,
102
+ "triggered_phrases": []
103
+ }
104
 
105
  interface = gr.Interface(
106
  fn=predict_safety_api,
107
  inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),
108
  outputs=gr.JSON(label="Guard Response Object"),
109
+ title="ChildShield Production API Gate 🛡️"
110
  )
111
 
112
  if __name__ == "__main__":
113
  interface.launch()
114
 
115
 
116
+