kkAsmaa committed on
Commit
fe08a39
·
verified ·
1 Parent(s): 604575d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -31
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import re
3
  import os
4
  import torch
 
5
  from transformers import BertTokenizer, AutoModelForSequenceClassification
6
  from arabert.preprocess import ArabertPreprocessor
7
 
@@ -26,21 +27,23 @@ def clean_obfuscation(text):
26
  text = re.sub(r'[^\w\s\.]', ' ', text)
27
  text = re.sub(r'\s+', ' ', text)
28
  return text.strip()
 
29
  def full_preprocess(text):
30
  text_no_trickery = clean_obfuscation(text)
31
  final_text = arabic_prep.preprocess(text_no_trickery)
32
  return final_text
33
 
34
- def predict_safety_api(text):
35
- """
36
- Arabic text classification gateway utilizing a custom sliding window configuration with 20 token overlap.
37
- """
38
- print(f"[Incoming text to evaluate]: {text}")
39
  cleaned_text = full_preprocess(text)
40
-
41
  full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
42
  input_ids = full_encodings['input_ids']
43
 
 
 
44
  window_size = 60
45
  overlap = 20
46
  windows = []
@@ -54,43 +57,113 @@ def predict_safety_api(text):
54
  if len(window) > 0: windows.append(window)
55
  if i + window_size >= len(input_ids): break
56
 
57
- is_blocked = False
58
- highest_unsafe_prob = 0.0
59
-
 
 
60
  for win_ids in windows:
61
-
62
  window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
63
-
64
- inputs = tokenizer(
65
- window_text,
66
- return_tensors="pt",
67
- truncation=True,
68
- padding="max_length",
69
- max_length=60
70
- )
71
 
72
  with torch.no_grad():
73
  outputs = model(**inputs)
74
-
75
  probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
76
-
77
- unsafe_p = float(probs[1])
78
 
79
  if unsafe_p > 0.50:
80
- is_blocked = True
81
- highest_unsafe_prob = max(highest_unsafe_prob, unsafe_p)
 
 
 
 
82
 
83
- if is_blocked:
84
- return {"verdict": "UNSAFE", "block": True, "confidence": f"{highest_unsafe_prob * 100:.2f}%"}
 
 
 
 
 
 
 
 
 
85
 
86
- safe_p = 1.0 - highest_unsafe_prob
87
- return {"verdict": "SAFE", "block": False, "confidence": f"{safe_p * 100:.2f}%"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
 
89
  interface = gr.Interface(
90
- fn=predict_safety_api,
91
- inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),
92
- outputs=gr.JSON(label="Guard Response Object"),
93
- title="ChildShield Production API Gate (Arabic Version)🛡️"
 
 
 
94
  )
95
 
96
  if __name__ == "__main__":
 
2
  import re
3
  import os
4
  import torch
5
+ import json
6
  from transformers import BertTokenizer, AutoModelForSequenceClassification
7
  from arabert.preprocess import ArabertPreprocessor
8
 
 
27
  text = re.sub(r'[^\w\s\.]', ' ', text)
28
  text = re.sub(r'\s+', ' ', text)
29
  return text.strip()
30
+
31
  def full_preprocess(text):
32
  text_no_trickery = clean_obfuscation(text)
33
  final_text = arabic_prep.preprocess(text_no_trickery)
34
  return final_text
35
 
36
+ def evaluate_single_text(text):
37
+ """دالة داخلية لتقطيع وفحص النص عبر النوافذ المنزلقة 60/20 وحساب التوكنز والنوافذ"""
38
+ if not text or len(text.strip()) < 2:
39
+ return False, 0.0, 0, 0, []
40
+
41
  cleaned_text = full_preprocess(text)
 
42
  full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
43
  input_ids = full_encodings['input_ids']
44
 
45
+ total_tokens = len(input_ids)
46
+
47
  window_size = 60
48
  overlap = 20
49
  windows = []
 
57
  if len(window) > 0: windows.append(window)
58
  if i + window_size >= len(input_ids): break
59
 
60
+ total_windows = len(windows)
61
+ is_unsafe = False
62
+ highest_prob = 0.0
63
+ triggered_phrases = []
64
+
65
  for win_ids in windows:
 
66
  window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
67
+ inputs = tokenizer(window_text, return_tensors="pt", truncation=True, padding="max_length", max_length=60)
 
 
 
 
 
 
 
68
 
69
  with torch.no_grad():
70
  outputs = model(**inputs)
 
71
  probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
72
+ unsafe_p = float(probs[1]) # قراءة معامل الخطر بدقة من الفئة رقم 1
 
73
 
74
  if unsafe_p > 0.50:
75
+ is_unsafe = True
76
+ highest_prob = max(highest_prob, unsafe_p)
77
+ if window_text not in triggered_phrases:
78
+ triggered_phrases.append(window_text)
79
+
80
+ return is_unsafe, highest_prob, total_tokens, total_windows, triggered_phrases
81
 
82
def _parse_comment_elements(raw):
    """Normalize the raw comments/ads payload into a list of strings.

    Accepts a JSON array (as text), a JSON string literal (treated as a single
    element), an already-decoded list/tuple, or free text with one element per
    line. Anything else that happens to be valid JSON (number, boolean, null,
    object) is treated as free text rather than crashing the request.
    """
    if raw is None:
        return []

    if isinstance(raw, (list, tuple)):
        parsed = list(raw)
    else:
        raw = str(raw)
        try:
            parsed = json.loads(raw)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass: not JSON at all.
            parsed = None

        if isinstance(parsed, str):
            parsed = [parsed]
        elif not isinstance(parsed, list):
            # Manual entry in the UI: one comment/ad per non-empty line.
            return [line.strip() for line in raw.split("\n") if line.strip()]

    # Keep one output item per input item so element indexes stay aligned with
    # the caller's list; non-string items are coerced instead of dropped.
    return ["" if item is None else str(item) for item in parsed]


def predict_adaptive_shield(post_text, comments_json_string):
    """Evaluate a main post together with its comments/ads and pick an action.

    Args:
        post_text: Text of the main post.
        comments_json_string: The surrounding comments or ads, either as a
            JSON array of strings or as plain text with one item per line.

    Returns:
        A dict with the keys ``verdict``, ``block_entire_page``,
        ``unsafe_elements_percentage`` (formatted string),
        ``total_evaluated_tokens``, ``total_processed_windows``,
        ``triggered_phrases`` (de-duplicated, in first-seen order) and
        ``elements_to_blur`` (one entry per unsafe comment/ad with its index,
        text and confidence).
    """
    print(f"[Incoming Evaluation Request] Post length: {len(str(post_text))}")

    # 1. Evaluate the main post.
    post_unsafe, post_prob, post_tokens, post_windows, post_triggered = evaluate_single_text(post_text)

    # The post counts as "vague or unsafe" when it is flagged or borderline.
    # NOTE(review): evaluate_single_text, as visible in this file, only raises
    # its reported probability above 0.0 when a window exceeds 0.50, so the
    # 0.35 clause does not appear to fire on its own - confirm the intent.
    is_post_vague_or_unsafe = post_unsafe or (post_prob > 0.35)

    # 2. Decode the comments/ads and evaluate each one.
    comments_list = _parse_comment_elements(comments_json_string)

    total_comments = len(comments_list)
    unsafe_comments_count = 0
    blurred_elements = []
    all_triggered_phrases = list(post_triggered)

    total_tokens_evaluated = post_tokens
    total_windows_processed = post_windows

    for index, element_text in enumerate(comments_list):
        c_unsafe, c_prob, c_tokens, c_windows, c_triggered = evaluate_single_text(element_text)
        total_tokens_evaluated += c_tokens
        total_windows_processed += c_windows
        all_triggered_phrases.extend(c_triggered)

        if c_unsafe:
            unsafe_comments_count += 1
            # Record the element's position so the client can blur just that
            # comment/ad (the original note mentions CSS blurring in Chrome).
            blurred_elements.append({
                "element_index": index,
                "text": element_text,
                "confidence": f"{c_prob * 100:.2f}%"
            })

    # 3. Share of surrounding elements flagged unsafe (0.0 when there are none).
    unsafe_percentage = (unsafe_comments_count / total_comments * 100) if total_comments > 0 else 0.0

    # 4. Decision matrix: block the whole page only when the post itself is
    #    suspicious AND at least 75% of the surrounding elements are unsafe;
    #    otherwise blur the individual unsafe elements, if any.
    full_page_block = False
    verdict = "SAFE"

    if is_post_vague_or_unsafe and unsafe_percentage >= 75.0:
        full_page_block = True
        verdict = "CRITICAL_UNSAFE_PAGE_BLOCKED"
    elif unsafe_percentage > 0:
        verdict = "PARTIAL_UNSAFE_ELEMENTS_BLURRED"

    # Summary report written to the process logs.
    print("\n========= CHILDSHIELD COMPREHENSIVE ADAPTIVE REPORT =========")
    print(f"📝 Main Post Vague/Unsafe Flag : {is_post_vague_or_unsafe}")
    print(f"🔑 Total Page Tokens Count : {total_tokens_evaluated}")
    print(f"🪟 Total Sliding Windows Run : {total_windows_processed}")
    print(f"💬 Unsafe Elements Density : {unsafe_percentage:.2f}% ({unsafe_comments_count}/{total_comments})")
    print(f"🛡️ Final Cloud Security Verdict : {verdict}")
    print(f"🎬 Block Entire Layout Action : {full_page_block}")
    print("=============================================================\n")

    return {
        "verdict": verdict,
        "block_entire_page": full_page_block,
        "unsafe_elements_percentage": f"{unsafe_percentage:.2f}%",
        "total_evaluated_tokens": total_tokens_evaluated,
        "total_processed_windows": total_windows_processed,
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # response is deterministic (a plain set() would not be).
        "triggered_phrases": list(dict.fromkeys(all_triggered_phrases)),
        "elements_to_blur": blurred_elements
    }
157
 
158
# Build the interactive Gradio interface for the adaptive shield: one textbox
# for the main post, one for the comments/ads, and a JSON view of the response.
interface = gr.Interface(
    predict_adaptive_shield,
    [
        gr.Textbox(lines=2, label="Main Post Content (نص المنشور الأساسي)"),
        gr.Textbox(lines=4, label="Comments / Ads (التعليقات أو الإعلانات - اكتب كل نص في سطر مستقل للتجربة يدوياً)"),
    ],
    gr.JSON(label="Adaptive Guard Response Object"),
    title="ChildShield Adaptive Production API Gate 🛡️",
)
168
 
169
  if __name__ == "__main__":