Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,6 +2,7 @@ import gradio as gr
|
|
| 2 |
import re
|
| 3 |
import os
|
| 4 |
import torch
|
|
|
|
| 5 |
from transformers import BertTokenizer, AutoModelForSequenceClassification
|
| 6 |
from arabert.preprocess import ArabertPreprocessor
|
| 7 |
|
|
@@ -26,21 +27,23 @@ def clean_obfuscation(text):
|
|
| 26 |
text = re.sub(r'[^\w\s\.]', ' ', text)
|
| 27 |
text = re.sub(r'\s+', ' ', text)
|
| 28 |
return text.strip()
|
|
|
|
| 29 |
def full_preprocess(text):
    """Return *text* after obfuscation cleanup and Arabic preprocessing.

    The input is first passed through ``clean_obfuscation`` and the result
    is then handed to ``arabic_prep.preprocess``.
    """
    # NOTE(review): arabic_prep is presumably the module-level
    # ArabertPreprocessor instance — confirm against the file header.
    return arabic_prep.preprocess(clean_obfuscation(text))
|
| 33 |
|
| 34 |
-
def
|
| 35 |
-
"""
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
cleaned_text = full_preprocess(text)
|
| 40 |
-
|
| 41 |
full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
|
| 42 |
input_ids = full_encodings['input_ids']
|
| 43 |
|
|
|
|
|
|
|
| 44 |
window_size = 60
|
| 45 |
overlap = 20
|
| 46 |
windows = []
|
|
@@ -54,43 +57,113 @@ def predict_safety_api(text):
|
|
| 54 |
if len(window) > 0: windows.append(window)
|
| 55 |
if i + window_size >= len(input_ids): break
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
| 60 |
for win_ids in windows:
|
| 61 |
-
|
| 62 |
window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
|
| 63 |
-
|
| 64 |
-
inputs = tokenizer(
|
| 65 |
-
window_text,
|
| 66 |
-
return_tensors="pt",
|
| 67 |
-
truncation=True,
|
| 68 |
-
padding="max_length",
|
| 69 |
-
max_length=60
|
| 70 |
-
)
|
| 71 |
|
| 72 |
with torch.no_grad():
|
| 73 |
outputs = model(**inputs)
|
| 74 |
-
|
| 75 |
probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
|
| 76 |
-
|
| 77 |
-
unsafe_p = float(probs[1])
|
| 78 |
|
| 79 |
if unsafe_p > 0.50:
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
|
|
|
| 89 |
interface = gr.Interface(
|
| 90 |
-
fn=
|
| 91 |
-
inputs=
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
| 94 |
)
|
| 95 |
|
| 96 |
if __name__ == "__main__":
|
|
|
|
| 2 |
import re
|
| 3 |
import os
|
| 4 |
import torch
|
| 5 |
+
import json
|
| 6 |
from transformers import BertTokenizer, AutoModelForSequenceClassification
|
| 7 |
from arabert.preprocess import ArabertPreprocessor
|
| 8 |
|
|
|
|
| 27 |
text = re.sub(r'[^\w\s\.]', ' ', text)
|
| 28 |
text = re.sub(r'\s+', ' ', text)
|
| 29 |
return text.strip()
|
| 30 |
+
|
| 31 |
def full_preprocess(text):
    """Return *text* after obfuscation cleanup and Arabic preprocessing.

    The input is first passed through ``clean_obfuscation`` and the result
    is then handed to ``arabic_prep.preprocess``.
    """
    # NOTE(review): arabic_prep is presumably the module-level
    # ArabertPreprocessor instance — confirm against the file header.
    return arabic_prep.preprocess(clean_obfuscation(text))
|
| 35 |
|
| 36 |
+
def evaluate_single_text(text):
|
| 37 |
+
"""دالة داخلية لتقطيع وفحص النص عبر النوافذ المنزلقة 60/20 وحساب التوكنز والنوافذ"""
|
| 38 |
+
if not text or len(text.strip()) < 2:
|
| 39 |
+
return False, 0.0, 0, 0, []
|
| 40 |
+
|
| 41 |
cleaned_text = full_preprocess(text)
|
|
|
|
| 42 |
full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
|
| 43 |
input_ids = full_encodings['input_ids']
|
| 44 |
|
| 45 |
+
total_tokens = len(input_ids)
|
| 46 |
+
|
| 47 |
window_size = 60
|
| 48 |
overlap = 20
|
| 49 |
windows = []
|
|
|
|
| 57 |
if len(window) > 0: windows.append(window)
|
| 58 |
if i + window_size >= len(input_ids): break
|
| 59 |
|
| 60 |
+
total_windows = len(windows)
|
| 61 |
+
is_unsafe = False
|
| 62 |
+
highest_prob = 0.0
|
| 63 |
+
triggered_phrases = []
|
| 64 |
+
|
| 65 |
for win_ids in windows:
|
|
|
|
| 66 |
window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
|
| 67 |
+
inputs = tokenizer(window_text, return_tensors="pt", truncation=True, padding="max_length", max_length=60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
with torch.no_grad():
|
| 70 |
outputs = model(**inputs)
|
|
|
|
| 71 |
probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
|
| 72 |
+
unsafe_p = float(probs[1]) # قراءة معامل الخطر بدقة من الفئة رقم 1
|
|
|
|
| 73 |
|
| 74 |
if unsafe_p > 0.50:
|
| 75 |
+
is_unsafe = True
|
| 76 |
+
highest_prob = max(highest_prob, unsafe_p)
|
| 77 |
+
if window_text not in triggered_phrases:
|
| 78 |
+
triggered_phrases.append(window_text)
|
| 79 |
+
|
| 80 |
+
return is_unsafe, highest_prob, total_tokens, total_windows, triggered_phrases
|
| 81 |
|
| 82 |
+
def _parse_comment_elements(raw):
    """Normalise the raw comments payload into a list of strings.

    Accepted shapes:
      * ``None``                      -> empty list
      * a list/tuple                  -> used as-is
      * a JSON array string           -> its elements
      * a JSON string literal         -> a single element
      * anything else (plain text, or JSON that is a number/object/bool)
                                      -> one element per non-blank line

    List positions are preserved (``None`` becomes ``""`` rather than being
    dropped) so that indices reported to the caller still line up with the
    elements the caller sent.
    """
    if raw is None:
        return []

    if isinstance(raw, (list, tuple)):
        items = list(raw)
    else:
        raw = str(raw)
        try:
            parsed = json.loads(raw)
        except (ValueError, RecursionError):
            # Not JSON (json.JSONDecodeError is a ValueError) or absurdly
            # nested input: treat it as hand-typed text, as before.
            parsed = None

        if isinstance(parsed, list):
            items = parsed
        elif isinstance(parsed, str):
            items = [parsed]
        else:
            # A bare number/object/bool/null is valid JSON but not a list of
            # comments; fall back to the manual one-comment-per-line format.
            return [line.strip() for line in raw.split("\n") if line.strip()]

    return ["" if item is None else str(item) for item in items]


def predict_adaptive_shield(post_text, comments_json_string):
    """Evaluate a main post together with its surrounding comments/ads.

    Args:
        post_text: Text of the main post.
        comments_json_string: Either a JSON array of comment/ad strings, or
            plain text with one comment per line (convenient when typing
            into the Gradio textbox by hand).

    Returns:
        A dict with the keys ``verdict``, ``block_entire_page``,
        ``unsafe_elements_percentage``, ``total_evaluated_tokens``,
        ``total_processed_windows``, ``triggered_phrases`` and
        ``elements_to_blur``.

    Decision rule: the whole page is blocked when the post itself is flagged
    AND at least 75% of the surrounding elements are unsafe; otherwise, if
    any element is unsafe, only those elements are listed for blurring.
    """
    print(f"[Incoming Evaluation Request] Post length: {len(str(post_text))}")

    # 1. Evaluate the main post.
    post_unsafe, post_prob, post_tokens, post_windows, post_triggered = evaluate_single_text(post_text)

    # Post clarity flag: an unsafe post, or one with a borderline score, is
    # treated as vague/unsafe.
    # NOTE(review): the 0.35 threshold only adds anything if
    # evaluate_single_text reports probabilities for windows that stay below
    # its own 0.50 cutoff — verify against that function.
    is_post_vague_or_unsafe = post_unsafe or (post_prob > 0.35)

    # 2. Decode the comments/ads (JSON array or manually typed lines).
    comments_list = _parse_comment_elements(comments_json_string)

    total_comments = len(comments_list)
    unsafe_comments_count = 0
    blurred_elements = []
    all_triggered_phrases = list(post_triggered)

    total_tokens_evaluated = post_tokens
    total_windows_processed = post_windows

    for index, element_text in enumerate(comments_list):
        c_unsafe, c_prob, c_tokens, c_windows, c_triggered = evaluate_single_text(element_text)
        total_tokens_evaluated += c_tokens
        total_windows_processed += c_windows
        all_triggered_phrases.extend(c_triggered)

        if c_unsafe:
            unsafe_comments_count += 1
            # Remember the position of the unsafe comment/ad so the client
            # can blur exactly that element.
            blurred_elements.append({
                "element_index": index,
                "text": element_text,
                "confidence": f"{c_prob * 100:.2f}%"
            })

    # 3. Share of unsafe elements in the surrounding context (75% rule).
    unsafe_percentage = (unsafe_comments_count / total_comments * 100) if total_comments > 0 else 0.0

    # 4. Adaptive decision matrix.
    # NOTE(review): a flagged post with no unsafe comments still yields
    # "SAFE" here — confirm this is the intended policy.
    full_page_block = False
    verdict = "SAFE"

    if is_post_vague_or_unsafe and unsafe_percentage >= 75.0:
        full_page_block = True
        verdict = "CRITICAL_UNSAFE_PAGE_BLOCKED"
    elif unsafe_percentage > 0:
        verdict = "PARTIAL_UNSAFE_ELEMENTS_BLURRED"

    # Monitoring report printed to the logs.
    print("\n========= CHILDSHIELD COMPREHENSIVE ADAPTIVE REPORT =========")
    print(f"📝 Main Post Vague/Unsafe Flag : {is_post_vague_or_unsafe}")
    print(f"🔑 Total Page Tokens Count : {total_tokens_evaluated}")
    print(f"🪟 Total Sliding Windows Run : {total_windows_processed}")
    print(f"💬 Unsafe Elements Density : {unsafe_percentage:.2f}% ({unsafe_comments_count}/{total_comments})")
    print(f"🛡️ Final Cloud Security Verdict : {verdict}")
    print(f"🎬 Block Entire Layout Action : {full_page_block}")
    print("=============================================================\n")

    return {
        "verdict": verdict,
        "block_entire_page": full_page_block,
        "unsafe_elements_percentage": f"{unsafe_percentage:.2f}%",
        "total_evaluated_tokens": total_tokens_evaluated,
        "total_processed_windows": total_windows_processed,
        # De-duplicated sub-phrases that triggered the verdict, kept in
        # first-seen order so responses are deterministic.
        "triggered_phrases": list(dict.fromkeys(all_triggered_phrases)),
        # Elements the client should blur.
        "elements_to_blur": blurred_elements
    }
|
| 157 |
|
| 158 |
+
# Build the full interactive demo interface, matching what was presented in the academic discussion
|
| 159 |
# Input widgets, listed in the positional order of predict_adaptive_shield's
# parameters (post text first, then the comments/ads payload).
_shield_inputs = [
    gr.Textbox(lines=2, label="Main Post Content (نص المنشور الأساسي)"),
    gr.Textbox(lines=4, label="Comments / Ads (التعليقات أو الإعلانات - اكتب كل نص في سطر مستقل للتجربة يدوياً)"),
]

# The handler's dict result is rendered through a JSON output component.
interface = gr.Interface(
    fn=predict_adaptive_shield,
    inputs=_shield_inputs,
    outputs=gr.JSON(label="Adaptive Guard Response Object"),
    title="ChildShield Adaptive Production API Gate 🛡️",
)
|
| 168 |
|
| 169 |
if __name__ == "__main__":
|