Spaces:

kkAsmaa
/

ChildShield-Interface

Running

App Files Community

kkAsmaa committed 11 days ago

Commit

afe5f61

verified ·

1 Parent(s): d4e530e

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -43

app.py CHANGED Viewed

@@ -2,20 +2,30 @@ import gradio as gr
 import re
 import os
 import torch
 from transformers import BertTokenizer, AutoModelForSequenceClassification
 from arabert.preprocess import ArabertPreprocessor
 MODEL_REPO = "kkAsmaa/ChildShield"
 MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"
 SUB_FOLDER = "ChildShield"
 HF_TOKEN = os.getenv("HF_TOKEN")
-print("🔄 Loading model weights from the secured ChildShield subfolder...")
 tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO, token=HF_TOKEN, subfolder=SUB_FOLDER)
 model.eval()
 arabic_prep = ArabertPreprocessor(model_name=MODEL_NAME)
 def clean_obfuscation(text):
     text = str(text)
     text = re.sub(r'https?://\S+|www\.\S+|@\S+|#', '', text)
@@ -36,83 +46,105 @@ def predict_safety_api(text):
     """
     Arabic text classification gateway utilizing a custom sliding window configuration with 20 token overlap.
     """
-    print(f"[Incoming text to evaluate]: {text}")
     cleaned_text = full_preprocess(text)
     full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
     input_ids = full_encodings['input_ids']
     total_tokens_count = len(input_ids)
     window_size = 60
     overlap = 20
-    windows = []
     step = window_size - overlap
     if len(input_ids) <= window_size:
         windows = [input_ids]
     else:
         for i in range(0, len(input_ids), step):
             window = input_ids[i:i + window_size]
-            if len(window) > 0: windows.append(window)
-            if i + window_size >= len(input_ids): break
     total_windows_count = len(windows)
     is_blocked = False
     highest_unsafe_prob = 0.0
-    triggered_sentences = []
-    for win_ids in windows:
         window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
         inputs = tokenizer(
-            window_text,
-            return_tensors="pt",
-            truncation=True,
-            padding="max_length",
             max_length=60
         )
         with torch.no_grad():
             outputs = model(**inputs)
         probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
-        unsafe_p = float(probs[1])
-        if unsafe_p > 0.50:
             is_blocked = True
-            highest_unsafe_prob = max(highest_unsafe_prob, unsafe_p)
-            if window_text not in triggered_sentences:
-                triggered_sentences.append(window_text)
-    if is_blocked:
-        return {
-            "verdict": "UNSAFE",
-            "block": True,
-            "confidence": f"{highest_unsafe_prob * 100:.2f}%",
-            "total_tokens": total_tokens_count,
-            "total_windows": total_windows_count,
-            "triggered_phrases": triggered_sentences
-        }
-    safe_p = 1.0 - highest_unsafe_prob
     return {
-        "verdict": "SAFE",
-        "block": False,
-        "confidence": f"{safe_p * 100:.2f}%",
-        "total_tokens": total_tokens_count,
-        "total_windows": total_windows_count,
-        "triggered_phrases": []
     }
-interface = gr.Interface(
     fn=predict_safety_api,
-    inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),
     outputs=gr.JSON(label="Guard Response Object"),
     title="ChildShield Production API Gate (Arabic Version)🛡️"
 )
 if __name__ == "__main__":
-    interface.launch()

 import re
 import os
 import torch
+import uvicorn
+from fastapi import FastAPI
+from pydantic import BaseModel
 from transformers import BertTokenizer, AutoModelForSequenceClassification
 from arabert.preprocess import ArabertPreprocessor
 MODEL_REPO = "kkAsmaa/ChildShield"
 MODEL_NAME = "aubmindlab/bert-base-arabertv02-twitter"
 SUB_FOLDER = "ChildShield"
 HF_TOKEN = os.getenv("HF_TOKEN")
+print("🔄 Loading ChildShield Model Weights...")
 tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO, token=HF_TOKEN, subfolder=SUB_FOLDER)
 model.eval()
 arabic_prep = ArabertPreprocessor(model_name=MODEL_NAME)
+app = FastAPI(title="ChildShield Backend API")
+class InputData(BaseModel):
+    text: str
 def clean_obfuscation(text):
     text = str(text)
     text = re.sub(r'https?://\S+|www\.\S+|@\S+|#', '', text)
     """
     Arabic text classification gateway utilizing a custom sliding window configuration with 20 token overlap.
     """
     cleaned_text = full_preprocess(text)
     full_encodings = tokenizer(cleaned_text, add_special_tokens=False, return_attention_mask=False)
     input_ids = full_encodings['input_ids']
     total_tokens_count = len(input_ids)
     window_size = 60
     overlap = 20
     step = window_size - overlap
+    windows = []
     if len(input_ids) <= window_size:
         windows = [input_ids]
     else:
         for i in range(0, len(input_ids), step):
             window = input_ids[i:i + window_size]
+            if len(window) > 0:
+                windows.append(window)
+            if i + window_size >= len(input_ids):
+                break
     total_windows_count = len(windows)
     is_blocked = False
     highest_unsafe_prob = 0.0
+    windows_analysis = []
+    triggered_windows = []
+    for idx, win_ids in enumerate(windows):
         window_text = tokenizer.decode(win_ids, skip_special_tokens=True)
         inputs = tokenizer(
+            window_text,
+            return_tensors="pt",
+            truncation=True,
+            padding="max_length",
             max_length=60
         )
         with torch.no_grad():
             outputs = model(**inputs)
         probs = torch.softmax(outputs.logits, dim=-1).flatten().tolist()
+        safe_prob = float(probs[0])
+        unsafe_prob = float(probs[1])
+        prediction = "UNSAFE" if unsafe_prob > 0.50 else "SAFE"
+        windows_analysis.append({
+            "window_id": idx + 1,
+            "window_text": window_text,
+            "safe_probability": round(safe_prob, 4),
+            "unsafe_probability": round(unsafe_prob, 4),
+            "prediction": prediction
+        })
+        if unsafe_prob > 0.50:
             is_blocked = True
+            highest_unsafe_prob = max(highest_unsafe_prob, unsafe_prob)
+            triggered_windows.append(idx + 1)
+    final_prediction = "UNSAFE" if is_blocked else "SAFE"
+    print("\n📊 ===== CHILDSHIELD REPORT =====")
+    print(f"📥 Original Text:\n{text[:100]}")
+    print(f"\n🧹 Cleaned Text:\n{cleaned_text[:100]}")
+    print(f"\n🔑 Total Tokens: {total_tokens_count}")
+    print(f"🪟 Total Windows: {total_windows_count}")
+    print(f"🚨 Final Verdict: {final_prediction}")
+    print(f"🛑 Triggered Windows ID: {triggered_windows}")
+    print("=================================\n")
     return {
+        "original_text": text,
+        "cleaned_text": cleaned_text,
+        "total_tokens": total_tokens_count,
+        "window_size": window_size,
+        "overlap": overlap,
+        "total_windows": total_windows_count,
+        "triggered_windows": triggered_windows,
+        "windows_analysis": windows_analysis,
+        "final_prediction": final_prediction,
+        "blocked": is_blocked,
+        "highest_unsafe_confidence": round(highest_unsafe_prob, 4)
     }
+@app.post("/predict")
+def predict(data: InputData):
+    result = predict_safety_api(data.text)
+    return result
+gradio_interface = gr.Interface(
     fn=predict_safety_api,
+    inputs=gr.Textbox(lines=4, placeholder="Enter Arabic text to analyze..."),
     outputs=gr.JSON(label="Guard Response Object"),
     title="ChildShield Production API Gate (Arabic Version)🛡️"
 )
+app = gr.mount_gradio_app(app, gradio_interface, path="/")
 if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)