Spaces:

keoka
/

Guardianx-api

Runtime error

App Files Files Community

keoka committed on Mar 12

Commit

13dab33

1 Parent(s): 88c7d7c

إضافة جميع ملفات المشروع

Browse files

Files changed (9) hide show

Dockerfile +16 -0
ai_threat_analyzer.py +237 -0
app.py +69 -0
arabic_model/config.json +28 -0
arabic_model/model.safetensors +3 -0
arabic_model/tokenizer.json +0 -0
arabic_model/tokenizer_config.json +49 -0
guardian_model.pkl +3 -0
requirements.txt +11 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# Image for the GuardianX FastAPI service.
+FROM python:3.9-slim
+# Create an unprivileged account with UID 1000 and run every later step as it.
+RUN useradd -m -u 1000 user
+USER user
+# Put the user's ~/.local/bin on PATH (presumably where pip places console
+# scripts such as uvicorn when installing as a non-root user - TODO confirm).
+ENV PATH="/home/user/.local/bin:$PATH"
+# Keep the Hugging Face cache inside the user's home directory.
+ENV HOME=/home/user \
+    HF_HOME=/home/user/.cache/huggingface
+WORKDIR /app
+# Copy and install the dependency list before the rest of the sources so the
+# install layer can be reused when only application code changes.
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . .
+# Serve the FastAPI instance `app` from app.py on all interfaces, port 7860.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

ai_threat_analyzer.py ADDED Viewed

	@@ -0,0 +1,237 @@

+# =========================================
+# GuardianX AI Threat Analyzer
+# Version 2.0 - مع AraBERT (500MB)
+# =========================================
+import re
+import pickle
+import numpy as np
+import pandas as pd
+import os
+import torch
+from transformers import AutoTokenizer, AutoModel
+from sklearn.linear_model import LogisticRegression
+# ==============================
+# Project settings
+# ==============================
+# CSV read by train()/retrain(); the code accesses its "text" and "label" columns.
+DATA_FILE = "dataset.csv"
+# Pickle file holding the fitted scikit-learn classifier.
+MODEL_FILE = "guardian_model.pkl"
+ARABERT_PATH = "./arabic_model"  # local directory of the AraBERT model
+# Label name (as written in the CSV, compared lower-cased) -> integer class id.
+LABELS = {
+    "safe": 0,
+    "scam": 1,
+    "threat": 2
+}
+# ==============================
+# الكلاس الرئيسي
+# ==============================
+class ThreatAnalyzer:
+    """Classify a text as ``safe``, ``scam`` or ``threat``.
+
+    A text is embedded by mean-pooling the last hidden state of an AraBERT
+    encoder; the embedding is then classified by a scikit-learn
+    ``LogisticRegression`` model that is loaded from ``MODEL_FILE`` or, when
+    no saved model exists, trained from ``DATA_FILE``.
+    """
+    # Reverse lookup (class id -> label name), built once instead of on every
+    # predict() call.
+    _ID_TO_LABEL = {class_id: name for name, class_id in LABELS.items()}
+    # ---------------------------------
+    # INIT - with AraBERT
+    # ---------------------------------
+    def __init__(self):
+        """Load the encoder and the classifier, training the latter if needed.
+
+        The encoder is read from ``ARABERT_PATH`` when that path exists;
+        otherwise it is downloaded from the Hugging Face Hub and saved there.
+        """
+        print("="*50)
+        print("🚀 جاري تحميل نموذج AraBERT...")
+        print("="*50)
+        if not os.path.exists(ARABERT_PATH):
+            print("⚠ لم أجد النموذج المحفوظ. سيتم تحميله من الإنترنت (قد يستغرق دقائق)")
+            # No local copy: download from the Hugging Face Hub ...
+            model_name = "aubmindlab/bert-base-arabertv2"
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.bert_model = AutoModel.from_pretrained(model_name)
+            # ... and keep a copy on disk for later runs.
+            os.makedirs(ARABERT_PATH, exist_ok=True)
+            self.tokenizer.save_pretrained(ARABERT_PATH)
+            self.bert_model.save_pretrained(ARABERT_PATH)
+            print("✅ تم تحميل وحفظ النموذج محلياً")
+        else:
+            print(f"📂 تحميل النموذج من: {ARABERT_PATH}")
+            self.tokenizer = AutoTokenizer.from_pretrained(ARABERT_PATH)
+            self.bert_model = AutoModel.from_pretrained(ARABERT_PATH)
+            print("✅ تم تحميل النموذج المحلي بنجاح!")
+        # Run the encoder on the GPU when one is available.
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.bert_model.to(self.device)
+        print(f"💻 الجهاز المستخدم: {self.device}")
+        # Unfitted default; replaced below by the saved model or fitted by train().
+        self.classifier = LogisticRegression(max_iter=1000)
+        if os.path.exists(MODEL_FILE):
+            # NOTE(security): pickle.load can execute arbitrary code. Only ship
+            # a MODEL_FILE that comes from a trusted source.
+            with open(MODEL_FILE, "rb") as f:
+                self.classifier = pickle.load(f)
+                print("✅ تم تحميل نموذج التصنيف من ملف")
+        else:
+            print("🔄 لا يوجد نموذج تصنيف محفوظ. سيتم تدريب نموذج جديد...")
+            self.train()
+    # ---------------------------------
+    # Text -> vector using AraBERT
+    # ---------------------------------
+    def text_to_vector(self, text):
+        """Return the AraBERT embedding of *text* as a NumPy array.
+
+        The text is truncated to 128 tokens and the encoder's last hidden
+        state is averaged over the token axis (mean pooling). For a single
+        string the result is a 1-D vector (768 values for this model).
+        """
+        inputs = self.tokenizer(
+            text,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=128
+        ).to(self.device)
+        # Inference only: no gradients are needed.
+        with torch.no_grad():
+            outputs = self.bert_model(**inputs)
+        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
+        return embedding
+    # ---------------------------------
+    # Shared helpers for train() / retrain()
+    # ---------------------------------
+    def _build_training_set(self, df, progress_every=None):
+        """Embed every usable row of *df*.
+
+        A row is usable when its label (stripped, lower-cased) is a key of
+        ``LABELS`` and its text is not missing. When *progress_every* is set,
+        a progress line is printed every that many rows.
+
+        Returns:
+            ``(X, y, skipped)``: the embedding matrix, the class ids and the
+            number of rows that were skipped.
+
+        Raises:
+            ValueError: if no usable row is left.
+        """
+        features = []
+        targets = []
+        skipped = 0
+        total = len(df)
+        for idx, (text, label) in enumerate(zip(df["text"], df["label"])):
+            if progress_every and idx % progress_every == 0:
+                print(f"⏳ معالجة الجملة {idx}/{total}")
+            label = str(label).strip().lower()
+            # pandas reads an empty cell as NaN, which the tokenizer rejects.
+            if label not in LABELS or pd.isna(text):
+                skipped += 1
+                continue
+            # str() guards against cells that pandas parsed as numbers.
+            features.append(self.text_to_vector(str(text)))
+            targets.append(LABELS[label])
+        if not targets:
+            raise ValueError(
+                "No usable training rows: every row has an unknown label "
+                "or a missing text."
+            )
+        return np.array(features), np.array(targets), skipped
+    def _save_classifier(self):
+        """Pickle the current classifier to ``MODEL_FILE``."""
+        with open(MODEL_FILE, "wb") as f:
+            pickle.dump(self.classifier, f)
+    # ---------------------------------
+    # Training
+    # ---------------------------------
+    def train(self):
+        """Fit the classifier on ``DATA_FILE`` and save it to ``MODEL_FILE``.
+
+        Raises:
+            ValueError: if the CSV contains no usable row.
+        """
+        print("📚 جاري قراءة بيانات التدريب...")
+        df = pd.read_csv(DATA_FILE)
+        X, y, skipped = self._build_training_set(df, progress_every=100)
+        print(f"\n📊 إجمالي الجمل: {len(df)}")
+        print(f"📊 جمل مستخدمة في التدريب: {len(y)}")
+        print(f"⚠ جمل تم تخطيها: {skipped}")
+        print(f"📊 شكل مصفوفة التدريب: {X.shape}")
+        print("\n🧠 جاري تدريب النموذج...")
+        self.classifier.fit(X, y)
+        print("✅ تم التدريب بنجاح!")
+        self._save_classifier()
+        print("💾 تم حفظ النموذج في ملف")
+    # ---------------------------------
+    # Prediction
+    # ---------------------------------
+    def predict(self, text):
+        """Return the predicted label name for *text*."""
+        vec = self.text_to_vector(text).reshape(1, -1)
+        pred = self.classifier.predict(vec)[0]
+        # int() turns the NumPy scalar into a plain dict key.
+        return self._ID_TO_LABEL[int(pred)]
+    # ---------------------------------
+    # Retraining (optional)
+    # ---------------------------------
+    def retrain(self, new_file=None):
+        """Refit the classifier on all data and save it.
+
+        Args:
+            new_file: optional path of an extra CSV. Its rows are appended to
+                the current data and the merged table is written back to
+                ``DATA_FILE`` before training.
+
+        Raises:
+            ValueError: if the data contains no usable row.
+        """
+        print("\n🔄 جاري إعادة التدريب...")
+        df = pd.read_csv(DATA_FILE)
+        if new_file:
+            df_new = pd.read_csv(new_file)
+            df = pd.concat([df, df_new], ignore_index=True)
+            df.to_csv(DATA_FILE, index=False)
+            print(f"📁 تم دمج ملف {new_file}")
+        X, y, _ = self._build_training_set(df)
+        self.classifier.fit(X, y)
+        self._save_classifier()
+        print("✅ تم إعادة التدريب!")
+        print(f"📊 الجمل المستخدمة: {len(y)}")
+# ==============================
+# Quick manual check
+# ==============================
+if __name__ == "__main__":
+    # Build the analyzer once, then print the predicted label of a few
+    # sample sentences.
+    analyzer = ThreatAnalyzer()
+    samples = (
+        "مرحبا كيف حالك",
+        "ارسل المال والا بفضحك",
+        "فزت بجائزة اضغط الرابط",
+        "هات الرقم السري الآن",
+        "وينك يا صاحبي",
+    )
+    print("\n🔍 اختبار التنبؤ:")
+    for sentence in samples:
+        label = analyzer.predict(sentence)
+        print(f"  • {sentence} ➡ {label}")

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from fastapi import FastAPI, Header, HTTPException, Request
+from pydantic import BaseModel
+from ai_threat_analyzer import ThreatAnalyzer
+import os
+import time
+import logging
+# Optional dependency: use the real rate limiter when it is installed.
+try:
+    from fastapi_advanced_rate_limiter import SlidingWindowRateLimiter
+    RATE_LIMITER_AVAILABLE = True
+except ImportError:
+    RATE_LIMITER_AVAILABLE = False
+    # Stand-in with the same three methods so the module-level
+    # `SlidingWindowRateLimiter(...)` call still works; it accepts any
+    # constructor arguments, allows every request and reports no wait time.
+    class SlidingWindowRateLimiter:
+        def __init__(self, *args, **kwargs): pass
+        def allow_request(self, client_id): return True
+        def get_wait_time(self, client_id): return 0
+# Logging setup: INFO and above go to both the file api.log and the console.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[logging.FileHandler('api.log'), logging.StreamHandler()]
+)
+logger = logging.getLogger("guardianx")
+app = FastAPI()
+# Read the API key from the environment.
+# NOTE(security): when API_KEY is unset the service falls back to the
+# hard-coded value below, which is visible to anyone who can read this file.
+API_KEY = os.getenv("API_KEY", "guardian123")
+# Rate limiter setup.
+# NOTE(review): presumably 10 requests per 60 seconds per client - confirm
+# against the fastapi_advanced_rate_limiter documentation.
+limiter = SlidingWindowRateLimiter(capacity=10, fill_rate=10/60, scope="user", backend="memory")
+# Created at import time, so the models are loaded (or trained) before the
+# app starts serving requests.
+analyzer = ThreatAnalyzer()
+# Request body of POST /predict: a JSON object with one string field, "text".
+class TextRequest(BaseModel):
+    text: str
+# GET /: returns a fixed status message; no API key is checked here.
+@app.get("/")
+def home():
+    return {"message": "GuardianX API is running"}
+# POST /predict: classify `data.text` and return {"result": <label>}.
+#
+# The caller must send the API key in the X-API-Key header.
+# Responses: 401 when the header is missing, 403 when the key is wrong,
+# 429 when the rate limiter rejects the request, 500 when inference fails.
+#
+# NOTE: analyzer.predict() is a blocking call inside an async handler, so
+# requests are processed one at a time.
+@app.post("/predict")
+async def predict(request: Request, data: TextRequest, x_api_key: str = Header(None)):
+    # Local import keeps this handler self-contained.
+    import secrets
+    if not x_api_key:
+        logger.warning("طلب بدون مفتاح API")
+        raise HTTPException(status_code=401, detail="API Key مفقود")
+    # Constant-time comparison so response timing does not reveal how much of
+    # the key matched. Compare bytes: compare_digest() raises TypeError for
+    # non-ASCII str, and the header value is client-controlled.
+    if not secrets.compare_digest(x_api_key.encode("utf-8"), API_KEY.encode("utf-8")):
+        # The submitted key is deliberately not logged, not even a prefix.
+        logger.warning("محاولة بمفتاح غير صالح")
+        raise HTTPException(status_code=403, detail="مفتاح غير صالح")
+    if RATE_LIMITER_AVAILABLE:
+        client_id = x_api_key
+        if not limiter.allow_request(client_id):
+            wait_time = limiter.get_wait_time(client_id)
+            logger.warning("كثرة طلبات من المستخدم")
+            raise HTTPException(status_code=429, detail=f"عدد الطلبات كبير جداً. حاول بعد {wait_time:.0f} ثانية")
+    start_time = time.time()
+    try:
+        result = analyzer.predict(data.text)
+    except Exception as exc:
+        # logger.exception keeps the traceback; the client only sees a
+        # generic message.
+        logger.exception("خطأ في التحليل: %s", exc)
+        raise HTTPException(status_code=500, detail="خطأ داخلي في السيرفر") from exc
+    processing_time = time.time() - start_time
+    # The valid key authenticated this request, so no part of it is written
+    # to the log file.
+    logger.info("نص: %s... | نتيجة: %s | وقت: %.2fث", data.text[:30], result, processing_time)
+    return {"result": result}

arabic_model/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "add_cross_attention": false,
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": null,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "is_decoder": false,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.3.0",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 64000
+}

arabic_model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0cc902dbaa87d7cf826caa2b97bf4fc3749434258e0094fb60b256088238ad3
+size 540795728

arabic_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

arabic_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,49 @@

+{
+  "backend": "tokenizers",
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "is_local": false,
+  "mask_token": "[MASK]",
+  "max_len": 512,
+  "model_max_length": 512,
+  "never_split": [
+    "+ك",
+    "+كما",
+    "ك+",
+    "+وا",
+    "+ين",
+    "و+",
+    "+كن",
+    "+ان",
+    "+هم",
+    "+ة",
+    "[بريد]",
+    "لل+",
+    "+ي",
+    "+ت",
+    "+ن",
+    "س+",
+    "ل+",
+    "[مستخدم]",
+    "+كم",
+    "+ا",
+    "ب+",
+    "ف+",
+    "+نا",
+    "+ها",
+    "+ون",
+    "+هما",
+    "ال+",
+    "+ه",
+    "+هن",
+    "+ات",
+    "[رابط]"
+  ],
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

guardian_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0931346d7e109f9e38356e2f5e86fdc7455aaa68d56912119dd815433a8f22f5
+size 19177

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+fastapi
+uvicorn[standard]
+gensim
+numpy<2.0.0
+scikit-learn
+joblib
+python-multipart
+requests
+pandas
+redis
+gdown
+# Imported by ai_threat_analyzer.py (AraBERT encoder) but previously not
+# listed, so importing the app would fail with ModuleNotFoundError.
+torch
+transformers