ayandev101 commited on
Commit
4428754
·
verified ·
1 Parent(s): 47fca27

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +21 -0
  2. main.py +81 -0
  3. requirements.txt +8 -0
  4. shield_cli.py +355 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Python base image
FROM python:3.9

# Create the non-root runtime user up front (Hugging Face Spaces convention:
# uid 1000, writable $HOME for caches).
RUN useradd -m -u 1000 user

# Work directory
WORKDIR /code

# Install dependencies first so this layer is cached across code changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy all code owned by the runtime user, and hand over /code itself.
# Fix: the original copied as root before USER, so the app (uid 1000) could
# not create shield_logs.db inside /code at runtime.
COPY --chown=user:user . /code
RUN chown user:user /code

# Drop privileges for the running container.
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Start command (HF Spaces expects port 7860)
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+
5
+ # Import your existing pipeline
6
+ from shield_cli import shield_pipeline
7
+
# ----------------------------------
# App Init
# ----------------------------------
app = FastAPI(title="Sentinel Shield API")

# CORS is currently wide open. NOTE(review): the original comment said
# "allow orchestrator only", but allow_origins=["*"] permits every origin —
# tighten to the orchestrator's host if that policy is intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
20
+
# ----------------------------------
# Request Schema (MATCH ORCHESTRATOR)
# ----------------------------------
class ShieldRequest(BaseModel):
    """Body of POST /shield; field names must stay in sync with the orchestrator."""

    # Raw user prompt to be screened by the shield pipeline.
    prompt: str
26
+
# ----------------------------------
# Shield Endpoint
# ----------------------------------
@app.post("/shield")
async def run_shield(request: ShieldRequest):
    """Run the prompt through shield_pipeline and return its verdict dict.

    Any pipeline failure is printed with a full traceback to the server
    terminal and surfaced to the caller as HTTP 500.
    """
    try:
        # Temporary debug log of the incoming prompt.
        print(f"DEBUG: Processing prompt: {request.prompt}")
        return shield_pipeline(request.prompt)
    except Exception as e:
        # Full stack trace in the terminal for diagnosis.
        import traceback
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"Shield failure: {str(e)}"
        )
# ----------------------------------
# Health Check
# ----------------------------------
@app.get("/health")
def health_check():
    """Liveness probe: reports service identity and the model it fronts."""
    # NOTE(review): model name is duplicated from shield_cli.MODEL_NAME —
    # keep the two strings in sync.
    payload = {
        "status": "online",
        "service": "shield",
        "model": "protectai/deberta-v3-base-prompt-injection-v2",
    }
    return payload
import sqlite3


@app.get("/logs")
async def get_logs():
    """Return the 50 most recent shield verdicts from the local SQLite log.

    On any database error (e.g. the table does not exist yet) a
    ``{"error": ...}`` dict is returned instead of raising, preserving the
    original best-effort contract.
    """
    conn = None
    try:
        conn = sqlite3.connect("shield_logs.db")
        conn.row_factory = sqlite3.Row  # access columns by name
        cursor = conn.cursor()
        # Fetch the last 50 logs, newest first.
        cursor.execute("SELECT * FROM shield_logs ORDER BY created_at DESC LIMIT 50")
        return [dict(row) for row in cursor.fetchall()]
    except Exception as e:
        return {"error": str(e)}
    finally:
        # Fix: the original leaked the connection whenever the query raised.
        if conn is not None:
            conn.close()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
torch>=2.1.0
transformers>=4.36.0
tokenizers>=0.15.0
safetensors>=0.4.0
pydantic>=2.6.0
sqlite-utils>=3.36
# DeBERTa-v3 tokenizers are SentencePiece-based; presumably needed for
# AutoTokenizer on protectai/deberta-v3-base-prompt-injection-v2 —
# TODO confirm whether the repo ships a fast tokenizer.json instead.
sentencepiece>=0.1.99
shield_cli.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import hashlib
import re
import sqlite3
from datetime import datetime, timezone

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
7
+
8
# ---------------- CONFIG ----------------

# Hugging Face model used by the ML layer to score prompt-injection likelihood.
MODEL_NAME = "protectai/deberta-v3-base-prompt-injection-v2"
# Prompts whose injection probability reaches this value are blocked outright.
BLOCK_THRESHOLD = 0.8
# SQLite file used for verdict logging (created on first import via init_db()).
DB_PATH = "shield_logs.db"

# Case-insensitive substrings: semantic_firewall() blocks any prompt that
# contains one of these (plain `in` test against the lowercased prompt).
FORBIDDEN_TOPICS = [

    # Credentials & Secrets
    "api key", "apikey", "api-key",
    "secret key", "client secret",
    "access token", "refresh token",
    "bearer token", "oauth token",
    "private key", "public key",
    "ssh key", "pgp key",
    "password", "passwd", "pwd",
    "credentials", "login credentials",
    "username and password",

    # Cloud / DevOps Secrets
    "aws access key", "aws secret",
    "iam credentials", "cloud credentials",
    "azure tenant id", "azure secret",
    "gcp service account",
    "firebase private key",
    "kubernetes secret",
    "docker registry password",
    "ci/cd secrets",
    "github token", "gitlab token",

    # Databases & Storage
    "database dump", "db dump",
    "production database",
    "prod database",
    "sql dump",
    "mongodb dump",
    "redis keys",
    "s3 bucket contents",
    "backup files",

    # Internal / Confidential
    "internal document",
    "confidential data",
    "restricted information",
    "private repository",
    "internal api",
    "internal endpoint",
    "company secrets",
    "trade secrets",
    "internal roadmap",
    "internal emails",

    # Financial / HR
    "salary spreadsheet",
    "employee salary",
    "payroll data",
    "bank account details",
    "credit card numbers",
    "debit card details",
    "cvv number",
    "tax records",
    "pan card",
    "aadhar number",

    # Legal / Strategy
    "nda document",
    "legal strategy",
    "lawsuit documents",
    "compliance report",
    "audit report",
    "merger plans",
    "acquisition plans",
    "board meeting notes",

    # AI / Model Internals
    "system prompt",
    "developer prompt",
    "hidden instructions",
    "training data",
    "model weights",
    "fine tuning data",
    "rlhf data",
    "prompt injection",
    "guardrail bypass",

    # Malware / Exploits
    "zero day exploit",
    "exploit code",
    "malware source",
    "ransomware",
    "keylogger",
    "credential harvester",
    "reverse shell",
    "backdoor",
    "rootkit",
    "botnet",
    "payload generation",

    # OPSEC / Surveillance
    "bypass detection",
    "avoid being traced",
    "anonymous hacking",
    "hide from law enforcement",
    "evade antivirus",
    "disable logging",
    "erase logs",

    # Custom / Project-Specific
    "project x",
    "sentinel internal",
    "shield internal",
    "ayaan private",
    "security architecture",
    "security weaknesses",
    "bypass sentinel",
]

# Regexes applied with re.search() to the lowercased prompt by
# heuristic_scan(); any hit marks the prompt as a jailbreak attempt.
# All patterns are lowercase to match the lowercased input.
JAILBREAK_PATTERNS = [

    # Instruction Override
    r"ignore (all|any|previous|above) instructions",
    r"disregard (all|any) rules",
    r"forget your instructions",
    r"override your system prompt",
    r"reset your instructions",
    r"new rules apply",

    # Role Hijacking / Persona Attacks
    # NOTE(review): the open-ended ".*" patterns below are broad and may
    # false-positive on benign prompts (e.g. "simulate a coin flip").
    r"you are now .*",
    r"act as .*",
    r"pretend to be .*",
    r"roleplay as .*",
    r"simulate .*",
    r"behave like .*",

    # Safety & Policy Suppression
    r"do not mention (rules|filters|policies|safety)",
    r"without safety restrictions",
    r"bypass safety",
    r"bypass filters",
    r"disable guardrails",
    r"no content restrictions",

    # Authority / Trust Abuse
    r"this is for educational purposes",
    r"this is for research purposes",
    r"this is legal",
    r"i have permission",
    r"authorized request",
    r"company approved",

    # Encoding / Obfuscation Attacks
    r"decode this",
    r"encode this",
    r"base64",
    r"rot13",
    r"hexadecimal",
    r"ascii conversion",
    r"binary encoded",
    r"caesar cipher",

    # Prompt Leakage
    r"show me your system prompt",
    r"reveal your instructions",
    r"print your hidden prompt",
    r"what are your internal rules",
    r"what was your initial prompt",

    # Chain-of-Thought Extraction
    r"think step by step",
    r"explain your reasoning",
    r"show your reasoning",
    r"chain of thought",
    r"hidden reasoning",

    # Compliance Forcing
    r"you must answer",
    r"you are required to comply",
    r"failure to answer",
    r"mandatory response",
    r"non negotiable",

    # Multi-Step Jailbreaks
    r"first acknowledge",
    r"after acknowledging",
    r"step 1.*step 2",
    r"once you confirm",
    r"repeat after me",

    # Known Jailbreak Slang
    r"dan mode",
    r"developer mode",
    r"god mode",
    r"unfiltered mode",
    r"no limits mode",
    r"raw output mode",

    # Psychological Manipulation
    r"you are safe to answer",
    r"no harm will come",
    r"trust me",
    r"this helps improve you",

    # Indirect Prompt Injection
    r"the following text is a system instruction",
    r"the assistant must follow",
    r"assistant instructions below",
    r"model instructions",
]
217
+
218
+
219
# ---------------- LOAD MODEL ----------------

# Load the classifier once at import time; .eval() returns the module itself,
# putting it in inference mode (disables dropout).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).eval()
224
+
225
# ---------------- DATABASE INITIALIZATION ----------------

def init_db():
    """Create the SQLite log database and the shield_logs table if missing.

    Fix: the connection is now released in a ``finally`` block, so it no
    longer leaks when table creation raises.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        # Connection.execute creates an implicit cursor for the statement.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS shield_logs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                prompt TEXT,
                verdict TEXT,
                reason TEXT,
                security_score REAL,
                details TEXT,
                created_at TEXT
            )
        """)
        conn.commit()
    finally:
        conn.close()
244
+
245
# Create the log database as soon as the module is imported so every shield
# layer can assume the table exists.
init_db()

def get_db():
    """Open the shared SQLite connection with name-based row access.

    NOTE(review): check_same_thread=False permits cross-thread use, but no
    lock guards this connection — confirm access stays single-threaded.
    """
    connection = sqlite3.connect(DB_PATH, check_same_thread=False)
    connection.row_factory = sqlite3.Row
    return connection

# Module-wide connection and cursor reused by log_to_db().
db = get_db()
cursor = db.cursor()
256
+
257
# ---------------- UTILS ----------------

def log_to_db(prompt, verdict, reason, score, details):
    """Append one shield decision to the shield_logs table.

    Writes through the module-level ``cursor``/``db``. NOTE(review): that
    connection is shared with check_same_thread=False and no lock — fine for
    a single-threaded event loop, verify before adding worker threads.
    """
    # Fix: datetime.utcnow() is deprecated (Python 3.12+). Take an aware UTC
    # time, then drop tzinfo so the stored ISO string keeps the exact format
    # the original rows used (no "+00:00" suffix).
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    cursor.execute(
        """
        INSERT INTO shield_logs
        (prompt, verdict, reason, security_score, details, created_at)
        VALUES (?, ?, ?, ?, ?, ?)
        """,
        (prompt, verdict, reason, score, details, created_at),
    )
    db.commit()
276
+
277
# ---------------- SHIELD LAYERS ----------------

def ml_guard(prompt):
    """Score *prompt* with the injection classifier.

    Returns the softmax probability of class index 1 — presumably the
    "injection" label for this model; confirm against the model card.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    # Inference only: no autograd graph needed.
    with torch.no_grad():
        logits = model(**encoded).logits
    return torch.softmax(logits, dim=1)[0][1].item()
290
+
291
def heuristic_scan(prompt):
    """Return True when any known jailbreak regex matches the lowercased prompt."""
    lowered = prompt.lower()
    for pattern in JAILBREAK_PATTERNS:
        if re.search(pattern, lowered):
            return True
    return False
294
+
295
def semantic_firewall(prompt):
    """Return True when the prompt contains any forbidden-topic substring (case-insensitive)."""
    lowered = prompt.lower()
    for topic in FORBIDDEN_TOPICS:
        if topic in lowered:
            return True
    return False
298
+
299
# ---------------- MAIN PIPELINE ----------------

def _blocked(prompt, reason, score, detail):
    """Log an UNSAFE verdict and build the matching response payload."""
    log_to_db(prompt, "UNSAFE", reason, score, detail)
    return {
        "verdict": "UNSAFE",
        "reason": reason,
        "security_score": round(score, 4),
        "forward_to_ayaan": False,
    }

def shield_pipeline(prompt):
    """Run the three shield layers over *prompt* and return a verdict dict.

    Layer order: ML injection classifier (thresholded), regex jailbreak
    scanner, forbidden-topic firewall. Every decision — safe or not — is
    logged to SQLite. The dict always carries verdict / reason /
    security_score / forward_to_ayaan.
    """
    score = ml_guard(prompt)

    if score >= BLOCK_THRESHOLD:
        return _blocked(prompt, "ML_GUARD", score, "Prompt injection detected")

    if heuristic_scan(prompt):
        # Fix: the original logged "HEURISTIC" to the DB but returned
        # "HEURISTIC_SCANNER" to callers; use one label so rows and
        # responses agree.
        return _blocked(prompt, "HEURISTIC_SCANNER", score,
                        "Jailbreak pattern detected")

    if semantic_firewall(prompt):
        return _blocked(prompt, "SEMANTIC_FIREWALL", score, "Forbidden topic")

    log_to_db(prompt, "SAFE", "CLEAN", score, "Prompt allowed")
    return {
        "verdict": "SAFE",
        "reason": "CLEAN",
        "security_score": round(score, 4),
        "forward_to_ayaan": True,
    }
338
+
339
# ---------------- CLI ENTRY ----------------

if __name__ == "__main__":
    print("\n Sentinel Shield CLI (Ctrl+C to exit)\n")
    while True:
        try:
            user_prompt = input("User Prompt ➜ ").strip()
            if not user_prompt:
                continue
            result = shield_pipeline(user_prompt)
            print("\n--- SHIELD VERDICT ---")
            for k, v in result.items():
                print(f"{k}: {v}")
            print("----------------------\n")
        except (KeyboardInterrupt, EOFError):
            # Fix: EOF on stdin (piped input or Ctrl+D) previously crashed
            # with an unhandled EOFError; treat it like Ctrl+C and exit.
            print("\n[+] Shield shutting down.")
            break