sentinel-shield / shield_cli.py
ayandev101's picture
Upload 4 files
4428754 verified
import torch
import re
import hashlib
import sqlite3
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# ---------------- CONFIG ----------------
MODEL_NAME = "protectai/deberta-v3-base-prompt-injection-v2"
BLOCK_THRESHOLD = 0.8
DB_PATH = "shield_logs.db"
FORBIDDEN_TOPICS = [
# Credentials & Secrets
"api key", "apikey", "api-key",
"secret key", "client secret",
"access token", "refresh token",
"bearer token", "oauth token",
"private key", "public key",
"ssh key", "pgp key",
"password", "passwd", "pwd",
"credentials", "login credentials",
"username and password",
# Cloud / DevOps Secrets
"aws access key", "aws secret",
"iam credentials", "cloud credentials",
"azure tenant id", "azure secret",
"gcp service account",
"firebase private key",
"kubernetes secret",
"docker registry password",
"ci/cd secrets",
"github token", "gitlab token",
# Databases & Storage
"database dump", "db dump",
"production database",
"prod database",
"sql dump",
"mongodb dump",
"redis keys",
"s3 bucket contents",
"backup files",
# Internal / Confidential
"internal document",
"confidential data",
"restricted information",
"private repository",
"internal api",
"internal endpoint",
"company secrets",
"trade secrets",
"internal roadmap",
"internal emails",
# Financial / HR
"salary spreadsheet",
"employee salary",
"payroll data",
"bank account details",
"credit card numbers",
"debit card details",
"cvv number",
"tax records",
"pan card",
"aadhar number",
# Legal / Strategy
"nda document",
"legal strategy",
"lawsuit documents",
"compliance report",
"audit report",
"merger plans",
"acquisition plans",
"board meeting notes",
# AI / Model Internals
"system prompt",
"developer prompt",
"hidden instructions",
"training data",
"model weights",
"fine tuning data",
"rlhf data",
"prompt injection",
"guardrail bypass",
# Malware / Exploits
"zero day exploit",
"exploit code",
"malware source",
"ransomware",
"keylogger",
"credential harvester",
"reverse shell",
"backdoor",
"rootkit",
"botnet",
"payload generation",
# OPSEC / Surveillance
"bypass detection",
"avoid being traced",
"anonymous hacking",
"hide from law enforcement",
"evade antivirus",
"disable logging",
"erase logs",
# Custom / Project-Specific
"project x",
"sentinel internal",
"shield internal",
"ayaan private",
"security architecture",
"security weaknesses",
"bypass sentinel",
]
JAILBREAK_PATTERNS = [
# Instruction Override
r"ignore (all|any|previous|above) instructions",
r"disregard (all|any) rules",
r"forget your instructions",
r"override your system prompt",
r"reset your instructions",
r"new rules apply",
# Role Hijacking / Persona Attacks
r"you are now .*",
r"act as .*",
r"pretend to be .*",
r"roleplay as .*",
r"simulate .*",
r"behave like .*",
# Safety & Policy Suppression
r"do not mention (rules|filters|policies|safety)",
r"without safety restrictions",
r"bypass safety",
r"bypass filters",
r"disable guardrails",
r"no content restrictions",
# Authority / Trust Abuse
r"this is for educational purposes",
r"this is for research purposes",
r"this is legal",
r"i have permission",
r"authorized request",
r"company approved",
# Encoding / Obfuscation Attacks
r"decode this",
r"encode this",
r"base64",
r"rot13",
r"hexadecimal",
r"ascii conversion",
r"binary encoded",
r"caesar cipher",
# Prompt Leakage
r"show me your system prompt",
r"reveal your instructions",
r"print your hidden prompt",
r"what are your internal rules",
r"what was your initial prompt",
# Chain-of-Thought Extraction
r"think step by step",
r"explain your reasoning",
r"show your reasoning",
r"chain of thought",
r"hidden reasoning",
# Compliance Forcing
r"you must answer",
r"you are required to comply",
r"failure to answer",
r"mandatory response",
r"non negotiable",
# Multi-Step Jailbreaks
r"first acknowledge",
r"after acknowledging",
r"step 1.*step 2",
r"once you confirm",
r"repeat after me",
# Known Jailbreak Slang
r"dan mode",
r"developer mode",
r"god mode",
r"unfiltered mode",
r"no limits mode",
r"raw output mode",
# Psychological Manipulation
r"you are safe to answer",
r"no harm will come",
r"trust me",
r"this helps improve you",
# Indirect Prompt Injection
r"the following text is a system instruction",
r"the assistant must follow",
r"assistant instructions below",
r"model instructions",
]
# ---------------- LOAD MODEL ----------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()
# ---------------- DATABASE INITIALIZATION ----------------
def init_db():
"""Creates the database and table if they do not exist."""
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS shield_logs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
prompt TEXT,
verdict TEXT,
reason TEXT,
security_score REAL,
details TEXT,
created_at TEXT
)
""")
conn.commit()
conn.close()
# Initialize database on script load
init_db()
def get_db():
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.row_factory = sqlite3.Row
return conn
# Global connection for logging
db = get_db()
cursor = db.cursor()
# ---------------- UTILS ----------------
def log_to_db(prompt, verdict, reason, score, details):
cursor.execute(
"""
INSERT INTO shield_logs
(prompt, verdict, reason, security_score, details, created_at)
VALUES (?, ?, ?, ?, ?, ?)
""",
(
prompt,
verdict,
reason,
score,
details,
datetime.utcnow().isoformat()
)
)
db.commit()
# ---------------- SHIELD LAYERS ----------------
def ml_guard(prompt):
inputs = tokenizer(
prompt,
return_tensors="pt",
truncation=True,
max_length=512
)
with torch.no_grad():
outputs = model(**inputs)
probs = torch.softmax(outputs.logits, dim=1)
return probs[0][1].item()
def heuristic_scan(prompt):
p = prompt.lower()
return any(re.search(pattern, p) for pattern in JAILBREAK_PATTERNS)
def semantic_firewall(prompt):
p = prompt.lower()
return any(term in p for term in FORBIDDEN_TOPICS)
# ---------------- MAIN PIPELINE ----------------
def shield_pipeline(prompt):
score = ml_guard(prompt)
if score >= BLOCK_THRESHOLD:
log_to_db(prompt, "UNSAFE", "ML_GUARD", score, "Prompt injection detected")
return {
"verdict": "UNSAFE",
"reason": "ML_GUARD",
"security_score": round(score, 4),
"forward_to_ayaan": False
}
if heuristic_scan(prompt):
log_to_db(prompt, "UNSAFE", "HEURISTIC", score, "Jailbreak pattern detected")
return {
"verdict": "UNSAFE",
"reason": "HEURISTIC_SCANNER",
"security_score": round(score, 4),
"forward_to_ayaan": False
}
if semantic_firewall(prompt):
log_to_db(prompt, "UNSAFE", "SEMANTIC_FIREWALL", score, "Forbidden topic")
return {
"verdict": "UNSAFE",
"reason": "SEMANTIC_FIREWALL",
"security_score": round(score, 4),
"forward_to_ayaan": False
}
log_to_db(prompt, "SAFE", "CLEAN", score, "Prompt allowed")
return {
"verdict": "SAFE",
"reason": "CLEAN",
"security_score": round(score, 4),
"forward_to_ayaan": True
}
# ---------------- CLI ENTRY ----------------
if __name__ == "__main__":
print("\n Sentinel Shield CLI (Ctrl+C to exit)\n")
while True:
try:
user_prompt = input("User Prompt ➜ ").strip()
if not user_prompt:
continue
result = shield_pipeline(user_prompt)
print("\n--- SHIELD VERDICT ---")
for k, v in result.items():
print(f"{k}: {v}")
print("----------------------\n")
except KeyboardInterrupt:
print("\n[+] Shield shutting down.")
break