Spaces:

absence404
/

securebot

Running

securebot / backend /classifier.py

Upload 6 files

2fd7279 verified 1 day ago

883 Bytes

	import re

	from transformers import pipeline

	# Module-level text-classification pipeline, built once at import time and
	# reused by every detect_injection() call. Per the transformers pipeline
	# docs, device=-1 selects CPU execution.
	classifier = pipeline(
	    task="text-classification",
	    model="protectai/deberta-v3-base-prompt-injection-v2",
	    device=-1,
	)

	# Phrases treated as prompt-injection indicators regardless of the model verdict.
	_SUSPICIOUS_KEYWORDS = (
	    "ignore previous instructions",
	    "hack the system",
	    "disregard your system prompt",
	    "you are now",
	    "forget everything",
	    "act as",
	    "jailbreak",
	)

	# Model confidence an "INJECTION" label must exceed to count as malicious.
	_INJECTION_THRESHOLD = 0.85


	def _keyword_regex(keyword: str) -> str:
	    """Return a regex source string matching *keyword* on word boundaries."""
	    words = keyword.split()
	    # Allow any run of whitespace (including newlines) between the words so
	    # that extra spacing cannot be used to slip past the rule.
	    body = r"\s+".join(re.escape(word) for word in words)
	    # Multi-word phrases must also end on a word boundary; a single-word
	    # keyword is matched as a word prefix so inflections ("jailbreaking")
	    # are still caught, as they were with plain substring matching.
	    trailing = r"\b" if len(words) > 1 else ""
	    return r"\b" + body + trailing


	# One combined pattern, compiled once instead of rebuilding the list per call.
	_SUSPICIOUS_PATTERN = re.compile(
	    "|".join(_keyword_regex(kw) for kw in _SUSPICIOUS_KEYWORDS)
	)


	def detect_injection(user_input: str) -> dict:
	    """Classify *user_input* as a possible prompt-injection attempt.

	    The verdict combines the top prediction of the module-level
	    ``classifier`` pipeline with a keyword rule. The input is malicious when
	    the predicted label is ``"INJECTION"`` with a score above 0.85, or when
	    any suspicious keyword is present.

	    Keywords are matched case-insensitively (against the lower-cased input)
	    and on word boundaries, so benign text that merely contains a keyword as
	    a substring (e.g. "impact assessment" vs. "act as") is not flagged.

	    Args:
	        user_input: The raw text to inspect.

	    Returns:
	        A dict with the keys ``input`` (the text as given), ``label`` and
	        ``confidence`` (the classifier's label and its score rounded to
	        three decimals), ``rule_triggered`` (whether a keyword matched) and
	        ``is_malicious`` (the combined verdict).
	    """
	    # NOTE(review): the input is passed to the pipeline untruncated; confirm
	    # how very long inputs should be handled for this model.
	    result = classifier(user_input)[0]
	    label = result["label"]
	    score = result["score"]

	    rule_triggered = _SUSPICIOUS_PATTERN.search(user_input.lower()) is not None
	    model_flagged = label == "INJECTION" and score > _INJECTION_THRESHOLD

	    return {
	        "input": user_input,
	        "label": label,
	        "confidence": round(score, 3),
	        "rule_triggered": rule_triggered,
	        "is_malicious": model_flagged or rule_triggered,
	    }