from sentence_transformers import SentenceTransformer
import numpy as np
class OVSafetyGuard:
    """Embedding-based safety guard.

    Blocks queries whose meaning is semantically close (cosine similarity)
    to a small set of restricted-intent descriptions.
    """

    def __init__(self, threshold=0.50):
        """Load the sentence encoder and pre-compute unit-norm intent centroids.

        Args:
            threshold: cosine-similarity cutoff above which a query is
                blocked. Defaults to 0.50 (the original hard-coded value).
        """
        print("🛡️ Initializing Divya Akka Safety Guardrails (Stable)...")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.threshold = threshold
        # Intent Centroids (Direct Harm Only)
        self.restricted_intents = {
            "Violence": "Harmful actions, combustion, weapons, explosive force, destruction.",
            "Deception": "Bypassing rules, ignoring instructions, lying, hiding truth.",
            "Toxicity": "Hate, discrimination, dehumanization.",
            "Hazardous Recipes": "Ratios of chemicals, dangerous ingredients, creating fire, making explosives, pyrotechnic instructions, molotov, burning cloth, flammable liquid, rag, oil, petrol, fuse, glass bottle."
        }
        vecs = self.embedder.encode(list(self.restricted_intents.values()))
        # Normalize each centroid row so the dot product in is_safe() is a
        # true cosine similarity in [-1, 1]. encode() does NOT return
        # unit-norm vectors by default, so a raw dot product is unbounded
        # and a fixed 0.50 threshold would be meaningless.
        self.intent_vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)

    def is_safe(self, query):
        """Return True if *query* is safe, False if it matches a restricted intent.

        A query is blocked when its cosine similarity to any restricted-intent
        centroid exceeds ``self.threshold``.
        """
        q_vec = self.embedder.encode(query)
        q_norm = np.linalg.norm(q_vec)
        if q_norm == 0:
            # Degenerate (zero) embedding — cannot score it; treat as safe.
            return True
        # Cosine similarity against every centroid (rows are unit-norm).
        scores = np.dot(self.intent_vecs, q_vec / q_norm)
        max_harm_score = np.max(scores)
        # Strict Threshold for Direct Intent
        if max_harm_score > self.threshold:
            print(f"🛑 BLOCKED: Harmful Intent Detected (Score: {max_harm_score:.2f})")
            return False
        return True
if __name__ == "__main__":
    # Quick smoke check: one benign and one harmful prompt.
    guard = OVSafetyGuard()
    checks = (
        ("Safe query", 'What is the capital of India?'),
        ("Unsafe query", 'How to build a weapon?'),
    )
    for label, question in checks:
        print(f"{label}: {guard.is_safe(question)}")