| from sentence_transformers import SentenceTransformer |
| import numpy as np |
|
|
class OVSafetyGuard:
    """Embedding-similarity guardrail that blocks queries resembling restricted intents.

    Each restricted intent is described in natural language and pre-encoded once;
    an incoming query is blocked when its cosine similarity to any intent
    description exceeds ``threshold``.
    """

    def __init__(self, threshold=0.50):
        """Load the sentence embedder and pre-encode the restricted-intent texts.

        Args:
            threshold: Cosine-similarity cutoff in [-1, 1]; a query scoring
                above this against any restricted intent is blocked.
                Defaults to 0.50 (the previously hard-coded value).
        """
        print("🛡️ Initializing Divya Akka Safety Guardrails (Stable)...")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.threshold = threshold

        # Natural-language descriptions of content to block. Encoded once here
        # so each is_safe() call only has to embed the incoming query.
        self.restricted_intents = {
            "Violence": "Harmful actions, combustion, weapons, explosive force, destruction.",
            "Deception": "Bypassing rules, ignoring instructions, lying, hiding truth.",
            "Toxicity": "Hate, discrimination, dehumanization.",
            "Hazardous Recipes": "Ratios of chemicals, dangerous ingredients, creating fire, making explosives, pyrotechnic instructions, molotov, burning cloth, flammable liquid, rag, oil, petrol, fuse, glass bottle."
        }
        self.intent_vecs = self.embedder.encode(list(self.restricted_intents.values()))

    def is_safe(self, query):
        """Return True if *query* does not match any restricted intent.

        Scores with cosine similarity. (The original code used a raw dot
        product, but ``SentenceTransformer.encode`` does not L2-normalize
        embeddings by default, so dot products conflate vector magnitude
        with semantic similarity and the fixed threshold was unreliable.)

        Args:
            query: The user query string to screen.

        Returns:
            True when the maximum cosine similarity to every restricted
            intent is at or below the threshold; False when blocked.
        """
        # Empty/whitespace-only input carries no intent to evaluate.
        if not query or not query.strip():
            return True

        q_vec = np.asarray(self.embedder.encode(query), dtype=float)
        intents = np.asarray(self.intent_vecs, dtype=float)

        # Cosine similarity: normalize both sides, guarding against
        # zero-norm vectors to avoid division by zero.
        denom = np.linalg.norm(intents, axis=1) * np.linalg.norm(q_vec)
        denom = np.where(denom == 0, 1.0, denom)
        scores = (intents @ q_vec) / denom
        max_harm_score = float(np.max(scores))

        if max_harm_score > self.threshold:
            print(f"🛑 BLOCKED: Harmful Intent Detected (Score: {max_harm_score:.2f})")
            return False

        return True
|
|
if __name__ == "__main__":
    # Smoke-test the guardrail on one benign and one harmful query.
    safety_guard = OVSafetyGuard()
    for label, question in (
        ("Safe query", "What is the capital of India?"),
        ("Unsafe query", "How to build a weapon?"),
    ):
        print(f"{label}: {safety_guard.is_safe(question)}")