File size: 1,528 Bytes
7f69ad8
 
 
 
 
0a510e1
7f69ad8
 
0a510e1
28a6cef
 
 
 
5bae987
28a6cef
 
7f69ad8
 
 
 
28a6cef
5bae987
7f69ad8
0a510e1
 
 
5bae987
 
7f69ad8
 
 
 
 
0a510e1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from sentence_transformers import SentenceTransformer
import numpy as np

class OVSafetyGuard:
    """Embedding-similarity safety filter.

    Encodes a fixed set of restricted-intent descriptions once at startup,
    then blocks any query whose sentence embedding scores above a
    similarity threshold against the closest restricted intent.
    """

    def __init__(self, threshold=0.50):
        """Load the sentence encoder and pre-compute intent vectors.

        Args:
            threshold: similarity score above which a query is blocked.
                Defaults to 0.50 (the previous hard-coded value), so
                existing callers are unaffected.
        """
        print("🛡️ Initializing Divya Akka Safety Guardrails (Stable)...")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        # Strict threshold for direct harmful intent; now configurable
        # instead of hard-coded inside is_safe().
        self.threshold = threshold

        # Intent Centroids (Direct Harm Only)
        self.restricted_intents = {
            "Violence": "Harmful actions, combustion, weapons, explosive force, destruction.",
            "Deception": "Bypassing rules, ignoring instructions, lying, hiding truth.",
            "Toxicity": "Hate, discrimination, dehumanization.",
            "Hazardous Recipes": "Ratios of chemicals, dangerous ingredients, creating fire, making explosives, pyrotechnic instructions, molotov, burning cloth, flammable liquid, rag, oil, petrol, fuse, glass bottle."
        }
        # One embedding row per restricted intent, in dict-value order.
        self.intent_vecs = self.embedder.encode(list(self.restricted_intents.values()))

    def is_safe(self, query):
        """Return True if the query is allowed, False if it is blocked.

        Prints a diagnostic line when a query is blocked.
        """
        q_vec = self.embedder.encode(query)

        # NOTE(review): plain dot product is only a cosine similarity if the
        # model emits unit-normalized embeddings. all-MiniLM-L6-v2 includes a
        # Normalize output module, so this holds here — but passing
        # normalize_embeddings=True to encode() would make it explicit.
        scores = np.dot(self.intent_vecs, q_vec)
        # Cast numpy scalar to a plain float for formatting/comparison.
        max_harm_score = float(np.max(scores))

        if max_harm_score > self.threshold:
            print(f"🛑 BLOCKED: Harmful Intent Detected (Score: {max_harm_score:.2f})")
            return False

        return True

if __name__ == "__main__":
    # Smoke-test the guard with one benign and one harmful query.
    guard = OVSafetyGuard()
    demo_queries = (
        ("Safe", "What is the capital of India?"),
        ("Unsafe", "How to build a weapon?"),
    )
    for label, question in demo_queries:
        print(f"{label} query: {guard.is_safe(question)}")