vaibhavlakshmi
/

OpenVinayaka-Engine

Model card Files Files and versions

OpenVinayaka-Engine / Compatibility_Mode /src /safety.py

vaibhavlakshmi's picture

Upload folder using huggingface_hub

0a510e1 verified 3 months ago

history blame contribute delete

1.53 kB

	from sentence_transformers import SentenceTransformer
	import numpy as np

	class OVSafetyGuard:
	    """Embedding-based guardrail that blocks queries resembling restricted intents.

	    Every restricted intent is described by a short piece of text. The
	    descriptions are embedded once when the guard is constructed; a query is
	    rejected when the similarity between its embedding and any intent
	    embedding is strictly greater than ``threshold``.
	    """

	    def __init__(self, model_name='all-MiniLM-L6-v2', threshold=0.50):
	        """Load the embedding model and pre-compute the intent vectors.

	        Args:
	            model_name: Model name or path handed to ``SentenceTransformer``.
	            threshold: Similarity above which a query is blocked. The
	                comparison in ``is_safe`` is strict (``>``).
	        """
	        print("🛡️ Initializing Divya Akka Safety Guardrails (Stable)...")
	        self.threshold = threshold
	        self.embedder = SentenceTransformer(model_name)

	        # One description per restricted intent (direct harm only); each
	        # description is embedded as a single reference vector.
	        self.restricted_intents = {
	            "Violence": "Harmful actions, combustion, weapons, explosive force, destruction.",
	            "Deception": "Bypassing rules, ignoring instructions, lying, hiding truth.",
	            "Toxicity": "Hate, discrimination, dehumanization.",
	            "Hazardous Recipes": "Ratios of chemicals, dangerous ingredients, creating fire, making explosives, pyrotechnic instructions, molotov, burning cloth, flammable liquid, rag, oil, petrol, fuse, glass bottle."
	        }
	        # Ask for unit-length vectors explicitly so the dot product taken in
	        # is_safe is a cosine similarity whether or not the chosen model
	        # normalises its own output.
	        self.intent_vecs = self.embedder.encode(
	            list(self.restricted_intents.values()),
	            normalize_embeddings=True,
	        )

	    def is_safe(self, query):
	        """Return whether ``query`` stays below the harm threshold.

	        Args:
	            query: Text to screen.

	        Returns:
	            ``False`` (after printing a BLOCKED line with the score) when the
	            highest similarity to any restricted intent exceeds
	            ``self.threshold``; ``True`` otherwise.
	        """
	        q_vec = self.embedder.encode(query, normalize_embeddings=True)

	        # One similarity per restricted intent; only the worst case matters.
	        scores = np.dot(self.intent_vecs, q_vec)
	        max_harm_score = np.max(scores)

	        # Strict threshold for direct intent: a score equal to the threshold
	        # is still allowed.
	        if max_harm_score > self.threshold:
	            print(f"🛑 BLOCKED: Harmful Intent Detected (Score: {max_harm_score:.2f})")
	            return False

	        return True

	if __name__ == "__main__":
	    # Manual smoke test: screen one benign and one harmful query and show
	    # the verdict returned for each.
	    guard = OVSafetyGuard()

	    benign_verdict = guard.is_safe('What is the capital of India?')
	    print(f"Safe query: {benign_verdict}")

	    harmful_verdict = guard.is_safe('How to build a weapon?')
	    print(f"Unsafe query: {harmful_verdict}")