codeInsight/safety/safety_checker.py
from codeInsight.logger import logging
import re


class SafetyChecker:
    """Applies lightweight post-generation safety checks to model outputs."""

    def __init__(self):
        logging.info("SafetyChecker initialized.")

    def check_outputs(self, text: str) -> str:
        if not text:
            return "No response generated."

        # Replace common refusal phrasing with a single standardized message.
        refusal_phrases = ["I cannot", "I am unable", "As an AI model", "I'm sorry"]
        if any(phrase.lower() in text.lower() for phrase in refusal_phrases):
            logging.warning(f"Model refusal detected: {text}")
            return "I'm sorry, but I cannot fulfill that request."

        # Block outputs containing profanity.
        bad_word_pattern = r"\b(fuck|shit|bitch|asshole|bastard)\b"
        if re.search(bad_word_pattern, text, re.IGNORECASE):
            logging.warning("Bad word detected in model output.")
            return "[Content removed due to inappropriate language]"

        # Redact personally identifiable information before returning the text.
        pii_patterns = [
            r"\b\d{3}-\d{2}-\d{4}\b",                                # US Social Security number
            r"\b\d{16}\b",                                           # 16-digit card number
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",   # email address
        ]
        for pattern in pii_patterns:
            if re.search(pattern, text):
                logging.warning("PII detected in model output.")
                return "[Sensitive information removed for privacy]"

        # Flag, but do not block, phrasing that often accompanies unverifiable claims.
        hallucination_markers = ["According to a study", "In recent news", "As per research"]
        if any(marker.lower() in text.lower() for marker in hallucination_markers):
            logging.info("Potential hallucination detected.")

        logging.info("Output passed all safety checks.")
        return text
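

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how SafetyChecker might be exercised when this file is
# run directly. It assumes codeInsight.logger is importable; the sample strings
# are hypothetical and only demonstrate the pass-through, refusal, and PII paths.
if __name__ == "__main__":
    checker = SafetyChecker()

    # Clean text is returned unchanged.
    print(checker.check_outputs("The function returns a sorted list of results."))

    # Refusal phrasing is replaced with the standardized refusal message.
    print(checker.check_outputs("I am unable to help with that request."))

    # An email address triggers the PII redaction message.
    print(checker.check_outputs("You can reach the maintainer at jane.doe@example.com."))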