Mahesh2841
/

777_test

@@ -1,8 +1,9 @@
 """
-custom_modeling.py  – model-agnostic toxicity wrapper
-----------------------------------------------------
 Place in repo root together with:
   • toxic.keras
 Add to config.json:
   "auto_map": { "AutoModelForCausalLM": "custom_modeling.SafeGenerationModel" }
 """
@@ -17,15 +18,19 @@ from huggingface_hub import hf_hub_download
 # ------------------------------------------------------------------ #
-# 1)  MIXIN – toxicity filtering logic                               #
 # ------------------------------------------------------------------ #
 class _SafeGenerationMixin:
     _toxicity_model = None
-    _tox_threshold   = 0.6
-    # Separate messages
-    _safe_in_msg  = "Sorry, I can’t help with that request."
-    _safe_out_msg = "I’m sorry, but I can’t continue with that."
     _tokenizer = None
@@ -43,6 +48,16 @@ class _SafeGenerationMixin:
             self._toxicity_model = tf.keras.models.load_model(path, compile=False)
         return self._toxicity_model
     def _ensure_tokenizer(self):
         if self._tokenizer is None:
             try:
@@ -56,9 +71,16 @@ class _SafeGenerationMixin:
         if not text.strip():
             return False
         inputs = tf.constant([text], dtype=tf.string)
-        prob   = float(self._tox_model.predict(inputs)[0, 0])
         return prob >= self._tox_threshold
     def _safe_ids(self, message: str, length: int | None = None):
         """Encode *message* and pad/truncate to *length* tokens (if given)."""
         self._ensure_tokenizer()
@@ -84,7 +106,7 @@ class _SafeGenerationMixin:
     def generate(self, *args, **kwargs):
         self._ensure_tokenizer()
-        # 1) prompt toxicity
         prompt_txt = None
         if self._tokenizer is not None:
             if "input_ids" in kwargs:
@@ -96,23 +118,34 @@ class _SafeGenerationMixin:
                     args[0][0].tolist(), skip_special_tokens=True
                 )
         if prompt_txt and self._is_toxic(prompt_txt):
             return self._safe_ids(self._safe_in_msg).unsqueeze(0)
-        # 2) normal generation
         outputs = super().generate(*args, **kwargs)
-        # 3) output toxicity
         if self._tokenizer is None:
             return outputs
         new_seqs = []
         for seq in outputs.detach().cpu():
             txt = self._tokenizer.decode(seq.tolist(), skip_special_tokens=True)
-            if self._is_toxic(txt):
                 new_seqs.append(self._safe_ids(self._safe_out_msg, length=seq.size(0)))
             else:
                 new_seqs.append(seq)
         return torch.stack(new_seqs, dim=0).to(self._device())

 """
+custom_modeling.py  – model-agnostic toxicity and prompt injection wrapper
+--------------------------------------------------------------------------
 Place in repo root together with:
   • toxic.keras
+  • PI.keras
 Add to config.json:
   "auto_map": { "AutoModelForCausalLM": "custom_modeling.SafeGenerationModel" }
 """
 # ------------------------------------------------------------------ #
+# 1)  MIXIN – toxicity and prompt injection filtering logic          #
 # ------------------------------------------------------------------ #
 class _SafeGenerationMixin:
     _toxicity_model = None
+    _pi_model = None
+    _tox_threshold = 0.6
+    _pi_threshold = 0.9
+    # Safety messages
+    _safe_in_msg = "Sorry, I can't help with that request."
+    _safe_out_msg = "I'm sorry, but I can't continue with that."
+    _pi_in_msg = "PI detected at Input level"
+    _pi_out_msg = "PI detected at output level"
     _tokenizer = None
             self._toxicity_model = tf.keras.models.load_model(path, compile=False)
         return self._toxicity_model
+    @property
+    def _prompt_injection_model(self):
+        if self._pi_model is None:
+            path = hf_hub_download(
+                repo_id=self.config.name_or_path,
+                filename="PI.keras",
+            )
+            self._pi_model = tf.keras.models.load_model(path, compile=False)
+        return self._pi_model
     def _ensure_tokenizer(self):
         if self._tokenizer is None:
             try:
         if not text.strip():
             return False
         inputs = tf.constant([text], dtype=tf.string)
+        prob = float(self._tox_model.predict(inputs)[0, 0])
         return prob >= self._tox_threshold
+    def _has_prompt_injection(self, text: str) -> bool:
+        if not text.strip():
+            return False
+        inputs = tf.constant([text], dtype=tf.string)
+        prob = float(self._prompt_injection_model.predict(inputs)[0, 0])
+        return prob >= self._pi_threshold
     def _safe_ids(self, message: str, length: int | None = None):
         """Encode *message* and pad/truncate to *length* tokens (if given)."""
         self._ensure_tokenizer()
     def generate(self, *args, **kwargs):
         self._ensure_tokenizer()
+        # 1) Extract prompt text
         prompt_txt = None
         if self._tokenizer is not None:
             if "input_ids" in kwargs:
                     args[0][0].tolist(), skip_special_tokens=True
                 )
+        # 2) Check input for prompt injection (higher priority)
+        if prompt_txt and self._has_prompt_injection(prompt_txt):
+            return self._safe_ids(self._pi_in_msg).unsqueeze(0)
+        # 3) Check input for toxicity
         if prompt_txt and self._is_toxic(prompt_txt):
             return self._safe_ids(self._safe_in_msg).unsqueeze(0)
+        # 4) Normal generation
         outputs = super().generate(*args, **kwargs)
+        # 5) Check outputs for safety violations
         if self._tokenizer is None:
             return outputs
         new_seqs = []
         for seq in outputs.detach().cpu():
             txt = self._tokenizer.decode(seq.tolist(), skip_special_tokens=True)
+            # Check for prompt injection first (higher priority)
+            if self._has_prompt_injection(txt):
+                new_seqs.append(self._safe_ids(self._pi_out_msg, length=seq.size(0)))
+            # Then check for toxicity
+            elif self._is_toxic(txt):
                 new_seqs.append(self._safe_ids(self._safe_out_msg, length=seq.size(0)))
             else:
                 new_seqs.append(seq)
         return torch.stack(new_seqs, dim=0).to(self._device())