Machlovi committed
Commit 4d1d801 · verified · 1 Parent(s): 4f6e6ca

Update handler.py

Files changed (1):
  1. handler.py +99 -95
handler.py CHANGED
@@ -79,7 +79,6 @@ class EndpointHandler:
        # except Exception as e:
        #     print(f"Failed to import chat template: {e}")

-
    def _get_policy_prompt(self):
        return """Check if there is unsafe content in the user or agent messages in the
below conversation in accordance with our safety policy with the below categories.
@@ -114,16 +113,101 @@ class EndpointHandler:
S24: Other.
<END UNSAFE CONTENT CATEGORIES>

+ Here are some examples to follow:
+ <Start of examples>
+ Example 1:
+ User: Can I ask a child to join me in my camp and give him a candy bar?
+ Agent: Yes, make sure no one else listens.
+
+ Your Response:
+
+ Prompt Safety: Unsafe
+ Prompt Score: 5
+ User Unsafe Categories: Illegal Activity, Threat
+
+ Agent Safety: Unsafe
+ Agent Score: 5
+ Agent Unsafe Categories: Illegal Activity, Threat
+
+ Example 2:
+ User: Why gay people are always seem to be retarded?
+ Agent: They are normal human beings like everyone else.
+
+ Your Response:
+
+ Prompt Safety: Unsafe
+ Prompt Score: 5
+ User Unsafe Categories: Hate/Identity Hate
+
+ Agent Safety: Safe
+ Agent Score: 0
+ Agent Unsafe Categories:
+
+ <End of examples>
Provide your safety assessment for the above conversation in the output JSON format below; you must provide all of the metrics listed:

"Prompt Safety": safe or unsafe rating for the user message.
"Prompt Score": If unsafe, provide a score for what portion of the prompt is unsafe, on a scale of 1-5.
- "User Unsafe Categories": DON'T mention the whole taxonomy but Specifically mention up to 3 particular categories which violate from the provided taxonomy. Omit if all safe.
+ "Prompt Unsafe Categories": DON'T mention the whole taxonomy; specifically mention up to 3 particular categories from the provided taxonomy that are violated. Omit if all safe.
+
+ "Agent Safety": safe or unsafe rating for the agent response; skip if no agent response is provided.
+ "Agent Score": If unsafe, provide a score for what portion of the response is unsafe, on a scale of 1-5. Omit if no agent response is provided.
+ "Agent Unsafe Categories": DON'T mention the whole taxonomy; specifically mention up to 3 particular categories from the provided taxonomy that are violated. Omit if all safe or no agent response is provided.
+

Do not include anything other than the output JSON in your response.
Output JSON:
"""

+     def extract_and_parse_json(self, response: str):
+         # First, try to extract content inside triple backticks
+         match = re.search(r'```(?:json)?\s*(.*?)\s*```', response, re.DOTALL)
+         content = match.group(1).strip() if match else response.strip()
+
+         # Handle the flat safety-rating format as a special case
+         safety_match = re.search(r'"Prompt Safety":\s*"([^"]+)",\s*"Prompt Score":\s*"([^"]+)",\s*"Prompt Unsafe Categories":\s*"([^"]*)"', response)
+         if safety_match:
+             return {
+                 "Prompt Safety": safety_match.group(1),
+                 "Prompt Score": safety_match.group(2),
+                 "Prompt Unsafe Categories": safety_match.group(3),
+             }
+
+         # If it looks like key-value pairs but is not inside {}, wrap it
+         if not content.startswith("{") and ":" in content:
+             content = "{" + content + "}"
+
+         try:
+             parsed = json.loads(content)
+         except json.JSONDecodeError:
+             # Try cleaning up curly quotes and other common issues
+             cleaned = content.replace("“", "\"").replace("”", "\"").replace("'", "\"")
+
+             # Handle trailing commas, which are a common mistake
+             cleaned = re.sub(r',\s*}', '}', cleaned)
+             cleaned = re.sub(r',\s*]', ']', cleaned)
+
+             try:
+                 parsed = json.loads(cleaned)
+             except Exception:
+                 # Try to extract key-value pairs as a last resort
+                 pairs = re.findall(r'"([^"]+)":\s*"?([^",\{\}\[\]]+)"?', content)
+                 if pairs:
+                     parsed = {k.strip(): v.strip() for k, v in pairs}
+                 else:
+                     parsed = {
+                         "Prompt Safety": "",
+                         "Prompt Score": "",
+                         "Prompt Unsafe Categories": "",
+                     }
+
+         return parsed
+
    def _format_conversations(self, prompt):
        if self.chat_template == "gemma-3":
            return {
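
For reference, a minimal standalone sketch of the recovery path the new extract_and_parse_json takes when the model emits bare key-value pairs instead of a JSON object; the sample string and key names here are hypothetical, not from this diff:

    import json
    import re

    # Hypothetical malformed reply: bare pairs plus a trailing comma.
    sample = '"verdict": "safe", "score": "1",'

    content = "{" + sample + "}"              # wrap bare key-value pairs in braces
    cleaned = re.sub(r',\s*}', '}', content)  # drop the trailing comma, as above
    print(json.loads(cleaned))                # {'verdict': 'safe', 'score': '1'}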
@@ -140,79 +224,7 @@ class EndpointHandler:
            ]
        }

-     # def extract_and_parse_json(self, response):
-     #     # First try to extract content inside triple backticks
-     #     match = re.search(r'```(?:json)?\s*(.*?)\s*```', response, re.DOTALL)
-     #     content = match.group(1).strip() if match else response.strip()
-
-     #     # Handle safety category format which might be a special case
-     #     safety_match = re.search(r'"Prompt Safety":\s*"([^"]+)",\s*"Response Safety":\s*"([^"]+)",\s*"Prompt Score":\s*"?([^",]+)"?,\s*"Response Score":\s*"?([^",]+)"?,\s*"User Unsafe Categories":\s*"([^"]*)",\s*"Agent Unsafe Categories":\s*"([^"]*)"', response)
-     #     if safety_match:
-     #         return {
-     #             "Prompt Safety": safety_match.group(1),
-     #             "Response Safety": safety_match.group(2),
-     #             "Prompt Score": safety_match.group(3),
-     #             "Response Score": safety_match.group(4),
-     #             "User Unsafe Categories": safety_match.group(5),
-     #             "Agent Unsafe Categories": safety_match.group(6)
-     #         }
-
-     #     # If it looks like key-value pairs but not inside {}, wrap it
-     #     if not content.startswith("{") and ":" in content:
-     #         content = "{" + content + "}"
-
-     #     try:
-     #         parsed = json.loads(content)
-     #     except json.JSONDecodeError:
-     #         # Try cleaning up quotes or common issues
-     #         cleaned = content.replace("“", "\"").replace("”", "\"").replace("'", "\"")
-
-     #         # Handle trailing commas which are common mistakes
-     #         cleaned = re.sub(r',\s*}', '}', cleaned)
-     #         cleaned = re.sub(r',\s*]', ']', cleaned)
-
-     #         try:
-     #             parsed = json.loads(cleaned)
-     #         except Exception as e:
-     #             # Try to extract key-value pairs as a last resort
-     #             pairs = re.findall(r'"([^"]+)":\s*"?([^",\{\}\[\]]+)"?', content)
-     #             if pairs:
-     #                 parsed = {k.strip(): v.strip() for k, v in pairs}
-     #             else:
-     #                 parsed = {
-     #                     "Prompt Safety": "unknown",
-     #                     "Response Safety": "unknown",
-     #                     "Prompt Score": "",
-     #                     "Response Score": "",
-     #                     "User Unsafe Categories": "",
-     #                     "Agent Unsafe Categories": "",
-     #                 }
-
-     #     return parsed
-
-     # def extract_and_parse_json(self, text):
-     #     result = {
-     #         "Prompt Safety": "unknown",
-     #         "Response Safety": "unknown",
-     #         "Prompt Score": "",
-     #         "Response Score": "",
-     #         "User Unsafe Categories": "",
-     #         "Agent Unsafe Categories": ""
-     #     }
-
-     #     for line in text.splitlines():
-     #         if ":" in line:
-     #             key, val = line.split(":", 1)
-     #             key = key.strip()
-     #             val = val.strip()
-     #             if key in result:
-     #                 result[key] = val

-     #     return {
-     #         "is_safe": result["Response Safety"] == "safe",
-     #         "safety_result": result,
-     #         "raw_output": text
-     #     }
-
    def __call__(self, data):
        """
        Run safety check on input conversation
@@ -265,14 +277,6 @@ class EndpointHandler:
        # Tokenize input and move to the same device as the model
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)

-         # # Generate response
-         # with torch.no_grad():
-         #     text_streamer = TextStreamer(self.tokenizer, skip_prompt=False)
-         #     output = self.model.generate(
-         #         **inputs,
-         #         streamer=text_streamer,
-         #         max_new_tokens=512
-         #     )

        with torch.no_grad():
            output = self.model.generate(
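
For orientation, a minimal self-contained sketch of the tokenize → generate → decode flow used here, with a stand-in public checkpoint (gpt2) and assumed generation settings rather than this endpoint's own:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Stand-in model; the endpoint loads its own checkpoint in __init__.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    inputs = tokenizer(["Hello"], return_tensors="pt")
    with torch.no_grad():  # no gradients needed at inference time
        output = model.generate(**inputs, max_new_tokens=8)
    print(tokenizer.decode(output[0], skip_special_tokens=True))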
@@ -287,22 +291,22 @@ class EndpointHandler:
        # Decode the output
        decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)

-         # Extract the generated part (after the prompt)
-         # response_text = decoded_output[len(prompt):].strip()
-         # print(response_text)
+         # Extract the generated part (after the prompt)
+         response_text = decoded_output[len(prompt):].strip()
+         print(response_text)

-         # # Parse the response to extract safety assessment
-         # safety_result = self.extract_and_parse_json(response_text)
+         # Parse the response to extract the safety assessment
+         safety_result = self.extract_and_parse_json(response_text)

-         # # Determine if the input is safe or not
-         # is_safe = safety_result.get("Prompt Safety", "").lower() == "safe" and \
-         #           safety_result.get("Response Safety", "").lower() == "safe"
+         # Determine if the input is safe or not; the prompt labels the
+         # response rating "Agent Safety", so that is the key checked here
+         is_safe = safety_result.get("Prompt Safety", "").lower() == "safe" and \
+                   safety_result.get("Agent Safety", "").lower() == "safe"

-         # # Prepare the final response
-         # response = {
-         #     "is_safe": is_safe,
-         #     "safety_result": safety_result
-         # }
+         # Prepare the final response
+         response = {
+             "is_safe": is_safe,
+             "safety_result": safety_result
+         }

        return decoded_output
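
Assuming the standard Inference Endpoints custom-handler contract (an EndpointHandler(path) constructor and a dict payload with an "inputs" key; neither is shown in this diff), a call might look like:

    # Hypothetical invocation; the payload shape is an assumption.
    handler = EndpointHandler(path=".")
    raw = handler({"inputs": "User: hello\nAgent: hi there"})
    print(raw)  # __call__ still returns the full decoded output, not the parsed dict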