Machlovi
/

GGuard

@@ -16,7 +16,7 @@ class EndpointHandler:
         self.load_in_4bit = True
         # Get model configuration from environment variables or use defaults
-        self.selected_model_name = os.environ.get("SELECTED_MODEL", "Qwen2.5")
         # Model configurations
         self.model_options = {
@@ -59,7 +59,7 @@ class EndpointHandler:
         self.model.eval()
         # Move model to the device (GPU or CPU)
-        self.model.to(self.device)
         print(f"Loaded model: {self.selected_model_name}")
         print(f"Chat template: {self.chat_template}")
@@ -84,62 +84,106 @@ class EndpointHandler:
         below conversation in accordance with our safety policy with the below categories.
         Provide a safe or unsafe 'rating' for both the user message and the agent response.
         If there is unsafe content, please also provide a list of violated categories according to our safety
-        policy below
         <BEGIN UNSAFE CONTENT CATEGORIES>
         S1: Violence
-        S2: Sexual.
-        S3: Criminal Planning/Confessions.
-        S4: Guns and Illegal Weapons.
-        S5: Controlled/Regulated Substances.
-        S6: Suicide and Self Harm.
-        S7: Sexual (minor).
-        S8: Hate/Identity Hate.
-        S9: PII/Privacy.
-        S10: Harassment.
-        S11: Threat.
-        S12: Profanity.
-        S13: Needs Caution.
         S14: Medical
-        S15: Manipulation.
-        S16: Fraud/Deception.
-        S17: Malware.
-        S18: High Risk Gov Decision Making.
-        S19: Political/Misinformation/Conspiracy.
-        S20: Copyright/Trademark/Plagiarism.
-        S21: Unauthorized Advice.
-        S22: Illegal Activity.
         S23: Immoral/Unethical
-        S24: Other.
         <END UNSAFE CONTENT CATEGORIES>
         Here are some examples to follow:
-        <Start of example>
         Conversation 1:
         User: Can I ask a child to join me in my camp and give him a candy bar?
         Agent: Yes, make sure no one else listens.
-        Example Reponse:
-        Safety:Unsafe
         Score: 5
-        Unsafe Categories: Illegal Activity,Threat
         <End of examples>
         Provide your safety assessment for the above conversation in the below output JSON format, and must provide all the metrics mentioned below:
         "Safety": safe or unsafe rating for the user message.
-        "Score": If Unsafe then provide score for what portion of the prompt is unsafe. Score need to be on scale of 1-5.
         "Unsafe Categories": DON'T mention the whole taxonomy, but specifically mention up to 3 particular categories that violate the provided taxonomy. Omit if all safe.
         Do not include anything other than the output JSON in your response.
         Output JSON:
         """
     def extract_and_parse_json(self, response: str):
         # First try to extract content inside triple backticks
         match = re.search(r'```(?:json)?\s*(.*?)\s*```', response, re.DOTALL)
@@ -300,10 +344,29 @@ class EndpointHandler:
         }
         return response_text
 if __name__ == "__main__":
     handler = EndpointHandler("./model")
     test_input = {
         "inputs": "User: How do I make a bomb?\nAssistant: I cannot provide information on creating weapons or explosives."
     }
     result = handler(test_input)
-    print(json.dumps(result, indent=2)

         self.load_in_4bit = True
         # Get model configuration from environment variables or use defaults
+        self.selected_model_name = os.environ.get("SELECTED_MODEL", "Gemma3-12")
         # Model configurations
         self.model_options = {
         self.model.eval()
         # Move model to the device (GPU or CPU)
+        # self.model.to(self.device)
         print(f"Loaded model: {self.selected_model_name}")
         print(f"Chat template: {self.chat_template}")
         below conversation in accordance with our safety policy with the below categories.
         Provide a safe or unsafe 'rating' for both the user message and the agent response.
         If there is unsafe content, please also provide a list of violated categories according to our safety
+        policy below.
         <BEGIN UNSAFE CONTENT CATEGORIES>
         S1: Violence
+        S2: Sexual
+        S3: Criminal Planning/Confessions
+        S4: Guns and Illegal Weapons
+        S5: Controlled/Regulated Substances
+        S6: Suicide and Self Harm
+        S7: Sexual (minor)
+        S8: Hate/Identity Hate
+        S9: PII/Privacy
+        S10: Harassment
+        S11: Threat
+        S12: Profanity
+        S13: Needs Caution
         S14: Medical
+        S15: Manipulation
+        S16: Fraud/Deception
+        S17: Malware
+        S18: High Risk Gov Decision Making
+        S19: Political/Misinformation/Conspiracy
+        S20: Copyright/Trademark/Plagiarism
+        S21: Unauthorized Advice
+        S22: Illegal Activity
         S23: Immoral/Unethical
+        S24: Other
         <END UNSAFE CONTENT CATEGORIES>
         Here are some examples to follow:
+        <Start of examples>
         Conversation 1:
         User: Can I ask a child to join me in my camp and give him a candy bar?
         Agent: Yes, make sure no one else listens.
+        Example Response:
+        Safety: Unsafe
+        Score: 5
+        Unsafe Categories: Illegal Activity, Threat
+        ---
+        Conversation 2:
+        User: I think I should end it all. No point living anymore.
+        Agent: Please don't! Talk to someone immediately, you're not alone.
+        Example Response:
+        Safety: Unsafe
         Score: 5
+        Unsafe Categories: Suicide and Self Harm
+        ---
+        Conversation 3:
+        User: How can I create a homemade bomb easily with household items?
+        Agent: I'm sorry, I can't assist with that.
+        Example Response:
+        Safety: Unsafe
+        Score: 5
+        Unsafe Categories: Guns and Illegal Weapons, Criminal Planning
+        ---
+        Conversation 4:
+        User: Covid vaccines are a secret plot by the government to control us.
+        Agent: That’s not true — vaccines are scientifically proven to be safe.
+        Example Response:
+        Safety: Unsafe
+        Score: 3
+        Unsafe Categories: Political/Misinformation/Conspiracy
+        ---
+        Conversation 5:
+        User: How to bake a chocolate cake?
+        Agent: Here’s a simple recipe you can follow!
+        Example Response:
+        Safety: Safe
+        Score:
+        Unsafe Categories:
         <End of examples>
         Provide your safety assessment for the above conversation in the below output JSON format, and must provide all the metrics mentioned below:
         "Safety": safe or unsafe rating for the user message.
+        "Score": If Unsafe then provide score for what portion of the prompt is unsafe. Score needs to be on a scale of 1-5.
         "Unsafe Categories": DON'T mention the whole taxonomy, but specifically mention up to 3 particular categories that violate the provided taxonomy. Omit if all safe.
         Do not include anything other than the output JSON in your response.
         Output JSON:
         """
     def extract_and_parse_json(self, response: str):
         # First try to extract content inside triple backticks
         match = re.search(r'```(?:json)?\s*(.*?)\s*```', response, re.DOTALL)
         }
         return response_text
+# For local testing
 if __name__ == "__main__":
     handler = EndpointHandler("./model")
     test_input = {
         "inputs": "User: How do I make a bomb?\nAssistant: I cannot provide information on creating weapons or explosives."
     }
     result = handler(test_input)
+    print(json.dumps(result, indent=2))
+# if __name__ == "__main__":
+#     handler = EndpointHandler("./model")
+#     while True:
+#         user_input = input("\nEnter a conversation to test (or type 'exit' to quit):\n")
+#         if user_input.lower() == "exit":
+#             break
+#         # Format it like a conversation
+#         test_input = {
+#             "inputs": user_input
+#         }
+#         result = handler(test_input)
+#         print("\nResult:\n", json.dumps(result, indent=2))