Update handler.py
handler.py (+35 -19)
```diff
@@ -6,6 +6,8 @@ import re
 from transformers import AutoTokenizer, TextStreamer
 from unsloth import FastLanguageModel
 from peft import PeftModel
+from unsloth.chat_templates import get_chat_template
+
 
 class EndpointHandler:
     def __init__(self, model_dir):
@@ -67,13 +69,15 @@ class EndpointHandler:
         self.policy_prompt = self._get_policy_prompt()
 
 
-
-
-
-
-
-
-
+
+
+        # try:
+        #     template_tokenizer = self.tokenizer
+        #     if hasattr(template_tokenizer, "chat_template") and template_tokenizer.chat_template:
+        #         self.tokenizer.chat_template = template_tokenizer.chat_template
+        #         print(f"Successfully imported chat template from {self.chat_template_id}")
+        # except Exception as e:
+        #     print(f"Failed to import chat template: {e}")
 
 
     def _get_policy_prompt(self):
@@ -229,27 +233,39 @@ class EndpointHandler:
         # Format input with the conversation template based on model type
         formatted_input = f"Please assess the following conversation: {input_text}"
         conversation = self._format_conversations(formatted_input)
-
+
 
-
-
-
-
-
-
+        self.tokenizer = get_chat_template(
+            self.tokenizer,
+            chat_template = chat_template,
+        )
+
+
+        prompt = self.tokenizer.apply_chat_template(conversation["conversations"], tokenize=False)
+
 
         # Tokenize input and move to the same device as the model
         inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
 
-        # Generate response
+        # # Generate response
+        # with torch.no_grad():
+        #     text_streamer = TextStreamer(self.tokenizer,skip_prompt=False)
+        #     output = self.model.generate(
+        #         **inputs,
+        #         streamer=text_streamer,
+        #         max_new_tokens=512
+        #     )
+
         with torch.no_grad():
-            text_streamer = TextStreamer(self.tokenizer,skip_prompt=False)
             output = self.model.generate(
-                **inputs,
-
-
+                **inputs,
+                max_new_tokens=512,
+                do_sample=False,
+                temperature=0.2,
             )
 
+
+
         # Decode the output
         decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=True)
 
```
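A note on the commented-out block left in `__init__`: as written it assigns `template_tokenizer = self.tokenizer` and then copies the tokenizer's chat template onto itself, which is a no-op, so commenting it out loses nothing. If the intent was to borrow a template from the model named by `self.chat_template_id` (that attribute appears only in the dead code), a working version would load that reference tokenizer first. A minimal sketch, assuming `chat_template_id` is a Hugging Face repo id; the helper name is hypothetical:

```python
from transformers import AutoTokenizer

def copy_chat_template(tokenizer, chat_template_id):
    """Copy the Jinja chat template from a reference tokenizer, if it has one."""
    try:
        # Load the tokenizer whose template we want to borrow (hypothetical
        # fix; the commented-out code reused self.tokenizer here).
        template_tokenizer = AutoTokenizer.from_pretrained(chat_template_id)
        if getattr(template_tokenizer, "chat_template", None):
            tokenizer.chat_template = template_tokenizer.chat_template
            print(f"Successfully imported chat template from {chat_template_id}")
    except Exception as e:
        print(f"Failed to import chat template: {e}")
    return tokenizer
```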
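The new prompt path wraps the tokenizer with Unsloth's `get_chat_template` and renders the conversation with `apply_chat_template`. Two things the hunk does not show: the `chat_template` variable passed as the template name must be defined somewhere in the enclosing scope, and since the wrapping does not depend on the request, it could be done once in `__init__` rather than on every call. A minimal sketch of the flow, with an assumed base model id and the assumed template name `"llama-3"`; `add_generation_prompt=True` is not in the commit but is usually wanted at inference time, and ShareGPT-style from/value messages would additionally need `get_chat_template`'s `mapping` argument:

```python
from transformers import AutoTokenizer
from unsloth.chat_templates import get_chat_template

tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct")  # assumed model id
tokenizer = get_chat_template(tokenizer, chat_template="llama-3")  # assumed template name

# Assumed shape of _format_conversations output: a dict whose "conversations"
# key holds a list of {"role": ..., "content": ...} messages.
conversation = {
    "conversations": [
        {"role": "user", "content": "Please assess the following conversation: ..."}
    ]
}

# tokenize=False returns the rendered prompt string rather than token ids;
# add_generation_prompt appends the assistant header so the model answers next.
prompt = tokenizer.apply_chat_template(
    conversation["conversations"],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```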
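On the new `generate` call: `temperature=0.2` has no effect when `do_sample=False`, because greedy decoding ignores sampling parameters (recent `transformers` versions warn about exactly this combination). Either drop the temperature or enable sampling. A sketch of both consistent variants, with `model` and `inputs` standing in for `self.model` and the tokenized batch above:

```python
import torch

# Variant 1: deterministic greedy decoding; no sampling knobs needed.
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
    )

# Variant 2: low-temperature sampling, if mild randomness is wanted.
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.2,
    )
```

Also note that `output[0]` still contains the prompt tokens, so `decoded_output` includes the rendered prompt; decoding `output[0][inputs["input_ids"].shape[1]:]` instead would isolate just the completion.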