Xgen

Paused

App Files Files Community

Tonic committed on Nov 22, 2023

Commit

0d5c130

1 Parent(s): 3a6504f

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -12

app.py CHANGED Viewed

@@ -7,15 +7,10 @@ import gradio as gr
 import sentencepiece
 title = "Welcome to Tonic's 🐋🐳Orca-2-13B (in 8bit)!"
-description = "You can use [🐋🐳microsoft/Orca-2-13b](https://huggingface.co/microsoft/Orca-2-13b) via API using Gradio by scrolling down and clicking Use 'Via API' or privately by [cloning this space on huggingface](https://huggingface.co/spaces/Tonic1/TonicsOrca2?duplicate=true) . [Join my active builders' server on discord](https://discord.gg/VqTxc76K3u). Big thanks to the HuggingFace Organisation for the Community Grant."
-# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 model_name = "microsoft/Orca-2-13b"
-# offload_folder = './model_weights'
-# if not os.path.exists(offload_folder):
-#     os.makedirs(offload_folder)
 tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
 model = transformers.AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
@@ -25,9 +20,26 @@ class OrcaChatBot:
         self.model = model
         self.tokenizer = tokenizer
         self.system_message = system_message
     def predict(self, user_message, temperature=0.4, max_new_tokens=70, top_p=0.99, repetition_penalty=1.9):
-        prompt = f"<|im_start|>system\n{self.system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
         inputs = self.tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
         input_ids = inputs["input_ids"].to(self.model.device)
@@ -38,13 +50,13 @@ class OrcaChatBot:
             top_p=top_p,
             repetition_penalty=repetition_penalty,
             pad_token_id=self.tokenizer.eos_token_id,
-            do_sample=True
-        )
         response = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
         return response
 Orca_bot = OrcaChatBot(model, tokenizer)
 def gradio_predict(user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty):
@@ -58,7 +70,7 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(label="Your Message", type="text", lines=3),
         gr.Textbox(label="Introduce a Character Here or Set a Scene (system prompt)", type="text", lines=2),
-        gr.Slider(label="Max new tokens", value=125, minimum=25, maximum=256, step=1),
         gr.Slider(label="Temperature", value=0.1, minimum=0.05, maximum=1.0, step=0.05),
         gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.01, maximum=0.99, step=0.05),
         gr.Slider(label="Repetition penalty", value=1.9, minimum=1.0, maximum=2.0, step=0.05)

 import sentencepiece
 title = "Welcome to Tonic's 🐋🐳Orca-2-13B (in 8bit)!"
+description = "You can use [🐋🐳microsoft/Orca-2-13b](https://huggingface.co/microsoft/Orca-2-13b) via API using Gradio by scrolling down and clicking Use 'Via API' or privately by [cloning this space on huggingface](https://huggingface.co/spaces/Tonic1/TonicsOrca2?duplicate=true) . [Join my active builders' server on discord](https://discord.gg/VqTxc76K3u). Let's build together! Big thanks to the HuggingFace Organisation for the Community Grant."
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 model_name = "microsoft/Orca-2-13b"
 tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
 model = transformers.AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
         self.model = model
         self.tokenizer = tokenizer
         self.system_message = system_message
+        self.conversation_history = []
+    def update_conversation_history(self, user_message, assistant_message):
+        """Append one user turn and one assistant turn to the history.
+
+        Both turns are appended on every call, even when a message is empty;
+        each is stored as a ``(role, message)`` tuple.
+        """
+        self.conversation_history.append(("user", user_message))
+        self.conversation_history.append(("assistant", assistant_message))
+    def format_prompt(self):
+        prompt = f"<|im_start|>assistant\n{self.system_message}<|im_end|>\n"
+        for role, message in self.conversation_history:
+            if message.strip():
+                prompt += f"<|im_start|>{role}\n{message}<|im_end|>\n"
+#               if role == "assistant":
+#                    prompt += f"<|im_end|>\n"
+        prompt += "<|im_start|> assistant\n"
+        return prompt
     def predict(self, user_message, temperature=0.4, max_new_tokens=70, top_p=0.99, repetition_penalty=1.9):
+        self.update_conversation_history(user_message, "")
+        prompt = self.format_prompt()
         inputs = self.tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
         input_ids = inputs["input_ids"].to(self.model.device)
             top_p=top_p,
             repetition_penalty=repetition_penalty,
             pad_token_id=self.tokenizer.eos_token_id,
+            do_sample=True
+    )
         response = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        self.update_conversation_history("", response)
         return response
 Orca_bot = OrcaChatBot(model, tokenizer)
 def gradio_predict(user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty):
     inputs=[
         gr.Textbox(label="Your Message", type="text", lines=3),
         gr.Textbox(label="Introduce a Character Here or Set a Scene (system prompt)", type="text", lines=2),
+        gr.Slider(label="Max new tokens", value=420, minimum=25, maximum=2056, step=1),
         gr.Slider(label="Temperature", value=0.1, minimum=0.05, maximum=1.0, step=0.05),
         gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.01, maximum=0.99, step=0.05),
         gr.Slider(label="Repetition penalty", value=1.9, minimum=1.0, maximum=2.0, step=0.05)