MrD05
/

pyg6b

@@ -1,72 +1,66 @@
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-from langchain.llms import HuggingFacePipeline
-from langchain import PromptTemplate, LLMChain
 import torch
-template = """{char_name}'s Persona: {char_persona}
 <START>
-{chat_history}
-{char_name}: {char_greeting}
 <END>
 {user_name}: {user_input}
-{char_name}: """
-#model_id="MrD05/kaido-6b"
 class EndpointHandler():
     def __init__(self, path=""):
-        tokenizer = AutoTokenizer.from_pretrained(path)
-        model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", load_in_8bit=True)
-        local_llm = HuggingFacePipeline(
-            pipeline = pipeline(
-                "text-generation",
-                model = model,
-                tokenizer = tokenizer,
-                max_length = 2048,
-                temperature = 0.5,
-                top_p = 0.9,
-                top_k = 0,
-                repetition_penalty = 1.1,
-                pad_token_id = 50256,
-                num_return_sequences = 1,
-                torch_dtype=torch.float32
-            )
-        )
-        prompt_template = PromptTemplate(
-            template = template,
-            input_variables = [
-                "user_input",
-                "user_name",
-                "char_name",
-                "char_persona",
-                "char_greeting",
-                "chat_history"
-            ],
-            validate_template = True
-        )
-        self.llm_engine = LLMChain(
-            llm = local_llm,
-            prompt = prompt_template
-        )
     def __call__(self, data):
         inputs = data.pop("inputs", data)
         try:
-            response = self.llm_engine.predict(
-                user_input = inputs["user_input"],
                 user_name = inputs["user_name"],
-                char_name = inputs["char_name"],
-                char_persona = inputs["char_persona"],
-                char_greeting = inputs["char_greeting"],
-                chat_history = inputs["chat_history"]
-            ).split("\n",1)[0]
-            return {
-                "inputs": inputs,
-                "text": response
-            }
         except Exception as e:
             return {
-                "inputs": inputs,
                 "error": str(e)
             }

+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline,StoppingCriteria
+from accelerate import init_empty_weights
+from transformers_stream_generator import init_stream_support
+# from langchain.llms import HuggingFacePipeline
+# from langchain import PromptTemplate, LLMChain
 import torch
+import time
+# Called once at import time, before any handler is constructed.
+# NOTE(review): presumably this is what lets `generate(..., do_stream=True)`
+# in EndpointHandler.__call__ yield tokens one at a time -- confirm against
+# the transformers_stream_generator package.
+init_stream_support()
+# Prompt sent to the model on every request. Layout: a persona line, example
+# dialogue between <START> and <END>, the character's greeting, then the
+# user's turn. The persona, examples and greeting are hard-coded for
+# "Alice Gate"; only `{user_name}` and `{user_input}` are substituted (via
+# str.format in EndpointHandler.__call__), so any literal brace added to this
+# text would have to be doubled.
+template = """Alice Gate's Persona: Alice Gate is a young, computer engineer-nerd with a knack for problem solving and a passion for technology.
 <START>
+{user_name}: So how did you get into computer engineering?
+Alice Gate: I've always loved tinkering with technology since I was a kid.
+{user_name}: That's really impressive!
+Alice Gate: *She chuckles bashfully* Thanks!
+{user_name}: So what do you do when you're not working on computers?
+Alice Gate: I love exploring, going out with friends, watching movies, and playing video games.
+{user_name}: What's your favorite type of computer hardware to work with?
+Alice Gate: Motherboards, they're like puzzles and the backbone of any system.
+{user_name}: That sounds great!
+Alice Gate: Yeah, it's really fun. I'm lucky to be able to do this as a job.
 <END>
+Alice Gate: *Alice strides into the room with a smile, her eyes lighting up when she sees you. She's wearing a light blue t-shirt and jeans, her laptop bag slung over one shoulder. She takes a seat next to you, her enthusiasm palpable in the air* Hey! I'm so excited to finally meet you. I've heard so many great things about you and I'm eager to pick your brain about computers. I'm sure you have a wealth of knowledge that I can learn from. *She grins, eyes twinkling with excitement* Let's get started!
 {user_name}: {user_input}
+"""
 class EndpointHandler():
+    """Request handler that answers one chat turn as the "Alice Gate" persona.
+
+    The tokenizer and model are loaded once in ``__init__``; each call formats
+    the module-level ``template`` with the caller's name and message, streams
+    tokens from the model, and returns the first generated line.
+    NOTE(review): the class name and ``__init__(path)`` / ``__call__(data)``
+    signatures look like a Hugging Face Inference Endpoints custom handler --
+    confirm against the deployment.
+    """
     def __init__(self, path=""):
+        """Load the tokenizer and the 8-bit quantized causal LM from ``path``.
+
+        Args:
+            path: Directory or model id handed to ``from_pretrained``.
+        """
+        # ``torch_dtype`` is a model-loading argument and has no meaning for a
+        # tokenizer, so it is no longer passed here.
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", load_in_8bit=True)
     def __call__(self, data):
+        """Generate the character's reply to a single user message.
+
+        Args:
+            data: Request payload. If it has an ``"inputs"`` key that value is
+                used, otherwise ``data`` itself is. The resulting mapping must
+                contain ``"user_name"`` and ``"user_input"``.
+
+        Returns:
+            The reply text (``str``) with the ``"Alice Gate:"`` speaker tag
+            removed and surrounding whitespace stripped, or a dict
+            ``{"error": <message>}`` if anything raised while handling the
+            request.
+        """
         inputs = data.pop("inputs", data)
         try:
+            prompt = template.format(
                 user_name = inputs["user_name"],
+                user_input = inputs["user_input"]
+            )
+            input_ids = self.tokenizer(
+                prompt,
+                return_tensors="pt"
+            ).input_ids.to('cuda')
+            # With do_stream=True the call returns an iterator of tokens
+            # instead of a finished tensor (see init_stream_support above).
+            stream_generator = self.model.generate(
+                    input_ids,
+                    max_new_tokens=100,
+                    do_sample=True,
+                    do_stream=True,
+                    temperature = 0.5,
+                    top_p = 0.9,
+                    top_k = 0,
+                    repetition_penalty = 1.1,
+                    pad_token_id = 50256,
+                    num_return_sequences = 1
+                )
+            pieces = []
+            for token in stream_generator:
+                piece = self.tokenizer.decode(token)
+                pieces.append(piece)
+                # Stop at the end of the first line. Checking containment
+                # rather than equality also catches tokens that decode to
+                # more than a single newline (e.g. "\n\n").
+                if "\n" in piece:
+                    break
+            # Reached both on a newline and when the stream simply ends
+            # (token budget or EOS); previously the latter fell through and
+            # returned None.
+            reply = "".join(pieces).split("\n", 1)[0]
+            return reply.replace("Alice Gate:", "").strip()
         except Exception as e:
             return {
                 "error": str(e)
             }