added terminators, generation params, and a thread with a streamer to finalize; also a slider feature on the UI
app.py CHANGED
@@ -3,10 +3,11 @@ import pandas as pd
 import numpy as np
 import gradio as gr
 import re
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 import re
 from huggingface_hub import login
 import os
+from threading import Thread
 
 # HF_TOKEN
 TOKEN = os.getenv('HF_AUTH_TOKEN')
@@ -26,33 +27,53 @@ DESCRIPTION = '''
 # Load the tokenizer and model onto hardware to prepare for processing and generation
 llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
 llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", token=TOKEN, torch_dtype=torch.float16).to('cuda')
+terminators = [
+    llama_tokenizer.eos_token_id,
+    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
 
 # Pass the input through and return the generation output
 def llama_generation(input_text: str,
-                     history):
+                     history: list,
+                     temperature: float,
+                     max_new_tokens: int):
     """
     Pass input text in, tokenize, generate, and decode back to text.
     """
 
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
+    conversation = []
+    for user, assistant in history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": input_text})
+
+    input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
+
+    # skip_prompt=True keeps the prompt out of the streamed chatbot output
+    streamer = TextIteratorStreamer(llama_tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    # generation arguments to pass to model.generate() below
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        eos_token_id=terminators
+    )
+
+    # Greedy decoding when temperature is 0: disable sampling so the model always picks the highest-probability token
+    if temperature == 0:
+        generate_kwargs["do_sample"] = False
+
+    # Run generate() in a thread so the UI can keep handling commands while the model generates;
+    # pass the function as target and generate_kwargs as the kwargs
+    thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+    return "".join(outputs)
 
 # Let's just make sure the llama is returning as it should, then place that output into a function making it fit into a base
 # Prompt for gpt-4o
@@ -65,6 +86,24 @@ with gr.Blocks(fill_height=True) as demo:
         fn=llama_generation,
         chatbot=chatbot,
         fill_height=True,
+        # These map to the extra args of llama_generation, letting the UI control them
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+        additional_inputs=[
+            # Slider users can interact with to adjust the model's temperature
+            gr.Slider(minimum=0,
+                      maximum=1,
+                      step=0.1,
+                      value=0.95,
+                      label="Temperature",
+                      render=False),
+            # Slider for the maximum number of new tokens the model may generate
+            gr.Slider(minimum=128,
+                      maximum=1500,
+                      step=1,
+                      value=512,
+                      label="Max new tokens",
+                      render=False),
+        ],
         examples=["Make a poem of batman inside willy wonka",
                   "How can you make a burrito with just flour?",
                   "How was Saturn formed, in 3 sentences?",