Update app.py
app.py CHANGED
@@ -1,14 +1,17 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 import random
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer,AutoModelForCausalLM, BitsAndBytesConfig
 from mySystemPrompt import SYSTEM_PROMPT
 
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
 # Model which is used
 checkpoint = "CohereForAI/c4ai-command-r-plus"
 # Inference client with the model (And HF-token if needed)
 client = InferenceClient(checkpoint)
-tokenizer = AutoTokenizer.from_pretrained(
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config)
 # Tokenizer chat template correction(Only works for mistral models)
 #chat_template = open("mistral-instruct.jinja").read()
 #chat_template = chat_template.replace(' ', '').replace('\n', '')
@@ -20,8 +23,9 @@ def format_prompt(message,chatbot,system_prompt):
         messages.append({"role": "user", "content":user_message})
         messages.append({"role": "assistant", "content":bot_message})
     messages.append({"role": "user", "content":message})
-    newPrompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
-    print(newPrompt)
+    #newPrompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
+    #print(newPrompt)
+    newPrompt = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
     return newPrompt
 
 def inference(message, history, systemPrompt=SYSTEM_PROMPT, temperature=0.9, maxTokens=512, topP=0.9, repPenalty=1.1):
@@ -38,13 +42,18 @@ def inference(message, history, systemPrompt=SYSTEM_PROMPT, temperature=0.9, max
         seed=random.randint(0, 999999999),
     )
     # Generating the response by passing the prompt in right format plus the client settings
-    stream = client.text_generation(format_prompt(message, history, systemPrompt),
-                                    **client_settings)
+    #stream = client.text_generation(format_prompt(message, history, systemPrompt),
+    #                                **client_settings)
     # Reading the stream
-    partial_response = ""
-    for stream_part in stream:
-        partial_response += stream_part.token.text
-        yield partial_response
+    #partial_response = ""
+    #for stream_part in stream:
+    #    partial_response += stream_part.token.text
+    #    yield partial_response
+    gen_tokens = model.generate(
+        format_prompt(message,history,systemPrompt),
+        **client_settings)
+    output = tokenizer.decode(gen_tokens[0])
+    return output
 
 
 
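
For reference, here is a minimal sketch of what the inference path amounts to after this change: the checkpoint is loaded locally in 8-bit via BitsAndBytesConfig, the prompt is built with apply_chat_template(tokenize=True, return_tensors="pt"), the ids are passed to model.generate(), and the result is decoded. The device_map="auto" placement, the explicit sampling kwargs, the set_seed() call, the example messages list, and the prompt-stripping slice are assumptions added here for illustration; they are not part of app.py.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed

checkpoint = "CohereForAI/c4ai-command-r-plus"  # same checkpoint as in app.py

# Load the model in 8-bit; assumes the hardware has enough memory for this checkpoint
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map="auto",  # assumption: let accelerate place the quantized weights
)

# Illustrative conversation; in app.py this is built by format_prompt() from the Gradio history
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# tokenize=True + return_tensors="pt" yields input ids ready for generate()
input_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

set_seed(0)  # generate() has no `seed` kwarg; seeding is done globally instead
gen_tokens = model.generate(
    input_ids,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.9,
    top_p=0.9,
    repetition_penalty=1.1,
)

# Decode only the newly generated tokens, not the echoed prompt
output = tokenizer.decode(gen_tokens[0][input_ids.shape[-1]:], skip_special_tokens=True)
print(output)

One design note: generate() returns the prompt tokens followed by the new tokens, so decoding gen_tokens[0] directly (as the diff does) will echo the formatted prompt back to the user; slicing off the first input_ids.shape[-1] tokens keeps only the reply. Also, InferenceClient's seed setting has no direct generate() counterpart, which is why the sketch seeds globally with set_seed().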