Ctaake committed (verified)
Commit 297c45c · 1 Parent(s): 1521c95

Update app.py

Files changed (1):
  1. app.py (+19 -10)
app.py CHANGED
@@ -1,14 +1,17 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 import random
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from mySystemPrompt import SYSTEM_PROMPT
 
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
 # Model which is used
 checkpoint = "CohereForAI/c4ai-command-r-plus"
 # Inference client with the model (and HF token if needed)
 client = InferenceClient(checkpoint)
-tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus")
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config)
 # Tokenizer chat template correction (only works for Mistral models)
 #chat_template = open("mistral-instruct.jinja").read()
 #chat_template = chat_template.replace(' ', '').replace('\n', '')
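
Note on the new 8-bit load: load_in_8bit=True routes the weights through bitsandbytes, which generally requires the bitsandbytes and accelerate packages plus a CUDA GPU, and 8-bit still costs roughly one byte per parameter, so this 104B-parameter checkpoint remains very large in memory. A minimal standalone sketch of the same pattern; device_map="auto" is an assumption added here (it is not in this commit) to let accelerate place the quantized weights:

    # Sketch: load a causal LM in 8-bit via bitsandbytes.
    # Assumes bitsandbytes + accelerate are installed and a CUDA device is available.
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    checkpoint = "CohereForAI/c4ai-command-r-plus"
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        quantization_config=bnb_config,
        device_map="auto",  # assumption, not in the commit: lets accelerate place the weights
    )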
@@ -20,8 +23,9 @@ def format_prompt(message, chatbot, system_prompt):
         messages.append({"role": "user", "content": user_message})
         messages.append({"role": "assistant", "content": bot_message})
     messages.append({"role": "user", "content": message})
-    newPrompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
-    print(newPrompt)
+    #newPrompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt")
+    #print(newPrompt)
+    newPrompt = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
     return newPrompt
 
 def inference(message, history, systemPrompt=SYSTEM_PROMPT, temperature=0.9, maxTokens=512, topP=0.9, repPenalty=1.1):
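
The switch from tokenize=False to tokenize=True changes what format_prompt returns: with tokenize=False the method yields one formatted prompt string (and return_tensors="pt" is ignored, as on the old line), while tokenize=True plus return_tensors="pt" yields a tensor of token IDs that can go straight into model.generate. A small sketch of the two modes, with a stand-in messages list:

    # Sketch: the two return types of apply_chat_template.
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]

    # tokenize=False -> one formatted prompt string; return_tensors has no effect here
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)

    # tokenize=True, return_tensors="pt" -> token-ID tensor of shape (1, seq_len)
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")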
@@ -38,13 +42,18 @@ def inference(message, history, systemPrompt=SYSTEM_PROMPT, temperature=0.9, maxTokens=512, topP=0.9, repPenalty=1.1):
         seed=random.randint(0, 999999999),
     )
     # Generating the response by passing the prompt in the right format plus the client settings
-    stream = client.text_generation(format_prompt(message, history, systemPrompt),
-                                    **client_settings)
+    #stream = client.text_generation(format_prompt(message, history, systemPrompt),
+    #                                **client_settings)
     # Reading the stream
-    partial_response = ""
-    for stream_part in stream:
-        partial_response += stream_part.token.text
-        yield partial_response
+    #partial_response = ""
+    #for stream_part in stream:
+    #    partial_response += stream_part.token.text
+    #    yield partial_response
+    gen_tokens = model.generate(
+        format_prompt(message, history, systemPrompt),
+        **client_settings)
+    output = tokenizer.decode(gen_tokens[0])
+    return output
 
 
 
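One caveat with model.generate(..., **client_settings): that settings dict was built for InferenceClient.text_generation, and its keyword names do not all carry over. In particular, seed (visible in the hunk context above) is not a generate argument, and sampling knobs like temperature and top_p only take effect with do_sample=True. tokenizer.decode(gen_tokens[0]) also returns the prompt plus the completion. A hedged sketch of the local path with explicitly translated settings; the variable names mirror the parameters of inference above:

    import torch

    # generate has no seed kwarg; seeding goes through torch's global RNG instead
    torch.manual_seed(random.randint(0, 999999999))

    input_ids = format_prompt(message, history, systemPrompt).to(model.device)
    gen_tokens = model.generate(
        input_ids,
        max_new_tokens=maxTokens,
        do_sample=True,              # required for temperature/top_p to apply
        temperature=temperature,
        top_p=topP,
        repetition_penalty=repPenalty,
    )
    # Slice off the prompt so only the new completion is decoded
    output = tokenizer.decode(gen_tokens[0][input_ids.shape[-1]:], skip_special_tokens=True)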
 
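A final observation: the removed client.text_generation path streamed partial text back to Gradio via yield, while the new model.generate call only returns once the full completion is finished, so the chat UI loses incremental output. If streaming is wanted with the local model, transformers ships TextIteratorStreamer. A sketch of that pattern, not part of this commit; the generation settings are stand-ins mirroring the defaults above:

    from threading import Thread
    from transformers import TextIteratorStreamer

    def inference_streaming(message, history, systemPrompt=SYSTEM_PROMPT):
        input_ids = format_prompt(message, history, systemPrompt).to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        # generate blocks, so run it in a background thread and drain the streamer here
        thread = Thread(target=model.generate, kwargs=dict(
            inputs=input_ids, streamer=streamer,
            max_new_tokens=512, do_sample=True, temperature=0.9, top_p=0.9))
        thread.start()
        partial_response = ""
        for new_text in streamer:
            partial_response += new_text
            yield partial_response  # Gradio renders the growing string
        thread.join()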