maxdougly committed on
Commit
9d94d25
·
verified ·
1 Parent(s): c4c109c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -20
app.py CHANGED
@@ -1,12 +1,15 @@
1
  import gradio as gr
2
  from peft import AutoPeftModelForCausalLM
3
  from transformers import AutoTokenizer
 
4
 
5
- """
6
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
7
- """
8
 
9
- def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, min_p,):
 
 
10
  messages = [{"role": "system", "content": system_message}]
11
 
12
  for val in history:
@@ -17,29 +20,27 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
17
 
18
  messages.append({"role": "user", "content": message})
19
 
20
- model = AutoPeftModelForCausalLM.from_pretrained(
21
- "eforse01/lora_model",
22
- device_map="cpu",
23
- )
24
-
25
- tokenizer = AutoTokenizer.from_pretrained("eforse01/lora_model")
26
  inputs = tokenizer.apply_chat_template(
27
  messages,
28
- tokenize = True,
29
- add_generation_prompt = True,
30
- return_tensors = "pt",
31
  )
32
 
33
- output = model.generate(input_ids = inputs, max_new_tokens = max_tokens,
34
- use_cache = True, temperature = temperature, min_p = min_p)
 
 
 
 
 
 
35
 
 
36
  response = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
37
-
38
  yield response.split('assistant')[-1]
39
 
40
- """
41
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
42
- """
43
  demo = gr.ChatInterface(
44
  respond,
45
  additional_inputs=[
@@ -57,4 +58,4 @@ demo = gr.ChatInterface(
57
  )
58
 
59
  if __name__ == "__main__":
60
- demo.launch()
 
import gradio as gr
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import spaces

# Fine-tuned LoRA adapter repository on the Hugging Face Hub.
MODEL_ID = "eforse01/lora_model"

# Load the PEFT model and its tokenizer once at import time so every
# chat request reuses the same instances instead of reloading per call.
model = AutoPeftModelForCausalLM.from_pretrained(MODEL_ID, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
9
 
# `@spaces.GPU` requests a ZeroGPU worker for the duration of each call.
# NOTE(review): the model is loaded with device_map="cpu" at module level,
# so the GPU allocation may go unused — confirm intended placement.
@spaces.GPU(duration=120)  # allow up to 120 s per generation
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, min_p):
    """Generate a streamed chat reply for the Gradio ChatInterface.

    Args:
        message: Latest user message.
        history: Prior (user, assistant) message pairs from the chat.
        system_message: System prompt prepended to the conversation.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        min_p: Min-p sampling threshold.

    Yields:
        The assistant's portion of the decoded model output.
    """
    messages = [{"role": "system", "content": system_message}]

    # Replay the conversation history in chat-template form.
    # NOTE(review): this loop body is folded out of the visible diff;
    # reconstructed from the standard ChatInterface template — confirm.
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    # With tokenize=True and return_tensors="pt" (and no return_dict=True),
    # apply_chat_template returns the input-id tensor itself — pass it
    # directly; the previous `inputs.input_ids` raised AttributeError.
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # do_sample=True is required for temperature/min_p to take effect;
    # without it generate() silently ignores both sampling parameters.
    output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        use_cache=True,
        do_sample=True,
        temperature=temperature,
        min_p=min_p,
    )

    # Decode and yield only the assistant's part of the transcript.
    response = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    yield response.split('assistant')[-1]
42
 
43
+ # Gradio Interface
 
 
44
  demo = gr.ChatInterface(
45
  respond,
46
  additional_inputs=[
 
58
  )
59
 
60
  if __name__ == "__main__":
61
+ demo.launch()