maxdougly committed on
Commit
fc5bec5
·
verified ·
1 Parent(s): 71235da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -38
app.py CHANGED
@@ -1,15 +1,3 @@
1
- import os
2
- os.environ["CUDA_VISIBLE_DEVICES"] = "" # Prevent CUDA initialization outside ZeroGPU
3
-
4
- import spaces # Import spaces first
5
- import gradio as gr
6
- from peft import AutoPeftModelForCausalLM
7
- from transformers import AutoTokenizer
8
-
9
- # Load the model and tokenizer globally
10
- model = AutoPeftModelForCausalLM.from_pretrained("eforse01/lora_model")
11
- tokenizer = AutoTokenizer.from_pretrained("eforse01/lora_model")
12
-
13
  @spaces.GPU(duration=120) # Decorate the function for ZeroGPU
14
  def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, min_p):
15
  # Construct messages for the chat template
@@ -29,44 +17,21 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
29
  return_tensors="pt", # Return tensors for PyTorch
30
  )
31
 
32
- # Debug inputs
33
- print("Inputs:", inputs)
34
-
35
- # Extract input_ids correctly
36
- input_ids = inputs["input_ids"] # Ensure the correct field is used
37
  print("Input IDs shape:", input_ids.shape)
38
 
39
  # Generate response
40
  output = model.generate(
41
- input_ids=input_ids,
42
  max_new_tokens=max_tokens,
43
  use_cache=True,
44
  temperature=temperature,
45
  min_p=min_p,
46
  )
47
 
48
- # Debug output
49
- print("Generated Output Shape:", output.shape)
50
- print("Generated Output:", output)
51
-
52
  # Decode and format the response
53
- # Use `decode` for the first sequence in the batch
54
  response = tokenizer.decode(output[0], skip_special_tokens=True)
55
 
56
  # Yield the response
57
  yield response.split("assistant")[-1]
58
-
59
-
60
- # Gradio Interface
61
- demo = gr.ChatInterface(
62
- respond,
63
- additional_inputs=[
64
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
65
- gr.Slider(minimum=1, maximum=2048, value=2048, step=1, label="Max new tokens"),
66
- gr.Slider(minimum=0.1, maximum=4.0, value=1.5, step=0.1, label="Temperature"),
67
- gr.Slider(minimum=0.1, maximum=1.0, value=0.99, step=0.01, label="Min-p"),
68
- ],
69
- )
70
-
71
- if __name__ == "__main__":
72
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  @spaces.GPU(duration=120) # Decorate the function for ZeroGPU
2
  def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, min_p):
3
  # Construct messages for the chat template
 
17
  return_tensors="pt", # Return tensors for PyTorch
18
  )
19
 
20
+ # Ensure input_ids is extracted
21
+ input_ids = inputs # Tensor provided directly
 
 
 
22
  print("Input IDs shape:", input_ids.shape)
23
 
24
  # Generate response
25
  output = model.generate(
26
+ input_ids=input_ids, # Pass tensor explicitly as input_ids
27
  max_new_tokens=max_tokens,
28
  use_cache=True,
29
  temperature=temperature,
30
  min_p=min_p,
31
  )
32
 
 
 
 
 
33
  # Decode and format the response
 
34
  response = tokenizer.decode(output[0], skip_special_tokens=True)
35
 
36
  # Yield the response
37
  yield response.split("assistant")[-1]