krish10 committed (verified)
Commit 1d3994c · Parent(s): 226053b

Update app.py

Files changed (1):
  1. app.py +20 -11
app.py CHANGED
@@ -1,26 +1,35 @@
+import spaces
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# Load model and tokenizer
+# Load the model and tokenizer
 model_name = "krish10/Qwen3_0.6B_16bit_TA_screen"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
 
-# Chat function
-def respond(message, history, system_message, max_tokens, temperature, top_p):
+# Non-streaming chat function
+@spaces.GPU
+def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
+    # Construct messages from history + system message
     messages = []
     if system_message:
         messages.append({"role": "system", "content": system_message})
-    for user, assistant in history:
-        messages.append({"role": "user", "content": user})
-        messages.append({"role": "assistant", "content": assistant})
+    for user_msg, bot_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        messages.append({"role": "assistant", "content": bot_msg})
     messages.append({"role": "user", "content": message})
 
-    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Build prompt
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    # Tokenize and move to GPU
     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
 
+    # Generate response
     outputs = model.generate(
-        input_ids=inputs.input_ids,
+        input_ids=inputs["input_ids"],
         max_length=max_tokens,
         do_sample=True,
         temperature=temperature,
@@ -28,8 +37,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         pad_token_id=tokenizer.eos_token_id
     )
 
+    # Decode and return only new content
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return decoded[len(prompt):]  # return only the generated text after the prompt
+    return decoded[len(prompt):]  # strip prompt prefix
 
 # Gradio UI
 demo = gr.ChatInterface(
@@ -42,6 +52,5 @@ demo = gr.ChatInterface(
     ]
 )
 
-# Launch
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
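The update wires the app for Hugging Face ZeroGPU: `import spaces` plus the `@spaces.GPU` decorator request a GPU for the duration of each `respond` call. For context on the prompt-construction step, below is a minimal sketch of what `apply_chat_template` produces; the system message and chat history are hypothetical stand-ins, and only the model name comes from the diff:

from transformers import AutoTokenizer

# Model name as committed in the diff
tokenizer = AutoTokenizer.from_pretrained("krish10/Qwen3_0.6B_16bit_TA_screen")

# Hypothetical history, shaped like the (user, assistant) tuples Gradio passes in
messages = [
    {"role": "system", "content": "You are a helpful screening assistant."},
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi, how can I help?"},
    {"role": "user", "content": "Walk me through the screening process."},
]

# tokenize=False returns the templated prompt as a plain string;
# add_generation_prompt=True appends the assistant header so the model
# starts a fresh reply instead of continuing the last turn
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)

One caveat worth noting: in `model.generate`, `max_length` caps the total sequence (prompt plus completion), while `max_new_tokens` would cap only the generated part; the committed code uses the former.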