krish10 committed on
Commit 226053b · verified · 1 Parent(s): df1c813

Update app.py

Files changed (1)
  1. app.py +24 -54
app.py CHANGED
@@ -1,67 +1,37 @@
-import spaces
-import threading
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# Load the model and tokenizer locally
+# Load model and tokenizer
 model_name = "krish10/Qwen3_0.6B_16bit_TA_screen"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
 
-# Define the function to handle chat responses
-@spaces.GPU
-def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
-    # Prepare the prompt by combining history and system messages
-    if system_message != "":
-        msg = [
-            {"role": "system", "content": system_message}
-        ]
-    else:
-        msg = []
-    for user_input, assistant_response in history:
-        msg.extend(
-            [
-                {"role": "user", "content": user_input},
-                {"role": "assistant", "content": assistant_response}
-            ]
-        )
-    msg.append({"role": "user", "content": message})
-
-    prompt = tokenizer.apply_chat_template(
-        msg,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    # Tokenize the input prompt
+# Chat function
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    messages = []
+    if system_message:
+        messages.append({"role": "system", "content": system_message})
+    for user, assistant in history:
+        messages.append({"role": "user", "content": user})
+        messages.append({"role": "assistant", "content": assistant})
+    messages.append({"role": "user", "content": message})
+
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
 
-
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-    # Use a thread to run the generation in parallel
-    generation_thread = threading.Thread(
-        target=model.generate,
-        kwargs=dict(
-            inputs=inputs.input_ids,
-            max_length=max_tokens,
-            streamer=streamer,
-            do_sample=True,
-            temperature=temperature,
-            top_p=top_p,
-            pad_token_id=tokenizer.eos_token_id,
-        ),
+    outputs = model.generate(
+        input_ids=inputs.input_ids,
+        max_length=max_tokens,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
+        pad_token_id=tokenizer.eos_token_id
     )
-    generation_thread.start()
-
-    # Stream the tokens as they are generated
-    text_buffer = ""
-    for new_text in streamer:
-        text_buffer += new_text
-        yield text_buffer
 
+    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return decoded[len(prompt):]  # return only the generated text after the prompt
 
-# Create the Gradio interface
+# Gradio UI
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -72,6 +42,6 @@ demo = gr.ChatInterface(
     ]
 )
 
-# Launch the Gradio app
+# Launch
 if __name__ == "__main__":
     demo.launch()
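
A note on the rewrite: the old version streamed tokens from a background thread via TextIteratorStreamer; the new version blocks on a single model.generate call and returns the whole reply at once. One fragile spot is the final slice, decoded[len(prompt):]. Because tokenizer.decode(..., skip_special_tokens=True) strips the chat template's special tokens, the decoded string generally does not begin with the exact prompt text, so the character offset can cut into or leak prompt text. Below is a minimal sketch of a variant that slices by token position instead; it is an assumption-laden sketch, not part of this commit, reusing the commit's model and tokenizer and swapping max_length for max_new_tokens so that max_tokens bounds only the generated tokens.

# Sketch only: same setup as the commit (model and tokenizer on "cuda"),
# but the prompt is removed by token position rather than string length.
def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        max_new_tokens=max_tokens,  # counts generated tokens only; max_length includes the prompt
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Drop the prompt tokens by position, then decode only what was generated.
    generated_ids = outputs[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

Slicing outputs[0] at inputs.input_ids.shape[1] works because generate returns the prompt ids followed by the new ids in one tensor, so the offset is exact regardless of which special tokens the decoder later drops.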