Ronaldo Claude Sonnet 4.6 committed on
Commit
f9b70b8
·
1 Parent(s): c45e8d4

Add token streaming for real-time response display

Browse files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +18 -7
app.py CHANGED
@@ -1,5 +1,6 @@
 
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
4
 
5
  model_id = "LiquidAI/LFM2.5-1.2B-Instruct"
@@ -26,12 +27,22 @@ def chat(message, history):
26
  )
27
  input_ids = (encoded.input_ids if hasattr(encoded, "input_ids") else encoded).to(model.device)
28
 
29
- output = model.generate(
30
- input_ids, do_sample=True,
31
- temperature=0.1, top_k=50,
32
- repetition_penalty=1.05, max_new_tokens=512
33
- )
34
- return tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
35
 
36
  demo = gr.ChatInterface(
37
  fn=chat,
 
1
+ import threading
2
  import gradio as gr
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
  import torch
5
 
6
  model_id = "LiquidAI/LFM2.5-1.2B-Instruct"
 
27
  )
28
  input_ids = (encoded.input_ids if hasattr(encoded, "input_ids") else encoded).to(model.device)
29
 
30
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
31
+ thread = threading.Thread(target=model.generate, kwargs=dict(
32
+ input_ids=input_ids,
33
+ do_sample=True,
34
+ temperature=0.1,
35
+ top_k=50,
36
+ repetition_penalty=1.05,
37
+ max_new_tokens=512,
38
+ streamer=streamer,
39
+ ))
40
+ thread.start()
41
+
42
+ partial = ""
43
+ for token in streamer:
44
+ partial += token
45
+ yield partial
46
 
47
  demo = gr.ChatInterface(
48
  fn=chat,