shabul committed on
Commit
d062dd2
·
verified ·
1 Parent(s): be033bd

Simplify: drop threading/streamer, use synchronous generate + torch.no_grad

Browse files
Files changed (1) hide show
  1. app.py +14 -43
app.py CHANGED
@@ -6,8 +6,6 @@ Loads qwen2.5-3b-feynman-explainer on CPU with a CPU-safe dtype.
6
  Streams tokens for a responsive ChatGPT-like experience.
7
  """
8
 
9
- import threading
10
-
11
  try:
12
  import spaces # HF Spaces ZeroGPU shim — no-op on CPU tier
13
  except ImportError:
@@ -15,7 +13,7 @@ except ImportError:
15
 
16
  import gradio as gr
17
  import torch
18
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
19
 
20
  MODEL_ID = "shabul/qwen2.5-3b-feynman-explainer"
21
 
@@ -76,47 +74,20 @@ def respond(message: str, history: list[dict], max_new_tokens: int, temperature:
76
  return_tensors="pt",
77
  )
78
 
79
- streamer = TextIteratorStreamer(
80
- tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0
81
- )
82
-
83
- gen_kwargs = dict(
84
- input_ids=input_ids,
85
- streamer=streamer,
86
- max_new_tokens=max_new_tokens,
87
- temperature=temperature,
88
- do_sample=temperature > 0,
89
- repetition_penalty=1.1,
 
90
  )
91
-
92
- gen_exception = []
93
-
94
- def generate_with_catch():
95
- try:
96
- model.generate(**gen_kwargs)
97
- except Exception as e:
98
- gen_exception.append(f"{type(e).__name__}: {repr(e)}")
99
- streamer.end()
100
-
101
- thread = threading.Thread(target=generate_with_catch, daemon=True)
102
- thread.start()
103
-
104
- partial = ""
105
- try:
106
- for token in streamer:
107
- partial += token
108
- yield partial
109
- except Exception:
110
- pass
111
-
112
- thread.join(timeout=5)
113
-
114
- if gen_exception:
115
- err_msg = f"⚠️ Generation error: {gen_exception[0]}"
116
- print(err_msg)
117
- yield (partial + "\n\n" + err_msg) if partial else err_msg
118
- elif not partial:
119
- yield "⚠️ No response generated. Try again."
120
 
121
 
122
  with gr.Blocks(
 
6
  Streams tokens for a responsive ChatGPT-like experience.
7
  """
8
 
 
 
9
  try:
10
  import spaces # HF Spaces ZeroGPU shim — no-op on CPU tier
11
  except ImportError:
 
13
 
14
  import gradio as gr
15
  import torch
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer
17
 
18
  MODEL_ID = "shabul/qwen2.5-3b-feynman-explainer"
19
 
 
74
  return_tensors="pt",
75
  )
76
 
77
+ with torch.no_grad():
78
+ output_ids = model.generate(
79
+ input_ids,
80
+ max_new_tokens=max_new_tokens,
81
+ temperature=temperature,
82
+ do_sample=temperature > 0,
83
+ repetition_penalty=1.1,
84
+ )
85
+
86
+ response = tokenizer.decode(
87
+ output_ids[0][input_ids.shape[1]:],
88
+ skip_special_tokens=True,
89
  )
90
+ yield response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
 
93
  with gr.Blocks(