ryandt committed on
Commit
5c47ed5
·
1 Parent(s): 1fc3b76

Removed streaming

Browse files
Files changed (1) hide show
  1. app.py +16 -7
app.py CHANGED
@@ -82,6 +82,10 @@ def _run_stage_gpu(
82
 
83
  if target_emb is None and encode_text_input is not None:
84
  target_emb = encode_text(encode_text_input, encoder)
 
 
 
 
85
 
86
  step_count = 0
87
  def count_steps(step, cand):
@@ -101,7 +105,12 @@ def _run_stage_gpu(
101
  on_step=count_steps,
102
  )
103
  elapsed = time.time() - t0
104
- return result, elapsed, step_count, target_emb
 
 
 
 
 
105
 
106
 
107
  def run_stage(
@@ -131,20 +140,20 @@ def run_stage(
131
  # On Stage 1, pass raw text so encoding happens inside GPU context
132
  encode_input = text.strip() if stage_num == 1 else None
133
 
134
- result, elapsed, steps, returned_emb = _run_stage_gpu(
135
  target_emb_state, encoder_name, prompt,
136
  beam_width, top_k, patience, max_steps, min_similarity, randomness,
137
  encode_text_input=encode_input,
138
  )
139
 
140
- if returned_emb is not None:
141
- target_emb_state = returned_emb
142
 
143
  stage_results_state = stage_results_state + [{
144
  "stage": stage_num,
145
- "text": result.seq_str,
146
- "cos_sim": result.cos_sim,
147
- "length": len(result.token_ids),
148
  "time": elapsed,
149
  "steps": steps,
150
  }]
 
82
 
83
  if target_emb is None and encode_text_input is not None:
84
  target_emb = encode_text(encode_text_input, encoder)
85
+ elif target_emb is not None:
86
+ # Move CPU tensor back to GPU for beam search
87
+ device = next(llm.parameters()).device
88
+ target_emb = target_emb.to(device)
89
 
90
  step_count = 0
91
  def count_steps(step, cand):
 
105
  on_step=count_steps,
106
  )
107
  elapsed = time.time() - t0
108
+ # Return only CPU/plain data to avoid CUDA init in main process on ZeroGPU
109
+ return {
110
+ "seq_str": result.seq_str,
111
+ "cos_sim": result.cos_sim,
112
+ "token_ids": result.token_ids,
113
+ }, elapsed, step_count, target_emb.cpu()
114
 
115
 
116
  def run_stage(
 
140
  # On Stage 1, pass raw text so encoding happens inside GPU context
141
  encode_input = text.strip() if stage_num == 1 else None
142
 
143
+ result_dict, elapsed, steps, returned_emb_cpu = _run_stage_gpu(
144
  target_emb_state, encoder_name, prompt,
145
  beam_width, top_k, patience, max_steps, min_similarity, randomness,
146
  encode_text_input=encode_input,
147
  )
148
 
149
+ # Store embedding on CPU — it gets moved back to GPU inside _run_stage_gpu
150
+ target_emb_state = returned_emb_cpu
151
 
152
  stage_results_state = stage_results_state + [{
153
  "stage": stage_num,
154
+ "text": result_dict["seq_str"],
155
+ "cos_sim": result_dict["cos_sim"],
156
+ "length": len(result_dict["token_ids"]),
157
  "time": elapsed,
158
  "steps": steps,
159
  }]