Spaces:
Running on Zero
Running on Zero
Removed streaming
Browse files
app.py
CHANGED
|
@@ -82,6 +82,10 @@ def _run_stage_gpu(
|
|
| 82 |
|
| 83 |
if target_emb is None and encode_text_input is not None:
|
| 84 |
target_emb = encode_text(encode_text_input, encoder)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
step_count = 0
|
| 87 |
def count_steps(step, cand):
|
|
@@ -101,7 +105,12 @@ def _run_stage_gpu(
|
|
| 101 |
on_step=count_steps,
|
| 102 |
)
|
| 103 |
elapsed = time.time() - t0
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
def run_stage(
|
|
@@ -131,20 +140,20 @@ def run_stage(
|
|
| 131 |
# On Stage 1, pass raw text so encoding happens inside GPU context
|
| 132 |
encode_input = text.strip() if stage_num == 1 else None
|
| 133 |
|
| 134 |
-
|
| 135 |
target_emb_state, encoder_name, prompt,
|
| 136 |
beam_width, top_k, patience, max_steps, min_similarity, randomness,
|
| 137 |
encode_text_input=encode_input,
|
| 138 |
)
|
| 139 |
|
| 140 |
-
|
| 141 |
-
|
| 142 |
|
| 143 |
stage_results_state = stage_results_state + [{
|
| 144 |
"stage": stage_num,
|
| 145 |
-
"text":
|
| 146 |
-
"cos_sim":
|
| 147 |
-
"length": len(
|
| 148 |
"time": elapsed,
|
| 149 |
"steps": steps,
|
| 150 |
}]
|
|
|
|
| 82 |
|
| 83 |
if target_emb is None and encode_text_input is not None:
|
| 84 |
target_emb = encode_text(encode_text_input, encoder)
|
| 85 |
+
elif target_emb is not None:
|
| 86 |
+
# Move CPU tensor back to GPU for beam search
|
| 87 |
+
device = next(llm.parameters()).device
|
| 88 |
+
target_emb = target_emb.to(device)
|
| 89 |
|
| 90 |
step_count = 0
|
| 91 |
def count_steps(step, cand):
|
|
|
|
| 105 |
on_step=count_steps,
|
| 106 |
)
|
| 107 |
elapsed = time.time() - t0
|
| 108 |
+
# Return only CPU/plain data to avoid CUDA init in main process on ZeroGPU
|
| 109 |
+
return {
|
| 110 |
+
"seq_str": result.seq_str,
|
| 111 |
+
"cos_sim": result.cos_sim,
|
| 112 |
+
"token_ids": result.token_ids,
|
| 113 |
+
}, elapsed, step_count, target_emb.cpu()
|
| 114 |
|
| 115 |
|
| 116 |
def run_stage(
|
|
|
|
| 140 |
# On Stage 1, pass raw text so encoding happens inside GPU context
|
| 141 |
encode_input = text.strip() if stage_num == 1 else None
|
| 142 |
|
| 143 |
+
result_dict, elapsed, steps, returned_emb_cpu = _run_stage_gpu(
|
| 144 |
target_emb_state, encoder_name, prompt,
|
| 145 |
beam_width, top_k, patience, max_steps, min_similarity, randomness,
|
| 146 |
encode_text_input=encode_input,
|
| 147 |
)
|
| 148 |
|
| 149 |
+
# Store embedding on CPU — it gets moved back to GPU inside _run_stage_gpu
|
| 150 |
+
target_emb_state = returned_emb_cpu
|
| 151 |
|
| 152 |
stage_results_state = stage_results_state + [{
|
| 153 |
"stage": stage_num,
|
| 154 |
+
"text": result_dict["seq_str"],
|
| 155 |
+
"cos_sim": result_dict["cos_sim"],
|
| 156 |
+
"length": len(result_dict["token_ids"]),
|
| 157 |
"time": elapsed,
|
| 158 |
"steps": steps,
|
| 159 |
}]
|