fuvty committed
Commit 8672996 · 1 parent: f926ba4

[debug] zeroGPU

Files changed (1):
  1. app.py (+30 -23)

app.py
@@ -7,8 +7,8 @@ This creates a web interface to compare three inference modes simultaneously:
 3. C2C: Rosetta model with projectors
 
 ZeroGPU Support:
-- Models are loaded to CPU at startup
-- @spaces.GPU decorator moves models to GPU on-demand for each inference
+- Models are loaded to CUDA at startup
+- @spaces.GPU decorator handles GPU allocation automatically for each inference
 - Works seamlessly on both ZeroGPU and regular GPU environments
 """
 
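For readers unfamiliar with ZeroGPU, the pattern the updated docstring describes is the standard one for Hugging Face Spaces: module-level code loads the weights once, and a function decorated with @spaces.GPU has a GPU attached only for the duration of each call. A minimal, self-contained sketch of that pattern; the checkpoint name and the local no-op fallback are illustrative assumptions, not this app's code:

```python
import torch

try:
    import spaces  # present inside Hugging Face Spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    class spaces:  # no-op stand-in so the decorator also works locally
        @staticmethod
        def GPU(duration=60):
            def wrap(fn):
                return fn
            return wrap

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical small checkpoint, for illustration only.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16)

@spaces.GPU(duration=60)  # ZeroGPU attaches a GPU only while this call runs
def generate(prompt: str) -> str:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    out = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(out[0], skip_special_tokens=True)
```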
@@ -221,9 +221,9 @@ class ModelManager:
     @spaces.GPU(duration=60)
     def generate_single(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from single model with streaming."""
-        # Move model to GPU for ZeroGPU
-        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
-        if ZEROGPU_AVAILABLE and self.single_model.device.type != "cuda":
+        # For ZeroGPU, move model to GPU on-demand
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if ZEROGPU_AVAILABLE:
             self.single_model.to(device)
 
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
@@ -252,23 +252,23 @@ class ModelManager:
         thread.start()
 
         # Stream tokens
-        generated_text = ""
         for token in streamer:
-            generated_text += token
-            yield generated_text
-
+            yield token
+        thread.join()
+
+        if ZEROGPU_AVAILABLE:
+            self.single_model.to("cpu")
+
+
     @spaces.GPU(duration=90)
     def generate_t2t(self, user_input: str) -> Generator[tuple[str, str], None, None]:
         """Generate response from T2T model with streaming (returns context, answer)."""
-        # Move models to GPU for ZeroGPU
-        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
+        # For ZeroGPU, move model to GPU on-demand
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         if ZEROGPU_AVAILABLE:
-            if self.t2t_model.context_model.device.type != "cuda":
-                self.t2t_model.context_model.to(device)
-            if self.t2t_model.answer_model.device.type != "cuda":
-                self.t2t_model.answer_model.to(device)
+            self.t2t_model.context_model.to(device)
+            self.t2t_model.answer_model.to(device)
 
-
         # Stage 1: Context generation
         context_streamer = TextIteratorStreamer(
             self.t2t_model.context_tokenizer,
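The streaming in these methods follows the transformers TextIteratorStreamer recipe: generate() runs on a worker thread while the streamer is iterated on the caller's thread. The thread.join() this commit adds matters because the methods now offload models to CPU afterwards, and joining guarantees generation has finished before the weights move. A sketch of the same recipe, reusing the model and tokenizer from the previous snippet:

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(prompt: str):
    """Yield tokens as they are produced; join the worker before any cleanup."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # generate() blocks, so it runs on a background thread and feeds the streamer.
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=64),
    )
    thread.start()
    for token in streamer:
        yield token      # per-token chunks, as in the new generate_single
    thread.join()        # ensure generate() is done before e.g. model.to("cpu")
```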
@@ -349,13 +349,18 @@ class ModelManager:
         for token in answer_streamer:
             answer_text += token
             yield context_text, answer_text
-
+        thread.join()
+
+        if ZEROGPU_AVAILABLE:
+            self.t2t_model.context_model.to("cpu")
+            self.t2t_model.answer_model.to("cpu")
+
     @spaces.GPU(duration=60)
     def generate_c2c(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from C2C model with streaming."""
-        # Move model to GPU for ZeroGPU
-        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
-        if ZEROGPU_AVAILABLE and self.c2c_model.device.type != "cuda":
+        # For ZeroGPU, move model to GPU on-demand
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if ZEROGPU_AVAILABLE:
             self.c2c_model.to(device)
 
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
@@ -397,10 +402,12 @@ class ModelManager:
         thread.start()
 
         # Stream tokens
-        generated_text = ""
         for token in streamer:
-            generated_text += token
-            yield generated_text
+            yield token
+        thread.join()
+
+        if ZEROGPU_AVAILABLE:
+            self.c2c_model.to("cpu")
 
 
 def create_demo(model_manager: ModelManager):
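One behavioral consequence of this commit worth noting: generate_single and generate_c2c now yield individual tokens instead of the accumulated text, so whatever consumes them (the Gradio callbacks built in create_demo) must do the accumulation itself. A hedged sketch of such wiring, reusing stream_generate from the snippet above; the component layout is illustrative and not taken from this app:

```python
import gradio as gr

def ui_stream(user_input: str):
    # Accumulate locally: the generator now yields per-token chunks,
    # while a streaming gr.Textbox expects the full text on each yield.
    text = ""
    for token in stream_generate(user_input):
        text += token
        yield text

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Prompt")
    out = gr.Textbox(label="Response")
    inp.submit(ui_stream, inputs=inp, outputs=out)

demo.launch()
```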