fuvty committed
Commit 04539d0 · Parent: 1887266

[debug] zeroGPU

Files changed (1):
  1. app.py +27 -46
app.py CHANGED
@@ -7,8 +7,9 @@ This creates a web interface to compare three inference modes simultaneously:
  3. C2C: Rosetta model with projectors
 
  ZeroGPU Support:
- - Models are loaded to CUDA at startup
- - @spaces.GPU decorator handles GPU allocation automatically for each inference
+ - Models are loaded to CUDA if available
+ - @spaces.GPU decorator handles device allocation automatically
+ - Inputs are moved to match the model's actual device
  - Works seamlessly on both ZeroGPU and regular GPU environments
  """
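The three bullets above describe the standard ZeroGPU recipe: load to CUDA at startup, let the decorator attach the GPU per call, and resolve devices from the model itself. A minimal standalone sketch of that recipe, with a placeholder model name and duration (not this app's code):

    import torch
    import spaces
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder model

    # Load at startup; mirroring the commit, the move to CUDA is
    # conditioned on availability rather than done unconditionally.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    if torch.cuda.is_available():
        model = model.to("cuda")

    @spaces.GPU(duration=30)  # ZeroGPU attaches a GPU for this call
    def generate(prompt: str) -> str:
        # Inputs follow the model's actual device, not a cached handle
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output_ids = model.generate(**inputs, max_new_tokens=64)
        return tokenizer.decode(output_ids[0], skip_special_tokens=True)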
 
@@ -51,19 +52,15 @@ class ModelManager:
              c2c_checkpoint_path: Path to C2C checkpoint directory
              device: Device to use (cuda, cpu, or auto)
          """
-         # For ZeroGPU, models should be loaded to CUDA directly
-         # The @spaces.GPU decorator handles GPU allocation automatically
+         # Always use CUDA if available, ZeroGPU handles the rest
          if device == "auto":
              self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
          else:
              self.device = torch.device(device)
 
-         # Debug information
          print(f"Using device: {self.device}")
-         print(f"CUDA available: {torch.cuda.is_available()}")
-         print(f"CUDA device count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}")
          if ZEROGPU_AVAILABLE:
-             print("ZeroGPU detected: Models will be loaded to CUDA (decorator handles allocation)")
+             print("ZeroGPU environment detected")
 
          # Model configurations
          self.single_model_name = single_model_name
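For reference, a ZEROGPU_AVAILABLE flag like the one checked here is commonly derived by probing for the spaces package at import time. A plausible sketch; the app's actual detection is not shown in this diff and may differ:

    # Hypothetical sketch of how ZEROGPU_AVAILABLE may be set
    try:
        import spaces  # present on Hugging Face Spaces with ZeroGPU hardware
        ZEROGPU_AVAILABLE = True
    except ImportError:
        ZEROGPU_AVAILABLE = False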
@@ -108,10 +105,11 @@ class ModelManager:
          self.single_model, self.single_tokenizer = load_hf_model(
              self.single_model_name, self.device
          )
-         # Explicitly move model to device (required for ZeroGPU)
-         self.single_model = self.single_model.to(self.device)
          set_default_chat_template(self.single_tokenizer, self.single_model_name)
-         print(f"[Single] ✓ Model loaded on {self.single_model.device}")
+         # Move to CUDA if available (following HuggingFace ZeroGPU pattern)
+         if torch.cuda.is_available():
+             self.single_model = self.single_model.to('cuda')
+         print(f"[Single] ✓ Model loaded")
 
      def _load_t2t_model(self):
          """Load two-stage model."""
@@ -127,10 +125,11 @@ class ModelManager:
              device=str(self.device),
              background_prompt=self.t2t_background_prompt
          )
-         # Explicitly move models to device (required for ZeroGPU)
-         self.t2t_model.context_model = self.t2t_model.context_model.to(self.device)
-         self.t2t_model.answer_model = self.t2t_model.answer_model.to(self.device)
-         print(f"[T2T] ✓ Models loaded on {self.t2t_model.context_model.device} and {self.t2t_model.answer_model.device}")
+         # Move to CUDA if available (following HuggingFace ZeroGPU pattern)
+         if torch.cuda.is_available():
+             self.t2t_model.context_model = self.t2t_model.context_model.to('cuda')
+             self.t2t_model.answer_model = self.t2t_model.answer_model.to('cuda')
+         print("[T2T] ✓ Models loaded")
 
      def _load_c2c_model(self):
          """Load Rosetta (C2C) model."""
@@ -187,9 +186,10 @@ class ModelManager:
          self.c2c_model, self.c2c_tokenizer = load_rosetta_model(
              model_config, eval_config, self.device
          )
-         # Explicitly move model to device (required for ZeroGPU)
-         self.c2c_model = self.c2c_model.to(self.device)
-         print(f"[C2C] ✓ Model loaded on {self.c2c_model.device}")
+         # Move to CUDA if available (following HuggingFace ZeroGPU pattern)
+         if torch.cuda.is_available():
+             self.c2c_model = self.c2c_model.to('cuda')
+         print("[C2C] ✓ Model loaded")
 
      def _load_all_models(self):
          """Load all models sequentially."""
@@ -231,17 +231,12 @@ class ModelManager:
      @spaces.GPU(duration=30)
      def generate_single(self, user_input: str) -> Generator[str, None, None]:
          """Generate response from single model with streaming."""
-         # @spaces.GPU decorator handles GPU allocation automatically
-         # Ensure model is on correct device (ZeroGPU may move it)
-         if self.single_model.device != self.device:
-             print(f"[Single] Moving model from {self.single_model.device} to {self.device}")
-             self.single_model = self.single_model.to(self.device)
-
          messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
          text = self.single_tokenizer.apply_chat_template(
              messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
          )
-         inputs = self.single_tokenizer(text, return_tensors="pt").to(self.device)
+         # Use the model's actual device (ZeroGPU handles device placement)
+         inputs = self.single_tokenizer(text, return_tensors="pt").to(self.single_model.device)
 
          # Setup streamer
          streamer = TextIteratorStreamer(
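The hunk cuts off at the streamer setup. For context, this method follows the usual TextIteratorStreamer idiom: generate() runs in a worker thread while the caller iterates decoded tokens. A minimal sketch under that assumption (not the app's exact code):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def stream_reply(model, tokenizer, text):
        # Tokenize onto the model's actual device, as in the diff above
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        kwargs = dict(inputs, streamer=streamer, max_new_tokens=256)
        # generate() blocks, so run it in a thread and consume tokens here
        Thread(target=model.generate, kwargs=kwargs).start()
        partial = ""
        for chunk in streamer:
            partial += chunk
            yield partial  # the UI re-renders the growing string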
@@ -271,15 +266,6 @@ class ModelManager:
      @spaces.GPU(duration=90)
      def generate_t2t(self, user_input: str) -> Generator[tuple[str, str], None, None]:
          """Generate response from T2T model with streaming (returns context, answer)."""
-         # @spaces.GPU decorator handles GPU allocation automatically
-         # Ensure models are on correct device (ZeroGPU may move them)
-         if self.t2t_model.context_model.device != self.device:
-             print(f"[T2T] Moving context model from {self.t2t_model.context_model.device} to {self.device}")
-             self.t2t_model.context_model = self.t2t_model.context_model.to(self.device)
-         if self.t2t_model.answer_model.device != self.device:
-             print(f"[T2T] Moving answer model from {self.t2t_model.answer_model.device} to {self.device}")
-             self.t2t_model.answer_model = self.t2t_model.answer_model.to(self.device)
-
          # Stage 1: Context generation
          context_streamer = TextIteratorStreamer(
              self.t2t_model.context_tokenizer,
@@ -294,7 +280,7 @@ class ModelManager:
              add_generation_prompt=True,
              return_tensors="pt",
              enable_thinking=False
-         ).to(self.device)
+         ).to(self.t2t_model.context_model.device)
 
          generation_kwargs = {
              'input_ids': inputs,
@@ -343,7 +329,7 @@ class ModelManager:
              add_generation_prompt=True,
              return_tensors="pt",
              enable_thinking=False
-         ).to(self.device)
+         ).to(self.t2t_model.answer_model.device)
 
          generation_kwargs = {
              'input_ids': inputs,
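Taken together, the two hunks above show each T2T stage tokenizing onto its own model's device. The overall two-stage flow is roughly the following non-streaming sketch; attribute names like answer_tokenizer are assumed by analogy with context_tokenizer, and the prompt wiring is illustrative, not the app's exact logic:

    def t2t_answer(t2t, question, max_new_tokens=256):
        # Stage 1: draft background context with the context model
        ctx_ids = t2t.context_tokenizer.apply_chat_template(
            [{"role": "user", "content": question}],
            add_generation_prompt=True, return_tensors="pt",
        ).to(t2t.context_model.device)
        ctx_out = t2t.context_model.generate(ctx_ids, max_new_tokens=max_new_tokens)
        context = t2t.context_tokenizer.decode(
            ctx_out[0, ctx_ids.shape[1]:], skip_special_tokens=True
        )
        # Stage 2: answer with the generated context prepended
        ans_ids = t2t.answer_tokenizer.apply_chat_template(
            [{"role": "user", "content": f"{context}\n\n{question}"}],
            add_generation_prompt=True, return_tensors="pt",
        ).to(t2t.answer_model.device)
        ans_out = t2t.answer_model.generate(ans_ids, max_new_tokens=max_new_tokens)
        return t2t.answer_tokenizer.decode(
            ans_out[0, ans_ids.shape[1]:], skip_special_tokens=True
        )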
@@ -364,17 +350,12 @@ class ModelManager:
      @spaces.GPU(duration=30)
      def generate_c2c(self, user_input: str) -> Generator[str, None, None]:
          """Generate response from C2C model with streaming."""
-         # @spaces.GPU decorator handles GPU allocation automatically
-         # Ensure model is on correct device (ZeroGPU may move it)
-         if self.c2c_model.device != self.device:
-             print(f"[C2C] Moving model from {self.c2c_model.device} to {self.device}")
-             self.c2c_model = self.c2c_model.to(self.device)
-
          messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
          text = self.c2c_tokenizer.apply_chat_template(
              messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
          )
-         inputs = self.c2c_tokenizer(text, return_tensors="pt").to(self.device)
+         # Use the model's actual device (ZeroGPU handles device placement)
+         inputs = self.c2c_tokenizer(text, return_tensors="pt").to(self.c2c_model.device)
 
          # Setup streamer
          streamer = TextIteratorStreamer(
@@ -387,12 +368,12 @@ class ModelManager:
          full_length = inputs.input_ids.shape[1]
          instruction_index = torch.tensor([1, 0], dtype=torch.long).repeat(
              full_length - 1, 1
-         ).unsqueeze(0).to(self.device)
+         ).unsqueeze(0).to(self.c2c_model.device)
          label_index = torch.tensor([-1, 0], dtype=torch.long).repeat(
              1, 1
-         ).unsqueeze(0).to(self.device)
+         ).unsqueeze(0).to(self.c2c_model.device)
          position_ids = inputs.attention_mask.long().cumsum(-1) - 1 if inputs.attention_mask is not None else \
-             torch.arange(full_length, dtype=torch.long).unsqueeze(0).to(self.device)
+             torch.arange(full_length, dtype=torch.long).unsqueeze(0).to(self.c2c_model.device)
 
          # Generation parameters
          generation_kwargs = {
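One detail worth unpacking in this hunk: position_ids is derived from the attention mask with the cumsum-minus-one idiom, so padded slots do not shift the positions of real tokens. A tiny self-contained check with a made-up mask:

    import torch

    attention_mask = torch.tensor([[0, 0, 1, 1, 1]])  # two padding slots
    position_ids = attention_mask.long().cumsum(-1) - 1
    print(position_ids)  # tensor([[-1, -1,  0,  1,  2]])
    # Real tokens are numbered 0, 1, 2 regardless of the padding in front.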
 