leoeric committed on
Commit
b9eaf60
·
1 Parent(s): fa69135

Improve log visibility and error reporting for GPU abort debugging

Browse files

- Add detailed error messages showing STDERR, STDOUT, and log file content
- Show log file location at start of generation
- Add clear instructions on where to find logs (Files tab vs Logs tab)
- Include environment variables and GPU info in log file
- Show more context in error messages (last 5000 chars instead of 3000)
- Better detection of silent process kills

Files changed (1) hide show
  1. app.py +70 -12
app.py CHANGED
@@ -226,7 +226,10 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
226
 
227
  # Create log file for debugging
228
  log_file = output_dir / "generation.log"
229
- status_msg += f"πŸ“‹ Logs will be saved to: {log_file}\n"
 
 
 
230
 
231
  # Ensure GPU environment variables are passed to subprocess
232
  env = os.environ.copy()
@@ -253,6 +256,7 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
253
 
254
  # Run with timeout (45 minutes max - allows for download + generation)
255
  # Capture output and write to log file
 
256
  result = subprocess.run(
257
  cmd,
258
  capture_output=True,
@@ -262,15 +266,32 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
262
  timeout=2700
263
  )
264
 
265
- # Write to log file
266
  with open(log_file, 'w') as log:
267
  log.write("=== GENERATION LOG ===\n\n")
268
  log.write(f"Command: {' '.join(cmd)}\n\n")
 
 
 
 
 
 
 
269
  log.write("=== STDOUT ===\n")
270
- log.write(result.stdout)
271
  log.write("\n\n=== STDERR ===\n")
272
- log.write(result.stderr)
273
  log.write(f"\n\n=== RETURN CODE: {result.returncode} ===\n")
 
 
 
 
 
 
 
 
 
 
274
 
275
  # Read log file for detailed output
276
  log_content = ""
@@ -299,16 +320,53 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
299
  error_msg += "- Consider using paid GPU tier for longer runs\n"
300
  error_msg += "- If issue persists, model may be too large for available GPU\n\n"
301
 
302
- error_msg += f"=== STDERR ===\n{result.stderr}\n\n"
303
- error_msg += f"=== STDOUT ===\n{result.stdout}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  if log_content:
305
- # Show last 3000 chars of log for more context
306
- error_msg += f"πŸ“‹ Last 3000 characters from log file ({log_file}):\n"
307
- error_msg += f"{'='*60}\n"
308
- error_msg += log_content[-3000:]
309
- error_msg += f"\n{'='*60}\n"
310
  else:
311
- error_msg += f"⚠️ Log file not found at: {log_file}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  return None, error_msg
313
 
314
  status_msg += "Generation complete. Looking for output...\n"
 
226
 
227
  # Create log file for debugging
228
  log_file = output_dir / "generation.log"
229
+ status_msg += f"\nπŸ“‹ LOG FILE LOCATION:\n"
230
+ status_msg += f" File: {log_file}\n"
231
+ status_msg += f" View in Space: Files tab β†’ outputs β†’ generation.log\n"
232
+ status_msg += f" (Logs are written in real-time during generation)\n\n"
233
 
234
  # Ensure GPU environment variables are passed to subprocess
235
  env = os.environ.copy()
 
256
 
257
  # Run with timeout (45 minutes max - allows for download + generation)
258
  # Capture output and write to log file
259
+ # Note: If process is killed (e.g., GPU abort), we still capture what was output
260
  result = subprocess.run(
261
  cmd,
262
  capture_output=True,
 
266
  timeout=2700
267
  )
268
 
269
+ # Write comprehensive log file
270
  with open(log_file, 'w') as log:
271
  log.write("=== GENERATION LOG ===\n\n")
272
  log.write(f"Command: {' '.join(cmd)}\n\n")
273
+ log.write(f"Environment Variables:\n")
274
+ log.write(f" CUDA_VISIBLE_DEVICES={env.get('CUDA_VISIBLE_DEVICES', 'not set')}\n")
275
+ log.write(f" CUDA_AVAILABLE={torch.cuda.is_available()}\n")
276
+ if torch.cuda.is_available():
277
+ log.write(f" GPU_NAME={torch.cuda.get_device_name(0)}\n")
278
+ log.write(f" GPU_MEMORY_TOTAL={torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB\n")
279
+ log.write(f"\n")
280
  log.write("=== STDOUT ===\n")
281
+ log.write(result.stdout if result.stdout else "(empty)\n")
282
  log.write("\n\n=== STDERR ===\n")
283
+ log.write(result.stderr if result.stderr else "(empty)\n")
284
  log.write(f"\n\n=== RETURN CODE: {result.returncode} ===\n")
285
+
286
+ # Add note about GPU abort
287
+ if result.returncode != 0:
288
+ log.write(f"\n⚠️ PROCESS FAILED WITH RETURN CODE {result.returncode}\n")
289
+ log.write("This could indicate:\n")
290
+ log.write("- GPU abort/timeout\n")
291
+ log.write("- CUDA out of memory\n")
292
+ log.write("- Process killed by system\n")
293
+ log.write("- Model loading error\n")
294
+ log.write("\nCheck the STDERR section above for detailed error messages.\n")
295
 
296
  # Read log file for detailed output
297
  log_content = ""
 
320
  error_msg += "- Consider using paid GPU tier for longer runs\n"
321
  error_msg += "- If issue persists, model may be too large for available GPU\n\n"
322
 
323
+ # Show detailed error information
324
+ error_msg += f"\n{'='*80}\n"
325
+ error_msg += f"πŸ“‹ DETAILED ERROR LOGS\n"
326
+ error_msg += f"{'='*80}\n\n"
327
+
328
+ # Show return code and command
329
+ error_msg += f"Return Code: {result.returncode}\n"
330
+ error_msg += f"Command: {' '.join(cmd)}\n\n"
331
+
332
+ # Show STDERR (usually contains the actual error)
333
+ if result.stderr:
334
+ error_msg += f"=== STDERR (Error Output) ===\n"
335
+ error_msg += f"{result.stderr}\n\n"
336
+ else:
337
+ error_msg += f"⚠️ No STDERR output (process may have been killed silently)\n\n"
338
+
339
+ # Show STDOUT (may contain useful info)
340
+ if result.stdout:
341
+ error_msg += f"=== STDOUT (Standard Output) ===\n"
342
+ # Show last 5000 chars of stdout
343
+ stdout_preview = result.stdout[-5000:] if len(result.stdout) > 5000 else result.stdout
344
+ error_msg += f"{stdout_preview}\n\n"
345
+
346
+ # Show log file content if available
347
  if log_content:
348
+ error_msg += f"=== LOG FILE CONTENT ({log_file}) ===\n"
349
+ # Show last 5000 chars of log
350
+ log_preview = log_content[-5000:] if len(log_content) > 5000 else log_content
351
+ error_msg += f"{log_preview}\n\n"
 
352
  else:
353
+ error_msg += f"⚠️ Log file not found at: {log_file}\n\n"
354
+
355
+ # Instructions on where to find logs
356
+ error_msg += f"{'='*80}\n"
357
+ error_msg += f"πŸ“ HOW TO VIEW FULL LOGS:\n"
358
+ error_msg += f"{'='*80}\n"
359
+ error_msg += f"OPTION 1 - Space Files Tab (Recommended):\n"
360
+ error_msg += f" 1. Click 'Files' tab in your Space\n"
361
+ error_msg += f" 2. Navigate to: outputs/generation.log\n"
362
+ error_msg += f" 3. Click to view/download the full log\n\n"
363
+ error_msg += f"OPTION 2 - Space Logs Tab:\n"
364
+ error_msg += f" 1. Click 'Logs' tab in your Space\n"
365
+ error_msg += f" 2. Look for messages starting with '[sample.py]'\n"
366
+ error_msg += f" 3. Check for GPU abort or CUDA errors\n\n"
367
+ error_msg += f"Full log path: {log_file}\n"
368
+ error_msg += f"{'='*80}\n"
369
+
370
  return None, error_msg
371
 
372
  status_msg += "Generation complete. Looking for output...\n"