Spaces: Running on Zero
Improve log visibility and error reporting for GPU abort debugging
Browse files

- Add detailed error messages showing STDERR, STDOUT, and log file content
- Show log file location at start of generation
- Add clear instructions on where to find logs (Files tab vs Logs tab)
- Include environment variables and GPU info in log file
- Show more context in error messages (last 5000 chars instead of 3000)
- Better detection of silent process kills
app.py
CHANGED
|
@@ -226,7 +226,10 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
|
|
| 226 |
|
| 227 |
# Create log file for debugging
|
| 228 |
log_file = output_dir / "generation.log"
|
| 229 |
-
status_msg += f"📋
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
# Ensure GPU environment variables are passed to subprocess
|
| 232 |
env = os.environ.copy()
|
|
@@ -253,6 +256,7 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
|
|
| 253 |
|
| 254 |
# Run with timeout (45 minutes max - allows for download + generation)
|
| 255 |
# Capture output and write to log file
|
|
|
|
| 256 |
result = subprocess.run(
|
| 257 |
cmd,
|
| 258 |
capture_output=True,
|
|
@@ -262,15 +266,32 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
|
|
| 262 |
timeout=2700
|
| 263 |
)
|
| 264 |
|
| 265 |
-
# Write
|
| 266 |
with open(log_file, 'w') as log:
|
| 267 |
log.write("=== GENERATION LOG ===\n\n")
|
| 268 |
log.write(f"Command: {' '.join(cmd)}\n\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
log.write("=== STDOUT ===\n")
|
| 270 |
-
log.write(result.stdout)
|
| 271 |
log.write("\n\n=== STDERR ===\n")
|
| 272 |
-
log.write(result.stderr)
|
| 273 |
log.write(f"\n\n=== RETURN CODE: {result.returncode} ===\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
# Read log file for detailed output
|
| 276 |
log_content = ""
|
|
@@ -299,16 +320,53 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
|
|
| 299 |
error_msg += "- Consider using paid GPU tier for longer runs\n"
|
| 300 |
error_msg += "- If issue persists, model may be too large for available GPU\n\n"
|
| 301 |
|
| 302 |
-
|
| 303 |
-
error_msg += f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
if log_content:
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
error_msg +=
|
| 309 |
-
error_msg += f"\n{'='*60}\n"
|
| 310 |
else:
|
| 311 |
-
error_msg += f"⚠️ Log file not found at: {log_file}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
return None, error_msg
|
| 313 |
|
| 314 |
status_msg += "Generation complete. Looking for output...\n"
|
|
|
|
| 226 |
|
| 227 |
# Create log file for debugging
|
| 228 |
log_file = output_dir / "generation.log"
|
| 229 |
+
status_msg += f"\n📋 LOG FILE LOCATION:\n"
|
| 230 |
+
status_msg += f" File: {log_file}\n"
|
| 231 |
+
status_msg += f" View in Space: Files tab → outputs → generation.log\n"
|
| 232 |
+
status_msg += f" (Logs are written in real-time during generation)\n\n"
|
| 233 |
|
| 234 |
# Ensure GPU environment variables are passed to subprocess
|
| 235 |
env = os.environ.copy()
|
|
|
|
| 256 |
|
| 257 |
# Run with timeout (45 minutes max - allows for download + generation)
|
| 258 |
# Capture output and write to log file
|
| 259 |
+
# Note: If process is killed (e.g., GPU abort), we still capture what was output
|
| 260 |
result = subprocess.run(
|
| 261 |
cmd,
|
| 262 |
capture_output=True,
|
|
|
|
| 266 |
timeout=2700
|
| 267 |
)
|
| 268 |
|
| 269 |
+
# Write comprehensive log file
|
| 270 |
with open(log_file, 'w') as log:
|
| 271 |
log.write("=== GENERATION LOG ===\n\n")
|
| 272 |
log.write(f"Command: {' '.join(cmd)}\n\n")
|
| 273 |
+
log.write(f"Environment Variables:\n")
|
| 274 |
+
log.write(f" CUDA_VISIBLE_DEVICES={env.get('CUDA_VISIBLE_DEVICES', 'not set')}\n")
|
| 275 |
+
log.write(f" CUDA_AVAILABLE={torch.cuda.is_available()}\n")
|
| 276 |
+
if torch.cuda.is_available():
|
| 277 |
+
log.write(f" GPU_NAME={torch.cuda.get_device_name(0)}\n")
|
| 278 |
+
log.write(f" GPU_MEMORY_TOTAL={torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB\n")
|
| 279 |
+
log.write(f"\n")
|
| 280 |
log.write("=== STDOUT ===\n")
|
| 281 |
+
log.write(result.stdout if result.stdout else "(empty)\n")
|
| 282 |
log.write("\n\n=== STDERR ===\n")
|
| 283 |
+
log.write(result.stderr if result.stderr else "(empty)\n")
|
| 284 |
log.write(f"\n\n=== RETURN CODE: {result.returncode} ===\n")
|
| 285 |
+
|
| 286 |
+
# Add note about GPU abort
|
| 287 |
+
if result.returncode != 0:
|
| 288 |
+
log.write(f"\n⚠️ PROCESS FAILED WITH RETURN CODE {result.returncode}\n")
|
| 289 |
+
log.write("This could indicate:\n")
|
| 290 |
+
log.write("- GPU abort/timeout\n")
|
| 291 |
+
log.write("- CUDA out of memory\n")
|
| 292 |
+
log.write("- Process killed by system\n")
|
| 293 |
+
log.write("- Model loading error\n")
|
| 294 |
+
log.write("\nCheck the STDERR section above for detailed error messages.\n")
|
| 295 |
|
| 296 |
# Read log file for detailed output
|
| 297 |
log_content = ""
|
|
|
|
| 320 |
error_msg += "- Consider using paid GPU tier for longer runs\n"
|
| 321 |
error_msg += "- If issue persists, model may be too large for available GPU\n\n"
|
| 322 |
|
| 323 |
+
# Show detailed error information
|
| 324 |
+
error_msg += f"\n{'='*80}\n"
|
| 325 |
+
error_msg += f"📋 DETAILED ERROR LOGS\n"
|
| 326 |
+
error_msg += f"{'='*80}\n\n"
|
| 327 |
+
|
| 328 |
+
# Show return code and command
|
| 329 |
+
error_msg += f"Return Code: {result.returncode}\n"
|
| 330 |
+
error_msg += f"Command: {' '.join(cmd)}\n\n"
|
| 331 |
+
|
| 332 |
+
# Show STDERR (usually contains the actual error)
|
| 333 |
+
if result.stderr:
|
| 334 |
+
error_msg += f"=== STDERR (Error Output) ===\n"
|
| 335 |
+
error_msg += f"{result.stderr}\n\n"
|
| 336 |
+
else:
|
| 337 |
+
error_msg += f"⚠️ No STDERR output (process may have been killed silently)\n\n"
|
| 338 |
+
|
| 339 |
+
# Show STDOUT (may contain useful info)
|
| 340 |
+
if result.stdout:
|
| 341 |
+
error_msg += f"=== STDOUT (Standard Output) ===\n"
|
| 342 |
+
# Show last 5000 chars of stdout
|
| 343 |
+
stdout_preview = result.stdout[-5000:] if len(result.stdout) > 5000 else result.stdout
|
| 344 |
+
error_msg += f"{stdout_preview}\n\n"
|
| 345 |
+
|
| 346 |
+
# Show log file content if available
|
| 347 |
if log_content:
|
| 348 |
+
error_msg += f"=== LOG FILE CONTENT ({log_file}) ===\n"
|
| 349 |
+
# Show last 5000 chars of log
|
| 350 |
+
log_preview = log_content[-5000:] if len(log_content) > 5000 else log_content
|
| 351 |
+
error_msg += f"{log_preview}\n\n"
|
|
|
|
| 352 |
else:
|
| 353 |
+
error_msg += f"⚠️ Log file not found at: {log_file}\n\n"
|
| 354 |
+
|
| 355 |
+
# Instructions on where to find logs
|
| 356 |
+
error_msg += f"{'='*80}\n"
|
| 357 |
+
error_msg += f"📋 HOW TO VIEW FULL LOGS:\n"
|
| 358 |
+
error_msg += f"{'='*80}\n"
|
| 359 |
+
error_msg += f"OPTION 1 - Space Files Tab (Recommended):\n"
|
| 360 |
+
error_msg += f" 1. Click 'Files' tab in your Space\n"
|
| 361 |
+
error_msg += f" 2. Navigate to: outputs/generation.log\n"
|
| 362 |
+
error_msg += f" 3. Click to view/download the full log\n\n"
|
| 363 |
+
error_msg += f"OPTION 2 - Space Logs Tab:\n"
|
| 364 |
+
error_msg += f" 1. Click 'Logs' tab in your Space\n"
|
| 365 |
+
error_msg += f" 2. Look for messages starting with '[sample.py]'\n"
|
| 366 |
+
error_msg += f" 3. Check for GPU abort or CUDA errors\n\n"
|
| 367 |
+
error_msg += f"Full log path: {log_file}\n"
|
| 368 |
+
error_msg += f"{'='*80}\n"
|
| 369 |
+
|
| 370 |
return None, error_msg
|
| 371 |
|
| 372 |
status_msg += "Generation complete. Looking for output...\n"
|