Spaces: Running on Zero
Improve log visibility and error reporting for GPU abort debugging
Browse files

- Add detailed error messages showing STDERR, STDOUT, and log file content
- Show log file location at start of generation
- Add clear instructions on where to find logs (Files tab vs Logs tab)
- Include environment variables and GPU info in log file
- Show more context in error messages (last 5000 chars instead of 3000)
- Better detection of silent process kills
app.py
CHANGED
|
@@ -226,7 +226,10 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
|
|
| 226 |
|
| 227 |
# Create log file for debugging
|
| 228 |
log_file = output_dir / "generation.log"
|
| 229 |
-
status_msg += f"📋
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
# Ensure GPU environment variables are passed to subprocess
|
| 232 |
env = os.environ.copy()
|
|
@@ -253,6 +256,7 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
|
|
| 253 |
|
| 254 |
# Run with timeout (45 minutes max - allows for download + generation)
|
| 255 |
# Capture output and write to log file
|
|
|
|
| 256 |
result = subprocess.run(
|
| 257 |
cmd,
|
| 258 |
capture_output=True,
|
|
@@ -262,15 +266,32 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
|
|
| 262 |
timeout=2700
|
| 263 |
)
|
| 264 |
|
| 265 |
-
# Write
|
| 266 |
with open(log_file, 'w') as log:
|
| 267 |
log.write("=== GENERATION LOG ===\n\n")
|
| 268 |
log.write(f"Command: {' '.join(cmd)}\n\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
log.write("=== STDOUT ===\n")
|
| 270 |
-
log.write(result.stdout)
|
| 271 |
log.write("\n\n=== STDERR ===\n")
|
| 272 |
-
log.write(result.stderr)
|
| 273 |
log.write(f"\n\n=== RETURN CODE: {result.returncode} ===\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
# Read log file for detailed output
|
| 276 |
log_content = ""
|
|
@@ -299,16 +320,53 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
|
|
| 299 |
error_msg += "- Consider using paid GPU tier for longer runs\n"
|
| 300 |
error_msg += "- If issue persists, model may be too large for available GPU\n\n"
|
| 301 |
|
| 302 |
-
|
| 303 |
-
error_msg += f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
if log_content:
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
error_msg +=
|
| 309 |
-
error_msg += f"\n{'='*60}\n"
|
| 310 |
else:
|
| 311 |
-
error_msg += f"⚠️ Log file not found at: {log_file}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
return None, error_msg
|
| 313 |
|
| 314 |
status_msg += "Generation complete. Looking for output...\n"
|
|
|
|
| 226 |
|
| 227 |
# Create log file for debugging
|
| 228 |
log_file = output_dir / "generation.log"
|
| 229 |
+
status_msg += f"\n📋 LOG FILE LOCATION:\n"
|
| 230 |
+
status_msg += f" File: {log_file}\n"
|
| 231 |
+
status_msg += f" View in Space: Files tab → outputs → generation.log\n"
|
| 232 |
+
status_msg += f" (Logs are written in real-time during generation)\n\n"
|
| 233 |
|
| 234 |
# Ensure GPU environment variables are passed to subprocess
|
| 235 |
env = os.environ.copy()
|
|
|
|
| 256 |
|
| 257 |
# Run with timeout (45 minutes max - allows for download + generation)
|
| 258 |
# Capture output and write to log file
|
| 259 |
+
# Note: If process is killed (e.g., GPU abort), we still capture what was output
|
| 260 |
result = subprocess.run(
|
| 261 |
cmd,
|
| 262 |
capture_output=True,
|
|
|
|
| 266 |
timeout=2700
|
| 267 |
)
|
| 268 |
|
| 269 |
+
# Write comprehensive log file
|
| 270 |
with open(log_file, 'w') as log:
|
| 271 |
log.write("=== GENERATION LOG ===\n\n")
|
| 272 |
log.write(f"Command: {' '.join(cmd)}\n\n")
|
| 273 |
+
log.write(f"Environment Variables:\n")
|
| 274 |
+
log.write(f" CUDA_VISIBLE_DEVICES={env.get('CUDA_VISIBLE_DEVICES', 'not set')}\n")
|
| 275 |
+
log.write(f" CUDA_AVAILABLE={torch.cuda.is_available()}\n")
|
| 276 |
+
if torch.cuda.is_available():
|
| 277 |
+
log.write(f" GPU_NAME={torch.cuda.get_device_name(0)}\n")
|
| 278 |
+
log.write(f" GPU_MEMORY_TOTAL={torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB\n")
|
| 279 |
+
log.write(f"\n")
|
| 280 |
log.write("=== STDOUT ===\n")
|
| 281 |
+
log.write(result.stdout if result.stdout else "(empty)\n")
|
| 282 |
log.write("\n\n=== STDERR ===\n")
|
| 283 |
+
log.write(result.stderr if result.stderr else "(empty)\n")
|
| 284 |
log.write(f"\n\n=== RETURN CODE: {result.returncode} ===\n")
|
| 285 |
+
|
| 286 |
+
# Add note about GPU abort
|
| 287 |
+
if result.returncode != 0:
|
| 288 |
+
log.write(f"\n⚠️ PROCESS FAILED WITH RETURN CODE {result.returncode}\n")
|
| 289 |
+
log.write("This could indicate:\n")
|
| 290 |
+
log.write("- GPU abort/timeout\n")
|
| 291 |
+
log.write("- CUDA out of memory\n")
|
| 292 |
+
log.write("- Process killed by system\n")
|
| 293 |
+
log.write("- Model loading error\n")
|
| 294 |
+
log.write("\nCheck the STDERR section above for detailed error messages.\n")
|
| 295 |
|
| 296 |
# Read log file for detailed output
|
| 297 |
log_content = ""
|
|
|
|
| 320 |
error_msg += "- Consider using paid GPU tier for longer runs\n"
|
| 321 |
error_msg += "- If issue persists, model may be too large for available GPU\n\n"
|
| 322 |
|
| 323 |
+
# Show detailed error information
|
| 324 |
+
error_msg += f"\n{'='*80}\n"
|
| 325 |
+
error_msg += f"📋 DETAILED ERROR LOGS\n"
|
| 326 |
+
error_msg += f"{'='*80}\n\n"
|
| 327 |
+
|
| 328 |
+
# Show return code and command
|
| 329 |
+
error_msg += f"Return Code: {result.returncode}\n"
|
| 330 |
+
error_msg += f"Command: {' '.join(cmd)}\n\n"
|
| 331 |
+
|
| 332 |
+
# Show STDERR (usually contains the actual error)
|
| 333 |
+
if result.stderr:
|
| 334 |
+
error_msg += f"=== STDERR (Error Output) ===\n"
|
| 335 |
+
error_msg += f"{result.stderr}\n\n"
|
| 336 |
+
else:
|
| 337 |
+
error_msg += f"⚠️ No STDERR output (process may have been killed silently)\n\n"
|
| 338 |
+
|
| 339 |
+
# Show STDOUT (may contain useful info)
|
| 340 |
+
if result.stdout:
|
| 341 |
+
error_msg += f"=== STDOUT (Standard Output) ===\n"
|
| 342 |
+
# Show last 5000 chars of stdout
|
| 343 |
+
stdout_preview = result.stdout[-5000:] if len(result.stdout) > 5000 else result.stdout
|
| 344 |
+
error_msg += f"{stdout_preview}\n\n"
|
| 345 |
+
|
| 346 |
+
# Show log file content if available
|
| 347 |
if log_content:
|
| 348 |
+
error_msg += f"=== LOG FILE CONTENT ({log_file}) ===\n"
|
| 349 |
+
# Show last 5000 chars of log
|
| 350 |
+
log_preview = log_content[-5000:] if len(log_content) > 5000 else log_content
|
| 351 |
+
error_msg += f"{log_preview}\n\n"
|
|
|
|
| 352 |
else:
|
| 353 |
+
error_msg += f"⚠️ Log file not found at: {log_file}\n\n"
|
| 354 |
+
|
| 355 |
+
# Instructions on where to find logs
|
| 356 |
+
error_msg += f"{'='*80}\n"
|
| 357 |
+
error_msg += f"📋 HOW TO VIEW FULL LOGS:\n"
|
| 358 |
+
error_msg += f"{'='*80}\n"
|
| 359 |
+
error_msg += f"OPTION 1 - Space Files Tab (Recommended):\n"
|
| 360 |
+
error_msg += f" 1. Click 'Files' tab in your Space\n"
|
| 361 |
+
error_msg += f" 2. Navigate to: outputs/generation.log\n"
|
| 362 |
+
error_msg += f" 3. Click to view/download the full log\n\n"
|
| 363 |
+
error_msg += f"OPTION 2 - Space Logs Tab:\n"
|
| 364 |
+
error_msg += f" 1. Click 'Logs' tab in your Space\n"
|
| 365 |
+
error_msg += f" 2. Look for messages starting with '[sample.py]'\n"
|
| 366 |
+
error_msg += f" 3. Check for GPU abort or CUDA errors\n\n"
|
| 367 |
+
error_msg += f"Full log path: {log_file}\n"
|
| 368 |
+
error_msg += f"{'='*80}\n"
|
| 369 |
+
|
| 370 |
return None, error_msg
|
| 371 |
|
| 372 |
status_msg += "Generation complete. Looking for output...\n"
|