#!/usr/bin/env bash
#
# Debug script: run all MegaBlocks build debug tests in sequence.
#
# Usage:
#   Run from the directory that contains the debug-build-*.sh scripts;
#   they are invoked relative to the current working directory.
#
# Behavior:
#   Each script in `scripts` is executed in order and timed. A failing
#   script is recorded and the suite keeps going, so the summary report
#   always covers every stage.
#
# Exit status:
#   0 if every debug script succeeded, 1 if any script failed or a
#   required script is missing.

set -euo pipefail

echo "=== MegaBlocks Build Debugging Suite ==="
echo "Running progressive debug tests to identify build hang issue"
echo "ROCm installation: /opt/rocm-7.0.1"
echo

# Single source of truth for the stages; the chmod call, the run loop and
# the analysis text below are all derived from this list.
scripts=(
  "debug-build-1-env.sh"
  "debug-build-2-hipcc.sh"
  "debug-build-3-torch-ext.sh"
  "debug-build-4-megablocks.sh"
)

# Fail early with a clear message instead of a bare chmod error when the
# suite is started from the wrong directory.
for script in "${scripts[@]}"; do
  if [[ ! -f "$script" ]]; then
    printf 'error: %s not found in %s; run this suite from the directory containing the debug scripts\n' \
      "$script" "$PWD" >&2
    exit 1
  fi
done

# Make all scripts executable
chmod +x -- "${scripts[@]}"

results=()
failed=0
start_time=$(date +%s)

for script in "${scripts[@]}"; do
  echo
  echo "========================================"
  echo "Running $script"
  echo "========================================"

  script_start=$(date +%s)

  # Capture the exit status without tripping `set -e`, so a failing stage
  # is reported and the remaining stages still run.
  rc=0
  ./"$script" || rc=$?

  script_end=$(date +%s)
  duration=$((script_end - script_start))

  if ((rc == 0)); then
    echo "✓ $script completed successfully in ${duration}s"
    results+=("✓ $script: SUCCESS (${duration}s)")
  else
    echo "✗ $script failed in ${duration}s"
    results+=("✗ $script: FAILED (${duration}s)")
    failed=$((failed + 1))
  fi

  echo "----------------------------------------"
done

end_time=$(date +%s)
total_duration=$((end_time - start_time))

echo
echo "========================================"
echo "SUMMARY REPORT"
echo "========================================"
echo "Total runtime: ${total_duration}s"
echo

for result in "${results[@]}"; do
  echo "$result"
done

echo
echo "=== Analysis ==="
# Script names come from the array so the hints always match what was run.
echo "1. If ${scripts[0]} fails: ROCm installation/environment issue"
echo "2. If ${scripts[1]} fails: HIP compiler issue"
echo "3. If ${scripts[2]} hangs: PyTorch extension compilation issue"
echo "4. If ${scripts[3]} hangs: MegaBlocks-specific compilation issue"

echo
echo "=== Next Steps Based on Results ==="
echo "- If all pass: The issue may be intermittent or environment-specific"
echo "- If script 3 or 4 hangs: Run with strace to see where it hangs:"
echo " strace -f -e trace=process,signal python3 build.py"
echo "- Check compilation log files in .torch_extensions for more details"
echo "- Consider using PYTORCH_JIT_LOG_LEVEL=1 for more verbose output"

echo
echo "=== Additional Debugging Commands ==="
echo "# Check for stuck processes:"
echo "ps aux | grep -E '(hipcc|hip-clang|python)'"
echo
echo "# Monitor system resources during build:"
echo "htop"
echo
echo "# Check for device issues:"
echo "dmesg | tail -20"
echo
echo "# Force clean rebuild:"
echo "rm -rf .torch_extensions* && ./build.sh"

echo
echo "Debug suite complete."

# Surface stage failures to callers (CI, wrapper scripts) only after the
# full report has been printed.
if ((failed > 0)); then
  exit 1
fi