File size: 2,646 Bytes
2d8a802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env bash
#
# Debug script: run all debug tests in sequence and report on each one.

# Strict mode: stop on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Debug stages, listed in the order they are executed.
scripts=(
    "debug-build-1-env.sh"
    "debug-build-2-hipcc.sh"
    "debug-build-3-torch-ext.sh"
    "debug-build-4-megablocks.sh"
)

printf '%s\n' \
    "=== MegaBlocks Build Debugging Suite ===" \
    "Running progressive debug tests to identify build hang issue" \
    "ROCm installation: /opt/rocm-7.0.1" \
    ""

# Make all scripts executable. The names are relative, so they are resolved
# against the current working directory.
chmod +x "${scripts[@]}"

# One human-readable outcome line is appended here per stage.
results=()
# Epoch seconds at which the suite started; used for the total runtime.
start_time=$(date +%s)

# Run every stage in order. A failing stage is recorded in `results` and the
# loop carries on with the next one instead of ending the suite.
for script in "${scripts[@]}"; do
    printf '\n%s\n' "========================================"
    printf 'Running %s\n' "$script"
    printf '%s\n' "========================================"

    began=$(date +%s)

    # Capture the exit status through `||` so that a non-zero status is
    # handled here rather than aborting the script under `set -e`.
    status=0
    ./"$script" || status=$?

    # Elapsed wall-clock time for this stage, in whole seconds.
    elapsed=$(($(date +%s) - began))

    if ((status == 0)); then
        printf '%s\n' "✓ $script completed successfully in ${elapsed}s"
        results+=("✓ $script: SUCCESS (${elapsed}s)")
    else
        printf '%s\n' "✗ $script failed in ${elapsed}s"
        results+=("✗ $script: FAILED (${elapsed}s)")
    fi

    printf '%s\n' "----------------------------------------"
done

# Total wall-clock time of the suite, measured from `start_time`.
end_time=$(date +%s)
total_duration=$((end_time - start_time))

printf '%s\n' \
    "" \
    "========================================" \
    "SUMMARY REPORT" \
    "========================================" \
    "Total runtime: ${total_duration}s" \
    ""

# One line per stage, in the order the outcomes were recorded.
for line in "${results[@]}"; do
    printf '%s\n' "$line"
done

# Print static troubleshooting guidance. The quoted 'EOF' delimiter disables
# all expansion, so every line below is emitted literally. The stage names in
# the Analysis list match the file names this suite runs (debug-build-N-*.sh),
# so they can be copied straight into a shell.
cat <<'EOF'

=== Analysis ===
1. If debug-build-1-env.sh fails: ROCm installation/environment issue
2. If debug-build-2-hipcc.sh fails: HIP compiler issue
3. If debug-build-3-torch-ext.sh hangs: PyTorch extension compilation issue
4. If debug-build-4-megablocks.sh hangs: MegaBlocks-specific compilation issue

=== Next Steps Based on Results ===
- If all pass: The issue may be intermittent or environment-specific
- If script 3 or 4 hangs: Run with strace to see where it hangs:
  strace -f -e trace=process,signal python3 build.py
- Check compilation log files in .torch_extensions for more details
- Consider using PYTORCH_JIT_LOG_LEVEL=1 for more verbose output

=== Additional Debugging Commands ===
# Check for stuck processes:
ps aux | grep -E '(hipcc|hip-clang|python)'

# Monitor system resources during build:
htop

# Check for device issues:
dmesg | tail -20

# Force clean rebuild:
rm -rf .torch_extensions* && ./build.sh

Debug suite complete.
EOF