File size: 2,945 Bytes
2af0e94 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | #!/bin/bash -l
#SBATCH --job-name=ccl-stress
#SBATCH --account=AIRR-P51-DAWN-GPU
#SBATCH --partition=pvc9
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=12
#SBATCH --time=02:00:00
#SBATCH --output=Logs/test_ccl_%j.out
#SBATCH --error=Logs/test_ccl_%j.err
#SBATCH --exclude=pvc-s-135
# CCL Stress Test: isolate epoch-boundary DDP hangs on Intel XPU.
#
# 1 node x 4 XPU cards x 2 tiles/card = 8 XPU tiles
# srun timeout = 1h (enough for the full test; if it hangs, the timeout kills it
# and the last logged "PHASE N START" reveals the failing phase).
# --- Working directory and software environment ---
# Run from the repository root: the launch command further down uses the
# relative path tests/test_ccl_stress.py, so a failed cd must abort the job
# instead of letting it continue from the wrong directory.
cd /rds/project/rds-TWhPgQVLKbA/Code/OmniMorph || {
  echo "ERROR: cannot cd to /rds/project/rds-TWhPgQVLKbA/Code/OmniMorph" >&2
  exit 1
}
# Initialise environment modules, drop anything inherited, then load the
# site default module set.
. /etc/profile.d/modules.sh
module purge
module load rhel9/default-dawn
# Source conda's shell integration, then activate the environment by path.
# Abort on failure so the test never runs under an unintended Python.
source ~/miniconda3/etc/profile.d/conda.sh
conda activate ~/rds/rds-airr-p51-TWhPgQVLKbA/Env/pub_env/pytorch-xpu || {
  echo "ERROR: failed to activate conda environment ~/rds/rds-airr-p51-TWhPgQVLKbA/Env/pub_env/pytorch-xpu" >&2
  exit 1
}
# --- CCL/MPI setup ---
# NOTE(review): this PMI2 library lives under "current-rhel8" while the job
# loads the rhel9/default-dawn module -- confirm the path is the intended one.
export I_MPI_PMI_LIBRARY=/usr/local/software/slurm/current-rhel8/lib/libpmi2.so
export I_MPI_HYDRA_BOOTSTRAP=slurm
export CCL_WORKER_AFFINITY=auto
# Match production training threshold
export CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD=100000
# --- XPU memory allocator ---
export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
# --- Rendezvous setup ---
# The first hostname of the allocation acts as the master.  Assignment and
# export are separate statements so a failed lookup is not masked by
# `export`, and an empty address aborts the job here rather than surfacing
# later as a rendezvous failure.
MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n1)
if [[ -z "$MASTER_ADDR" ]]; then
  echo "ERROR: could not resolve MASTER_ADDR from SLURM_NODELIST='${SLURM_NODELIST:-}'" >&2
  exit 1
fi
export MASTER_ADDR
export MASTER_PORT=12355
SRUN_TIMEOUT=3600 # 1h timeout for the test
# Print the run banner (job geometry, rendezvous endpoint, timeout, CCL
# threshold and start time) in a single here-document.  The delimiter is
# unquoted on purpose so variables and $(date) are expanded.
cat <<EOF
============================================
CCL Stress Test
============================================
Job ID: $SLURM_JOB_ID
Nodes: $SLURM_NNODES
Tasks/node: $SLURM_NTASKS_PER_NODE
Total tasks: $SLURM_NTASKS
CPUs/task: $SLURM_CPUS_PER_TASK
Master: $MASTER_ADDR:$MASTER_PORT
Node list: $SLURM_NODELIST
Walltime: 02:00:00
Timeout/srun: ${SRUN_TIMEOUT}s
CCL_ZE_CACHE: $CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD
============================================
Start time: $(date)
============================================
EOF
# Launch the stress test: one Python process per Slurm task, with the whole
# srun step bounded by SRUN_TIMEOUT seconds (timeout exits 124 on expiry).
#
# srun options are collected in an array.  Since Slurm 22.05 srun no longer
# inherits --cpus-per-task from sbatch, so the value is forwarded explicitly
# whenever the batch environment provides it.
srun_args=(--kill-on-bad-exit=1)
if [[ -n "${SLURM_CPUS_PER_TASK:-}" ]]; then
  srun_args+=(--cpus-per-task="$SLURM_CPUS_PER_TASK")
fi

# The per-task script is single-quoted so the SLURM_* variables are expanded
# inside each task, not by this batch shell.  MASTER_ADDR and MASTER_PORT are
# passed as positional parameters ($1, $2) instead of being spliced into the
# script text, so their values are never re-parsed as shell code.
# This must remain the last command of the block: the statement that follows
# reads its exit status from $?.
timeout "$SRUN_TIMEOUT" srun "${srun_args[@]}" bash -c '
export LOCAL_RANK=$SLURM_LOCALID
export RANK=$SLURM_PROCID
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_ADDR="$1"
export MASTER_PORT="$2"
python tests/test_ccl_stress.py \
  --spatial-size 64 \
  --batch-size 2 \
  --phase1-steps 200 \
  --phase3-steps 10 \
  --phase4-steps 10 \
  --phase5-broadcasts 50
' ccl-stress "$MASTER_ADDR" "$MASTER_PORT"
# Capture the status of the launch command immediately, before any other
# command overwrites $?.
EXIT_CODE=$?
echo "============================================"
echo "End time: $(date)"
echo "Exit code: $EXIT_CODE"
# Classify the outcome: 0 = success, 124 = `timeout` expired, anything else
# is reported as a plain failure.
case "$EXIT_CODE" in
  0)
    echo "Result: ALL PHASES PASSED"
    ;;
  124)
    echo "Result: TIMEOUT (CCL hang detected)"
    echo " Check the last 'PHASE N START' in the log to identify the hanging phase."
    ;;
  *)
    echo "Result: FAILED (exit code $EXIT_CODE)"
    ;;
esac
echo "============================================"
# Propagate the test status as the batch script's own exit code.  Without
# this the script would end on a successful `echo` and always exit 0, even
# after a failure or timeout.
exit "$EXIT_CODE"
|