#!/bin/bash -l
#SBATCH --job-name=ccl-stress
#SBATCH --account=AIRR-P51-DAWN-GPU
#SBATCH --partition=pvc9
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=12
#SBATCH --time=02:00:00
#SBATCH --output=Logs/test_ccl_%j.out
#SBATCH --error=Logs/test_ccl_%j.err
#SBATCH --exclude=pvc-s-135

# CCL Stress Test: isolate epoch-boundary DDP hangs on Intel XPU.
#
# 1 node x 4 XPU cards x 2 tiles/card = 8 XPU tiles (one task per tile).
# srun timeout = 1h (enough for the full test; if it hangs, the timeout kills
# it and the last logged "PHASE N START" reveals the failing phase).
#
# NOTE(review): the --output/--error paths are relative, so a Logs/ directory
# presumably has to exist in the submission directory -- confirm before use.

# The test is launched via a relative path (tests/test_ccl_stress.py), so a
# failed cd must abort the job instead of running from the wrong directory.
cd /rds/project/rds-TWhPgQVLKbA/Code/OmniMorph || {
  echo "ERROR: cannot cd to project directory" >&2
  exit 1
}

# --- Software environment ---
. /etc/profile.d/modules.sh
module purge
module load rhel9/default-dawn

source ~/miniconda3/etc/profile.d/conda.sh
# Abort if the environment cannot be activated; otherwise the test would run
# with whatever python happens to be on PATH.
conda activate ~/rds/rds-airr-p51-TWhPgQVLKbA/Env/pub_env/pytorch-xpu || {
  echo "ERROR: failed to activate conda environment" >&2
  exit 1
}

# --- CCL/MPI setup ---
# NOTE(review): this PMI library path says "rhel8" while the module loaded
# above is rhel9 -- confirm this is the intended library.
export I_MPI_PMI_LIBRARY=/usr/local/software/slurm/current-rhel8/lib/libpmi2.so
export I_MPI_HYDRA_BOOTSTRAP=slurm
export CCL_WORKER_AFFINITY=auto
# Match production training threshold
export CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD=100000

# --- XPU memory allocator ---
export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512

# --- Rendezvous endpoint ---
# Use the first hostname of the allocation as the master address. Assignment
# and export are separate so the command substitution is not hidden behind
# `export`, and an empty result is treated as a hard error.
MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n1)
if [[ -z "$MASTER_ADDR" ]]; then
  echo "ERROR: could not determine MASTER_ADDR from SLURM_NODELIST" >&2
  exit 1
fi
export MASTER_ADDR
export MASTER_PORT=12355

SRUN_TIMEOUT=3600 # 1h timeout for the test

# --- Job banner ---
cat <<EOF
============================================
CCL Stress Test
============================================
Job ID: ${SLURM_JOB_ID}
Nodes: ${SLURM_NNODES}
Tasks/node: ${SLURM_NTASKS_PER_NODE}
Total tasks: ${SLURM_NTASKS}
CPUs/task: ${SLURM_CPUS_PER_TASK}
Master: ${MASTER_ADDR}:${MASTER_PORT}
Node list: ${SLURM_NODELIST}
Walltime: 02:00:00
Timeout/srun: ${SRUN_TIMEOUT}s
CCL_ZE_CACHE: ${CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD}
============================================
Start time: $(date)
============================================
EOF
# Launch one task per XPU tile under a wall-clock limit. Each task derives the
# rank variables read by the test from its own Slurm task environment, which
# is why they are set inside the per-task shell rather than out here.
# MASTER_ADDR / MASTER_PORT are spliced into the command string from the
# submitting shell (the '"$VAR"' sequences close and reopen the single quotes).
timeout "$SRUN_TIMEOUT" srun --kill-on-bad-exit=1 bash -c '
  export LOCAL_RANK=$SLURM_LOCALID
  export RANK=$SLURM_PROCID
  export WORLD_SIZE=$SLURM_NTASKS
  export MASTER_ADDR='"$MASTER_ADDR"'
  export MASTER_PORT='"$MASTER_PORT"'
  python tests/test_ccl_stress.py \
    --spatial-size 64 \
    --batch-size 2 \
    --phase1-steps 200 \
    --phase3-steps 10 \
    --phase4-steps 10 \
    --phase5-broadcasts 50
'
# Capture immediately: any later command would overwrite $?.
EXIT_CODE=$?

# --- Result summary ---
echo "============================================"
echo "End time: $(date)"
echo "Exit code: $EXIT_CODE"
case "$EXIT_CODE" in
  0)
    echo "Result: ALL PHASES PASSED"
    ;;
  124)
    # 124 is the status `timeout` reports when the time limit was reached.
    echo "Result: TIMEOUT (CCL hang detected)"
    echo " Check the last 'PHASE N START' in the log to identify the hanging phase."
    ;;
  *)
    echo "Result: FAILED (exit code $EXIT_CODE)"
    ;;
esac
echo "============================================"

# Propagate the test status so the batch job itself is recorded as failed on
# a failure or timeout, instead of always finishing with the status of echo.
exit "$EXIT_CODE"