#!/bin/bash -l
#
# CCL multi-node stress test launcher (Slurm batch script).
#
# Steps:
#   1. Enter the project directory and load the module / conda environment.
#   2. Export the Intel MPI, oneCCL and PyTorch allocator settings.
#   3. Derive the rendezvous address (first host of the allocation) and port.
#   4. Run tests/test_ccl_stress.py on every Slurm task through srun, bounded
#      by a wall-clock timeout so that a hung collective ends the step.
#   5. Print a summary and exit with the status of the timed srun step
#      (124 means the timeout fired).
#
# NOTE(review): no #SBATCH directives are present in this file, so nodes,
# tasks per node and walltime have to be supplied on the sbatch command line
# (or the directives restored directly below the shebang). The "Walltime"
# banner line further down is a hard-coded string, not a value read from Slurm.

# Run from the project root: the test script path below is relative.
cd /rds/project/rds-TWhPgQVLKbA/Code/OmniMorph || {
  echo "ERROR: cannot cd to project directory" >&2
  exit 1
}

# --- Environment modules -----------------------------------------------------
. /etc/profile.d/modules.sh
module purge
module load rhel9/default-dawn

# --- Conda environment -------------------------------------------------------
source ~/miniconda3/etc/profile.d/conda.sh
conda activate ~/rds/rds-airr-p51-TWhPgQVLKbA/Env/pub_env/pytorch-xpu || {
  echo "ERROR: failed to activate conda environment" >&2
  exit 1
}

# --- Intel MPI / oneCCL settings ----------------------------------------------
export I_MPI_PMI_LIBRARY=/usr/local/software/slurm/current-rhel8/lib/libpmi2.so
export I_MPI_HYDRA_BOOTSTRAP=slurm
export CCL_WORKER_AFFINITY=auto
export CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD=100000

# --- PyTorch allocator settings -----------------------------------------------
export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512

# --- Rendezvous endpoint ------------------------------------------------------
# The first hostname in the allocation acts as the master. Assignment is kept
# separate from `export` so a failing command substitution is not masked.
MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n1)
if [[ -z "$MASTER_ADDR" ]]; then
  echo "ERROR: could not determine MASTER_ADDR from SLURM_NODELIST='${SLURM_NODELIST:-}'" >&2
  exit 1
fi
export MASTER_ADDR
export MASTER_PORT=12355

# Upper bound, in seconds, for the srun step.
SRUN_TIMEOUT=3600

# --- Job banner -----------------------------------------------------------------
echo "============================================"
echo "CCL Multi-Node Stress Test"
echo "============================================"
echo "Job ID: $SLURM_JOB_ID"
echo "Nodes: $SLURM_NNODES"
echo "Tasks/node: $SLURM_NTASKS_PER_NODE"
echo "Total tasks: $SLURM_NTASKS"
echo "CPUs/task: $SLURM_CPUS_PER_TASK"
echo "Master: $MASTER_ADDR:$MASTER_PORT"
echo "Node list: $SLURM_NODELIST"
echo "Walltime: 02:00:00"
echo "Timeout/srun: ${SRUN_TIMEOUT}s"
echo "CCL_ZE_CACHE: $CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD"
echo "============================================"
echo "Start time: $(date)"
echo "============================================"

# --- Launch ---------------------------------------------------------------------
# The command string is single-quoted so the SLURM_* variables are expanded by
# each task's own shell (per-rank values). MASTER_ADDR / MASTER_PORT are
# spliced in from this shell by briefly closing and reopening the quotes.
timeout "$SRUN_TIMEOUT" srun --kill-on-bad-exit=1 bash -c '
  export LOCAL_RANK=$SLURM_LOCALID
  export RANK=$SLURM_PROCID
  export WORLD_SIZE=$SLURM_NTASKS
  export MASTER_ADDR='"$MASTER_ADDR"'
  export MASTER_PORT='"$MASTER_PORT"'
  python tests/test_ccl_stress.py \
    --spatial-size 64 \
    --batch-size 2 \
    --phase1-steps 200 \
    --phase3-steps 10 \
    --phase4-steps 10 \
    --phase5-broadcasts 50
'
EXIT_CODE=$?

# --- Summary --------------------------------------------------------------------
echo "============================================"
echo "End time: $(date)"
echo "Exit code: $EXIT_CODE"
if (( EXIT_CODE == 0 )); then
  echo "Result: ALL PHASES PASSED"
  echo " Multi-node CCL hang NOT reproduced with simple model."
  echo " The issue may require the actual OM_net architecture or training loop specifics."
elif (( EXIT_CODE == 124 )); then
  # 124 is the status `timeout` returns when the time limit was reached.
  echo "Result: TIMEOUT (CCL hang detected)"
  echo " Check the last 'PHASE N START' in the log to identify the hanging phase."
  echo " This confirms the hang is in the CCL inter-node (OFI) transport."
else
  echo "Result: FAILED (exit code $EXIT_CODE)"
fi
echo "============================================"

# Propagate the step's status so the batch job does not report success after a
# failed or timed-out run.
exit "$EXIT_CODE"