Omini3D / bash_train_stratB.sh
maxmo2009's picture
Sync from local: code + epoch-110 checkpoint, clean README
2af0e94 verified
#!/bin/bash -l
# Slurm batch script for the "om-stratB" training job.
# The -l on the shebang asks bash to start as a login shell.
# The #SBATCH lines are read by sbatch at submission time (bash treats them as
# comments); they must stay ahead of the first executable command.
#
# Accounting and queue.
#SBATCH --job-name=om-stratB
#SBATCH --account=AIRR-P51-DAWN-GPU
#SBATCH --partition=pvc9
# Resources: one node, 4 GPUs (gres), 8 tasks with 12 CPUs each.
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=12
# Wall-time limit: 36 hours.
#SBATCH --time=36:00:00
# stdout/stderr go to separate files; %j is replaced by the job ID.
# The paths are relative, and Slurm does not create missing directories, so
# Logs/ must already exist in the directory the job is submitted from.
#SBATCH --output=Logs/stratB_%j.out
#SBATCH --error=Logs/stratB_%j.err
# Strategy B: Validation diagnostic — run WITHOUT proactive restart to validate
# memory stability and all 3 training modes (diffusion + contrastive + registration).
# Uses real data so registration can activate (requires loss_gen_a < -0.6).
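# --no-save is meant to prevent checkpoint writes, so that this run cannot
# contaminate Strategy A.
# NOTE(review): the python command at the bottom of this script does not pass
# --no-save; either add the flag there or drop this claim.
# Layout: 1 node x 8 XPU tiles = 8 tiles (matches --ntasks-per-node=8).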
# Work from the project checkout: the training script and its config are
# addressed with relative paths later on, so a failed cd must stop the job
# instead of letting it continue in whatever directory it started in.
cd /rds/project/rds-TWhPgQVLKbA/Code/OmniMorph || {
  echo "ERROR: cannot cd to /rds/project/rds-TWhPgQVLKbA/Code/OmniMorph" >&2
  exit 1
}

# Define the `module` command, drop any inherited modules, then load the
# site's default module set.
. /etc/profile.d/modules.sh
module purge
module load rhel9/default-dawn

# Load conda's shell integration and activate the training environment.
# Abort on failure so the job never starts with the wrong Python on PATH.
source ~/miniconda3/etc/profile.d/conda.sh || {
  echo "ERROR: cannot source ~/miniconda3/etc/profile.d/conda.sh" >&2
  exit 1
}
conda activate ~/rds/rds-airr-p51-TWhPgQVLKbA/Env/pub_env/pytorch-xpu || {
  echo "ERROR: conda activate failed for pub_env/pytorch-xpu" >&2
  exit 1
}
# --- CCL/MPI setup ---
# Intel MPI: use the PMI2 library from the Slurm install and bootstrap through
# Slurm. oneCCL: worker affinity is left on "auto".
# NOTE(review): the PMI2 path is under "current-rhel8" while the modules loaded
# by this script are rhel9 ones; confirm this is the intended library.
export \
  I_MPI_PMI_LIBRARY=/usr/local/software/slurm/current-rhel8/lib/libpmi2.so \
  I_MPI_HYDRA_BOOTSTRAP=slurm \
  CCL_WORKER_AFFINITY=auto

# Raise the IPC handle cache threshold to 10000. With the default of 1000 the
# cache fills after ~400 DDP steps, and the resulting eviction triggers a GPU
# driver segfault (drm_neo.cpp:288).
export CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD=10000

# --- XPU memory allocator ---
# Allocator options handed to PyTorch: expandable segments on, split size
# capped at 512 MB.
export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
# --- Rendezvous endpoint (MASTER_ADDR / MASTER_PORT) ---
# MASTER_ADDR is the first hostname of the job's node list; MASTER_PORT is fixed.
# The assignment is kept separate from `export`: `export VAR=$(cmd)` reports
# export's own (zero) status and would hide a failing lookup. The pipeline's
# status is that of `head`, so the result is validated by checking for an
# empty value instead.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_NODELIST" | head -n1)
if [[ -z "$MASTER_ADDR" ]]; then
  echo "ERROR: could not derive MASTER_ADDR from SLURM_NODELIST='${SLURM_NODELIST:-}'" >&2
  exit 1
fi
export MASTER_ADDR
export MASTER_PORT=12356
# Job banner: record the Slurm job geometry and the rendezvous endpoint at the
# top of the job's stdout. One printf call emits one line per argument.
printf '%s\n' \
  "============================================" \
  "STRATEGY B: Leak rate diagnostic (no restart)" \
  "Job ID: $SLURM_JOB_ID" \
  "Nodes: $SLURM_NNODES" \
  "Tasks/node: $SLURM_NTASKS_PER_NODE" \
  "Total tasks: $SLURM_NTASKS" \
  "Master: $MASTER_ADDR:$MASTER_PORT" \
  "============================================"
# UpsampleConv has zero memory leak — no proactive restart needed.
# Runs continuously until walltime or completion
# (--max-steps-before-restart 0).
#
# srun starts one bash per Slurm task. The single-quoted script is evaluated
# inside each task, so the SLURM_* variables expand per task there and are
# mapped onto LOCAL_RANK / RANK / WORLD_SIZE. MASTER_ADDR and MASTER_PORT are
# spliced in from this batch shell by briefly closing the single quotes.
# --kill-on-bad-exit=1 terminates the whole step as soon as any task exits
# non-zero.
# NOTE(review): the header comment of this script mentions --no-save, but the
# flag is not passed here; confirm which is intended.
srun --kill-on-bad-exit=1 bash -c '
export LOCAL_RANK=$SLURM_LOCALID
export RANK=$SLURM_PROCID
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_ADDR='"$MASTER_ADDR"'
export MASTER_PORT='"$MASTER_PORT"'
python OM_train_3modes.py -C Config/config_om.yaml --batchsize 2 --max-steps-before-restart 0
'
EXIT_CODE=$?
echo "=== Strategy B finished with exit code $EXIT_CODE at $(date) ==="
# Hand srun's status back to Slurm. Without this the script ends with the
# status of the echo above (0), and a failed run is recorded as COMPLETED.
exit "$EXIT_CODE"