#!/bin/bash -l
#SBATCH --job-name=om-stratB
#SBATCH --account=AIRR-P51-DAWN-GPU
#SBATCH --partition=pvc9
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=12
#SBATCH --time=36:00:00
#SBATCH --output=Logs/stratB_%j.out
#SBATCH --error=Logs/stratB_%j.err
# NOTE: every #SBATCH directive must stay on its own line, directly after the
# shebang and before the first executable command; sbatch stops parsing
# directives at the first non-comment line.
# NOTE(review): the Logs/ path above is relative to the submission directory;
# presumably it has to exist before `sbatch` is run — confirm.

# Strategy B: Validation diagnostic — run WITHOUT proactive restart to validate
# memory stability and all 3 training modes (diffusion + contrastive + registration).
# Uses real data so registration can activate (requires loss_gen_a < -0.6).
# --no-save prevents checkpoint writes to avoid contaminating Strategy A.
# NOTE(review): the python invocation in this script does not pass --no-save;
# confirm whether the flag was dropped by accident before relying on the
# sentence above.
# 1 node x 8 XPU tiles = 8 tiles.

# Abort early if the project directory is unavailable: the training script and
# its config are referenced by relative path further down.
cd /rds/project/rds-TWhPgQVLKbA/Code/OmniMorph || {
  echo "ERROR: cannot cd to /rds/project/rds-TWhPgQVLKbA/Code/OmniMorph" >&2
  exit 1
}

# --- Modules and Python environment ---
. /etc/profile.d/modules.sh
module purge
module load rhel9/default-dawn

source ~/miniconda3/etc/profile.d/conda.sh
conda activate ~/rds/rds-airr-p51-TWhPgQVLKbA/Env/pub_env/pytorch-xpu || {
  echo "ERROR: failed to activate the pytorch-xpu conda environment" >&2
  exit 1
}

# --- CCL/MPI setup ---
# NOTE(review): this path points at "current-rhel8" while the module loaded
# above is rhel9/default-dawn — confirm this is the intended PMI library.
export I_MPI_PMI_LIBRARY=/usr/local/software/slurm/current-rhel8/lib/libpmi2.so
export I_MPI_HYDRA_BOOTSTRAP=slurm
export CCL_WORKER_AFFINITY=auto
# Increase IPC handle cache to avoid driver segfault after ~400 DDP steps.
# Default 1000 handles fills up, eviction triggers GPU segfault (drm_neo.cpp:288).
export CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD=10000

# --- XPU memory allocator ---
export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512

# --- Multi-node setup ---
# The first hostname of the allocation acts as the rendezvous address.
# Assignment and export are kept separate so an empty result is detected here
# instead of being hidden behind the exit status of `export`.
MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n1)
if [[ -z "$MASTER_ADDR" ]]; then
  echo "ERROR: could not resolve MASTER_ADDR from SLURM_NODELIST='${SLURM_NODELIST:-}'" >&2
  exit 1
fi
export MASTER_ADDR
export MASTER_PORT=12356

echo "============================================"
echo "STRATEGY B: Leak rate diagnostic (no restart)"
echo "Job ID: $SLURM_JOB_ID"
echo "Nodes: $SLURM_NNODES"
echo "Tasks/node: $SLURM_NTASKS_PER_NODE"
echo "Total tasks: $SLURM_NTASKS"
echo "Master: $MASTER_ADDR:$MASTER_PORT"
echo "============================================"

# UpsampleConv has zero memory leak — no proactive restart needed.
# Runs continuously until walltime or completion.
# Launch one training process per SLURM task. Each task derives its
# distributed-training identity (LOCAL_RANK / RANK / WORLD_SIZE) from the
# per-task SLURM_* variables, which are only known inside the srun step, so
# the inner script is single-quoted to defer their expansion. MASTER_ADDR and
# MASTER_PORT are spliced in from the batch shell by briefly closing the
# single quotes.
# --kill-on-bad-exit=1 asks srun to terminate the whole step as soon as any
# task exits non-zero.
# NOTE(review): --max-steps-before-restart 0 presumably disables the proactive
# restart; confirm against the OM_train_3modes.py argument parser.
# NOTE(review): the Strategy B description says --no-save is used to avoid
# checkpoint writes, but it is not passed here — confirm whether it should be.
srun --kill-on-bad-exit=1 bash -c '
  export LOCAL_RANK=$SLURM_LOCALID
  export RANK=$SLURM_PROCID
  export WORLD_SIZE=$SLURM_NTASKS
  export MASTER_ADDR='"$MASTER_ADDR"'
  export MASTER_PORT='"$MASTER_PORT"'
  python OM_train_3modes.py -C Config/config_om.yaml --batchsize 2 --max-steps-before-restart 0
'
EXIT_CODE=$?

echo "=== Strategy B finished with exit code $EXIT_CODE at $(date) ==="

# Propagate the training status: without this the script's last command is the
# echo above, so the batch job would always exit 0 and be recorded as
# COMPLETED even when training failed.
exit "$EXIT_CODE"