#!/bin/bash
# Slurm batch job: runs tests/diagnose_xpu_leak.py for tests 1-4, one after
# another, on one node with a single GPU requested.
#
# Submit with sbatch from the directory that contains Logs/ and tests/: both
# paths below are relative, and a batch job starts in the submission directory
# unless --chdir is given. Logs/ should already exist at submission time
# (NOTE(review): Slurm is not expected to create missing output directories).
#
# Exit status: 0 when every test succeeds, 1 when the conda environment cannot
# be activated or when at least one test exits non-zero.
#SBATCH --job-name=xpu-leak-diag
#SBATCH --account=AIRR-P51-DAWN-GPU
#SBATCH --partition=pvc9
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=12
#SBATCH --time=01:00:00
#SBATCH --output=Logs/diagnose_leak_%j.out
#SBATCH --error=Logs/diagnose_leak_%j.err

# Strict mode (set -euo pipefail) is intentionally not enabled: the site
# module and conda initialisation scripts are sourced into this shell and may
# not be written to run under -u/-e. Critical steps are checked explicitly.

# Start from a clean module environment, then load the site default stack.
. /etc/profile.d/modules.sh
module purge
module load rhel9/default-dawn

# Activate the PyTorch XPU conda environment. Abort if that fails, otherwise
# the tests would silently run under whichever python happens to be on PATH.
source ~/miniconda3/etc/profile.d/conda.sh
if ! conda activate ~/rds/rds-airr-p51-TWhPgQVLKbA/Env/pub_env/pytorch-xpu; then
  printf 'error: could not activate the pytorch-xpu conda environment\n' >&2
  exit 1
fi

export CCL_WORKER_AFFINITY=auto
# NOTE(review): this path names "current-rhel8" while the module loaded above
# is rhel9/default-dawn; confirm this is the intended PMI2 library.
export I_MPI_PMI_LIBRARY=/usr/local/software/slurm/current-rhel8/lib/libpmi2.so

# Run all tests sequentially on a single XPU tile.
# A failing test does not stop the remaining ones (each is an independent
# diagnostic), but any failure is reflected in the job's exit status.
any_failed=0
for test_id in 1 2 3 4; do
  python tests/diagnose_xpu_leak.py --test "${test_id}" || {
    rc=$?
    printf 'diagnose_xpu_leak.py --test %s exited with status %s\n' \
      "${test_id}" "${rc}" >&2
    any_failed=1
  }
done

exit "${any_failed}"