#!/bin/bash -l
#
# Slurm batch job: run the XPU leak diagnostics (tests/diagnose_xpu_leak.py,
# tests 1-4) one after another on a single GPU allocation.
#
# Usage:   sbatch <this-script>     (submit from the repository root)
# Notes:   - The python script path and the Logs/ output paths are relative,
#            so they resolve against the job's working directory.
#          - Make sure the Logs/ directory exists before submitting; the
#            stdout/stderr files below are opened inside it.
# Exit:    0 if every test succeeded, 1 if environment setup failed or at
#          least one test returned a non-zero status.
#
#SBATCH --job-name=xpu-leak-diag
#SBATCH --account=AIRR-P51-DAWN-GPU
#SBATCH --partition=pvc9
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=12
#SBATCH --time=01:00:00
#SBATCH --output=Logs/diagnose_leak_%j.out
#SBATCH --error=Logs/diagnose_leak_%j.err

# Print an error message to stderr and abort the job.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# --- Environment setup -------------------------------------------------------
# Each step is checked explicitly: if the environment is not what we expect,
# the diagnostics below would run against the wrong Python/runtime and their
# results would be misleading.
. /etc/profile.d/modules.sh || die "cannot source /etc/profile.d/modules.sh"
module purge
module load rhel9/default-dawn || die "module load rhel9/default-dawn failed"

source ~/miniconda3/etc/profile.d/conda.sh \
  || die "cannot source conda.sh from ~/miniconda3"
conda activate ~/rds/rds-airr-p51-TWhPgQVLKbA/Env/pub_env/pytorch-xpu \
  || die "conda activate of the pytorch-xpu environment failed"

export CCL_WORKER_AFFINITY=auto
# NOTE(review): this path says "current-rhel8" while the module loaded above is
# rhel9/default-dawn — confirm this is the intended libpmi2.so for this node.
export I_MPI_PMI_LIBRARY=/usr/local/software/slurm/current-rhel8/lib/libpmi2.so

# --- Diagnostics -------------------------------------------------------------
# Run all tests sequentially on a single XPU tile. A failing test does not
# stop the remaining ones (each is an independent diagnostic), but it is
# recorded so the job's exit status reflects it instead of only reporting the
# status of the last test.
failed_tests=()
for test_id in 1 2 3 4; do
  rc=0
  python tests/diagnose_xpu_leak.py --test "${test_id}" || rc=$?
  if (( rc != 0 )); then
    printf 'diagnose_xpu_leak.py --test %s exited with status %d\n' \
      "${test_id}" "${rc}" >&2
    failed_tests+=("${test_id}")
  fi
done

if (( ${#failed_tests[@]} > 0 )); then
  die "failed test(s): ${failed_tests[*]}"
fi