# Source: LVP/configurations/cluster/tianyuan_requeue.yaml
# Last commit: 142a1ac ("add demo", kiwhansong)
# Hydra defaults list: compose the shared `base_slurm` config first, then
# apply this file's own keys (`_self_`) last so they override the base.
defaults:
  - base_slurm
  - _self_
# Cluster-specific values. The key names match the `{...}` placeholders used
# in `launch_template` (presumably filled in by the job launcher -- confirm).
params:
  partition: kempner_requeue  # e.g. kempner_h100
  account: kempner_sham_lab  # e.g. kempner_sham_lab
  env_name: ei_world_model  # environment passed to `mamba activate`
  num_gpus: 4  # used for both --ntasks-per-node and the --gres GPU count
  # NOTE(review): the launch template in this file hard-codes
  # `--cpus-per-task=12` and never references `{num_cpus}`; 48 / 4 GPUs = 12,
  # so the two agree today but must be kept in sync by hand.
  num_cpus: 48
  memory: 256G
  # Slurm time limit in days-hours:minutes:seconds; quoted so YAML keeps it a
  # string.
  time: "3-00:00:00"
# Slurm batch script template, stored as a literal block scalar so newlines
# are preserved.
#   * `{name}`, `{log_dir}`, `{email}`, `{project_root}` and `{python_args}`
#     are not defined in this file; presumably supplied by `base_slurm` or the
#     launcher -- confirm.
#   * `${experiment.num_nodes}` uses OmegaConf interpolation syntax, unlike
#     the single-brace placeholders; presumably resolved when the config is
#     loaded -- confirm.
#   * The GPU type (nvidia_h100_80gb_hbm3) and `--cpus-per-task=12` are
#     hard-coded here rather than taken from `params`.
launch_template: |
  #!/bin/bash
  #SBATCH -J {name}
  #SBATCH -o {log_dir}/out_%j.out
  #SBATCH -e {log_dir}/error_%j.err
  #SBATCH --mail-user={email}
  #SBATCH --mail-type=FAIL
  #SBATCH --account={account}
  #SBATCH --partition={partition}
  #SBATCH --nodes=${experiment.num_nodes}
  #SBATCH --ntasks-per-node={num_gpus}
  #SBATCH --gres=gpu:nvidia_h100_80gb_hbm3:{num_gpus}
  #SBATCH --cpus-per-task=12
  #SBATCH --mem={memory}
  #SBATCH --time={time}
  # export NCCL_DEBUG=INFO
  # export PYTHONFAULTHANDLER=1
  cd {project_root}
  module load Mambaforge
  mamba deactivate
  mamba activate {env_name}
  module load cuda/12.4.1-fasrc01
  module load cudnn
  module load gcc/9.5.0-fasrc01
  srun python -m main {python_args}