#!/bin/bash
# Slurm batch job: runs peptide/new_coupling.py as a single task on one GPU
# of the dgx-b200 partition.
#
# Usage: sbatch <this script>
#   - Submit from the directory that contains ./peptide; by default the job
#     starts in the submission directory, and all paths below are relative.
#   - Slurm does not create missing directories for --output/--error, so
#     ./outs must already exist when the job is submitted.
#
#SBATCH --job-name=new_coupling          # Name of the job
#SBATCH --output=outs/new_coupling.out   # Stdout goes to outs/new_coupling.out (overwritten by each run)
#SBATCH --error=outs/new_coupling.err    # Stderr goes to outs/new_coupling.err (overwritten by each run)
#SBATCH --partition=dgx-b200             # Queue to submit to
#SBATCH --ntasks=1                       # Number of tasks (one python process)
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --ntasks-per-node=1              # Consistent with --ntasks=1 (--ntasks takes precedence over a larger value here)
#SBATCH --mem-per-gpu=128G
#SBATCH --cpus-per-gpu=8

# Abort on errors, unset variables, and failures inside pipelines.
set -euo pipefail

# Size the OpenMP thread pool to the CPUs requested for the single GPU
# instead of a hard-coded value larger than the allocation, which would
# oversubscribe the cores. Falls back to 8 (the --cpus-per-gpu value above)
# when the script is run outside of Slurm.
export OMP_NUM_THREADS="${SLURM_CPUS_PER_GPU:-8}"

# NCCL / UCX communication tuning.
# NOTE(review): with a single task on a single GPU these are presumably not
# exercised; kept unchanged for runs that are scaled up - confirm.
# export NCCL_DEBUG=INFO                 # Uncomment for verbose NCCL logging
export NCCL_NVLS_ENABLE=1                # Enable NVLink SHARP (NVLS)
export NCCL_IB_ADAPTIVE_ROUTING=1        # Enable InfiniBand adaptive routing
export NCCL_IB_SL=1                      # InfiniBand service level
export NCCL_IB_QPS_PER_CONNECTION=2      # Queue pairs per connection
export NCCL_IB_SPLIT_DATA_ON_QPS=0       # Do not split each message across the queue pairs
# Host channel adapters NCCL may use; the device names are specific to the
# cluster nodes - TODO confirm they match the dgx-b200 hardware.
export NCCL_IB_HCA=mlx5_15,mlx5_10,mlx5_14,mlx5_13,mlx5_8,mlx5_7,mlx5_9,mlx5_4
export NCCL_SOCKET_IFNAME=bond0          # IP interface used for NCCL socket traffic
export NCCL_ALGO=RING                    # Restrict NCCL to the ring algorithm
export UCX_TLS=rc                        # Restrict UCX to the RC (reliable connection) transport

# Launch the job's payload; its exit status becomes the job's exit status.
python ./peptide/new_coupling.py \
  --checkpoint ./peptide/ckpt/PepReDi_v2.pt \
  --version 3 \
  --gen_steps 16