File size: 967 Bytes
d2693e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/bin/bash
# Slurm batch script: runs peptide/new_coupling.py on one GPU of the
# dgx-b200 partition. All paths are relative, so they resolve against the
# job's working directory.
#
# NOTE: keep every #SBATCH directive above the first executable command;
# sbatch stops reading directives once it reaches one.
#SBATCH --job-name=new_coupling         # Name of the job
#SBATCH --output=outs/new_coupling.out  # Stdout file (fixed name; create outs/ before submitting)
#SBATCH --error=outs/new_coupling.err   # Stderr file (fixed name; create outs/ before submitting)
#SBATCH --partition=dgx-b200            # Queue to submit to
#SBATCH --ntasks=1                      # Number of tasks (usually one per process)
#SBATCH --nodes=1
#SBATCH --gpus=1
# NOTE(review): with --ntasks=1 the per-node value below looks redundant
# (it would only act as an upper bound) -- confirm before removing it.
#SBATCH --ntasks-per-node=8
#SBATCH --mem-per-gpu=128G
#SBATCH --cpus-per-gpu=8

# Abort on a mistyped / unset variable instead of expanding it to nothing.
set -u

# Size the OpenMP thread pool to the CPUs this job actually requested.
# The job asks for 1 GPU x 8 CPUs per GPU; a hard-coded 64 would oversubscribe
# those cores. SLURM_CPUS_PER_GPU is exported by Slurm because --cpus-per-gpu
# is set above; the fallback of 8 mirrors that directive for runs outside Slurm.
export OMP_NUM_THREADS="${SLURM_CPUS_PER_GPU:-8}"

# NCCL / UCX transport tuning (site-specific values, kept as provided).
# export NCCL_DEBUG=INFO                # Uncomment for verbose NCCL logging
export NCCL_NVLS_ENABLE=1
export NCCL_IB_ADAPTIVE_ROUTING=1
export NCCL_IB_SL=1
export NCCL_IB_QPS_PER_CONNECTION=2
export NCCL_IB_SPLIT_DATA_ON_QPS=0
# InfiniBand devices and socket interface NCCL is restricted to.
export NCCL_IB_HCA=mlx5_15,mlx5_10,mlx5_14,mlx5_13,mlx5_8,mlx5_7,mlx5_9,mlx5_4
export NCCL_SOCKET_IFNAME=bond0
export NCCL_ALGO=RING
export UCX_TLS=rc

# Launch the job; as the last command, its exit status becomes the job's.
python ./peptide/new_coupling.py \
  --checkpoint ./peptide/ckpt/PepReDi_v2.pt \
  --version 3 \
  --gen_steps 16