#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/loubna/logs/evaluation/leaderboard/%x-%j.out
#
# Runs bigcode-evaluation-harness in generation-only mode through
# `accelerate launch`, started on the allocation with `srun`.
#
# Usage: sbatch SCRIPT MODEL TASK ORG OUT_PATH
#   MODEL     model name; passed to the harness as --model ORG/MODEL
#   TASK      value passed to the harness as --tasks
#   ORG       organisation/namespace prefix of the model
#   OUT_PATH  directory receiving generations_TASK_MODEL.json

set -x -e

# Fail fast on missing/empty arguments instead of launching with "--model /".
usage='usage: sbatch SCRIPT MODEL TASK ORG OUT_PATH'
model=${1:?"$usage"}
task=${2:?"$usage"}
org=${3:?"$usage"}
out_path=${4:?"$usage"}

source /admin/home/loubna/.bashrc
conda activate brr4

echo "START TIME: $(date)"

# Distributed topology. GPUS_PER_NODE must stay in sync with --gres=gpu:N above.
GPUS_PER_NODE=4
# Quoted: a compressed node list such as "node[1-4]" is also a glob pattern.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))

# Harness command line. It is kept as a single string because it is handed
# to `bash -c` below, so the values must not contain whitespace or shell
# metacharacters.
CMD="\
/fsx/loubna/code/bigcode-evaluation-harness/main.py \
--model $org/$model \
--tasks $task \
--max_length_generation 512 \
--batch_size 50 \
--n_samples 50 \
--temperature 0.2 \
--precision bf16 \
--allow_code_execution \
--trust_remote_code \
--save_generations \
--use_auth_token \
--generation_only \
--save_generations_path $out_path/generations_${task}_${model}.json \
"

# Launcher prefix. The \$-escaped variables are deliberately left unexpanded
# here so that the `bash -c` spawned by srun resolves them inside each task.
export LAUNCHER="accelerate launch \
--multi_gpu \
--num_machines $NNODES \
--num_processes $WORLD_SIZE \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--machine_rank \$SLURM_PROCID \
--role \$SLURMD_NODENAME: \
--rdzv_conf rdzv_backend=c10d \
--max_restarts 0 \
--tee 3 \
"

# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

# Log the harness command that is about to run.
printf '%s\n' "$CMD"

SRUN_ARGS=(
  --wait=60
  --kill-on-bad-exit=1
)

# No `clear` here: there is no terminal in a batch job, and under `set -e`
# a failing `clear` (TERM unset/unknown) would abort before the launch.
srun "${SRUN_ARGS[@]}" --jobid "$SLURM_JOB_ID" bash -c "$LAUNCHER $CMD"

echo "END TIME: $(date)"