#!/bin/bash
#SBATCH --job-name=wikitext2_any_order
#SBATCH --account=kempner_albergo_lab
#SBATCH --partition=kempner_h100
#SBATCH --nodes=2
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-node=4
#SBATCH --mem=100GB
#SBATCH --time=1-00:00:00
#SBATCH --output=slurm_logs/wikitext2/job-%j.out

# Slurm batch script: launches `train.py` with the `any_order` config from
# `config/wikitext2` across 2 nodes x 4 tasks (one task per requested GPU).
#
# Usage:  sbatch <this script>
#
# NOTE(review): the directory in --output (slurm_logs/wikitext2/) presumably
# has to exist before submission, since the log file is opened before this
# script runs -- confirm and create it with `mkdir -p` beforehand if needed.

set -euo pipefail

# Fail early with a clear message when run outside a Slurm allocation.
: "${SLURM_NODELIST:?must be run inside a Slurm job (submit with sbatch)}"
: "${SLURM_NODEID:?must be run inside a Slurm job (submit with sbatch)}"

# Restrict NCCL socket communication to IPv4.
export NCCL_SOCKET_FAMILY=AF_INET

# Rendezvous address: the first hostname in the expanded allocation node list.
# The assignment is kept separate from `export` so that a failing `scontrol`
# is not masked, and the first line is taken with parameter expansion instead
# of `| head -n 1` so `pipefail` cannot trip on SIGPIPE.
allocated_hosts=$(scontrol show hostnames "$SLURM_NODELIST")
MASTER_ADDR=${allocated_hosts%%$'\n'*}
if [[ -z "$MASTER_ADDR" ]]; then
  printf 'error: could not resolve a host from SLURM_NODELIST=%s\n' \
    "$SLURM_NODELIST" >&2
  exit 1
fi
export MASTER_ADDR

# Rendezvous port: drawn at random from 15000-59999.
MASTER_PORT=$(shuf -i 15000-59999 -n 1)
export MASTER_PORT

# NOTE(review): this is evaluated once in the batch shell and then inherited
# by every srun task, so all nodes may see the same NODE_RANK. If train.py
# relies on NODE_RANK (rather than reading SLURM_NODEID/SLURM_PROCID itself),
# it should be derived inside each task instead -- confirm against train.py.
export NODE_RANK=$SLURM_NODEID

# Verbose NCCL / torch.distributed diagnostics (noisy; written to the job log).
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
export TORCH_DISTRIBUTED_DEBUG=DETAIL

# One training process per Slurm task.
srun python train.py --config-path config/wikitext2 --config-name any_order