#!/bin/bash
# Single-node 8x H200 training entrypoint.
#
# Smoke-run example on the reserved allocation (short walltime, capped steps):
#   sbatch --time=00:45:00 \
#     --export=ALL,OUT_DIR=/path/to/outputs_codontranslator_smoke,MAX_STEPS=20,SAVE_STEPS=0,EVAL_INTERVAL=0 \
#     slurm/train_v3_h200_8x_single.sbatch
#
# Full-run example:
#   sbatch slurm/train_v3_h200_8x_single.sbatch
#
# Suggested W&B overrides:
#   sbatch --export=ALL,WANDB_PROJECT=codontranslator,WANDB_NAME=codontranslator-run1 \
#     slurm/train_v3_h200_8x_single.sbatch
#
# If the environment is still configured for offline logging, override at submit time:
#   sbatch --export=ALL,WANDB_MODE=online slurm/train_v3_h200_8x_single.sbatch
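#
# Checkpoint-resume override (RESUME_FROM defaults to "auto", which is passed
# through to train.py; set it to "none" to skip the --resume_from flag):
#   sbatch --export=ALL,RESUME_FROM=/path/to/checkpoint slurm/train_v3_h200_8x_single.sbatch
#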
# This script is pinned to the reserved H200 allocation on ihccs210.
# Do not use QoS=reserved on any other node.
#SBATCH --job-name=train-v3-h200-8x
#SBATCH --partition=beacon
#SBATCH --qos=reserved
#SBATCH --reservation=heng-reservation
#SBATCH --nodelist=ihccs210
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:nvidia_h200:8
#SBATCH --cpus-per-task=16
#SBATCH --mem=512G
#SBATCH --time=3-00:00:00
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err
set -euo pipefail

# ~/.bashrc and conda's activation hooks may reference unset variables, so
# relax "-u" only while the environment is being sourced.
set +u
source ~/.bashrc
conda activate dna
set -u
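
# Resolve the repo root from this script's location so the job can be
# submitted from any working directory.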
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
REPO_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)
cd "${REPO_ROOT}"
TRAIN_DATA=${TRAIN_DATA:-${REPO_ROOT}/data_v3_rebuild/train}
VAL_DATA=${VAL_DATA:-${REPO_ROOT}/data_v3_rebuild/val}
EMBED_DIR=${EMBED_DIR:-${REPO_ROOT}/embeddings_v2}
OUT_DIR=${OUT_DIR:-${REPO_ROOT}/outputs_codontranslator_h200_8x_bs48ga4}
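# W&B identity defaults to the output dir basename, so a resubmitted job with
# the same OUT_DIR resumes the same W&B run (WANDB_RESUME=allow).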
WANDB_PROJECT=${WANDB_PROJECT:-codontranslator}
WANDB_NAME=${WANDB_NAME:-$(basename "${OUT_DIR}")}
WANDB_RUN_ID=${WANDB_RUN_ID:-$(basename "${OUT_DIR}")}
WANDB_RESUME=${WANDB_RESUME:-allow}
WANDB_DIR=${WANDB_DIR:-${OUT_DIR}/wandb}
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
BATCH_SIZE=${BATCH_SIZE:-48}
GRAD_ACCUM=${GRAD_ACCUM:-4}
EVAL_BATCH_SIZE=${EVAL_BATCH_SIZE:-32}
WORKERS=${WORKERS:-0}
EPOCHS=${EPOCHS:-3}
LR=${LR:-7e-5}
WARMUP_RATIO=${WARMUP_RATIO:-0.1}
WEIGHT_DECAY=${WEIGHT_DECAY:-1e-4}
LOGGING_STEPS=${LOGGING_STEPS:-10}
SAVE_STEPS=${SAVE_STEPS:-500}
SAVE_TOTAL_LIMIT=${SAVE_TOTAL_LIMIT:-1000}
EVAL_INTERVAL=${EVAL_INTERVAL:-5000}
EVAL_STEPS=${EVAL_STEPS:-256}
TRAIN_SHUFFLE_BUFFER=${TRAIN_SHUFFLE_BUFFER:-8192}
VAL_SHUFFLE_BUFFER=${VAL_SHUFFLE_BUFFER:-0}
CKPT_RECENT_WINDOW_STEPS=${CKPT_RECENT_WINDOW_STEPS:-2000}
CKPT_RECENT_INTERVAL=${CKPT_RECENT_INTERVAL:-500}
CKPT_ARCHIVE_INTERVAL=${CKPT_ARCHIVE_INTERVAL:-1000}
RESUME_FROM=${RESUME_FROM:-auto}
MAX_STEPS=${MAX_STEPS:-}
MASTER_PORT=${MASTER_PORT:-29500}
GRAD_CKPT=${GRAD_CKPT:-0}
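
# Sizing note (an assumption: if train.py treats --batch_size as per-GPU, the
# usual convention under torchrun, these defaults give an effective global
# batch of 48 x 8 GPUs x 4 grad-accum steps = 1536 sequences per optimizer step).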
export WANDB_PROJECT WANDB_NAME WANDB_RUN_ID WANDB_RESUME WANDB_DIR
export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
export TORCH_DISTRIBUTED_DEBUG=${TORCH_DISTRIBUTED_DEBUG:-DETAIL}
# Single-node NCCL settings: keep GPU peer-to-peer (NVLink) enabled, disable
# the InfiniBand and shared-memory transports, and abort on collective errors
# instead of hanging.
export NCCL_P2P_DISABLE=${NCCL_P2P_DISABLE:-0}
export NCCL_IB_DISABLE=${NCCL_IB_DISABLE:-1}
export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-0}
export NCCL_ASYNC_ERROR_HANDLING=${NCCL_ASYNC_ERROR_HANDLING:-1}
export NCCL_SHM_DISABLE=${NCCL_SHM_DISABLE:-1}
export NCCL_CUMEM_HOST_ENABLE=${NCCL_CUMEM_HOST_ENABLE:-1}
export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1}
mkdir -p "${OUT_DIR}" "${WANDB_DIR}"
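
# Preflight: fail fast if any required input is missing.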
if [[ ! -d "${TRAIN_DATA}" ]]; then
echo "Missing train data dir: ${TRAIN_DATA}" >&2
exit 1
fi
if [[ ! -d "${VAL_DATA}" ]]; then
echo "Missing val data dir: ${VAL_DATA}" >&2
exit 1
fi
if [[ ! -f "${EMBED_DIR}/species_vocab.json" ]]; then
echo "Missing embeddings vocab: ${EMBED_DIR}/species_vocab.json" >&2
exit 1
fi
echo "HOST=$(hostname)"
echo "TRAIN_DATA=${TRAIN_DATA}"
echo "VAL_DATA=${VAL_DATA}"
echo "EMBED_DIR=${EMBED_DIR}"
echo "OUT_DIR=${OUT_DIR}"
echo "WANDB_PROJECT=${WANDB_PROJECT} WANDB_NAME=${WANDB_NAME} WANDB_RUN_ID=${WANDB_RUN_ID} WANDB_RESUME=${WANDB_RESUME} WANDB_MODE=${WANDB_MODE:-unset}"
echo "BATCH_SIZE=${BATCH_SIZE} GRAD_ACCUM=${GRAD_ACCUM} EVAL_BATCH_SIZE=${EVAL_BATCH_SIZE} NPROC_PER_NODE=${NPROC_PER_NODE}"
echo "WEIGHT_DECAY=${WEIGHT_DECAY} SAVE_STEPS=${SAVE_STEPS} EVAL_INTERVAL=${EVAL_INTERVAL} MAX_STEPS=${MAX_STEPS:-unset}"
echo "NCCL_P2P_DISABLE=${NCCL_P2P_DISABLE} NCCL_IB_DISABLE=${NCCL_IB_DISABLE} NCCL_SHM_DISABLE=${NCCL_SHM_DISABLE} NCCL_CUMEM_HOST_ENABLE=${NCCL_CUMEM_HOST_ENABLE}"
echo "=== GPU inventory ==="
nvidia-smi --query-gpu=index,name,memory.total,driver_version --format=csv,noheader || true
echo "=== GPU topology ==="
nvidia-smi topo -m || true
echo "=== NVLink status ==="
nvidia-smi nvlink -s || true
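
# Build the torchrun invocation as an array so optional flags can be appended
# conditionally below.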
CMD=(
  torchrun
  --standalone
  --nproc_per_node "${NPROC_PER_NODE}"
  --master_port "${MASTER_PORT}"
  train.py
  --train_data "${TRAIN_DATA}"
  --val_data "${VAL_DATA}"
  --embeddings_dir "${EMBED_DIR}"
  --output_dir "${OUT_DIR}"
  --fsdp
  --bf16
  --attn mha
  --hidden 750
  --layers 20
  --heads 15
  --mlp_ratio 3.2
  --batch_size "${BATCH_SIZE}"
  --grad_accum "${GRAD_ACCUM}"
  --eval_batch_size "${EVAL_BATCH_SIZE}"
  --epochs "${EPOCHS}"
  --workers "${WORKERS}"
  --warmup_ratio "${WARMUP_RATIO}"
  --lr "${LR}"
  --weight_decay "${WEIGHT_DECAY}"
  --train_shuffle_buffer "${TRAIN_SHUFFLE_BUFFER}"
  --val_shuffle_buffer "${VAL_SHUFFLE_BUFFER}"
  --logging_steps "${LOGGING_STEPS}"
  --save_steps "${SAVE_STEPS}"
  --save_total_limit "${SAVE_TOTAL_LIMIT}"
  --ckpt_recent_window_steps "${CKPT_RECENT_WINDOW_STEPS}"
  --ckpt_recent_interval "${CKPT_RECENT_INTERVAL}"
  --ckpt_archive_interval "${CKPT_ARCHIVE_INTERVAL}"
  --eval_interval "${EVAL_INTERVAL}"
  --eval_steps "${EVAL_STEPS}"
)
if [[ "${RESUME_FROM}" != "none" && -n "${RESUME_FROM}" ]]; then
CMD+=(--resume_from "${RESUME_FROM}")
fi
if [[ -n "${MAX_STEPS}" ]]; then
CMD+=(--max_steps "${MAX_STEPS}")
fi
if [[ "${GRAD_CKPT}" == "1" ]]; then
CMD+=(--grad_ckpt)
fi
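
# exec replaces this shell so Slurm signals (e.g. at the walltime limit) reach
# torchrun directly.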
exec "${CMD[@]}"