#!/bin/bash
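# Launch single-node multi-GPU training for CodonTranslator with torchrun.
# Defaults match the OUT_DIR name: 8x H200, per-GPU batch size 48, grad
# accumulation 4. Every setting below is overridable from the environment:
#   BATCH_SIZE=32 GRAD_ACCUM=6 bash path/to/this_script.sh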
set -euo pipefail
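# ~/.bashrc and conda's activation hooks may reference unset variables,
# so relax nounset while the 'dna' environment is activated.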
set +u
source ~/.bashrc
conda activate dna
set -u
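# Resolve the repo root from this script's location so the launcher works
# from any working directory.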
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
REPO_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)
cd "${REPO_ROOT}"
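# Data and output locations.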
TRAIN_DATA=${TRAIN_DATA:-${REPO_ROOT}/data_v3_rebuild/train}
VAL_DATA=${VAL_DATA:-${REPO_ROOT}/data_v3_rebuild/val}
EMBED_DIR=${EMBED_DIR:-${REPO_ROOT}/embeddings_v2}
OUT_DIR=${OUT_DIR:-${REPO_ROOT}/outputs_codontranslator_h200_8x_bs48ga4}
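# Weights & Biases settings. The run name and id default to the output dir
# basename, so relaunching with the same OUT_DIR resumes the same W&B run.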
WANDB_PROJECT=${WANDB_PROJECT:-codontranslator}
WANDB_NAME=${WANDB_NAME:-$(basename "${OUT_DIR}")}
WANDB_RUN_ID=${WANDB_RUN_ID:-$(basename "${OUT_DIR}")}
WANDB_RESUME=${WANDB_RESUME:-allow}
WANDB_DIR=${WANDB_DIR:-${OUT_DIR}/wandb}
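# Training hyperparameters. Assuming BATCH_SIZE is the per-GPU micro-batch,
# the effective global batch is NPROC_PER_NODE * BATCH_SIZE * GRAD_ACCUM
# (8 * 48 * 4 = 1536 with the defaults).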
NPROC_PER_NODE=${NPROC_PER_NODE:-8}
BATCH_SIZE=${BATCH_SIZE:-48}
GRAD_ACCUM=${GRAD_ACCUM:-4}
EVAL_BATCH_SIZE=${EVAL_BATCH_SIZE:-32}
WORKERS=${WORKERS:-0}
EPOCHS=${EPOCHS:-3}
LR=${LR:-7e-5}
WARMUP_RATIO=${WARMUP_RATIO:-0.1}
WEIGHT_DECAY=${WEIGHT_DECAY:-1e-4}
LOGGING_STEPS=${LOGGING_STEPS:-10}
SAVE_STEPS=${SAVE_STEPS:-500}
SAVE_TOTAL_LIMIT=${SAVE_TOTAL_LIMIT:-1000}
EVAL_INTERVAL=${EVAL_INTERVAL:-5000}
EVAL_STEPS=${EVAL_STEPS:-256}
TRAIN_SHUFFLE_BUFFER=${TRAIN_SHUFFLE_BUFFER:-8192}
VAL_SHUFFLE_BUFFER=${VAL_SHUFFLE_BUFFER:-0}
CKPT_RECENT_WINDOW_STEPS=${CKPT_RECENT_WINDOW_STEPS:-2000}
CKPT_RECENT_INTERVAL=${CKPT_RECENT_INTERVAL:-500}
CKPT_ARCHIVE_INTERVAL=${CKPT_ARCHIVE_INTERVAL:-1000}
RESUME_FROM=${RESUME_FROM:-auto}  # 'none' disables resume; other values are passed to train.py
MAX_STEPS=${MAX_STEPS:-}          # empty = no step cap
MASTER_PORT=${MASTER_PORT:-29500}
GRAD_CKPT=${GRAD_CKPT:-0}         # 1 enables --grad_ckpt
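# W&B env for the launched processes, plus NCCL defaults biased toward
# stability over peak bandwidth: InfiniBand, shared-memory transport, and
# GPUDirect RDMA are off by default, and async error handling aborts the
# job on a failed collective instead of hanging. Note that
# TORCH_DISTRIBUTED_DEBUG=DETAIL adds overhead; use OFF for production runs.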
export WANDB_PROJECT WANDB_NAME WANDB_RUN_ID WANDB_RESUME WANDB_DIR
export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
export TORCH_DISTRIBUTED_DEBUG=${TORCH_DISTRIBUTED_DEBUG:-DETAIL}
export NCCL_P2P_DISABLE=${NCCL_P2P_DISABLE:-0}
export NCCL_IB_DISABLE=${NCCL_IB_DISABLE:-1}
export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-0}
export NCCL_ASYNC_ERROR_HANDLING=${NCCL_ASYNC_ERROR_HANDLING:-1}
export NCCL_SHM_DISABLE=${NCCL_SHM_DISABLE:-1}
export NCCL_CUMEM_HOST_ENABLE=${NCCL_CUMEM_HOST_ENABLE:-1}
export CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS:-1}
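# Create the output and W&B directories up front.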
mkdir -p "${OUT_DIR}" "${WANDB_DIR}"
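# Fail fast if required inputs are missing.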
if [[ ! -d "${TRAIN_DATA}" ]]; then
  echo "Missing train data dir: ${TRAIN_DATA}" >&2
  exit 1
fi
if [[ ! -d "${VAL_DATA}" ]]; then
  echo "Missing val data dir: ${VAL_DATA}" >&2
  exit 1
fi
if [[ ! -f "${EMBED_DIR}/species_vocab.json" ]]; then
  echo "Missing embeddings vocab: ${EMBED_DIR}/species_vocab.json" >&2
  exit 1
fi
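# Log the effective configuration for reproducibility.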
echo "HOST=$(hostname)"
echo "TRAIN_DATA=${TRAIN_DATA}"
echo "VAL_DATA=${VAL_DATA}"
echo "EMBED_DIR=${EMBED_DIR}"
echo "OUT_DIR=${OUT_DIR}"
echo "WANDB_PROJECT=${WANDB_PROJECT} WANDB_NAME=${WANDB_NAME} WANDB_RUN_ID=${WANDB_RUN_ID} WANDB_RESUME=${WANDB_RESUME} WANDB_MODE=${WANDB_MODE:-unset}"
echo "BATCH_SIZE=${BATCH_SIZE} GRAD_ACCUM=${GRAD_ACCUM} EVAL_BATCH_SIZE=${EVAL_BATCH_SIZE} NPROC_PER_NODE=${NPROC_PER_NODE}"
echo "WEIGHT_DECAY=${WEIGHT_DECAY} SAVE_STEPS=${SAVE_STEPS} EVAL_INTERVAL=${EVAL_INTERVAL} MAX_STEPS=${MAX_STEPS:-unset}"
echo "NCCL_P2P_DISABLE=${NCCL_P2P_DISABLE} NCCL_IB_DISABLE=${NCCL_IB_DISABLE} NCCL_SHM_DISABLE=${NCCL_SHM_DISABLE} NCCL_CUMEM_HOST_ENABLE=${NCCL_CUMEM_HOST_ENABLE}"
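# Record GPU inventory and interconnect topology; '|| true' keeps an
# unavailable nvidia-smi subcommand from aborting the run under 'set -e'.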
echo "=== GPU inventory ==="
nvidia-smi --query-gpu=index,name,memory.total,driver_version --format=csv,noheader || true
echo "=== GPU topology ==="
nvidia-smi topo -m || true
echo "=== NVLink status ==="
nvidia-smi nvlink -s || true
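# Build the torchrun launch command. With --standalone, torchrun manages its
# own local rendezvous, so recent PyTorch versions may ignore --master_port.
# Model shape: hidden 750, 20 layers, 15 heads (50 dims per head), MLP ratio
# 3.2, trained with FSDP in bf16.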
CMD=(
  torchrun
  --standalone
  --nproc_per_node "${NPROC_PER_NODE}"
  --master_port "${MASTER_PORT}"
  train.py
  --train_data "${TRAIN_DATA}"
  --val_data "${VAL_DATA}"
  --embeddings_dir "${EMBED_DIR}"
  --output_dir "${OUT_DIR}"
  --fsdp
  --bf16
  --attn mha
  --hidden 750
  --layers 20
  --heads 15
  --mlp_ratio 3.2
  --batch_size "${BATCH_SIZE}"
  --grad_accum "${GRAD_ACCUM}"
  --eval_batch_size "${EVAL_BATCH_SIZE}"
  --epochs "${EPOCHS}"
  --workers "${WORKERS}"
  --warmup_ratio "${WARMUP_RATIO}"
  --lr "${LR}"
  --weight_decay "${WEIGHT_DECAY}"
  --train_shuffle_buffer "${TRAIN_SHUFFLE_BUFFER}"
  --val_shuffle_buffer "${VAL_SHUFFLE_BUFFER}"
  --logging_steps "${LOGGING_STEPS}"
  --save_steps "${SAVE_STEPS}"
  --save_total_limit "${SAVE_TOTAL_LIMIT}"
  --ckpt_recent_window_steps "${CKPT_RECENT_WINDOW_STEPS}"
  --ckpt_recent_interval "${CKPT_RECENT_INTERVAL}"
  --ckpt_archive_interval "${CKPT_ARCHIVE_INTERVAL}"
  --eval_interval "${EVAL_INTERVAL}"
  --eval_steps "${EVAL_STEPS}"
)
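# Optional flags: RESUME_FROM=none skips resuming, MAX_STEPS caps training,
# and GRAD_CKPT=1 enables gradient checkpointing (slower steps, less memory).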
if [[ "${RESUME_FROM}" != "none" && -n "${RESUME_FROM}" ]]; then
  CMD+=(--resume_from "${RESUME_FROM}")
fi
if [[ -n "${MAX_STEPS}" ]]; then
  CMD+=(--max_steps "${MAX_STEPS}")
fi
if [[ "${GRAD_CKPT}" == "1" ]]; then
  CMD+=(--grad_ckpt)
fi
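# Echo the final command for the log, then exec so scheduler signals
# (e.g. SIGTERM) reach torchrun directly instead of this wrapper shell.
printf 'Launching:'; printf ' %q' "${CMD[@]}"; printf '\n'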
exec "${CMD[@]}"