source/scripts/launch_fp8.sh · pathcosmos/frankenstallm at main

frankenstallm / source /scripts /launch_fp8.sh

Upload folder using huggingface_hub (#17)

48ecd01 5 days ago

3.55 kB

	#!/usr/bin/env bash
	# =============================================================================
	# launch_fp8.sh — 8-GPU FP8 pretraining launcher for B200
	#
	# Usage:
	#   bash scripts/launch_fp8.sh                    # full run
	#   bash scripts/launch_fp8.sh --max_steps 500    # quick test
	#   bash scripts/launch_fp8.sh --resume checkpoints/small_fp8_run1/checkpoint-0001000
	#
	# The model config is read from $CONFIG (default: configs/small_fp8.yaml);
	# training options come from the CLI args.
	# Environment overrides: RUN_NAME, CONFIG, TRAIN_DATA, VAL_DATA, MASTER_PORT.
	# Logs: checkpoints/<RUN_NAME>/train.log
	#       checkpoints/<RUN_NAME>/tensorboard/
	# =============================================================================
	# Strict mode: abort on a failing command, on use of an unset variable, and on
	# a failure in any stage of a pipeline.
	set -o errexit -o nounset -o pipefail

	# ---- Run settings (a non-empty value from the environment wins) ------------
	: "${RUN_NAME:=small_fp8_run1}"
	: "${CONFIG:=configs/small_fp8.yaml}"
	: "${TRAIN_DATA:=data/train.bin}"
	: "${VAL_DATA:=data/val.bin}"
	: "${MASTER_PORT:=29500}"

	# ---- Fixed launch parameters and paths derived from the run name -----------
	NPROC=8
	CKPT_DIR="checkpoints/${RUN_NAME}"
	LOG_FILE="${CKPT_DIR}/train.log"

	# ---- Default training flags -------------------------------------------------
	# The launch command emits these ahead of the caller's extra CLI args, so a
	# repeated flag presumably overrides them (depends on pretrain.py's parser —
	# TODO confirm).
	SEED=42
	WARMUP_STEPS=2000
	GRAD_ACCUM=4
	BATCH_SIZE=8
	MAX_STEPS=100000

	# ---- Caller's CLI args, joined into one space-separated string -------------
	# They are forwarded to pretrain.py by the launch command further down.
	EXTRA_ARGS="$*"

	# ---- NCCL + CPU-thread environment (single-node B200 / NVSwitch) ------------
	# The reasons given below are the author's tuning rationale as recorded in
	# this script; nothing here verifies them.

	# No InfiniBand: on one NVSwitch (NV18 full-mesh) node, keep NCCL from
	# probing IB.
	export NCCL_IB_DISABLE=1

	# Ring algorithm for large gradient tensors (128M-70B model range) and the
	# Simple protocol for NVLink bulk transfers (LL/LL128 being the IB-side
	# alternatives).
	export NCCL_ALGO=Ring NCCL_PROTO=Simple

	# Channel count pinned to exactly 16 (min == max), intended to saturate
	# NVSwitch better on large all-reduce payloads.
	export NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16

	# 64 MB NCCL buffer (= 67108864 bytes), meant to reduce ring synchronisation
	# overhead.
	export NCCL_BUFFSIZE=$((64 * 1024 * 1024))

	# 4 OpenMP/MKL threads per rank: 72 cores / 8 ranks would allow 9; the
	# remainder is left as DataLoader headroom.
	export OMP_NUM_THREADS=4 MKL_NUM_THREADS=4

	# ---- Setup ------------------------------------------------------------------
	# Switch to the project root (the parent of this script's directory) BEFORE
	# touching any relative path. CKPT_DIR is relative, and so are the default
	# CONFIG / TRAIN_DATA / VAL_DATA values; creating the checkpoint directory
	# first would place it under the caller's working directory whenever the
	# script is started from somewhere other than the project root.
	cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." || {
	  echo "launch_fp8.sh: cannot cd to the project root" >&2
	  exit 1
	}
	mkdir -p -- "${CKPT_DIR}"

	# Start-of-run banner summarising what is about to be launched.
	echo "=================================================================="
	echo " Run name : ${RUN_NAME}"
	echo " Config : ${CONFIG}"
	echo " CKPT dir : ${CKPT_DIR}"
	echo " Log file : ${LOG_FILE}"
	echo " Started : $(date)"
	echo "=================================================================="

	# ---- Launch -----------------------------------------------------------------
	# Ask Python in every rank to ignore UserWarning raised from torch.library
	# (described by the script's author as a harmless flash_attn kernel-override
	# warning).
	export PYTHONWARNINGS="ignore::UserWarning:torch.library"

	# Run torchrun with stdout+stderr merged and drop every output line that
	# contains one of the fixed-string fragments below (remnants of the same
	# kernel-override warning).
	#   * "$@" forwards the caller's CLI args one-to-one, so values containing
	#     spaces or glob characters reach pretrain.py intact.
	#   * One grep with several -e patterns replaces a chain of greps;
	#     --line-buffered keeps the log streaming instead of arriving in blocks.
	#   * -F matches the fragments literally (the dots in "self.m.impl" are not
	#     regex wildcards).
	#   * grep exits 1 when it prints nothing, which is not a training failure, so
	#     the status that decides success is torchrun's own (PIPESTATUS[0]).
	rc=0
	torchrun \
	  --nproc_per_node="${NPROC}" \
	  --master_port="${MASTER_PORT}" \
	  train/pretrain.py \
	  --config "${CONFIG}" \
	  --train_data "${TRAIN_DATA}" \
	  --val_data "${VAL_DATA}" \
	  --checkpoint_dir "${CKPT_DIR}" \
	  --log_file "${LOG_FILE}" \
	  --max_steps "${MAX_STEPS}" \
	  --batch_size "${BATCH_SIZE}" \
	  --grad_accum "${GRAD_ACCUM}" \
	  --warmup_steps "${WARMUP_STEPS}" \
	  --seed "${SEED}" \
	  "$@" \
	  2>&1 \
	  | grep --line-buffered -vF \
	    -e "UserWarning" \
	    -e "Warning only once" \
	    -e "Overriding a previously" \
	    -e "dispatch key:" \
	    -e "previous kernel:" \
	    -e "new kernel:" \
	    -e "operator: flash_attn" \
	    -e "registered at /usr/local" \
	    -e "self.m.impl" \
	  || rc=${PIPESTATUS[0]}

	# Propagate a torchrun failure as this script's exit status; the completion
	# banner is printed only after a successful run.
	if (( rc != 0 )); then
	  echo "launch_fp8.sh: torchrun exited with status ${rc}" >&2
	  exit "${rc}"
	fi

	echo "=================================================================="
	echo " Done : $(date)"
	echo "=================================================================="