#!/bin/bash
#
# Launch FRANKENSTALLM-H 3B hybrid pretraining with torchrun.
#
# Usage:
#   <this script> [extra pretrain.py arguments...]
#
# Every argument given to this script is appended verbatim to the
# train/pretrain.py command line, after the defaults defined below.
#
# Exit status: 0 when torchrun succeeds; otherwise torchrun's own
# non-zero status (or the status of the first failing setup command).

set -euo pipefail

# --- Runtime environment -----------------------------------------------------
export OMP_NUM_THREADS=4

# NCCL transport/algorithm overrides.
# NOTE(review): taken together these look tuned for a single NVLink/NVSwitch
# node without InfiniBand (NVLS algorithm, IB disabled, P2P limited to NVLink,
# GPUDirect RDMA level 0) — confirm against the NCCL environment-variable
# documentation for the installed NCCL version before changing hardware.
export NCCL_ALGO=NVLS
export NCCL_IB_DISABLE=1
export NCCL_P2P_LEVEL=NVL
export NCCL_NET_GDR_LEVEL=0

# --- Paths -------------------------------------------------------------------
PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
CONFIG="${PROJECT_ROOT}/configs/hybrid_3b.yaml"
TRAIN_DATA="${PROJECT_ROOT}/data/3b_train.bin"
VAL_DATA="${PROJECT_ROOT}/data/3b_val.bin"
CKPT_DIR="${PROJECT_ROOT}/checkpoints/hybrid_3b_run1"
LOG_FILE="${PROJECT_ROOT}/logs/hybrid_3b_train.log"

# --- Output directories ------------------------------------------------------
mkdir -p -- "${CKPT_DIR}"
# Inner expansion is quoted so a path containing spaces or glob characters
# is passed to dirname as a single argument.
mkdir -p -- "$(dirname -- "${LOG_FILE}")"

# train/pretrain.py below is given relative to the project root.
cd "${PROJECT_ROOT}"

# --- Banner ------------------------------------------------------------------
echo "============================================"
echo " FRANKENSTALLM-H 3B Hybrid Training"
echo " Config: ${CONFIG}"
echo " Data: ${TRAIN_DATA}"
echo " Checkpoint: ${CKPT_DIR}"
echo " Started: $(date '+%Y-%m-%d %H:%M:%S')"
echo "============================================"

# --- Launch ------------------------------------------------------------------
# Capture the exit status instead of letting `set -e` terminate silently, so a
# failed run still leaves a timestamped message before the status is propagated.
train_status=0
torchrun \
  --nproc_per_node=8 \
  --master_port=29500 \
  train/pretrain.py \
  --config "${CONFIG}" \
  --train_data "${TRAIN_DATA}" \
  --val_data "${VAL_DATA}" \
  --checkpoint_dir "${CKPT_DIR}" \
  --batch_size 4 \
  --lr 2e-4 \
  --weight_decay 0.1 \
  --warmup_steps 2000 \
  --grad_accum 8 \
  --max_steps 57000 \
  --log_file "${LOG_FILE}" \
  --use_fp8 \
  "$@" || train_status=$?

if ((train_status != 0)); then
  echo "Training FAILED (exit ${train_status}) at $(date '+%Y-%m-%d %H:%M:%S')" >&2
  exit "${train_status}"
fi

echo "Training finished at $(date '+%Y-%m-%d %H:%M:%S')"