# ============================================================================
# FRANKENSTALLM-H 3B: Hybrid Mamba-2 + Transformer training launch script
# ============================================================================
#
# Usage:
#   nohup setsid bash scripts/launch_hybrid_3b.sh > logs/hybrid_3b.log 2>&1 &
#
# SIGHUP protection: the nohup + setsid combination keeps training running
# even if the SSH session is disconnected.
#
# Any extra arguments given to this script are appended verbatim to the
# train/pretrain.py command line (after the flags set below).
# ============================================================================
set -euo pipefail

# ---- Environment variables ----
export OMP_NUM_THREADS=4
export NCCL_ALGO=NVLS        # NVSwitch-optimized algorithm
export NCCL_IB_DISABLE=1     # disable InfiniBand (single node)
export NCCL_P2P_LEVEL=NVL    # NVLink P2P
export NCCL_NET_GDR_LEVEL=0  # disable GPU Direct RDMA (single node)

# ---- Paths ----
PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
CONFIG="${PROJECT_ROOT}/configs/hybrid_3b.yaml"
TRAIN_DATA="${PROJECT_ROOT}/data/3b_train.bin"
VAL_DATA="${PROJECT_ROOT}/data/3b_val.bin"
CKPT_DIR="${PROJECT_ROOT}/checkpoints/hybrid_3b_run1"
LOG_FILE="${PROJECT_ROOT}/logs/hybrid_3b_train.log"

# ---- Create directories ----
# Both expansions are quoted so a path containing whitespace or glob
# characters is passed through as a single word.
mkdir -p -- "${CKPT_DIR}" "$(dirname -- "${LOG_FILE}")"

# train/pretrain.py below is given as a relative path, so run from the
# project root.
cd "${PROJECT_ROOT}"

echo "============================================"
echo " FRANKENSTALLM-H 3B Hybrid Training"
echo " Config: ${CONFIG}"
echo " Data: ${TRAIN_DATA}"
echo " Checkpoint: ${CKPT_DIR}"
echo " Started: $(date '+%Y-%m-%d %H:%M:%S')"
echo "============================================"

# ---- Run training (8 GPU DDP) ----
# "$@" comes last so flags supplied by the caller follow the defaults above.
# Under `set -e`, a non-zero exit from torchrun aborts the script before the
# final "Training finished" line is printed.
torchrun \
  --nproc_per_node=8 \
  --master_port=29500 \
  train/pretrain.py \
  --config "${CONFIG}" \
  --train_data "${TRAIN_DATA}" \
  --val_data "${VAL_DATA}" \
  --checkpoint_dir "${CKPT_DIR}" \
  --batch_size 4 \
  --lr 2e-4 \
  --weight_decay 0.1 \
  --warmup_steps 2000 \
  --grad_accum 8 \
  --max_steps 57000 \
  --log_file "${LOG_FILE}" \
  --use_fp8 \
  "$@"

echo "Training finished at $(date '+%Y-%m-%d %H:%M:%S')"