frankenstallm / source /scripts /launch_hybrid_3b.sh
pathcosmos's picture
Upload folder using huggingface_hub (#17)
48ecd01
#!/bin/bash
# ============================================================================
# FRANKENSTALLM-H 3B: Hybrid Mamba-2 + Transformer training launch script
# ============================================================================
#
# Usage:
# nohup setsid bash scripts/launch_hybrid_3b.sh > logs/hybrid_3b.log 2>&1 &
#
# SIGHUP protection: combining nohup + setsid keeps training alive even if the SSH session drops
# ============================================================================
# Abort on any command failure, unset variable, or failed pipeline stage.
set -euo pipefail

# ---- Environment variables ----
export OMP_NUM_THREADS=4
export NCCL_ALGO=NVLS        # NCCL algorithm best suited to NVSwitch
export NCCL_IB_DISABLE=1     # InfiniBand disabled (single node)
export NCCL_P2P_LEVEL=NVL    # NVLink P2P
export NCCL_NET_GDR_LEVEL=0  # GPU Direct RDMA disabled (single node)

# ---- Paths ----
# PROJECT_ROOT may be overridden from the caller's environment; when it is
# unset or empty the original hard-coded location is used.
PROJECT_ROOT="${PROJECT_ROOT:-/PROJECT/0325120031_A/ghong/taketimes/llm-bang}"
CONFIG="${PROJECT_ROOT}/configs/hybrid_3b.yaml"
TRAIN_DATA="${PROJECT_ROOT}/data/3b_train.bin"
VAL_DATA="${PROJECT_ROOT}/data/3b_val.bin"
CKPT_DIR="${PROJECT_ROOT}/checkpoints/hybrid_3b_run1"
LOG_FILE="${PROJECT_ROOT}/logs/hybrid_3b_train.log"

# ---- Create directories ----
# Every expansion is quoted (including the one nested inside the command
# substitution) so paths containing whitespace or glob characters stay intact.
mkdir -p -- "${CKPT_DIR}"
mkdir -p -- "$(dirname -- "${LOG_FILE}")"

# train/pretrain.py below is given as a relative path, so run from the root.
cd -- "${PROJECT_ROOT}"

echo "============================================"
echo " FRANKENSTALLM-H 3B Hybrid Training"
echo " Config: ${CONFIG}"
echo " Data: ${TRAIN_DATA}"
echo " Checkpoint: ${CKPT_DIR}"
echo " Started: $(date '+%Y-%m-%d %H:%M:%S')"
echo "============================================"

# ---- Run training (8 GPU DDP) ----
# Extra command-line arguments given to this script are appended after the
# fixed flags via "$@".
torchrun \
  --nproc_per_node=8 \
  --master_port=29500 \
  train/pretrain.py \
  --config "${CONFIG}" \
  --train_data "${TRAIN_DATA}" \
  --val_data "${VAL_DATA}" \
  --checkpoint_dir "${CKPT_DIR}" \
  --batch_size 4 \
  --lr 2e-4 \
  --weight_decay 0.1 \
  --warmup_steps 2000 \
  --grad_accum 8 \
  --max_steps 57000 \
  --log_file "${LOG_FILE}" \
  --use_fp8 \
  "$@"

# Only reached when torchrun exits with status 0 (set -e aborts otherwise).
echo "Training finished at $(date '+%Y-%m-%d %H:%M:%S')"