TaoNet-mini-T2 / code /TaoTrain /scripts /remote /submit_200m_branch_only_chat.sh
StarMist0012's picture
Add files using upload-large-folder tool
e2bfccc verified
#!/usr/bin/env bash
#
# Submit the 200M "branch only chat" training run as a detached job.
#
# Every setting below may be overridden from the caller's environment; a
# variable that is unset or empty falls back to the default shown here.
set -o errexit -o nounset -o pipefail

# --- Job identity and filesystem layout -------------------------------------
# The default run id carries a launch timestamp (YYYYmmdd-HHMMSS).
: "${RUN_ID:=taotern-200m-branch-only-chat-$(date +%Y%m%d-%H%M%S)}"
: "${JOB_ROOT:=/home/student/YouZheng/jobs/taotern}"
# Defaults to the directory this script is launched from.
: "${REMOTE_REPO:=$(pwd)}"
: "${PYTHON_BIN:=/home/student/.venv/bin/python}"
: "${SSM_REPO_PATH:=/home/student/YouZheng/gamma_ssm_repo}"

# --- Data and tokenizer locations -------------------------------------------
: "${DATA_PATH:=/home/student/Data/TaoData/pretrain.jsonl}"
: "${SFT_DATA_PATH:=/home/student/Data/TaoData/sft.jsonl}"
: "${TOKENIZER_PATH:=/home/student/YouZheng/tokenizers/taodata_pilot_8k/tokenizer.model}"

# --- Training hyperparameters -----------------------------------------------
: "${SEQ_LEN:=512}"
: "${BATCH_SIZE:=8}"
: "${PRETRAIN_TOKENS:=4000000000}"
: "${SFT_STEPS:=50000}"
: "${PRETRAIN_LR:=0.0008}"
: "${SFT_LR:=0.00005}"
: "${WEIGHT_DECAY:=0.01}"

# --- Logging, checkpoint cadence and data-loading knobs ---------------------
: "${LOG_EVERY:=100}"
: "${SAVE_EVERY:=100000}"
: "${SFT_SAVE_EVERY:=10000}"
: "${TOKENIZER_THREADS:=8}"
: "${SAMPLES_PER_CHUNK:=2000}"
# Empty by default; the assignment only guarantees the name is defined so that
# later expansions do not trip `nounset`.
: "${BLOCK_RESIDUAL_RMS_CAP:=}"
#######################################
# Build the command line for the training job: a run of NAME=value
# environment assignments followed by the run script invocation.
#
# Each value is rendered with printf '%q' so that whitespace or shell
# metacharacters in a path (REMOTE_REPO defaults to the current directory,
# which may well contain spaces) survive re-parsing when the command string
# is evaluated by a shell. Values made only of plain characters are emitted
# unchanged; an empty value is emitted as ''.
#
# Globals:   reads every variable named in the list below
# Arguments: none
# Outputs:   the command string on stdout (no trailing newline)
#######################################
build_job_command() {
  local name quoted
  local -a assignments=()
  for name in \
    REMOTE_REPO PYTHON_BIN SSM_REPO_PATH \
    DATA_PATH SFT_DATA_PATH TOKENIZER_PATH \
    SEQ_LEN BATCH_SIZE PRETRAIN_TOKENS SFT_STEPS \
    PRETRAIN_LR SFT_LR WEIGHT_DECAY \
    LOG_EVERY SAVE_EVERY SFT_SAVE_EVERY \
    TOKENIZER_THREADS SAMPLES_PER_CHUNK BLOCK_RESIDUAL_RMS_CAP; do
    # Indirect expansion reads the variable whose name is held in $name.
    printf -v quoted '%q' "${!name}"
    assignments+=("${name}=${quoted}")
  done
  printf '%s ' "${assignments[@]}"
  printf '%s' 'bash scripts/remote/run_200m_branch_only_chat.sh'
}
JOB_COMMAND="$(build_job_command)"
# Make the job identity and its command line visible to the child process.
export RUN_ID JOB_ROOT JOB_COMMAND

# Output and checkpoint directories default to subdirectories of
# $JOB_ROOT/$RUN_ID; a non-empty value already in the environment wins.
: "${OUTPUT_DIR:=$JOB_ROOT/$RUN_ID/outputs}"
: "${CHECKPOINT_DIR:=$JOB_ROOT/$RUN_ID/checkpoints}"
export OUTPUT_DIR CHECKPOINT_DIR

# The path is relative, so this must be run from the repository root.
# NOTE(review): the submit script presumably consumes the variables exported
# above from its environment — confirm against submit_detached_job.sh.
bash scripts/remote/submit_detached_job.sh