TaoNet-mini-T2 / code /TaoTrain /scripts /remote /run_200m_base_suite.sh
StarMist0012's picture
Add files using upload-large-folder tool
e2bfccc verified
#!/usr/bin/env bash
# Configuration for the 200M base benchmark suite. Every setting below keeps a
# value already present in the environment and only falls back to the default
# when the variable is unset or empty.
set -euo pipefail

# --- Inputs and tooling ------------------------------------------------------
: "${DATA_PATH:=/home/student/Data/TaoData/pretrain.jsonl}"
: "${TOKENIZER_PATH:=/home/student/YouZheng/tokenizers/taodata_pilot_8k/tokenizer.model}"
: "${SSM_REPO_PATH:=/home/student/YouZheng/gamma_ssm_repo}"
: "${PYTHON_BIN:=/home/student/.venv/bin/python}"
# The repository root defaults to the directory the script is launched from.
: "${REMOTE_REPO:=$(pwd)}"

# --- Output locations --------------------------------------------------------
# These two are driven by differently named override variables, so they are
# always (re)assigned here rather than defaulted in place.
# NOTE(review): the override names REPOBRIDGE_OUTPUT_DIR and
# TAOTERN_CHECKPOINT_DIR are kept exactly as found; confirm the spelling
# against whatever sets them.
OUTPUT_BASE="${REPOBRIDGE_OUTPUT_DIR:-$REMOTE_REPO/results/200m-base-suite}"
CHECKPOINT_BASE="${TAOTERN_CHECKPOINT_DIR:-$OUTPUT_BASE/checkpoints}"

# --- Stage-1 budgets ---------------------------------------------------------
# Deliberately small to start with. Raise them from the environment once the
# 200M shapes run stably on the RTX5090.
: "${MAX_TOKENS:=50000000}"
: "${MAX_RECORDS:=100000}"
: "${TRAIN_STEPS:=200}"
: "${EVAL_BATCHES:=16}"
: "${BATCH_SIZES:=4,8}"
: "${SEQ_LEN:=512}"
: "${LEARNING_RATE:=0.0006}"
: "${WEIGHT_DECAY:=0.01}"
# DRY_RUN=1 makes the suite print its commands without executing them.
: "${DRY_RUN:=0}"

# Replaces (does not extend) any inherited PYTHONPATH.
PYTHONPATH="$REMOTE_REPO/src:$SSM_REPO_PATH"
export PYTHONPATH

mkdir -p "$OUTPUT_BASE"
mkdir -p "$CHECKPOINT_BASE"
#######################################
# Run scripts/benchmark_taonet_real_tokens.py for one model variant.
#
# The command is assembled as an array and executed directly (no eval), so
# paths and extra flags containing spaces, globs or quotes reach the benchmark
# as single, unmodified arguments.
#
# Globals (read):
#   OUTPUT_BASE, CHECKPOINT_BASE, PYTHON_BIN, DATA_PATH, TOKENIZER_PATH,
#   MAX_RECORDS, MAX_TOKENS, BATCH_SIZES, SEQ_LEN, TRAIN_STEPS,
#   LEARNING_RATE, WEIGHT_DECAY, EVAL_BATCHES, DRY_RUN
# Arguments:
#   $1  - variant name; used as the sub-directory name under OUTPUT_BASE and
#         CHECKPOINT_BASE and in the printed banner
#   $2+ - extra flags appended verbatim after the shared flags
# Outputs:
#   A banner and a shell-quoted rendering of the command on stdout.
# Returns:
#   0 when DRY_RUN is "1" (nothing is executed); otherwise the exit status of
#   the benchmark command.
#######################################
run_variant() {
  local variant="$1"
  shift
  local output_dir="$OUTPUT_BASE/$variant"
  local checkpoint_dir="$CHECKPOINT_BASE/$variant"
  # Directories are created even on a dry run.
  mkdir -p -- "$output_dir" "$checkpoint_dir"

  # PYTHON_BIN is a single executable path, as in the suite summary step.
  # The script path is relative, so the caller's working directory matters.
  local -a cmd=(
    "$PYTHON_BIN" scripts/benchmark_taonet_real_tokens.py
    --data-path "$DATA_PATH"
    --text-field text
    --tokenizer-type sentencepiece
    --tokenizer-path "$TOKENIZER_PATH"
    --max-records "$MAX_RECORDS"
    --max-tokens "$MAX_TOKENS"
    --eval-fraction 0.1
    --batch-sizes "$BATCH_SIZES"
    --seq-len "$SEQ_LEN"
    --dtype bf16
    --device cuda
    --warmup 1
    --repeats 2
    --backward
    --train-steps "$TRAIN_STEPS"
    --learning-rate "$LEARNING_RATE"
    --weight-decay "$WEIGHT_DECAY"
    --eval-batches "$EVAL_BATCHES"
    --output-dir "$output_dir"
    --resume-completed
    --incremental-output
    --save-case-checkpoints
    --checkpoint-dir "$checkpoint_dir"
    "$@"
  )

  # %q makes the logged line safe to copy-paste back into a shell; for plain
  # words it prints them unchanged.
  local rendered
  printf -v rendered '%q ' "${cmd[@]}"
  printf '\n=== 200M variant: %s ===\n' "$variant"
  printf '%s\n' "${rendered% }"

  if [[ "$DRY_RUN" == "1" ]]; then
    return 0
  fi
  "${cmd[@]}"
}
# Variant "attention_196m": the taonet architecture with the shape flags below.
attention_196m_flags=(
  --architectures taonet
  --hidden-dim 960
  --num-layers 16
  --num-heads 8
  --d-latent-kv 720
  --d-rope 120
  --hidden-dim-ff 2880
)
run_variant attention_196m "${attention_196m_flags[@]}"
# Variant "pure_ssm_196m_hadamard": the taonet_ssm architecture with
# --ssm-split-mixes hadamard.
pure_ssm_196m_hadamard_flags=(
  # Model shape.
  --architectures taonet_ssm
  --hidden-dim 1024
  --num-layers 18
  --num-heads 8
  --d-latent-kv 768
  --d-rope 128
  --hidden-dim-ff 3072
  # SSM settings.
  --ssm-core dplr
  --ssm-hidden-dims 16
  --ssm-mixer-dims 256
  --ssm-num-lanes-list 2
  --ssm-lane-combine channel
  --ssm-lane-modes split
  --ssm-split-mixes hadamard
  --ssm-rank 1
  --ssm-kernel-mode conv
  --no-ssm-finite-tail-correction
  --ssm-gate-types channel
  # Local shift settings.
  --ssm-local-shift
  --ssm-local-shift-per-channel
  --ssm-local-shift-init 0.1
)
run_variant pure_ssm_196m_hadamard "${pure_ssm_196m_hadamard_flags[@]}"
# Variant "pure_ssm_196m_nomix": the taonet_ssm architecture with
# --ssm-split-mixes none.
pure_ssm_196m_nomix_flags=(
  # Model shape.
  --architectures taonet_ssm
  --hidden-dim 1024
  --num-layers 18
  --num-heads 8
  --d-latent-kv 768
  --d-rope 128
  --hidden-dim-ff 3072
  # SSM settings.
  --ssm-core dplr
  --ssm-hidden-dims 16
  --ssm-mixer-dims 256
  --ssm-num-lanes-list 2
  --ssm-lane-combine channel
  --ssm-lane-modes split
  --ssm-split-mixes none
  --ssm-rank 1
  --ssm-kernel-mode conv
  --no-ssm-finite-tail-correction
  --ssm-gate-types channel
  # Local shift settings.
  --ssm-local-shift
  --ssm-local-shift-per-channel
  --ssm-local-shift-init 0.1
)
run_variant pure_ssm_196m_nomix "${pure_ssm_196m_nomix_flags[@]}"
# Variant "hybrid_ssm_first_199m": the taonet_hybrid architecture with
# --hybrid-patterns ssm_first.
hybrid_ssm_first_199m_flags=(
  # Model shape.
  --architectures taonet_hybrid
  --hidden-dim 1024
  --num-layers 16
  --num-heads 8
  --d-latent-kv 768
  --d-rope 128
  --hidden-dim-ff 3072
  # SSM settings.
  --ssm-core dplr
  --ssm-hidden-dims 32
  --ssm-mixer-dims 256
  --ssm-num-lanes-list 2
  --ssm-lane-combine channel
  --ssm-lane-modes split
  --ssm-split-mixes hadamard
  --ssm-rank 1
  --ssm-kernel-mode conv
  --no-ssm-finite-tail-correction
  --ssm-gate-types channel
  # Hybrid layout.
  --hybrid-patterns ssm_first
  # Local shift settings.
  --ssm-local-shift
  --ssm-local-shift-per-channel
  --ssm-local-shift-init 0.1
)
run_variant hybrid_ssm_first_199m "${hybrid_ssm_first_199m_flags[@]}"
# After all variants: run scripts/summarize_taonet_benchmark_suite.py against
# OUTPUT_BASE. Skipped only when DRY_RUN is exactly "1".
case "$DRY_RUN" in
  1)
    ;;
  *)
    "$PYTHON_BIN" scripts/summarize_taonet_benchmark_suite.py --suite-dir "$OUTPUT_BASE"
    ;;
esac