#!/usr/bin/env bash
#
# Configuration and launcher helper for the 200M benchmark variants.
#
# Every setting below can be overridden through the environment variable named
# inside its ${VAR:-default} expansion. Set DRY_RUN=1 to print each benchmark
# command without executing it.
set -euo pipefail

DATA_PATH="${DATA_PATH:-/home/student/Data/TaoData/pretrain.jsonl}"
TOKENIZER_PATH="${TOKENIZER_PATH:-/home/student/YouZheng/tokenizers/taodata_pilot_8k/tokenizer.model}"
SSM_REPO_PATH="${SSM_REPO_PATH:-/home/student/YouZheng/gamma_ssm_repo}"
PYTHON_BIN="${PYTHON_BIN:-/home/student/.venv/bin/python}"
REMOTE_REPO="${REMOTE_REPO:-$(pwd)}"
OUTPUT_BASE="${REPOBRIDGE_OUTPUT_DIR:-$REMOTE_REPO/results/200m-base-suite}"
CHECKPOINT_BASE="${TAOTERN_CHECKPOINT_DIR:-$OUTPUT_BASE/checkpoints}"

# Stage-1 defaults are intentionally modest. Increase these through environment
# variables after the 200M shapes are stable on the RTX5090.
MAX_TOKENS="${MAX_TOKENS:-50000000}"
MAX_RECORDS="${MAX_RECORDS:-100000}"
TRAIN_STEPS="${TRAIN_STEPS:-200}"
EVAL_BATCHES="${EVAL_BATCHES:-16}"
BATCH_SIZES="${BATCH_SIZES:-4,8}"
SEQ_LEN="${SEQ_LEN:-512}"
LEARNING_RATE="${LEARNING_RATE:-0.0006}"
WEIGHT_DECAY="${WEIGHT_DECAY:-0.01}"
DRY_RUN="${DRY_RUN:-0}"

# Note: this replaces any PYTHONPATH inherited from the caller's environment.
export PYTHONPATH="$REMOTE_REPO/src:$SSM_REPO_PATH"

mkdir -p "$OUTPUT_BASE" "$CHECKPOINT_BASE"

#######################################
# Runs scripts/benchmark_taonet_real_tokens.py for one named variant.
#
# The command is assembled as an array and executed directly (no eval), so
# paths and extra flags containing spaces or shell metacharacters reach the
# benchmark as single, unmodified arguments.
#
# Globals (read):
#   PYTHON_BIN, DATA_PATH, TOKENIZER_PATH, MAX_RECORDS, MAX_TOKENS,
#   BATCH_SIZES, SEQ_LEN, TRAIN_STEPS, LEARNING_RATE, WEIGHT_DECAY,
#   EVAL_BATCHES, OUTPUT_BASE, CHECKPOINT_BASE, DRY_RUN
# Arguments:
#   $1   - variant name; used as the sub-directory created under both
#          OUTPUT_BASE and CHECKPOINT_BASE
#   $2.. - extra flags appended verbatim after the common benchmark flags
# Outputs:
#   A banner line and the shell-quoted command on stdout.
# Returns:
#   0 when DRY_RUN is "1"; otherwise the exit status of the benchmark.
#######################################
run_variant() {
  # Fail with a readable message instead of an "unbound variable" error.
  local variant="${1:?run_variant: missing variant name}"
  shift

  local output_dir="$OUTPUT_BASE/$variant"
  local checkpoint_dir="$CHECKPOINT_BASE/$variant"
  mkdir -p "$output_dir" "$checkpoint_dir"

  # The script path is relative: the caller's working directory must be the
  # directory that contains scripts/.
  local -a cmd=(
    "$PYTHON_BIN" scripts/benchmark_taonet_real_tokens.py
    --data-path "$DATA_PATH"
    --text-field text
    --tokenizer-type sentencepiece
    --tokenizer-path "$TOKENIZER_PATH"
    --max-records "$MAX_RECORDS"
    --max-tokens "$MAX_TOKENS"
    --eval-fraction 0.1
    --batch-sizes "$BATCH_SIZES"
    --seq-len "$SEQ_LEN"
    --dtype bf16
    --device cuda
    --warmup 1
    --repeats 2
    --backward
    --train-steps "$TRAIN_STEPS"
    --learning-rate "$LEARNING_RATE"
    --weight-decay "$WEIGHT_DECAY"
    --eval-batches "$EVAL_BATCHES"
    --output-dir "$output_dir"
    --resume-completed
    --incremental-output
    --save-case-checkpoints
    --checkpoint-dir "$checkpoint_dir"
    "$@"
  )

  printf '\n=== 200M variant: %s ===\n' "$variant"
  # %q emits each word shell-quoted, so the logged line can be pasted back
  # into a shell and reproduces the exact argument vector.
  printf '%q ' "${cmd[@]}"
  printf '\n'

  if [[ "$DRY_RUN" == "1" ]]; then
    return 0
  fi

  "${cmd[@]}"
}
# ---------------------------------------------------------------------------
# Variant suite: one run_variant call per model configuration, followed by the
# suite summary. Flag groups repeated across the SSM-bearing variants are held
# in arrays; each call still passes its flags in the same order as before.
# ---------------------------------------------------------------------------

# Flags that follow --ssm-split-mixes in every SSM-bearing variant.
ssm_kernel_flags=(
  --ssm-rank 1
  --ssm-kernel-mode conv
  --no-ssm-finite-tail-correction
  --ssm-gate-types channel
)

# Local-shift flags that close out every SSM-bearing variant.
ssm_local_shift_flags=(
  --ssm-local-shift
  --ssm-local-shift-per-channel
  --ssm-local-shift-init 0.1
)

# Model-shape and lane flags shared by both pure-SSM variants.
pure_ssm_flags=(
  --architectures taonet_ssm
  --hidden-dim 1024
  --num-layers 18
  --num-heads 8
  --d-latent-kv 768
  --d-rope 128
  --hidden-dim-ff 3072
  --ssm-core dplr
  --ssm-hidden-dims 16
  --ssm-mixer-dims 256
  --ssm-num-lanes-list 2
  --ssm-lane-combine channel
  --ssm-lane-modes split
)

# Model-shape and lane flags for the hybrid variant.
hybrid_ssm_flags=(
  --architectures taonet_hybrid
  --hidden-dim 1024
  --num-layers 16
  --num-heads 8
  --d-latent-kv 768
  --d-rope 128
  --hidden-dim-ff 3072
  --ssm-core dplr
  --ssm-hidden-dims 32
  --ssm-mixer-dims 256
  --ssm-num-lanes-list 2
  --ssm-lane-combine channel
  --ssm-lane-modes split
)

# Attention variant: no SSM flags at all.
run_variant attention_196m \
  --architectures taonet \
  --hidden-dim 960 \
  --num-layers 16 \
  --num-heads 8 \
  --d-latent-kv 720 \
  --d-rope 120 \
  --hidden-dim-ff 2880

# The two pure-SSM variants differ only in the --ssm-split-mixes value.
run_variant pure_ssm_196m_hadamard \
  "${pure_ssm_flags[@]}" \
  --ssm-split-mixes hadamard \
  "${ssm_kernel_flags[@]}" \
  "${ssm_local_shift_flags[@]}"

run_variant pure_ssm_196m_nomix \
  "${pure_ssm_flags[@]}" \
  --ssm-split-mixes none \
  "${ssm_kernel_flags[@]}" \
  "${ssm_local_shift_flags[@]}"

# Hybrid variant: --hybrid-patterns sits between the kernel and local-shift
# groups, as in the original flag order.
run_variant hybrid_ssm_first_199m \
  "${hybrid_ssm_flags[@]}" \
  --ssm-split-mixes hadamard \
  "${ssm_kernel_flags[@]}" \
  --hybrid-patterns ssm_first \
  "${ssm_local_shift_flags[@]}"

# The summary step is skipped when DRY_RUN is "1".
if [[ "$DRY_RUN" != "1" ]]; then
  "$PYTHON_BIN" scripts/summarize_taonet_benchmark_suite.py --suite-dir "$OUTPUT_BASE"
fi