File size: 4,323 Bytes
e2bfccc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env bash
set -euo pipefail

# Input locations and tool paths. A non-empty value already present in the
# environment is kept; otherwise the default written here is assigned.
: "${DATA_PATH:=/home/student/Data/TaoData/pretrain.jsonl}"
: "${TOKENIZER_PATH:=/home/student/YouZheng/tokenizers/taodata_pilot_8k/tokenizer.model}"
: "${SSM_REPO_PATH:=/home/student/YouZheng/gamma_ssm_repo}"
: "${PYTHON_BIN:=/home/student/.venv/bin/python}"
: "${REMOTE_REPO:=$(pwd)}"

# Results root: REPOBRIDGE_OUTPUT_DIR wins when set and non-empty, otherwise
# results are placed inside the repository checkout.
if [[ -n "${REPOBRIDGE_OUTPUT_DIR:-}" ]]; then
  OUTPUT_BASE="$REPOBRIDGE_OUTPUT_DIR"
else
  OUTPUT_BASE="$REMOTE_REPO/results/200m-base-suite"
fi

# Checkpoint root: TAOTERN_CHECKPOINT_DIR wins when set and non-empty,
# otherwise checkpoints live below the results root.
if [[ -n "${TAOTERN_CHECKPOINT_DIR:-}" ]]; then
  CHECKPOINT_BASE="$TAOTERN_CHECKPOINT_DIR"
else
  CHECKPOINT_BASE="$OUTPUT_BASE/checkpoints"
fi

# The stage-1 workload below is deliberately small. Once the 200M shapes run
# stably on the RTX5090, raise these values via environment variables.
: "${MAX_TOKENS:=50000000}"
: "${MAX_RECORDS:=100000}"
: "${TRAIN_STEPS:=200}"
: "${EVAL_BATCHES:=16}"
: "${BATCH_SIZES:=4,8}"
: "${SEQ_LEN:=512}"
: "${LEARNING_RATE:=0.0006}"
: "${WEIGHT_DECAY:=0.01}"
: "${DRY_RUN:=0}"

# Replace (not extend) PYTHONPATH with the repository sources followed by the
# SSM repository, and make sure both output roots exist before any run starts.
PYTHONPATH="$REMOTE_REPO/src:$SSM_REPO_PATH"
export PYTHONPATH
mkdir -p "$OUTPUT_BASE" "$CHECKPOINT_BASE"

#######################################
# Run the real-token benchmark for one model variant of the 200M suite.
#
# Creates the variant's output and checkpoint directories, assembles the
# argument list for scripts/benchmark_taonet_real_tokens.py from the
# suite-wide settings plus the variant-specific flags, prints the command,
# and executes it unless DRY_RUN is "1".
#
# The command is held in an array and executed directly instead of being
# flattened into a string and passed to eval. A string spanning several lines
# is split by eval into separate commands at every embedded newline, and
# unquoted expansions inside it are re-split and glob-expanded; an array keeps
# every argument as exactly one word.
#
# Globals (read):
#   PYTHON_BIN, DATA_PATH, TOKENIZER_PATH, MAX_RECORDS, MAX_TOKENS,
#   BATCH_SIZES, SEQ_LEN, TRAIN_STEPS, LEARNING_RATE, WEIGHT_DECAY,
#   EVAL_BATCHES, OUTPUT_BASE, CHECKPOINT_BASE, DRY_RUN
# Arguments:
#   $1    variant name; also the sub-directory used below OUTPUT_BASE and
#         CHECKPOINT_BASE
#   $2..  extra flags appended verbatim after the shared arguments
# Outputs:
#   A banner line and the shell-quoted command on stdout.
# Returns:
#   0 in dry-run mode; otherwise the exit status of the benchmark command.
#######################################
run_variant() {
  local variant="$1"
  shift
  local output_dir="$OUTPUT_BASE/$variant"
  local checkpoint_dir="$CHECKPOINT_BASE/$variant"
  # Directories are created even in dry-run mode.
  mkdir -p "$output_dir" "$checkpoint_dir"

  # PYTHON_BIN is treated as a single path, consistent with the quoted
  # "$PYTHON_BIN" used for the summary step at the end of the script.
  local -a cmd=(
    "$PYTHON_BIN" scripts/benchmark_taonet_real_tokens.py
    --data-path "$DATA_PATH"
    --text-field text
    --tokenizer-type sentencepiece
    --tokenizer-path "$TOKENIZER_PATH"
    --max-records "$MAX_RECORDS"
    --max-tokens "$MAX_TOKENS"
    --eval-fraction 0.1
    --batch-sizes "$BATCH_SIZES"
    --seq-len "$SEQ_LEN"
    --dtype bf16
    --device cuda
    --warmup 1
    --repeats 2
    --backward
    --train-steps "$TRAIN_STEPS"
    --learning-rate "$LEARNING_RATE"
    --weight-decay "$WEIGHT_DECAY"
    --eval-batches "$EVAL_BATCHES"
    --output-dir "$output_dir"
    --resume-completed
    --incremental-output
    --save-case-checkpoints
    --checkpoint-dir "$checkpoint_dir"
    "$@"
  )

  # %q renders each word so the logged line can be pasted back into a shell.
  local shown
  printf -v shown '%q ' "${cmd[@]}"

  printf '\n=== 200M variant: %s ===\n' "$variant"
  printf '%s\n' "${shown% }"
  if [[ "$DRY_RUN" == "1" ]]; then
    return 0
  fi
  "${cmd[@]}"
}

# Flag groups that are identical across the 1024-wide SSM-based variants. They
# are spliced into the calls below at the same positions the flags occupied
# when written out in full, so each variant receives the same argument order.
wide_attention_dims=(--num-heads 8 --d-latent-kv 768 --d-rope 128 --hidden-dim-ff 3072)
ssm_lane_layout=(
  --ssm-mixer-dims 256 --ssm-num-lanes-list 2
  --ssm-lane-combine channel --ssm-lane-modes split
)
ssm_kernel_setup=(
  --ssm-rank 1 --ssm-kernel-mode conv
  --no-ssm-finite-tail-correction --ssm-gate-types channel
)
ssm_local_shift=(--ssm-local-shift --ssm-local-shift-per-channel --ssm-local-shift-init 0.1)

# Variant 1: attention-only model (architecture "taonet").
run_variant attention_196m \
  --architectures taonet \
  --hidden-dim 960 --num-layers 16 --num-heads 8 \
  --d-latent-kv 720 --d-rope 120 --hidden-dim-ff 2880

# Variant 2: architecture "taonet_ssm" with the "hadamard" split mix.
run_variant pure_ssm_196m_hadamard \
  --architectures taonet_ssm \
  --hidden-dim 1024 --num-layers 18 \
  "${wide_attention_dims[@]}" \
  --ssm-core dplr --ssm-hidden-dims 16 \
  "${ssm_lane_layout[@]}" \
  --ssm-split-mixes hadamard \
  "${ssm_kernel_setup[@]}" \
  "${ssm_local_shift[@]}"

# Variant 3: same as variant 2 except the split mix is "none".
run_variant pure_ssm_196m_nomix \
  --architectures taonet_ssm \
  --hidden-dim 1024 --num-layers 18 \
  "${wide_attention_dims[@]}" \
  --ssm-core dplr --ssm-hidden-dims 16 \
  "${ssm_lane_layout[@]}" \
  --ssm-split-mixes none \
  "${ssm_kernel_setup[@]}" \
  "${ssm_local_shift[@]}"

# Variant 4: architecture "taonet_hybrid" with the "ssm_first" hybrid pattern,
# 16 layers and SSM hidden dimension 32.
run_variant hybrid_ssm_first_199m \
  --architectures taonet_hybrid \
  --hidden-dim 1024 --num-layers 16 \
  "${wide_attention_dims[@]}" \
  --ssm-core dplr --ssm-hidden-dims 32 \
  "${ssm_lane_layout[@]}" \
  --ssm-split-mixes hadamard \
  "${ssm_kernel_setup[@]}" \
  --hybrid-patterns ssm_first \
  "${ssm_local_shift[@]}"

# Run the suite summary script over OUTPUT_BASE once all variants have been
# executed. In dry-run mode (DRY_RUN exactly "1") this step is skipped.
case "$DRY_RUN" in
  1) ;;
  *)
    "$PYTHON_BIN" scripts/summarize_taonet_benchmark_suite.py --suite-dir "$OUTPUT_BASE"
    ;;
esac