#!/usr/bin/env bash
set -euo pipefail

# --- Input locations -------------------------------------------------------
# Every setting below keeps a value already present in the environment and
# only falls back to the default when the variable is unset or empty.
: "${DATA_PATH:=/home/student/Data/TaoData/pretrain.jsonl}"
: "${TOKENIZER_PATH:=/home/student/YouZheng/tokenizers/taodata_pilot_8k/tokenizer.model}"
: "${SSM_REPO_PATH:=/home/student/YouZheng/gamma_ssm_repo}"
: "${PYTHON_BIN:=/home/student/.venv/bin/python}"

# The repository root defaults to the directory the script is launched from.
if [[ -z "${REMOTE_REPO:-}" ]]; then
  REMOTE_REPO="$(pwd)"
fi

# --- Output locations ------------------------------------------------------
# These two are driven by differently named environment variables
# (REPOBRIDGE_OUTPUT_DIR and TAOTERN_CHECKPOINT_DIR), not by their own names.
OUTPUT_BASE="${REPOBRIDGE_OUTPUT_DIR:-${REMOTE_REPO}/results/200m-base-suite}"
CHECKPOINT_BASE="${TAOTERN_CHECKPOINT_DIR:-${OUTPUT_BASE}/checkpoints}"

# --- Stage-1 workload ------------------------------------------------------
# The defaults are deliberately modest. Raise them through the environment
# once the 200M shapes are stable on the RTX5090.
: "${MAX_TOKENS:=50000000}"
: "${MAX_RECORDS:=100000}"
: "${TRAIN_STEPS:=200}"
: "${EVAL_BATCHES:=16}"
: "${BATCH_SIZES:=4,8}"
: "${SEQ_LEN:=512}"
: "${LEARNING_RATE:=0.0006}"
: "${WEIGHT_DECAY:=0.01}"
: "${DRY_RUN:=0}"

# Python import path: this repository's src/ first, then the SSM repository.
# Any PYTHONPATH inherited from the caller is replaced, not extended.
PYTHONPATH="${REMOTE_REPO}/src:${SSM_REPO_PATH}"
export PYTHONPATH

# Make sure both base directories exist before any variant runs.
mkdir -p "$OUTPUT_BASE" "$CHECKPOINT_BASE"

run_variant() {
  local variant="$1"
  shift
  local output_dir="$OUTPUT_BASE/$variant"
  local checkpoint_dir="$CHECKPOINT_BASE/$variant"
  mkdir -p "$output_dir" "$checkpoint_dir"

  local cmd="$PYTHON_BIN scripts/benchmark_taonet_real_tokens.py \
    --data-path $DATA_PATH \
    --text-field text \
    --tokenizer-type sentencepiece \
    --tokenizer-path $TOKENIZER_PATH \
    --max-records $MAX_RECORDS \
    --max-tokens $MAX_TOKENS \
    --eval-fraction 0.1 \
    --batch-sizes $BATCH_SIZES \
    --seq-len $SEQ_LEN \
    --dtype bf16 \
    --device cuda \
    --warmup 1 \
    --repeats 2 \
    --backward \
    --train-steps $TRAIN_STEPS \
    --learning-rate $LEARNING_RATE \
    --weight-decay $WEIGHT_DECAY \
    --eval-batches $EVAL_BATCHES \
    --output-dir $output_dir \
    --resume-completed \
    --incremental-output \
    --save-case-checkpoints \
    --checkpoint-dir $checkpoint_dir \
    $*"

  printf '\n=== 200M variant: %s ===\n' "$variant"
  printf '%s\n' "$cmd"
  if [ "$DRY_RUN" = "1" ]; then
    return 0
  fi
  eval "$cmd"
}

# Variant 1: the plain "taonet" architecture.
attention_flags=(
  --architectures taonet
  --hidden-dim 960
  --num-layers 16
  --num-heads 8
  --d-latent-kv 720
  --d-rope 120
  --hidden-dim-ff 2880
)
run_variant attention_196m "${attention_flags[@]}"

# Variants 2 and 3: "taonet_ssm". The two runs are identical except for the
# value given to --ssm-split-mixes, so the flags before and after that option
# are shared and the differing option is spliced in between to keep the
# original argument order.
pure_ssm_head=(
  --architectures taonet_ssm
  --hidden-dim 1024
  --num-layers 18
  --num-heads 8
  --d-latent-kv 768
  --d-rope 128
  --hidden-dim-ff 3072
  --ssm-core dplr
  --ssm-hidden-dims 16
  --ssm-mixer-dims 256
  --ssm-num-lanes-list 2
  --ssm-lane-combine channel
  --ssm-lane-modes split
)
pure_ssm_tail=(
  --ssm-rank 1
  --ssm-kernel-mode conv
  --no-ssm-finite-tail-correction
  --ssm-gate-types channel
  --ssm-local-shift
  --ssm-local-shift-per-channel
  --ssm-local-shift-init 0.1
)
run_variant pure_ssm_196m_hadamard \
  "${pure_ssm_head[@]}" --ssm-split-mixes hadamard "${pure_ssm_tail[@]}"
run_variant pure_ssm_196m_nomix \
  "${pure_ssm_head[@]}" --ssm-split-mixes none "${pure_ssm_tail[@]}"

# Variant 4: "taonet_hybrid" with the ssm_first hybrid pattern.
hybrid_flags=(
  --architectures taonet_hybrid
  --hidden-dim 1024
  --num-layers 16
  --num-heads 8
  --d-latent-kv 768
  --d-rope 128
  --hidden-dim-ff 3072
  --ssm-core dplr
  --ssm-hidden-dims 32
  --ssm-mixer-dims 256
  --ssm-num-lanes-list 2
  --ssm-lane-combine channel
  --ssm-lane-modes split
  --ssm-split-mixes hadamard
  --ssm-rank 1
  --ssm-kernel-mode conv
  --no-ssm-finite-tail-correction
  --ssm-gate-types channel
  --hybrid-patterns ssm_first
  --ssm-local-shift
  --ssm-local-shift-per-channel
  --ssm-local-shift-init 0.1
)
run_variant hybrid_ssm_first_199m "${hybrid_flags[@]}"

# After all variants have run, invoke the suite summary script on the suite
# output directory. Skipped in dry-run mode.
if [[ "$DRY_RUN" != "1" ]]; then
  summarize_cmd=(
    "$PYTHON_BIN" scripts/summarize_taonet_benchmark_suite.py
    --suite-dir "$OUTPUT_BASE"
  )
  "${summarize_cmd[@]}"
fi