#!/usr/bin/env bash
#
# Stability-gate driver: runs diagnostics against a pretraining checkpoint and
# stores the results under $OUTPUT_BASE. The stages visible in this file are:
#   1. activation probe            -> diagnostics/activation_probe_pretrain_latest.json
#   2. checkpoint generation samples -> diagnostics/generation_samples_pretrain_latest.json
#   3. SFT sanity check            -> diagnostics/sft_sanity_pretrain_latest.json
#
# Every UPPER_CASE setting below can be overridden from the environment.
# The diagnostics scripts are invoked by relative path (scripts/diagnostics/...),
# so the script has to be started from a directory where those paths resolve.
#
# NOTE(review): this file reached review with all of its newlines collapsed
# (so the entire script sat on the shebang line and nothing executed) and with
# the bodies of its here-documents apparently stripped. The statement structure
# has been restored below. Three regions could NOT be recovered; they are
# marked "LOST REGION", keep the surviving text verbatim in a comment, and are
# followed by a guard that stops the run with exit status 2 instead of
# continuing with missing inputs. Restore those regions from version control
# and delete the guards.

set -euo pipefail

# ---------------------------------------------------------------------------
# Settings (environment overrides win over the defaults).
# NOTE(review): several of these (SFT_DATA_PATH, MAX_TOKENS, MAX_RECORDS,
# EVAL_BATCHES, LEARNING_RATE, WEIGHT_DECAY, TRAIN_LOG_EVERY, TRAIN_STEPS) are
# not referenced by any surviving statement; presumably the lost regions used
# them, so they are kept unchanged.
# ---------------------------------------------------------------------------
DATA_PATH="${DATA_PATH:-/home/student/Data/TaoData/pretrain.jsonl}"
SFT_DATA_PATH="${SFT_DATA_PATH:-/home/student/Data/TaoData/sft.jsonl}"
TOKENIZER_PATH="${TOKENIZER_PATH:-/home/student/YouZheng/tokenizers/taodata_pilot_8k/tokenizer.model}"
SSM_REPO_PATH="${SSM_REPO_PATH:-/home/student/YouZheng/gamma_ssm_repo}"
PYTHON_BIN="${PYTHON_BIN:-/home/student/.venv/bin/python}"
REMOTE_REPO="${REMOTE_REPO:-$(pwd)}"
OUTPUT_BASE="${REPOBRIDGE_OUTPUT_DIR:-$REMOTE_REPO/results/pre-200m-stability-gate}"
CHECKPOINT_BASE="${TAOTERN_CHECKPOINT_DIR:-$OUTPUT_BASE/checkpoints}"

SEQ_LEN="${SEQ_LEN:-512}"
BATCH_SIZE="${BATCH_SIZE:-8}"
TARGET_TOKENS="${TARGET_TOKENS:-20000000}"
MAX_TOKENS="${MAX_TOKENS:-50000000}"
MAX_RECORDS="${MAX_RECORDS:-120000}"
EVAL_BATCHES="${EVAL_BATCHES:-64}"
LEARNING_RATE="${LEARNING_RATE:-0.0008}"
WEIGHT_DECAY="${WEIGHT_DECAY:-0.01}"
TRAIN_LOG_EVERY="${TRAIN_LOG_EVERY:-250}"

SFT_SANITY_SAMPLES="${SFT_SANITY_SAMPLES:-4}"
SFT_SANITY_STEPS="${SFT_SANITY_STEPS:-120}"
SFT_SANITY_LR="${SFT_SANITY_LR:-0.00005}"

#######################################
# Integer division rounded up.
# Arguments: $1 - numerator
#            $2 - denominator (must be greater than zero)
# Outputs:   the quotient on stdout
# Returns:   0 on success, 1 if the denominator is not positive
#######################################
ceil_div() {
  local numerator="$1"
  local denominator="$2"
  # Reject a zero/negative divisor up front so the caller gets a readable
  # message rather than bash's bare "division by 0" arithmetic error.
  if (( denominator <= 0 )); then
    printf 'ceil_div: denominator must be positive, got %s\n' "$denominator" >&2
    return 1
  fi
  printf '%s\n' "$(( (numerator + denominator - 1) / denominator ))"
}

#######################################
# Stop the run because a part of this script is missing.
# Arguments: $1 - description of what is missing
# Outputs:   diagnostics on stderr
# Returns:   does not return; exits with status 2
#######################################
abort_lost_region() {
  printf 'error: %s\n' "$1" >&2
  printf 'error: part of this script was lost; restore it from version control.\n' >&2
  exit 2
}

# Number of steps needed to consume TARGET_TOKENS at BATCH_SIZE * SEQ_LEN
# tokens per step, unless the caller pinned TRAIN_STEPS explicitly.
# This is a plain assignment (not a ${VAR:-$(...)} default) so that a failing
# ceil_div aborts the script under `set -e` instead of yielding an empty value.
if [[ -z "${TRAIN_STEPS:-}" ]]; then
  TRAIN_STEPS="$(ceil_div "$TARGET_TOKENS" "$((BATCH_SIZE * SEQ_LEN))")"
fi

export PYTHONPATH="$REMOTE_REPO/src:$SSM_REPO_PATH"

mkdir -p \
  "$OUTPUT_BASE" \
  "$CHECKPOINT_BASE" \
  "$OUTPUT_BASE/configs" \
  "$OUTPUT_BASE/diagnostics"

# --- LOST REGION 1 ----------------------------------------------------------
# Surviving text, verbatim:
#   cat > "$OUTPUT_BASE/run_plan.json" <&2 exit 2 fi
# Everything between the start of the run_plan.json here-document and the end
# of an `if ... >&2; exit 2; fi` check is missing. PRETRAIN_CKPT is used by
# every command below but is never assigned in the surviving text, so it was
# presumably set in this region — confirm when restoring.
if [[ -z "${PRETRAIN_CKPT:-}" ]]; then
  abort_lost_region "PRETRAIN_CKPT is not set (the code that wrote run_plan.json and defined the checkpoint path is missing)"
fi
# --- END LOST REGION 1 ------------------------------------------------------

# Stage 1: activation probe on the pretraining checkpoint.
"$PYTHON_BIN" scripts/diagnostics/activation_probe.py \
  --checkpoint "$PRETRAIN_CKPT" \
  --tokenizer-path "$TOKENIZER_PATH" \
  --data-path "$DATA_PATH" \
  --text-field text \
  --output "$OUTPUT_BASE/diagnostics/activation_probe_pretrain_latest.json" \
  --batch-size 2 \
  --seq-len "$SEQ_LEN" \
  --device cuda \
  --dtype bfloat16

# Stage 2: sample generations from the same checkpoint for three fixed prompts.
"$PYTHON_BIN" scripts/diagnostics/generate_checkpoint_samples.py \
  --checkpoint "$PRETRAIN_CKPT" \
  --tokenizer-path "$TOKENIZER_PATH" \
  --output "$OUTPUT_BASE/diagnostics/generation_samples_pretrain_latest.json" \
  --max-new-tokens 80 \
  --temperature 0.8 \
  --top-p 0.9 \
  --prompt "The purpose of artificial intelligence is" \
  --prompt "In a small village," \
  --prompt "Hello, who are you?"

SFT_CONFIG="$OUTPUT_BASE/configs/sft_sanity.yaml"

# --- LOST REGION 2 ----------------------------------------------------------
# A here-document (terminated by YAML) wrote $SFT_CONFIG here. Its beginning is
# missing, the quoted token values are empty, and the indentation that carried
# the YAML nesting is gone, so it cannot be regenerated safely.
# Surviving text, verbatim (wrapped for readability):
#   cat > "$SFT_CONFIG" <" assistant_token: "" response_loss_only: true
#   batch_size: $BATCH_SIZE num_epochs: 100000 max_steps: $SFT_SANITY_STEPS
#   gradient_accumulation_steps: 1 max_grad_norm: 1.0 optimizer:
#   optimizer_type: adamw learning_rate: $SFT_SANITY_LR weight_decay: 0.0
#   betas: [0.9, 0.999] eps: 1e-8 scheduler: scheduler_type: linearWarmup
#   warmup_steps: 0 dtype: bfloat16 device: cuda
#   checkpoint_dir: $CHECKPOINT_BASE/sft_sanity save_every_steps: 5000
#   save_best_model: false keep_last_n_checkpoints: 1 eval_every_steps: 5000
#   eval_samples: 32 log_every_steps: 10 aim_repo: $OUTPUT_BASE/.aim-sft-sanity
#   seed: 43 num_workers: 0 pin_memory: true YAML
if [[ ! -s "$SFT_CONFIG" ]]; then
  abort_lost_region "SFT config $SFT_CONFIG does not exist (the here-document that generated it is missing)"
fi
# --- END LOST REGION 2 ------------------------------------------------------

# Stage 3: short SFT sanity run starting from the pretraining checkpoint.
"$PYTHON_BIN" scripts/diagnostics/sft_sanity_check.py \
  --config "$SFT_CONFIG" \
  --checkpoint "$PRETRAIN_CKPT" \
  --output "$OUTPUT_BASE/diagnostics/sft_sanity_pretrain_latest.json" \
  --samples "$SFT_SANITY_SAMPLES" \
  --steps "$SFT_SANITY_STEPS" \
  --lr "$SFT_SANITY_LR" \
  --log-every 20 \
  --device cuda \
  --dtype bfloat16 \
  --ssm-branch-rms-norm \
  --ssm-branch-clip-value 1.0 \
  --block-residual-rms-norm \
  --block-residual-rms-target 1.0

# --- LOST REGION 3 ----------------------------------------------------------
# Surviving text, verbatim:
#   cat > "$OUTPUT_BASE/gate_summary.json" <
# The here-document body (and anything that followed it) is not visible, so the
# summary cannot be written. Exit non-zero so callers do not mistake this run
# for a completed gate.
abort_lost_region "cannot write $OUTPUT_BASE/gate_summary.json (the here-document body is missing)"
# --- END LOST REGION 3 ------------------------------------------------------