feather-runtime / overlay /scripts /sweep_depth_local.sh
Jackoatmon's picture
Normalize shell line endings in Docker build
6618931 verified
#!/usr/bin/env bash
# Local sequential depth sweep on RTX 3060.
# Uses real mamba_ssm Mamba3 (grafted from state-spaces/mamba main).
# Config: Gen 76 local champion (d_model=96, engram=4096, target_active=327),
# sweeping n_layer ∈ {1, 2, 3, 4}. Each run 300s (~5 min) → ~20 min total.
#
# Optional environment overrides (defaults are applied when unset or empty):
#   CUDA_HOME  CUDA toolkit root               (default: /usr/local/cuda)
#   PY         Python interpreter used to run  (default: the feather .venv python3)
#   OUT_DIR    directory for logs and metrics  (default: /tmp/local_sweep)
set -euo pipefail

# Work from the parent of this script's directory; train.py is invoked by
# relative path from there.
cd "$(dirname "${BASH_SOURCE[0]}")/.."

export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
# WSL2: libcuda.so.1 lives at /usr/lib/wsl/lib; put it ahead of any inherited
# entries so cudarc finds the CUDA driver library at runtime.
# The inherited value is appended only when it is non-empty: an unconditional
# trailing ':' would leave an empty path element, which the dynamic loader
# interprets as the current working directory.
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:/usr/lib/wsl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PYTORCH_ALLOC_CONF=expandable_segments:True

# GPU HTM path: use non-fused step_many_cuda (fused megakernel is Hopper-only).
# This drops htm_await from ~20-40s/step (CPU) to ~0ms (GPU, async).
export HYDRA_HTM_FUSED=0

# Architecture (Gen 76 + user audit: keep target_active=327 for gradient plasticity).
export HYDRA_D_MODEL=96
export HYDRA_D_STATE=16
export HYDRA_HEADDIM=12
export HYDRA_EXPAND=3
export HYDRA_ENGRAM_N_COLUMNS=4096
export HYDRA_SDR_TARGET_ACTIVE=327

# Training knobs tuned for 6GB VRAM.
export HYDRA_BATCH_SIZE=1
export HYDRA_TOTAL_BATCH=32768 # 1 * 8 accum * 512 seq * 8 heads = Gen 76 config
export HYDRA_TIME_BUDGET=300 # 5 min per run
export HYDRA_CKPT_INTERVAL=0 # don't save ckpts during sweep
export HYDRA_MID_VAL_INTERVAL=250

# Full per-layer diagnostic panel.
export HYDRA_LAYER_DIAGNOSTICS=1
export HYDRA_LAYER_DIAG_SVD_EVERY=100

# Use cached shards + tokenizer + retina (vocab=8192, target_active=327).
# NOT streaming — already have 2049 shards from prior local runs.
unset HYDRA_USE_NEMOTRON

# Interpreter and output location; both may be overridden from the environment
# so the sweep is not tied to one machine's paths.
PY="${PY:-/home/mikeb/work/feather/.venv/bin/python3}"
OUT_DIR="${OUT_DIR:-/tmp/local_sweep}"
mkdir -p -- "$OUT_DIR"
# Train once per depth (n_layer = 1..4), strictly one run after another.
# Each run writes its own log and metrics file under $OUT_DIR. A run that
# exits non-zero is reported and the sweep moves on to the next depth.
for depth in {1..4}; do
  run_log="${OUT_DIR}/sweep_n${depth}.log"
  banner_rule="=========================================="

  printf '%s\n' \
    "$banner_rule" \
    "=== n_layer=${depth} $(date +%H:%M:%S) ===" \
    "$banner_rule"

  # The training process reads its depth and metrics destination from the environment.
  export HYDRA_N_LAYER="$depth"
  export HYDRA_METRICS_OUT="${OUT_DIR}/sweep_n${depth}_metrics.json"

  # stdout and stderr both go to the per-run log; -u keeps Python unbuffered.
  if ! "$PY" -u train.py > "$run_log" 2>&1; then
    echo "[WARN] n_layer=${depth} run exited non-zero (see ${run_log})"
  fi
  echo "=== n_layer=${depth} done; metrics=${HYDRA_METRICS_OUT} log=${run_log} ==="

  # Show the last 20 headline lines from the log. grep exits non-zero when
  # nothing matches, so the pipeline is allowed to fail without ending the sweep.
  grep -E "val_bpb|LAYER_DIAG|METRICS_JSON" "$run_log" | tail -20 || true
done
# Closing banner (preceded by one blank line), then a long listing of
# everything in the output directory.
printf '\n%s\n' "=== SWEEP COMPLETE ==="
ls -la "$OUT_DIR"