Spaces:
Runtime error
Runtime error
| # Local sequential depth sweep on RTX 3060. | |
| # Uses real mamba_ssm Mamba3 (grafted from state-spaces/mamba main). | |
| # Config: Gen 76 local champion (d_model=96, engram=4096, target_active=327), | |
| # sweeping n_layer in {1, 2, 3, 4}. Each run 300s (~5 min) => ~20 min total. | |
| # Strict mode: stop on any failing command, unset variable, or failing pipeline stage. | |
| set -euo pipefail | |
| # Run from the parent of this script's directory so the relative train.py path resolves. | |
| cd "$(dirname "${BASH_SOURCE[0]}")/.." | |
| # Honour a caller-provided CUDA_HOME, otherwise fall back to the default install prefix. | |
| export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda} | |
| # WSL2: libcuda.so.1 lives at /usr/lib/wsl/lib; prepend it so cudarc finds the | |
| # CUDA driver library at runtime. | |
| # The caller's LD_LIBRARY_PATH is appended only when it is non-empty: a trailing | |
| # colon would add a zero-length entry, which the dynamic loader treats as the | |
| # current working directory. | |
| export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:/usr/lib/wsl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} | |
| export PYTORCH_ALLOC_CONF=expandable_segments:True | |
| # GPU HTM path: use non-fused step_many_cuda (fused megakernel is Hopper-only). | |
| # This drops htm_await from ~20-40s/step (CPU) to ~0ms (GPU, async). | |
| export HYDRA_HTM_FUSED=0 | |
| # Architecture settings, exported so the train.py child processes inherit them (Gen 76 + user audit: keep target_active=327 for gradient plasticity). | |
| export HYDRA_D_MODEL=96 | |
| export HYDRA_D_STATE=16 | |
| export HYDRA_HEADDIM=12 | |
| export HYDRA_EXPAND=3 | |
| export HYDRA_ENGRAM_N_COLUMNS=4096 | |
| export HYDRA_SDR_TARGET_ACTIVE=327 | |
| # Training knobs tuned for 6GB VRAM. | |
| export HYDRA_BATCH_SIZE=1 | |
| export HYDRA_TOTAL_BATCH=32768 # 1 * 8 accum * 512 seq * 8 heads = Gen 76 config | |
| export HYDRA_TIME_BUDGET=300 # 5 min per run | |
| export HYDRA_CKPT_INTERVAL=0 # don't save ckpts during sweep | |
| export HYDRA_MID_VAL_INTERVAL=250 | |
| # Full per-layer diagnostic panel; the SVD interval below is presumably counted in training steps (confirm in train.py). | |
| export HYDRA_LAYER_DIAGNOSTICS=1 | |
| export HYDRA_LAYER_DIAG_SVD_EVERY=100 | |
| # Use cached shards + tokenizer + retina (vocab=8192, target_active=327). | |
| # NOT streaming -- already have 2049 shards from prior local runs. Clearing HYDRA_USE_NEMOTRON below presumably keeps train.py off the streaming path (confirm in train.py). | |
| unset HYDRA_USE_NEMOTRON | |
| # Interpreter and output directory for the sweep; each depth gets its own log path and metrics path. | |
| PY=/home/mikeb/work/feather/.venv/bin/python3 | |
| OUT_DIR=/tmp/local_sweep | |
| mkdir -p "$OUT_DIR" | |
| for N in {1..4}; do | |
| # Timestamped banner marking the start of this depth. | |
| printf '%s\n' "==========================================" "=== n_layer=$N $(date +%H:%M:%S) ===" "==========================================" | |
| export HYDRA_N_LAYER=$N | |
| LOG="$OUT_DIR/sweep_n${N}.log" | |
| export HYDRA_METRICS_OUT="$OUT_DIR/sweep_n${N}_metrics.json" | |
| # Capture stdout and stderr in the per-depth log; a failed run is reported but does not stop the remaining depths. | |
| if ! "$PY" -u train.py > "$LOG" 2>&1; then | |
| echo "[WARN] n_layer=$N run exited non-zero (see $LOG)" | |
| fi | |
| printf '%s\n' "=== n_layer=$N done; metrics=$HYDRA_METRICS_OUT log=$LOG ===" | |
| # Show the last 20 headline lines from this run's log; a log with no matches must not abort the sweep under strict mode. | |
| grep -E "val_bpb|LAYER_DIAG|METRICS_JSON" "$LOG" | tail -20 || true | |
| done | |
| printf '\n%s\n' "=== SWEEP COMPLETE ===" | |
| ls -la "$OUT_DIR" | |