#!/bin/bash
# Feather prod8 autonomous launcher — survives Hermes session transitions.
#
# Steps:
#   1. cd into the project directory.
#   2. Pick up a Hugging Face token (first hf_* match in ~/.bashrc, falling
#      back to an HF_TOKEN already present in the environment).
#   3. Stop any training process that is still running.
#   4. Export the run configuration as environment variables.
#   5. Start train.py detached in its own session via setsid, pinned to
#      CPUs 0-15, with stdout/stderr going to run_3060_prod8.log.
#   6. Report whether a matching training process is visible afterwards.
#
# Exit status: non-zero if cd or the setsid launch itself fails (set -e);
# a missing training process after launch only produces a warning.
set -euo pipefail

# Extended regex (pgrep/pkill -f) matching the training process command line.
readonly TRAIN_PATTERN='python.*train\.py'
# Seconds to wait after SIGTERM before escalating to SIGKILL.
readonly TERM_GRACE_SECS=5

cd /home/mikeb/work/feather

# --- Find HF token ---------------------------------------------------------
# `|| true` keeps set -e/pipefail from aborting when ~/.bashrc has no match.
hf_token=$(grep -ohP 'hf_[A-Za-z0-9_-]+' ~/.bashrc 2>/dev/null | head -1 || true)
# Do not clobber a token inherited from the environment with an empty string.
hf_token=${hf_token:-${HF_TOKEN:-}}
if [[ -z "$hf_token" ]]; then
  echo "WARNING: no Hugging Face token found in ~/.bashrc or HF_TOKEN" >&2
fi

# --- Kill stale training ---------------------------------------------------
# Ask politely first (SIGTERM), then force (SIGKILL) anything still alive.
if pgrep -f "$TRAIN_PATTERN" >/dev/null 2>&1; then
  pkill -TERM -f "$TRAIN_PATTERN" 2>/dev/null || true
  for ((waited = 0; waited < TERM_GRACE_SECS; waited++)); do
    pgrep -f "$TRAIN_PATTERN" >/dev/null 2>&1 || break
    sleep 1
  done
  pkill -KILL -f "$TRAIN_PATTERN" 2>/dev/null || true
  sleep 1
fi

# --- Runtime / credentials environment -------------------------------------
export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export HF_TOKEN="$hf_token"
export HUGGINGFACE_HUB_TOKEN="$hf_token"
export WANDB_DISABLED=true

# --- Export all HYDRA env vars ---------------------------------------------
export HYDRA_USE_NEMOTRON=1
export HYDRA_USE_FULL_BLEND=1
export HYDRA_SAMPLED_SOFTMAX=1024
export HYDRA_SOFTCAP_CLAMP=1
export HYDRA_SEQ_LEN=1024
export HYDRA_HEADDIM=32
export HYDRA_D_STATE=64
export HYDRA_TIME_BUDGET=300
export HYDRA_ENGRAM_TOPK=64
# Exported as an empty string, exactly as the original script did.
export HYDRA_GDN_LAYERS=
export HYDRA_MTP_K=1
export HYDRA_USE_MDLM=0
export HYDRA_MUON_COMPILE=0
export HYDRA_MUON_NS_STEPS=2
export HYDRA_MATRIX_LR=0.01
export HYDRA_EMBED_LR=0.20
export HYDRA_UNEMBED_LR=0.001
export HYDRA_DT_BIAS_LR=0.05
export HYDRA_SCALAR_LR=0.01
export HYDRA_WARMUP_RATIO=0.01
export HYDRA_LR_MIN_MULT=0.10
export HYDRA_WARMSTART=1
export HYDRA_STREAM_SHUFFLE_BUFFER=4096
export HYDRA_LOCAL_SHARDS_ONLY=0
export HYDRA_BACKGROUND_PREFETCH=0
export HYDRA_STREAM_PREFETCH=16
export HYDRA_TOKEN_PREFETCH=4
export HYDRA_TOKEN_CACHE_GB=4
export HYDRA_CKPT_INTERVAL=2000
export HYDRA_MID_VAL_INTERVAL=250
export HYDRA_CKPT_ROTATIONS=3
export HYDRA_SKIP_FACTUAL_EVAL=1
export HYDRA_N_LAYER=6
export HYDRA_D_MODEL=192
export HYDRA_EXPAND=3
export HYDRA_BATCH_SIZE=16
export HYDRA_TOTAL_BATCH=32768
export HYDRA_HESTIA_INTERVAL=999999
export HYDRA_HTM_SUBSAMPLE=16
export UV_PYTHON=/usr/bin/python3

# --- Launch via setsid for session transition survival ---------------------
# `setsid -f` forks and returns immediately, so no trailing `&` is needed and
# `$!` would only name the short-lived setsid wrapper, not the trainer. The
# real PID is looked up with pgrep once the process has had time to start.
setsid -f taskset -c 0-15 ./.venv/bin/python -u train.py >run_3060_prod8.log 2>&1

sleep 2
if train_pid=$(pgrep -n -f "$TRAIN_PATTERN" 2>/dev/null); then
  echo "Launched PID=$train_pid"
  echo "Training running"
else
  echo "WARNING: no training process found"
fi