File size: 2,011 Bytes
422445b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/bin/bash
# Feather prod8 autonomous launcher — survives Hermes session transitions.
#
# Steps, in order:
#   1. cd into the project checkout.
#   2. Pick up a Hugging Face token: the first `hf_…` match in ~/.bashrc,
#      otherwise an HF_TOKEN already present in the environment.
#   3. Stop any train.py process that is still running (TERM, then KILL).
#   4. Export the run configuration (HYDRA_* and related variables).
#   5. Start train.py detached in its own session, appending its output to
#      run_3060_prod8.log, and report the PID of the training process.
set -euo pipefail
cd /home/mikeb/work/feather

# Regex used with pgrep/pkill -f to recognise the training process.
readonly TRAIN_PATTERN='python.*train\.py'
readonly PYTHON_BIN=./.venv/bin/python
readonly LOG_FILE=run_3060_prod8.log

# Find HF token.
# `|| true` keeps a missing file / no match from aborting under `set -e` with
# pipefail; in that case fall back to a token inherited from the environment
# instead of overwriting it with an empty string.
HF=$(grep -ohP 'hf_[A-Za-z0-9_-]+' ~/.bashrc 2>/dev/null | head -1 || true)
if [[ -z "$HF" ]]; then
  HF=${HF_TOKEN:-}
fi
if [[ -z "$HF" ]]; then
  echo "WARNING: no Hugging Face token found; HF_TOKEN will be empty" >&2
fi

# Kill stale training: ask politely first, force only if it is still alive
# after a short grace period. pkill returns non-zero when nothing matched, in
# which case there is nothing to wait for.
if pkill -TERM -f "$TRAIN_PATTERN" 2>/dev/null; then
  for _ in 1 2 3 4 5; do
    pgrep -f "$TRAIN_PATTERN" >/dev/null 2>&1 || break
    sleep 1
  done
  pkill -KILL -f "$TRAIN_PATTERN" 2>/dev/null || true
fi
sleep 1

# Export all HYDRA env vars
export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export HF_TOKEN="$HF"
# NOTE(review): huggingface_hub's legacy variable is spelled
# HUGGING_FACE_HUB_TOKEN; confirm that train.py really reads this spelling.
export HUGGINGFACE_HUB_TOKEN="$HF"
export WANDB_DISABLED=true
export HYDRA_USE_NEMOTRON=1
export HYDRA_USE_FULL_BLEND=1
export HYDRA_SAMPLED_SOFTMAX=1024
export HYDRA_SOFTCAP_CLAMP=1
export HYDRA_SEQ_LEN=1024
export HYDRA_HEADDIM=32
export HYDRA_D_STATE=64
export HYDRA_TIME_BUDGET=300
export HYDRA_ENGRAM_TOPK=64
# Exported as an empty string (set, but with no value).
export HYDRA_GDN_LAYERS=
export HYDRA_MTP_K=1
export HYDRA_USE_MDLM=0
export HYDRA_MUON_COMPILE=0
export HYDRA_MUON_NS_STEPS=2
export HYDRA_MATRIX_LR=0.01
export HYDRA_EMBED_LR=0.20
export HYDRA_UNEMBED_LR=0.001
export HYDRA_DT_BIAS_LR=0.05
export HYDRA_SCALAR_LR=0.01
export HYDRA_WARMUP_RATIO=0.01
export HYDRA_LR_MIN_MULT=0.10
export HYDRA_WARMSTART=1
export HYDRA_STREAM_SHUFFLE_BUFFER=4096
export HYDRA_LOCAL_SHARDS_ONLY=0
export HYDRA_BACKGROUND_PREFETCH=0
export HYDRA_STREAM_PREFETCH=16
export HYDRA_TOKEN_PREFETCH=4
export HYDRA_TOKEN_CACHE_GB=4
export HYDRA_CKPT_INTERVAL=2000
export HYDRA_MID_VAL_INTERVAL=250
export HYDRA_CKPT_ROTATIONS=3
export HYDRA_SKIP_FACTUAL_EVAL=1
export HYDRA_N_LAYER=6
export HYDRA_D_MODEL=192
export HYDRA_EXPAND=3
export HYDRA_BATCH_SIZE=16
export HYDRA_TOTAL_BATCH=32768
export HYDRA_HESTIA_INTERVAL=999999
export HYDRA_HTM_SUBSAMPLE=16
export UV_PYTHON=/usr/bin/python3

# Once setsid has forked, an exec failure in the detached child is invisible
# to this script, so check the interpreter up front.
if [[ ! -x "$PYTHON_BIN" ]]; then
  echo "ERROR: $PYTHON_BIN not found or not executable" >&2
  exit 1
fi

# Launch via setsid for session transition survival.
# `setsid -f` forks and its parent returns immediately, so no trailing `&` is
# needed, and `$!` would only name that short-lived parent. The real training
# PID is looked up by command line after a brief start-up pause.
setsid -f taskset -c 0-15 "$PYTHON_BIN" -u train.py </dev/null >>"$LOG_FILE" 2>&1
sleep 2
TPID=$(pgrep -n -f "$TRAIN_PATTERN" 2>/dev/null || true)
if [[ -n "$TPID" ]]; then
  echo "Launched PID=$TPID"
  echo "Training running"
else
  echo "WARNING: no training process found"
fi