#!/bin/bash
#
# (Re)launch train.py as a detached background job.
#
# Steps, in order:
#   1. cd into the project checkout.
#   2. Read the first hf_* token found in ~/.bashrc.
#   3. Hard-kill any running "python ... train.py" process.
#   4. Delete the packed-token cache file(s) so they are not reused.
#   5. Export the run configuration.
#   6. Start train.py in its own session, appending all output to
#      run_3060_prod9.log, then report the PID that is actually running.
#
# Exit status: non-zero if cd, rm or setsid fails (set -e); otherwise 0,
# even when no training process is found afterwards (a warning is printed).

set -euo pipefail
cd /home/mikeb/work/feather

# First hf_* token mentioned in ~/.bashrc. A missing file or no match yields
# an empty string instead of aborting under `set -e` / pipefail.
HF=$(grep -ohP 'hf_[A-Za-z0-9_-]+' ~/.bashrc 2>/dev/null | head -1 || true)
if [[ -z "$HF" ]]; then
  echo "WARNING: no hf_* token found in ~/.bashrc; HF_TOKEN will be empty" >&2
fi

# Stop any previous run. "No matching process" is the normal case on a fresh
# start, hence `|| true`.
# NOTE(review): SIGKILL gives the old run no chance to clean up. Kept as in
# the original because it is not visible here how train.py reacts to SIGTERM.
pkill -9 -f "python.*train\.py" 2>/dev/null || true
sleep 1  # let the kernel reap the killed process before touching its files

# Drop the packed-token cache and any files sharing its name prefix.
rm -f -- /home/mikeb/.cache/autoresearch/packed_tokens_v1_T1024_V65536_train.bin*

# --- Library / framework environment ---------------------------------------
export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export HF_TOKEN="$HF"
export HUGGINGFACE_HUB_TOKEN="$HF"
export WANDB_DISABLED=true

# --- HYDRA_* settings --------------------------------------------------------
# Presumably read by train.py; their exact semantics are defined there and
# are not visible from this script.
export HYDRA_USE_NEMOTRON=1
export HYDRA_USE_FULL_BLEND=1
export HYDRA_SAMPLED_SOFTMAX=1024
export HYDRA_SOFTCAP_CLAMP=1
export HYDRA_SEQ_LEN=1024
export HYDRA_HEADDIM=32
export HYDRA_D_STATE=64
export HYDRA_TIME_BUDGET=300
export HYDRA_ENGRAM_TOPK=64
# Deliberately exported as an empty string (set, but with no value).
export HYDRA_GDN_LAYERS=
export HYDRA_MTP_K=1
export HYDRA_USE_MDLM=0
export HYDRA_MUON_COMPILE=0
export HYDRA_MUON_NS_STEPS=2

export HYDRA_RESUME_CKPT=/home/mikeb/.cache/autoresearch/best_bpb.pt
export HYDRA_MATRIX_LR=0.004
export HYDRA_EMBED_LR=0.08
export HYDRA_UNEMBED_LR=0.0005
export HYDRA_DT_BIAS_LR=0.02
export HYDRA_SCALAR_LR=0.004
export HYDRA_WEIGHT_DECAY=0.03
export HYDRA_DROPOUT=0.30
export HYDRA_LABEL_SMOOTHING=0.05
export HYDRA_Z_LOSS_WEIGHT=0.0005
export HYDRA_WARMUP_RATIO=0.02
export HYDRA_LR_MIN_MULT=0.25
export HYDRA_WARMSTART=1
export HYDRA_STREAM_SHUFFLE_BUFFER=4096
export HYDRA_LOCAL_SHARDS_ONLY=0
export HYDRA_BACKGROUND_PREFETCH=0
export HYDRA_STREAM_PREFETCH=16
export HYDRA_TOKEN_PREFETCH=4
export HYDRA_TOKEN_CACHE_GB=4
export HYDRA_CKPT_INTERVAL=2000
export HYDRA_MID_VAL_INTERVAL=250
export HYDRA_MID_VAL_BATCH=1
export HYDRA_MID_VAL_TOKENS=51200
export HYDRA_EVAL_BATCH=1
export HYDRA_CKPT_ROTATIONS=3
export HYDRA_SKIP_FACTUAL_EVAL=1
export HYDRA_FORCE_OS_EXIT=1
export HYDRA_N_LAYER=6
export HYDRA_D_MODEL=192
export HYDRA_EXPAND=3
export HYDRA_BATCH_SIZE=16
export HYDRA_TOTAL_BATCH=32768
export HYDRA_HTM_SUBSAMPLE=16
# NOTE(review): uv is not invoked by this script; presumably kept for tooling
# that inherits this environment.
export UV_PYTHON=/usr/bin/python3

# Launch the trainer in a new session, pinned to CPUs 0-15, with stdin closed
# and stdout/stderr appended to the run log. `setsid -f` forks and returns
# immediately, so no trailing `&` is needed. `$!` would only name the
# short-lived setsid parent, never the trainer, so it is not used.
setsid -f taskset -c 0-15 ./.venv/bin/python -u train.py \
  </dev/null >>run_3060_prod9.log 2>&1

# Give the interpreter a moment to start, then report the newest matching
# process, i.e. the PID that is really running.
sleep 2
if TPID=$(pgrep -n -f 'python.*train\.py'); then
  echo "Launched PID=$TPID"
  echo "Training running"
else
  echo "WARNING: no process"
fi