#!/usr/bin/env bash
# Re-runs the two configs that OOM'd in the original sweep, now with the
# chunked-KL fix and PYTORCH_CUDA_ALLOC_CONF=expandable_segments in distill.py.
# Reads HF_TOKEN, HUGGING_FACE_HUB_TOKEN, WANDB_API_KEY from the calling env
# (none of them is referenced in this script itself; presumably they are
# consumed by the launched process -- confirm against distill.py).
#
# Launch with:
#   nohup ./scripts/run_sweep_rerun.sh > logs/sweep_rerun_master.log 2>&1 &

# `-e` is intentionally not set: each run's exit status is captured in `rc`
# below and the loop moves on to the next config even when a run fails.
set -uo pipefail

# All paths below (configs/, logs/, .venv/, distill.py) are relative, so run
# from the parent of the directory containing this script. Abort if that
# fails rather than launching from whatever directory the caller was in.
cd "$(dirname "$0")/.." || {
  echo "error: cannot cd to $(dirname "$0")/.." >&2
  exit 1
}

# Configs to run, in order.
CONFIGS=(
  "configs/replicate_zero4.toml"
  "configs/grow40_winning.toml"
)

LOG_DIR="logs"
mkdir -p "$LOG_DIR"

for cfg in "${CONFIGS[@]}"; do
  # Per-config log file is named after the config file minus its .toml suffix.
  name="$(basename "$cfg" .toml)"
  log="$LOG_DIR/$name.log"

  echo ">>> [$(date '+%F %T')] starting $name -> $log"

  # stdout and stderr of the run both go to the per-config log; only the
  # start/finish markers (and a failure excerpt) reach this script's stdout.
  .venv/bin/accelerate launch \
    --config_file configs/accelerate.yaml \
    distill.py \
    --config "$cfg" \
    > "$log" 2>&1
  rc=$?

  echo "<<< [$(date '+%F %T')] finished $name (exit=$rc)"

  # On failure, echo the tail of the run's log (indented) so the reason is
  # visible without opening the per-config log.
  if [[ $rc -ne 0 ]]; then
    echo " last 20 lines of $log:"
    tail -n 20 "$log" | sed 's/^/ /'
  fi
done

echo ">>> [$(date '+%F %T')] rerun complete"