#!/bin/bash
# Large-N Evaluator - runs evaluation on a 520-prompt dataset to prove statistical significance
#
# For every run name listed below (plus a derived "<base>-heretic" counterpart)
# this job locates a .jsonl checkpoint under $PERSIST_ROOT/checkpoints/<run>,
# stages a private copy of the project, runs scripts/evaluate_large_dataset.py
# against it, and writes the result to $PERSIST_ROOT/large_evals/.
#
# NOTE(review): the --output/--error paths below are relative to the submit
# directory; the `mkdir -p logs` further down runs only after the job has
# started, so logs/ presumably has to exist before `sbatch` - confirm.
#SBATCH --job-name=eval-large
#SBATCH --output=logs/eval-large-%j.out
#SBATCH --error=logs/eval-large-%j.err
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=4

set -euo pipefail

PERSIST_ROOT="/common/users/$USER/iconoclast_ilabs"
SITE_PACKAGES="$PERSIST_ROOT/python312-site"
SYS_PY="/common/system/venv/python312/bin/python"
PROJECT_ROOT="$HOME/iconoclast"

# Under `set -u` a bare $SLURM_JOB_ID aborts the script when it is run outside
# of SLURM; fall back to a PID-based tag so manual runs still work.
JOB_TAG="${SLURM_JOB_ID:-local-$$}"

# List of all completed studies we want to do a large-N evaluation on.
# These must correspond to actual .jsonl files in checkpoints/
# Format: model_key (the part before .jsonl in checkpoints)
CHECKPOINTS=(
  "qwen3-1p7b-rutgers-paper-directness"
  "qwen2-5-3b-rutgers-benchmark"
  "qwen3-4b-rutgers-benchmark-v2"
  "phi35-mini-rutgers-nullspace-benchmark-v3"
  "llama3-1-8b-rutgers-benchmark"
  "smollm2-1p7b-rutgers-benchmark"
  "gemma2-2b-seq"
  "mistral-7b-seq"
  "phi4-mini-seq"
  "stablelm2-1p6b-seq"
  "yi-1p5-9b-seq"
  "falcon3-7b-seq"
  "olmo2-1b-seq"
)

# And their HERETIC counterparts
HERETIC_CHECKPOINTS=()
for ckpt in "${CHECKPOINTS[@]}"; do
  # Extract the base model name part from the run name: everything from the
  # first "-rutgers", "-seq", "-benchmark" or "-paper" onwards is dropped.
  base_name=$(sed -E 's/-(rutgers|seq|benchmark|paper).*//' <<<"$ckpt")
  HERETIC_CHECKPOINTS+=("${base_name}-heretic")
done

ALL_CHECKPOINTS=("${CHECKPOINTS[@]}" "${HERETIC_CHECKPOINTS[@]}")

# Per-run staging/cache directories; empty means "nothing to clean up".
JOB_ROOT=""
CACHE_ROOT=""

# Remove the current run's staging and cache directories, if any.
# Also installed as the EXIT trap so an abort under `set -e` (e.g. a failed
# rsync) does not leave the directories behind.
cleanup_stage() {
  # Step out of the staging tree before deleting it.
  cd "$PROJECT_ROOT"
  [[ -z "$JOB_ROOT" ]] || rm -rf -- "$JOB_ROOT"
  [[ -z "$CACHE_ROOT" ]] || rm -rf -- "$CACHE_ROOT"
  JOB_ROOT=""
  CACHE_ROOT=""
}
trap cleanup_stage EXIT

cd "$PROJECT_ROOT"
mkdir -p logs
mkdir -p "$PERSIST_ROOT/large_evals"

# Runs whose evaluator exited non-zero; reported after the loop.
fail_count=0
failed_runs=()

for run_name in "${ALL_CHECKPOINTS[@]}"; do
  checkpoint_dir="$PERSIST_ROOT/checkpoints/$run_name"

  # The .jsonl file name is generated by replacing non-alnum with '--'.
  # We can just find it in the dir.
  if [[ ! -d "$checkpoint_dir" ]]; then
    echo "Skipping $run_name (not finished yet)"
    continue
  fi

  # -quit stops at the first match. The previous `find | head -n 1` could make
  # find die with SIGPIPE, which `pipefail` + `set -e` turns into a job abort.
  jsonl_file=$(find "$checkpoint_dir" -name "*.jsonl" -print -quit)
  if [[ -z "$jsonl_file" ]]; then
    echo "Skipping $run_name (no .jsonl found)"
    continue
  fi

  echo "============================================================"
  echo " STARTING LARGE EVAL: $run_name"
  echo " CHECKPOINT: $jsonl_file"
  echo "============================================================"

  # Set up per-run staging and cache
  JOB_ROOT="$PERSIST_ROOT/job-stage/eval-$run_name-$JOB_TAG"
  CACHE_ROOT="$PERSIST_ROOT/job-cache/eval-$run_name-$JOB_TAG"
  rm -rf -- "$JOB_ROOT" "$CACHE_ROOT"
  mkdir -p "$JOB_ROOT"
  mkdir -p "$CACHE_ROOT"/{hf,hub,transformers,datasets,xdg-cache,xdg-state}

  # Stage the project
  rsync -a \
    --exclude '.venv' \
    --exclude '__pycache__' \
    --exclude 'logs' \
    --exclude '.pytest_cache' \
    "$PROJECT_ROOT"/ "$JOB_ROOT"/

  cd "$JOB_ROOT"

  export PYTHONPATH="$JOB_ROOT/src:$SITE_PACKAGES"
  export HF_HUB_ENABLE_HF_TRANSFER=1
  export PYTHONUNBUFFERED=1
  export TOKENIZERS_PARALLELISM=false
  export USE_TF=0
  export USE_FLAX=0
  # Honour a token supplied by the submitting environment; the placeholder is
  # only the fallback so a real token is never overwritten.
  export HF_TOKEN="${HF_TOKEN:-YOUR_HF_TOKEN_HERE}"
  export XDG_CACHE_HOME="$CACHE_ROOT/xdg-cache"
  export XDG_STATE_HOME="$CACHE_ROOT/xdg-state"
  export HF_HOME="$CACHE_ROOT/hf"
  export HF_DATASETS_CACHE="$CACHE_ROOT/datasets"
  export TRANSFORMERS_CACHE="$CACHE_ROOT/transformers"
  export HUGGINGFACE_HUB_CACHE="$CACHE_ROOT/hub"

  output_file="$PERSIST_ROOT/large_evals/${run_name}_large_eval.json"

  # Run the large evaluator. A failing run must not stop the remaining ones,
  # so the failure is recorded here and reported once the loop is done.
  if ! "$SYS_PY" scripts/evaluate_large_dataset.py \
    --checkpoint "$jsonl_file" \
    --dataset "mlabonne/harmful_behaviors" \
    --split "train+test" \
    --column "text" \
    --output "$output_file"; then
    echo " FAILED EVAL for $run_name"
    fail_count=$((fail_count + 1))
    failed_runs+=("$run_name")
  fi

  # Clean up
  cleanup_stage

  echo " Done with $run_name"
done

echo "ALL LARGE EVALUATIONS COMPLETE."

# Surface evaluator failures in the job's exit status instead of always
# reporting success.
if ((fail_count > 0)); then
  printf 'Failed runs (%d): %s\n' "$fail_count" "${failed_runs[*]}" >&2
  exit 1
fi