File size: 3,845 Bytes

3236af9

#!/bin/bash
# Large-N Evaluator - runs evaluation on a 520-prompt dataset to prove statistical significance
#SBATCH --job-name=eval-large
#SBATCH --output=logs/eval-large-%j.out
#SBATCH --error=logs/eval-large-%j.err
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=4

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Per-user persistent storage root. The script keeps checkpoints/, large_evals/,
# job-stage/ and job-cache/ underneath it.
# $USER is not guaranteed to be exported in every environment; because of
# `set -u` a missing value would abort the script with a bare "unbound
# variable" error, so fall back to $LOGNAME and finally to `id -un`.
PERSIST_ROOT="/common/users/${USER:-${LOGNAME:-$(id -un)}}/iconoclast_ilabs"
# Extra Python package directory; appended to PYTHONPATH for each evaluation.
SITE_PACKAGES="$PERSIST_ROOT/python312-site"
# Python interpreter used to launch the evaluation script.
SYS_PY="/common/system/venv/python312/bin/python"
# Project checkout that is copied into a per-run staging directory.
PROJECT_ROOT="$HOME/iconoclast"

# Completed studies to run the large-N evaluation on.
# Each entry is a run name: the evaluation loop looks for a directory of that
# name under $PERSIST_ROOT/checkpoints/ and uses a .jsonl file found inside it.
CHECKPOINTS=(
    "qwen3-1p7b-rutgers-paper-directness"
    "qwen2-5-3b-rutgers-benchmark"
    "qwen3-4b-rutgers-benchmark-v2"
    "phi35-mini-rutgers-nullspace-benchmark-v3"
    "llama3-1-8b-rutgers-benchmark"
    "smollm2-1p7b-rutgers-benchmark"
    "gemma2-2b-seq"
    "mistral-7b-seq"
    "phi4-mini-seq"
    "stablelm2-1p6b-seq"
    "yi-1p5-9b-seq"
    "falcon3-7b-seq"
    "olmo2-1b-seq"
)

# HERETIC counterparts: "<base model>-heretic", where the base model name is
# the run name with everything from the first "-rutgers", "-seq", "-benchmark"
# or "-paper" marker onwards removed.
# Start from an explicitly empty array so a value inherited from the
# environment (or left over from an earlier sourcing) cannot leak into the list.
HERETIC_CHECKPOINTS=()
run_suffix_re='-(rutgers|seq|benchmark|paper)'
for cp in "${CHECKPOINTS[@]}"; do
    if [[ "$cp" =~ $run_suffix_re ]]; then
        # BASH_REMATCH[0] is the leftmost marker; %% strips from its first
        # occurrence to the end of the name.
        suffix_marker="${BASH_REMATCH[0]}"
        base_name="${cp%%"$suffix_marker"*}"
    else
        # No marker present: the run name already is the base model name.
        base_name="$cp"
    fi
    HERETIC_CHECKPOINTS+=("${base_name}-heretic")
done

# Evaluate the original runs first, then their HERETIC counterparts.
ALL_CHECKPOINTS=("${CHECKPOINTS[@]}" "${HERETIC_CHECKPOINTS[@]}")

# Work from the project checkout and make sure the output locations exist:
# logs/ inside the checkout and large_evals/ under the persistent root.
# NOTE(review): the #SBATCH --output/--error paths also point into logs/;
# presumably Slurm opens those files before this line runs, so the directory
# may need to exist at submission time already - confirm against sbatch docs.
cd "$PROJECT_ROOT"
for required_dir in logs "$PERSIST_ROOT/large_evals"; do
    mkdir -p "$required_dir"
done

# ----------------------------------------------------------------------------
# Large-N evaluation loop.
#
# For every run name in ALL_CHECKPOINTS:
#   1. locate a .jsonl checkpoint under $PERSIST_ROOT/checkpoints/<run>
#      (runs without that directory or without a .jsonl file are skipped),
#   2. stage a private copy of the project plus fresh cache directories,
#   3. run scripts/evaluate_large_dataset.py against the checkpoint,
#   4. remove the staging and cache directories again.
# A failed evaluation does not stop the batch; failed runs are collected,
# listed at the end, and the script then exits with status 1.
# ----------------------------------------------------------------------------

# Print the first .jsonl path found under directory $1 (nothing if none).
# find stops on its own via -quit rather than being cut off by `head`, so the
# lookup cannot fail with SIGPIPE under `set -o pipefail`.
find_checkpoint_jsonl() {
    find "$1" -name '*.jsonl' -print -quit
}

# Remove the staging and cache directories of the current run, if any.
# Also installed as the EXIT trap so that an aborted run (e.g. a failing rsync
# under `set -e`) does not leave its directories behind on persistent storage.
JOB_ROOT=""
CACHE_ROOT=""
cleanup_stage() {
    if [[ -n "${JOB_ROOT:-}" ]]; then
        rm -rf -- "$JOB_ROOT"
    fi
    if [[ -n "${CACHE_ROOT:-}" ]]; then
        rm -rf -- "$CACHE_ROOT"
    fi
    JOB_ROOT=""
    CACHE_ROOT=""
}
trap cleanup_stage EXIT

# Run-independent environment for the evaluator.
export HF_HUB_ENABLE_HF_TRANSFER=1
export PYTHONUNBUFFERED=1
export TOKENIZERS_PARALLELISM=false
export USE_TF=0
export USE_FLAX=0
# Keep a token supplied through the environment; the placeholder is only a
# fallback and has to be replaced (or HF_TOKEN exported) for authenticated access.
export HF_TOKEN="${HF_TOKEN:-YOUR_HF_TOKEN_HERE}"

# Names of runs whose evaluation command returned non-zero.
failed_runs=()

for run_name in "${ALL_CHECKPOINTS[@]}"; do
    checkpoint_dir="$PERSIST_ROOT/checkpoints/$run_name"

    if [[ ! -d "$checkpoint_dir" ]]; then
        echo "Skipping $run_name (not finished yet)"
        continue
    fi

    # The exact .jsonl file name is derived from the model name elsewhere, so
    # simply take the file found inside the run directory.
    jsonl_file=$(find_checkpoint_jsonl "$checkpoint_dir")

    if [[ -z "$jsonl_file" ]]; then
        echo "Skipping $run_name (no .jsonl found)"
        continue
    fi

    echo "============================================================"
    echo "  STARTING LARGE EVAL: $run_name"
    echo "  CHECKPOINT: $jsonl_file"
    echo "============================================================"

    # Per-run staging and cache directories. SLURM_JOB_ID is only defined
    # inside a Slurm job; fall back to the shell PID so the script also works
    # when started by hand under `set -u`.
    job_tag="eval-$run_name-${SLURM_JOB_ID:-local-$$}"
    JOB_ROOT="$PERSIST_ROOT/job-stage/$job_tag"
    CACHE_ROOT="$PERSIST_ROOT/job-cache/$job_tag"
    rm -rf -- "$JOB_ROOT" "$CACHE_ROOT"
    mkdir -p "$JOB_ROOT"
    mkdir -p "$CACHE_ROOT"/{hf,hub,transformers,datasets,xdg-cache,xdg-state}

    # Stage the project so the evaluation runs against a private snapshot.
    rsync -a \
        --exclude '.venv' \
        --exclude '__pycache__' \
        --exclude 'logs' \
        --exclude '.pytest_cache' \
        "$PROJECT_ROOT"/ "$JOB_ROOT"/

    cd "$JOB_ROOT"

    # Point Python and every Hugging Face / XDG cache at the per-run directories.
    export PYTHONPATH="$JOB_ROOT/src:$SITE_PACKAGES"
    export XDG_CACHE_HOME="$CACHE_ROOT/xdg-cache"
    export XDG_STATE_HOME="$CACHE_ROOT/xdg-state"
    export HF_HOME="$CACHE_ROOT/hf"
    export HF_DATASETS_CACHE="$CACHE_ROOT/datasets"
    export TRANSFORMERS_CACHE="$CACHE_ROOT/transformers"
    export HUGGINGFACE_HUB_CACHE="$CACHE_ROOT/hub"

    output_file="$PERSIST_ROOT/large_evals/${run_name}_large_eval.json"

    # Run the large evaluator; record a failure but keep going with the batch.
    if ! "$SYS_PY" scripts/evaluate_large_dataset.py \
        --checkpoint "$jsonl_file" \
        --dataset "mlabonne/harmful_behaviors" \
        --split "train+test" \
        --column "text" \
        --output "$output_file"; then
        echo "  FAILED EVAL for $run_name"
        failed_runs+=("$run_name")
    fi

    # Leave the staging directory before deleting it.
    cd "$PROJECT_ROOT"
    cleanup_stage
    echo "  Done with $run_name"
done

if (( ${#failed_runs[@]} > 0 )); then
    echo "LARGE EVALUATIONS FINISHED WITH ${#failed_runs[@]} FAILED RUN(S): ${failed_runs[*]}" >&2
    exit 1
fi

echo "ALL LARGE EVALUATIONS COMPLETE."