#!/usr/bin/env bash
# Evaluate a trained LoRA adapter against its base model once training is done.
# Usage: bash scripts/eval_after_training.sh [lora_dir]
#
# Steps:
#   1. Start vLLM serving the base model (qwen35) plus the trained LoRA
#      (qwen35-trained).
#   2. Run inference.py against the base model    -> baseline score.
#   3. Run inference.py against the trained model -> trained score.
#   4. Print the comparison and write it to checkpoints/eval_results.txt.
#   5. Stop vLLM and print the command that resumes training from the LoRA.
#
# inference.py, the default LoRA directory and the results file are relative
# paths, so they are resolved against the directory the script is started from.

set -euo pipefail

LORA_DIR="${1:-checkpoints/run2/developer_final}"
readonly LORA_DIR
readonly MODEL_PATH="$HOME/models/Qwen3.5-2B"
readonly VLLM_LOG="$HOME/vllm_eval.log"
readonly RESULTS_FILE="checkpoints/eval_results.txt"
readonly PYTHON="/dev/shm/qwen35/bin/python"
readonly APPTAINER_IMAGE="$HOME/apptainer-images/cuda-custom-amal_latest.sif"
readonly VLLM_PORT=8001
readonly BASE_LOG="/tmp/eval_base.log"
readonly TRAINED_LOG="/tmp/eval_trained.log"
readonly READY_ATTEMPTS=120   # number of health polls before giving up
readonly READY_INTERVAL=5     # seconds slept between health polls

# PID of the vLLM launcher started by this script; empty while none is running.
VLLM_PID=""

# Print a message on stderr and abort; the EXIT trap (if installed) cleans up.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Stop the vLLM server started by start_vllm. Safe to call more than once:
# it is a no-op when this script has no server running.
stop_vllm() {
  if [[ -z "$VLLM_PID" ]]; then
    return 0
  fi
  kill "$VLLM_PID" 2>/dev/null || true
  wait "$VLLM_PID" 2>/dev/null || true
  VLLM_PID=""
  # Best-effort sweep for server processes that outlived the launcher.
  pkill -f "vllm.entrypoints" 2>/dev/null || true
}

# Kill vLLM servers left over from earlier runs so the port is free.
kill_stale_vllm() {
  if pgrep -f "vllm.entrypoints" > /dev/null 2>&1; then
    echo "Killing existing vLLM process..."
    pkill -f "vllm.entrypoints" || true
    sleep 5
  fi
}

# Launch vLLM in the background inside the Apptainer image, mirroring its
# output to $VLLM_LOG, and record the launcher's PID in VLLM_PID.
start_vllm() {
  echo "Starting vLLM with base + trained LoRA (tensor-parallel=2, port ${VLLM_PORT})..."
  # Paths are handed to the inner shell as positional parameters rather than
  # spliced into the command string, so spaces or metacharacters survive.
  # Output goes through process substitution instead of `| tee` so that $!
  # is the launcher's PID, not tee's.
  # shellcheck disable=SC2016
  apptainer exec --nv "$APPTAINER_IMAGE" bash -c '
    export LD_PRELOAD=/dev/shm/qwen35/lib/libstdc++.so.6
    exec "$1" -m vllm.entrypoints.openai.api_server \
      --model "$2" \
      --served-model-name qwen35 \
      --tensor-parallel-size 2 \
      --port "$3" \
      --host 0.0.0.0 \
      --max-model-len 65536 \
      --enable-auto-tool-choice \
      --tool-call-parser hermes \
      --enable-lora \
      --lora-modules "qwen35-trained=$4"
  ' vllm-launch "$PYTHON" "$MODEL_PATH" "$VLLM_PORT" "$LORA_DIR" \
    > >(tee "$VLLM_LOG") 2>&1 &
  VLLM_PID=$!
}

# Poll the health endpoint until it answers. Aborts if the launcher exits
# early or the endpoint is still down after READY_ATTEMPTS polls.
wait_for_vllm() {
  local attempt
  local started=$SECONDS
  echo "Waiting for vLLM to be ready..."
  for ((attempt = 0; attempt < READY_ATTEMPTS; attempt++)); do
    if curl -sf "http://localhost:${VLLM_PORT}/health" > /dev/null 2>&1; then
      echo "vLLM ready ($((SECONDS - started))s)"
      return 0
    fi
    if ! kill -0 "$VLLM_PID" 2>/dev/null; then
      die "ERROR: vLLM exited during startup; see $VLLM_LOG"
    fi
    sleep "$READY_INTERVAL"
  done
  die "ERROR: vLLM not ready after $((SECONDS - started))s; see $VLLM_LOG"
}

# Run inference.py against one served model name, teeing its output to a log.
# Arguments: $1 - served model name, $2 - log file path
run_inference() {
  local served_name=$1
  local log_file=$2
  export MODEL_NAME="$served_name"
  "$PYTHON" inference.py 2>&1 | tee "$log_file"
}

# Print the arithmetic mean of every score=<number> found on [END] lines of a
# log, or 0 when there are none (or the log cannot be read).
# Arguments: $1 - log file path
mean_score() {
  local log_file=$1
  # grep exits non-zero when nothing matches; that is not an error here.
  { grep -F '[END]' -- "$log_file" 2>/dev/null || true; } \
    | { grep -oP 'score=\K[\d.]+' || true; } \
    | awk '{ total += $1; count++ }
           END { if (count) printf "%.4f\n", total / count; else print "0" }'
}

# Print the [START]/[END] marker lines of a log; prints nothing if none exist.
# Arguments: $1 - log file path
marker_lines() {
  grep '\[END\]\|\[START\]' -- "$1" || true
}

# Write the comparison and the marker lines of both runs to $RESULTS_FILE.
# Arguments: $1 - base mean score, $2 - trained mean score
write_results() {
  local base_score=$1
  local trained_score=$2
  mkdir -p -- "$(dirname -- "$RESULTS_FILE")"
  {
    echo "=== Evaluation Results $(date) ==="
    echo "LoRA: $LORA_DIR"
    echo "Base mean: $base_score"
    echo "Trained mean: $trained_score"
    echo ""
    echo "Base inference log:"
    marker_lines "$BASE_LOG"
    echo ""
    echo "Trained inference log:"
    marker_lines "$TRAINED_LOG"
  } > "$RESULTS_FILE"
  echo "Results saved to $RESULTS_FILE"
}

# Print (but do not run) the command that resumes training from the LoRA.
# NOTE(review): the text says "run 2" while the command writes to
# checkpoints/run3 — confirm which run number is intended.
print_resume_hint() {
  echo ""
  echo "To start run 2 (resume from trained LoRA with fixed CRITIC + MAX_NEW_TOKENS=2048):"
  echo " apptainer exec --nv ~/apptainer-images/cuda-custom-amal_latest.sif bash -c \\"
  echo " 'export LD_PRELOAD=/dev/shm/qwen35/lib/libstdc++.so.6; \\"
  echo " /dev/shm/qwen35/bin/python train.py \\"
  echo " --phase developer \\"
  echo " --episodes 10 \\"
  echo " --k-rollouts 4 \\"
  echo " --model $MODEL_PATH \\"
  echo " --checkpoint-dir checkpoints/run3 \\"
  echo " --resume-from $LORA_DIR' \\"
  echo " 2>&1 | tee checkpoints/train_run3.log"
}

main() {
  local base_score trained_score

  echo "=== Post-Training Evaluation ==="
  echo "LoRA: $LORA_DIR"
  echo "Model: $MODEL_PATH"
  echo ""

  # Verify the LoRA exists before touching any running server.
  [[ -d "$LORA_DIR" ]] || die "ERROR: LoRA dir not found: $LORA_DIR"

  kill_stale_vllm

  # From here on a server may be running: make sure every exit path stops it,
  # including failures of inference.py under `set -e` and Ctrl-C.
  trap stop_vllm EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  start_vllm
  wait_for_vllm

  # Settings read by inference.py from the environment.
  export API_BASE_URL="http://localhost:${VLLM_PORT}/v1"
  export HF_TOKEN="sk-local"
  export MAX_STEPS="3"
  export INFERENCE_SERVER_PORT="18082"

  echo ""
  echo "--- BASELINE (base model, no LoRA) ---"
  run_inference "qwen35" "$BASE_LOG"
  base_score=$(mean_score "$BASE_LOG")

  echo ""
  echo "--- TRAINED (with LoRA) ---"
  run_inference "qwen35-trained" "$TRAINED_LOG"
  trained_score=$(mean_score "$TRAINED_LOG")

  echo ""
  echo "=== COMPARISON ==="
  echo "Base mean: $base_score"
  echo "Trained mean: $trained_score"

  write_results "$base_score" "$trained_score"

  # Stop the server before printing the hint; the EXIT trap is then a no-op.
  stop_vllm

  print_resume_hint
}

main "$@"