vision-coder-openenv / scripts /eval_after_training.sh
amaljoe88's picture
deploy: sync 712e5bc -> HF
cf6c0e0
#!/usr/bin/env bash
# Evaluate a trained LoRA adapter against the base model after training.
#
# Usage: bash scripts/eval_after_training.sh [lora_dir]
#   lora_dir  Directory of the trained LoRA adapter
#             (default: checkpoints/run2/developer_final).
#
# Steps:
#   1. Start vLLM serving the base model (qwen35) plus the LoRA (qwen35-trained).
#   2. Run inference.py against the base model      -> baseline scores.
#   3. Run inference.py against the LoRA-backed model -> trained scores.
#   4. Print the mean-score comparison and write it, together with the
#      [START]/[END] log lines, to checkpoints/eval_results.txt.
#   5. Stop vLLM and print the command that resumes training from the LoRA.
#
# The vLLM server is stopped on every exit path (success, error, Ctrl-C).
set -euo pipefail

readonly LORA_DIR="${1:-checkpoints/run2/developer_final}"
readonly MODEL_PATH="$HOME/models/Qwen3.5-2B"
readonly VLLM_LOG="$HOME/vllm_eval.log"
readonly RESULTS_FILE="checkpoints/eval_results.txt"
readonly PYTHON="/dev/shm/qwen35/bin/python"
readonly SIF_IMAGE="$HOME/apptainer-images/cuda-custom-amal_latest.sif"
readonly VLLM_PORT=8001
readonly BASE_LOG="/tmp/eval_base.log"
readonly TRAINED_LOG="/tmp/eval_trained.log"
readonly READY_ATTEMPTS=120   # number of health polls before giving up
readonly READY_INTERVAL=5     # seconds between health polls

# PID of the background apptainer/vLLM launcher; empty while none is running.
VLLM_PID=""

#######################################
# Stop the vLLM server started by this script. Safe to call more than once.
# Globals: VLLM_PID (read, cleared)
#######################################
stop_vllm() {
  if [[ -n "$VLLM_PID" ]]; then
    kill "$VLLM_PID" 2>/dev/null || true
    wait "$VLLM_PID" 2>/dev/null || true
    VLLM_PID=""
    # Best-effort sweep in case server processes outlive the launcher.
    pkill -f "vllm.entrypoints" 2>/dev/null || true
  fi
}

#######################################
# Print the mean of all score=<number> values found on [END] lines.
# Arguments: $1 - inference log file
# Outputs:   mean with 4 decimals, or "n/a" when no score was found
#######################################
mean_score() {
  awk '
    /\[END\]/ && match($0, /score=[0-9.]+/) {
      # Skip the 6-character "score=" prefix to get the numeric part.
      sum += substr($0, RSTART + 6, RLENGTH - 6)
      n++
    }
    END {
      if (n > 0) printf "%.4f\n", sum / n
      else print "n/a"
    }
  ' "$1"
}

#######################################
# Run inference.py against one served model name, mirroring output to a log.
# Arguments: $1 - model name as served by vLLM, $2 - log file to write
#######################################
run_inference() {
  MODEL_NAME="$1" "$PYTHON" inference.py 2>&1 | tee "$2"
}

echo "=== Post-Training Evaluation ==="
echo "LoRA: $LORA_DIR"
echo "Model: $MODEL_PATH"
echo ""

# Verify LoRA exists
if [[ ! -d "$LORA_DIR" ]]; then
  echo "ERROR: LoRA dir not found: $LORA_DIR" >&2
  exit 1
fi

# Kill any lingering vLLM processes
if pgrep -f "vllm.entrypoints" > /dev/null 2>&1; then
  echo "Killing existing vLLM process..."
  pkill -f "vllm.entrypoints" || true
  sleep 5
fi

# From here on a server may be running: make sure it never outlives the script.
trap stop_vllm EXIT
trap 'exit 130' INT TERM

echo "Starting vLLM with base + trained LoRA (tensor-parallel=2, port ${VLLM_PORT})..."
# Paths are passed as positional parameters instead of being spliced into the
# command string, so spaces or shell metacharacters in them cannot break (or
# inject into) the inner command. Output goes through a process substitution
# rather than a pipe so that $! is the launcher itself, not tee.
apptainer exec --nv "$SIF_IMAGE" bash -c '
  export LD_PRELOAD=/dev/shm/qwen35/lib/libstdc++.so.6
  exec "$1" -m vllm.entrypoints.openai.api_server \
    --model "$2" \
    --served-model-name qwen35 \
    --tensor-parallel-size 2 \
    --port "$3" \
    --host 0.0.0.0 \
    --max-model-len 65536 \
    --enable-auto-tool-choice \
    --tool-call-parser hermes \
    --enable-lora \
    --lora-modules "qwen35-trained=$4"
' vllm-launch "$PYTHON" "$MODEL_PATH" "$VLLM_PORT" "$LORA_DIR" \
  > >(tee "$VLLM_LOG") 2>&1 &
VLLM_PID=$!

echo "Waiting for vLLM to be ready..."
ready=0
for ((attempt = 1; attempt <= READY_ATTEMPTS; attempt++)); do
  # -f: treat HTTP error statuses as "not ready yet".
  if curl -sf --max-time 5 "http://localhost:${VLLM_PORT}/health" > /dev/null 2>&1; then
    echo "vLLM ready (~$(( (attempt - 1) * READY_INTERVAL ))s)"
    ready=1
    break
  fi
  # Fail fast if the server process died instead of polling to the timeout.
  if ! kill -0 "$VLLM_PID" 2>/dev/null; then
    echo "ERROR: vLLM exited during startup; see $VLLM_LOG" >&2
    exit 1
  fi
  sleep "$READY_INTERVAL"
done
if [[ "$ready" -ne 1 ]]; then
  echo "ERROR: vLLM not ready after $(( READY_ATTEMPTS * READY_INTERVAL ))s; see $VLLM_LOG" >&2
  exit 1
fi

# Settings read by inference.py from the environment.
export API_BASE_URL="http://localhost:${VLLM_PORT}/v1"
export HF_TOKEN="sk-local"
export MAX_STEPS="3"
export INFERENCE_SERVER_PORT="18082"

echo ""
echo "--- BASELINE (base model, no LoRA) ---"
run_inference "qwen35" "$BASE_LOG"
BASE_SCORE=$(mean_score "$BASE_LOG")

echo ""
echo "--- TRAINED (with LoRA) ---"
run_inference "qwen35-trained" "$TRAINED_LOG"
TRAINED_SCORE=$(mean_score "$TRAINED_LOG")

echo ""
echo "=== COMPARISON ==="
echo "Base mean: $BASE_SCORE"
echo "Trained mean: $TRAINED_SCORE"

mkdir -p -- "$(dirname -- "$RESULTS_FILE")"
{
  echo "=== Evaluation Results $(date) ==="
  echo "LoRA: $LORA_DIR"
  echo "Base mean: $BASE_SCORE"
  echo "Trained mean: $TRAINED_SCORE"
  echo ""
  echo "Base inference log:"
  # grep exits 1 when nothing matches; that must not abort the report.
  grep -E '\[(END|START)\]' "$BASE_LOG" || true
  echo ""
  echo "Trained inference log:"
  grep -E '\[(END|START)\]' "$TRAINED_LOG" || true
} > "$RESULTS_FILE"
echo "Results saved to $RESULTS_FILE"

# Kill vLLM
stop_vllm

# Unquoted here-doc: "\\" prints a single backslash, $VARS are expanded.
cat <<EOF

To start the next run (resume from trained LoRA with fixed CRITIC + MAX_NEW_TOKENS=2048):
 apptainer exec --nv ~/apptainer-images/cuda-custom-amal_latest.sif bash -c \\
 'export LD_PRELOAD=/dev/shm/qwen35/lib/libstdc++.so.6; \\
 /dev/shm/qwen35/bin/python train.py \\
 --phase developer \\
 --episodes 10 \\
 --k-rollouts 4 \\
 --model $MODEL_PATH \\
 --checkpoint-dir checkpoints/run3 \\
 --resume-from $LORA_DIR' \\
 2>&1 | tee checkpoints/train_run3.log
EOF