Spaces:

amaljoe88
/

vision-coder-openenv

Running

App Files Files Community

vision-coder-openenv / scripts /eval_after_training.sh

amaljoe88

deploy: sync 712e5bc -> HF

cf6c0e0 24 days ago

raw

history blame contribute delete

3.75 kB

	#!/usr/bin/env bash
	# Run after training completes to evaluate trained LoRA vs base and start run 2.
	# Usage: bash scripts/eval_after_training.sh [lora_dir]
	#
	# Steps:
	# 1. Start vLLM with base model (qwen35) + trained LoRA (qwen35-trained)
	# 2. Run inference.py with base model → measure baseline
	# 3. Run inference.py with trained model → measure improvement
	# 4. Print comparison and write to checkpoints/eval_results.txt
	# 5. Prompt whether to start run 2 (resume from trained LoRA)

	set -euo pipefail

	LORA_DIR="${1:-checkpoints/run2/developer_final}"
	MODEL_PATH="$HOME/models/Qwen3.5-2B"
	VLLM_LOG="$HOME/vllm_eval.log"
	RESULTS_FILE="checkpoints/eval_results.txt"
	PYTHON="/dev/shm/qwen35/bin/python"

	echo "=== Post-Training Evaluation ==="
	echo "LoRA: $LORA_DIR"
	echo "Model: $MODEL_PATH"
	echo ""

	# Verify LoRA exists
	if [ ! -d "$LORA_DIR" ]; then
	echo "ERROR: LoRA dir not found: $LORA_DIR"
	exit 1
	fi

	# Kill any lingering vLLM processes
	if pgrep -f "vllm.entrypoints" > /dev/null 2>&1; then
	echo "Killing existing vLLM process..."
	pkill -f "vllm.entrypoints" \|\| true
	sleep 5
	fi

	echo "Starting vLLM with base + trained LoRA (tensor-parallel=2, port 8001)..."
	apptainer exec --nv ~/apptainer-images/cuda-custom-amal_latest.sif bash -c "
	export LD_PRELOAD=/dev/shm/qwen35/lib/libstdc++.so.6
	/dev/shm/qwen35/bin/python -m vllm.entrypoints.openai.api_server \
	--model $MODEL_PATH \
	--served-model-name qwen35 \
	--tensor-parallel-size 2 \
	--port 8001 \
	--host 0.0.0.0 \
	--max-model-len 65536 \
	--enable-auto-tool-choice \
	--tool-call-parser hermes \
	--enable-lora \
	--lora-modules qwen35-trained=$LORA_DIR
	" 2>&1 \| tee "$VLLM_LOG" &
	VLLM_PID=$!

	echo "Waiting for vLLM to be ready..."
	for i in $(seq 1 120); do
	if curl -s http://localhost:8001/health > /dev/null 2>&1; then
	echo "vLLM ready (${i}s)"
	break
	fi
	sleep 5
	done

	echo ""
	echo "--- BASELINE (base model, no LoRA) ---"
	export API_BASE_URL="http://localhost:8001/v1"
	export MODEL_NAME="qwen35"
	export HF_TOKEN="sk-local"
	export MAX_STEPS="3"
	export INFERENCE_SERVER_PORT="18082"

	$PYTHON inference.py 2>&1 \| tee /tmp/eval_base.log
	BASE_SCORE=$(grep "\[END\]" /tmp/eval_base.log \| grep -oP "score=\K[\d.]+" \| paste -sd+ \| bc -l 2>/dev/null \|\| echo "0")

	echo ""
	echo "--- TRAINED (with LoRA) ---"
	export MODEL_NAME="qwen35-trained"

	$PYTHON inference.py 2>&1 \| tee /tmp/eval_trained.log
	TRAINED_SCORE=$(grep "\[END\]" /tmp/eval_trained.log \| grep -oP "score=\K[\d.]+" \| paste -sd+ \| bc -l 2>/dev/null \|\| echo "0")

	echo ""
	echo "=== COMPARISON ==="
	echo "Base mean: $BASE_SCORE"
	echo "Trained mean: $TRAINED_SCORE"

	{
	echo "=== Evaluation Results $(date) ==="
	echo "LoRA: $LORA_DIR"
	echo ""
	echo "Base inference log:"
	grep "\[END\]\\|\[START\]" /tmp/eval_base.log
	echo ""
	echo "Trained inference log:"
	grep "\[END\]\\|\[START\]" /tmp/eval_trained.log
	} > "$RESULTS_FILE"

	echo "Results saved to $RESULTS_FILE"

	# Kill vLLM
	kill $VLLM_PID 2>/dev/null \|\| pkill -f "vllm.entrypoints" \|\| true
	wait $VLLM_PID 2>/dev/null \|\| true

	echo ""
	echo "To start run 2 (resume from trained LoRA with fixed CRITIC + MAX_NEW_TOKENS=2048):"
	echo " apptainer exec --nv ~/apptainer-images/cuda-custom-amal_latest.sif bash -c \\"
	echo " 'export LD_PRELOAD=/dev/shm/qwen35/lib/libstdc++.so.6; \\"
	echo " /dev/shm/qwen35/bin/python train.py \\"
	echo " --phase developer \\"
	echo " --episodes 10 \\"
	echo " --k-rollouts 4 \\"
	echo " --model $MODEL_PATH \\"
	echo " --checkpoint-dir checkpoints/run3 \\"
	echo " --resume-from $LORA_DIR' \\"
	echo " 2>&1 \| tee checkpoints/train_run3.log"