# Spaces:
# Running
# Running
# Run after training completes to evaluate the trained LoRA against the base
# model and print the command for the follow-up training run.
# Usage: bash scripts/eval_after_training.sh [lora_dir]
#
# Steps:
#   1. Start vLLM with base model (qwen35) + trained LoRA (qwen35-trained)
#   2. Run inference.py with base model -> measure baseline
#   3. Run inference.py with trained model -> measure improvement
#   4. Print comparison and write to checkpoints/eval_results.txt
#   5. Print the command that resumes training from the trained LoRA
set -euo pipefail

LORA_DIR="${1:-checkpoints/run2/developer_final}"
MODEL_PATH="$HOME/models/Qwen3.5-2B"
VLLM_LOG="$HOME/vllm_eval.log"
RESULTS_FILE="checkpoints/eval_results.txt"
PYTHON="/dev/shm/qwen35/bin/python"
BASE_LOG="/tmp/eval_base.log"
TRAINED_LOG="/tmp/eval_trained.log"
VLLM_PID=""

#######################################
# Stop the vLLM server started by this script (no-op if none is running).
# Installed as an EXIT trap so the server is not left holding the GPUs when
# a later step fails under `set -e`.
# Globals: VLLM_PID (read, cleared)
#######################################
stop_vllm() {
  [[ -n "$VLLM_PID" ]] || return 0
  kill "$VLLM_PID" 2>/dev/null || pkill -f "vllm.entrypoints" || true
  wait "$VLLM_PID" 2>/dev/null || true
  VLLM_PID=""
}
trap stop_vllm EXIT

#######################################
# Poll the vLLM health endpoint until it answers successfully.
# Globals: VLLM_PID, VLLM_LOG (read)
# Returns: 0 once healthy; 1 if the server process exits or 120 polls
#          (5 s apart) pass without a healthy response.
#######################################
wait_for_vllm() {
  local attempt started=$SECONDS
  for ((attempt = 1; attempt <= 120; attempt++)); do
    # Bail out early instead of polling a server that has already died.
    if ! kill -0 "$VLLM_PID" 2>/dev/null; then
      echo "ERROR: vLLM exited during startup; see $VLLM_LOG" >&2
      return 1
    fi
    if curl -sf http://localhost:8001/health > /dev/null 2>&1; then
      echo "vLLM ready ($((SECONDS - started))s)"
      return 0
    fi
    sleep 5
  done
  echo "ERROR: vLLM not ready after $((SECONDS - started))s; see $VLLM_LOG" >&2
  return 1
}

#######################################
# Run inference.py against one served model, mirroring its output to a log.
# Arguments: $1 - served model name, $2 - log file path
# Globals:   PYTHON (read), MODEL_NAME (exported)
#######################################
run_eval() {
  export MODEL_NAME="$1"
  "$PYTHON" inference.py 2>&1 | tee "$2"
}

#######################################
# Print the mean of the score=<number> values found on [END] lines.
# Arguments: $1 - inference log file
# Outputs:   the mean with 4 decimals, or 0 when no score was found
#######################################
mean_score() {
  awk '
    /\[END\]/ && match($0, /score=[0-9.]+/) {
      # Skip the 6-character "score=" prefix to get the number.
      total += substr($0, RSTART + 6, RLENGTH - 6)
      count++
    }
    END {
      if (count) printf "%.4f\n", total / count
      else print "0"
    }
  ' "$1"
}

echo "=== Post-Training Evaluation ==="
echo "LoRA: $LORA_DIR"
echo "Model: $MODEL_PATH"
echo ""

# Verify LoRA exists
if [[ ! -d "$LORA_DIR" ]]; then
  echo "ERROR: LoRA dir not found: $LORA_DIR" >&2
  exit 1
fi

# Kill any lingering vLLM processes
if pgrep -f "vllm.entrypoints" > /dev/null 2>&1; then
  echo "Killing existing vLLM process..."
  pkill -f "vllm.entrypoints" || true
  sleep 5
fi

echo "Starting vLLM with base + trained LoRA (tensor-parallel=2, port 8001)..."
# Paths are handed to the inner shell as positional arguments so that spaces
# or shell metacharacters in them cannot alter the command. Logging goes
# through process substitution rather than a pipe so that $! is the server
# process itself and not `tee`.
apptainer exec --nv ~/apptainer-images/cuda-custom-amal_latest.sif bash -c '
  export LD_PRELOAD=/dev/shm/qwen35/lib/libstdc++.so.6
  exec "$1" -m vllm.entrypoints.openai.api_server \
    --model "$2" \
    --served-model-name qwen35 \
    --tensor-parallel-size 2 \
    --port 8001 \
    --host 0.0.0.0 \
    --max-model-len 65536 \
    --enable-auto-tool-choice \
    --tool-call-parser hermes \
    --enable-lora \
    --lora-modules "qwen35-trained=$3"
' vllm-launch "$PYTHON" "$MODEL_PATH" "$LORA_DIR" > >(tee "$VLLM_LOG") 2>&1 &
VLLM_PID=$!

echo "Waiting for vLLM to be ready..."
wait_for_vllm

echo ""
echo "--- BASELINE (base model, no LoRA) ---"
export API_BASE_URL="http://localhost:8001/v1"
export HF_TOKEN="sk-local"
export MAX_STEPS="3"
export INFERENCE_SERVER_PORT="18082"
run_eval "qwen35" "$BASE_LOG"
BASE_SCORE=$(mean_score "$BASE_LOG")

echo ""
echo "--- TRAINED (with LoRA) ---"
run_eval "qwen35-trained" "$TRAINED_LOG"
TRAINED_SCORE=$(mean_score "$TRAINED_LOG")

echo ""
echo "=== COMPARISON ==="
echo "Base mean: $BASE_SCORE"
echo "Trained mean: $TRAINED_SCORE"

mkdir -p -- "$(dirname -- "$RESULTS_FILE")"
{
  echo "=== Evaluation Results $(date) ==="
  echo "LoRA: $LORA_DIR"
  echo "Base mean: $BASE_SCORE"
  echo "Trained mean: $TRAINED_SCORE"
  echo ""
  echo "Base inference log:"
  # grep exits 1 when nothing matches; that must not abort the report.
  grep "\[END\]\|\[START\]" "$BASE_LOG" || true
  echo ""
  echo "Trained inference log:"
  grep "\[END\]\|\[START\]" "$TRAINED_LOG" || true
} > "$RESULTS_FILE"
echo "Results saved to $RESULTS_FILE"

# Kill vLLM
stop_vllm

# Inside an unquoted here-document "\\" prints a single backslash, so the
# printed command keeps its line continuations.
cat <<EOF

To start run 3 (resume from trained LoRA with fixed CRITIC + MAX_NEW_TOKENS=2048):
 apptainer exec --nv ~/apptainer-images/cuda-custom-amal_latest.sif bash -c \\
 'export LD_PRELOAD=/dev/shm/qwen35/lib/libstdc++.so.6; \\
 /dev/shm/qwen35/bin/python train.py \\
 --phase developer \\
 --episodes 10 \\
 --k-rollouts 4 \\
 --model $MODEL_PATH \\
 --checkpoint-dir checkpoints/run3 \\
 --resume-from $LORA_DIR' \\
 2>&1 | tee checkpoints/train_run3.log
EOF