#!/bin/bash set -euo pipefail ############################################################################### # Full Pipeline: vLLM Server → Inference → Testing → Summary # # Usage: # bash run_full_pipeline.sh [--gpu GPU_ID] [--port PORT] # # This script: # 1. Starts a vLLM server for the converted RL model # 2. Waits until the server is healthy # 3. Runs batched inference (run_inference_vllm_server.py) # 4. Runs classifier + subclaim threshold evaluation # 5. Prints a final summary of all results ############################################################################### # ─── Defaults (override via env vars or CLI flags) ─────────────────────────── MODEL_PATH="${MODEL_PATH:-/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1}" CONDA_ENV="${CONDA_ENV:-verl}" GPU_ID="${GPU_ID:-1}" PORT="${PORT:-8001}" SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-inference}" DTYPE="${DTYPE:-bfloat16}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-16384}" DATASET_PATH="${DATASET_PATH:-/home/mshahidul/readctrl/code/readctrl_rl_inference/verified_combined_0-80_clean200.json}" INFERENCE_OUTPUT_DIR="${INFERENCE_OUTPUT_DIR:-/home/mshahidul/readctrl/code/RL_model/inference_data}" BATCH_SIZE="${BATCH_SIZE:-64}" MAX_TOKENS="${MAX_TOKENS:-1024}" TEMPERATURE="${TEMPERATURE:-0.7}" TOP_P="${TOP_P:-0.8}" NUM_WORKERS="${NUM_WORKERS:-4}" CLASSIFIER_API_BASE="${CLASSIFIER_API_BASE:-http://172.16.34.19:8040/v1}" # Support API: FastAPI /check_support endpoint — NO /v1 suffix SUPPORT_API_BASE="${SUPPORT_API_BASE:-http://172.16.34.19:8090}" SUPPORT_MODEL="${SUPPORT_MODEL:-sc}" CLASSIFIER_MODEL_PATH="${CLASSIFIER_MODEL_PATH:-/home/mshahidul/readctrl/code/readctrl_rl_inference/model.json}" REFERENCE_SUBCLAIMS="${REFERENCE_SUBCLAIMS:-/home/mshahidul/readctrl/code/text_classifier/data/verified_combined_0-80_clean200_with_subclaims.json}" TEST_OUTPUT_DIR="${TEST_OUTPUT_DIR:-/home/mshahidul/readctrl/code/readctrl_rl_inference/test_result_v4}" PROMPT_LOW="${PROMPT_LOW:-/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt_low}" PROMPT_INTERMEDIATE="${PROMPT_INTERMEDIATE:-/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt_intermediate}" PROMPT_PROFICIENT="${PROMPT_PROFICIENT:-/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt_proficient}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" INFERENCE_SCRIPT="${SCRIPT_DIR}/run_inference_vllm_server.py" TEST_SCRIPT="${SCRIPT_DIR}/test_classifier_with_subclaim_thresholds.py" SERVER_STARTUP_TIMEOUT=300 # seconds to wait for vLLM to become healthy VLLM_PID="" # ─── Parse CLI args ───────────────────────────────────────────────────────── while [[ $# -gt 0 ]]; do case "$1" in --gpu) GPU_ID="$2"; shift 2 ;; --port) PORT="$2"; shift 2 ;; --model) MODEL_PATH="$2"; shift 2 ;; --batch-size) BATCH_SIZE="$2"; shift 2 ;; --max-samples) MAX_SAMPLES="$2"; shift 2 ;; --dtype) DTYPE="$2"; shift 2 ;; --classifier-api) CLASSIFIER_API_BASE="$2"; shift 2 ;; --support-api) SUPPORT_API_BASE="$2"; shift 2 ;; *) echo "[WARN] Unknown arg: $1"; shift ;; esac done MAX_SAMPLES="${MAX_SAMPLES:--1}" BASE_URL="http://127.0.0.1:${PORT}/v1" # ─── Cleanup handler ──────────────────────────────────────────────────────── cleanup() { if [[ -n "${VLLM_PID}" ]] && kill -0 "${VLLM_PID}" 2>/dev/null; then echo "" echo "================================================================" echo " Shutting down vLLM server (PID ${VLLM_PID}) ..." echo "================================================================" kill "${VLLM_PID}" 2>/dev/null || true wait "${VLLM_PID}" 2>/dev/null || true echo "[INFO] vLLM server stopped." fi } trap cleanup EXIT INT TERM # ─── Activate conda ───────────────────────────────────────────────────────── eval "$(conda shell.bash hook)" conda activate "${CONDA_ENV}" RUN_TS="$(date +%Y%m%d_%H%M%S)" echo "╔══════════════════════════════════════════════════════════════════╗" echo "║ ReadCtrl Full Pipeline — ${RUN_TS} ║" echo "╠══════════════════════════════════════════════════════════════════╣" echo "║ Model: ${MODEL_PATH}" echo "║ GPU: ${GPU_ID}" echo "║ Port: ${PORT}" echo "║ Dtype: ${DTYPE}" echo "║ Batch: ${BATCH_SIZE} (${NUM_WORKERS} concurrent workers)" echo "║ Conda env: ${CONDA_ENV}" echo "╚══════════════════════════════════════════════════════════════════╝" echo "" ############################################################################### # STEP 1 — Start vLLM server ############################################################################### echo "================================================================" echo " STEP 1/4: Starting vLLM server on GPU ${GPU_ID}, port ${PORT}" echo "================================================================" VLLM_LOG="${INFERENCE_OUTPUT_DIR}/vllm_server_${RUN_TS}.log" mkdir -p "${INFERENCE_OUTPUT_DIR}" CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="${GPU_ID}" \ python -m vllm.entrypoints.openai.api_server \ --model "${MODEL_PATH}" \ --served-model-name "${SERVED_MODEL_NAME}" \ --dtype "${DTYPE}" \ --port "${PORT}" \ --max-model-len "${MAX_MODEL_LEN}" \ --gpu-memory-utilization 0.95 \ --max-num-seqs 256 \ --enable-prefix-caching \ --disable-log-requests \ > "${VLLM_LOG}" 2>&1 & VLLM_PID=$! echo "[INFO] vLLM server PID: ${VLLM_PID}" echo "[INFO] Server log: ${VLLM_LOG}" ############################################################################### # STEP 2 — Wait for vLLM to become healthy ############################################################################### echo "" echo "================================================================" echo " STEP 2/4: Waiting for vLLM server to be ready ..." echo "================================================================" ELAPSED=0 INTERVAL=5 while [[ ${ELAPSED} -lt ${SERVER_STARTUP_TIMEOUT} ]]; do if ! kill -0 "${VLLM_PID}" 2>/dev/null; then echo "[ERROR] vLLM server process died. Check log: ${VLLM_LOG}" tail -30 "${VLLM_LOG}" exit 1 fi HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL}/models" 2>/dev/null || echo "000") if [[ "${HTTP_CODE}" == "200" ]]; then echo "[INFO] vLLM server is healthy (${ELAPSED}s elapsed)." break fi echo " ... waiting (${ELAPSED}s / ${SERVER_STARTUP_TIMEOUT}s, last HTTP=${HTTP_CODE})" sleep ${INTERVAL} ELAPSED=$((ELAPSED + INTERVAL)) done if [[ ${ELAPSED} -ge ${SERVER_STARTUP_TIMEOUT} ]]; then echo "[ERROR] Server did not become healthy within ${SERVER_STARTUP_TIMEOUT}s." tail -30 "${VLLM_LOG}" exit 1 fi echo "" echo "[INFO] Available models on server:" curl -s "${BASE_URL}/models" | python -m json.tool 2>/dev/null || true echo "" ############################################################################### # STEP 3 — Run inference ############################################################################### echo "================================================================" echo " STEP 3/4: Running batched inference" echo "================================================================" echo "[INFO] Dataset: ${DATASET_PATH}" echo "[INFO] Output dir: ${INFERENCE_OUTPUT_DIR}" echo "" python "${INFERENCE_SCRIPT}" \ --model_path "${MODEL_PATH}" \ --dataset_path "${DATASET_PATH}" \ --prompt-low-path "${PROMPT_LOW}" \ --prompt-intermediate-path "${PROMPT_INTERMEDIATE}" \ --prompt-proficient-path "${PROMPT_PROFICIENT}" \ --output_dir "${INFERENCE_OUTPUT_DIR}" \ --base_url "${BASE_URL}" \ --served_model_name "${SERVED_MODEL_NAME}" \ --batch_size "${BATCH_SIZE}" \ --max_samples "${MAX_SAMPLES}" \ --max_tokens "${MAX_TOKENS}" \ --temperature "${TEMPERATURE}" \ --top_p "${TOP_P}" \ --num_workers "${NUM_WORKERS}" INFERENCE_JSONL="$(ls -t "${INFERENCE_OUTPUT_DIR}"/vllm_inference_*.jsonl 2>/dev/null | head -1)" if [[ -z "${INFERENCE_JSONL}" ]]; then echo "[ERROR] No inference JSONL output found in ${INFERENCE_OUTPUT_DIR}" exit 1 fi echo "" echo "[INFO] Inference output: ${INFERENCE_JSONL}" INFERENCE_LINE_COUNT="$(wc -l < "${INFERENCE_JSONL}")" echo "[INFO] Total inference rows: ${INFERENCE_LINE_COUNT}" ############################################################################### # STEP 4 — Run testing / evaluation ############################################################################### echo "" echo "================================================================" echo " STEP 4/4: Running classifier + subclaim threshold evaluation" echo "================================================================" echo "[INFO] Input JSONL: ${INFERENCE_JSONL}" echo "[INFO] Classifier API: ${CLASSIFIER_API_BASE}" echo "[INFO] Support API: ${SUPPORT_API_BASE} (FastAPI /check_support, no /v1)" echo "[INFO] Reference subclaims: ${REFERENCE_SUBCLAIMS}" echo "" python "${TEST_SCRIPT}" \ --model-path "${CLASSIFIER_MODEL_PATH}" \ --input-file "${INFERENCE_JSONL}" \ --reference-subclaims-file "${REFERENCE_SUBCLAIMS}" \ --classifier-api-base "${CLASSIFIER_API_BASE}" \ --support-api-base "${SUPPORT_API_BASE}" \ --output-dir "${TEST_OUTPUT_DIR}" \ --max-samples "${MAX_SAMPLES}" \ --provide-traceback TEST_SUMMARY_JSON="$(ls -t "${TEST_OUTPUT_DIR}"/classifier_subclaim_threshold_eval_*.json 2>/dev/null | head -1)" TEST_DETAILS_JSONL="$(ls -t "${TEST_OUTPUT_DIR}"/classifier_subclaim_threshold_eval_*.jsonl 2>/dev/null | head -1)" ############################################################################### # FINAL SUMMARY ############################################################################### echo "" echo "" echo "╔══════════════════════════════════════════════════════════════════╗" echo "║ PIPELINE COMPLETE ║" echo "╠══════════════════════════════════════════════════════════════════╣" echo "║ Run timestamp: ${RUN_TS}" echo "║ Model: ${MODEL_PATH}" echo "║ GPU: ${GPU_ID}" echo "║ Samples inferred: ${INFERENCE_LINE_COUNT}" echo "╠══════════════════════════════════════════════════════════════════╣" echo "║ OUTPUT FILES ║" echo "╠══════════════════════════════════════════════════════════════════╣" echo "║ Inference JSONL: ${INFERENCE_JSONL}" echo "║ vLLM server log: ${VLLM_LOG}" if [[ -n "${TEST_SUMMARY_JSON:-}" ]]; then echo "║ Test summary: ${TEST_SUMMARY_JSON}" fi if [[ -n "${TEST_DETAILS_JSONL:-}" ]]; then echo "║ Test details: ${TEST_DETAILS_JSONL}" fi echo "╠══════════════════════════════════════════════════════════════════╣" echo "║ EVALUATION RESULTS ║" echo "╠══════════════════════════════════════════════════════════════════╣" if [[ -n "${TEST_SUMMARY_JSON:-}" && -f "${TEST_SUMMARY_JSON}" ]]; then python3 -c " import json with open('${TEST_SUMMARY_JSON}') as f: s = json.load(f) total = s.get('total_samples', 0) cls_acc = s.get('classifier_only_accuracy', 0) comp_pr = s.get('completeness_pass_rate', 0) comp_mean = s.get('completeness_mean') halluc_fail = s.get('hallucination_fail_rate', 0) halluc_mean = s.get('hallucination_mean') cls_comp = s.get('accuracy_cls_and_completeness', 0) cls_comp_nh = s.get('accuracy_cls_comp_no_hallucination', 0) comp_thresh = s.get('completeness_threshold', 0) halluc_thresh= s.get('hallucination_threshold', 0) print(f' Total evaluated samples: {total}') print(f' Classifier-only accuracy: {cls_acc:.4f} ({cls_acc*100:.2f}%)') print() comp_str = f'{comp_mean:.4f}' if comp_mean is not None else 'N/A' print(f' Completeness pass rate: {comp_pr:.4f} ({comp_pr*100:.2f}%)') print(f' Completeness mean score: {comp_str}') print(f' Completeness threshold: >= {comp_thresh}') print() halluc_str = f'{halluc_mean:.4f}' if halluc_mean is not None else 'N/A' print(f' Hallucination fail rate: {halluc_fail:.4f} ({halluc_fail*100:.2f}%)') print(f' Hallucination mean score: {halluc_str}') print(f' Hallucination threshold: > {halluc_thresh}') print() print(f' Cls + Completeness: {cls_comp:.4f} ({cls_comp*100:.2f}%)') print(f' Cls + Comp + No Hallucination: {cls_comp_nh:.4f} ({cls_comp_nh*100:.2f}%)') " else echo " [WARN] No test summary JSON found." fi echo "╚══════════════════════════════════════════════════════════════════╝" echo "" echo "[DONE] Full pipeline finished at $(date '+%Y-%m-%d %H:%M:%S')"