#!/bin/bash
set -euo pipefail
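# Strict mode (set above): exit on any error, treat unset variables as
# errors, and fail a pipeline when any stage of it fails.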
|
|
# ====================================================================
# ReadCtrl full evaluation pipeline:
#   1. Launch a vLLM OpenAI-compatible server for the trained model.
#   2. Wait until the server passes a health check.
#   3. Run batched inference over the evaluation dataset.
#   4. Score the generations with the classifier + subclaim thresholds
#      and print a summary.
#
# Every setting below can be overridden via environment variables or
# the command-line flags parsed further down.
# ====================================================================
|
|
# --- vLLM server configuration ---
MODEL_PATH="${MODEL_PATH:-/home/mshahidul/readctrl/code/RL_model/models/converted_model/v1}"
CONDA_ENV="${CONDA_ENV:-verl}"
GPU_ID="${GPU_ID:-1}"
PORT="${PORT:-8001}"
SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-inference}"
DTYPE="${DTYPE:-bfloat16}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-16384}"
|
|
# --- Inference configuration ---
DATASET_PATH="${DATASET_PATH:-/home/mshahidul/readctrl/code/readctrl_rl_inference/verified_combined_0-80_clean200.json}"
INFERENCE_OUTPUT_DIR="${INFERENCE_OUTPUT_DIR:-/home/mshahidul/readctrl/code/RL_model/inference_data}"
BATCH_SIZE="${BATCH_SIZE:-64}"
MAX_TOKENS="${MAX_TOKENS:-1024}"
TEMPERATURE="${TEMPERATURE:-0.7}"
TOP_P="${TOP_P:-0.8}"
NUM_WORKERS="${NUM_WORKERS:-4}"
|
|
# --- Evaluation services (classifier + support checker) ---
CLASSIFIER_API_BASE="${CLASSIFIER_API_BASE:-http://172.16.34.19:8040/v1}"
# The support checker is a plain FastAPI service (no /v1 prefix; see STEP 4).
SUPPORT_API_BASE="${SUPPORT_API_BASE:-http://172.16.34.19:8090}"
SUPPORT_MODEL="${SUPPORT_MODEL:-sc}"
CLASSIFIER_MODEL_PATH="${CLASSIFIER_MODEL_PATH:-/home/mshahidul/readctrl/code/readctrl_rl_inference/model.json}"
REFERENCE_SUBCLAIMS="${REFERENCE_SUBCLAIMS:-/home/mshahidul/readctrl/code/text_classifier/data/verified_combined_0-80_clean200_with_subclaims.json}"
TEST_OUTPUT_DIR="${TEST_OUTPUT_DIR:-/home/mshahidul/readctrl/code/readctrl_rl_inference/test_result_v4}"
|
|
# --- Reading-level prompt templates ---
PROMPT_LOW="${PROMPT_LOW:-/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt_low}"
PROMPT_INTERMEDIATE="${PROMPT_INTERMEDIATE:-/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt_intermediate}"
PROMPT_PROFICIENT="${PROMPT_PROFICIENT:-/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt_proficient}"
|
|
# --- Helper script locations (resolved relative to this script) ---
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INFERENCE_SCRIPT="${SCRIPT_DIR}/run_inference_vllm_server.py"
TEST_SCRIPT="${SCRIPT_DIR}/test_classifier_with_subclaim_thresholds.py"
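
# Small sanity check (an addition, not in the original flow): fail fast if a
# helper script is missing, so the GPU server is never started only for
# STEP 3 or STEP 4 to crash on a bad path.
for required in "${INFERENCE_SCRIPT}" "${TEST_SCRIPT}"; do
    [[ -f "${required}" ]] || { echo "[ERROR] Required script not found: ${required}"; exit 1; }
done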
|
|
# --- Server lifecycle ---
SERVER_STARTUP_TIMEOUT=300   # seconds to wait for the health check
VLLM_PID=""
|
|
# --- Command-line overrides ---
while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpu)            GPU_ID="$2"; shift 2 ;;
        --port)           PORT="$2"; shift 2 ;;
        --model)          MODEL_PATH="$2"; shift 2 ;;
        --batch-size)     BATCH_SIZE="$2"; shift 2 ;;
        --max-samples)    MAX_SAMPLES="$2"; shift 2 ;;
        --dtype)          DTYPE="$2"; shift 2 ;;
        --classifier-api) CLASSIFIER_API_BASE="$2"; shift 2 ;;
        --support-api)    SUPPORT_API_BASE="$2"; shift 2 ;;
        *) echo "[WARN] Unknown arg: $1"; shift ;;
    esac
done
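
# Example invocations (illustrative; substitute whatever filename this
# script is saved as, and combine any subset of flags):
#   bash run_full_pipeline.sh --gpu 0 --port 8000
#   bash run_full_pipeline.sh --max-samples 50 --batch-size 16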
|
|
# MAX_SAMPLES defaults to -1, forwarded verbatim to the Python scripts
# (conventionally "no limit").
MAX_SAMPLES="${MAX_SAMPLES:--1}"
BASE_URL="http://127.0.0.1:${PORT}/v1"
|
|
# Always shut the vLLM server down, whether we exit normally, fail under
# `set -e`, or get interrupted (Ctrl-C / TERM).
cleanup() {
    if [[ -n "${VLLM_PID}" ]] && kill -0 "${VLLM_PID}" 2>/dev/null; then
        echo ""
        echo "================================================================"
        echo "  Shutting down vLLM server (PID ${VLLM_PID}) ..."
        echo "================================================================"
        kill "${VLLM_PID}" 2>/dev/null || true
        wait "${VLLM_PID}" 2>/dev/null || true
        echo "[INFO] vLLM server stopped."
    fi
}
trap cleanup EXIT INT TERM
|
|
# Activate the conda environment; `conda shell.bash hook` makes
# `conda activate` available in this non-interactive shell.
eval "$(conda shell.bash hook)"
conda activate "${CONDA_ENV}"
|
|
RUN_TS="$(date +%Y%m%d_%H%M%S)"
|
|
| echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ" |
| echo "β ReadCtrl Full Pipeline β ${RUN_TS} β" |
| echo "β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£" |
| echo "β Model: ${MODEL_PATH}" |
| echo "β GPU: ${GPU_ID}" |
| echo "β Port: ${PORT}" |
| echo "β Dtype: ${DTYPE}" |
| echo "β Batch: ${BATCH_SIZE} (${NUM_WORKERS} concurrent workers)" |
| echo "β Conda env: ${CONDA_ENV}" |
| echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ" |
| echo "" |
|
|
# ----------------------------------------------------------------
# STEP 1/4: launch the vLLM OpenAI-compatible server
# ----------------------------------------------------------------
echo "================================================================"
echo "  STEP 1/4: Starting vLLM server on GPU ${GPU_ID}, port ${PORT}"
echo "================================================================"
|
|
VLLM_LOG="${INFERENCE_OUTPUT_DIR}/vllm_server_${RUN_TS}.log"
mkdir -p "${INFERENCE_OUTPUT_DIR}"
|
|
# Pin the server to the requested GPU and run it in the background;
# stdout/stderr go to the log file.
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES="${GPU_ID}" \
python -m vllm.entrypoints.openai.api_server \
    --model "${MODEL_PATH}" \
    --served-model-name "${SERVED_MODEL_NAME}" \
    --dtype "${DTYPE}" \
    --port "${PORT}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 256 \
    --enable-prefix-caching \
    --disable-log-requests \
    > "${VLLM_LOG}" 2>&1 &
VLLM_PID=$!
echo "[INFO] vLLM server PID: ${VLLM_PID}"
echo "[INFO] Server log:      ${VLLM_LOG}"
|
|
# ----------------------------------------------------------------
# STEP 2/4: health-check loop
# ----------------------------------------------------------------
echo ""
echo "================================================================"
echo "  STEP 2/4: Waiting for vLLM server to be ready ..."
echo "================================================================"
|
|
# Poll the OpenAI-compatible /models endpoint until it returns HTTP 200,
# bailing out early if the server process dies.
ELAPSED=0
INTERVAL=5
while [[ ${ELAPSED} -lt ${SERVER_STARTUP_TIMEOUT} ]]; do
    if ! kill -0 "${VLLM_PID}" 2>/dev/null; then
        echo "[ERROR] vLLM server process died. Check log: ${VLLM_LOG}"
        tail -30 "${VLLM_LOG}"
        exit 1
    fi
    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL}/models" 2>/dev/null || echo "000")
    if [[ "${HTTP_CODE}" == "200" ]]; then
        echo "[INFO] vLLM server is healthy (${ELAPSED}s elapsed)."
        break
    fi
    echo "  ... waiting (${ELAPSED}s / ${SERVER_STARTUP_TIMEOUT}s, last HTTP=${HTTP_CODE})"
    sleep ${INTERVAL}
    ELAPSED=$((ELAPSED + INTERVAL))
done
|
|
if [[ ${ELAPSED} -ge ${SERVER_STARTUP_TIMEOUT} ]]; then
    echo "[ERROR] Server did not become healthy within ${SERVER_STARTUP_TIMEOUT}s."
    tail -30 "${VLLM_LOG}"
    exit 1
fi
|
|
| echo "" |
| echo "[INFO] Available models on server:" |
| curl -s "${BASE_URL}/models" | python -m json.tool 2>/dev/null || true |
| echo "" |
|
|
# ----------------------------------------------------------------
# STEP 3/4: batched inference against the local server
# ----------------------------------------------------------------
echo "================================================================"
echo "  STEP 3/4: Running batched inference"
echo "================================================================"
echo "[INFO] Dataset:    ${DATASET_PATH}"
echo "[INFO] Output dir: ${INFERENCE_OUTPUT_DIR}"
echo ""
|
|
| python "${INFERENCE_SCRIPT}" \ |
| --model_path "${MODEL_PATH}" \ |
| --dataset_path "${DATASET_PATH}" \ |
| --prompt-low-path "${PROMPT_LOW}" \ |
| --prompt-intermediate-path "${PROMPT_INTERMEDIATE}" \ |
| --prompt-proficient-path "${PROMPT_PROFICIENT}" \ |
| --output_dir "${INFERENCE_OUTPUT_DIR}" \ |
| --base_url "${BASE_URL}" \ |
| --served_model_name "${SERVED_MODEL_NAME}" \ |
| --batch_size "${BATCH_SIZE}" \ |
| --max_samples "${MAX_SAMPLES}" \ |
| --max_tokens "${MAX_TOKENS}" \ |
| --temperature "${TEMPERATURE}" \ |
| --top_p "${TOP_P}" \ |
| --num_workers "${NUM_WORKERS}" |
|
|
# Pick up the newest inference output. The `|| true` keeps `set -o pipefail`
# from aborting the script when no file matches, so the friendly error
# below can fire instead.
INFERENCE_JSONL="$(ls -t "${INFERENCE_OUTPUT_DIR}"/vllm_inference_*.jsonl 2>/dev/null | head -1 || true)"
if [[ -z "${INFERENCE_JSONL}" ]]; then
    echo "[ERROR] No inference JSONL output found in ${INFERENCE_OUTPUT_DIR}"
    exit 1
fi
echo ""
echo "[INFO] Inference output: ${INFERENCE_JSONL}"
INFERENCE_LINE_COUNT="$(wc -l < "${INFERENCE_JSONL}")"
echo "[INFO] Total inference rows: ${INFERENCE_LINE_COUNT}"
|
|
# ----------------------------------------------------------------
# STEP 4/4: classifier + subclaim threshold evaluation
# ----------------------------------------------------------------
echo ""
echo "================================================================"
echo "  STEP 4/4: Running classifier + subclaim threshold evaluation"
echo "================================================================"
echo "[INFO] Input JSONL:         ${INFERENCE_JSONL}"
echo "[INFO] Classifier API:      ${CLASSIFIER_API_BASE}"
echo "[INFO] Support API:         ${SUPPORT_API_BASE} (FastAPI /check_support, no /v1)"
echo "[INFO] Reference subclaims: ${REFERENCE_SUBCLAIMS}"
echo ""
|
|
| python "${TEST_SCRIPT}" \ |
| --model-path "${CLASSIFIER_MODEL_PATH}" \ |
| --input-file "${INFERENCE_JSONL}" \ |
| --reference-subclaims-file "${REFERENCE_SUBCLAIMS}" \ |
| --classifier-api-base "${CLASSIFIER_API_BASE}" \ |
| --support-api-base "${SUPPORT_API_BASE}" \ |
| --output-dir "${TEST_OUTPUT_DIR}" \ |
| --max-samples "${MAX_SAMPLES}" \ |
| --provide-traceback |
|
|
# Same pipefail-safe pattern as above: a missing file yields an empty
# string instead of aborting the script.
TEST_SUMMARY_JSON="$(ls -t "${TEST_OUTPUT_DIR}"/classifier_subclaim_threshold_eval_*.json 2>/dev/null | head -1 || true)"
TEST_DETAILS_JSONL="$(ls -t "${TEST_OUTPUT_DIR}"/classifier_subclaim_threshold_eval_*.jsonl 2>/dev/null | head -1 || true)"
|
|
# ----------------------------------------------------------------
# Final summary
# ----------------------------------------------------------------
echo ""
echo ""
echo "╔════════════════════════════════════════════════════════════════════╗"
echo "║  PIPELINE COMPLETE                                                 ║"
echo "╠════════════════════════════════════════════════════════════════════╣"
echo "║  Run timestamp:    ${RUN_TS}"
echo "║  Model:            ${MODEL_PATH}"
echo "║  GPU:              ${GPU_ID}"
echo "║  Samples inferred: ${INFERENCE_LINE_COUNT}"
echo "╠════════════════════════════════════════════════════════════════════╣"
echo "║  OUTPUT FILES                                                      ║"
echo "╠════════════════════════════════════════════════════════════════════╣"
echo "║  Inference JSONL: ${INFERENCE_JSONL}"
echo "║  vLLM server log: ${VLLM_LOG}"
|
|
| if [[ -n "${TEST_SUMMARY_JSON:-}" ]]; then |
| echo "β Test summary: ${TEST_SUMMARY_JSON}" |
| fi |
| if [[ -n "${TEST_DETAILS_JSONL:-}" ]]; then |
| echo "β Test details: ${TEST_DETAILS_JSONL}" |
| fi |
|
|
| echo "β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£" |
| echo "β EVALUATION RESULTS β" |
| echo "β βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£" |
|
|
| if [[ -n "${TEST_SUMMARY_JSON:-}" && -f "${TEST_SUMMARY_JSON}" ]]; then |
| python3 -c " |
| import json |
| with open('${TEST_SUMMARY_JSON}') as f: |
| s = json.load(f) |
| total = s.get('total_samples', 0) |
| cls_acc = s.get('classifier_only_accuracy', 0) |
| comp_pr = s.get('completeness_pass_rate', 0) |
| comp_mean = s.get('completeness_mean') |
| halluc_fail = s.get('hallucination_fail_rate', 0) |
| halluc_mean = s.get('hallucination_mean') |
| cls_comp = s.get('accuracy_cls_and_completeness', 0) |
| cls_comp_nh = s.get('accuracy_cls_comp_no_hallucination', 0) |
| comp_thresh = s.get('completeness_threshold', 0) |
| halluc_thresh= s.get('hallucination_threshold', 0) |
| print(f' Total evaluated samples: {total}') |
| print(f' Classifier-only accuracy: {cls_acc:.4f} ({cls_acc*100:.2f}%)') |
| print() |
| comp_str = f'{comp_mean:.4f}' if comp_mean is not None else 'N/A' |
| print(f' Completeness pass rate: {comp_pr:.4f} ({comp_pr*100:.2f}%)') |
| print(f' Completeness mean score: {comp_str}') |
| print(f' Completeness threshold: >= {comp_thresh}') |
| print() |
| halluc_str = f'{halluc_mean:.4f}' if halluc_mean is not None else 'N/A' |
| print(f' Hallucination fail rate: {halluc_fail:.4f} ({halluc_fail*100:.2f}%)') |
| print(f' Hallucination mean score: {halluc_str}') |
| print(f' Hallucination threshold: > {halluc_thresh}') |
| print() |
| print(f' Cls + Completeness: {cls_comp:.4f} ({cls_comp*100:.2f}%)') |
| print(f' Cls + Comp + No Hallucination: {cls_comp_nh:.4f} ({cls_comp_nh*100:.2f}%)') |
| " |
| else |
| echo " [WARN] No test summary JSON found." |
| fi |
|
|
| echo "ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ" |
| echo "" |
| echo "[DONE] Full pipeline finished at $(date '+%Y-%m-%d %H:%M:%S')" |
|
|