#!/usr/bin/env bash # test-api.sh — smoke tests for the vLLM API # Usage: # bash test-api.sh # test localhost:8000 # bash test-api.sh 192.168.1.50 # test remote host # bash test-api.sh 192.168.1.50 8080 # remote host, custom port set -euo pipefail HOST="${1:-localhost}" PORT="${2:-8000}" BASE_URL="http://${HOST}:${PORT}/v1" if [[ -t 1 ]]; then GREEN="\033[0;32m"; RED="\033[0;31m"; YELLOW="\033[0;33m"; NC="\033[0m" else GREEN=""; RED=""; YELLOW=""; NC="" fi ok() { echo -e "${GREEN}[OK]${NC} $*"; } fail() { echo -e "${RED}[FAIL]${NC} $*"; } info() { echo -e "${YELLOW}[INFO]${NC} $*"; } echo "============================================================" echo " vLLM API smoke tests — ${BASE_URL}" echo "============================================================" echo "" # --------------------------------------------------------------------------- # Test 1: Health endpoint # --------------------------------------------------------------------------- info "Test 1: /health" HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL%/v1}/health") if [[ "${HTTP_CODE}" == "200" ]]; then ok "/health returned HTTP 200" else fail "/health returned HTTP ${HTTP_CODE} (server may still be loading)" fi echo "" # --------------------------------------------------------------------------- # Test 2: Model list # --------------------------------------------------------------------------- info "Test 2: GET /v1/models" MODELS_RESPONSE=$(curl -s "${BASE_URL}/models") echo "${MODELS_RESPONSE}" | python3 -m json.tool 2>/dev/null || echo "${MODELS_RESPONSE}" MODEL_ID=$(echo "${MODELS_RESPONSE}" | python3 -c \ "import sys,json; data=json.load(sys.stdin); print(data['data'][0]['id'])" 2>/dev/null || echo "") if [[ -n "${MODEL_ID}" ]]; then ok "Model loaded: ${MODEL_ID}" else fail "Could not parse model list" MODEL_ID="GadflyII/Qwen3-Coder-Next-NVFP4" fi echo "" # --------------------------------------------------------------------------- # Test 3: Chat completion (reasoning off) # --------------------------------------------------------------------------- info "Test 3: POST /v1/chat/completions (reasoning off)" RESPONSE=$(curl -s \ -X POST "${BASE_URL}/chat/completions" \ -H "Content-Type: application/json" \ -d "{ \"model\": \"${MODEL_ID}\", \"messages\": [{\"role\": \"user\", \"content\": \"Reply in one sentence: what is the capital of France?\"}], \"max_tokens\": 60, \"temperature\": 0.1, \"chat_template_kwargs\": {\"enable_thinking\": false} }") CONTENT=$(echo "${RESPONSE}" | python3 -c \ "import sys,json; r=json.load(sys.stdin); print(r['choices'][0]['message']['content'])" 2>/dev/null || echo "") if [[ -n "${CONTENT}" ]]; then ok "Chat completion works." echo " >> ${CONTENT}" else fail "No response" echo "${RESPONSE}" | python3 -m json.tool 2>/dev/null || echo "${RESPONSE}" fi echo "" # --------------------------------------------------------------------------- # Test 4: Chat completion (reasoning on) # --------------------------------------------------------------------------- info "Test 4: POST /v1/chat/completions (reasoning on)" RESPONSE=$(curl -s \ -X POST "${BASE_URL}/chat/completions" \ -H "Content-Type: application/json" \ -d "{ \"model\": \"${MODEL_ID}\", \"messages\": [{\"role\": \"user\", \"content\": \"What is 17 * 23? Show your work.\"}], \"max_tokens\": 1000, \"temperature\": 0.1, \"chat_template_kwargs\": {\"enable_thinking\": true} }") CONTENT=$(echo "${RESPONSE}" | python3 -c \ "import sys,json; r=json.load(sys.stdin); m=r['choices'][0]['message']; thinking=m.get('reasoning_content') or m.get('reasoning',''); print('thinking:', repr(thinking)[:80], '\nanswer:', m.get('content',''))" \ 2>/dev/null || echo "") if [[ -n "${CONTENT}" ]]; then ok "Reasoning mode works." echo "${CONTENT}" else fail "No response from reasoning mode" fi echo "" # --------------------------------------------------------------------------- # Test 5: Code generation # --------------------------------------------------------------------------- info "Test 5: Code generation" RESPONSE=$(curl -s \ -X POST "${BASE_URL}/chat/completions" \ -H "Content-Type: application/json" \ -d "{ \"model\": \"${MODEL_ID}\", \"messages\": [{\"role\": \"user\", \"content\": \"Write a Python function that returns the nth Fibonacci number using memoization.\"}], \"max_tokens\": 300, \"temperature\": 0.1, \"chat_template_kwargs\": {\"enable_thinking\": false} }") CODE=$(echo "${RESPONSE}" | python3 -c \ "import sys,json; r=json.load(sys.stdin); print(r['choices'][0]['message']['content'])" 2>/dev/null || echo "") if [[ -n "${CODE}" ]]; then ok "Code generation works." echo "${CODE}" | head -10 echo " ..." else fail "No code response" fi echo "" # --------------------------------------------------------------------------- # Summary # --------------------------------------------------------------------------- echo "============================================================" echo " Cline configuration (OpenAI Compatible provider):" echo "" echo " Base URL : ${BASE_URL}" echo " Model ID : ${MODEL_ID}" echo " API Key : none (any non-empty string)" echo "============================================================"