File size: 5,494 Bytes

327350d

#!/usr/bin/env bash
# test-api.sh — smoke tests for the vLLM API
# Usage:
#   bash test-api.sh                      # test localhost:8000
#   bash test-api.sh 192.168.1.50         # test remote host
#   bash test-api.sh 192.168.1.50 8080    # remote host, custom port

# Strict mode: stop on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Target server: first positional argument is the host, second is the port.
# An omitted or empty argument falls back to the default.
HOST="localhost"
PORT="8000"
if [[ -n "${1:-}" ]]; then
    HOST="$1"
fi
if [[ -n "${2:-}" ]]; then
    PORT="$2"
fi
printf -v BASE_URL 'http://%s:%s/v1' "${HOST}" "${PORT}"

# Colour the status tags only when stdout is a terminal, so redirected or
# piped output stays free of escape codes. ANSI-C quoting stores the real ESC
# byte, so nothing has to interpret escape sequences at print time.
if [[ -t 1 ]]; then
    GREEN=$'\033[0;32m'; RED=$'\033[0;31m'; YELLOW=$'\033[0;33m'; NC=$'\033[0m'
else
    GREEN=""; RED=""; YELLOW=""; NC=""
fi

# Status helpers: print a (possibly coloured) tag followed by all arguments
# joined by spaces. The message is passed through %s, so backslashes in it
# (for example in a server-reported model path) are printed literally rather
# than being expanded as escape sequences.
ok()   { printf '%s[OK]%s  %s\n' "${GREEN}" "${NC}" "$*"; }
fail() { printf '%s[FAIL]%s %s\n' "${RED}" "${NC}" "$*"; }
info() { printf '%s[INFO]%s %s\n' "${YELLOW}" "${NC}" "$*"; }

# Opening banner: shows which endpoint the tests below are run against.
printf '%s\n' \
    "============================================================" \
    "  vLLM API smoke tests — ${BASE_URL}" \
    "============================================================" \
    ""

# ---------------------------------------------------------------------------
# Test 1: Health endpoint
# ---------------------------------------------------------------------------
# The health URL is built from BASE_URL with its trailing /v1 removed.
info "Test 1: /health"
# curl exits non-zero when it cannot connect (refused, DNS failure, ...).
# Under `set -e` that would abort the script before the failure is reported,
# so the exit status is absorbed here. With -w, curl still prints "000" when
# no HTTP response was received; the default below covers the case where curl
# produced no output at all (for example, curl is not installed).
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL%/v1}/health") || true
HTTP_CODE="${HTTP_CODE:-000}"
if [[ "${HTTP_CODE}" == "200" ]]; then
    ok "/health returned HTTP 200"
elif [[ "${HTTP_CODE}" == "000" ]]; then
    fail "/health is unreachable at ${BASE_URL%/v1} (no HTTP response)"
else
    fail "/health returned HTTP ${HTTP_CODE} (server may still be loading)"
fi
echo ""

# ---------------------------------------------------------------------------
# Test 2: Model list
# ---------------------------------------------------------------------------
# Fetches the model list, prints it, and stores the first model's id in
# MODEL_ID for the chat-completion tests that follow.
info "Test 2: GET /v1/models"
# Absorb curl's exit status: a connection failure must be reported by the
# checks below, not abort the script through `set -e`. Whatever curl managed
# to print is kept for diagnostics.
MODELS_RESPONSE=$(curl -s "${BASE_URL}/models") || true
# Pretty-print when the body is valid JSON; otherwise show it raw.
python3 -m json.tool <<<"${MODELS_RESPONSE}" 2>/dev/null || printf '%s\n' "${MODELS_RESPONSE}"

# Extract the id of the first listed model; any parse error yields an empty
# MODEL_ID instead of a Python traceback.
MODEL_ID=$(python3 -c \
    "import sys,json; data=json.load(sys.stdin); print(data['data'][0]['id'])" \
    <<<"${MODELS_RESPONSE}" 2>/dev/null || echo "")

if [[ -n "${MODEL_ID}" ]]; then
    ok "Model loaded: ${MODEL_ID}"
else
    fail "Could not parse model list"
    # Fall back to a hard-coded model id so the remaining tests can still
    # send a request.
    MODEL_ID="GadflyII/Qwen3-Coder-Next-NVFP4"
fi
echo ""

# ---------------------------------------------------------------------------
# Test 3: Chat completion (reasoning off)
# ---------------------------------------------------------------------------
# Sends a short factual prompt with enable_thinking=false and checks that the
# first choice carries message content.
info "Test 3: POST /v1/chat/completions (reasoning off)"
# `|| true` keeps a curl failure (connection refused, reset, ...) from
# aborting the script under `set -e`; the empty or partial RESPONSE is then
# reported by the check below.
# NOTE(review): MODEL_ID is interpolated into the JSON body unescaped; an id
# containing a double quote or backslash would produce invalid JSON.
RESPONSE=$(curl -s \
    -X POST "${BASE_URL}/chat/completions" \
    -H "Content-Type: application/json" \
    -d "{
        \"model\": \"${MODEL_ID}\",
        \"messages\": [{\"role\": \"user\", \"content\": \"Reply in one sentence: what is the capital of France?\"}],
        \"max_tokens\": 60,
        \"temperature\": 0.1,
        \"chat_template_kwargs\": {\"enable_thinking\": false}
    }") || true

# Pull out the assistant message; a parse error or missing key yields an
# empty CONTENT.
CONTENT=$(python3 -c \
    "import sys,json; r=json.load(sys.stdin); print(r['choices'][0]['message']['content'])" \
    <<<"${RESPONSE}" 2>/dev/null || echo "")

if [[ -n "${CONTENT}" ]]; then
    ok "Chat completion works."
    echo "  >> ${CONTENT}"
else
    fail "No response"
    # Show what the server actually returned, pretty-printed when possible.
    python3 -m json.tool <<<"${RESPONSE}" 2>/dev/null || printf '%s\n' "${RESPONSE}"
fi
echo ""

# ---------------------------------------------------------------------------
# Test 4: Chat completion (reasoning on)
# ---------------------------------------------------------------------------
# Sends an arithmetic prompt with enable_thinking=true and prints a preview of
# the reasoning text alongside the final answer.
info "Test 4: POST /v1/chat/completions (reasoning on)"
# `|| true` keeps a curl failure from aborting the script under `set -e`;
# the empty or partial RESPONSE is then reported by the check below.
RESPONSE=$(curl -s \
    -X POST "${BASE_URL}/chat/completions" \
    -H "Content-Type: application/json" \
    -d "{
        \"model\": \"${MODEL_ID}\",
        \"messages\": [{\"role\": \"user\", \"content\": \"What is 17 * 23? Show your work.\"}],
        \"max_tokens\": 1000,
        \"temperature\": 0.1,
        \"chat_template_kwargs\": {\"enable_thinking\": true}
    }") || true

# The reasoning text is read from 'reasoning_content', falling back to
# 'reasoning'; only the first 80 characters of its repr are shown.
# A parse error or missing key yields an empty CONTENT.
CONTENT=$(python3 -c \
    "import sys,json; r=json.load(sys.stdin); m=r['choices'][0]['message']; thinking=m.get('reasoning_content') or m.get('reasoning',''); print('thinking:', repr(thinking)[:80], '\nanswer:', m.get('content',''))" \
    <<<"${RESPONSE}" 2>/dev/null || echo "")

if [[ -n "${CONTENT}" ]]; then
    ok "Reasoning mode works."
    echo "${CONTENT}"
else
    fail "No response from reasoning mode"
    # Show what the server actually returned, pretty-printed when possible,
    # so the failure can be diagnosed.
    python3 -m json.tool <<<"${RESPONSE}" 2>/dev/null || printf '%s\n' "${RESPONSE}"
fi
echo ""

# ---------------------------------------------------------------------------
# Test 5: Code generation
# ---------------------------------------------------------------------------
# Asks for a small Python function with enable_thinking=false and prints the
# first lines of the reply.
info "Test 5: Code generation"
# `|| true` keeps a curl failure from aborting the script under `set -e`;
# the empty or partial RESPONSE is then reported by the check below.
RESPONSE=$(curl -s \
    -X POST "${BASE_URL}/chat/completions" \
    -H "Content-Type: application/json" \
    -d "{
        \"model\": \"${MODEL_ID}\",
        \"messages\": [{\"role\": \"user\", \"content\": \"Write a Python function that returns the nth Fibonacci number using memoization.\"}],
        \"max_tokens\": 300,
        \"temperature\": 0.1,
        \"chat_template_kwargs\": {\"enable_thinking\": false}
    }") || true

# Pull out the assistant message; a parse error or missing key yields an
# empty CODE.
CODE=$(python3 -c \
    "import sys,json; r=json.load(sys.stdin); print(r['choices'][0]['message']['content'])" \
    <<<"${RESPONSE}" 2>/dev/null || echo "")

if [[ -n "${CODE}" ]]; then
    ok "Code generation works."
    # A here-string instead of `echo | head`: with pipefail, head closing the
    # pipe early could surface as a failed pipeline and stop the script.
    head -n 10 <<<"${CODE}"
    echo "  ..."
else
    fail "No code response"
    # Show what the server actually returned, pretty-printed when possible,
    # so the failure can be diagnosed.
    python3 -m json.tool <<<"${RESPONSE}" 2>/dev/null || printf '%s\n' "${RESPONSE}"
fi
echo ""

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# Prints the values to enter in Cline's "OpenAI Compatible" provider settings,
# using the base URL and model id established earlier in the script.
printf '%s\n' \
    "============================================================" \
    "  Cline configuration (OpenAI Compatible provider):" \
    "" \
    "    Base URL : ${BASE_URL}" \
    "    Model ID : ${MODEL_ID}" \
    "    API Key  : none  (any non-empty string)" \
    "============================================================"