#!/usr/bin/env bash
#
# Smoke tests for a vLLM server exposing an OpenAI-compatible /v1 API.
#
# Usage: <script> [HOST] [PORT]
#   HOST  server host name (default: localhost)
#   PORT  server port      (default: 8000)

# Strict mode: stop on unhandled errors, unset variables and failed pipelines.
set -o errexit -o nounset -o pipefail

# Positional arguments with defaults; BASE_URL is the root of the /v1 API.
HOST=${1:-localhost}
PORT=${2:-8000}
BASE_URL="http://${HOST}:${PORT}/v1"
|
|
# Colour variables: empty by default so redirected output carries no escape
# sequences; filled with ANSI codes only when stdout is a terminal.
# The values are literal backslash sequences, expanded when they are printed.
GREEN=""
RED=""
YELLOW=""
NC=""
if [[ -t 1 ]]; then
  GREEN="\033[0;32m"
  RED="\033[0;31m"
  YELLOW="\033[0;33m"
  NC="\033[0m"
fi
|
|
#######################################
# Status-line helpers: print a coloured tag followed by the arguments
# (joined by spaces) on stdout.
# Globals:   GREEN, RED, YELLOW, NC (read) - colour escape strings, may be empty
# Arguments: the message words
# Outputs:   one line on stdout
#
# The colour variables go through %b so their backslash escapes are expanded;
# the message goes through %s and is printed verbatim, so text that happens
# to contain "\n" or "\t" is not reinterpreted.
#######################################
ok() {
  printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"
}

fail() {
  printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*"
}

info() {
  printf '%b[INFO]%b %s\n' "${YELLOW}" "${NC}" "$*"
}
|
|
# Opening banner naming the API root under test, followed by a blank line.
printf '%s\n' \
  "============================================================" \
  " vLLM API smoke tests — ${BASE_URL}" \
  "============================================================" \
  ""
|
|
| |
| |
| |
# Test 1: liveness probe. /health is requested at the server root, so the
# trailing /v1 is stripped from BASE_URL.
info "Test 1: /health"

# If the connection fails, curl exits non-zero (while still printing "000" for
# %{http_code}). Without "|| true", `set -e` would end the script right here,
# silently, and the failure below would never be reported.
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL%/v1}/health") || true
HTTP_CODE=${HTTP_CODE:-000}  # curl produced no output at all (e.g. not installed)

if [[ "${HTTP_CODE}" == "200" ]]; then
  ok "/health returned HTTP 200"
else
  fail "/health returned HTTP ${HTTP_CODE} (server may still be loading)"
fi
echo ""
|
|
| |
| |
| |
# Test 2: list the served models and remember the first model ID in MODEL_ID.
info "Test 2: GET /v1/models"

# A curl failure (e.g. connection refused) must not abort the script under
# `set -e`; an empty response is handled by the fallback branch below.
MODELS_RESPONSE=$(curl -s "${BASE_URL}/models") || true

# Pretty-print the response; if it is not valid JSON, show it raw instead.
printf '%s\n' "${MODELS_RESPONSE}" | python3 -m json.tool 2>/dev/null \
  || printf '%s\n' "${MODELS_RESPONSE}"

# Extract data[0].id; any parse or lookup error leaves MODEL_ID empty.
MODEL_ID=$(printf '%s\n' "${MODELS_RESPONSE}" | python3 -c \
  'import json, sys; print(json.load(sys.stdin)["data"][0]["id"])' 2>/dev/null) \
  || MODEL_ID=""

if [[ -n "${MODEL_ID}" ]]; then
  ok "Model loaded: ${MODEL_ID}"
else
  fail "Could not parse model list"
  # Fall back to a fixed model ID so later requests still carry a model name.
  MODEL_ID="GadflyII/Qwen3-Coder-Next-NVFP4"
fi
echo ""
|
|
| |
| |
| |
# Test 3: basic chat completion with thinking disabled through
# chat_template_kwargs.
info "Test 3: POST /v1/chat/completions (reasoning off)"

# "|| true": a curl failure must not end the script under `set -e`; an empty
# RESPONSE is reported by the failure branch below.
RESPONSE=$(curl -s \
  -X POST "${BASE_URL}/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"${MODEL_ID}\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Reply in one sentence: what is the capital of France?\"}],
    \"max_tokens\": 60,
    \"temperature\": 0.1,
    \"chat_template_kwargs\": {\"enable_thinking\": false}
  }") || true

# Extract choices[0].message.content. A JSON null becomes an empty string
# (instead of the text "None") so it is treated as "no response"; any parse
# or lookup error also leaves CONTENT empty.
CONTENT=$(printf '%s\n' "${RESPONSE}" | python3 -c '
import json, sys
r = json.load(sys.stdin)
print(r["choices"][0]["message"]["content"] or "")
' 2>/dev/null) || CONTENT=""

if [[ -n "${CONTENT}" ]]; then
  ok "Chat completion works."
  echo " >> ${CONTENT}"
else
  fail "No response"
  # Show what the server actually sent: pretty-printed if JSON, raw otherwise.
  printf '%s\n' "${RESPONSE}" | python3 -m json.tool 2>/dev/null \
    || printf '%s\n' "${RESPONSE}"
fi
echo ""
|
|
| |
| |
| |
# Test 4: chat completion with thinking enabled through chat_template_kwargs.
info "Test 4: POST /v1/chat/completions (reasoning on)"

# "|| true": a curl failure must not end the script under `set -e`; an empty
# RESPONSE is reported by the failure branch below.
RESPONSE=$(curl -s \
  -X POST "${BASE_URL}/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"${MODEL_ID}\",
    \"messages\": [{\"role\": \"user\", \"content\": \"What is 17 * 23? Show your work.\"}],
    \"max_tokens\": 1000,
    \"temperature\": 0.1,
    \"chat_template_kwargs\": {\"enable_thinking\": true}
  }") || true

# Summarise the message: the reasoning text is read from "reasoning_content",
# falling back to "reasoning", and its repr is cut to 80 characters; the
# answer is the message content. Any parse or lookup error leaves CONTENT empty.
CONTENT=$(printf '%s\n' "${RESPONSE}" | python3 -c '
import json, sys
m = json.load(sys.stdin)["choices"][0]["message"]
thinking = m.get("reasoning_content") or m.get("reasoning", "")
print("thinking:", repr(thinking)[:80], "\nanswer:", m.get("content", ""))
' 2>/dev/null) || CONTENT=""

if [[ -n "${CONTENT}" ]]; then
  ok "Reasoning mode works."
  printf '%s\n' "${CONTENT}"
else
  fail "No response from reasoning mode"
  # Show what the server actually sent: pretty-printed if JSON, raw otherwise.
  printf '%s\n' "${RESPONSE}" | python3 -m json.tool 2>/dev/null \
    || printf '%s\n' "${RESPONSE}"
fi
echo ""
|
|
| |
| |
| |
# Test 5: code-generation prompt with thinking disabled; prints the first
# ten lines of the reply.
info "Test 5: Code generation"

# "|| true": a curl failure must not end the script under `set -e`; an empty
# RESPONSE is reported by the failure branch below.
RESPONSE=$(curl -s \
  -X POST "${BASE_URL}/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"${MODEL_ID}\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Write a Python function that returns the nth Fibonacci number using memoization.\"}],
    \"max_tokens\": 300,
    \"temperature\": 0.1,
    \"chat_template_kwargs\": {\"enable_thinking\": false}
  }") || true

# Extract choices[0].message.content. A JSON null becomes an empty string
# (instead of the text "None") so it is treated as "no code"; any parse or
# lookup error also leaves CODE empty.
CODE=$(printf '%s\n' "${RESPONSE}" | python3 -c '
import json, sys
r = json.load(sys.stdin)
print(r["choices"][0]["message"]["content"] or "")
' 2>/dev/null) || CODE=""

if [[ -n "${CODE}" ]]; then
  ok "Code generation works."
  # Here-string instead of a pipe: no writer is left to fail under pipefail
  # when head stops reading early.
  head -n 10 <<<"${CODE}"
  echo " ..."
else
  fail "No code response"
  # Show what the server actually sent: pretty-printed if JSON, raw otherwise.
  printf '%s\n' "${RESPONSE}" | python3 -m json.tool 2>/dev/null \
    || printf '%s\n' "${RESPONSE}"
fi
echo ""
|
|
| |
| |
| |
# Closing summary: the values to enter in Cline's "OpenAI Compatible" provider.
printf '%s\n' \
  "============================================================" \
  " Cline configuration (OpenAI Compatible provider):" \
  "" \
  " Base URL : ${BASE_URL}" \
  " Model ID : ${MODEL_ID}" \
  " API Key : none (any non-empty string)" \
  "============================================================"
|
|