qwen3-coder-next / test-api.sh
gdubicki's picture
Switch to saricles/Qwen3-Coder-Next-NVFP4-GB10 with GB10-optimized settings
327350d verified
#!/usr/bin/env bash
# test-api.sh — smoke tests for the vLLM API
#
# Usage:
#   bash test-api.sh                     # test localhost:8000
#   bash test-api.sh 192.168.1.50        # test remote host
#   bash test-api.sh 192.168.1.50 8080   # remote host, custom port
#
# Positional arguments:
#   $1  host to test (default: localhost)
#   $2  port to test (default: 8000)
set -euo pipefail

# Target endpoint; the /v1 routes exercised below are built from BASE_URL.
HOST="${1:-localhost}"
PORT="${2:-8000}"
BASE_URL="http://${HOST}:${PORT}/v1"

# Colour escapes for the status tags. They start out empty and are only filled
# in when stdout is a terminal, so redirected or piped output stays free of
# escape sequences.
GREEN=""
RED=""
YELLOW=""
NC=""
if [[ -t 1 ]]; then
  GREEN="\033[0;32m"
  RED="\033[0;31m"
  YELLOW="\033[0;33m"
  NC="\033[0m"
fi
# Status-line helpers: print a coloured tag followed by the message on stdout.
# Arguments: all arguments are joined with spaces to form the message.
# Globals:   GREEN, RED, YELLOW, NC (read) — colour escapes, possibly empty.
# Only the colour variables go through %b (they hold "\033[..." sequences that
# must be expanded); the message itself is printed with %s so that backslashes
# in server-supplied text (model IDs, HTTP codes) are shown verbatim instead of
# being interpreted as escape sequences.
ok()   { printf '%b[OK]%b %s\n'   "${GREEN}"  "${NC}" "$*"; }
fail() { printf '%b[FAIL]%b %s\n' "${RED}"    "${NC}" "$*"; }
info() { printf '%b[INFO]%b %s\n' "${YELLOW}" "${NC}" "$*"; }
# Opening banner: rule, title with the URL under test, rule, blank line.
printf '%s\n' \
  "============================================================" \
  " vLLM API smoke tests — ${BASE_URL}" \
  "============================================================" \
  ""
# ---------------------------------------------------------------------------
# Test 1: Health endpoint
# ---------------------------------------------------------------------------
# Requests /health at the server root (the "/v1" suffix is stripped from
# BASE_URL) and checks only the HTTP status code; the body is discarded.
info "Test 1: /health"
# curl exits non-zero when the connection fails. Under `set -e` that would
# abort the script right here, before the [FAIL] line below could be printed,
# so the exit status is deliberately ignored and the status code is inspected
# instead.
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL%/v1}/health") || true
# Fall back to "000" if curl produced no status code at all.
HTTP_CODE="${HTTP_CODE:-000}"
if [[ "${HTTP_CODE}" == "200" ]]; then
  ok "/health returned HTTP 200"
else
  fail "/health returned HTTP ${HTTP_CODE} (server may still be loading)"
fi
echo ""
# ---------------------------------------------------------------------------
# Test 2: Model list
# ---------------------------------------------------------------------------
# Fetches /v1/models, prints the response, and stores the first model's id in
# MODEL_ID for the chat-completion tests that follow.
info "Test 2: GET /v1/models"
# A failed curl must not abort the script under `set -e`; treat it as an empty
# response so the parse below fails and the [FAIL] branch is reported.
MODELS_RESPONSE=$(curl -s "${BASE_URL}/models") || MODELS_RESPONSE=""
# Pretty-print when the body is valid JSON, otherwise show it raw.
python3 -m json.tool <<<"${MODELS_RESPONSE}" 2>/dev/null \
  || printf '%s\n' "${MODELS_RESPONSE}"
# Any parse error (invalid JSON, missing keys, empty list) yields an empty id.
MODEL_ID=$(python3 -c \
  "import sys,json; data=json.load(sys.stdin); print(data['data'][0]['id'])" \
  <<<"${MODELS_RESPONSE}" 2>/dev/null || echo "")
if [[ -n "${MODEL_ID}" ]]; then
  ok "Model loaded: ${MODEL_ID}"
else
  fail "Could not parse model list"
  # Fallback id so the remaining tests can still send a request.
  # NOTE(review): confirm this still matches the model the server is started
  # with; a stale id will make the following tests fail as well.
  MODEL_ID="GadflyII/Qwen3-Coder-Next-NVFP4"
fi
echo ""
# ---------------------------------------------------------------------------
# Test 3: Chat completion (reasoning off)
# ---------------------------------------------------------------------------
# Sends a short prompt with enable_thinking=false and passes if the first
# choice's message content can be extracted from the response.
info "Test 3: POST /v1/chat/completions (reasoning off)"
# A failed curl must not abort the script under `set -e`; an empty response
# falls through to the [FAIL] branch below.
RESPONSE=$(curl -s \
  -X POST "${BASE_URL}/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"${MODEL_ID}\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Reply in one sentence: what is the capital of France?\"}],
    \"max_tokens\": 60,
    \"temperature\": 0.1,
    \"chat_template_kwargs\": {\"enable_thinking\": false}
  }") || RESPONSE=""
# Any parse error (invalid JSON, missing keys) yields empty content.
CONTENT=$(python3 -c \
  "import sys,json; r=json.load(sys.stdin); print(r['choices'][0]['message']['content'])" \
  <<<"${RESPONSE}" 2>/dev/null || echo "")
if [[ -n "${CONTENT}" ]]; then
  ok "Chat completion works."
  echo " >> ${CONTENT}"
else
  fail "No response"
  # Show whatever came back (pretty-printed if it is JSON) to aid debugging.
  python3 -m json.tool <<<"${RESPONSE}" 2>/dev/null || printf '%s\n' "${RESPONSE}"
fi
echo ""
# ---------------------------------------------------------------------------
# Test 4: Chat completion (reasoning on)
# ---------------------------------------------------------------------------
# Sends an arithmetic prompt with enable_thinking=true and prints a truncated
# view of the reasoning text alongside the final answer.
info "Test 4: POST /v1/chat/completions (reasoning on)"
# A failed curl must not abort the script under `set -e`; an empty response
# falls through to the [FAIL] branch below.
RESPONSE=$(curl -s \
  -X POST "${BASE_URL}/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"${MODEL_ID}\",
    \"messages\": [{\"role\": \"user\", \"content\": \"What is 17 * 23? Show your work.\"}],
    \"max_tokens\": 1000,
    \"temperature\": 0.1,
    \"chat_template_kwargs\": {\"enable_thinking\": true}
  }") || RESPONSE=""
# The reasoning text is read from 'reasoning_content', falling back to
# 'reasoning'; its repr is cut to 80 characters to keep the output short.
CONTENT=$(python3 -c \
  "import sys,json; r=json.load(sys.stdin); m=r['choices'][0]['message']; thinking=m.get('reasoning_content') or m.get('reasoning',''); print('thinking:', repr(thinking)[:80], '\nanswer:', m.get('content',''))" \
  <<<"${RESPONSE}" 2>/dev/null || echo "")
if [[ -n "${CONTENT}" ]]; then
  ok "Reasoning mode works."
  printf '%s\n' "${CONTENT}"
else
  fail "No response from reasoning mode"
  # Show whatever came back (pretty-printed if it is JSON), as Test 3 does.
  python3 -m json.tool <<<"${RESPONSE}" 2>/dev/null || printf '%s\n' "${RESPONSE}"
fi
echo ""
# ---------------------------------------------------------------------------
# Test 5: Code generation
# ---------------------------------------------------------------------------
# Asks for a small Python function with enable_thinking=false and prints the
# first ten lines of the reply.
info "Test 5: Code generation"
# A failed curl must not abort the script under `set -e`; an empty response
# falls through to the [FAIL] branch below.
RESPONSE=$(curl -s \
  -X POST "${BASE_URL}/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"${MODEL_ID}\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Write a Python function that returns the nth Fibonacci number using memoization.\"}],
    \"max_tokens\": 300,
    \"temperature\": 0.1,
    \"chat_template_kwargs\": {\"enable_thinking\": false}
  }") || RESPONSE=""
# Any parse error (invalid JSON, missing keys) yields an empty result.
CODE=$(python3 -c \
  "import sys,json; r=json.load(sys.stdin); print(r['choices'][0]['message']['content'])" \
  <<<"${RESPONSE}" 2>/dev/null || echo "")
if [[ -n "${CODE}" ]]; then
  ok "Code generation works."
  # Preview only; the here-string avoids a pipeline and -n is the POSIX form.
  head -n 10 <<<"${CODE}"
  echo " ..."
else
  fail "No code response"
  # Show whatever came back (pretty-printed if it is JSON), as Test 3 does.
  python3 -m json.tool <<<"${RESPONSE}" 2>/dev/null || printf '%s\n' "${RESPONSE}"
fi
echo ""
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# Prints the connection settings to enter in Cline's "OpenAI Compatible"
# provider, using the URL and model id the tests above ran against.
cat <<EOF
============================================================
 Cline configuration (OpenAI Compatible provider):

 Base URL : ${BASE_URL}
 Model ID : ${MODEL_ID}
 API Key : none (any non-empty string)
============================================================
EOF