Switch to saricles/Qwen3-Coder-Next-NVFP4-GB10 with GB10-optimized settings

327350d verified about 1 month ago

5.49 kB

	#!/usr/bin/env bash
	# test-api.sh — smoke tests for the vLLM API
	# Usage:
	#   bash test-api.sh                      # test localhost:8000
	#   bash test-api.sh 192.168.1.50         # test remote host
	#   bash test-api.sh 192.168.1.50 8080    # remote host, custom port

	# Strict mode: stop on command failure, on use of an unset variable, and when
	# any stage of a pipeline fails.
	set -o errexit -o nounset -o pipefail

	# Target server: first argument is the host, second the port; both optional.
	HOST="${1:-localhost}"
	PORT="${2:-8000}"
	printf -v BASE_URL 'http://%s:%s/v1' "${HOST}" "${PORT}"

	# Colour codes for the status tags. They stay empty unless stdout is a
	# terminal, so redirected or piped output carries no escape sequences.
	GREEN=""; RED=""; YELLOW=""; NC=""
	if [[ -t 1 ]]; then
	  GREEN="\033[0;32m"
	  RED="\033[0;31m"
	  YELLOW="\033[0;33m"
	  NC="\033[0m"
	fi

	# Status helpers: print one colour-tagged line to stdout.
	# Globals:   GREEN, RED, YELLOW, NC (read) — escape codes, possibly empty
	# Arguments: message words, joined with single spaces
	# The colour codes go through %b so their "\033" escapes are expanded; the
	# message goes through %s so any backslashes in it are printed verbatim.
	ok() { printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"; }
	fail() { printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*"; }
	info() { printf '%b[INFO]%b %s\n' "${YELLOW}" "${NC}" "$*"; }

	# Banner naming the API base URL under test, followed by a blank line.
	printf '%s\n' \
	  "============================================================" \
	  " vLLM API smoke tests — ${BASE_URL}" \
	  "============================================================" \
	  ""

	# ---------------------------------------------------------------------------
	# Test 1: Health endpoint
	# ---------------------------------------------------------------------------
	# The request goes to BASE_URL with its trailing "/v1" stripped, plus "/health".
	info "Test 1: /health"
	# curl exits non-zero when it cannot connect; under `set -e` that would abort
	# the script at this assignment, before the failure is reported. `|| true`
	# keeps going (curl normally still prints "000" as the status in that case).
	HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL%/v1}/health") || true
	# Covers curl producing no output at all (e.g. curl is not installed).
	HTTP_CODE="${HTTP_CODE:-000}"
	if [[ "${HTTP_CODE}" == "200" ]]; then
	  ok "/health returned HTTP 200"
	else
	  fail "/health returned HTTP ${HTTP_CODE} (server may still be loading)"
	fi
	echo ""

	# ---------------------------------------------------------------------------
	# Test 2: Model list
	# ---------------------------------------------------------------------------
	# Fetches /v1/models, pretty-prints the reply, and stores the first model id
	# in MODEL_ID for the chat-completion requests later in the script.
	info "Test 2: GET /v1/models"
	# `|| true`: an unreachable server must be reported below, not abort the
	# script through `set -e`.
	MODELS_RESPONSE=$(curl -s "${BASE_URL}/models") || true
	# Pretty-print when the body is valid JSON; otherwise show it verbatim.
	python3 -m json.tool <<<"${MODELS_RESPONSE}" 2>/dev/null \
	  || printf '%s\n' "${MODELS_RESPONSE}"

	# Any parse failure (invalid JSON, missing or empty "data") leaves MODEL_ID empty.
	MODEL_ID=$(python3 -c \
	  "import sys,json; data=json.load(sys.stdin); print(data['data'][0]['id'])" \
	  <<<"${MODELS_RESPONSE}" 2>/dev/null) || MODEL_ID=""

	if [[ -n "${MODEL_ID}" ]]; then
	  ok "Model loaded: ${MODEL_ID}"
	else
	  fail "Could not parse model list"
	  # Fall back to a fixed id so the remaining requests are still attempted.
	  # NOTE(review): confirm this id matches the model the server actually serves.
	  MODEL_ID="GadflyII/Qwen3-Coder-Next-NVFP4"
	fi
	echo ""

	# ---------------------------------------------------------------------------
	# Test 3: Chat completion (reasoning off)
	# ---------------------------------------------------------------------------
	# Sends a one-question chat request with "enable_thinking": false and passes
	# when the first choice's message content is non-empty.
	info "Test 3: POST /v1/chat/completions (reasoning off)"
	# `|| true`: a failed request leaves RESPONSE empty and is reported below
	# instead of aborting the script through `set -e`.
	RESPONSE=$(curl -s \
	  -X POST "${BASE_URL}/chat/completions" \
	  -H "Content-Type: application/json" \
	  -d "{
	\"model\": \"${MODEL_ID}\",
	\"messages\": [{\"role\": \"user\", \"content\": \"Reply in one sentence: what is the capital of France?\"}],
	\"max_tokens\": 60,
	\"temperature\": 0.1,
	\"chat_template_kwargs\": {\"enable_thinking\": false}
	}") || true

	# Any parse failure (invalid JSON, missing keys) leaves CONTENT empty.
	CONTENT=$(python3 -c \
	  "import sys,json; r=json.load(sys.stdin); print(r['choices'][0]['message']['content'])" \
	  <<<"${RESPONSE}" 2>/dev/null) || CONTENT=""

	if [[ -n "${CONTENT}" ]]; then
	  ok "Chat completion works."
	  echo " >> ${CONTENT}"
	else
	  fail "No response"
	  # Show the raw reply for diagnosis: pretty-printed if JSON, verbatim otherwise.
	  python3 -m json.tool <<<"${RESPONSE}" 2>/dev/null \
	    || printf '%s\n' "${RESPONSE}"
	fi
	echo ""

	# ---------------------------------------------------------------------------
	# Test 4: Chat completion (reasoning on)
	# ---------------------------------------------------------------------------
	# Sends a chat request with "enable_thinking": true and prints the reasoning
	# field (truncated) together with the answer.
	info "Test 4: POST /v1/chat/completions (reasoning on)"
	# `|| true`: a failed request leaves RESPONSE empty and is reported below
	# instead of aborting the script through `set -e`.
	RESPONSE=$(curl -s \
	  -X POST "${BASE_URL}/chat/completions" \
	  -H "Content-Type: application/json" \
	  -d "{
	\"model\": \"${MODEL_ID}\",
	\"messages\": [{\"role\": \"user\", \"content\": \"What is 17 * 23? Show your work.\"}],
	\"max_tokens\": 1000,
	\"temperature\": 0.1,
	\"chat_template_kwargs\": {\"enable_thinking\": true}
	}") || true

	# Reads the reasoning text from "reasoning_content", falling back to
	# "reasoning"; only the first 80 characters of its repr are shown.
	# Any parse failure (invalid JSON, missing keys) leaves CONTENT empty.
	CONTENT=$(python3 -c \
	  "import sys,json; r=json.load(sys.stdin); m=r['choices'][0]['message']; thinking=m.get('reasoning_content') or m.get('reasoning',''); print('thinking:', repr(thinking)[:80], '\nanswer:', m.get('content',''))" \
	  <<<"${RESPONSE}" 2>/dev/null) || CONTENT=""

	if [[ -n "${CONTENT}" ]]; then
	  ok "Reasoning mode works."
	  echo "${CONTENT}"
	else
	  fail "No response from reasoning mode"
	fi
	echo ""

	# ---------------------------------------------------------------------------
	# Test 5: Code generation
	# ---------------------------------------------------------------------------
	# Asks for a small Python function and prints the first ten lines of the reply.
	info "Test 5: Code generation"
	# `|| true`: a failed request leaves RESPONSE empty and is reported below
	# instead of aborting the script through `set -e`.
	RESPONSE=$(curl -s \
	  -X POST "${BASE_URL}/chat/completions" \
	  -H "Content-Type: application/json" \
	  -d "{
	\"model\": \"${MODEL_ID}\",
	\"messages\": [{\"role\": \"user\", \"content\": \"Write a Python function that returns the nth Fibonacci number using memoization.\"}],
	\"max_tokens\": 300,
	\"temperature\": 0.1,
	\"chat_template_kwargs\": {\"enable_thinking\": false}
	}") || true

	# Any parse failure (invalid JSON, missing keys) leaves CODE empty.
	CODE=$(python3 -c \
	  "import sys,json; r=json.load(sys.stdin); print(r['choices'][0]['message']['content'])" \
	  <<<"${RESPONSE}" 2>/dev/null) || CODE=""

	if [[ -n "${CODE}" ]]; then
	  ok "Code generation works."
	  # Here-string rather than `echo | head`: with a long reply, head exiting
	  # early could SIGPIPE the writer, which `pipefail` + `set -e` treat as fatal.
	  head -n 10 <<<"${CODE}"
	  echo " ..."
	else
	  fail "No code response"
	fi
	echo ""

	# ---------------------------------------------------------------------------
	# Summary
	# ---------------------------------------------------------------------------
	# Prints the values to enter in Cline's "OpenAI Compatible" provider settings.
	printf '%s\n' \
	  "============================================================" \
	  " Cline configuration (OpenAI Compatible provider):" \
	  "" \
	  " Base URL : ${BASE_URL}" \
	  " Model ID : ${MODEL_ID}" \
	  " API Key : none (any non-empty string)" \
	  "============================================================"