# open-finance-llm-8b / start-vllm.sh
# Author: jeanbaptdzd
# Commit: fix: vLLM tool calling - enable by default with hermes parser (7239fe3)
#!/bin/bash
# vLLM OpenAI-compatible API server startup script
# Compatible with Koyeb GPU deployment patterns
# Based on Koyeb's one-click vLLM + Qwen deployment templates
#
# Environment variables (all optional, defaults shown below):
#   MODEL, PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION, DTYPE
#   TENSOR_PARALLEL_SIZE   - defaults to KOYEB_GPU_COUNT, then 1
#   HF_TOKEN_LC2 / HF_TOKEN / HUGGING_FACE_HUB_TOKEN - first set wins
#   ENABLE_AUTO_TOOL_CHOICE (default: true), TOOL_CALL_PARSER (default: hermes)

# Strict mode: exit on error, on unset variables, and on pipeline failures.
# All expansions below supply ${VAR:-default} fallbacks, so -u is safe.
set -euo pipefail

# Configuration from environment (with defaults)
MODEL="${MODEL:-DragonLLM/Qwen-Open-Finance-R-8B}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
DTYPE="${DTYPE:-bfloat16}"
# One tensor-parallel shard per GPU; Koyeb exposes the count as KOYEB_GPU_COUNT.
TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-${KOYEB_GPU_COUNT:-1}}"

# HF Token - HF_TOKEN_LC2 is the model access token (priority).
# Export under both names the Hugging Face tooling recognizes.
export HF_TOKEN="${HF_TOKEN_LC2:-${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-}}}"
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"

echo "=========================================="
echo "vLLM OpenAI Server - Starting"
echo "=========================================="
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Max Model Len: $MAX_MODEL_LEN"
echo "GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
echo "Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
# Never log the token itself — only whether it is present and its length.
# (Explicit "not set" instead of an ambiguous empty line when absent.)
if [[ -n "$HF_TOKEN" ]]; then
  echo "HF Token: set (${#HF_TOKEN} chars)"
else
  echo "HF Token: not set"
fi
echo "=========================================="

# Build vLLM arguments as an array so values containing spaces stay intact.
VLLM_ARGS=(
  --model "$MODEL"
  --trust-remote-code
  --dtype "$DTYPE"
  --max-model-len "$MAX_MODEL_LEN"
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE"
  --port "$PORT"
  --host 0.0.0.0
)

# Tool Calling Support
# ENABLED BY DEFAULT for Qwen models (using hermes parser)
# Set ENABLE_AUTO_TOOL_CHOICE=false to disable
# For Qwen models, the default parser is 'hermes'
ENABLE_AUTO_TOOL_CHOICE="${ENABLE_AUTO_TOOL_CHOICE:-true}"
TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-hermes}"
if [[ "$ENABLE_AUTO_TOOL_CHOICE" == "true" ]]; then
  VLLM_ARGS+=(--enable-auto-tool-choice --tool-call-parser "$TOOL_CALL_PARSER")
  echo "Tool Calling: ENABLED (parser: $TOOL_CALL_PARSER)"
else
  echo "Tool Calling: DISABLED"
fi
echo "=========================================="

# Replace the shell with the server process so signals (SIGTERM from the
# orchestrator) reach vLLM directly.
exec python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"