File size: 2,055 Bytes
e3878fa
e89a0e6
7239fe3
 
e3878fa
e89a0e6
0d30de3
e89a0e6
 
 
 
 
 
7239fe3
0d30de3
e89a0e6
 
 
0d30de3
 
e89a0e6
0d30de3
e89a0e6
 
 
 
7239fe3
e89a0e6
e3878fa
 
7239fe3
1055891
 
 
 
 
 
7239fe3
1055891
 
 
 
7239fe3
 
 
 
 
 
 
 
 
 
1055891
7239fe3
1055891
 
7239fe3
 
 
1055891
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/bin/bash
# vLLM OpenAI-compatible API server startup script
# Compatible with Koyeb GPU deployment patterns
# Based on Koyeb's one-click vLLM + Qwen deployment templates

# Abort the script as soon as any command exits with a non-zero status.
set -o errexit

# Runtime settings. Each variable keeps a non-empty value inherited from the
# environment; otherwise it is assigned the default shown here.
: "${MODEL:=DragonLLM/Qwen-Open-Finance-R-8B}"
: "${PORT:=8000}"
: "${MAX_MODEL_LEN:=8192}"
: "${GPU_MEMORY_UTILIZATION:=0.90}"
: "${DTYPE:=bfloat16}"
# Tensor parallel size falls back to KOYEB_GPU_COUNT, and then to 1.
: "${TENSOR_PARALLEL_SIZE:=${KOYEB_GPU_COUNT:-1}}"

# Hugging Face token resolution, highest priority first:
#   1. HF_TOKEN_LC2 (the model access token)
#   2. HF_TOKEN
#   3. HUGGING_FACE_HUB_TOKEN
# The first non-empty value wins; if none is set the token is the empty string.
if [ -n "${HF_TOKEN_LC2:-}" ]; then
    HF_TOKEN="$HF_TOKEN_LC2"
elif [ -z "${HF_TOKEN:-}" ]; then
    HF_TOKEN="${HUGGING_FACE_HUB_TOKEN:-}"
fi
export HF_TOKEN

# Mirror the resolved token under the second variable name so both agree.
HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
export HUGGING_FACE_HUB_TOKEN

# Print the effective startup configuration to stdout.
# Globals (read): MODEL, PORT, MAX_MODEL_LEN, DTYPE, GPU_MEMORY_UTILIZATION,
#                 TENSOR_PARALLEL_SIZE, HF_TOKEN
# Outputs: a banner with one "Label: value" line per setting.
# The token itself is never printed: only whether it is set and its length.
# An absent token is reported explicitly as "not set" rather than as a blank
# value, so the log line is unambiguous.
print_startup_banner() {
    local rule="=========================================="
    local hf_token_status="not set"
    if [ -n "${HF_TOKEN:-}" ]; then
        hf_token_status="set (${#HF_TOKEN} chars)"
    fi

    printf '%s\n' "$rule"
    printf '%s\n' "vLLM OpenAI Server - Starting"
    printf '%s\n' "$rule"
    printf '%s\n' "Model: ${MODEL:-}"
    printf '%s\n' "Port: ${PORT:-}"
    printf '%s\n' "Max Model Len: ${MAX_MODEL_LEN:-}"
    printf '%s\n' "DType: ${DTYPE:-}"
    printf '%s\n' "GPU Memory Utilization: ${GPU_MEMORY_UTILIZATION:-}"
    printf '%s\n' "Tensor Parallel Size: ${TENSOR_PARALLEL_SIZE:-}"
    printf '%s\n' "HF Token: $hf_token_status"
    printf '%s\n' "$rule"
}

print_startup_banner

# Assemble the argument list passed to the vLLM API server.
# An array is used so every value stays a single word regardless of content.
# Entries are appended one option at a time, in the order they are passed.
VLLM_ARGS=()
VLLM_ARGS+=(--model "$MODEL")
VLLM_ARGS+=(--trust-remote-code)
VLLM_ARGS+=(--dtype "$DTYPE")
VLLM_ARGS+=(--max-model-len "$MAX_MODEL_LEN")
VLLM_ARGS+=(--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION")
VLLM_ARGS+=(--tensor-parallel-size "$TENSOR_PARALLEL_SIZE")
VLLM_ARGS+=(--port "$PORT")
VLLM_ARGS+=(--host 0.0.0.0)

# Tool Calling Support
# ENABLED BY DEFAULT for Qwen models (using hermes parser)
# Set ENABLE_AUTO_TOOL_CHOICE=false to disable
# For Qwen models, the default parser is 'hermes'
ENABLE_AUTO_TOOL_CHOICE="${ENABLE_AUTO_TOOL_CHOICE:-true}"
TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-hermes}"

# Return 0 when $1 is an affirmative flag value, 1 otherwise.
# Accepts true / 1 / yes / on in any letter case, so values such as "True"
# or "TRUE" enable the feature instead of silently disabling it.
# Any other value, including the empty string, is treated as "off".
is_truthy() {
    case "$1" in
        [Tt][Rr][Uu][Ee] | 1 | [Yy][Ee][Ss] | [Oo][Nn]) return 0 ;;
        *) return 1 ;;
    esac
}

if is_truthy "$ENABLE_AUTO_TOOL_CHOICE"; then
    # Append to the shared argument array; both flags are passed together.
    VLLM_ARGS+=(--enable-auto-tool-choice --tool-call-parser "$TOOL_CALL_PARSER")
    echo "Tool Calling: ENABLED (parser: $TOOL_CALL_PARSER)"
else
    echo "Tool Calling: DISABLED"
fi

# Close the startup banner.
printf '%s\n' "=========================================="

# Launch the vLLM OpenAI-compatible API server with the assembled arguments.
# `exec` replaces this shell process with the server, so no statement after
# this line is ever reached.
vllm_server_module="vllm.entrypoints.openai.api_server"
exec python3 -m "$vllm_server_module" "${VLLM_ARGS[@]}"