#!/bin/bash
#
# Start an OpenAI-compatible vLLM server for the configured model.
# Every setting below can be overridden via environment variables.
|
set -e |
|
# --- Model and server configuration ---
MODEL="${MODEL:-DragonLLM/Qwen-Open-Finance-R-8B}"
PORT="${PORT:-8000}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
DTYPE="${DTYPE:-bfloat16}"
# Default the tensor-parallel size to the platform-provided GPU count
# (KOYEB_GPU_COUNT), falling back to a single GPU.
TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-${KOYEB_GPU_COUNT:-1}}"
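# Example launch with overrides (illustrative only; the script filename and
# model name are placeholders, not names defined in this repository):
#   MODEL="<org>/<model>" PORT=9000 MAX_MODEL_LEN=16384 ./start-vllm.sh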
|
# --- Hugging Face authentication ---
# Accept a token from any of the supported variable names (HF_TOKEN_LC2 takes
# precedence, then HF_TOKEN, then HUGGING_FACE_HUB_TOKEN) and export it under
# both names recognized by Hugging Face tooling.
export HF_TOKEN="${HF_TOKEN_LC2:-${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-}}}"
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
# Log the effective configuration (only the token length is printed, never its value).
echo "=========================================="
echo "vLLM OpenAI Server - Starting"
echo "=========================================="
echo "Model: $MODEL"
echo "Port: $PORT"
echo "Max Model Len: $MAX_MODEL_LEN"
echo "Dtype: $DTYPE"
echo "GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
echo "Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
echo "HF Token: ${HF_TOKEN:+set (${#HF_TOKEN} chars)}"
echo "=========================================="
|
# --- Assemble vLLM arguments ---
VLLM_ARGS=(
  --model "$MODEL"
  --trust-remote-code
  --dtype "$DTYPE"
  --max-model-len "$MAX_MODEL_LEN"
  --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE"
  --port "$PORT"
  --host 0.0.0.0
)
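# Further flags can be appended to the same array before launch, for example
# requiring clients to send an API key (VLLM_API_KEY is a hypothetical
# variable used here only for illustration):
#   VLLM_ARGS+=(--api-key "$VLLM_API_KEY")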
|
# --- Optional tool calling ---
# ENABLE_AUTO_TOOL_CHOICE toggles automatic tool selection; TOOL_CALL_PARSER
# must match the model's tool-call format (the default here is hermes).
ENABLE_AUTO_TOOL_CHOICE="${ENABLE_AUTO_TOOL_CHOICE:-true}"
TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-hermes}"
|
if [ "${ENABLE_AUTO_TOOL_CHOICE}" = "true" ]; then
  VLLM_ARGS+=(--enable-auto-tool-choice --tool-call-parser "$TOOL_CALL_PARSER")
  echo "Tool Calling: ENABLED (parser: $TOOL_CALL_PARSER)"
else
  echo "Tool Calling: DISABLED"
fi
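# Once the server is up, the OpenAI-compatible API can be exercised with a
# request like the one below against the default port (illustrative only, not
# executed by this script; add a "tools" array to the payload to try tool
# calling). A liveness probe is also available at /health on the same port.
#   curl -s http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "DragonLLM/Qwen-Open-Finance-R-8B",
#          "messages": [{"role": "user", "content": "Give a one-line outlook for EUR/USD."}]}'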
|
echo "==========================================" |
|
# Replace the shell with the vLLM server process so it receives signals directly.
exec python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}" |