Commit
·
b91238a
1
Parent(s):
9566d4f
Enable tool calling in vLLM server
Browse files
- Add --enable-auto-tool-choice flag
- Add --tool-call-parser json for JSON-based tool call parsing
- Update startup logging to show tool calling is enabled
- This enables OpenAI-compatible tool calling API support
- start-vllm.sh +5 -1
start-vllm.sh
CHANGED
|
@@ -22,10 +22,12 @@ echo "Model: $MODEL"
|
|
| 22 |
echo "Port: $PORT"
|
| 23 |
echo "Max Model Len: $MAX_MODEL_LEN"
|
| 24 |
echo "GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
|
|
|
|
| 25 |
echo "HF Token: ${HF_TOKEN:+set (${#HF_TOKEN} chars)}"
|
| 26 |
echo "=========================================="
|
| 27 |
|
| 28 |
# Execute vLLM server (use python3, not python)
|
|
|
|
| 29 |
exec python3 -m vllm.entrypoints.openai.api_server \
|
| 30 |
--model "$MODEL" \
|
| 31 |
--trust-remote-code \
|
|
@@ -33,4 +35,6 @@ exec python3 -m vllm.entrypoints.openai.api_server \
|
|
| 33 |
--max-model-len "$MAX_MODEL_LEN" \
|
| 34 |
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
|
| 35 |
--port "$PORT" \
|
| 36 | - --host 0.0.0.0
|
|
|
|
|
|
|
|
|
| 22 |
echo "Port: $PORT"
|
| 23 |
echo "Max Model Len: $MAX_MODEL_LEN"
|
| 24 |
echo "GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
|
| 25 | + echo "Tool Calling: ENABLED (auto-tool-choice, json parser)"
|
| 26 |
echo "HF Token: ${HF_TOKEN:+set (${#HF_TOKEN} chars)}"
|
| 27 |
echo "=========================================="
|
| 28 |
|
| 29 |
# Execute vLLM server (use python3, not python)
|
| 30 | + # Enable tool calling support for OpenAI-compatible API
|
| 31 |
exec python3 -m vllm.entrypoints.openai.api_server \
|
| 32 |
--model "$MODEL" \
|
| 33 |
--trust-remote-code \
|
|
|
|
| 35 |
--max-model-len "$MAX_MODEL_LEN" \
|
| 36 |
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
|
| 37 |
--port "$PORT" \
|
| 38 | + --host 0.0.0.0 \
|
| 39 | + --enable-auto-tool-choice \
|
| 40 | + --tool-call-parser json
|