vllm-tool-calling-guide / examples /basic_tool_call.py
Joshua Odmark
Initial release: VLLM tool calling guide for open source models
634c038
#!/usr/bin/env python3
"""
Basic Tool Calling with VLLM
============================
Minimal working example of tool calling via VLLM's OpenAI-compatible API.
Works with any model that supports tool calling (Hermes-3, Llama 3.3, Qwen2, Mistral).
Usage:
python basic_tool_call.py --url http://localhost:8000 --model NousResearch/Hermes-3-Llama-3.1-70B-FP8
"""
import argparse
import json
import requests
def make_tool_call(vllm_url: str, model: str):
    """Send a tool-enabled chat completion request to VLLM.

    Posts one user message together with a ``get_weather`` tool definition
    to the server's ``/v1/chat/completions`` endpoint and prints either the
    tool calls the model produced or its plain-text reply.

    Args:
        vllm_url: Base URL of the VLLM server, e.g. ``http://localhost:8000``.
            A trailing slash is tolerated.
        model: Name of the model to put in the request payload.

    Returns:
        The ``tool_calls`` list from the first choice's message, or ``None``
        when the message carries no tool calls.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``). Other ``requests`` errors such
            as connection failures or timeouts propagate unchanged.
    """
    # Define a simple tool
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City name, e.g. 'San Francisco'",
                        },
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "Temperature unit",
                        },
                    },
                    "required": ["location"],
                },
            },
        }
    ]

    # Send request
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": "What's the weather like in San Francisco?",
            }
        ],
        "tools": tools,
        "tool_choice": "auto",
        "temperature": 0.1,
        "max_tokens": 500,
    }
    # Strip a trailing slash so "http://host:8000/" does not become
    # "http://host:8000//v1/chat/completions".
    endpoint = f"{vllm_url.rstrip('/')}/v1/chat/completions"
    response = requests.post(endpoint, json=payload, timeout=60)
    response.raise_for_status()
    result = response.json()

    # Extract tool calls; a missing key, null, or empty list all mean "none".
    message = result["choices"][0]["message"]
    tool_calls = message.get("tool_calls")
    if tool_calls:
        print("Tool calls received:")
        for tc in tool_calls:
            print(f" Function: {tc['function']['name']}")
            print(f" Arguments: {tc['function']['arguments']}")
        return tool_calls

    print("No tool calls — model responded with text:")
    # `or` rather than a .get() default: the key may be present but null or
    # empty, in which case the placeholder should still be shown.
    print(f" {message.get('content') or '(empty)'}")
    return None
def multi_turn_tool_call(vllm_url: str, model: str):
    """Demonstrate a full tool calling conversation: request -> execute -> respond.

    Turn 1 asks the model a weather question with a ``get_weather`` tool
    available. Turn 2 appends one simulated ``tool`` message per tool call
    the model made. Turn 3 sends the extended conversation back and prints
    the model's final reply. If the model makes no tool calls in turn 1 the
    function prints a notice and stops.

    Args:
        vllm_url: Base URL of the VLLM server, e.g. ``http://localhost:8000``.
            A trailing slash is tolerated.
        model: Name of the model to put in the request payloads.

    Returns:
        None. All results are printed.

    Raises:
        requests.HTTPError: If either request is answered with an error
            status (raised by ``raise_for_status``). Other ``requests``
            errors such as connection failures or timeouts propagate
            unchanged.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string"},
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    },
                    "required": ["location"],
                },
            },
        }
    ]
    messages = [
        {"role": "user", "content": "What's the weather in Tokyo and London?"}
    ]
    # Strip a trailing slash so "http://host:8000/" does not become
    # "http://host:8000//v1/chat/completions".
    endpoint = f"{vllm_url.rstrip('/')}/v1/chat/completions"

    # Turn 1: Get tool calls
    print("\n--- Turn 1: Request ---")
    response = requests.post(
        endpoint,
        json={
            "model": model,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "temperature": 0.1,
            "max_tokens": 500,
        },
        timeout=60,
    )
    response.raise_for_status()
    assistant_message = response.json()["choices"][0]["message"]
    tool_calls = assistant_message.get("tool_calls")
    if not tool_calls:
        print("Model did not call tools.")
        return

    # Show tool calls
    for tc in tool_calls:
        print(f" Tool: {tc['function']['name']}({tc['function']['arguments']})")

    # Add assistant message to conversation so the tool results that follow
    # can refer back to its tool call ids.
    messages.append(assistant_message)

    # Turn 2: Simulate tool responses
    print("\n--- Turn 2: Tool Responses ---")
    for tc in tool_calls:
        # In a real app, you'd execute the actual function here
        fake_result = json.dumps({
            "temperature": 22,
            "unit": "celsius",
            "condition": "Partly cloudy",
            "location": _location_from_tool_arguments(tc["function"]["arguments"]),
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc["id"],
            "content": fake_result,
        })
        print(f" Sent result for {tc['function']['name']}: {fake_result}")

    # Turn 3: Get final response
    print("\n--- Turn 3: Final Response ---")
    response = requests.post(
        endpoint,
        json={
            "model": model,
            "messages": messages,
            "tools": tools,
            "temperature": 0.1,
            "max_tokens": 500,
        },
        timeout=60,
    )
    response.raise_for_status()
    final_message = response.json()["choices"][0]["message"]
    # `or` rather than a .get() default: the key may be present but null or
    # empty, in which case the placeholder should still be shown.
    print(f" {final_message.get('content') or '(empty)'}")


def _location_from_tool_arguments(raw_arguments):
    """Return the ``location`` value of a tool call's arguments, or ``"Unknown"``.

    The arguments are generated by the model, so they are not guaranteed to
    be valid JSON or to decode to an object. Anything that cannot be read as
    a mapping yields ``"Unknown"`` instead of raising.
    """
    try:
        parsed = json.loads(raw_arguments)
    except (TypeError, ValueError):
        # TypeError: not a str/bytes value (e.g. None or an already-decoded
        # dict). ValueError: malformed JSON (JSONDecodeError subclasses it).
        parsed = raw_arguments
    if isinstance(parsed, dict):
        return parsed.get("location", "Unknown")
    return "Unknown"
if __name__ == "__main__":
    # Command-line options: which server to talk to and which model to ask for.
    cli = argparse.ArgumentParser(description="Test VLLM tool calling")
    cli.add_argument(
        "--url",
        default="http://localhost:8000",
        help="VLLM server URL",
    )
    cli.add_argument(
        "--model",
        default="NousResearch/Hermes-3-Llama-3.1-70B-FP8",
        help="Model name",
    )
    options = cli.parse_args()

    # Each demo runs under a banner; every banner after the first is
    # preceded by a blank line.
    rule = "=" * 60
    demos = (
        ("Test 1: Single Tool Call", make_tool_call),
        ("Test 2: Multi-Turn Tool Calling", multi_turn_tool_call),
    )
    for position, (title, run_demo) in enumerate(demos):
        print(rule if position == 0 else "\n" + rule)
        print(title)
        print(rule)
        run_demo(options.url, options.model)