| | |
| | """ |
| | VLLM Tool Calling Smoke Test |
| | ============================= |
| | |
| | Quick verification that your VLLM instance supports tool calling. |
| | Tests single tool calls, parallel tool calls, and multi-turn conversations. |
| | |
| | Usage: |
| | python test_tool_calling.py --url http://localhost:8000 --model NousResearch/Hermes-3-Llama-3.1-70B-FP8 |
| | """ |
| |
|
| | import argparse |
| | import json |
| | import sys |
| | import requests |
| |
|
| |
|
def test_single_tool_call(url: str, model: str) -> bool:
    """Verify the model emits a single tool call when explicitly asked to.

    Sends one user message that requests the ``get_weather`` tool and checks
    that the first choice's message contains at least one tool call.

    Args:
        url: Base URL of the vLLM server (no trailing slash expected).
        model: Model identifier to pass in the request payload.

    Returns:
        True if a tool call was returned, False on plain-text reply or error.
    """
    print("Test 1: Single Tool Call")
    print("-" * 40)

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": "What's the weather in San Francisco? Use the get_weather tool."}
        ],
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string", "description": "City name"},
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                    },
                    "required": ["location"]
                }
            }
        }],
        "tool_choice": "auto",
        # Low temperature keeps the smoke test deterministic-ish.
        "temperature": 0.1,
        "max_tokens": 500
    }

    try:
        response = requests.post(f"{url}/v1/chat/completions", json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        message = result["choices"][0]["message"]

        if message.get("tool_calls"):
            tc = message["tool_calls"][0]
            print(f" Tool: {tc['function']['name']}")
            print(f" Arguments: {tc['function']['arguments']}")
            print(" PASSED\n")
            return True
        else:
            # OpenAI-style responses may carry "content": null; .get's default
            # only covers a *missing* key, so guard with `or` before slicing.
            print(f" Model returned text: {(message.get('content') or '(empty)')[:100]}")
            print(" FAILED\n")
            return False

    except Exception as e:
        # Broad catch is deliberate: this is a smoke test that must keep
        # running and report FAILED rather than crash on any network/JSON error.
        print(f" ERROR: {e}")
        print(" FAILED\n")
        return False
| |
|
| |
|
def test_parallel_tool_calls(url: str, model: str) -> bool:
    """Verify the model can emit multiple tool calls in a single turn.

    Asks for the weather in two cities at once; a fully passing run returns
    two or more tool calls. A single tool call is treated as a soft pass
    because some models serialize parallel calls across turns.

    Args:
        url: Base URL of the vLLM server.
        model: Model identifier to pass in the request payload.

    Returns:
        True on 1+ tool calls (2+ is a full pass), False on text-only reply
        or any request error.
    """
    print("Test 2: Parallel Tool Calls")
    print("-" * 40)

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": "Get the weather in Tokyo AND London."}
        ],
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string"}
                    },
                    "required": ["location"]
                }
            }
        }],
        "tool_choice": "auto",
        "temperature": 0.1,
        "max_tokens": 500
    }

    try:
        response = requests.post(f"{url}/v1/chat/completions", json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        message = result["choices"][0]["message"]

        if message.get("tool_calls") and len(message["tool_calls"]) >= 2:
            for tc in message["tool_calls"]:
                print(f" Tool: {tc['function']['name']}({tc['function']['arguments']})")
            print(" PASSED\n")
            return True
        elif message.get("tool_calls"):
            print(f" Only {len(message['tool_calls'])} tool call(s) — expected 2+")
            print(" PARTIAL (some models serialize parallel calls differently)\n")
            return True
        else:
            # "content" can be present but null; `or` prevents slicing None.
            print(f" Model returned text: {(message.get('content') or '(empty)')[:100]}")
            print(" FAILED\n")
            return False

    except Exception as e:
        # Smoke test: report and continue rather than propagate.
        print(f" ERROR: {e}")
        print(" FAILED\n")
        return False
| |
|
| |
|
def test_multi_turn(url: str, model: str) -> bool:
    """Exercise the full tool-use loop: call -> injected result -> answer.

    Turn 1 must produce a tool call; a fabricated weather result is then fed
    back as a ``tool`` message, and turn 2 must produce non-empty text.

    Args:
        url: Base URL of the vLLM server.
        model: Model identifier for both requests.

    Returns:
        True if both turns behave as expected, False otherwise.
    """
    print("Test 3: Multi-Turn Conversation")
    print("-" * 40)

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather for a location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"]
            }
        }
    }]

    conversation = [
        {"role": "user", "content": "What's the weather in Paris?"}
    ]

    try:
        # --- Turn 1: expect the model to request the tool. ---
        first_payload = {
            "model": model,
            "messages": conversation,
            "tools": tools,
            "tool_choice": "auto",
            "temperature": 0.1,
            "max_tokens": 500,
        }
        first = requests.post(
            f"{url}/v1/chat/completions", json=first_payload, timeout=60
        )
        first.raise_for_status()
        assistant_msg = first.json()["choices"][0]["message"]

        calls = assistant_msg.get("tool_calls")
        if not calls:
            print(" Turn 1: No tool calls")
            print(" FAILED\n")
            return False

        print(f" Turn 1: {calls[0]['function']['name']}() called")
        conversation.append(assistant_msg)

        # Feed back a canned tool result keyed to the call the model made.
        fake_result = {"temperature": 18, "condition": "Sunny", "unit": "celsius"}
        conversation.append({
            "role": "tool",
            "tool_call_id": calls[0]["id"],
            "content": json.dumps(fake_result)
        })

        # --- Turn 2: expect a natural-language answer using the result. ---
        second_payload = {
            "model": model,
            "messages": conversation,
            "tools": tools,
            "temperature": 0.1,
            "max_tokens": 500,
        }
        second = requests.post(
            f"{url}/v1/chat/completions", json=second_payload, timeout=60
        )
        second.raise_for_status()
        final_msg = second.json()["choices"][0]["message"]

        if not final_msg.get("content"):
            print(" Turn 2: Empty response")
            print(" FAILED\n")
            return False

        print(f" Turn 2: {final_msg['content'][:100]}")
        print(" PASSED\n")
        return True

    except Exception as e:
        print(f" ERROR: {e}")
        print(" FAILED\n")
        return False
| |
|
| |
|
def test_connection(url: str) -> bool:
    """Check that the vLLM server is reachable and list its models.

    Args:
        url: Base URL of the vLLM server.

    Returns:
        True if ``GET /v1/models`` succeeds, False otherwise.
    """
    print("Test 0: Connection")
    print("-" * 40)
    try:
        resp = requests.get(f"{url}/v1/models", timeout=10)
        resp.raise_for_status()
        listing = resp.json()
        available = [entry["id"] for entry in listing.get("data", [])]
        print(f" Available models: {', '.join(available)}")
        print(" PASSED\n")
        return True
    except Exception as e:
        print(f" Cannot connect to {url}: {e}")
        print(" FAILED\n")
        return False
| |
|
| |
|
if __name__ == "__main__":
    # CLI: point the smoke test at a running vLLM server.
    parser = argparse.ArgumentParser(description="VLLM tool calling smoke test")
    parser.add_argument("--url", default="http://localhost:8000", help="VLLM server URL")
    parser.add_argument("--model", default="NousResearch/Hermes-3-Llama-3.1-70B-FP8")
    args = parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("VLLM Tool Calling Smoke Test")
    print(f"Server: {args.url}")
    print(f"Model: {args.model}")
    print(banner + "\n")

    # Connectivity is a hard prerequisite — bail out before the real tests.
    if not test_connection(args.url):
        print("Cannot connect to VLLM server. Is it running?")
        sys.exit(1)

    results = {
        "Single tool call": test_single_tool_call(args.url, args.model),
        "Parallel tool calls": test_parallel_tool_calls(args.url, args.model),
        "Multi-turn": test_multi_turn(args.url, args.model),
    }

    print(banner)
    print("SUMMARY")
    print(banner)
    for label, ok in results.items():
        print(f" {label}: {'PASSED' if ok else 'FAILED'}")

    all_passed = all(results.values())
    print()
    print("All tests passed!" if all_passed else "Some tests failed.")

    # The single-call test is the canary for server-side misconfiguration.
    if not results.get("Single tool call"):
        print("\nTroubleshooting:")
        print(" 1. Is --enable-auto-tool-choice set in VLLM launch?")
        print(" 2. Is --tool-call-parser set correctly? (hermes/llama3_json/mistral)")
        print(" 3. Is --max-model-len large enough? (128K recommended)")
        print(" 4. Check VLLM server logs for errors")

    # Shell-friendly exit status: 0 = all green, 1 = something failed.
    sys.exit(0 if all_passed else 1)
| |
|