#!/usr/bin/env python3
"""
VLLM Tool Calling Smoke Test
=============================

Quick verification that your VLLM instance supports tool calling.
Tests single tool calls, parallel tool calls, and multi-turn conversations.

Usage:
    python test_tool_calling.py --url http://localhost:8000 --model NousResearch/Hermes-3-Llama-3.1-70B-FP8
"""

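# NOTE: This script assumes the target vLLM server was launched with tool calling
# enabled. One illustrative launch command (flags per the vLLM docs; adjust the
# model and --tool-call-parser for your setup, this is a sketch, not prescriptive):
#
#   vllm serve NousResearch/Hermes-3-Llama-3.1-70B-FP8 \
#       --enable-auto-tool-choice \
#       --tool-call-parser hermes
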
import argparse
import json
import sys
import requests


def test_single_tool_call(url: str, model: str) -> bool:
    """Test basic single tool call."""
    print("Test 1: Single Tool Call")
    print("-" * 40)

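    # OpenAI-compatible request body: one function tool plus "auto" tool choice,
    # so the model decides whether to emit a tool call.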
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": "What's the weather in San Francisco? Use the get_weather tool."}
        ],
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string", "description": "City name"},
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                    },
                    "required": ["location"]
                }
            }
        }],
        "tool_choice": "auto",
        "temperature": 0.1,
        "max_tokens": 500
    }

    try:
        response = requests.post(f"{url}/v1/chat/completions", json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        message = result["choices"][0]["message"]

        if message.get("tool_calls"):
            tc = message["tool_calls"][0]
            print(f"  Tool:      {tc['function']['name']}")
            print(f"  Arguments: {tc['function']['arguments']}")
            print("  PASSED\n")
            return True
        else:
            print(f"  Model returned text: {message.get('content', '(empty)')[:100]}")
            print("  FAILED\n")
            return False

    except Exception as e:
        print(f"  ERROR: {e}")
        print("  FAILED\n")
        return False


def test_parallel_tool_calls(url: str, model: str) -> bool:
    """Test if model can make multiple tool calls in one turn."""
    print("Test 2: Parallel Tool Calls")
    print("-" * 40)

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": "Get the weather in Tokyo AND London."}
        ],
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string"}
                    },
                    "required": ["location"]
                }
            }
        }],
        "tool_choice": "auto",
        "temperature": 0.1,
        "max_tokens": 500
    }

    try:
        response = requests.post(f"{url}/v1/chat/completions", json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        message = result["choices"][0]["message"]

        if message.get("tool_calls") and len(message["tool_calls"]) >= 2:
            for tc in message["tool_calls"]:
                print(f"  Tool: {tc['function']['name']}({tc['function']['arguments']})")
            print("  PASSED\n")
            return True
        elif message.get("tool_calls"):
            print(f"  Only {len(message['tool_calls'])} tool call(s) — expected 2+")
            print("  PARTIAL (some models serialize parallel calls differently)\n")
            return True  # Still counts as working
        else:
            print(f"  Model returned text: {message.get('content', '(empty)')[:100]}")
            print("  FAILED\n")
            return False

    except Exception as e:
        print(f"  ERROR: {e}")
        print("  FAILED\n")
        return False


def test_multi_turn(url: str, model: str) -> bool:
    """Test multi-turn: tool call -> tool result -> final answer."""
    print("Test 3: Multi-Turn Conversation")
    print("-" * 40)

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather for a location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"]
            }
        }
    }]

    messages = [
        {"role": "user", "content": "What's the weather in Paris?"}
    ]

    try:
        # Turn 1: Get tool call
        response = requests.post(
            f"{url}/v1/chat/completions",
            json={"model": model, "messages": messages, "tools": tools, "tool_choice": "auto",
                  "temperature": 0.1, "max_tokens": 500},
            timeout=60
        )
        response.raise_for_status()
        msg1 = response.json()["choices"][0]["message"]

        if not msg1.get("tool_calls"):
            print("  Turn 1: No tool calls")
            print("  FAILED\n")
            return False

        print(f"  Turn 1: {msg1['tool_calls'][0]['function']['name']}() called")
        messages.append(msg1)

        # Add tool result
        messages.append({
            "role": "tool",
            "tool_call_id": msg1["tool_calls"][0]["id"],
            "content": json.dumps({"temperature": 18, "condition": "Sunny", "unit": "celsius"})
        })

        # Turn 2: Get final answer
        response = requests.post(
            f"{url}/v1/chat/completions",
            json={"model": model, "messages": messages, "tools": tools,
                  "temperature": 0.1, "max_tokens": 500},
            timeout=60
        )
        response.raise_for_status()
        msg2 = response.json()["choices"][0]["message"]

        if msg2.get("content"):
            print(f"  Turn 2: {msg2['content'][:100]}")
            print("  PASSED\n")
            return True
        else:
            print("  Turn 2: Empty response")
            print("  FAILED\n")
            return False

    except Exception as e:
        print(f"  ERROR: {e}")
        print("  FAILED\n")
        return False


def test_connection(url: str) -> bool:
    """Test basic connectivity to VLLM."""
    print("Test 0: Connection")
    print("-" * 40)
    try:
        response = requests.get(f"{url}/v1/models", timeout=10)
        response.raise_for_status()
        models = response.json()
        model_ids = [m["id"] for m in models.get("data", [])]
        print(f"  Available models: {', '.join(model_ids)}")
        print("  PASSED\n")
        return True
    except Exception as e:
        print(f"  Cannot connect to {url}: {e}")
        print("  FAILED\n")
        return False


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="VLLM tool calling smoke test")
    parser.add_argument("--url", default="http://localhost:8000", help="VLLM server URL")
    parser.add_argument("--model", default="NousResearch/Hermes-3-Llama-3.1-70B-FP8")
    args = parser.parse_args()

    print("=" * 60)
    print(f"VLLM Tool Calling Smoke Test")
    print(f"Server: {args.url}")
    print(f"Model:  {args.model}")
    print("=" * 60 + "\n")

    results = {}

    # Test connectivity first
    if not test_connection(args.url):
        print("Cannot connect to VLLM server. Is it running?")
        sys.exit(1)

    # Run tests
    results["Single tool call"] = test_single_tool_call(args.url, args.model)
    results["Parallel tool calls"] = test_parallel_tool_calls(args.url, args.model)
    results["Multi-turn"] = test_multi_turn(args.url, args.model)

    # Summary
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    for name, passed in results.items():
        status = "PASSED" if passed else "FAILED"
        print(f"  {name}: {status}")

    all_passed = all(results.values())
    print(f"\n{'All tests passed!' if all_passed else 'Some tests failed.'}")

    if not results.get("Single tool call"):
        print("\nTroubleshooting:")
        print("  1. Is --enable-auto-tool-choice set in VLLM launch?")
        print("  2. Is --tool-call-parser set correctly? (hermes/llama3_json/mistral)")
        print("  3. Is --max-model-len large enough? (128K recommended)")
        print("  4. Check VLLM server logs for errors")

    sys.exit(0 if all_passed else 1)