examples/basic_tool_call.py · joshuaeric/vllm-tool-calling-guide at main

File size: 5,797 Bytes

634c038

#!/usr/bin/env python3
"""
Basic Tool Calling with VLLM
============================

Minimal working example of tool calling via VLLM's OpenAI-compatible API.
Works with any model that supports tool calling (Hermes-3, Llama 3.3, Qwen2, Mistral).

Usage:
    python basic_tool_call.py --url http://localhost:8000 --model NousResearch/Hermes-3-Llama-3.1-70B-FP8
"""

import argparse
import json
import requests


def make_tool_call(vllm_url: str, model: str):
    """Send one tool-enabled chat completion request and print the outcome.

    Posts a single user question about the weather in San Francisco, along
    with a ``get_weather`` tool definition, to
    ``{vllm_url}/v1/chat/completions`` and prints either the tool calls the
    model produced or its plain-text reply.

    Args:
        vllm_url: Base URL of the server, e.g. ``http://localhost:8000``.
            A trailing slash is tolerated.
        model: Model name placed in the request payload.

    Returns:
        The ``tool_calls`` list from the first choice's message, or ``None``
        when that message contains no tool calls.

    Raises:
        requests.HTTPError: If the server replies with an error status
            (via ``raise_for_status``).
    """

    # Define a simple tool
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City name, e.g. 'San Francisco'"
                        },
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "Temperature unit"
                        }
                    },
                    "required": ["location"]
                }
            }
        }
    ]

    # Send request
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": "What's the weather like in San Francisco?"
            }
        ],
        "tools": tools,
        "tool_choice": "auto",
        "temperature": 0.1,
        "max_tokens": 500
    }

    # Strip a trailing slash so "http://host:8000/" does not become
    # "http://host:8000//v1/chat/completions".
    endpoint = f"{vllm_url.rstrip('/')}/v1/chat/completions"
    response = requests.post(endpoint, json=payload, timeout=60)
    response.raise_for_status()
    result = response.json()

    # Extract tool calls
    message = result["choices"][0]["message"]
    tool_calls = message.get("tool_calls")

    if tool_calls:
        print("Tool calls received:")
        for tc in tool_calls:
            print(f"  Function: {tc['function']['name']}")
            print(f"  Arguments: {tc['function']['arguments']}")
        return tool_calls

    print("No tool calls — model responded with text:")
    # "content" can be present but null; a .get() default only covers a
    # missing key, so fall back on any falsy value instead of printing "None".
    print(f"  {message.get('content') or '(empty)'}")
    return None


def multi_turn_tool_call(vllm_url: str, model: str) -> None:
    """Demonstrate a full tool calling conversation: request -> execute -> respond.

    Runs three steps against ``{vllm_url}/v1/chat/completions``:

    1. Ask about the weather in Tokyo and London with a ``get_weather`` tool
       available, and print the tool calls the model returns.
    2. Append the assistant message plus one fabricated ``tool`` result per
       tool call to the conversation (no real weather lookup is performed).
    3. Send the extended conversation back and print the final reply.

    If the model returns no tool calls in step 1, a notice is printed and the
    function returns early.

    Args:
        vllm_url: Base URL of the server, e.g. ``http://localhost:8000``.
            A trailing slash is tolerated.
        model: Model name placed in each request payload.

    Raises:
        requests.HTTPError: If the server replies with an error status
            (via ``raise_for_status``).
    """

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string"},
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                    },
                    "required": ["location"]
                }
            }
        }
    ]

    messages = [
        {"role": "user", "content": "What's the weather in Tokyo and London?"}
    ]

    # Strip a trailing slash so "http://host:8000/" does not become
    # "http://host:8000//v1/chat/completions". Both requests share this URL.
    endpoint = f"{vllm_url.rstrip('/')}/v1/chat/completions"

    # Turn 1: Get tool calls
    print("\n--- Turn 1: Request ---")
    response = requests.post(
        endpoint,
        json={
            "model": model,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto",
            "temperature": 0.1,
            "max_tokens": 500
        },
        timeout=60
    )
    response.raise_for_status()
    assistant_message = response.json()["choices"][0]["message"]

    tool_calls = assistant_message.get("tool_calls")
    if not tool_calls:
        print("Model did not call tools.")
        return

    # Show tool calls
    for tc in tool_calls:
        print(f"  Tool: {tc['function']['name']}({tc['function']['arguments']})")

    # Add assistant message to conversation
    messages.append(assistant_message)

    # Turn 2: Simulate tool responses
    print("\n--- Turn 2: Tool Responses ---")
    for tc in tool_calls:
        # The arguments are model-generated text, so they are not guaranteed
        # to be valid JSON or to decode to an object. Fall back to "Unknown"
        # rather than aborting the whole demo on one bad tool call.
        raw_arguments = tc["function"]["arguments"]
        if isinstance(raw_arguments, dict):
            parsed_arguments = raw_arguments
        else:
            try:
                parsed_arguments = json.loads(raw_arguments)
            except (TypeError, ValueError):
                parsed_arguments = None
        if isinstance(parsed_arguments, dict):
            location = parsed_arguments.get("location", "Unknown")
        else:
            location = "Unknown"

        # In a real app, you'd execute the actual function here
        fake_result = json.dumps({
            "temperature": 22,
            "unit": "celsius",
            "condition": "Partly cloudy",
            "location": location
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc["id"],
            "content": fake_result
        })
        print(f"  Sent result for {tc['function']['name']}: {fake_result}")

    # Turn 3: Get final response
    print("\n--- Turn 3: Final Response ---")
    response = requests.post(
        endpoint,
        json={
            "model": model,
            "messages": messages,
            "tools": tools,
            "temperature": 0.1,
            "max_tokens": 500
        },
        timeout=60
    )
    response.raise_for_status()
    final_message = response.json()["choices"][0]["message"]
    # "content" can be present but null; fall back on any falsy value so the
    # placeholder is shown instead of the literal "None".
    print(f"  {final_message.get('content') or '(empty)'}")


if __name__ == "__main__":
    # Command-line options: where the server lives and which model to query.
    cli = argparse.ArgumentParser(description="Test VLLM tool calling")
    cli.add_argument("--url", default="http://localhost:8000", help="VLLM server URL")
    cli.add_argument("--model", default="NousResearch/Hermes-3-Llama-3.1-70B-FP8", help="Model name")
    opts = cli.parse_args()

    rule = "=" * 60
    demos = (
        ("Test 1: Single Tool Call", make_tool_call),
        ("Test 2: Multi-Turn Tool Calling", multi_turn_tool_call),
    )

    # Run each demo under a banner; every banner after the first is preceded
    # by a blank line to separate it from the previous demo's output.
    for position, (title, demo) in enumerate(demos):
        lead = "\n" if position else ""
        print(f"{lead}{rule}")
        print(title)
        print(rule)
        demo(opts.url, opts.model)