#!/usr/bin/env python3
"""
Basic Tool Calling with VLLM
============================
Minimal working example of tool calling via VLLM's OpenAI-compatible API.
Works with any model that supports tool calling (Hermes-3, Llama 3.3, Qwen2, Mistral).

Usage:
    python basic_tool_call.py --url http://localhost:8000 --model NousResearch/Hermes-3-Llama-3.1-70B-FP8
"""

import argparse
import json
from typing import Any, Dict, List, Optional

import requests

# Seconds to wait for each chat-completion request before requests gives up.
REQUEST_TIMEOUT_SECONDS = 60

# Single tool definition shared by both demos so the two requests cannot drift.
WEATHER_TOOLS: List[Dict[str, Any]] = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name, e.g. 'San Francisco'",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit",
                    },
                },
                "required": ["location"],
            },
        },
    }
]


def _request_assistant_message(
    vllm_url: str,
    model: str,
    messages: List[Dict[str, Any]],
    tool_choice: Optional[str] = None,
) -> Dict[str, Any]:
    """POST one chat-completion request and return the first choice's message.

    Args:
        vllm_url: Base URL of the server; trailing slashes are tolerated.
        model: Model name placed in the request payload.
        messages: Conversation so far, sent as the ``messages`` field.
        tool_choice: Sent as ``tool_choice`` when given; omitted when ``None``.

    Returns:
        The ``message`` dict of the first entry in the response's ``choices``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    payload: Dict[str, Any] = {
        "model": model,
        "messages": messages,
        "tools": WEATHER_TOOLS,
        "temperature": 0.1,
        "max_tokens": 500,
    }
    if tool_choice is not None:
        payload["tool_choice"] = tool_choice

    # Strip trailing slashes so "--url http://host:8000/" does not yield "//v1/...".
    base_url = vllm_url.rstrip("/")
    response = requests.post(
        f"{base_url}/v1/chat/completions",
        json=payload,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]


def _content_or_placeholder(message: Dict[str, Any]) -> str:
    """Return the message text, or ``"(empty)"`` when it is missing, null or blank."""
    # dict.get's default only applies when the key is absent; a present
    # "content": null would otherwise be printed as the string "None".
    return message.get("content") or "(empty)"


def _parse_tool_arguments(raw_arguments: Any) -> Dict[str, Any]:
    """Decode a tool call's ``arguments`` into a dict, or ``{}`` if that is impossible."""
    if isinstance(raw_arguments, dict):
        # Already decoded; nothing to parse.
        return raw_arguments
    try:
        parsed = json.loads(raw_arguments)
    except (TypeError, ValueError):
        # Not a string/bytes value, or not valid JSON (JSONDecodeError is a ValueError).
        return {}
    return parsed if isinstance(parsed, dict) else {}


def make_tool_call(vllm_url: str, model: str) -> Optional[List[Dict[str, Any]]]:
    """Send a tool-enabled chat completion request to VLLM.

    Asks a single weather question with ``tool_choice="auto"`` and prints
    either the tool calls the model produced or its plain-text reply.

    Args:
        vllm_url: Base URL of the VLLM server.
        model: Model name to request.

    Returns:
        The list of tool calls from the assistant message, or ``None`` when
        the model answered without calling a tool.

    Raises:
        requests.RequestException: If the HTTP request fails.
    """
    messages = [
        {
            "role": "user",
            "content": "What's the weather like in San Francisco?",
        }
    ]
    message = _request_assistant_message(
        vllm_url, model, messages, tool_choice="auto"
    )

    tool_calls = message.get("tool_calls")
    if not tool_calls:
        print("No tool calls — model responded with text:")
        print(f" {_content_or_placeholder(message)}")
        return None

    print("Tool calls received:")
    for tc in tool_calls:
        print(f" Function: {tc['function']['name']}")
        print(f" Arguments: {tc['function']['arguments']}")
    return tool_calls


def multi_turn_tool_call(vllm_url: str, model: str) -> None:
    """Demonstrate a full tool calling conversation: request -> execute -> respond.

    Turn 1 asks the model a question and collects its tool calls, turn 2
    appends a simulated result for every call, and turn 3 sends the extended
    conversation back and prints the final answer.

    Args:
        vllm_url: Base URL of the VLLM server.
        model: Model name to request.

    Raises:
        requests.RequestException: If either HTTP request fails.
    """
    messages: List[Dict[str, Any]] = [
        {"role": "user", "content": "What's the weather in Tokyo and London?"}
    ]

    # Turn 1: Get tool calls
    print("\n--- Turn 1: Request ---")
    assistant_message = _request_assistant_message(
        vllm_url, model, messages, tool_choice="auto"
    )
    tool_calls = assistant_message.get("tool_calls")
    if not tool_calls:
        print("Model did not call tools.")
        return

    # Show tool calls
    for tc in tool_calls:
        print(f" Tool: {tc['function']['name']}({tc['function']['arguments']})")

    # The assistant turn must precede the tool results that reference its call ids.
    messages.append(assistant_message)

    # Turn 2: Simulate tool responses
    print("\n--- Turn 2: Tool Responses ---")
    for tc in tool_calls:
        # In a real app, you'd execute the actual function here
        arguments = _parse_tool_arguments(tc["function"]["arguments"])
        fake_result = json.dumps(
            {
                "temperature": 22,
                "unit": "celsius",
                "condition": "Partly cloudy",
                "location": arguments.get("location", "Unknown"),
            }
        )
        messages.append(
            {
                "role": "tool",
                "tool_call_id": tc["id"],
                "content": fake_result,
            }
        )
        print(f" Sent result for {tc['function']['name']}: {fake_result}")

    # Turn 3: Get final response (no tool_choice is sent on this request)
    print("\n--- Turn 3: Final Response ---")
    final_message = _request_assistant_message(vllm_url, model, messages)
    print(f" {_content_or_placeholder(final_message)}")


def main() -> None:
    """Parse command-line arguments and run both tool-calling demos."""
    parser = argparse.ArgumentParser(description="Test VLLM tool calling")
    parser.add_argument(
        "--url", default="http://localhost:8000", help="VLLM server URL"
    )
    parser.add_argument(
        "--model",
        default="NousResearch/Hermes-3-Llama-3.1-70B-FP8",
        help="Model name",
    )
    args = parser.parse_args()

    print("=" * 60)
    print("Test 1: Single Tool Call")
    print("=" * 60)
    make_tool_call(args.url, args.model)

    print("\n" + "=" * 60)
    print("Test 2: Multi-Turn Tool Calling")
    print("=" * 60)
    multi_turn_tool_call(args.url, args.model)


if __name__ == "__main__":
    main()