"""
Basic Tool Calling with VLLM
============================

Minimal working example of tool calling via VLLM's OpenAI-compatible API.
Works with any model that supports tool calling (Hermes-3, Llama 3.3, Qwen2, Mistral).

Usage:
    python basic_tool_call.py --url http://localhost:8000 --model NousResearch/Hermes-3-Llama-3.1-70B-FP8
"""
| |
|
import argparse
import json

import requests
| |
|
| |
|
def make_tool_call(vllm_url: str, model: str):
    """Send a tool-enabled chat completion request to VLLM.

    Args:
        vllm_url: Base URL of the VLLM server (e.g. "http://localhost:8000").
            A trailing slash is tolerated.
        model: Model name as served by VLLM.

    Returns:
        The list of tool calls the model produced, or None if the model
        answered with plain text instead.

    Raises:
        requests.HTTPError: If the server returns a non-2xx status.
    """
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City name, e.g. 'San Francisco'"
                        },
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "Temperature unit"
                        }
                    },
                    "required": ["location"]
                }
            }
        }
    ]

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": "What's the weather like in San Francisco?"
            }
        ],
        "tools": tools,
        "tool_choice": "auto",  # let the model decide whether to call the tool
        "temperature": 0.1,     # near-deterministic so the tool call is reproducible
        "max_tokens": 500
    }

    # rstrip('/') tolerates a trailing slash in --url (avoids ".../​/v1/..." URLs).
    response = requests.post(
        f"{vllm_url.rstrip('/')}/v1/chat/completions",
        json=payload,
        timeout=60
    )
    response.raise_for_status()
    result = response.json()

    message = result["choices"][0]["message"]

    if message.get("tool_calls"):
        print("Tool calls received:")
        for tc in message["tool_calls"]:
            print(f"  Function: {tc['function']['name']}")
            print(f"  Arguments: {tc['function']['arguments']}")
        return message["tool_calls"]
    else:
        print("No tool calls — model responded with text:")
        # "content" may be present but None in OpenAI-style responses, so a
        # plain .get(key, default) would still print "None"; `or` covers both
        # the missing-key and null-value cases.
        print(f"  {message.get('content') or '(empty)'}")
        return None
| |
|
| |
|
def _post_chat(base_url: str, payload: dict):
    """POST a chat-completion payload to VLLM and return the first choice's message."""
    response = requests.post(
        f"{base_url}/v1/chat/completions",
        json=payload,
        timeout=60
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]


def multi_turn_tool_call(vllm_url: str, model: str):
    """Demonstrate a full tool calling conversation: request -> execute -> respond.

    Turn 1 asks a question the model should answer via get_weather; turn 2
    feeds back fabricated tool results; turn 3 lets the model compose a
    final natural-language answer from those results.

    Args:
        vllm_url: Base URL of the VLLM server. A trailing slash is tolerated.
        model: Model name as served by VLLM.

    Raises:
        requests.HTTPError: If any request returns a non-2xx status.
    """
    base_url = vllm_url.rstrip("/")  # tolerate a trailing slash in --url

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {"type": "string"},
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                    },
                    "required": ["location"]
                }
            }
        }
    ]

    messages = [
        {"role": "user", "content": "What's the weather in Tokyo and London?"}
    ]

    print("\n--- Turn 1: Request ---")
    assistant_message = _post_chat(base_url, {
        "model": model,
        "messages": messages,
        "tools": tools,
        "tool_choice": "auto",
        "temperature": 0.1,
        "max_tokens": 500
    })

    if not assistant_message.get("tool_calls"):
        print("Model did not call tools.")
        return

    for tc in assistant_message["tool_calls"]:
        print(f"  Tool: {tc['function']['name']}({tc['function']['arguments']})")

    # The assistant message (with its tool_calls) must be echoed back into the
    # history so the model can match each tool response to its own call.
    messages.append(assistant_message)

    print("\n--- Turn 2: Tool Responses ---")
    for tc in assistant_message["tool_calls"]:
        # Fabricate a plausible result; a real integration would execute the tool.
        try:
            location = json.loads(tc["function"]["arguments"]).get("location", "Unknown")
        except (json.JSONDecodeError, AttributeError):
            # Models occasionally emit malformed argument JSON; don't crash the demo.
            location = "Unknown"
        fake_result = json.dumps({
            "temperature": 22,
            "unit": "celsius",
            "condition": "Partly cloudy",
            "location": location
        })
        messages.append({
            "role": "tool",
            "tool_call_id": tc["id"],  # ties this result to the specific call
            "content": fake_result
        })
        print(f"  Sent result for {tc['function']['name']}: {fake_result}")

    print("\n--- Turn 3: Final Response ---")
    final_message = _post_chat(base_url, {
        "model": model,
        "messages": messages,
        "tools": tools,
        "temperature": 0.1,
        "max_tokens": 500
    })
    # "content" can be present but None; `or` handles both missing and null.
    print(f"  {final_message.get('content') or '(empty)'}")
| |
|
| |
|
if __name__ == "__main__":
    # CLI: server location and model are the only knobs this demo needs.
    cli = argparse.ArgumentParser(description="Test VLLM tool calling")
    cli.add_argument("--url", default="http://localhost:8000", help="VLLM server URL")
    cli.add_argument("--model", default="NousResearch/Hermes-3-Llama-3.1-70B-FP8", help="Model name")
    opts = cli.parse_args()

    banner = "=" * 60

    print(banner)
    print("Test 1: Single Tool Call")
    print(banner)
    make_tool_call(opts.url, opts.model)

    print("\n" + banner)
    print("Test 2: Multi-Turn Tool Calling")
    print(banner)
    multi_turn_tool_call(opts.url, opts.model)
| |
|