|
|
|
|
|
""" |
|
|
Generate the FunctionGemma evaluation benchmark.

Creates 100 high-quality samples to assess function-calling accuracy across:
- SEARCH_TOKEN calls
- EXECUTE_SWAP calls
- Incomplete requests (the model should ask a clarifying question)
- Irrelevant requests (the model should not call any function)
|
|
""" |
|
|
|
|
|
import json |
|
|
import random |
|
|
import argparse |
|
|
from pathlib import Path |
|
|
from typing import Dict, List, Any, Optional |
|
|
|
|
|
# Project root: the parent of the directory that contains this file.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent

# Where the benchmark JSON is written when no --output path is supplied.
DEFAULT_BENCHMARK_PATH = PROJECT_ROOT.joinpath("data", "benchmark_dataset.json")
|
|
|
|
|
|
|
|
# (symbol, contract address) pairs for the tokens referenced by the benchmark
# prompts. Every entry is registered with chain "solana"; insertion order is
# significant because the address-search cases use the first five entries.
_TOKEN_ADDRESSES = (
    ("SOL", "So11111111111111111111111111111111111111112"),
    ("USDC", "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v"),
    ("JUP", "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN"),
    ("RAY", "4k3Dyjzvzp8eMZWUXbBCjEvwSkkk59S5iCNLY3QrkX6R"),
    ("BONK", "DezXAZ8z7PnrnRJjz3wXBoRgixCa6xjnB7YaB1pPB263"),
    ("WIF", "EKpQGSJtjMFqKZ9KQanSqYXRcF8fBopzLHYxdM65zcjm"),
    ("ETH", "7vfCXTUXx5WJV5JADk17DUJ4ksgau7utNKj4b963voxs"),
    ("BTC", "9n4nbM75f5Ui33ZbPYXn59EwSgE8CGsHtAeTH5YFeJ9E"),
    ("POPCAT", "7GCihgDB8fe6KNjn2MYtkzZcRjQy3t9GHdC8uHYmW2hr"),
    ("TRUMP", "6p6xgHyF7AeE6TZkSmFsko444wqoP15icUSqi2jfGiPN"),
)

# Symbol -> {"ca": contract address, "chain": chain name}.
TOKENS = {
    ticker: {"ca": contract, "chain": "solana"}
    for ticker, contract in _TOKEN_ADDRESSES
}
|
|
|
|
|
# Chain names accepted by SEARCH_TOKEN. This list is the single source of
# truth for the "chain" enum in the tool schema below.
CHAINS = ["solana", "ethereum", "bsc", "base"]

# Tool (function) declarations attached to every benchmark sample's input.
# Each entry has the shape {"type": "function", "function": {name, description,
# parameters}} where "parameters" is a JSON-Schema-style object description.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "SEARCH_TOKEN",
            "description": "search token onchain",
            "parameters": {
                "type": "object",
                "properties": {
                    "symbol": {
                        "type": ["string", "null"],
                        "description": "Symbol of the token",
                    },
                    "address": {
                        "type": ["string", "null"],
                        "description": "Contract address of the token",
                    },
                    "chain": {
                        "type": "string",
                        # Copied so that mutating the schema never alters CHAINS.
                        "enum": list(CHAINS),
                        "description": "supported chains",
                    },
                    "keyword": {
                        "type": ["string", "null"],
                        "description": "keyword to search for the token",
                    },
                },
                # No argument is mandatory for a search.
                "required": [],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "EXECUTE_SWAP",
            "description": "Swap tokens on the Solana blockchain. When the user specifies 'buy <token>', the default input token is SOL. When the user specifies 'sell <token>', the default output token is SOL.",
            "parameters": {
                "type": "object",
                "properties": {
                    "inputTokenSymbol": {
                        "type": ["string", "null"],
                        "description": "Symbol of the token to sell.",
                    },
                    "inputTokenCA": {
                        "type": ["string", "null"],
                        "description": "Contract address of the token to sell.",
                    },
                    "outputTokenCA": {
                        "type": ["string", "null"],
                        "description": "Contract address of the token to buy.",
                    },
                    "inputTokenAmount": {
                        "type": ["string", "null"],
                        "description": "Exact amount of the input token to swap.",
                    },
                    "inputTokenPercentage": {
                        "type": ["number", "null"],
                        "description": "Percentage of the input token balance to swap.",
                    },
                    "outputTokenAmount": {
                        "type": ["string", "null"],
                        "description": "Expected amount of the output token to receive.",
                    },
                },
                # These keys must always be present; their values may be null.
                "required": [
                    "inputTokenCA",
                    "outputTokenCA",
                    "inputTokenAmount",
                    "inputTokenPercentage",
                ],
            },
        },
    },
]
|
|
|
|
|
|
|
|
def create_benchmark_item(
    user_input: str,
    expected_function: Optional[str],
    expected_args: Optional[Dict] = None,
    category: str = "function_call",
    description: str = "",
) -> Dict:
    """Assemble a single benchmark sample.

    Args:
        user_input: Text placed in the "user" message of the prompt.
        expected_function: Name of the function the model is expected to
            call, or None when no call is expected.
        expected_args: Arguments expected for that call, or None.
        category: Label stored under the sample's "category" key.
        description: Free-text note stored under "description".

    Returns:
        A dict with keys "id" (always None here; the caller numbers the
        samples later), "category", "description", "input" and "expected".
    """
    developer_turn = {
        "role": "developer",
        "content": "You are a model that can do function calling with the following functions",
    }
    user_turn = {"role": "user", "content": user_input}

    # Every sample references the same module-level TOOLS list.
    model_input = {"messages": [developer_turn, user_turn], "tools": TOOLS}
    expectation = {"function_name": expected_function, "arguments": expected_args}

    return dict(
        id=None,
        category=category,
        description=description,
        input=model_input,
        expected=expectation,
    )
|
|
|
|
|
|
|
|
def generate_search_token_benchmarks() -> List[Dict]:
    """Build the SEARCH_TOKEN portion of the benchmark.

    Four groups are produced, in this order: symbol lookups phrased in
    English, symbol lookups phrased in Chinese, lookups by contract address
    (one for each of the first five TOKENS entries), and keyword searches.

    Returns:
        The benchmark items, each created through create_benchmark_item.
    """
    items: List[Dict] = []

    # Rows: (query, symbol, chain, address, keyword). Address and keyword are
    # only added to the expected arguments when they are truthy.
    english_symbol_rows = [
        ("Search for BONK token", "BONK", "solana", None, None),
        ("Find WIF on solana", "WIF", "solana", None, None),
        ("Look up JUP token", "JUP", "solana", None, None),
        ("Search ETH on ethereum", "ETH", "ethereum", None, None),
        ("Find USDC token on base", "USDC", "base", None, None),
    ]
    for prompt, ticker, network, contract, term in english_symbol_rows:
        wanted = {"symbol": ticker, "chain": network}
        for field, value in (("address", contract), ("keyword", term)):
            if value:
                wanted[field] = value
        items.append(create_benchmark_item(
            prompt, "SEARCH_TOKEN", wanted,
            "search_by_symbol", f"Search {ticker} by symbol",
        ))

    # Rows: (query in Chinese, symbol, chain).
    chinese_symbol_rows = [
        ("帮我搜索 BONK 代币", "BONK", "solana"),
        ("查一下 WIF 这个币", "WIF", "solana"),
        ("找一下 JUP 代币信息", "JUP", "solana"),
        ("搜索 RAY 代币", "RAY", "solana"),
        ("查询 POPCAT 代币", "POPCAT", "solana"),
    ]
    items.extend(
        create_benchmark_item(
            prompt, "SEARCH_TOKEN", {"symbol": ticker, "chain": network},
            "search_by_symbol_cn", f"Search {ticker} by symbol (Chinese)",
        )
        for prompt, ticker, network in chinese_symbol_rows
    )

    # Address lookups use the first five tokens in TOKENS insertion order.
    first_five_tokens = list(TOKENS.items())[:5]
    items.extend(
        create_benchmark_item(
            f"Search token at address {meta['ca']}", "SEARCH_TOKEN",
            {"address": meta['ca'], "chain": meta['chain']},
            "search_by_address", f"Search {ticker} by address",
        )
        for ticker, meta in first_five_tokens
    )

    # Rows: (query, keyword, chain).
    keyword_rows = [
        ("Search for dog themed tokens", "dog", "solana"),
        ("Find meme coins", "meme", "solana"),
        ("Look for cat tokens on base", "cat", "base"),
    ]
    items.extend(
        create_benchmark_item(
            prompt, "SEARCH_TOKEN", {"keyword": term, "chain": network},
            "search_by_keyword", f"Search by keyword: {term}",
        )
        for prompt, term, network in keyword_rows
    )

    return items
|
|
|
|
|
|
|
|
def generate_execute_swap_benchmarks() -> List[Dict]:
    """Build the EXECUTE_SWAP portion of the benchmark.

    Every scenario row is a tuple
    (query, input token symbol, output token symbol, amount, percentage).
    In each row exactly one of amount / percentage is set and the other is
    None; percentages are written as fractions (0.5 for "50%"). Symbols are
    resolved to contract addresses through TOKENS.

    Groups are emitted in this order: buy by amount, buy by percentage,
    sell by amount, sell by percentage, Chinese-language requests, and
    token-to-token swaps.

    Returns:
        The benchmark items, each created through create_benchmark_item.
    """
    items: List[Dict] = []

    def add_group(rows, category, describe):
        """Append one item per row; describe(...) yields its description."""
        for prompt, sold, bought, qty, share in rows:
            swap_args = {
                "inputTokenCA": TOKENS[sold]["ca"],
                "outputTokenCA": TOKENS[bought]["ca"],
                "inputTokenAmount": qty,
                "inputTokenPercentage": share,
            }
            items.append(create_benchmark_item(
                prompt, "EXECUTE_SWAP", swap_args,
                category, describe(sold, bought, qty, share),
            ))

    add_group(
        [
            ("Buy 1 SOL worth of BONK", "SOL", "BONK", "1", None),
            ("Purchase 5 SOL of WIF", "SOL", "WIF", "5", None),
            ("Buy 10 USDC worth of JUP", "USDC", "JUP", "10", None),
            ("I want to buy 2 SOL of RAY", "SOL", "RAY", "2", None),
            ("Get me 0.5 SOL of POPCAT", "SOL", "POPCAT", "0.5", None),
        ],
        "buy_with_amount",
        lambda sold, bought, qty, share: f"Buy {bought} with {qty} {sold}",
    )

    add_group(
        [
            ("Buy BONK with 50% of my SOL", "SOL", "BONK", None, 0.5),
            ("Use 30% of my USDC to buy WIF", "USDC", "WIF", None, 0.3),
            ("Spend 100% of my SOL on JUP", "SOL", "JUP", None, 1.0),
            ("Put 25% of my ETH into RAY", "ETH", "RAY", None, 0.25),
            ("Use half of my BTC to get BONK", "BTC", "BONK", None, 0.5),
        ],
        "buy_with_percentage",
        lambda sold, bought, qty, share: f"Buy {bought} with {int(share * 100)}% {sold}",
    )

    add_group(
        [
            ("Sell 1000 BONK", "BONK", "SOL", "1000", None),
            ("Sell 500 WIF for SOL", "WIF", "SOL", "500", None),
            ("Convert 100 JUP to SOL", "JUP", "SOL", "100", None),
            ("Dump 2000 RAY", "RAY", "SOL", "2000", None),
            ("Sell 50 USDC", "USDC", "SOL", "50", None),
        ],
        "sell_with_amount",
        lambda sold, bought, qty, share: f"Sell {qty} {sold}",
    )

    add_group(
        [
            ("Sell 50% of my BONK", "BONK", "SOL", None, 0.5),
            ("Dump all my WIF", "WIF", "SOL", None, 1.0),
            ("Sell 30% of my JUP holdings", "JUP", "SOL", None, 0.3),
            ("Get rid of 75% of my RAY", "RAY", "SOL", None, 0.75),
            ("Sell a quarter of my USDC", "USDC", "SOL", None, 0.25),
        ],
        "sell_with_percentage",
        lambda sold, bought, qty, share: f"Sell {int(share * 100)}% {sold}",
    )

    # Chinese-language buy/sell requests covering both amount and percentage.
    add_group(
        [
            ("用 1 个 SOL 买 BONK", "SOL", "BONK", "1", None),
            ("把 50% 的 USDC 换成 WIF", "USDC", "WIF", None, 0.5),
            ("卖掉 1000 个 BONK", "BONK", "SOL", "1000", None),
            ("把所有 JUP 都卖了", "JUP", "SOL", None, 1.0),
            ("用 2 SOL 购买 RAY", "SOL", "RAY", "2", None),
            ("出售 30% 的 WIF", "WIF", "SOL", None, 0.3),
            ("买入 5 SOL 的 POPCAT", "SOL", "POPCAT", "5", None),
            ("清仓 ETH", "ETH", "SOL", None, 1.0),
        ],
        "swap_chinese",
        lambda sold, bought, qty, share: "Swap request in Chinese",
    )

    add_group(
        [
            ("Swap 100 USDC for BONK", "USDC", "BONK", "100", None),
            ("Exchange 50 JUP for WIF", "JUP", "WIF", "50", None),
            ("Convert all my ETH to USDC", "ETH", "USDC", None, 1.0),
        ],
        "token_to_token",
        lambda sold, bought, qty, share: f"Swap {sold} to {bought}",
    )

    return items
|
|
|
|
|
|
|
|
def generate_incomplete_benchmarks() -> List[Dict]:
    """Build under-specified requests for which no function call is expected.

    Each row is (query, category, description). The items are created with
    both the expected function name and the expected arguments set to None.

    Returns:
        The benchmark items, in the order the rows are listed.
    """
    underspecified_rows = (
        ("I want to buy some tokens", "incomplete_no_token", "Missing token name"),
        ("Sell my holdings", "incomplete_no_token", "Missing which token to sell"),
        ("Search for a token", "incomplete_no_info", "Missing token info"),
        ("Buy something", "incomplete_vague", "Too vague"),
        ("我想买币", "incomplete_cn", "Missing token (Chinese)"),
        ("帮我卖掉", "incomplete_cn", "Missing token and amount (Chinese)"),
        ("Swap tokens", "incomplete_swap", "Missing swap details"),
        ("I want to trade", "incomplete_trade", "Missing trade details"),
    )
    return [
        create_benchmark_item(prompt, None, None, label, note)
        for prompt, label, note in underspecified_rows
    ]
|
|
|
|
|
|
|
|
def generate_irrelevant_benchmarks() -> List[Dict]:
    """Build off-topic requests for which no function call is expected.

    Each row is (query, category, description). The items are created with
    both the expected function name and the expected arguments set to None.

    Returns:
        The benchmark items, in the order the rows are listed.
    """
    off_topic_rows = (
        ("What's the weather today?", "irrelevant_weather", "Weather query"),
        ("Tell me a joke", "irrelevant_joke", "Joke request"),
        ("What time is it?", "irrelevant_time", "Time query"),
        ("Who is the president?", "irrelevant_general", "General knowledge"),
        ("今天天气怎么样?", "irrelevant_cn", "Weather (Chinese)"),
        ("给我讲个笑话", "irrelevant_cn", "Joke (Chinese)"),
        ("Hello, how are you?", "irrelevant_greeting", "Greeting"),
        ("What is Bitcoin?", "irrelevant_info", "Info request (no action)"),
    )
    return [
        create_benchmark_item(prompt, None, None, label, note)
        for prompt, label, note in off_topic_rows
    ]
|
|
|
|
|
|
|
|
# Number of samples in the generated dataset.
_BENCHMARK_SIZE = 100


def _pad_with_extra_cases(benchmarks: List[Dict], target: int) -> None:
    """Append filler cases to *benchmarks* in place until it holds *target* items.

    NOTE(review): the filler alternates between the same two TRUMP prompts
    (one swap, one search), so every padded entry is a duplicate of an
    earlier padded entry. This is kept as-is so the generated dataset does
    not change; consider adding distinct cases instead.
    """
    def extra_swap() -> Dict:
        return create_benchmark_item(
            "Buy 3 SOL of TRUMP", "EXECUTE_SWAP",
            {
                "inputTokenCA": TOKENS["SOL"]["ca"],
                "outputTokenCA": TOKENS["TRUMP"]["ca"],
                "inputTokenAmount": "3",
                "inputTokenPercentage": None,
            },
            "extra", "Extra test case",
        )

    def extra_search() -> Dict:
        return create_benchmark_item(
            "Search for TRUMP token", "SEARCH_TOKEN",
            {"symbol": "TRUMP", "chain": "solana"},
            "extra", "Extra test case",
        )

    builders = (extra_swap, extra_search)
    turn = 0
    while len(benchmarks) < target:
        # A fresh item is built each time so no dict is shared between samples.
        benchmarks.append(builders[turn % len(builders)]())
        turn += 1


def _print_distributions(benchmarks: List[Dict]) -> None:
    """Print per-category and per-expected-function sample counts."""
    categories: Dict[str, int] = {}
    for item in benchmarks:
        label = item["category"]
        categories[label] = categories.get(label, 0) + 1

    print("\nCategory distribution:")
    for label, count in sorted(categories.items()):
        print(f" - {label}: {count}")

    # Pre-seeded so the three rows are always printed, in this order.
    func_counts = {"SEARCH_TOKEN": 0, "EXECUTE_SWAP": 0, "None": 0}
    for item in benchmarks:
        expected = item["expected"]["function_name"]
        key = expected if expected else "None"
        func_counts[key] = func_counts.get(key, 0) + 1

    print("\nFunction distribution:")
    for name, count in func_counts.items():
        print(f" - {name}: {count}")


def _print_examples(benchmarks: List[Dict], limit: int = 3) -> None:
    """Print a short preview of the first *limit* samples."""
    banner = "=" * 60
    print("\n" + banner)
    print("Examples:")
    print(banner)

    for number, item in enumerate(benchmarks[:limit], start=1):
        print(f"\n--- Example {number} ---")
        print(f"ID: {item['id']}")
        print(f"Category: {item['category']}")
        print(f"Input: {item['input']['messages'][1]['content']}")
        print(f"Expected function: {item['expected']['function_name']}")
        if item['expected']['arguments']:
            print(f"Expected args: {json.dumps(item['expected']['arguments'], ensure_ascii=False)}")


def generate_benchmark_dataset(output_path: str = str(DEFAULT_BENCHMARK_PATH)) -> List[Dict]:
    """Generate the full benchmark dataset and write it to *output_path* as JSON.

    The samples from the four generators are concatenated, padded with extra
    cases up to 100 items (and truncated to 100 if there are more), shuffled
    with a fixed seed, and numbered 1..N in shuffled order. Progress and
    summary statistics are printed to stdout.

    Args:
        output_path: Destination JSON file. Missing parent directories are
            created.

    Returns:
        The list of benchmark items in the order they were written.

    Raises:
        OSError: If the output directory or file cannot be created/written.
    """
    banner = "=" * 60
    print(banner)
    print("Generating FunctionGemma benchmark dataset")
    print(banner)

    all_benchmarks: List[Dict] = []
    sections = (
        ("SEARCH_TOKEN cases", generate_search_token_benchmarks),
        ("EXECUTE_SWAP cases", generate_execute_swap_benchmarks),
        ("Incomplete request cases", generate_incomplete_benchmarks),
        ("Irrelevant request cases", generate_irrelevant_benchmarks),
    )
    for label, build in sections:
        produced = build()
        print(f"{label}: {len(produced)}")
        all_benchmarks.extend(produced)

    _pad_with_extra_cases(all_benchmarks, _BENCHMARK_SIZE)
    all_benchmarks = all_benchmarks[:_BENCHMARK_SIZE]

    # A private generator seeded with 42 yields the same permutation as
    # random.seed(42) + random.shuffle, without reseeding the process-wide
    # RNG that other code may be relying on.
    random.Random(42).shuffle(all_benchmarks)

    # Ids reflect the final (shuffled) order.
    for position, item in enumerate(all_benchmarks, start=1):
        item["id"] = position

    print(f"\nTotal: {len(all_benchmarks)} cases")
    _print_distributions(all_benchmarks)

    # Create the destination directory here so the function also works when
    # it is called directly rather than through main().
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_benchmarks, f, ensure_ascii=False, indent=2)

    print(f"\nBenchmark saved to: {output_path}")

    _print_examples(all_benchmarks)

    return all_benchmarks
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: parse --output and generate the dataset.

    The parent directory of the chosen output file is created if it does not
    exist before the dataset is generated.
    """
    cli = argparse.ArgumentParser(description="Generate FunctionGemma benchmark dataset")
    cli.add_argument(
        "--output",
        type=str,
        default=str(DEFAULT_BENCHMARK_PATH),
        help="Output file path",
    )
    options = cli.parse_args()

    destination = Path(options.output)
    destination.parent.mkdir(parents=True, exist_ok=True)

    generate_benchmark_dataset(str(destination))


if __name__ == "__main__":
    main()
|
|
|