#!/usr/bin/env python3
"""NexusRaven API evaluation for local Ollama models.

Tests function calling accuracy on 318 real-world API tasks across 4 domains.

Usage:
    python3 run_nexusraven.py --model qwen3.5:9b --limit 50
"""

import argparse
import ast
import json
import os
import re
import time
from collections import defaultdict

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
REPO_DIR = os.path.dirname(SCRIPT_DIR)
# Output directory; can be overridden through the RESULTS_DIR environment variable.
RESULTS_DIR = os.environ.get('RESULTS_DIR', os.path.join(REPO_DIR, 'results', 'nexusraven'))

# Endpoint and wire format ('ollama' native chat API vs. OpenAI-style) per backend.
BACKEND_CONFIG = {
    'ollama': {'url': 'http://localhost:11434/api/chat', 'type': 'ollama'},
    'llama-cpp': {'url': 'http://localhost:8081/v1/chat/completions', 'type': 'openai'},
    'mlx-vlm': {'url': 'http://localhost:8080/chat/completions', 'type': 'openai'},
}

SYSTEM_PROMPT = (
    'You are a helpful assistant that can call functions. '
    'When the user asks you to do something, respond with ONLY a function call '
    'in this exact JSON format: '
    '{"name": "function_name", "arguments": {"param1": "value1", "param2": "value2"}} '
    'Output ONLY the JSON object. No explanation, no markdown, no extra text. '
    'Only include parameters that are explicitly needed. '
    'Do not include parameters with default/None values.'
)

# Reasoning emitted by "thinking" models: a closed <think>...</think> block, or
# an opening tag that was never closed (e.g. output cut off by the token limit).
_THINK_BLOCK_RE = re.compile(r'<think>.*?</think>', re.DOTALL)
_THINK_UNCLOSED_RE = re.compile(r'<think>.*', re.DOTALL)
# A JSON object/array wrapped in a markdown code fence.
_FENCED_JSON_RE = re.compile(r'```(?:json)?\s*([\{\[].*?[\}\]])\s*```', re.DOTALL)


def load_data():
    """Load NexusRaven evaluation data from HuggingFace.

    Returns:
        A ``(queries, api_by_ds)`` tuple: ``queries`` is the list of rows of the
        ``standardized_queries`` config, and ``api_by_ds`` maps
        dataset name -> API name -> row of the ``standardized_api_list`` config.
    """
    # Imported lazily so the rest of the module works without `datasets`.
    from datasets import load_dataset

    queries = load_dataset('Nexusflow/NexusRaven_API_evaluation', 'standardized_queries')['train']
    apis = load_dataset('Nexusflow/NexusRaven_API_evaluation', 'standardized_api_list')['train']

    # Build API lookup by dataset
    api_by_ds = {}
    for api in apis:
        api_by_ds.setdefault(api['dataset'], {})[api['name']] = api
    return list(queries), api_by_ds


def format_api(api, max_params=15):
    """Format an API definition for the prompt. Truncate long APIs.

    Args:
        api: Mapping with ``name``, ``description`` and ``args_dicts`` keys; each
            entry of ``args_dicts`` may carry ``name``, ``type``, ``required``
            and ``description``.
        max_params: Maximum number of parameters listed individually.

    Returns:
        A multi-line string: the API name with the first line of its
        description (at most 200 characters), then one line per parameter.
    """
    description = api['description']
    summary = description.split('\n')[0][:200] if description else ''
    lines = ['%s: %s' % (api['name'], summary), ' Parameters:']

    all_args = api['args_dicts'] or []
    for arg in all_args[:max_params]:
        req = '(required)' if arg.get('required') else '(optional)'
        # `or ''` also covers an explicit None description.
        arg_desc = (arg.get('description') or '')[:60]
        lines.append(' - %s: %s %s %s' % (
            arg.get('name', '?'), arg.get('type', 'any'), req, arg_desc))

    hidden = len(all_args) - max_params
    if hidden > 0:
        lines.append(' ... and %d more optional parameters' % hidden)
    return '\n'.join(lines)


def call_llm(model, messages, backend='ollama'):
    """Send one chat request to the selected backend.

    Args:
        model: Model name passed through to the backend.
        messages: List of ``{'role': ..., 'content': ...}`` chat messages.
        backend: Key of ``BACKEND_CONFIG``.

    Returns:
        A ``(content, tokens)`` tuple. If the request or the decoding of the
        response fails, ``content`` is the error message and ``tokens`` is 0,
        so a failed request is scored like any other unparseable answer.

    Raises:
        KeyError: If ``backend`` is not a key of ``BACKEND_CONFIG``.
        ImportError: If ``requests`` is not installed.
    """
    # Imported lazily (like `datasets` in load_data) so the parsing and scoring
    # helpers can be imported without the HTTP dependency. Kept outside the
    # try block so a missing dependency fails loudly instead of per query.
    import requests

    cfg = BACKEND_CONFIG[backend]
    is_ollama = cfg['type'] == 'ollama'
    if is_ollama:
        payload = {
            'model': model,
            'messages': messages,
            'stream': False,
            'think': False,
            'options': {'temperature': 0, 'num_predict': 512},
        }
    else:
        payload = {
            'model': model,
            'messages': messages,
            'temperature': 0,
            'max_tokens': 512,
        }

    try:
        resp = requests.post(cfg['url'], json=payload, timeout=300)
        resp.raise_for_status()
        data = resp.json()
        if is_ollama:
            content = data.get('message', {}).get('content', '')
            tokens = (data.get('eval_count') or 0) + (data.get('prompt_eval_count') or 0)
        else:
            content = data.get('choices', [{}])[0].get('message', {}).get('content', '')
            tokens = (data.get('usage') or {}).get('total_tokens') or 0
        # A null content field would otherwise reach the regexes as None.
        return content or '', tokens
    except Exception as e:  # best-effort boundary: one bad request must not abort the run
        return str(e), 0


def _first_call(value):
    """Return ``value`` if it is a dict, or its first element if that is a dict."""
    if isinstance(value, dict):
        return value
    if isinstance(value, list) and value and isinstance(value[0], dict):
        return value[0]
    return None


def parse_function_call(text):
    """Extract a single function call from model output.

    Tries, in order: the whole (think-stripped) text as JSON, a JSON value in a
    markdown code fence, and the first decodable JSON object anywhere in the
    text. For a JSON list the first element is used.

    Args:
        text: Raw model output.

    Returns:
        The decoded dict, or ``None`` if no JSON object could be extracted.
    """
    if not isinstance(text, str):
        return None

    # Strip thinking
    text = _THINK_BLOCK_RE.sub('', text)
    text = _THINK_UNCLOSED_RE.sub('', text)
    text = text.strip()
    if not text:
        return None

    # Try parsing as JSON object
    try:
        call = _first_call(json.loads(text))
    except json.JSONDecodeError:
        call = None
    if call is not None:
        return call

    # Try extracting from markdown
    fenced = _FENCED_JSON_RE.search(text)
    if fenced:
        try:
            call = _first_call(json.loads(fenced.group(1)))
        except json.JSONDecodeError:
            call = None
        if call is not None:
            return call

    # Try finding JSON object: decode at each '{' until one yields a complete
    # object. Unlike a greedy first-'{'-to-last-'}' match, this still works when
    # prose containing braces follows the object.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find('{', start + 1)
    return None


def _values_match(actual, expected):
    """Type-flexible equality: exact, as strings, as floats, or case-insensitive."""
    if actual == expected:
        return True
    if str(actual) == str(expected):
        return True
    try:
        if float(actual) == float(expected):
            return True
    except (ValueError, TypeError, OverflowError):
        pass
    return str(actual).lower() == str(expected).lower()


def _parse_gt_args(gt_args):
    """Decode ground-truth arguments into a dict; return ``None`` if impossible."""
    if isinstance(gt_args, str):
        try:
            gt_args = json.loads(gt_args)
        except json.JSONDecodeError:
            # NOTE(review): the dataset may serialise arguments as a Python
            # literal (single quotes, True/None) rather than JSON - confirm.
            try:
                gt_args = ast.literal_eval(gt_args)
            except (ValueError, SyntaxError):
                return None
    return gt_args if isinstance(gt_args, dict) else None


def check_call(predicted, gt_fn_name, gt_args_str):
    """Check if predicted call matches ground truth.

    Every ground-truth argument must be present in the prediction with a
    matching value; extra predicted arguments are ignored.

    Args:
        predicted: Dict with ``name`` and ``arguments`` keys, or ``None``.
            ``arguments`` may be a dict or a JSON-encoded string.
        gt_fn_name: Expected function name.
        gt_args_str: Expected arguments, as a dict or a serialised dict.

    Returns:
        ``(passed, reason)`` where ``reason`` is one of ``'ok'``,
        ``'no_parse'``, ``'wrong_fn'``, ``'bad_gt'``, ``'missing_param'`` or
        ``'wrong_value'``.
    """
    if not predicted or not isinstance(predicted, dict):
        return False, 'no_parse'

    # Check function name
    if predicted.get('name', '') != gt_fn_name:
        return False, 'wrong_fn'

    # Parse ground truth args
    gt_args = _parse_gt_args(gt_args_str)
    if gt_args is None:
        return False, 'bad_gt'

    pred_args = predicted.get('arguments', {})
    if isinstance(pred_args, str):
        # OpenAI-style tool calls carry the arguments as a JSON string.
        try:
            pred_args = json.loads(pred_args)
        except json.JSONDecodeError:
            pred_args = None
    if not isinstance(pred_args, dict):
        pred_args = {}

    # Check each GT arg is present and matches
    for param, expected in gt_args.items():
        if param not in pred_args:
            return False, 'missing_param'
        if not _values_match(pred_args[param], expected):
            return False, 'wrong_value'
    return True, 'ok'


def _select_queries(queries, limit, stratified):
    """Pick the evaluated subset of ``queries``.

    With ``stratified`` the first ``limit // n_datasets`` (at least 1) queries
    of each dataset are taken, so the result can hold fewer than ``limit``
    queries when ``limit`` is not a multiple of the number of datasets or a
    dataset is small. Otherwise the first ``limit`` queries are taken.
    """
    if not limit or limit >= len(queries):
        return queries
    if not stratified:
        return queries[:limit]

    # Sample evenly across datasets
    by_ds = defaultdict(list)
    for q in queries:
        by_ds[q['dataset']].append(q)
    per_ds = max(1, limit // len(by_ds))
    sampled = []
    for ds_queries in by_ds.values():
        sampled.extend(ds_queries[:per_ds])
    return sampled[:limit]


def run_eval(model, queries, api_by_ds, limit=50, stratified=True, backend='ollama'):
    """Run evaluation.

    Args:
        model: Model name passed to the backend.
        queries: Query rows with ``dataset``, ``prompt``,
            ``python_function_name``, ``python_args_dict`` and
            ``context_functions`` keys.
        api_by_ds: dataset name -> API name -> API row, as built by load_data.
        limit: Maximum number of queries to run; a falsy value runs all.
        stratified: Sample evenly across datasets instead of taking a prefix.
        backend: Key of ``BACKEND_CONFIG``.

    Returns:
        Summary dict with ``model``, ``correct``, ``total``, ``accuracy``,
        ``errors``, ``avg_time``, ``avg_tokens`` and ``by_dataset``.
    """
    queries = _select_queries(queries, limit, stratified)

    print('\n=== NexusRaven Eval: %s (%d queries) ===' % (model, len(queries)))

    correct = 0
    total = 0
    errors = {'no_parse': 0, 'wrong_fn': 0, 'missing_param': 0, 'wrong_value': 0}
    by_dataset = {}
    total_tokens = 0
    total_time = 0

    for i, q in enumerate(queries):
        ds = q['dataset']
        gt_fn = q['python_function_name']

        # Build API context; functions missing from the lookup are skipped.
        ds_apis = api_by_ds.get(ds, {})
        api_descs = [format_api(ds_apis[name])
                     for name in q['context_functions'] if ds_apis.get(name)]
        func_context = '\n\n'.join(api_descs)

        messages = [
            {'role': 'system',
             'content': SYSTEM_PROMPT + '\n\nAvailable functions:\n' + func_context},
            {'role': 'user', 'content': q['prompt']},
        ]

        t0 = time.time()
        response, tokens = call_llm(model, messages, backend)
        total_time += time.time() - t0
        total_tokens += tokens

        predicted = parse_function_call(response)
        passed, reason = check_call(predicted, gt_fn, q['python_args_dict'])

        total += 1
        if passed:
            correct += 1
        else:
            # .get: reasons such as 'bad_gt' are not pre-seeded in `errors`.
            errors[reason] = errors.get(reason, 0) + 1

        # Per-dataset tracking
        ds_stats = by_dataset.setdefault(ds, {'correct': 0, 'total': 0})
        ds_stats['total'] += 1
        if passed:
            ds_stats['correct'] += 1

        # Show the first three queries, plus any failure among the first eight.
        if i < 3 or (not passed and i < 8):
            pred_short = json.dumps(predicted)[:80] if predicted else '(none)'
            status = 'PASS' if passed else 'FAIL(%s)' % reason
            print(' [%d] %s | %s -> %s' % (i, status, gt_fn, pred_short))

        if (i + 1) % 25 == 0:
            # Denominator is the planned query count, not the running total.
            print(' ... %d/%d done (%.0f%% correct)'
                  % (i + 1, len(queries), 100 * correct / total))

    accuracy = correct / total if total else 0
    avg_time = total_time / total if total else 0

    print('\n Overall: %d/%d correct (%.1f%%)' % (correct, total, 100 * accuracy))
    print(' Errors: %s' % json.dumps(errors))
    print(' Avg time: %.2fs' % avg_time)
    print('\n By dataset:')
    for ds, r in sorted(by_dataset.items()):
        ds_acc = r['correct'] / r['total'] if r['total'] else 0
        print(' %-15s %d/%d (%.0f%%)' % (ds, r['correct'], r['total'], 100 * ds_acc))

    return {
        'model': model,
        'correct': correct,
        'total': total,
        'accuracy': round(accuracy, 4),
        'errors': errors,
        'avg_time': round(avg_time, 2),
        'avg_tokens': round(total_tokens / total) if total else 0,
        'by_dataset': by_dataset,
    }


def main():
    """Parse CLI arguments, run the evaluation and save the summary as JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', required=True)
    parser.add_argument('--backend', default='ollama',
                        choices=['ollama', 'llama-cpp', 'mlx-vlm'])
    parser.add_argument('--limit', type=int, default=80, help='Max queries (0=all)')
    args = parser.parse_args()

    queries, api_by_ds = load_data()
    limit = args.limit if args.limit > 0 else len(queries)
    result = run_eval(args.model, queries, api_by_ds, limit, backend=args.backend)

    # Save; ':' and '/' in model names are not safe in file names.
    model_tag = args.model.replace(':', '_').replace('/', '_')
    os.makedirs(RESULTS_DIR, exist_ok=True)
    backend_suffix = '_%s' % args.backend if args.backend != 'ollama' else ''
    outfile = os.path.join(RESULTS_DIR, 'nexusraven_%s%s.json' % (model_tag, backend_suffix))
    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print('\nSaved to: %s' % outfile)


if __name__ == '__main__':
    main()