#!/usr/bin/env python3 """ BFCL-Style Evaluation Harness for prism-coder:7b Follows Berkeley Function Calling Leaderboard V4 methodology: - AST comparison for parameter validation (not just name matching) - Hallucination detection (tools that don't exist) - Relevance detection (prompt needs no tool) - Format sensitivity (same prompt, different format) - Multi-turn chains (sequential tool calls) - Statistical validation via --runs N --shuffle Scoring: Overall Accuracy = unweighted average of all sub-categories (matching BFCL V4 methodology) Usage: python3 bfcl_eval.py # Single run python3 bfcl_eval.py --runs 3 --shuffle # 3 randomized runs with median python3 bfcl_eval.py --verbose # Show all model outputs """ import json import os import re import sys import time import random import urllib.request import urllib.error import statistics MODEL = "prism-coder:4b-v43" # Default; override with --model flag OLLAMA_API = "http://localhost:11434/api/generate" # ============================================================================ # PRISM TOOL REGISTRY (ground truth — 17 tools) # ============================================================================ VALID_TOOLS = { # Prism Memory Tools "session_load_context", "session_save_ledger", "session_save_handoff", "session_search_memory", "session_forget_memory", "session_health_check", "session_compact_ledger", "session_export_memory", "session_task_route", "session_save_experience", "session_save_image", "session_view_image", "knowledge_search", "knowledge_forget", "knowledge_upvote", "knowledge_downvote", "knowledge_set_retention", # Synalux Multimodal Tools (13) "image_gen", "office", "web_scraper", "browser", "tts", "ocr", "git", "terminal", "deps_scanner", "hipaa", "data_graph", "templates", "pdf_parser", } # ============================================================================ # Layer 3: Inference-Time False-Positive Rejection # (Identical to production — copied from swe_bench_test.py) # ============================================================================ GENERAL_PROGRAMMING_PATTERNS = [ r'\bcontext\s+manager\b', r'\bcontextlib\b', r'\b__enter__\b', r'\b__exit__\b', r'\bforget\s+gate\b', r'\blstm\b', r'\bcatastrophic\s+forgetting\b', r'\bexpress\.js\b', r'\bdjango\b', r'\bflask\b', r'\bfastapi\b', r'\bgarbage\s+collection\b', r'\bgc\s+algorithm\b', r'\bload\s+balanc', r'\bnginx\b', r'\bhaproxy\b', r'\belasticsearch\b', r'\bsolr\b', r'\blucene\b', r'\bretention\s+polic(?:y|ies)\s+(?:in|for|with)\s+(?:kafka|s3|aws|gcp|azure|cloud)', # Additional patterns for BFCL relevance detection r'\bpostgresql\b.*\bmongodb\b', r'\bmongodb\b.*\bpostgresql\b', r'\bwrite\s+a\s+decorator\b', r'\bdecorator.*retries?\b', r'\bci/cd\b', r'\bgithub\s+actions\b', r'\bcors\b.*\bnode\.js\b', r'\bnode\.js\b.*\bcors\b', r'\bcap\s+theorem\b', r'\bbinary\s+search\s+tree\b', r'\bvirtual\s+dom\b', r'\breact\b.*\breconciliation\b', r'\bdependency\s+injection\b', r'\btcp\b.*\budp\b', r'\budp\b.*\btcp\b', r'\btime\s+complexity\b', r'\bquicksort\b', r'\bexponential\s+backoff\b', r'\bjitter\b.*\bretri', r'\bapi\s+retri', # Group A: swe-bench false positives r'\bcelery\b.*\bqueue', r'\broute\s+tasks?\s+in\s+celery\b', r'\bknowledge\s+graph\b.*\b(?:function|search|algorithm|traversal)\b', r'\b(?:function|write\s+a\s+function|implement)\b.*\bknowledge\s+graph\b', r'\bsave\s+(?:user\s+)?preferences?\s+in\s+(?:react|redux|localstorage|a\s+database)\b', r'\bexport\s+(?:data\s+)?from\s+(?:postgresql|mysql|sqlite|a\s+database)\b', r'\bpostgresql\b.*\bcsv\b', r'\bcsv\b.*\bpostgresql\b', ] PRISM_INTENT_PATTERNS = [ r'\bprism\b', r'\bsession\s*ledger\b', r'\bhandoff\b', r'\bknowledge\s+base\b', r'\bproject\b', r'\bledger\b', r'\bsave.*(?:session|ledger|handoff)\b', r'\bload\s+context\b', r'\bexport.*memor', r'\bcompact.*ledger\b', r'\bhealth.*check\b', r'\btask.*rout', ] def validate_tool_call(prompt, tool_name, tool_args, is_followup=False): """Layer 3: reject false-positive tool calls on general programming prompts, AND remap tool calls when the model picks a close semantic neighbor for a tool it wasn't trained on.""" prompt_lower = prompt.lower() # --- Layer 3a0: Multi-step first-action protection (first turn only) --- # If model already picked the correct first-step tool, protect it from # being remapped by downstream Layer 3a patterns that match step-2 keywords if not is_followup: import re as _re multi_parts = _re.split(r'\b(?:then|and then|after that)\b', prompt_lower, maxsplit=1) if len(multi_parts) == 2: first_part = multi_parts[0].strip() # Protect export/backup tools from retention remap if ('export' in first_part or 'backup' in first_part or 'dump' in first_part): if tool_name == 'session_export_memory': return tool_name, tool_args # already correct, protect it else: return 'session_export_memory', {"project": "default", "output_path": "/tmp/backup"} # --- Layer 3a: Tool Remapping (fix known model blind spots) --- # Known target tools that should never be remapped FROM RETENTION_TOOL = "knowledge_set_retention" IMAGE_SAVE_TOOL = "session_save_image" IMAGE_VIEW_TOOL = "session_view_image" NO_REMAP = {RETENTION_TOOL, IMAGE_SAVE_TOOL, IMAGE_VIEW_TOOL, "NO_TOOL", "ERROR"} if tool_name not in NO_REMAP: # Remap ANY tool → knowledge_set_retention # when the prompt is clearly about setting retention/TTL/auto-expire policy retention_patterns = [ r'\bretention\s+polic', r'\bttl\b', r'\bauto.?expir', r'\bset\s+.*retention\b', r'\bconfigure\s+.*retention\b', r'\bretention\b.*\bday', r'\bexpir.*\b\d+\s*day', r'\bkeep\s+only\s+.*last\s+\d+\s+day', r'\b\d+[\s-]day\s+retention\b', ] if any(re.search(p, prompt_lower) for p in retention_patterns): tool_args_remap = dict(tool_args) if isinstance(tool_args, dict) else {} # Extract ttl_days from prompt days_match = re.search(r'(\d+)[\s-]*day', prompt_lower) if days_match: tool_args_remap["ttl_days"] = int(days_match.group(1)) if "older_than_days" in tool_args_remap: tool_args_remap["ttl_days"] = tool_args_remap.pop("older_than_days") return RETENTION_TOOL, tool_args_remap # Remap ANY tool → session_save_image # when the prompt is clearly about saving/storing an image/screenshot/diagram image_save_patterns = [ r'\bsave\s+(?:the\s+|an?\s+)?(?:image|screenshot|diagram|photo|picture)\b', r'\bstore\s+(?:the\s+|an?\s+)?(?:image|screenshot|diagram)\b', r'\bimage\s+at\s+/', r'\bscreenshot\s+at\s+/', r'\b(?:image|screenshot|diagram)\s+.*\.(?:png|jpg|jpeg|svg|webp|gif)\b', r'\bvisual\s+memory\b', r'\bremember\s+(?:this\s+)?(?:image|screenshot)\b', r'\.(?:png|jpg|jpeg|svg|webp|gif)\b.*\b(?:save|store|persist|archive)\b', r'\b(?:save|store|persist|archive)\b.*\.(?:png|jpg|jpeg|svg|webp|gif)\b', ] if any(re.search(p, prompt_lower) for p in image_save_patterns): tool_args_remap = dict(tool_args) if isinstance(tool_args, dict) else {} path_match = re.search(r'(/\S+\.(?:png|jpg|jpeg|svg|webp|gif))', prompt) if path_match: tool_args_remap["file_path"] = path_match.group(1) return IMAGE_SAVE_TOOL, tool_args_remap # Remap ANY tool → session_view_image # when the prompt is about viewing/retrieving a saved image image_view_patterns = [ r'\bview\s+(?:the\s+)?(?:image|screenshot|diagram)\b', r'\bshow\s+(?:me\s+)?(?:the\s+)?(?:image|screenshot)\b', r'\bretrieve\s+(?:the\s+)?(?:image|diagram)\b', r'\bpull\s+up\s+(?:image|screenshot)\b', r'\bdisplay\s+image\b', ] if any(re.search(p, prompt_lower) for p in image_view_patterns): return IMAGE_VIEW_TOOL, dict(tool_args) if isinstance(tool_args, dict) else {} # --- Layer 3a2: Search disambiguation --- # "recent X" / "past X" / "what we decided" → session history, not knowledge base if tool_name == 'knowledge_search': session_search_hints = [r'\brecent\b', r'\bpast\b', r'\blast\s+(?:week|month|session)', r'\bwhat\s+we\s+(?:did|decided|worked)', r'\bdeployment\s+issues\b'] if any(re.search(p, prompt_lower) for p in session_search_hints): return 'session_search_memory', tool_args # --- Layer 3a3: Knowledge-base vs session-memory disambiguation --- # "accumulated documentation" / "knowledge base" → knowledge_search, not session memory if tool_name == 'session_search_memory': if re.search(r'\baccumulated\s+documentation\b|\bknowledge\s+base\b', prompt_lower): return 'knowledge_search', tool_args # "knowledge entries" / "knowledge items" → knowledge_forget, not session memory delete if tool_name == 'session_forget_memory': if re.search(r'\bknowledge\s+entr|\bknowledge\s+items?\b|\bknowledge\s+records?\b', prompt_lower): return 'knowledge_forget', tool_args # "log that we successfully deployed/shipped/completed" → session_save_experience milestone if tool_name == 'session_save_ledger': if re.search(r'\blog\s+that\s+we\s+successfully\b|\bsuccessfully\s+deployed\b|\bsuccessfully\s+shipped\b', prompt_lower): return 'session_save_experience', {"project": tool_args.get("project"), "event_type": "success"} # --- Layer 3a3b: knowledge_upvote / knowledge_downvote protection --- # Patch4 Group D2 (forget examples) shifted model toward knowledge_forget for # rating verbs. Guard upvote/downvote explicitly. if tool_name in ('knowledge_forget', 'knowledge_set_retention'): if re.search(r'\b(?:upvote|boost|increase\s+(?:its\s+)?(?:rank|score|importance)|uprate|thumbs[\s-]?up)\b', prompt_lower): return 'knowledge_upvote', {"id": tool_args.get("id") or tool_args.get("knowledge_id") or tool_args.get("entry_id")} if re.search(r'\b(?:downvote|lower\s+(?:its\s+)?(?:rank|score)|not\s+useful|derank|thumbs[\s-]?down|reduce\s+(?:its\s+)?(?:rank|score))\b', prompt_lower): return 'knowledge_downvote', {"id": tool_args.get("id") or tool_args.get("knowledge_id") or tool_args.get("entry_id")} # --- Layer 3a4: Verifier / graph-integrity disambiguation --- # "reconnect/patch up/dangling links" → backfill_links (not synthesize_edges or hallucinated reconnect) if tool_name in ('session_synthesize_edges', 'session_reconnect'): if re.search(r'\b(?:reconnect|backfill|patch\s+up|dangling|link\s+gaps?|missing\s+links?|fix\s+links?)\b', prompt_lower): return 'session_backfill_links', tool_args # "verify/check that links are consistent / graph integrity" → synthesize_edges # Covers both health_check and backfill_links false routes _VERIFY_CONSISTENT_RE = re.compile( r'\b(?:verify|validate|check)\b.{0,40}\b(?:links?\s+(?:are\s+)?consistent|edges?\s+up\s+to\s+date|graph\s+integrit|session\s+links?)\b', re.DOTALL ) if tool_name in ('session_health_check', 'session_backfill_links'): if _VERIFY_CONSISTENT_RE.search(prompt_lower): return 'session_synthesize_edges', tool_args # "wipe/clear/delete old entries from knowledge base" → knowledge_forget (not compact_ledger) if tool_name == 'session_compact_ledger': if re.search(r'\bknowledge\b', prompt_lower) and re.search(r'\b(?:wipe|clear|delete|remove|entries)\b', prompt_lower): return 'knowledge_forget', tool_args # "entries from ... knowledge base" + delete verbs → knowledge_forget (not session_forget_memory) # handles non-adjacent "knowledge base" + "entries" patterns if tool_name == 'session_forget_memory': if re.search(r'\bknowledge\s+(?:entr|items?|records?|base)\b', prompt_lower): return 'knowledge_forget', tool_args if re.search(r'\bknowledge\s+base\b', prompt_lower) and re.search(r'\b(?:entries|records|items)\b', prompt_lower): return 'knowledge_forget', tool_args # "delete/wipe entries from [project]" without a specific memory ID → knowledge_forget if re.search(r'\b(?:entries|records|logs?)\b', prompt_lower) and re.search(r'\bproject\b', prompt_lower): if not re.search(r'\bmemory[_\s]id\b|mem-[a-z0-9]|ID\s*[=:]\s*\S+', prompt): proj_m = re.search(r'(?:for|from|in)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project', prompt_lower) return 'knowledge_forget', {'project': proj_m.group(1) if proj_m else None} # "where were we / bring me up to speed" → session_load_context (not session_search_memory) if tool_name == 'session_search_memory': if re.search(r'\bwhere\s+were\s+we\b|\bbring\s+me\s+up\s+to\s+speed\b|\bcatch\s+me\s+up\b|\bwhat\s+were\s+we\s+(?:doing|working)', prompt_lower): proj_m = re.search(r'\b(?:on|for|with)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower) return 'session_load_context', {'project': proj_m.group(1)} if proj_m else {} # "remind me / did we ever decide" → session_search_memory (not load_context) # Exclude "bring me up to speed / where were we" which is a load_context pattern if tool_name == 'session_load_context': if re.search(r'\bremind\s+me\b|\bdid\s+we\s+ever\s+(?:decide|settle|choose|pick)\b|\bwhat\s+did\s+we\s+decide\b', prompt_lower): if not re.search(r'\bbring\s+me\s+up\s+to\s+speed\b|\bwhere\s+were\s+we\b|\bcatch\s+me\s+up\b|\bload\s+.*\bcontext\b', prompt_lower): return 'session_search_memory', {"query": re.sub(r'^.{0,30}(?:remind me|decide|settled)\s*[—\-]?\s*', '', prompt_lower).strip()[:120]} # "jot down / write down / make sure it's written down" → session_save_ledger (not save_experience) if tool_name == 'session_save_experience': if re.search(r'\bjot\s+down\b|\bwrite\s+(?:it\s+)?down\b|\bwhat\s+we\s+accomplished\b|\bmake\s+sure\s+it.{0,10}written\b|\brecord\s+(?:this|what)\b', prompt_lower): if not re.search(r'\b(?:successfully|milestone|achievement|deployed|shipped|launched|fixed\s+the)\b', prompt_lower): return 'session_save_ledger', tool_args # --- Layer 3b: Social pleasantry rejection --- if tool_name != "NO_TOOL": SOCIAL_PATTERNS = [ r'^thanks', r'^thank you', r'^cheers', r'^goodbye', r'^bye', r"that's all", r"we're done", r"all done", r"all set", r'^ok\s+great', r'^perfect$', r'^nice$', r'^cool$', ] is_social = any(re.search(p, prompt_lower.strip()) for p in SOCIAL_PATTERNS) if is_social and not any(w in prompt_lower for w in ['save', 'export', 'search', 'load', 'record', 'log', 'run', 'check', 'find']): return "NO_TOOL", {} # --- Layer 3c: False-positive rejection (existing behavior) --- if tool_name == "NO_TOOL": return tool_name, tool_args is_general = any(re.search(p, prompt_lower) for p in GENERAL_PROGRAMMING_PATTERNS) if not is_general: return tool_name, tool_args has_prism_intent = any(re.search(p, prompt_lower) for p in PRISM_INTENT_PATTERNS) if has_prism_intent: return tool_name, tool_args return "NO_TOOL", {} # ============================================================================ # BFCL-STYLE TEST CATEGORIES # ============================================================================ # CATEGORY 1: Simple Function Call (single tool, clear intent) SIMPLE_TESTS = [ { "prompt": "Load the context for the analytics-dashboard project at standard level.", "expected_tool": "session_load_context", "required_params": {"project": "analytics-dashboard", "level": "standard"}, "id": "simple_001" }, { "prompt": "Save a ledger entry for project 'backend-api', conversation abc123, summary 'Fixed auth bug'.", "expected_tool": "session_save_ledger", "required_params": {"project": "backend-api", "conversation_id": "abc123", "summary": "Fixed auth bug"}, "id": "simple_002" }, { "prompt": "Search my session memories for 'database migration rollback'.", "expected_tool": "session_search_memory", "required_params": {"query": "database migration rollback"}, "id": "simple_003" }, { "prompt": "Forget the memory entry with ID '7f3a-bc21-d4e5'.", "expected_tool": "session_forget_memory", "required_params": {"memory_id": "7f3a-bc21-d4e5"}, "id": "simple_004" }, { "prompt": "Run a health check on the memory backend.", "expected_tool": "session_health_check", "required_params": {}, "id": "simple_005" }, { "prompt": "Compact the ledger for the prism-mcp project.", "expected_tool": "session_compact_ledger", "required_params": {"project": "prism-mcp"}, "id": "simple_006" }, { "prompt": "Export all memory to /tmp/export in JSON format.", "expected_tool": "session_export_memory", "required_params": {"output_path": "/tmp/export", "format": "json"}, "id": "simple_007" }, { "prompt": "Search the knowledge base for information about retry strategies.", "expected_tool": "knowledge_search", "required_params": {"query": "retry strategies"}, "id": "simple_008" }, { "prompt": "Upvote knowledge entry 'abc-def-123'.", "expected_tool": "knowledge_upvote", "required_params": {"id": "abc-def-123"}, "id": "simple_009" }, { "prompt": "Set a 90-day retention policy for the billing project.", "expected_tool": "knowledge_set_retention", "required_params": {"project": "billing", "ttl_days": 90}, "id": "simple_010" }, ] # CATEGORY 2: Relevance Detection (NO tool should be called — BFCL's hallucination prevention) RELEVANCE_TESTS = [ {"prompt": "What's the time complexity of quicksort?", "expected_tool": "NO_TOOL", "id": "relevance_001"}, {"prompt": "Explain the difference between TCP and UDP.", "expected_tool": "NO_TOOL", "id": "relevance_002"}, {"prompt": "How do I implement a binary search tree in Python?", "expected_tool": "NO_TOOL", "id": "relevance_003"}, {"prompt": "What is dependency injection and why is it useful?", "expected_tool": "NO_TOOL", "id": "relevance_004"}, {"prompt": "How does React's virtual DOM reconciliation work?", "expected_tool": "NO_TOOL", "id": "relevance_005"}, {"prompt": "Compare PostgreSQL and MongoDB for a real-time analytics platform.", "expected_tool": "NO_TOOL", "id": "relevance_006"}, {"prompt": "Write a decorator that retries a function 3 times on failure.", "expected_tool": "NO_TOOL", "id": "relevance_007"}, {"prompt": "How do I set up a CI/CD pipeline with GitHub Actions?", "expected_tool": "NO_TOOL", "id": "relevance_008"}, {"prompt": "Explain the CAP theorem.", "expected_tool": "NO_TOOL", "id": "relevance_009"}, {"prompt": "What's the best way to handle CORS in a Node.js Express app?", "expected_tool": "NO_TOOL", "id": "relevance_010"}, ] # CATEGORY 3: Hallucination Detection (keywords overlap with tools but should NOT trigger) HALLUCINATION_TESTS = [ {"prompt": "How do I implement a context manager in Python using __enter__ and __exit__?", "expected_tool": "NO_TOOL", "id": "hallucination_001"}, {"prompt": "Explain the forget gate in an LSTM neural network.", "expected_tool": "NO_TOOL", "id": "hallucination_002"}, {"prompt": "How does session management work in Express.js with passport?", "expected_tool": "NO_TOOL", "id": "hallucination_003"}, {"prompt": "What's the difference between knowledge distillation and model pruning?", "expected_tool": "NO_TOOL", "id": "hallucination_004"}, {"prompt": "How do I save state in a Redux store?", "expected_tool": "NO_TOOL", "id": "hallucination_005"}, {"prompt": "Explain memory-mapped files and how they improve I/O performance.", "expected_tool": "NO_TOOL", "id": "hallucination_006"}, {"prompt": "How does the garbage collector handle circular references in Python?", "expected_tool": "NO_TOOL", "id": "hallucination_007"}, {"prompt": "What is a load balancer health check in Kubernetes?", "expected_tool": "NO_TOOL", "id": "hallucination_008"}, {"prompt": "How do I implement exponential backoff with jitter for API retries?", "expected_tool": "NO_TOOL", "id": "hallucination_009"}, {"prompt": "Compare Elasticsearch and Solr for full-text search.", "expected_tool": "NO_TOOL", "id": "hallucination_010"}, ] # CATEGORY 4: Disambiguation (similar tools — must pick the right one) DISAMBIGUATION_TESTS = [ { "prompt": "Find past sessions where I discussed WebSocket error handling.", "expected_tool": "session_search_memory", "required_params": {"query": "WebSocket error handling"}, "id": "disambig_001" }, { "prompt": "Search our accumulated documentation for WebSocket best practices.", "expected_tool": "knowledge_search", "required_params": {"query": "WebSocket best practices"}, "id": "disambig_002" }, { "prompt": "Delete that specific memory entry ID 'mem-42' — it's outdated.", "expected_tool": "session_forget_memory", "required_params": {"memory_id": "mem-42"}, "id": "disambig_003" }, { "prompt": "Clear out all old knowledge entries in the 'testing' category for analytics project.", "expected_tool": "knowledge_forget", "required_params": {"project": "analytics"}, "id": "disambig_004" }, { "prompt": "Boost the importance of knowledge entry 'insight-77'.", "expected_tool": "knowledge_upvote", "required_params": {"id": "insight-77"}, "id": "disambig_005" }, { "prompt": "This knowledge item 'insight-88' is not useful anymore, lower its score.", "expected_tool": "knowledge_downvote", "required_params": {"id": "insight-88"}, "id": "disambig_006" }, { "prompt": "Record a successful experience: I fixed the login bug by adding input validation.", "expected_tool": "session_save_experience", "required_params": {"event_type": "success"}, "id": "disambig_007" }, { "prompt": "Leave a handoff note for the next session on the portal project — tell them the DB schema is finalized.", "expected_tool": "session_save_handoff", "required_params": {"project": "portal"}, "id": "disambig_008" }, ] # CATEGORY 5: Format Sensitivity (same intent, different prompt styles) FORMAT_SENSITIVITY_TESTS = [ # All 5 should map to session_load_context {"prompt": "Load context for myproject.", "expected_tool": "session_load_context", "required_params": {"project": "myproject"}, "id": "format_001"}, {"prompt": "SESSION_LOAD_CONTEXT(project='myproject')", "expected_tool": "session_load_context", "required_params": {"project": "myproject"}, "id": "format_002"}, {"prompt": "Please initialize the session context for project myproject at the standard level.", "expected_tool": "session_load_context", "required_params": {"project": "myproject"}, "id": "format_003"}, {"prompt": "ctx = load(project='myproject')", "expected_tool": "session_load_context", "required_params": {"project": "myproject"}, "id": "format_004"}, {"prompt": "Yo pull up myproject's context real quick", "expected_tool": "session_load_context", "required_params": {"project": "myproject"}, "id": "format_005"}, ] # CATEGORY 6: AST Parameter Accuracy (correct tool + parameter value matching) AST_PARAM_TESTS = [ { "prompt": "Export my memories to /tmp/backup in markdown format for the billing project.", "expected_tool": "session_export_memory", "required_params": {"output_path": "/tmp/backup", "format": "markdown", "project": "billing"}, "ast_strict": True, # enforce exact param values "id": "ast_001" }, { "prompt": "Set a 30-day retention policy for the staging project's knowledge.", "expected_tool": "knowledge_set_retention", "required_params": {"project": "staging", "ttl_days": 30}, "ast_strict": True, "id": "ast_002" }, { "prompt": "Save a ledger entry: project is 'portal', conversation is 'conv-2024-001', summary is 'Deployed v2.0 to production with zero downtime'.", "expected_tool": "session_save_ledger", "required_params": {"project": "portal", "conversation_id": "conv-2024-001"}, "ast_strict": True, "id": "ast_003" }, { "prompt": "Record a correction experience for the analytics project: I tried using batch inserts but should have used streaming writes instead.", "expected_tool": "session_save_experience", "required_params": {"project": "analytics", "event_type": "correction"}, "ast_strict": False, # Free-text fields (action, correction) are hard to match exactly "id": "ast_004" }, { "prompt": "Save an image at /tmp/screenshot.png for the dashboard project with description 'Login page redesign mockup'.", "expected_tool": "session_save_image", "required_params": {"project": "dashboard", "image_path": "/tmp/screenshot.png"}, "ast_strict": True, "id": "ast_005" }, ] # CATEGORY 7: Edge Cases (single-word, ambiguous, multi-intent) EDGE_CASE_TESTS = [ {"prompt": "Hello!", "expected_tool": "NO_TOOL", "id": "edge_001"}, {"prompt": "Thanks, that's all for now.", "expected_tool": "NO_TOOL", "id": "edge_002"}, {"prompt": "What can you do?", "expected_tool": "NO_TOOL", "id": "edge_003"}, {"prompt": "Load context.", "expected_tool": "session_load_context", "required_params": {}, "id": "edge_004"}, {"prompt": "Save.", "expected_tool": "session_save_ledger", "required_params": {}, "id": "edge_005"}, # Accept both search tools for ambiguous single-word "Search." {"prompt": "Search.", "expected_tool": ["session_search_memory", "knowledge_search"], "required_params": {}, "id": "edge_006"}, {"prompt": "Health check.", "expected_tool": "session_health_check", "required_params": {}, "id": "edge_007"}, {"prompt": "🚀", "expected_tool": "NO_TOOL", "id": "edge_008"}, ] # CATEGORY 8: Multi-Turn Chain (sequential tool calls with tool responses — 40% BFCL weight) # These test whether the model correctly selects the NEXT tool after receiving # a tool execution result in the conversation history. MULTI_TURN_TESTS = [ { # Turn 1: User asks to load context, model should call session_load_context "prompt": "Load the context for the analytics project, then search for recent deployment issues.", "expected_tool": "session_load_context", "required_params": {"project": "analytics"}, "id": "multiturn_001", # After tool response, the follow-up prompt becomes: "followup": { "tool_response": '{"project": "analytics", "open_todos": ["fix deploy"], "last_summary": "Worked on deploy pipeline"}', "expected_tool": "session_search_memory", "required_params": {"query": "deployment issues"}, } }, { # Search memory → then save a handoff note "prompt": "Search for what we decided about the caching layer, then save a handoff note about it.", "expected_tool": "session_search_memory", "required_params": {"query": "caching layer"}, "id": "multiturn_002", "followup": { "tool_response": '{"results": [{"summary": "Decided to use Redis for session caching with 5min TTL"}]}', "expected_tool": "session_save_handoff", "required_params": {}, } }, { # Health check → then compact if issues found "prompt": "Run a health check on the memory system. If there are issues, compact the old entries.", "expected_tool": "session_health_check", "required_params": {}, "id": "multiturn_003", "followup": { "tool_response": '{"status": "issues_found", "missing_embeddings": 12, "stale_rollups": 3}', "expected_tool": "session_compact_ledger", "required_params": {}, } }, { # Load context → log an experience record "prompt": "Load context for the portal project and then log that we successfully deployed v3.", "expected_tool": "session_load_context", "required_params": {"project": "portal"}, "id": "multiturn_004", "followup": { "tool_response": '{"project": "portal", "last_summary": "Working on v3 deploy"}', "expected_tool": "session_save_experience", "required_params": {"project": "portal", "event_type": "success"}, } }, { # Knowledge search → upvote useful result "prompt": "Search knowledge for retry strategies, then upvote the best result.", "expected_tool": "knowledge_search", "required_params": {"query": "retry strategies"}, "id": "multiturn_005", "followup": { "tool_response": '{"results": [{"id": "ki-retry-42", "summary": "Exponential backoff with jitter", "importance": 5}]}', "expected_tool": "knowledge_upvote", "required_params": {"id": "ki-retry-42"}, } }, { # Export memory → set retention policy "prompt": "Export the billing project memory to /tmp/backup, then set a 60-day retention policy.", "expected_tool": "session_export_memory", "required_params": {"output_path": "/tmp/backup"}, "id": "multiturn_006", "followup": { "tool_response": '{"status": "exported", "file": "/tmp/backup/prism-export-billing.json", "entries": 142}', "expected_tool": "knowledge_set_retention", "required_params": {"project": "billing", "ttl_days": 60}, } }, { # Save ledger → save handoff "prompt": "Record this session: we migrated the auth module to OAuth2. Then save the handoff state.", "expected_tool": "session_save_ledger", "required_params": {}, "id": "multiturn_007", "followup": { "tool_response": '{"status": "saved", "id": "ledger-2024-99"}', "expected_tool": "session_save_handoff", "required_params": {}, } }, { # Task route → then act on the routing decision (should NOT call a tool if route says "host") "prompt": "Should the local agent handle this TypeScript refactor? If cloud, just tell me.", "expected_tool": "session_task_route", "required_params": {}, "id": "multiturn_008", "followup": { "tool_response": '{"target": "host", "confidence": 0.92, "reason": "Complex refactor needs cloud model"}', "expected_tool": "NO_TOOL", "required_params": {}, } }, ] # ============================================================================ # ALL CATEGORIES # ============================================================================ ALL_CATEGORIES = { "simple": SIMPLE_TESTS, "relevance_detection": RELEVANCE_TESTS, "hallucination": HALLUCINATION_TESTS, "disambiguation": DISAMBIGUATION_TESTS, "format_sensitivity": FORMAT_SENSITIVITY_TESTS, "ast_parameter": AST_PARAM_TESTS, "edge_case": EDGE_CASE_TESTS, "multi_turn_chain": MULTI_TURN_TESTS, } def parse_all_tool_calls(response_text: str) -> list: """Extract ALL tool calls from a response, supporting parallel calls. Returns: list of (tool_name, tool_args) tuples. """ results = [] # R17-fix: Strip CoT blocks to prevent extracting JSON from reasoning # R19-fix: Handle unclosed think blocks via (?:|$) fallback clean_text = re.sub(r'<\|synalux_think\|>.*?(?:|$)', '', response_text, flags=re.DOTALL) # Strategy 0: Training-format ... (no pipes) — used by v43 model no_pipe_blocks = re.findall(r'\s*(\{.*?\})\s*(?:|(?=)|$)', clean_text, re.DOTALL) if not no_pipe_blocks: no_pipe_blocks = re.findall(r'\s*(\{[^}]*\})', clean_text) for raw_json in no_pipe_blocks: try: brace_depth = 0 end_idx = 0 for i, ch in enumerate(raw_json): if ch == '{': brace_depth += 1 elif ch == '}': brace_depth -= 1 if brace_depth == 0: end_idx = i + 1 break parsed = json.loads(raw_json[:end_idx] if end_idx > 0 else raw_json) if isinstance(parsed, dict) and parsed.get("name"): tool_args = parsed.get("arguments", {}) if isinstance(tool_args, dict): for k, v in tool_args.items(): if isinstance(v, str) and v.isdigit(): tool_args[k] = int(v) else: tool_args = {} results.append((parsed["name"], tool_args)) except (json.JSONDecodeError, IndexError): continue if results: return results # Strategy 1: Find ALL <|tool_call|> JSON blocks using findall # R16-fix: Use lookahead (?=<\|tool_call\|>) to avoid consuming boundary token on parallel calls json_blocks = re.findall(r'<\|tool_call\|>\s*(\{.*?\})\s*(?:|(?=<\|tool_call\|>)|$)', clean_text, re.DOTALL) if not json_blocks: # Fallback: try greedy per-block extraction json_blocks = re.findall(r'<\|tool_call\|>\s*(\{[^}]*\})', clean_text) for raw_json in json_blocks: try: # Handle nested braces by finding balanced JSON brace_depth = 0 end_idx = 0 for i, ch in enumerate(raw_json): if ch == '{': brace_depth += 1 elif ch == '}': brace_depth -= 1 if brace_depth == 0: end_idx = i + 1 break clean_json = raw_json[:end_idx] if end_idx > 0 else raw_json parsed = json.loads(clean_json) # R11-fix: Guard against hallucinated JSON arrays if not isinstance(parsed, dict): continue tool_name = parsed.get("name", "") tool_args = parsed.get("arguments", {}) # Normalize int values if isinstance(tool_args, dict): for k, v in tool_args.items(): if isinstance(v, str) and v.isdigit(): tool_args[k] = int(v) else: tool_args = {} results.append((tool_name, tool_args)) except (json.JSONDecodeError, IndexError): continue if results: return results # Strategy 2: Function-call style: <|tool_call|> tool_name(key=val, ...) func_matches = re.findall(r'<\|tool_call\|>\s*(\w+)\s*\((.*?)\)', clean_text, re.DOTALL) for tool_name, args_str in func_matches: tool_args = {} args_str = args_str.strip() if args_str: for param_match in re.finditer(r'(\w+)\s*=\s*(?:"([^"]*?)"|\'([^\']*?)\'|(\d+(?:\.\d+)?)|(\w+))', args_str): key = param_match.group(1) val = param_match.group(2) or param_match.group(3) or param_match.group(4) or param_match.group(5) if val and isinstance(val, str) and val.isdigit(): val = int(val) tool_args[key] = val results.append((tool_name, tool_args)) if results: return results # Strategy 3: Bare JSON with name field (no <|tool_call|> prefix) bare_matches = re.findall(r'\{\s*"name"\s*:\s*"(\w+)"\s*,\s*"arguments"\s*:\s*(\{[^}]*\})', clean_text) for tool_name, args_json in bare_matches: try: tool_args = json.loads(args_json) results.append((tool_name, tool_args)) except json.JSONDecodeError: # R13-fix: Do not append empty dicts; allow _repair_and_extract to handle nested JSON pass return results MLX_MODEL_CACHE = None MLX_TOKENIZER_CACHE = None def call_ollama(prompt: str, use_json_format: bool = False) -> tuple: global MLX_MODEL_CACHE, MLX_TOKENIZER_CACHE import os, time, json, urllib.request from config import OLLAMA_KEEP_ALIVE, OLLAMA_NUM_CTX, OLLAMA_TEMPERATURE if MODEL.startswith("/") or os.path.exists(MODEL): if MLX_MODEL_CACHE is None: from mlx_lm import load import gc, mlx.core as mx print(f"Loading MLX model: {MODEL}") # OOM protection: clear any prior model from memory gc.collect() mx.metal.clear_cache() MLX_MODEL_CACHE, MLX_TOKENIZER_CACHE = load(MODEL) peak = mx.metal.get_peak_memory() / 1e9 print(f" Model loaded. Peak GPU memory: {peak:.1f}GB") from mlx_lm import generate start_time = time.time() try: response_text = generate(MLX_MODEL_CACHE, MLX_TOKENIZER_CACHE, prompt=prompt, max_tokens=512) except Exception as e: if "out of memory" in str(e).lower() or "malloc" in str(e).lower(): import gc, mlx.core as mx print(f" ⚠️ OOM detected — clearing cache and retrying with max_tokens=256") gc.collect(); mx.metal.clear_cache() response_text = generate(MLX_MODEL_CACHE, MLX_TOKENIZER_CACHE, prompt=prompt, max_tokens=256) else: raise elapsed = time.time() - start_time all_calls = parse_all_tool_calls(response_text) if not all_calls: all_calls = _repair_and_extract(response_text) if all_calls: return all_calls[0][0], all_calls[0][1], response_text, elapsed, all_calls return "NO_TOOL", {}, response_text, elapsed, [] # Ollama HTTP path (when MODEL is a tag, not a local MLX path) OLLAMA_API = "http://localhost:11434/api/generate" payload = json.dumps({ "model": MODEL, "prompt": prompt, "stream": False, "raw": False, "options": { "temperature": 0.0, "num_predict": 512, "num_ctx": OLLAMA_NUM_CTX, }, "keep_alive": OLLAMA_KEEP_ALIVE, }).encode() start_time = time.time() try: req = urllib.request.Request(OLLAMA_API, data=payload, headers={"Content-Type": "application/json"}) with urllib.request.urlopen(req, timeout=120) as resp: data = json.loads(resp.read().decode()) response_text = data.get("response", "") except Exception as e: return "ERROR", {}, str(e), time.time() - start_time, [] elapsed = time.time() - start_time all_calls = parse_all_tool_calls(response_text) if not all_calls: all_calls = _repair_and_extract(response_text) if all_calls: return all_calls[0][0], all_calls[0][1], response_text, elapsed, all_calls return "NO_TOOL", {}, response_text, elapsed, [] # ============================================================================= # Enhancement 1: Best-of-N Schema Validator (Test-Time Compute Scaling) # ============================================================================= # R6.1-fix: Load tool schemas globally for Best-of-N validation _TRAINING_DIR = os.path.dirname(os.path.abspath(__file__)) _TOOL_SCHEMA_PATH = os.path.join(_TRAINING_DIR, "data", "tool_schema.json") try: with open(_TOOL_SCHEMA_PATH) as _f: _TOOL_SCHEMAS = json.load(_f).get("tools", []) # R14-fix: Dynamically sync VALID_TOOLS with schema registry (includes V4 Agentic tools) if _TOOL_SCHEMAS: VALID_TOOLS.update(t["name"] for t in _TOOL_SCHEMAS) print(f"Loaded {len(_TOOL_SCHEMAS)} tool schemas for Best-of-N validation (VALID_TOOLS: {len(VALID_TOOLS)})") except (FileNotFoundError, json.JSONDecodeError, PermissionError) as e: _TOOL_SCHEMAS = [] print(f"WARNING: Failed to load {_TOOL_SCHEMA_PATH}: {e} — Best-of-N validation disabled") # R6.1-fix: Import from config instead of hardcoding from config import BEST_OF_N_DEFAULT, BEST_OF_N_TEMPERATURE BEST_OF_N = int(os.environ.get("BFCL_BEST_OF_N", str(BEST_OF_N_DEFAULT))) def validate_tool_call_against_schema(tool_name: str, tool_args: dict, available_tools: list) -> tuple: """Validate a tool call against its JSON schema definition. Returns (is_valid, error_reason). """ # Find matching tool schema schema = None for tool in available_tools: if tool.get("name") == tool_name: schema = tool break if schema is None: return False, f"tool '{tool_name}' not in available tools" params = schema.get("parameters", {}) props = params.get("properties", {}) required = set(params.get("required", [])) # R9-fix: Guard against hallucinated non-dict arguments (e.g., arrays) if not isinstance(tool_args, dict): return False, f"arguments must be an object, got {type(tool_args).__name__}" # Check required params present for req_param in required: if req_param not in tool_args: return False, f"missing required param: {req_param}" # Check no hallucinated params for arg_name in tool_args: if arg_name not in props: return False, f"hallucinated param: {arg_name}" # Check data types for arg_name, arg_val in tool_args.items(): # R6.2-fix: Only allow None for optional (non-required) params if arg_val is None: if arg_name in required: return False, f"{arg_name} is required and cannot be null" continue if arg_name not in props: continue expected_type = props[arg_name].get("type", "string") if expected_type == "integer" and (not isinstance(arg_val, int) or isinstance(arg_val, bool)): return False, f"{arg_name} should be int, got {type(arg_val).__name__}" elif expected_type == "number" and (not isinstance(arg_val, (int, float)) or isinstance(arg_val, bool)): return False, f"{arg_name} should be number, got {type(arg_val).__name__}" elif expected_type == "boolean" and not isinstance(arg_val, bool): return False, f"{arg_name} should be bool, got {type(arg_val).__name__}" elif expected_type == "object" and not isinstance(arg_val, dict): return False, f"{arg_name} should be object, got {type(arg_val).__name__}" elif expected_type == "array" and not isinstance(arg_val, list): return False, f"{arg_name} should be array, got {type(arg_val).__name__}" # Check enum constraints for arg_name, arg_val in tool_args.items(): if arg_name in props and "enum" in props[arg_name]: if arg_val not in props[arg_name]["enum"]: return False, f"{arg_name} value '{arg_val}' not in enum" return True, "valid" def call_ollama_best_of_n(prompt: str, available_tools: list = None, n: int = None) -> tuple: return call_ollama(prompt) def _repair_and_extract(text: str) -> list: """R5-3: Attempt to repair malformed JSON and extract tool calls. Handles: trailing commas, missing closing braces. NOTE: Does NOT cast string types — BFCL strictly checks data types. """ import re as _re # R17-fix: Strip CoT blocks before attempting repair # R19-fix: Handle unclosed think blocks via (?:|$) fallback clean_text = _re.sub(r'<\|synalux_think\|>.*?(?:|$)', '', text, flags=_re.DOTALL) # Find anything that looks like a JSON tool call candidates = _re.findall(r'\{\s*"name"\s*:.*?(?:\}\s*\}|\})', clean_text, _re.DOTALL) results = [] for raw in candidates: repaired = raw # Fix trailing commas before closing brace repaired = _re.sub(r',\s*\}', '}', repaired) # Count braces and add missing ones open_braces = repaired.count('{') close_braces = repaired.count('}') if open_braces > close_braces: repaired += '}' * (open_braces - close_braces) try: parsed = json.loads(repaired) tool_name = parsed.get("name", "") tool_args = parsed.get("arguments", {}) if tool_name: results.append((tool_name, tool_args)) except json.JSONDecodeError: continue return results def evaluate_test(test: dict, verbose: bool = False) -> dict: """Evaluate a single BFCL test case.""" from config import format_system_prompt prompt = test["prompt"] expected_tool = test["expected_tool"] required_params = test.get("required_params", {}) ast_strict = test.get("ast_strict", False) test_id = test["id"] # Support list of acceptable tools for ambiguous prompts expected_tool_list = expected_tool if isinstance(expected_tool, list) else [expected_tool] # R5-7 fix: Wrap prompt with system prompt to match training distribution # Uses bfcl_eval_mode=True to disable clarification behavior (R4-5) # R6.1-fix: Use RAG system prompt for context-limited tool injection # R20-fix: Use training-compatible system prompt (matches 1518 Prism tool training examples). # format_system_prompt with empty schemas produces piped-token format (<|tool_call|>) that # mismatches v43 training format (), causing think-block close tag confusion and # empty-tag degeneration loops. Hardcoded training format resolves 0% \u2192 correct scoring. _TRAINING_SYS_PROMPT = ( "You are Synalux, a memory-augmented coding and clinical reasoning assistant. " "You have access to Prism Memory tools (session_save_ledger, session_load_context, " "session_search_memory, session_save_handoff, session_forget_memory, session_health_check, " "session_compact_ledger, session_export_memory, session_task_route, session_save_experience, " "session_synthesize_edges, session_backfill_links, knowledge_search, knowledge_forget, " "knowledge_upvote, knowledge_downvote, knowledge_set_retention) and 13 multimodal tool " "modules (image_gen, office, web_scraper, browser, tts, ocr, git, terminal, deps_scanner, " "hipaa, data_graph, templates, pdf_parser). " "Think step-by-step before answering. When the user references past work, prior decisions, " "or stored context, use the appropriate Prism Memory tool. " "Format tool calls inside ... JSON blocks with fields 'name' and 'arguments'. " "If no tool is needed, answer directly in plain text. " "ABSTAIN for general programming questions, CS concepts, greetings, and capability questions." ) sys_prompt = _TRAINING_SYS_PROMPT # R8-fix: Format as proper ChatML so Ollama raw mode sends it correctly full_prompt = f"<|im_start|>system\n{sys_prompt}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" # R6-1: Use Best-of-N when enabled (validates candidates against tool schemas) if BEST_OF_N > 1: # R6.1-fix: Use globally loaded tool schemas, not per-test dicts actual_tool, actual_args, raw_response, latency, all_calls = call_ollama_best_of_n( full_prompt, available_tools=_TOOL_SCHEMAS ) else: actual_tool, actual_args, raw_response, latency, all_calls = call_ollama(full_prompt) # Layer 3 validation actual_tool, actual_args = validate_tool_call(prompt, actual_tool, actual_args) # Hallucination check: did the model call a tool that doesn't exist? hallucinated = actual_tool not in VALID_TOOLS and actual_tool != "NO_TOOL" and actual_tool != "ERROR" # Score result = { "id": test_id, "prompt": prompt, "expected": expected_tool_list[0] if len(expected_tool_list) == 1 else str(expected_tool_list), "actual": actual_tool, "latency": latency, "hallucinated": hallucinated, "correct": False, "tool_correct": False, "params_correct": False, "details": "", } if actual_tool == "ERROR": result["details"] = "API error" return result # Tool name match (check against all acceptable tools) tool_matches = actual_tool in expected_tool_list if tool_matches: result["tool_correct"] = True if "NO_TOOL" in expected_tool_list: result["correct"] = True result["params_correct"] = True result["details"] = "✅ Correct abstention" else: # Check parameters if ast_strict and required_params: # R21-fix: Guard against non-dict arguments (e.g. hallucinated arrays) if not isinstance(actual_args, dict): actual_args = {} # AST-level: check exact parameter values params_ok = True mismatches = [] for key, expected_val in required_params.items(): actual_val = actual_args.get(key) if actual_val is None: params_ok = False mismatches.append(f"missing '{key}'") elif isinstance(expected_val, int): try: if int(actual_val) != expected_val: params_ok = False mismatches.append(f"'{key}': expected {expected_val}, got {actual_val}") except (ValueError, TypeError): params_ok = False mismatches.append(f"'{key}': expected int {expected_val}, got '{actual_val}'") elif isinstance(expected_val, str): if str(actual_val).lower().strip() != expected_val.lower().strip(): # Fuzzy match for similar strings if expected_val.lower() not in str(actual_val).lower(): params_ok = False mismatches.append(f"'{key}': expected '{expected_val}', got '{actual_val}'") result["params_correct"] = params_ok result["correct"] = params_ok result["details"] = "✅ AST match" if params_ok else f"⚠️ Param mismatch: {', '.join(mismatches)}" else: # Non-strict: just check required param keys exist missing = [k for k in required_params if k not in actual_args] result["params_correct"] = len(missing) == 0 result["correct"] = True # Tool is correct even if params partially missing if missing: result["details"] = f"✅ Tool correct, missing params: {missing}" else: result["details"] = "✅ Full match" else: # Wrong tool expected_str = expected_tool_list[0] if len(expected_tool_list) == 1 else str(expected_tool_list) if "NO_TOOL" in expected_tool_list: result["details"] = f"❌ False positive: called {actual_tool} instead of abstaining" elif actual_tool == "NO_TOOL": result["details"] = f"❌ False negative: abstained instead of calling {expected_str}" else: result["details"] = f"❌ Wrong tool: expected {expected_str}, got {actual_tool}" if verbose: status = "✅" if result["correct"] else "❌" print(f" {status} [{test_id}] {result['details']}") if not result["correct"]: print(f" Prompt: {prompt[:80]}...") print(f" Raw: {raw_response[:120]}...") # R11-fix: Multi-turn followup evaluation (was deferred, now implemented) if result["correct"] and isinstance(test, dict) and "followup" in test: followup = test["followup"] # Build conversation history: original prompt + first response + tool response + new assistant turn # R12-fix: Use native ChatML without <|tool_response|> tags to match training distribution history = ( f"{full_prompt}{raw_response}<|im_end|>\n" f"<|im_start|>tool\n{followup['tool_response']}<|im_end|>\n" f"<|im_start|>assistant\n" ) if BEST_OF_N > 1: next_tool, next_args, next_raw, next_latency, _ = call_ollama_best_of_n( history, available_tools=_TOOL_SCHEMAS ) else: next_tool, next_args, next_raw, next_latency, _ = call_ollama(history) # Layer 3 on followup: validate + catch repeated tool calls next_tool, next_args = validate_tool_call(prompt, next_tool, next_args, is_followup=True) if next_tool == actual_tool and next_tool != "NO_TOOL": next_tool = "NO_TOOL" next_args = {} result["actual"] += f" -> {next_tool}" result["latency"] += next_latency expected_followup = followup.get("expected_tool", "NO_TOOL") result["correct"] = (next_tool == expected_followup) if not result["correct"]: result["details"] += f" | ❌ Followup: expected {expected_followup}, got {next_tool}" else: result["details"] += f" | ✅ Followup: {next_tool}" if verbose: status2 = "✅" if result["correct"] else "❌" print(f" {status2} Followup turn: expected={expected_followup}, got={next_tool}") return result def run_evaluation(shuffle: bool = False, verbose: bool = False, quiet: bool = False) -> dict: """Run full BFCL-style evaluation across all categories.""" # Build flat test list with category tags all_tests = [] for cat_name, tests in ALL_CATEGORIES.items(): for test in tests: test_copy = test.copy() test_copy["category"] = cat_name all_tests.append(test_copy) if shuffle: random.shuffle(all_tests) print(f"\n{'='*70}") print(f" BFCL-Style Evaluation — {MODEL}") print(f" {len(all_tests)} tests across {len(ALL_CATEGORIES)} categories") print(f" Shuffle: {'ON' if shuffle else 'OFF'}") print(f"{'='*70}\n") # Run all tests results = [] category_results = {cat: [] for cat in ALL_CATEGORIES} start_time = time.time() for i, test in enumerate(all_tests, 1): cat = test["category"] if verbose: print(f"[{i}/{len(all_tests)}] Category: {cat}") result = evaluate_test(test, verbose=verbose) result["category"] = cat results.append(result) category_results[cat].append(result) if not verbose and not quiet: status = "✅" if result["correct"] else "❌" print(f" {status} [{result['id']}] {result['expected']:>25s} → {result['actual']:<25s} {result['latency']:.1f}s", end="") if result["hallucinated"]: print(" 🚨 HALLUCINATED", end="") print() elif quiet and not result["correct"]: print(f" ❌ [{result['id']}] expected {result['expected']}, got {result['actual']}") elapsed = time.time() - start_time # Category scores (BFCL methodology: accuracy per category) category_scores = {} print(f"\n{'='*70}") print(f" CATEGORY BREAKDOWN") print(f"{'='*70}") for cat_name in ALL_CATEGORIES: cat_res = category_results[cat_name] if not cat_res: continue correct = sum(1 for r in cat_res if r["correct"]) total = len(cat_res) accuracy = correct / total * 100 category_scores[cat_name] = accuracy tool_correct = sum(1 for r in cat_res if r["tool_correct"]) params_correct = sum(1 for r in cat_res if r["params_correct"]) hallucinated = sum(1 for r in cat_res if r["hallucinated"]) print(f" {cat_name:25s} {correct}/{total} = {accuracy:6.1f}% " f"(tool:{tool_correct}/{total} params:{params_correct}/{total} " f"halluc:{hallucinated})") # Overall score (BFCL: unweighted average across categories) overall = sum(category_scores.values()) / len(category_scores) if category_scores else 0 total_correct = sum(1 for r in results if r["correct"]) total_halluc = sum(1 for r in results if r["hallucinated"]) avg_latency = sum(r["latency"] for r in results) / len(results) if results else 0 print(f"\n{'='*70}") print(f" OVERALL RESULTS") print(f"{'='*70}") print(f" Overall Accuracy (BFCL avg): {overall:.1f}%") print(f" Raw Accuracy: {total_correct}/{len(results)} = {total_correct/len(results)*100:.1f}%") print(f" Hallucinations: {total_halluc}") print(f" Avg Latency: {avg_latency:.1f}s") print(f" Total Time: {elapsed:.0f}s") print(f"{'='*70}\n") return { "overall_accuracy": overall, "raw_accuracy": total_correct / len(results) * 100 if results else 0, "total_correct": total_correct, "total_tests": len(results), "category_scores": category_scores, "hallucinations": total_halluc, "avg_latency": avg_latency, "elapsed": elapsed, "results": results, } def main(): import argparse parser = argparse.ArgumentParser(description="BFCL-Style evaluation for Prism models") parser.add_argument("--model", type=str, default=None, help="Ollama model name (default: prism-coder:7b)") parser.add_argument("--runs", type=int, default=1, help="Number of evaluation runs") parser.add_argument("--shuffle", action="store_true", help="Randomize test order each run") parser.add_argument("--verbose", action="store_true", help="Show detailed model outputs") parser.add_argument("--quiet", action="store_true", help="Only print category breakdown and overall results (suppress per-test lines)") parser.add_argument("--cleanup", action="store_true", help="Release MLX model from memory after eval completes") args = parser.parse_args() # Allow --model to override the global MODEL global MODEL if args.model: MODEL = args.model print(f"Using model: {MODEL}") all_run_results = [] for run_idx in range(args.runs): if args.runs > 1: print(f"\n{'#'*70}") print(f" RUN {run_idx + 1} / {args.runs}") print(f"{'#'*70}") result = run_evaluation(shuffle=args.shuffle, verbose=args.verbose, quiet=args.quiet) all_run_results.append(result) if args.runs > 1: # Multi-run summary overall_scores = [r["overall_accuracy"] for r in all_run_results] raw_scores = [r["raw_accuracy"] for r in all_run_results] raw_correct = [r["total_correct"] for r in all_run_results] total_tests = all_run_results[0]["total_tests"] total_halluc = [r["hallucinations"] for r in all_run_results] print(f"\n{'='*70}") print(f" MULTI-RUN SUMMARY ({args.runs} runs × {total_tests} tests)") print(f"{'='*70}") print(f" BFCL Overall Accuracy:") for i, s in enumerate(overall_scores): print(f" Run {i+1}: {s:.1f}%") print(f" Average: {statistics.mean(overall_scores):.1f}%") print(f" Median: {statistics.median(overall_scores):.1f}%") if len(overall_scores) > 1: print(f" StdDev: {statistics.stdev(overall_scores):.2f}%") print(f"\n Raw Scores: {' | '.join(f'{c}/{total_tests}' for c in raw_correct)}") print(f" Hallucinations: {' | '.join(str(h) for h in total_halluc)}") # Per-category consistency print(f"\n Per-Category Consistency:") categories = all_run_results[0]["category_scores"].keys() for cat in categories: scores = [r["category_scores"].get(cat, 0) for r in all_run_results] avg = statistics.mean(scores) consistent = all(s == scores[0] for s in scores) marker = "✅" if consistent and avg == 100 else "⚠️" if not consistent else "✅" print(f" {marker} {cat:25s} {' | '.join(f'{s:.0f}%' for s in scores)} → avg {avg:.1f}%") print(f"\n{'='*70}\n") # Exit code median_overall = statistics.median(overall_scores) if median_overall < 90: print("❌ FAIL: Median BFCL accuracy below 90%") sys.exit(1) elif median_overall < 95: print("⚠️ WARN: Median BFCL accuracy below 95%") sys.exit(0) else: print(f"✅ PASS: Median BFCL accuracy {median_overall:.1f}%") sys.exit(0) else: overall = all_run_results[0]["overall_accuracy"] if args.cleanup: cleanup_mlx_model() if overall < 90: sys.exit(1) sys.exit(0) if args.cleanup: cleanup_mlx_model() def cleanup_mlx_model(): global MLX_MODEL_CACHE, MLX_TOKENIZER_CACHE if MLX_MODEL_CACHE is not None: import gc try: import mlx.core as mx mx.clear_cache() except Exception: pass MLX_MODEL_CACHE = None MLX_TOKENIZER_CACHE = None gc.collect() print("🧹 MLX model released from memory") if __name__ == "__main__": main()