#!/usr/bin/env python3
"""
eval_300.py — 300-Case Standard Evaluation for prism-coder:4b-v43

Replaces bfcl_eval.py (64 tests) and swe_bench_test.py (68 tests) with a single
~300-case blind eval. Designed to be run 3 times for statistical stability checks.

All test cases are NOVEL — never seen in any training data.

Categories:
  natural_phrasing   (50) — casual/indirect phrasing that maps to a tool
  adversarial_trap   (70) — CS/programming questions that must NOT call a tool
  disambiguation     (40) — similar tools exist; must pick the correct one
  edge_case          (25) — minimal / ambiguous prompts
  multi_intent       (20) — multi-step prompts; score on first action only
  verifier           (25) — synthesize_edges / backfill_links / health_check patterns
  cascade            (25) — explicit first-step-of-chain patterns
  param_extraction   (25) — params in the prompt text; test correct extraction
  abstention         (20) — greetings / capability questions; must return NO_TOOL

Scoring:
  strict_pass  = correct tool + all required_params present  → 1.0 point
  partial_pass = correct tool + at least 1 required_param but not all → 0.5 point
  wrong_tool   = wrong tool name → 0 points
  false_pos    = tool called when NO_TOOL expected → 0 points
  false_neg    = NO_TOOL when tool expected → 0 points

Usage:
  python3 eval_300.py
  python3 eval_300.py --runs 3 --shuffle
  python3 eval_300.py --model prism-coder:4b-v43 --runs 3
  python3 eval_300.py --no-validate-layer3
"""

import json
import os
import re
import sys
import time
import random
import statistics
import urllib.request
import argparse

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

# Model tag named in the module docstring; presumably the default for the
# --model flag shown in the usage examples — confirm against the argparse setup.
MODEL = "prism-coder:4b-v43"

# Generate endpoint of an Ollama server on localhost (port 11434).
OLLAMA_API = "http://localhost:11434/api/generate"

# System prompt text. It is assembled from adjacent string literals; every
# literal except the last ends in a space so the sentences join cleanly.
# It names the 17 Prism Memory tools and 13 multimodal tool modules, and tells
# the model to abstain on general programming / greeting / capability prompts.
# NOTE(review): the sentence "Format tool calls inside ... JSON blocks" contains
# a literal "..." that looks like a lost delimiter/tag name — confirm the
# intended wording against the prompt the model was trained with.
SYSTEM_PROMPT = (
    "You are Synalux, a memory-augmented coding and clinical reasoning assistant. "
    "You have access to Prism Memory tools (session_save_ledger, session_load_context, "
    "session_search_memory, session_save_handoff, session_forget_memory, session_health_check, "
    "session_compact_ledger, session_export_memory, session_task_route, session_save_experience, "
    "session_synthesize_edges, session_backfill_links, knowledge_search, knowledge_forget, "
    "knowledge_upvote, knowledge_downvote, knowledge_set_retention) and 13 multimodal tool "
    "modules (image_gen, office, web_scraper, browser, tts, ocr, git, terminal, deps_scanner, "
    "hipaa, data_graph, templates, pdf_parser). "
    "Think step-by-step before answering. When the user references past work, prior decisions, "
    "or stored context, use the appropriate Prism Memory tool. "
    "Format tool calls inside ... JSON blocks with fields 'name' and 'arguments'. "
    "If no tool is needed, answer directly in plain text. "
    "ABSTAIN for general programming questions, CS concepts, greetings, and capability questions."
)

# The 17 Prism Memory tool names — the same names listed in SYSTEM_PROMPT.
# The multimodal module names from the prompt are deliberately not in this set.
# Presumably used to validate tool names parsed from model output — verify
# against the scoring code.
VALID_TOOLS = {
    "session_load_context", "session_save_ledger", "session_save_handoff",
    "session_search_memory", "session_forget_memory", "session_health_check",
    "session_compact_ledger", "session_export_memory", "session_task_route",
    "session_save_experience", "session_synthesize_edges", "session_backfill_links",
    "knowledge_search", "knowledge_forget", "knowledge_upvote", "knowledge_downvote",
    "knowledge_set_retention",
}

# ---------------------------------------------------------------------------
# Test Cases  (prompt, expected_tool_or_NO_TOOL, required_params, category)
# required_params: list of param keys that MUST appear in got_args
# ---------------------------------------------------------------------------

TESTS = [
    # ===========================================================================
    # CATEGORY 1: natural_phrasing (50 cases)
    # Casual / indirect user phrasing that maps to a specific Prism tool.
# =========================================================================== # --- session_load_context --- ("Alright, kick things off. Pull up whatever we had on the checkout-service project.", "session_load_context", ["project"], "natural_phrasing"), ("I'm back from lunch. Get me re-oriented on the prism-aac project.", "session_load_context", ["project"], "natural_phrasing"), ("Fresh session here. Reconstruct everything we built for the notifications project.", "session_load_context", ["project"], "natural_phrasing"), ("Starting a new chat. Bring up the full context for the mobile-app project.", "session_load_context", ["project"], "natural_phrasing"), ("Where did we leave off with the auth-service work?", "session_load_context", [], "natural_phrasing"), ("Get me up to speed on the reporting-dashboard project.", "session_load_context", ["project"], "natural_phrasing"), ("Resume from where we were on the data-pipeline project.", "session_load_context", ["project"], "natural_phrasing"), ("Catch me up — what was the state of the subscription-api project?", "session_load_context", ["project"], "natural_phrasing"), # --- session_save_ledger --- ("We wrapped up for today. Make a note that we completed the database indexing overhaul.", "session_save_ledger", [], "natural_phrasing"), ("Log what just happened: we refactored the payment module and all tests pass.", "session_save_ledger", [], "natural_phrasing"), ("Record this session — we finalized the API contract for the mobile team.", "session_save_ledger", [], "natural_phrasing"), ("Write down everything we did today before I close this tab.", "session_save_ledger", [], "natural_phrasing"), ("Jot down our progress: three endpoints migrated, two more to go.", "session_save_ledger", [], "natural_phrasing"), ("Before I head out, save a summary of what we accomplished this afternoon.", "session_save_ledger", [], "natural_phrasing"), # --- session_save_handoff --- ("I'm handing this over. 
Leave a note for whoever picks this up next on the billing-portal project.", "session_save_handoff", ["project"], "natural_phrasing"), ("Pass the baton on the logistics-api project. Save the handoff so the next person knows where we are.", "session_save_handoff", ["project"], "natural_phrasing"), ("Shift change. Store the current state for the embedded-firmware project so the next agent can continue.", "session_save_handoff", ["project"], "natural_phrasing"), ("Create a handoff note for the trading-platform project — we got through feature flagging, still need A/B routing.", "session_save_handoff", ["project"], "natural_phrasing"), # --- session_search_memory --- ("Remind me — did we ever pick a caching strategy for the CDN layer?", "session_search_memory", ["query"], "natural_phrasing"), ("Did we discuss anything about Kafka consumer lag in previous sessions?", "session_search_memory", ["query"], "natural_phrasing"), ("Go back through our history and find anything about the CI pipeline refactor.", "session_search_memory", ["query"], "natural_phrasing"), ("What did we decide about webhook retry logic in past conversations?", "session_search_memory", ["query"], "natural_phrasing"), ("Dig up anything we recorded about the multi-tenant database design.", "session_search_memory", ["query"], "natural_phrasing"), ("Pull up any notes we saved about the gRPC migration.", "session_search_memory", ["query"], "natural_phrasing"), # --- session_forget_memory --- ("That entry we saved about using SQLite in production is totally wrong. Remove it.", "session_forget_memory", ["memory_id"], "natural_phrasing"), ("Delete the memory with ID mem-zx91-ff. It's stale.", "session_forget_memory", ["memory_id"], "natural_phrasing"), ("Wipe the incorrect ledger note that said we shipped v2.1 — we didn't.", "session_forget_memory", ["memory_id"], "natural_phrasing"), # --- session_health_check --- ("Something feels off. 
Can you run diagnostics on the memory backend?", "session_health_check", [], "natural_phrasing"), ("Before I trust these search results, verify the memory system is healthy.", "session_health_check", [], "natural_phrasing"), ("Give the memory infrastructure a quick checkup.", "session_health_check", [], "natural_phrasing"), # --- session_compact_ledger --- ("The session history for the event-sourcing project is getting massive. Trim and archive the old entries.", "session_compact_ledger", ["project"], "natural_phrasing"), ("Compress the ledger for the recommendation-engine project — too much noise in there.", "session_compact_ledger", ["project"], "natural_phrasing"), ("Prune out the old session entries for the analytics-backend project.", "session_compact_ledger", ["project"], "natural_phrasing"), # --- session_export_memory --- ("Dump a full backup of my memory to /data/exports in JSON format.", "session_export_memory", ["output_path", "format"], "natural_phrasing"), ("Export everything to /tmp/prism-dump so I can archive it.", "session_export_memory", ["output_path"], "natural_phrasing"), ("I need an offline copy of all session data. 
Export to /backup/weekly.", "session_export_memory", ["output_path"], "natural_phrasing"), # --- session_task_route --- ("Should I tackle this Rust async runtime bug locally or send it to a bigger model?", "session_task_route", ["task_description"], "natural_phrasing"), ("Is this image classification fine-tuning job something the local agent can handle?", "session_task_route", ["task_description"], "natural_phrasing"), ("Route this task: refactor the monorepo build system to support incremental compilation.", "session_task_route", ["task_description"], "natural_phrasing"), # --- session_save_experience --- ("Log a milestone: we successfully zero-downtime-deployed the new search index.", "session_save_experience", [], "natural_phrasing"), ("Record that we fixed the race condition in the WebSocket handler — took 4 hours but it's solid now.", "session_save_experience", [], "natural_phrasing"), # --- knowledge_search --- ("Any institutional knowledge on how we handle circuit breakers?", "knowledge_search", ["query"], "natural_phrasing"), ("What does our knowledge base say about rate limiting strategies?", "knowledge_search", ["query"], "natural_phrasing"), ("Look up anything curated about CQRS patterns.", "knowledge_search", ["query"], "natural_phrasing"), ("Check our documented knowledge for anything on event-driven architecture.", "knowledge_search", ["query"], "natural_phrasing"), # --- knowledge_upvote / downvote --- ("That knowledge entry about using Redis for distributed locks was really helpful. Give it a thumbs up.", "knowledge_upvote", [], "natural_phrasing"), ("Boost the ranking on our GraphQL federation notes — they're gold.", "knowledge_upvote", [], "natural_phrasing"), ("That doc about using polling instead of webhooks is outdated and wrong. 
Lower its score.", "knowledge_downvote", [], "natural_phrasing"), ("Downvote the entry about using bcrypt at cost 4 — it's dangerously insecure.", "knowledge_downvote", [], "natural_phrasing"), # --- knowledge_set_retention --- ("Set a 45-day retention policy on the alpha-testing project's knowledge.", "knowledge_set_retention", ["project"], "natural_phrasing"), # =========================================================================== # CATEGORY 2: adversarial_trap (70 cases) # CS / programming questions — must return NO_TOOL even when keywords match. # =========================================================================== # Python ("Write a Python function that implements a trie for fast prefix searches.", "NO_TOOL", [], "adversarial_trap"), ("How do I use Python's contextlib.contextmanager decorator?", "NO_TOOL", [], "adversarial_trap"), ("Explain Python's __slots__ and when to use it for memory optimization.", "NO_TOOL", [], "adversarial_trap"), ("What is the difference between deepcopy and shallow copy in Python?", "NO_TOOL", [], "adversarial_trap"), ("How does Python's asyncio event loop schedule coroutines?", "NO_TOOL", [], "adversarial_trap"), ("Write a Python generator that yields prime numbers indefinitely.", "NO_TOOL", [], "adversarial_trap"), ("How do I profile memory usage in a Python application?", "NO_TOOL", [], "adversarial_trap"), # JavaScript / TypeScript ("How do I debounce a function in JavaScript without lodash?", "NO_TOOL", [], "adversarial_trap"), ("Explain the JavaScript event loop and microtask queue.", "NO_TOOL", [], "adversarial_trap"), ("How does TypeScript's discriminated union type work?", "NO_TOOL", [], "adversarial_trap"), ("Write a TypeScript generic function that deep-merges two objects.", "NO_TOOL", [], "adversarial_trap"), ("What is the difference between a WeakMap and a Map in JavaScript?", "NO_TOOL", [], "adversarial_trap"), ("How do I implement a promise-based queue in Node.js?", "NO_TOOL", [], "adversarial_trap"), # 
Go ("How does Go's goroutine scheduler work with M:N threading?", "NO_TOOL", [], "adversarial_trap"), ("Explain Go's garbage collector and write barriers.", "NO_TOOL", [], "adversarial_trap"), ("Write a concurrent rate limiter in Go using channels.", "NO_TOOL", [], "adversarial_trap"), ("How do I implement context cancellation in a Go HTTP server?", "NO_TOOL", [], "adversarial_trap"), # Rust ("Explain Rust's borrow checker and why it prevents data races.", "NO_TOOL", [], "adversarial_trap"), ("How do Arc and Mutex work together in Rust for thread-safe state sharing?", "NO_TOOL", [], "adversarial_trap"), ("What is Rust's Pin and why is it needed for async futures?", "NO_TOOL", [], "adversarial_trap"), ("Write a Rust trait that implements a retry strategy with exponential backoff.", "NO_TOOL", [], "adversarial_trap"), # SQL / NoSQL ("Write a SQL query that finds the second-highest salary in an employees table.", "NO_TOOL", [], "adversarial_trap"), ("How do I use window functions in PostgreSQL to compute a running total?", "NO_TOOL", [], "adversarial_trap"), ("What is a covering index and when should I use one in MySQL?", "NO_TOOL", [], "adversarial_trap"), ("Explain eventual consistency in DynamoDB and how to work around it.", "NO_TOOL", [], "adversarial_trap"), ("How do I export data from MongoDB to a JSON file using mongoexport?", "NO_TOOL", [], "adversarial_trap"), ("What is a materialized view in PostgreSQL and how does it differ from a regular view?", "NO_TOOL", [], "adversarial_trap"), # Algorithms / Data Structures ("Explain Dijkstra's algorithm and its time complexity.", "NO_TOOL", [], "adversarial_trap"), ("Write a depth-first search implementation for a graph adjacency list.", "NO_TOOL", [], "adversarial_trap"), ("How does consistent hashing help with horizontal scaling?", "NO_TOOL", [], "adversarial_trap"), ("Explain the difference between a B-tree and a B+ tree.", "NO_TOOL", [], "adversarial_trap"), ("What is the time and space complexity of merge sort?", 
"NO_TOOL", [], "adversarial_trap"), ("Implement a LRU cache in Python using OrderedDict.", "NO_TOOL", [], "adversarial_trap"), ("How does a bloom filter work and what are its false positive trade-offs?", "NO_TOOL", [], "adversarial_trap"), # Frameworks / Config ("How do I configure Django's ORM to use read replicas?", "NO_TOOL", [], "adversarial_trap"), ("Explain Flask's application context vs. request context.", "NO_TOOL", [], "adversarial_trap"), ("How does FastAPI's dependency injection system work?", "NO_TOOL", [], "adversarial_trap"), ("Write a middleware in Express.js that logs request durations.", "NO_TOOL", [], "adversarial_trap"), ("How do I set up hot-module replacement in a Vite + React project?", "NO_TOOL", [], "adversarial_trap"), ("What is the difference between server components and client components in Next.js 14?", "NO_TOOL", [], "adversarial_trap"), # DevOps / Infrastructure ("Write a Dockerfile for a Python FastAPI app with multi-stage builds.", "NO_TOOL", [], "adversarial_trap"), ("How do I configure a Kubernetes HorizontalPodAutoscaler based on custom metrics?", "NO_TOOL", [], "adversarial_trap"), ("What is the difference between rolling and blue-green deployments?", "NO_TOOL", [], "adversarial_trap"), ("How do I set up Prometheus scraping for a Node.js service?", "NO_TOOL", [], "adversarial_trap"), ("Explain how etcd achieves consensus using the Raft algorithm.", "NO_TOOL", [], "adversarial_trap"), ("Write a GitHub Actions workflow that runs tests on every pull request.", "NO_TOOL", [], "adversarial_trap"), # Memory management (trap on 'memory' keyword) ("How does virtual memory paging work in Linux?", "NO_TOOL", [], "adversarial_trap"), ("What is memory-mapped I/O and how does mmap work in C?", "NO_TOOL", [], "adversarial_trap"), ("Explain stack vs. 
heap memory allocation and when each is appropriate.", "NO_TOOL", [], "adversarial_trap"), ("How does the V8 engine's garbage collector use generational collection?", "NO_TOOL", [], "adversarial_trap"), # Session handling (trap on 'session' keyword) ("How does PHP's session_start() work under the hood?", "NO_TOOL", [], "adversarial_trap"), ("Implement session fixation protection in a Flask application.", "NO_TOOL", [], "adversarial_trap"), ("What is the difference between sticky sessions and session replication?", "NO_TOOL", [], "adversarial_trap"), ("How do I store JWT tokens in a secure, httpOnly cookie in Express?", "NO_TOOL", [], "adversarial_trap"), # Search (trap on 'search' keyword) ("How do I implement fuzzy search with trigrams in PostgreSQL?", "NO_TOOL", [], "adversarial_trap"), ("Explain TF-IDF and how it ranks documents in full-text search.", "NO_TOOL", [], "adversarial_trap"), ("Write a binary search implementation in Rust.", "NO_TOOL", [], "adversarial_trap"), ("Compare Elasticsearch and OpenSearch for log aggregation.", "NO_TOOL", [], "adversarial_trap"), # Graph theory (trap on 'graph' + 'edges' keywords) ("Explain the difference between Prim's and Kruskal's spanning tree algorithms.", "NO_TOOL", [], "adversarial_trap"), ("How do topological sorts work on directed acyclic graphs?", "NO_TOOL", [], "adversarial_trap"), ("Write a function to detect cycles in a directed graph using DFS.", "NO_TOOL", [], "adversarial_trap"), # Load balancing (trap on 'load' keyword) ("What are the differences between round-robin, least-connections, and IP-hash load balancing?", "NO_TOOL", [], "adversarial_trap"), ("How does Nginx upstream load balancing handle health check failures?", "NO_TOOL", [], "adversarial_trap"), # Logging / monitoring ("How do I implement structured logging in a Go service with zerolog?", "NO_TOOL", [], "adversarial_trap"), ("Explain the ELK stack and how logs flow from Beats to Kibana.", "NO_TOOL", [], "adversarial_trap"), ("What is 
OpenTelemetry and how does distributed tracing work?", "NO_TOOL", [], "adversarial_trap"), # Misc CS concepts ("What is the difference between optimistic and pessimistic locking in databases?", "NO_TOOL", [], "adversarial_trap"), ("Explain how CRDTs achieve conflict-free distributed state.", "NO_TOOL", [], "adversarial_trap"), ("What is a saga pattern in distributed systems?", "NO_TOOL", [], "adversarial_trap"), ("How does the forget gate in an LSTM neural network control memory?", "NO_TOOL", [], "adversarial_trap"), # =========================================================================== # CATEGORY 3: disambiguation (40 cases) # Similar tools — model must pick the correct one. # =========================================================================== # session_search_memory vs knowledge_search ("Find anything we discussed last month about the API versioning decision.", "session_search_memory", ["query"], "disambiguation"), ("What do our curated knowledge items say about dependency injection patterns?", "knowledge_search", ["query"], "disambiguation"), ("Search our accumulated documentation for information on database sharding.", "knowledge_search", ["query"], "disambiguation"), ("Look through recent session notes for anything about the CDN cache invalidation bug.", "session_search_memory", ["query"], "disambiguation"), ("Any past conversations where we discussed microservice mesh configurations?", "session_search_memory", ["query"], "disambiguation"), ("Check the knowledge base for anything on event sourcing trade-offs.", "knowledge_search", ["query"], "disambiguation"), # session_forget_memory vs knowledge_forget ("Remove the specific session memory with ID mem-qq77-rr. 
It's incorrect.", "session_forget_memory", ["memory_id"], "disambiguation"), ("Clear all the outdated knowledge entries in the staging project.", "knowledge_forget", ["project"], "disambiguation"), ("Wipe out old debugging records from the search-service project's knowledge base.", "knowledge_forget", ["project"], "disambiguation"), ("Delete the memory entry for ID mem-ab99-cd — we noted the wrong schema version.", "session_forget_memory", ["memory_id"], "disambiguation"), ("Remove all knowledge items in the deprecated-feature category from the portal project.", "knowledge_forget", ["project"], "disambiguation"), # session_save_ledger vs session_save_experience vs session_save_handoff ("Log what we did today: migrated the billing module to the new event bus.", "session_save_ledger", [], "disambiguation"), ("Record a milestone: we successfully launched the new onboarding flow in production.", "session_save_experience", [], "disambiguation"), ("Hand off this session — save the state for the next agent on the gateway project.", "session_save_handoff", ["project"], "disambiguation"), ("Write down that we rewrote the payment reconciliation logic today.", "session_save_ledger", [], "disambiguation"), ("Mark a success: we fixed the notorious N+1 query on the orders endpoint.", "session_save_experience", [], "disambiguation"), ("The contractor is taking over tonight. Save the handoff for the migration-tools project.", "session_save_handoff", ["project"], "disambiguation"), # knowledge_upvote vs knowledge_downvote ("That knowledge entry about immutable infrastructure is spot on. Upvote it.", "knowledge_upvote", [], "disambiguation"), ("The doc recommending XML over JSON for internal APIs is terrible. 
Mark it down.", "knowledge_downvote", [], "disambiguation"), ("Increase the importance score of the circuit-breaker patterns entry.", "knowledge_upvote", [], "disambiguation"), ("Reduce the rank of that outdated note about using MD5 for hashing.", "knowledge_downvote", [], "disambiguation"), # session_compact_ledger vs session_export_memory ("The billing-service ledger is bloated. Compress and archive the old entries.", "session_compact_ledger", ["project"], "disambiguation"), ("Export a full offline snapshot of my memory to /archive/snapshot in JSON.", "session_export_memory", ["output_path", "format"], "disambiguation"), ("Trim down the session history for the firmware project — it's too long.", "session_compact_ledger", ["project"], "disambiguation"), ("Save everything to disk — dump all session data to /tmp/export-all.", "session_export_memory", ["output_path"], "disambiguation"), # session_synthesize_edges vs session_backfill_links vs session_health_check ("Verify the session graph edges are all consistent for the trading-platform project.", "session_synthesize_edges", ["project"], "disambiguation"), ("Reconnect the dangling session references for the ml-pipeline project.", "session_backfill_links", ["project"], "disambiguation"), ("Run a full health diagnostic on the Prism memory backend.", "session_health_check", [], "disambiguation"), ("Patch up missing cross-session links for the user-service project.", "session_backfill_links", ["project"], "disambiguation"), ("Make sure all edges are synthesized and up to date for the invoicing project.", "session_synthesize_edges", ["project"], "disambiguation"), ("Is the memory system responding normally? 
Do a quick health check.", "session_health_check", [], "disambiguation"), # session_load_context vs session_search_memory ("Bring me back into the context of the payments-gateway project.", "session_load_context", ["project"], "disambiguation"), ("Look for any notes we made about the GraphQL schema decisions.", "session_search_memory", ["query"], "disambiguation"), ("Restore the full session state for the devops-automation project.", "session_load_context", ["project"], "disambiguation"), ("Search our history for any discussion about OAuth2 vs API keys.", "session_search_memory", ["query"], "disambiguation"), # session_task_route vs session_load_context ("Should the local model handle this React performance optimization or route it to the cloud?", "session_task_route", ["task_description"], "disambiguation"), ("Initialize context for the infrastructure-as-code project — I'm starting fresh.", "session_load_context", ["project"], "disambiguation"), # knowledge_set_retention vs knowledge_forget ("Set the knowledge for the beta-program project to expire after 90 days.", "knowledge_set_retention", ["project"], "disambiguation"), ("Delete all knowledge in the archived-2025 project — we don't need it anymore.", "knowledge_forget", ["project"], "disambiguation"), ("Auto-expire the knowledge entries in the sandbox project after 14 days.", "knowledge_set_retention", ["project"], "disambiguation"), # =========================================================================== # CATEGORY 4: edge_case (25 cases) # Minimal, single-word, ambiguous, or unusual prompts. 
# =========================================================================== ("Load context.", "session_load_context", [], "edge_case"), ("Save.", "session_save_ledger", [], "edge_case"), ("Search.", "session_search_memory", [], "edge_case"), ("Check health.", "session_health_check", [], "edge_case"), ("Export.", "session_export_memory", [], "edge_case"), ("Compact.", "session_compact_ledger", [], "edge_case"), ("Handoff.", "session_save_handoff", [], "edge_case"), ("Route this.", "session_task_route", [], "edge_case"), ("Synthesize edges.", "session_synthesize_edges", [], "edge_case"), ("Backfill links.", "session_backfill_links", [], "edge_case"), ("Forget it.", "session_forget_memory", [], "edge_case"), ("Knowledge search.", "knowledge_search", [], "edge_case"), # Abstention edge cases ("Hello!", "NO_TOOL", [], "edge_case"), ("What can you do?", "NO_TOOL", [], "edge_case"), ("Tell me about yourself.", "NO_TOOL", [], "edge_case"), ("Thanks, we're done.", "NO_TOOL", [], "edge_case"), ("OK great.", "NO_TOOL", [], "edge_case"), ("Bye!", "NO_TOOL", [], "edge_case"), # Ambiguous short prompts that still require the right tool ("Run diagnostics.", "session_health_check", [], "edge_case"), ("Save the handoff.", "session_save_handoff", [], "edge_case"), ("Log this session.", "session_save_ledger", [], "edge_case"), ("Search memory.", "session_search_memory", [], "edge_case"), ("Knowledge base lookup.", "knowledge_search", [], "edge_case"), ("Archive old entries.", "session_compact_ledger", [], "edge_case"), ("Save experience.", "session_save_experience", [], "edge_case"), # =========================================================================== # CATEGORY 5: multi_intent (20 cases) # Multi-step prompts — score only the FIRST action. 
# =========================================================================== ("Load the context for the pipeline project, then search for any past notes on streaming.", "session_load_context", ["project"], "multi_intent"), ("Search our memory for anything about the OAuth migration, then save a handoff.", "session_search_memory", ["query"], "multi_intent"), ("Check memory health, and if it's all good, compact the fraud-detection ledger.", "session_health_check", [], "multi_intent"), ("Find notes about the ML model rollout, and then log that we finished the A/B test today.", "session_search_memory", ["query"], "multi_intent"), ("Load the prism-mcp context, then check if there are any open issues about rate limiting.", "session_load_context", ["project"], "multi_intent"), ("Export everything to /tmp/backup, then set a 60-day retention policy on it.", "session_export_memory", ["output_path"], "multi_intent"), ("Save what we did today: shipped the new notification system. Then create a handoff note.", "session_save_ledger", [], "multi_intent"), ("Search for what we decided about the queue architecture, then upvote the best result.", "session_search_memory", ["query"], "multi_intent"), ("Run a health check on the memory system, then compact the ledger if there are issues.", "session_health_check", [], "multi_intent"), ("Look up our knowledge on service mesh patterns, and then downvote the outdated ones.", "knowledge_search", ["query"], "multi_intent"), ("Compact the session history for the payments project, then synthesize the session edges.", "session_compact_ledger", ["project"], "multi_intent"), ("Load context for the billing-v2 project, and record our progress: we fixed the invoice date bug.", "session_load_context", ["project"], "multi_intent"), ("Search our knowledge base for event-driven design patterns, then save a handoff with the findings.", "knowledge_search", ["query"], "multi_intent"), ("Backfill the cross-session links for the ios-app project, then 
synthesize edges.", "session_backfill_links", ["project"], "multi_intent"), ("Route this task: full rewrite of the logging subsystem. If cloud, just tell me.", "session_task_route", ["task_description"], "multi_intent"), ("Export memory to /var/backup, and then purge the old knowledge entries from the legacy project.", "session_export_memory", ["output_path"], "multi_intent"), ("Find what we discussed about caching strategies, then set a 30-day retention on that knowledge.", "session_search_memory", ["query"], "multi_intent"), ("Record a success milestone: zero-downtime deploy of version 4.2. Then compact the ledger.", "session_save_experience", [], "multi_intent"), ("Load the fraud-detection project context and then synthesize all session edges.", "session_load_context", ["project"], "multi_intent"), ("Save what we accomplished: rewrote the ingestion pipeline. Then hand it off to the ops team.", "session_save_ledger", [], "multi_intent"), # =========================================================================== # CATEGORY 6: verifier (25 cases) # session_synthesize_edges / session_backfill_links / session_health_check patterns. 
# =========================================================================== # session_synthesize_edges ("Make sure all session graph edges are consistent for the auth-gateway project.", "session_synthesize_edges", ["project"], "verifier"), ("Run a synthesis pass to validate all edges are up to date for the orchestration project.", "session_synthesize_edges", ["project"], "verifier"), ("Verify graph integrity — synthesize edges for the content-delivery project.", "session_synthesize_edges", ["project"], "verifier"), ("Before closing out, check that all session links are consistent for the scheduling project.", "session_synthesize_edges", ["project"], "verifier"), ("Ensure all session relationships are properly synthesized for the warehouse-api project.", "session_synthesize_edges", ["project"], "verifier"), ("Run edge synthesis on the real-time-alerts project to validate the session graph.", "session_synthesize_edges", ["project"], "verifier"), ("Validate that all edges in the session graph are consistent for the pricing-engine project.", "session_synthesize_edges", ["project"], "verifier"), ("Confirm session link consistency for the document-processing project.", "session_synthesize_edges", ["project"], "verifier"), # session_backfill_links ("There are broken cross-session links in the search-backend project. 
Backfill them.", "session_backfill_links", ["project"], "verifier"), ("Reconnect all dangling references in the identity-service project history.", "session_backfill_links", ["project"], "verifier"), ("Patch the missing links between sessions for the payments-v3 project.", "session_backfill_links", ["project"], "verifier"), ("Fix the link gaps in our session history for the recommendation-service project.", "session_backfill_links", ["project"], "verifier"), ("Backfill any missing cross-session connections for the notification-hub project.", "session_backfill_links", ["project"], "verifier"), ("Reconnect broken session references in the compliance-tracker project.", "session_backfill_links", ["project"], "verifier"), ("Repair missing session links for the api-gateway project.", "session_backfill_links", ["project"], "verifier"), # session_health_check ("Before I start a new sprint, confirm the memory system is operating correctly.", "session_health_check", [], "verifier"), ("The search results seem incomplete. Check if the memory backend is healthy.", "session_health_check", [], "verifier"), ("I'm seeing weird behavior in session recall. Run a diagnostic check.", "session_health_check", [], "verifier"), ("Ping the memory system and confirm it's all healthy.", "session_health_check", [], "verifier"), ("Is the Prism memory backend operating within normal parameters?", "session_health_check", [], "verifier"), ("Double-check the memory infrastructure health before I rely on these results.", "session_health_check", [], "verifier"), ("Verify the memory system is functioning before we start the long session.", "session_health_check", [], "verifier"), ("Run a full health check and report back on the memory backend status.", "session_health_check", [], "verifier"), ("Something is off with memory recall. 
Diagnose the backend.", "session_health_check", [], "verifier"), ("Confirm the session memory system is healthy before I save this handoff.", "session_health_check", [], "verifier"), # =========================================================================== # CATEGORY 7: cascade (25 cases) # Explicit first-step-of-chain patterns — model must pick the right FIRST tool. # =========================================================================== ("Search our knowledge for gRPC patterns, then upvote the most relevant entry.", "knowledge_search", ["query"], "cascade"), ("Load the indexing-service context, then search for any past notes on shard rebalancing.", "session_load_context", ["project"], "cascade"), ("Check memory health, then compact the alerts project ledger if there are stale entries.", "session_health_check", [], "cascade"), ("Export all memory to /tmp/archive, then set a 180-day retention policy on the archive project.", "session_export_memory", ["output_path"], "cascade"), ("Search for what we decided about the event schema design, then save a handoff about it.", "session_search_memory", ["query"], "cascade"), ("Save today's session notes for the pipeline project, then create a handoff for the next agent.", "session_save_ledger", [], "cascade"), ("Should the local model handle this concurrency refactor? 
If cloud, stop there.", "session_task_route", ["task_description"], "cascade"), ("Search knowledge for CQRS trade-offs, downvote anything recommending a single store.", "knowledge_search", ["query"], "cascade"), ("Compact the ledger for the embeddings project, then synthesize the session edges.", "session_compact_ledger", ["project"], "cascade"), ("Load the feature-flags project context, then log that we shipped the A/B framework.", "session_load_context", ["project"], "cascade"), ("Run a health check first, then based on results decide whether to compact or export.", "session_health_check", [], "cascade"), ("Search memory for past decisions about SSE vs WebSockets, then record what we found.", "session_search_memory", ["query"], "cascade"), ("Backfill the missing links for the analytics project, then synthesize the edges.", "session_backfill_links", ["project"], "cascade"), ("Load context for the tenant-management project, then search for any open migration tickets.", "session_load_context", ["project"], "cascade"), ("Find what we know about zero-copy networking, then save a handoff with that context.", "session_search_memory", ["query"], "cascade"), ("Export to /backups/weekly, then compact the media-processing ledger.", "session_export_memory", ["output_path"], "cascade"), ("Search our knowledge base for Kubernetes resource quotas, then set a 60-day retention.", "knowledge_search", ["query"], "cascade"), ("Save the experience: we eliminated 80% of unnecessary re-renders. 
Then route the next task.", "session_save_experience", [], "cascade"), ("Synthesize edges for the audit-log project, then backfill any missing links.", "session_synthesize_edges", ["project"], "cascade"), ("Load the risk-assessment project context and then search memory for past risk audit notes.", "session_load_context", ["project"], "cascade"), ("Find our notes on the transaction saga pattern, then upvote the best entry.", "session_search_memory", ["query"], "cascade"), ("Compact the metrics project ledger, then export it to /tmp/metrics-backup.", "session_compact_ledger", ["project"], "cascade"), ("Route this task: implement distributed tracing with OpenTelemetry across five services.", "session_task_route", ["task_description"], "cascade"), ("Save what we accomplished: added RBAC support to the admin API. Then synthesize edges.", "session_save_ledger", [], "cascade"), ("Search knowledge for eventual consistency patterns, then forget the entries about using global locks.", "knowledge_search", ["query"], "cascade"), # =========================================================================== # CATEGORY 8: param_extraction (25 cases) # Params ARE mentioned in the prompt — test that model extracts them correctly. # =========================================================================== ("Load the full context for the fraud-detection project at a deep level.", "session_load_context", ["project"], "param_extraction"), ("Compact the session ledger for the user-identity project.", "session_compact_ledger", ["project"], "param_extraction"), ("Save a handoff note for the supplier-portal project.", "session_save_handoff", ["project"], "param_extraction"), ("Delete the memory entry with ID mem-fg33-hh. 
It has the wrong branch name.", "session_forget_memory", ["memory_id"], "param_extraction"), ("Export all memory data to /exports/2026-q2 in JSON format.", "session_export_memory", ["output_path", "format"], "param_extraction"), ("Set the retention policy for the experiment-runner project to 45 days.", "knowledge_set_retention", ["project"], "param_extraction"), ("Search session memory for 'distributed tracing setup'.", "session_search_memory", ["query"], "param_extraction"), ("Search the knowledge base for 'idempotency keys in payment APIs'.", "knowledge_search", ["query"], "param_extraction"), ("Backfill the cross-session links for the warehouse-inventory project.", "session_backfill_links", ["project"], "param_extraction"), ("Synthesize session edges for the logistics-optimizer project.", "session_synthesize_edges", ["project"], "param_extraction"), ("Forget the knowledge entry with ID ki-cc44-gg — that approach is deprecated.", "knowledge_forget", [], "param_extraction"), ("Upvote the knowledge entry with ID ki-tt55-rr. 
Really solid documentation.", "knowledge_upvote", [], "param_extraction"), ("Downvote knowledge entry ki-uu99-qq — it recommends a vulnerable library.", "knowledge_downvote", [], "param_extraction"), ("Configure an 80-day retention policy for the beta-features project's knowledge.", "knowledge_set_retention", ["project"], "param_extraction"), ("Load context for the platform-core project.", "session_load_context", ["project"], "param_extraction"), ("Export the archive to /data/long-term-backup in markdown format.", "session_export_memory", ["output_path", "format"], "param_extraction"), ("Search for 'zero-downtime database migrations' in our session history.", "session_search_memory", ["query"], "param_extraction"), ("Search knowledge for 'CQRS vs event sourcing trade-offs'.", "knowledge_search", ["query"], "param_extraction"), ("Compact the ledger for the monitoring-stack project.", "session_compact_ledger", ["project"], "param_extraction"), ("Delete memory entry mem-pp12-ss — wrong model version was recorded.", "session_forget_memory", ["memory_id"], "param_extraction"), ("Save a handoff for the checkout-v4 project.", "session_save_handoff", ["project"], "param_extraction"), ("Route this task: rewrite the message broker integration to use NATS instead of RabbitMQ.", "session_task_route", ["task_description"], "param_extraction"), ("Synthesize edges for the ingestion-pipeline project.", "session_synthesize_edges", ["project"], "param_extraction"), ("Backfill the missing session links in the content-catalog project.", "session_backfill_links", ["project"], "param_extraction"), ("Set 120-day retention on the compliance-logs project's knowledge.", "knowledge_set_retention", ["project"], "param_extraction"), # =========================================================================== # CATEGORY 9: abstention (20 cases) # Greetings, capability questions, general CS — must return NO_TOOL. 
# ---------------------------------------------------------------------------
# Sanity check: enforce exactly 300 cases and correct counts per category
# ---------------------------------------------------------------------------
# Number of test cases each category is expected to contribute to TESTS.
_TARGET_COUNTS = {
    "natural_phrasing": 50,
    "adversarial_trap": 70,
    "disambiguation": 40,
    "edge_case": 25,
    "multi_intent": 20,
    "verifier": 25,
    "cascade": 25,
    "param_extraction": 25,
    "abstention": 20,
}
# Expected overall size of TESTS.
_TOTAL_TARGET = 300


def _verify_test_counts():
    """Check TESTS against the per-category and overall target counts.

    Every mismatch is printed under a single WARNING header. The category
    of a test case is read from index 3 of its tuple.

    Returns:
        True when every category count and the total match their targets,
        False otherwise.
    """
    from collections import Counter

    per_category = Counter(case[3] for case in TESTS)

    # One message per category whose actual count differs from its target.
    problems = [
        f" {name}: expected {wanted}, got {per_category.get(name, 0)}"
        for name, wanted in _TARGET_COUNTS.items()
        if per_category.get(name, 0) != wanted
    ]

    total = len(TESTS)
    if total != _TOTAL_TARGET:
        problems.append(f" TOTAL: expected {_TOTAL_TARGET}, got {total}")

    if not problems:
        return True

    print("WARNING: test count mismatches:")
    for message in problems:
        print(message)
    return False
# Regexes (matched against the lower-cased prompt) that mark a prompt as a
# general programming / CS / general-knowledge question. When one of these
# matches and no PRISM_INTENT_PATTERNS entry does, a tool call is rejected.
GENERAL_PROGRAMMING_PATTERNS = [
    # Python context managers
    r'\bcontext\s+manager\b', r'\bcontextlib\b', r'\b__enter__\b', r'\b__exit__\b',
    r'\basync\s+context\s+manager\b',
    # ML / LSTM forget gates
    r'\bforget\s+gate\b', r'\blstm\b', r'\bcatastrophic\s+forgetting\b',
    r'\bforget\s+bias\b', r'\belastic\s+weight\s+consolidation\b',
    # Web framework sessions
    r'\bexpress\.js\b', r'\bdjango\b', r'\bflask\b', r'\bfastapi\b',
    r'\bsession_start\(\)', r'\bsession\s+middleware\b', r'\bsession\s+affinity\b',
    # General CS
    r'\bgarbage\s+collection\b', r'\bgc\s+algorithm\b',
    r'\bmemory\s+management\s+in\s+rust\b',
    r'\bload\s+balanc', r'\bnginx\b', r'\bhaproxy\b',
    r'\bcontext\s+switch',
    r'\bsearch\s+algorithm\b',
    r'\bsearch\s+functionality\s+with\s+elasticsearch\b',
    r'\bhealth\s+check\s+endpoint\s+pattern\b',
    r'\belasticsearch\b', r'\bsolr\b', r'\blucene\b',
    r'\bretention\s+polic(?:y|ies)\s+(?:in|for|with)\s+(?:kafka|s3|aws|gcp|azure|cloud)',
    r'\bpostgresql\b.*\bmongodb\b', r'\bmongodb\b.*\bpostgresql\b',
    r'\bwrite\s+a\s+decorator\b', r'\bdecorator.*retries?\b',
    r'\bci/cd\b', r'\bgithub\s+actions\b',
    r'\bcors\b.*\bnode\.js\b', r'\bnode\.js\b.*\bcors\b',
    r'\bcap\s+theorem\b',
    r'\bbinary\s+search\s+tree\b',
    r'\bvirtual\s+dom\b', r'\breact\b.*\breconciliation\b',
    r'\bdependency\s+injection\b',
    r'\btcp\b.*\budp\b', r'\budp\b.*\btcp\b',
    r'\btime\s+complexity\b', r'\bquicksort\b',
    r'\bexponential\s+backoff\b', r'\bjitter\b.*\bretri', r'\bapi\s+retri',
    r'\bcelery\b.*\bqueue', r'\broute\s+tasks?\s+in\s+celery\b',
    r'\bknowledge\s+graph\b.*\b(?:function|search|algorithm|traversal)\b',
    r'\b(?:function|write\s+a\s+function|implement)\b.*\bknowledge\s+graph\b',
    r'\bsave\s+(?:user\s+)?preferences?\s+in\s+(?:react|redux|localstorage|a\s+database)\b',
    r'\bexport\s+(?:data\s+)?from\s+(?:postgresql|mysql|sqlite|a\s+database)\b',
    r'\bpostgresql\b.*\bcsv\b', r'\bcsv\b.*\bpostgresql\b',
    # Additional patterns from bfcl_eval.py
    r'\bgoroutine\b', r'\bwrite\s+barrier\b',
    r'\brust\b.*\bborrow\b', r'\barc\b.*\bmutex\b', r'\bpin\b.*\bfuture\b',
    r'\bwindow\s+function\b',
    r'\bmongodb\b', r'\bmongoexport\b',
    r'\bdijkstra\b', r'\bdepth.first\s+search\b',
    r'\bconsistent\s+hashing\b', r'\bb.tree\b', r'\bbloom\s+filter\b',
    r'\blru\s+cache\b',
    # Was r'\bordereddic\b': the trailing \b fell between "c" and "t", so the
    # pattern could never match "ordereddict".
    r'\bordereddict\b',
    r'\bhorizontalpodautoscal', r'\bprometheus\b', r'\betcd\b', r'\braft\b',
    r'\bzerolog\b', r'\belk\s+stack\b', r'\bopentelemetry\b',
    r'\bcrdt\b', r'\bsaga\s+pattern\b',
    r'\btrie\b', r'\bweakmap\b', r'\bpromise.based\s+queue\b',
    r'\bcovering\s+index\b', r'\bmaterialized\s+view\b',
    r'\btf-idf\b', r'\btrigram\b', r'\bfuzzy\s+search\b',
    r'\btopological\s+sort\b', r'\bcycle\s+detection\b',
    r'\bprim.s\b', r'\bkruskal.s\b', r'\bspanning\s+tree\b',
    r'\bhot.module\s+replacement\b', r'\bvite\b',
    r'\bserver\s+component\b', r'\bclient\s+component\b',
    r'\bdocker(?:file)?\b', r'\bblue.green\s+deploy', r'\brolling\s+deploy',
    r'\bsticky\s+session\b', r'\bsession\s+replication\b', r'\bsession\s+fixation\b',
    r'\bjwt\b.*\bhttponly\b',
    r'\bpaging\b.*\bmemory\b', r'\bmmap\b', r'\bstack\s+vs\s+heap\b',
    r'\bv8\s+engine\b', r'\bgenerational\s+collection\b',
    r'\boptimistic\s+lock', r'\bpessimistic\s+lock',
    r'\beventual\s+consistency\b.*\bdynamo',
    # General knowledge / weather / math
    r"what'?s\s+the\s+weather\b", r'\bforecast\b.*\btoday\b',
    r'\bwrite\s+a\s+sql\s+query\b', r'\bsecond.highest\s+salary\b',
    r'\bsql\s+query\s+(?:that|to)\b',
]

# Regexes (matched against the lower-cased prompt) that signal genuine Prism
# Memory intent. A match here overrides a GENERAL_PROGRAMMING_PATTERNS match.
PRISM_INTENT_PATTERNS = [
    r'\bprism\b', r'\bsession\s*ledger\b', r'\bhandoff\b',
    r'\bknowledge\s+base\b', r'\bknowledge\s+items?\b', r'\bour\s+knowledge\b',
    r'\bsave.*(?:session|ledger|handoff)\b', r'\bload\s+context\b',
    r'\b(?:search|find).*(?:memory|sessions?|conversations?|notes)\b',
    r'\bproject\b',
    r'\bwhat\s+(?:do\s+)?we\s+(?:know|have)\b',
    r'\binstitutional\s+knowledge\b', r'\bdocumented\b', r'\bcurated\b',
    r'\bmemory\s+entry\b', r'\bmemory\s+backend\b', r'\bdiagnostics\b',
    r'\bledger\b', r'\bcompact\b.*(?:ledger|entries|session)\b',
    r'\bexport.*(?:memory|backup)\b',
    r'\b(?:delete|nuke|wipe|remove).*(?:entry|memory|entries)\b',
    r'\blog.*(?:what|accomplished|session)\b',
    r'\brecord.*(?:session|what)\b',
    r'\bhand.*(?:off|over)\b', r'\bbring.*up\s+to\s+speed\b',
    r'\bbug\s+fix.*(?:local\s+model|handle)\b',
    r'\broute.*(?:task|this)\b',
    r'\bbackfill\b', r'\bsynthesize\b',
    r'\bsession\s+graph\b', r'\bsession\s+links?\b',
    r'\bedges?\s+(?:up\s+to\s+date|consistent)\b',
    r'\bgraph\s+integrit', r'\bdangling\b',
    r'\breconnect.*(?:session|links?|references?)\b',
    r'\bpatch.*(?:links?|gaps?)\b', r'\bmissing\s+links?\b',
    r'\bsave\s+experience\b',
    r'\brecord\s+(?:a\s+)?milestone\b', r'\brecord\s+(?:a\s+)?success\b',
    r'\bupvote\b', r'\bdownvote\b',
    r'\bretention\s+polic(?:y|ies)\b',
    # Was r'\bauto.expir\b': the trailing \b prevented any match on
    # "auto-expire" / "auto-expiry" / "auto-expiration".
    r'\bauto.expir',
    r'\bttl\b',
    r'\bknowledge\s+entry\b', r'\bknowledge\s+record\b',
]
def validate_tool_call(prompt, tool_name, tool_args):
    """Layer 3: reject obvious false-positive tool calls and remap semantic neighbors.

    Copied from swe_bench_test.py with additions from bfcl_eval.py.

    The rules run in a fixed order: tool-to-tool remaps first, then argument
    back-filling from the prompt text, then social-pleasantry rejection, and
    finally the general-programming false-positive filter.

    Args:
        prompt: The raw user prompt.
        tool_name: Tool name parsed from the model output ("NO_TOOL" when no
            call was found, "ERROR" when the request failed).
        tool_args: Arguments parsed from the model output. Anything that is
            not a dict (e.g. ``None`` from ``"arguments": null``) is treated
            as an empty dict so the rules below cannot crash on it.

    Returns:
        (tool_name, tool_args) — possibly changed if rejected or remapped.
    """
    if not isinstance(tool_args, dict):
        tool_args = {}
    prompt_lower = prompt.lower()

    # Special NO_TOOL override: "confirm session link/graph consistency" → synthesize_edges
    if tool_name in ("NO_TOOL", "ERROR"):
        if re.search(r'\b(?:confirm|verify|validate|check|ensure)\b', prompt_lower):
            if re.search(r'\bsession\s+(?:link|edge|graph)\s+(?:consistency|consistent)\b', prompt_lower):
                proj_m = re.search(r'\b(?:for|on)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
                return 'session_synthesize_edges', ({'project': proj_m.group(1)} if proj_m else {})
        return tool_name, tool_args

    # --- Group B remaps (before false-positive rejection) ---
    # "reconnect/patch up/dangling links" → backfill_links
    # But don't remap when "synthesize edges" is the explicit first action
    if tool_name in ('session_synthesize_edges', 'session_reconnect'):
        if re.search(r'\b(?:reconnect|backfill|patch\s+up|dangling|link\s+gaps?|missing\s+links?|fix\s+links?)\b', prompt_lower):
            if not re.search(r'^synthesize\b', prompt_lower) and \
                    not re.search(r'\bsynthesiz\w+\s+edges?\s+for\b', prompt_lower):
                return 'session_backfill_links', tool_args

    # "verify/check/make sure session links/edges are consistent / graph integrity" → synthesize_edges
    if tool_name in ('session_health_check', 'session_backfill_links'):
        _has_verify_verb = re.search(
            r'\b(?:verify|validate|check|make\s+sure|ensure|confirm)\b', prompt_lower
        )
        _has_consistent_edge = re.search(
            r'\b(?:edges?|links?|graph)\b.*?\b(?:consistent|up\s+to\s+date|synthesized)\b'
            r'|\bconsistent\b.*?\b(?:edges?|links?|graph)\b'
            r'|\bsession\s+links?\b'
            r'|\bgraph\s+integrit',
            prompt_lower, re.DOTALL
        )
        if _has_verify_verb and _has_consistent_edge:
            return 'session_synthesize_edges', tool_args

    # "synthesize edges for X, then backfill" → synthesize_edges is the FIRST action
    if tool_name == 'session_backfill_links':
        if re.search(r'(?:^|\bfirst\b|\bstart\s+with)\s*synthesize\s+edges?\b', prompt_lower) or \
                re.search(r'^synthesize\b', prompt_lower):
            return 'session_synthesize_edges', tool_args

    # "wipe/clear old entries from knowledge base" → knowledge_forget (not compact_ledger)
    # BUT protect "session entries" / "session history" from this remap
    if tool_name == 'session_compact_ledger':
        if re.search(r'\bknowledge\b', prompt_lower) and re.search(r'\b(?:wipe|clear|delete|remove|entries)\b', prompt_lower):
            if not re.search(r'\bsession\s+(?:entries|history|ledger)\b', prompt_lower):
                return 'knowledge_forget', tool_args

    # "prune/trim/archive old session entries" → session_compact_ledger (not forget_memory)
    if tool_name in ('session_forget_memory', 'knowledge_forget'):
        if re.search(r'\b(?:prune|trim|archive|compress)\b', prompt_lower) and re.search(r'\b(?:session|ledger)\s+(?:entries|history)?\b', prompt_lower):
            proj_m = re.search(r'\b(?:for|on)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
            return 'session_compact_ledger', ({'project': proj_m.group(1)} if proj_m else tool_args)

    # "archive old entries" (without 'knowledge') → session_compact_ledger
    if tool_name == 'session_forget_memory':
        if re.search(r'\b(?:archive|prune|trim)\s+old\s+entries\b', prompt_lower):
            # The ID alternative used to end in \b ("mem-[a-z0-9]\b"), which
            # only matched one-character IDs; real IDs such as "mem-fg33-hh"
            # slipped through and were wrongly remapped.
            if not re.search(r'\bknowledge\b', prompt_lower) and not re.search(r'\bmemory[_\s]id\b|mem-[a-z0-9]', prompt_lower):
                return 'session_compact_ledger', tool_args

    # "knowledge entries/items/records" + delete verbs → knowledge_forget (not session_forget_memory)
    if tool_name == 'session_forget_memory':
        # "entr\b" could never match "entry"/"entries"; spell both forms out.
        # This check also covers "knowledge base", so no separate test for it.
        if re.search(r'\bknowledge\s+(?:entr(?:y|ies)|items?|records?|base)\b', prompt_lower):
            return 'knowledge_forget', tool_args
        # "delete/wipe entries from [project]" without a specific memory ID → knowledge_forget
        if re.search(r'\b(?:entries|records|logs?)\b', prompt_lower) and re.search(r'\bproject\b', prompt_lower):
            if not re.search(r'\bmemory[_\s]id\b|mem-[a-z0-9]|ID\s*[=:]\s*\S+', prompt):
                if not re.search(r'\b(?:session|ledger)\b', prompt_lower):
                    proj_m = re.search(r'(?:for|from|in)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project', prompt_lower, re.I)
                    return 'knowledge_forget', {'project': proj_m.group(1) if proj_m else None}

    # "where were we / bring me up to speed / catch me up" → session_load_context (not session_search_memory)
    if tool_name == 'session_search_memory':
        if re.search(r'\bwhere\s+were\s+we\b|\bbring\s+me\s+up\s+to\s+speed\b|\bcatch\s+me\s+up\b|\bwhat\s+were\s+we\s+(?:doing|working)', prompt_lower):
            project_m = re.search(
                r'\b(?:on|for|with|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
                r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
                r'|(?:state\s+of\s+(?:the\s+)?)([a-zA-Z][a-zA-Z0-9_-]+)(?:\s+project)?\b',
                prompt_lower
            )
            if project_m:
                project = next((g for g in project_m.groups() if g and g not in ('the', 'a', 'this', 'that', 'my', 'our')), None)
            else:
                project = None
            return 'session_load_context', ({'project': project} if project else {})

    # "accumulated documentation / knowledge base" → knowledge_search (not session_search_memory)
    if tool_name == 'session_search_memory':
        if re.search(r'\baccumulated\s+documentation\b|\bknowledge\s+base\b', prompt_lower):
            return 'knowledge_search', tool_args

    # "recent / past / last week / what we did" → session_search_memory (not knowledge_search)
    if tool_name == 'knowledge_search':
        session_hints = [
            r'\brecent\b', r'\bpast\b', r'\blast\s+(?:week|month|session)',
            r'\bwhat\s+we\s+(?:did|decided|worked)',
            r'\bdeployment\s+issues\b',
        ]
        if any(re.search(p, prompt_lower) for p in session_hints):
            return 'session_search_memory', tool_args

    # "remind me / did we ever decide" → session_search_memory (not load_context)
    if tool_name == 'session_load_context':
        if re.search(r'\bremind\s+me\b|\bdid\s+we\s+ever\s+(?:decide|settle|choose|pick)\b|\bwhat\s+did\s+we\s+decide\b', prompt_lower):
            if not re.search(r'\bbring\s+me\s+up\s+to\s+speed\b|\bwhere\s+were\s+we\b|\bcatch\s+me\s+up\b|\bload\s+.*\bcontext\b', prompt_lower):
                return 'session_search_memory', {"query": prompt[:120]}

    # "jot down / write down / make a note / log what just happened" → session_save_ledger
    _LEDGER_TRIGGERS = re.compile(
        r'\bjot\s+down\b|\bwrite\s+(?:it\s+)?down\b|\bwhat\s+we\s+accomplished\b'
        r'|\bmake\s+sure\s+it.{0,10}written\b|\brecord\s+(?:this\s+session|what)\b'
        r'|\bmake\s+(?:a\s+)?note\s+(?:that|of)\b|\blog\s+what\s+just\s+happened\b'
        r'|\bwrite\s+down\s+everything\b|\bbefore\s+I\s+(?:close|head\s+out)\b',
        re.IGNORECASE
    )
    # negative: milestone/achievement events that belong in save_experience
    _EXPERIENCE_NEGATIVE = re.compile(
        r'\b(?:successfully|milestone|achievement|deployed\s+the|shipped\s+the|launched\s+the'
        r'|we\s+(?:fixed|built|completed|created|resolved|deployed|shipped|launched)\s+the'
        r'|race\s+condition|solid\s+now|zero.downtime)\b'
    )
    # Unambiguous note-taking phrases bypass the milestone negative check
    _NOTE_TRIGGERS = re.compile(
        r'\bmake\s+(?:a\s+)?note\s+(?:that|of)\b|\bjot\s+down\b'
        r'|\bwrite\s+(?:it\s+)?down\b|\blog\s+what\s+just\s+happened\b',
        re.IGNORECASE
    )
    if tool_name in ('session_save_experience', 'session_task_route'):
        if _LEDGER_TRIGGERS.search(prompt):
            if _NOTE_TRIGGERS.search(prompt) or not _EXPERIENCE_NEGATIVE.search(prompt_lower):
                if 'content' in tool_args and 'summary' not in tool_args:
                    tool_args = dict(tool_args)
                    tool_args['summary'] = tool_args.pop('content')
                if 'summary' not in tool_args:
                    work_m = re.search(r'(?:we\s+)?((?:rewrote|fixed|refactored|built|deployed|updated|added|removed|finalized|completed|migrated)\s+.{10,120})', prompt, re.I)
                    if not work_m:
                        work_m = re.search(r'(?:make\s+a\s+note|log|note)\s+(?:that\s+)?(?:we\s+)?(completed|finished|did|wrote|refactored|migrated).{0,120}', prompt, re.I)
                    if work_m:
                        tool_args = dict(tool_args)
                        tool_args['summary'] = work_m.group(0).strip().rstrip('.')
                return 'session_save_ledger', tool_args

    # "record that we fixed/built/resolved [thing]" → session_save_experience (milestone)
    if tool_name == 'session_save_ledger':
        if re.search(r'\brecord\s+that\s+we\s+(?:fixed|built|completed|created|resolved|deployed|shipped|launched)\b', prompt_lower):
            return 'session_save_experience', {"project": tool_args.get("project"), "event_type": "milestone"}

    # content → summary normalization + inline extraction for session_save_ledger
    if tool_name == 'session_save_ledger':
        if 'content' in tool_args and 'summary' not in tool_args:
            tool_args = dict(tool_args)
            tool_args['summary'] = tool_args.pop('content')
        if 'summary' not in tool_args:
            work_m = re.search(r'(?:we\s+)?((?:rewrote|fixed|refactored|built|deployed|updated|added|removed|finalized|completed|migrated)\s+.{10,120})', prompt, re.I)
            if not work_m:
                work_m = re.search(r'(?:log|note|record)\s+(?:what\s+just\s+happened|this|that)\s*[:;]\s*(.{10,120})', prompt, re.I)
            if work_m:
                tool_args = dict(tool_args)
                tool_args['summary'] = (work_m.group(1) if work_m.lastindex else work_m.group(0)).strip().rstrip('.')

    # "log that we successfully deployed/shipped" → session_save_experience milestone (not save_ledger)
    if tool_name == 'session_save_ledger':
        if re.search(r'\blog\s+that\s+we\s+successfully\b|\bsuccessfully\s+deployed\b|\bsuccessfully\s+shipped\b|\bsuccessfully\s+launched\b', prompt_lower):
            return 'session_save_experience', {"project": tool_args.get("project"), "event_type": "success"}

    # "shift change / store current state for next agent" → session_save_handoff
    if tool_name == 'session_save_ledger':
        if re.search(r'\bshift\s+change\b|\bstore\s+(?:the\s+)?current\s+state\s+for\b|\bnext\s+(?:agent|person|developer)\s+can\s+continue\b|\bhand.*over\b|\bpick.*up\s+next\b', prompt_lower):
            return 'session_save_handoff', tool_args

    # Multi-intent: "Search/Find ... THEN upvote/downvote" → first action is search
    if tool_name in ('knowledge_upvote', 'knowledge_downvote'):
        if re.search(r'\bthen\s+(?:upvote|downvote|boost|rate\s+up|rate\s+down)\b', prompt_lower):
            if re.search(r'^(?:search|find|look\s+up)\b', prompt_lower):
                query_m = re.search(
                    r'^(?:search\s+(?:for\s+)?|find\s+(?:our\s+)?(?:notes?\s+on\s+)?|look\s+up\s+)(.+?)(?:,?\s*then\b)',
                    prompt, re.I
                )
                return 'session_search_memory', {"query": query_m.group(1).strip() if query_m else prompt[:120]}

    # invalid tool name → try retention or upvote/downvote
    if tool_name not in VALID_TOOLS:
        if re.search(r'\b(?:auto.?expir|ttl\b|\d+\s*days?\s+(?:retention|expir)|\bretention\s*polic)', prompt_lower):
            return 'knowledge_set_retention', tool_args
        # fall through to upvote/downvote patterns below

    # knowledge_forget / knowledge_set_retention → upvote/downvote protection
    _UPVOTE_SET = {'knowledge_forget', 'knowledge_set_retention', 'session_forget_memory',
                   'session_task_route', 'session_search_memory'}
    # Don't remap to upvote/downvote when primary intent is "search THEN upvote"
    _is_search_then_vote = (
        re.search(r'^(?:search|find|look\s+up)\b', prompt_lower) and
        re.search(r'\bthen\s+(?:upvote|downvote|boost|rate\s+up|rate\s+down)\b', prompt_lower)
    )
    if (tool_name in _UPVOTE_SET or tool_name not in VALID_TOOLS) and not _is_search_then_vote:
        _id_val = tool_args.get("id") or tool_args.get("knowledge_id") or tool_args.get("entry_id")
        if re.search(r'\b(?:upvote|boost|increase\s+(?:the\s+|its\s+)?(?:rank|score|importance)|uprate|thumbs[\s-]?up|mark\s+(?:it\s+)?(?:up|helpful|useful|great|good)|importance\s+score)\b', prompt_lower):
            return 'knowledge_upvote', {"id": _id_val}
        if re.search(r'\b(?:downvote|lower\s+(?:the\s+|its\s+)?(?:rank|score)|not\s+useful|derank|thumbs[\s-]?down|reduce\s+(?:the\s+|its\s+)?(?:rank|score)|mark\s+(?:it\s+)?(?:down|bad|wrong|outdated|terrible))\b', prompt_lower):
            return 'knowledge_downvote', {"id": _id_val}

    # session_load_context: extract project from prompt if missing
    if tool_name == 'session_load_context':
        if not tool_args.get('project'):
            proj_m = re.search(
                r'\b(?:on|for|of|with|in)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
                r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
                r'|(?:state\s+of\s+(?:the\s+)?)([a-zA-Z][a-zA-Z0-9_-]+)(?:\s+project)?\b',
                prompt_lower
            )
            if proj_m:
                proj = next((g for g in proj_m.groups() if g), None)
                if proj and proj not in ('the', 'a', 'this', 'that', 'my', 'our'):
                    tool_args = dict(tool_args)
                    tool_args['project'] = proj

    # session_compact_ledger: extract project if missing
    if tool_name == 'session_compact_ledger':
        if not tool_args.get('project'):
            proj_m = re.search(
                r'\b(?:for|on|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+(?:project\s+)?ledger\b'
                r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\s+ledger\b'
                r'|\b(?:compact|trim|prune|compress|archive)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+(?:project|ledger)\b',
                prompt_lower
            )
            if proj_m:
                proj = next((g for g in proj_m.groups() if g), None)
                if proj and proj not in ('the', 'a', 'this', 'that', 'my', 'our', 'old', 'stale'):
                    tool_args = dict(tool_args)
                    tool_args['project'] = proj

    # "is this something the local model can handle? / route this task" → session_task_route
    if tool_name == 'session_search_memory':
        # The prompt is lower-cased, so the pronoun must be spelled "i" here;
        # the previous upper-case "I" alternative could never match.
        if re.search(r'\b(?:local\s+(?:model|agent)\s+(?:can\s+handle|should\s+handle)|route\s+this\s+task|should\s+(?:i|the\s+local\s+model)\s+(?:tackle|handle)|is\s+this\s+(?:something|simple\s+enough)\s+(?:for\s+the\s+)?local)\b', prompt_lower):
            return 'session_task_route', {"task_description": prompt}

    # session_task_route: extract task_description from prompt
    if tool_name == 'session_task_route':
        if not tool_args.get('task_description'):
            tool_args = dict(tool_args)
            tool_args['task_description'] = prompt

    # session_export_memory: extract output_path from path patterns, format from keywords
    if tool_name == 'session_export_memory':
        tool_args = dict(tool_args)
        if not tool_args.get('output_path'):
            path_m = re.search(
                r'(?:save\s+to|(?:output|export|dump)\s+(?:to\s+)?["\']?|to\s+["\']?)(/[\w/.-]+|~/[\w/.-]+)',
                prompt, re.I
            )
            if path_m:
                # The character class admits '.', so a sentence-final period
                # would otherwise be captured as part of the path.
                tool_args['output_path'] = path_m.group(1).rstrip('.')
        if not tool_args.get('format'):
            fmt_m = re.search(r'\b(json|jsonl|markdown|csv|yaml)\b(?:\s+format)?', prompt_lower)
            if fmt_m:
                tool_args['format'] = fmt_m.group(1)

    # session_compact_ledger: protect "session entries" from knowledge_forget remap
    # (already handled above but ensure compact stays for session-specific prompts)

    # "where did we leave off / what was the state" → session_load_context
    if tool_name == 'session_search_memory':
        if re.search(r'\bwhere\s+did\s+we\s+leave\s+off\b|\bwhat\s+was\s+the\s+state\s+of\b|\bget\s+me\s+(?:re-?oriented|up\s+to\s+speed)\b|\bpull\s+up\s+(?:whatever|the\s+(?:full\s+)?context)', prompt_lower):
            project_m = re.search(r'\b(?:on|for|with|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
            project = project_m.group(1) if project_m else None
            return 'session_load_context', ({'project': project} if project else {})

    # --- Social pleasantry rejection ---
    SOCIAL_PATTERNS = [
        r'^thanks', r'^thank you', r'^cheers', r'^goodbye', r'^bye',
        r"that's all", r"we're done", r"all done", r"all set",
        r'^ok\s+great', r'^perfect$', r'^nice$', r'^cool$',
        r'^hi\b', r'^hey\b', r'^hello\b', r'^good\s+morning', r'^good\s+afternoon',
    ]
    is_social = any(re.search(p, prompt_lower.strip()) for p in SOCIAL_PATTERNS)
    # A pleasantry that also contains an action word is still treated as a request.
    if is_social and not any(w in prompt_lower for w in [
        'save', 'export', 'search', 'load', 'record', 'log', 'run', 'check',
        'find', 'compact', 'handoff', 'route', 'synthesize', 'backfill',
        'forget', 'upvote', 'downvote',
    ]):
        return "NO_TOOL", {}

    # --- False-positive rejection (CS patterns) ---
    is_general = any(re.search(p, prompt_lower) for p in GENERAL_PROGRAMMING_PATTERNS)
    if not is_general:
        return tool_name, tool_args

    has_prism_intent = any(re.search(p, prompt_lower) for p in PRISM_INTENT_PATTERNS)
    if has_prism_intent:
        return tool_name, tool_args

    return "NO_TOOL", {}
# ---------------------------------------------------------------------------
# Ollama Call
# ---------------------------------------------------------------------------
TOOL_CALL_NOPIPE_RE = re.compile(
    r'\s*(\{.*?\})\s*(?:|$)', re.DOTALL
)
TOOL_CALL_PIPE_RE = re.compile(
    r'<\|tool_call\|>\s*(\{.*?\})', re.DOTALL
)
BARE_JSON_RE = re.compile(
    r'(\{[^{}]*"name"\s*:\s*"[^"]+?"[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
)


def _decode_tool_call(candidate):
    """Decode one JSON candidate into (tool_name, tool_args), or None if unusable.

    The name is read from "name" (falling back to "tool"), the arguments from
    "arguments" (falling back to "args"). The result is always (str, dict):
    a non-string name becomes "UNKNOWN"; arguments given as a JSON-encoded
    string are decoded; anything else that is not a dict becomes {}.
    """
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, dict):
        return None

    name = parsed.get("name", parsed.get("tool", "UNKNOWN"))
    if not isinstance(name, str):
        name = "UNKNOWN"

    args = parsed.get("arguments", parsed.get("args", {}))
    if isinstance(args, str):
        try:
            args = json.loads(args)
        except json.JSONDecodeError:
            args = {}
    if not isinstance(args, dict):
        args = {}
    return name, args


def call_ollama(prompt: str, timeout: int = 120) -> tuple:
    """Call Ollama REST API with a pre-formatted ChatML prompt.

    Args:
        prompt: Fully formatted prompt text, sent with ``raw=True``.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        (raw_response, tool_name, tool_args, latency_secs).
        tool_name is "NO_TOOL" when no tool call could be parsed from the
        response and "ERROR" when the request itself failed, in which case
        raw_response holds the exception text. tool_args is always a dict.
    """
    start = time.time()
    try:
        payload = json.dumps({
            "model": MODEL,
            "prompt": prompt,
            "stream": False,
            "raw": True,
            "options": {"temperature": 0.0, "num_predict": 512},
        }).encode("utf-8")
        req = urllib.request.Request(
            OLLAMA_API,
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read().decode("utf-8"))
        raw = data.get("response", "").strip()
    except Exception as exc:
        # Deliberately broad: one failed request is scored as an ERROR case
        # instead of aborting the whole evaluation run.
        return (str(exc), "ERROR", {}, time.time() - start)

    latency = time.time() - start

    # Strip CoT blocks
    clean = re.sub(
        r'<\|synalux_think\|>.*?(?:|$)',
        '', raw, flags=re.DOTALL
    )

    # Try each extraction strategy in priority order; a candidate that does
    # not decode as JSON falls through to the next strategy.
    #   0: no-pipe (v43 native format)
    #   1: piped <|tool_call|>
    #   2: bare JSON with "name" key
    for pattern in (TOOL_CALL_NOPIPE_RE, TOOL_CALL_PIPE_RE, BARE_JSON_RE):
        m = pattern.search(clean)
        if not m:
            continue
        decoded = _decode_tool_call(m.group(1))
        if decoded is not None:
            tool_name, tool_args = decoded
            return (raw, tool_name, tool_args, latency)

    return (raw, "NO_TOOL", {}, latency)
expected_tool == "NO_TOOL": return "false_positive" if got_tool != "NO_TOOL" else "strict_pass" if got_tool == "NO_TOOL": return "false_negative" # Accept either search tool for ambiguous prompts tools_match = (got_tool == expected_tool) or ( expected_tool in ("session_search_memory", "knowledge_search") and got_tool in ("session_search_memory", "knowledge_search") ) if not tools_match: return "wrong_tool" if not required_params: return "strict_pass" if not isinstance(got_args, dict): got_args = {} present = [p for p in required_params if p in got_args and got_args[p] not in (None, "", [])] if len(present) == len(required_params): return "strict_pass" if len(present) > 0: return "partial_pass" # Right tool, zero params matched return "partial_pass" def score(verdict): if verdict == "strict_pass": return 1.0 if verdict == "partial_pass": return 0.5 return 0.0 # --------------------------------------------------------------------------- # Main Eval # --------------------------------------------------------------------------- def run_once(tests, shuffle=False, run_label=""): """Run one full pass over test suite. 
Returns (results_list, category_stats).""" indexed = list(enumerate(tests)) if shuffle: random.shuffle(indexed) results = [None] * len(tests) category_stats = {} for display_i, (orig_idx, (prompt, expected, req_params, category)) in enumerate(indexed, 1): chatml = ( f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n" f"<|im_start|>user\n{prompt}<|im_end|>\n" f"<|im_start|>assistant\n" ) raw, got_tool, got_args, latency = call_ollama(chatml) got_tool, got_args = validate_tool_call(prompt, got_tool, got_args) verdict = evaluate_result(expected, req_params, got_tool, got_args) icon = "OK" if verdict == "strict_pass" else ("~~" if verdict == "partial_pass" else "XX") tag = f"#{orig_idx + 1:03d}" short = prompt[:52] run_info = f"[{run_label}] " if run_label else "" print( f" {run_info}[{display_i:3d}/{len(tests)}] {icon} {tag} " f"expect={expected:30s} got={got_tool:30s} {latency:5.1f}s | {short}" ) if verdict != "strict_pass": if verdict == "partial_pass": missing = [p for p in req_params if p not in got_args or got_args.get(p) in (None, "", [])] print(f" -> partial: missing params {missing}") elif verdict == "false_positive": print(f" -> FALSE POSITIVE: called {got_tool} (expected NO_TOOL)") elif verdict == "false_negative": print(f" -> FALSE NEGATIVE: no tool called (expected {expected})") elif verdict == "wrong_tool": print(f" -> WRONG TOOL: expected {expected}, got {got_tool}") results[orig_idx] = { "id": orig_idx + 1, "prompt": prompt, "expected": expected, "got": got_tool, "got_args": got_args, "verdict": verdict, "latency": latency, "category": category, "points": score(verdict), } if category not in category_stats: category_stats[category] = {"total": 0, "strict": 0, "partial": 0, "fail": 0, "points": 0.0} cat = category_stats[category] cat["total"] += 1 cat["points"] += score(verdict) if verdict == "strict_pass": cat["strict"] += 1 elif verdict == "partial_pass": cat["partial"] += 1 else: cat["fail"] += 1 return results, category_stats def 
print_run_summary(results, category_stats, run_label=""): strict = sum(1 for r in results if r["verdict"] == "strict_pass") partial = sum(1 for r in results if r["verdict"] == "partial_pass") fp = sum(1 for r in results if r["verdict"] == "false_positive") fn = sum(1 for r in results if r["verdict"] == "false_negative") wt = sum(1 for r in results if r["verdict"] == "wrong_tool") total = len(results) total_points = sum(r["points"] for r in results) tool_tests = [r for r in results if r["expected"] != "NO_TOOL"] no_tool_tests = [r for r in results if r["expected"] == "NO_TOOL"] no_tool_correct = sum(1 for r in no_tool_tests if r["verdict"] == "strict_pass") hallucinations = sum(1 for r in results if r["verdict"] == "false_positive") avg_lat = sum(r["latency"] for r in results) / total if total else 0 lbl = f" (Run {run_label})" if run_label else "" print() print("=" * 80) print(f" EVAL-300 RESULTS{lbl}") print("=" * 80) print(f" Strict Pass: {strict}/{total} = {strict / total * 100:.1f}%") print(f" Partial Pass: {partial}/{total} = {partial / total * 100:.1f}%") print(f" Wrong Tool: {wt}/{total}") print(f" False Positives: {fp}/{total} (hallucinations)") print(f" False Negatives: {fn}/{total}") print(f" ---") print(f" strict_pct (strict/total): {strict / total * 100:.1f}%") print(f" weighted_pct (total_points/total): {total_points / total * 100:.1f}%") print(f" Abstention accuracy: {no_tool_correct}/{len(no_tool_tests)} = {no_tool_correct / len(no_tool_tests) * 100:.1f}%") print(f" Hallucinations: {hallucinations} (target = 0)") print(f" Avg latency: {avg_lat:.1f}s") print() print(f" {'Category':<22} {'Strict':>7} {'Partial':>8} {'Fail':>5} {'Pts/Tot':>10} {'Pct':>6}") print(f" {'-'*22} {'-'*7} {'-'*8} {'-'*5} {'-'*10} {'-'*6}") for cat, s in sorted(category_stats.items()): pts_pct = s["points"] / s["total"] * 100 if s["total"] else 0 print(f" {cat:<22} {s['strict']:>7} {s['partial']:>8} {s['fail']:>5} " f"{s['points']:>5.1f}/{s['total']:<4} {pts_pct:>5.1f}%") 
print("=" * 80) return { "strict": strict, "partial": partial, "wrong_tool": wt, "false_positive": fp, "false_negative": fn, "total": total, "total_points": total_points, "strict_pct": strict / total, "weighted_pct": total_points / total, "abstention_rate": no_tool_correct / len(no_tool_tests) if no_tool_tests else 0, "hallucinations": hallucinations, "avg_latency": avg_lat, "category_stats": category_stats, } def main(): parser = argparse.ArgumentParser(description="Eval-300: 300-case standard evaluation for prism-coder") parser.add_argument("--model", type=str, default=None, help="Ollama model tag to evaluate (default: prism-coder:4b-v43)") parser.add_argument("--runs", type=int, default=1, help="Number of eval runs (default: 1; use 3 for stability check)") parser.add_argument("--shuffle", action="store_true", help="Randomize test order each run") parser.add_argument("--no-validate-layer3", action="store_true", help="Disable Layer 3 false-positive rejection " "(use during RFT/DPO so model sees true failures)") args = parser.parse_args() global MODEL, validate_tool_call if args.model: MODEL = args.model if args.no_validate_layer3: def validate_tool_call(prompt, tool_name, tool_args): # noqa: F811 return tool_name, tool_args _verify_test_counts() print("=" * 80) print(f" EVAL-300 — prism-coder standard evaluation") print(f" Model: {MODEL}") print(f" Tests: {len(TESTS)}") print(f" Runs: {args.runs}" + (" (RANDOMIZED ORDER each run)" if args.shuffle else "")) print(f" Layer3: {'DISABLED' if args.no_validate_layer3 else 'enabled'}") print("=" * 80) all_run_summaries = [] all_run_results = [] for run_idx in range(args.runs): run_label = str(run_idx + 1) if args.runs > 1 else "" if args.runs > 1: print(f"\n{'#' * 80}") print(f" RUN {run_idx + 1} / {args.runs}" + (f" (seed={random.randint(1000, 9999)})" if args.shuffle else "")) print(f"{'#' * 80}") results, cat_stats = run_once(TESTS, shuffle=args.shuffle, run_label=run_label) summary = print_run_summary(results, 
cat_stats, run_label=run_label) all_run_summaries.append(summary) all_run_results.append(results) # --------------------------------------------------------------------------- # Multi-run aggregate # --------------------------------------------------------------------------- if args.runs > 1: strict_scores = [s["strict"] for s in all_run_summaries] weighted_pcts = [s["weighted_pct"] * 100 for s in all_run_summaries] total = all_run_summaries[0]["total"] halluc_counts = [s["hallucinations"] for s in all_run_summaries] # Per-test stability per_test_pass = [0] * len(TESTS) per_test_fail_tools = [[] for _ in range(len(TESTS))] for run_results in all_run_results: for r in run_results: idx = r["id"] - 1 if r["verdict"] == "strict_pass": per_test_pass[idx] += 1 else: per_test_fail_tools[idx].append(r.get("got", "???")) med_strict = statistics.median(strict_scores) avg_strict = statistics.mean(strict_scores) med_weighted = statistics.median(weighted_pcts) print(f"\n{'=' * 80}") print(f" MULTI-RUN SUMMARY ({args.runs} runs x {total} tests)") print(f"{'=' * 80}") print(f" Strict scores: {' | '.join(f'{s}/{total}' for s in strict_scores)}") print(f" Median strict: {med_strict}/{total} = {med_strict / total * 100:.1f}%") print(f" Average strict: {avg_strict:.1f}/{total} = {avg_strict / total * 100:.1f}%") print(f" Weighted pct: {' | '.join(f'{p:.1f}%' for p in weighted_pcts)} " f"(median {med_weighted:.1f}%)") print(f" Hallucinations: {' | '.join(str(h) for h in halluc_counts)} " f"(target = 0 each run)") print() print(f" Flaky tests (< 100% pass rate across {args.runs} runs):") flaky = [] for i, (prompt, expected, _, cat) in enumerate(TESTS): rate = per_test_pass[i] / args.runs if rate < 1.0: fail_tools = per_test_fail_tools[i] flaky.append((i + 1, rate, expected, set(fail_tools), cat, prompt[:60])) if flaky: for fid, rate, exp, fails, fcat, fshort in sorted(flaky, key=lambda x: x[1]): print(f" [{fid:03d}] {rate * 100:3.0f}% | cat={fcat:<18s} | expect={exp:<28s} | 
fails->{','.join(fails):<20s} | {fshort}") else: print(" All tests passed consistently across all runs!") print(f" Total flaky: {len(flaky)}/{total}") print(f"{'=' * 80}") # --------------------------------------------------------------------------- # Save JSON report # --------------------------------------------------------------------------- os.makedirs("results", exist_ok=True) report_path = "results/eval300_report.json" final_summary = all_run_summaries[-1] if args.runs == 1 else { "runs": args.runs, "strict_scores": strict_scores, "median_strict": statistics.median(strict_scores) / total, "avg_strict": statistics.mean(strict_scores) / total, "median_weighted_pct": statistics.median(weighted_pcts) / 100, "hallucinations_per_run": halluc_counts, "per_run_summaries": all_run_summaries, } if args.runs > 1 else all_run_summaries[0] report = { "model": MODEL, "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"), "total_tests": len(TESTS), "runs": args.runs, "shuffle": args.shuffle, "layer3_enabled": not args.no_validate_layer3, "summary": final_summary, "last_run_results": all_run_results[-1], } with open(report_path, "w") as f: json.dump(report, f, indent=2, default=str) print(f"\nReport saved: {report_path}") # Exit code: fail if last run strict < 90% last_strict_pct = all_run_summaries[-1]["strict_pct"] * 100 if last_strict_pct < 90.0: print(f"FAIL: strict_pct {last_strict_pct:.1f}% is below 90% gate") sys.exit(1) else: print(f"PASS: strict_pct {last_strict_pct:.1f}%") sys.exit(0) if __name__ == "__main__": main()