#!/usr/bin/env python3
"""
eval_300.py — 300-Case Standard Evaluation for prism-coder:4b-v43
Replaces bfcl_eval.py (64 tests) and swe_bench_test.py (68 tests) with a single
~300-case blind eval. Designed to be run 3 times for statistical stability checks.
All test cases are NOVEL — never seen in any training data.
Categories:
natural_phrasing (50) — casual/indirect phrasing that maps to a tool
adversarial_trap (70) — CS/programming questions that must NOT call a tool
disambiguation (40) — similar tools exist; must pick the correct one
edge_case (25) — minimal / ambiguous prompts
multi_intent (20) — multi-step prompts; score on first action only
verifier (25) — synthesize_edges / backfill_links / health_check patterns
cascade (25) — explicit first-step-of-chain patterns
param_extraction (25) — params in the prompt text; test correct extraction
abstention (20) — greetings / capability questions; must return NO_TOOL
Scoring:
strict_pass = correct tool + all required_params present → 1.0 point
partial_pass = correct tool + at least 1 required_param but not all → 0.5 point
wrong_tool = wrong tool name → 0 points
false_pos = tool called when NO_TOOL expected → 0 points
false_neg = NO_TOOL when tool expected → 0 points
Usage:
python3 eval_300.py
python3 eval_300.py --runs 3 --shuffle
python3 eval_300.py --model prism-coder:4b-v43 --runs 3
python3 eval_300.py --no-validate-layer3
"""
import json
import os
import re
import sys
import time
import random
import statistics
import urllib.request
import argparse
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
MODEL = "prism-coder:4b-v43"
OLLAMA_API = "http://localhost:11434/api/generate"
SYSTEM_PROMPT = (
"You are Synalux, a memory-augmented coding and clinical reasoning assistant. "
"You have access to Prism Memory tools (session_save_ledger, session_load_context, "
"session_search_memory, session_save_handoff, session_forget_memory, session_health_check, "
"session_compact_ledger, session_export_memory, session_task_route, session_save_experience, "
"session_synthesize_edges, session_backfill_links, knowledge_search, knowledge_forget, "
"knowledge_upvote, knowledge_downvote, knowledge_set_retention) and 13 multimodal tool "
"modules (image_gen, office, web_scraper, browser, tts, ocr, git, terminal, deps_scanner, "
"hipaa, data_graph, templates, pdf_parser). "
"Think step-by-step before answering. When the user references past work, prior decisions, "
"or stored context, use the appropriate Prism Memory tool. "
"Format tool calls inside ... JSON blocks with fields 'name' and 'arguments'. "
"If no tool is needed, answer directly in plain text. "
"ABSTAIN for general programming questions, CS concepts, greetings, and capability questions."
)
VALID_TOOLS = {
"session_load_context", "session_save_ledger", "session_save_handoff",
"session_search_memory", "session_forget_memory", "session_health_check",
"session_compact_ledger", "session_export_memory", "session_task_route",
"session_save_experience", "session_synthesize_edges", "session_backfill_links",
"knowledge_search", "knowledge_forget", "knowledge_upvote",
"knowledge_downvote", "knowledge_set_retention",
}
# ---------------------------------------------------------------------------
# Test Cases (prompt, expected_tool_or_NO_TOOL, required_params, category)
# required_params: list of param keys that MUST appear in got_args
# ---------------------------------------------------------------------------
TESTS = [
# ===========================================================================
# CATEGORY 1: natural_phrasing (50 cases)
# Casual / indirect user phrasing that maps to a specific Prism tool.
# ===========================================================================
# --- session_load_context ---
("Alright, kick things off. Pull up whatever we had on the checkout-service project.",
"session_load_context", ["project"], "natural_phrasing"),
("I'm back from lunch. Get me re-oriented on the prism-aac project.",
"session_load_context", ["project"], "natural_phrasing"),
("Fresh session here. Reconstruct everything we built for the notifications project.",
"session_load_context", ["project"], "natural_phrasing"),
("Starting a new chat. Bring up the full context for the mobile-app project.",
"session_load_context", ["project"], "natural_phrasing"),
("Where did we leave off with the auth-service work?",
"session_load_context", [], "natural_phrasing"),
("Get me up to speed on the reporting-dashboard project.",
"session_load_context", ["project"], "natural_phrasing"),
("Resume from where we were on the data-pipeline project.",
"session_load_context", ["project"], "natural_phrasing"),
("Catch me up — what was the state of the subscription-api project?",
"session_load_context", ["project"], "natural_phrasing"),
# --- session_save_ledger ---
("We wrapped up for today. Make a note that we completed the database indexing overhaul.",
"session_save_ledger", [], "natural_phrasing"),
("Log what just happened: we refactored the payment module and all tests pass.",
"session_save_ledger", [], "natural_phrasing"),
("Record this session — we finalized the API contract for the mobile team.",
"session_save_ledger", [], "natural_phrasing"),
("Write down everything we did today before I close this tab.",
"session_save_ledger", [], "natural_phrasing"),
("Jot down our progress: three endpoints migrated, two more to go.",
"session_save_ledger", [], "natural_phrasing"),
("Before I head out, save a summary of what we accomplished this afternoon.",
"session_save_ledger", [], "natural_phrasing"),
# --- session_save_handoff ---
("I'm handing this over. Leave a note for whoever picks this up next on the billing-portal project.",
"session_save_handoff", ["project"], "natural_phrasing"),
("Pass the baton on the logistics-api project. Save the handoff so the next person knows where we are.",
"session_save_handoff", ["project"], "natural_phrasing"),
("Shift change. Store the current state for the embedded-firmware project so the next agent can continue.",
"session_save_handoff", ["project"], "natural_phrasing"),
("Create a handoff note for the trading-platform project — we got through feature flagging, still need A/B routing.",
"session_save_handoff", ["project"], "natural_phrasing"),
# --- session_search_memory ---
("Remind me — did we ever pick a caching strategy for the CDN layer?",
"session_search_memory", ["query"], "natural_phrasing"),
("Did we discuss anything about Kafka consumer lag in previous sessions?",
"session_search_memory", ["query"], "natural_phrasing"),
("Go back through our history and find anything about the CI pipeline refactor.",
"session_search_memory", ["query"], "natural_phrasing"),
("What did we decide about webhook retry logic in past conversations?",
"session_search_memory", ["query"], "natural_phrasing"),
("Dig up anything we recorded about the multi-tenant database design.",
"session_search_memory", ["query"], "natural_phrasing"),
("Pull up any notes we saved about the gRPC migration.",
"session_search_memory", ["query"], "natural_phrasing"),
# --- session_forget_memory ---
("That entry we saved about using SQLite in production is totally wrong. Remove it.",
"session_forget_memory", ["memory_id"], "natural_phrasing"),
("Delete the memory with ID mem-zx91-ff. It's stale.",
"session_forget_memory", ["memory_id"], "natural_phrasing"),
("Wipe the incorrect ledger note that said we shipped v2.1 — we didn't.",
"session_forget_memory", ["memory_id"], "natural_phrasing"),
# --- session_health_check ---
("Something feels off. Can you run diagnostics on the memory backend?",
"session_health_check", [], "natural_phrasing"),
("Before I trust these search results, verify the memory system is healthy.",
"session_health_check", [], "natural_phrasing"),
("Give the memory infrastructure a quick checkup.",
"session_health_check", [], "natural_phrasing"),
# --- session_compact_ledger ---
("The session history for the event-sourcing project is getting massive. Trim and archive the old entries.",
"session_compact_ledger", ["project"], "natural_phrasing"),
("Compress the ledger for the recommendation-engine project — too much noise in there.",
"session_compact_ledger", ["project"], "natural_phrasing"),
("Prune out the old session entries for the analytics-backend project.",
"session_compact_ledger", ["project"], "natural_phrasing"),
# --- session_export_memory ---
("Dump a full backup of my memory to /data/exports in JSON format.",
"session_export_memory", ["output_path", "format"], "natural_phrasing"),
("Export everything to /tmp/prism-dump so I can archive it.",
"session_export_memory", ["output_path"], "natural_phrasing"),
("I need an offline copy of all session data. Export to /backup/weekly.",
"session_export_memory", ["output_path"], "natural_phrasing"),
# --- session_task_route ---
("Should I tackle this Rust async runtime bug locally or send it to a bigger model?",
"session_task_route", ["task_description"], "natural_phrasing"),
("Is this image classification fine-tuning job something the local agent can handle?",
"session_task_route", ["task_description"], "natural_phrasing"),
("Route this task: refactor the monorepo build system to support incremental compilation.",
"session_task_route", ["task_description"], "natural_phrasing"),
# --- session_save_experience ---
("Log a milestone: we successfully zero-downtime-deployed the new search index.",
"session_save_experience", [], "natural_phrasing"),
("Record that we fixed the race condition in the WebSocket handler — took 4 hours but it's solid now.",
"session_save_experience", [], "natural_phrasing"),
# --- knowledge_search ---
("Any institutional knowledge on how we handle circuit breakers?",
"knowledge_search", ["query"], "natural_phrasing"),
("What does our knowledge base say about rate limiting strategies?",
"knowledge_search", ["query"], "natural_phrasing"),
("Look up anything curated about CQRS patterns.",
"knowledge_search", ["query"], "natural_phrasing"),
("Check our documented knowledge for anything on event-driven architecture.",
"knowledge_search", ["query"], "natural_phrasing"),
# --- knowledge_upvote / downvote ---
("That knowledge entry about using Redis for distributed locks was really helpful. Give it a thumbs up.",
"knowledge_upvote", [], "natural_phrasing"),
("Boost the ranking on our GraphQL federation notes — they're gold.",
"knowledge_upvote", [], "natural_phrasing"),
("That doc about using polling instead of webhooks is outdated and wrong. Lower its score.",
"knowledge_downvote", [], "natural_phrasing"),
("Downvote the entry about using bcrypt at cost 4 — it's dangerously insecure.",
"knowledge_downvote", [], "natural_phrasing"),
# --- knowledge_set_retention ---
("Set a 45-day retention policy on the alpha-testing project's knowledge.",
"knowledge_set_retention", ["project"], "natural_phrasing"),
# ===========================================================================
# CATEGORY 2: adversarial_trap (70 cases)
# CS / programming questions — must return NO_TOOL even when keywords match.
# ===========================================================================
# Python
("Write a Python function that implements a trie for fast prefix searches.",
"NO_TOOL", [], "adversarial_trap"),
("How do I use Python's contextlib.contextmanager decorator?",
"NO_TOOL", [], "adversarial_trap"),
("Explain Python's __slots__ and when to use it for memory optimization.",
"NO_TOOL", [], "adversarial_trap"),
("What is the difference between deepcopy and shallow copy in Python?",
"NO_TOOL", [], "adversarial_trap"),
("How does Python's asyncio event loop schedule coroutines?",
"NO_TOOL", [], "adversarial_trap"),
("Write a Python generator that yields prime numbers indefinitely.",
"NO_TOOL", [], "adversarial_trap"),
("How do I profile memory usage in a Python application?",
"NO_TOOL", [], "adversarial_trap"),
# JavaScript / TypeScript
("How do I debounce a function in JavaScript without lodash?",
"NO_TOOL", [], "adversarial_trap"),
("Explain the JavaScript event loop and microtask queue.",
"NO_TOOL", [], "adversarial_trap"),
("How does TypeScript's discriminated union type work?",
"NO_TOOL", [], "adversarial_trap"),
("Write a TypeScript generic function that deep-merges two objects.",
"NO_TOOL", [], "adversarial_trap"),
("What is the difference between a WeakMap and a Map in JavaScript?",
"NO_TOOL", [], "adversarial_trap"),
("How do I implement a promise-based queue in Node.js?",
"NO_TOOL", [], "adversarial_trap"),
# Go
("How does Go's goroutine scheduler work with M:N threading?",
"NO_TOOL", [], "adversarial_trap"),
("Explain Go's garbage collector and write barriers.",
"NO_TOOL", [], "adversarial_trap"),
("Write a concurrent rate limiter in Go using channels.",
"NO_TOOL", [], "adversarial_trap"),
("How do I implement context cancellation in a Go HTTP server?",
"NO_TOOL", [], "adversarial_trap"),
# Rust
("Explain Rust's borrow checker and why it prevents data races.",
"NO_TOOL", [], "adversarial_trap"),
("How do Arc and Mutex work together in Rust for thread-safe state sharing?",
"NO_TOOL", [], "adversarial_trap"),
("What is Rust's Pin and why is it needed for async futures?",
"NO_TOOL", [], "adversarial_trap"),
("Write a Rust trait that implements a retry strategy with exponential backoff.",
"NO_TOOL", [], "adversarial_trap"),
# SQL / NoSQL
("Write a SQL query that finds the second-highest salary in an employees table.",
"NO_TOOL", [], "adversarial_trap"),
("How do I use window functions in PostgreSQL to compute a running total?",
"NO_TOOL", [], "adversarial_trap"),
("What is a covering index and when should I use one in MySQL?",
"NO_TOOL", [], "adversarial_trap"),
("Explain eventual consistency in DynamoDB and how to work around it.",
"NO_TOOL", [], "adversarial_trap"),
("How do I export data from MongoDB to a JSON file using mongoexport?",
"NO_TOOL", [], "adversarial_trap"),
("What is a materialized view in PostgreSQL and how does it differ from a regular view?",
"NO_TOOL", [], "adversarial_trap"),
# Algorithms / Data Structures
("Explain Dijkstra's algorithm and its time complexity.",
"NO_TOOL", [], "adversarial_trap"),
("Write a depth-first search implementation for a graph adjacency list.",
"NO_TOOL", [], "adversarial_trap"),
("How does consistent hashing help with horizontal scaling?",
"NO_TOOL", [], "adversarial_trap"),
("Explain the difference between a B-tree and a B+ tree.",
"NO_TOOL", [], "adversarial_trap"),
("What is the time and space complexity of merge sort?",
"NO_TOOL", [], "adversarial_trap"),
("Implement a LRU cache in Python using OrderedDict.",
"NO_TOOL", [], "adversarial_trap"),
("How does a bloom filter work and what are its false positive trade-offs?",
"NO_TOOL", [], "adversarial_trap"),
# Frameworks / Config
("How do I configure Django's ORM to use read replicas?",
"NO_TOOL", [], "adversarial_trap"),
("Explain Flask's application context vs. request context.",
"NO_TOOL", [], "adversarial_trap"),
("How does FastAPI's dependency injection system work?",
"NO_TOOL", [], "adversarial_trap"),
("Write a middleware in Express.js that logs request durations.",
"NO_TOOL", [], "adversarial_trap"),
("How do I set up hot-module replacement in a Vite + React project?",
"NO_TOOL", [], "adversarial_trap"),
("What is the difference between server components and client components in Next.js 14?",
"NO_TOOL", [], "adversarial_trap"),
# DevOps / Infrastructure
("Write a Dockerfile for a Python FastAPI app with multi-stage builds.",
"NO_TOOL", [], "adversarial_trap"),
("How do I configure a Kubernetes HorizontalPodAutoscaler based on custom metrics?",
"NO_TOOL", [], "adversarial_trap"),
("What is the difference between rolling and blue-green deployments?",
"NO_TOOL", [], "adversarial_trap"),
("How do I set up Prometheus scraping for a Node.js service?",
"NO_TOOL", [], "adversarial_trap"),
("Explain how etcd achieves consensus using the Raft algorithm.",
"NO_TOOL", [], "adversarial_trap"),
("Write a GitHub Actions workflow that runs tests on every pull request.",
"NO_TOOL", [], "adversarial_trap"),
# Memory management (trap on 'memory' keyword)
("How does virtual memory paging work in Linux?",
"NO_TOOL", [], "adversarial_trap"),
("What is memory-mapped I/O and how does mmap work in C?",
"NO_TOOL", [], "adversarial_trap"),
("Explain stack vs. heap memory allocation and when each is appropriate.",
"NO_TOOL", [], "adversarial_trap"),
("How does the V8 engine's garbage collector use generational collection?",
"NO_TOOL", [], "adversarial_trap"),
# Session handling (trap on 'session' keyword)
("How does PHP's session_start() work under the hood?",
"NO_TOOL", [], "adversarial_trap"),
("Implement session fixation protection in a Flask application.",
"NO_TOOL", [], "adversarial_trap"),
("What is the difference between sticky sessions and session replication?",
"NO_TOOL", [], "adversarial_trap"),
("How do I store JWT tokens in a secure, httpOnly cookie in Express?",
"NO_TOOL", [], "adversarial_trap"),
# Search (trap on 'search' keyword)
("How do I implement fuzzy search with trigrams in PostgreSQL?",
"NO_TOOL", [], "adversarial_trap"),
("Explain TF-IDF and how it ranks documents in full-text search.",
"NO_TOOL", [], "adversarial_trap"),
("Write a binary search implementation in Rust.",
"NO_TOOL", [], "adversarial_trap"),
("Compare Elasticsearch and OpenSearch for log aggregation.",
"NO_TOOL", [], "adversarial_trap"),
# Graph theory (trap on 'graph' + 'edges' keywords)
("Explain the difference between Prim's and Kruskal's spanning tree algorithms.",
"NO_TOOL", [], "adversarial_trap"),
("How do topological sorts work on directed acyclic graphs?",
"NO_TOOL", [], "adversarial_trap"),
("Write a function to detect cycles in a directed graph using DFS.",
"NO_TOOL", [], "adversarial_trap"),
# Load balancing (trap on 'load' keyword)
("What are the differences between round-robin, least-connections, and IP-hash load balancing?",
"NO_TOOL", [], "adversarial_trap"),
("How does Nginx upstream load balancing handle health check failures?",
"NO_TOOL", [], "adversarial_trap"),
# Logging / monitoring
("How do I implement structured logging in a Go service with zerolog?",
"NO_TOOL", [], "adversarial_trap"),
("Explain the ELK stack and how logs flow from Beats to Kibana.",
"NO_TOOL", [], "adversarial_trap"),
("What is OpenTelemetry and how does distributed tracing work?",
"NO_TOOL", [], "adversarial_trap"),
# Misc CS concepts
("What is the difference between optimistic and pessimistic locking in databases?",
"NO_TOOL", [], "adversarial_trap"),
("Explain how CRDTs achieve conflict-free distributed state.",
"NO_TOOL", [], "adversarial_trap"),
("What is a saga pattern in distributed systems?",
"NO_TOOL", [], "adversarial_trap"),
("How does the forget gate in an LSTM neural network control memory?",
"NO_TOOL", [], "adversarial_trap"),
# ===========================================================================
# CATEGORY 3: disambiguation (40 cases)
# Similar tools — model must pick the correct one.
# ===========================================================================
# session_search_memory vs knowledge_search
("Find anything we discussed last month about the API versioning decision.",
"session_search_memory", ["query"], "disambiguation"),
("What do our curated knowledge items say about dependency injection patterns?",
"knowledge_search", ["query"], "disambiguation"),
("Search our accumulated documentation for information on database sharding.",
"knowledge_search", ["query"], "disambiguation"),
("Look through recent session notes for anything about the CDN cache invalidation bug.",
"session_search_memory", ["query"], "disambiguation"),
("Any past conversations where we discussed microservice mesh configurations?",
"session_search_memory", ["query"], "disambiguation"),
("Check the knowledge base for anything on event sourcing trade-offs.",
"knowledge_search", ["query"], "disambiguation"),
# session_forget_memory vs knowledge_forget
("Remove the specific session memory with ID mem-qq77-rr. It's incorrect.",
"session_forget_memory", ["memory_id"], "disambiguation"),
("Clear all the outdated knowledge entries in the staging project.",
"knowledge_forget", ["project"], "disambiguation"),
("Wipe out old debugging records from the search-service project's knowledge base.",
"knowledge_forget", ["project"], "disambiguation"),
("Delete the memory entry for ID mem-ab99-cd — we noted the wrong schema version.",
"session_forget_memory", ["memory_id"], "disambiguation"),
("Remove all knowledge items in the deprecated-feature category from the portal project.",
"knowledge_forget", ["project"], "disambiguation"),
# session_save_ledger vs session_save_experience vs session_save_handoff
("Log what we did today: migrated the billing module to the new event bus.",
"session_save_ledger", [], "disambiguation"),
("Record a milestone: we successfully launched the new onboarding flow in production.",
"session_save_experience", [], "disambiguation"),
("Hand off this session — save the state for the next agent on the gateway project.",
"session_save_handoff", ["project"], "disambiguation"),
("Write down that we rewrote the payment reconciliation logic today.",
"session_save_ledger", [], "disambiguation"),
("Mark a success: we fixed the notorious N+1 query on the orders endpoint.",
"session_save_experience", [], "disambiguation"),
("The contractor is taking over tonight. Save the handoff for the migration-tools project.",
"session_save_handoff", ["project"], "disambiguation"),
# knowledge_upvote vs knowledge_downvote
("That knowledge entry about immutable infrastructure is spot on. Upvote it.",
"knowledge_upvote", [], "disambiguation"),
("The doc recommending XML over JSON for internal APIs is terrible. Mark it down.",
"knowledge_downvote", [], "disambiguation"),
("Increase the importance score of the circuit-breaker patterns entry.",
"knowledge_upvote", [], "disambiguation"),
("Reduce the rank of that outdated note about using MD5 for hashing.",
"knowledge_downvote", [], "disambiguation"),
# session_compact_ledger vs session_export_memory
("The billing-service ledger is bloated. Compress and archive the old entries.",
"session_compact_ledger", ["project"], "disambiguation"),
("Export a full offline snapshot of my memory to /archive/snapshot in JSON.",
"session_export_memory", ["output_path", "format"], "disambiguation"),
("Trim down the session history for the firmware project — it's too long.",
"session_compact_ledger", ["project"], "disambiguation"),
("Save everything to disk — dump all session data to /tmp/export-all.",
"session_export_memory", ["output_path"], "disambiguation"),
# session_synthesize_edges vs session_backfill_links vs session_health_check
("Verify the session graph edges are all consistent for the trading-platform project.",
"session_synthesize_edges", ["project"], "disambiguation"),
("Reconnect the dangling session references for the ml-pipeline project.",
"session_backfill_links", ["project"], "disambiguation"),
("Run a full health diagnostic on the Prism memory backend.",
"session_health_check", [], "disambiguation"),
("Patch up missing cross-session links for the user-service project.",
"session_backfill_links", ["project"], "disambiguation"),
("Make sure all edges are synthesized and up to date for the invoicing project.",
"session_synthesize_edges", ["project"], "disambiguation"),
("Is the memory system responding normally? Do a quick health check.",
"session_health_check", [], "disambiguation"),
# session_load_context vs session_search_memory
("Bring me back into the context of the payments-gateway project.",
"session_load_context", ["project"], "disambiguation"),
("Look for any notes we made about the GraphQL schema decisions.",
"session_search_memory", ["query"], "disambiguation"),
("Restore the full session state for the devops-automation project.",
"session_load_context", ["project"], "disambiguation"),
("Search our history for any discussion about OAuth2 vs API keys.",
"session_search_memory", ["query"], "disambiguation"),
# session_task_route vs session_load_context
("Should the local model handle this React performance optimization or route it to the cloud?",
"session_task_route", ["task_description"], "disambiguation"),
("Initialize context for the infrastructure-as-code project — I'm starting fresh.",
"session_load_context", ["project"], "disambiguation"),
# knowledge_set_retention vs knowledge_forget
("Set the knowledge for the beta-program project to expire after 90 days.",
"knowledge_set_retention", ["project"], "disambiguation"),
("Delete all knowledge in the archived-2025 project — we don't need it anymore.",
"knowledge_forget", ["project"], "disambiguation"),
("Auto-expire the knowledge entries in the sandbox project after 14 days.",
"knowledge_set_retention", ["project"], "disambiguation"),
# ===========================================================================
# CATEGORY 4: edge_case (25 cases)
# Minimal, single-word, ambiguous, or unusual prompts.
# ===========================================================================
("Load context.", "session_load_context", [], "edge_case"),
("Save.", "session_save_ledger", [], "edge_case"),
("Search.", "session_search_memory", [], "edge_case"),
("Check health.", "session_health_check", [], "edge_case"),
("Export.", "session_export_memory", [], "edge_case"),
("Compact.", "session_compact_ledger", [], "edge_case"),
("Handoff.", "session_save_handoff", [], "edge_case"),
("Route this.", "session_task_route", [], "edge_case"),
("Synthesize edges.", "session_synthesize_edges", [], "edge_case"),
("Backfill links.", "session_backfill_links", [], "edge_case"),
("Forget it.", "session_forget_memory", [], "edge_case"),
("Knowledge search.", "knowledge_search", [], "edge_case"),
# Abstention edge cases
("Hello!", "NO_TOOL", [], "edge_case"),
("What can you do?", "NO_TOOL", [], "edge_case"),
("Tell me about yourself.", "NO_TOOL", [], "edge_case"),
("Thanks, we're done.", "NO_TOOL", [], "edge_case"),
("OK great.", "NO_TOOL", [], "edge_case"),
("Bye!", "NO_TOOL", [], "edge_case"),
# Ambiguous short prompts that still require the right tool
("Run diagnostics.", "session_health_check", [], "edge_case"),
("Save the handoff.", "session_save_handoff", [], "edge_case"),
("Log this session.", "session_save_ledger", [], "edge_case"),
("Search memory.", "session_search_memory", [], "edge_case"),
("Knowledge base lookup.", "knowledge_search", [], "edge_case"),
("Archive old entries.", "session_compact_ledger", [], "edge_case"),
("Save experience.", "session_save_experience", [], "edge_case"),
# ===========================================================================
# CATEGORY 5: multi_intent (20 cases)
# Multi-step prompts — score only the FIRST action.
# ===========================================================================
("Load the context for the pipeline project, then search for any past notes on streaming.",
"session_load_context", ["project"], "multi_intent"),
("Search our memory for anything about the OAuth migration, then save a handoff.",
"session_search_memory", ["query"], "multi_intent"),
("Check memory health, and if it's all good, compact the fraud-detection ledger.",
"session_health_check", [], "multi_intent"),
("Find notes about the ML model rollout, and then log that we finished the A/B test today.",
"session_search_memory", ["query"], "multi_intent"),
("Load the prism-mcp context, then check if there are any open issues about rate limiting.",
"session_load_context", ["project"], "multi_intent"),
("Export everything to /tmp/backup, then set a 60-day retention policy on it.",
"session_export_memory", ["output_path"], "multi_intent"),
("Save what we did today: shipped the new notification system. Then create a handoff note.",
"session_save_ledger", [], "multi_intent"),
("Search for what we decided about the queue architecture, then upvote the best result.",
"session_search_memory", ["query"], "multi_intent"),
("Run a health check on the memory system, then compact the ledger if there are issues.",
"session_health_check", [], "multi_intent"),
("Look up our knowledge on service mesh patterns, and then downvote the outdated ones.",
"knowledge_search", ["query"], "multi_intent"),
("Compact the session history for the payments project, then synthesize the session edges.",
"session_compact_ledger", ["project"], "multi_intent"),
("Load context for the billing-v2 project, and record our progress: we fixed the invoice date bug.",
"session_load_context", ["project"], "multi_intent"),
("Search our knowledge base for event-driven design patterns, then save a handoff with the findings.",
"knowledge_search", ["query"], "multi_intent"),
("Backfill the cross-session links for the ios-app project, then synthesize edges.",
"session_backfill_links", ["project"], "multi_intent"),
("Route this task: full rewrite of the logging subsystem. If cloud, just tell me.",
"session_task_route", ["task_description"], "multi_intent"),
("Export memory to /var/backup, and then purge the old knowledge entries from the legacy project.",
"session_export_memory", ["output_path"], "multi_intent"),
("Find what we discussed about caching strategies, then set a 30-day retention on that knowledge.",
"session_search_memory", ["query"], "multi_intent"),
("Record a success milestone: zero-downtime deploy of version 4.2. Then compact the ledger.",
"session_save_experience", [], "multi_intent"),
("Load the fraud-detection project context and then synthesize all session edges.",
"session_load_context", ["project"], "multi_intent"),
("Save what we accomplished: rewrote the ingestion pipeline. Then hand it off to the ops team.",
"session_save_ledger", [], "multi_intent"),
# ===========================================================================
# CATEGORY 6: verifier (25 cases)
# session_synthesize_edges / session_backfill_links / session_health_check patterns.
# ===========================================================================
# session_synthesize_edges
("Make sure all session graph edges are consistent for the auth-gateway project.",
"session_synthesize_edges", ["project"], "verifier"),
("Run a synthesis pass to validate all edges are up to date for the orchestration project.",
"session_synthesize_edges", ["project"], "verifier"),
("Verify graph integrity — synthesize edges for the content-delivery project.",
"session_synthesize_edges", ["project"], "verifier"),
("Before closing out, check that all session links are consistent for the scheduling project.",
"session_synthesize_edges", ["project"], "verifier"),
("Ensure all session relationships are properly synthesized for the warehouse-api project.",
"session_synthesize_edges", ["project"], "verifier"),
("Run edge synthesis on the real-time-alerts project to validate the session graph.",
"session_synthesize_edges", ["project"], "verifier"),
("Validate that all edges in the session graph are consistent for the pricing-engine project.",
"session_synthesize_edges", ["project"], "verifier"),
("Confirm session link consistency for the document-processing project.",
"session_synthesize_edges", ["project"], "verifier"),
# session_backfill_links
("There are broken cross-session links in the search-backend project. Backfill them.",
"session_backfill_links", ["project"], "verifier"),
("Reconnect all dangling references in the identity-service project history.",
"session_backfill_links", ["project"], "verifier"),
("Patch the missing links between sessions for the payments-v3 project.",
"session_backfill_links", ["project"], "verifier"),
("Fix the link gaps in our session history for the recommendation-service project.",
"session_backfill_links", ["project"], "verifier"),
("Backfill any missing cross-session connections for the notification-hub project.",
"session_backfill_links", ["project"], "verifier"),
("Reconnect broken session references in the compliance-tracker project.",
"session_backfill_links", ["project"], "verifier"),
("Repair missing session links for the api-gateway project.",
"session_backfill_links", ["project"], "verifier"),
# session_health_check
("Before I start a new sprint, confirm the memory system is operating correctly.",
"session_health_check", [], "verifier"),
("The search results seem incomplete. Check if the memory backend is healthy.",
"session_health_check", [], "verifier"),
("I'm seeing weird behavior in session recall. Run a diagnostic check.",
"session_health_check", [], "verifier"),
("Ping the memory system and confirm it's all healthy.",
"session_health_check", [], "verifier"),
("Is the Prism memory backend operating within normal parameters?",
"session_health_check", [], "verifier"),
("Double-check the memory infrastructure health before I rely on these results.",
"session_health_check", [], "verifier"),
("Verify the memory system is functioning before we start the long session.",
"session_health_check", [], "verifier"),
("Run a full health check and report back on the memory backend status.",
"session_health_check", [], "verifier"),
("Something is off with memory recall. Diagnose the backend.",
"session_health_check", [], "verifier"),
("Confirm the session memory system is healthy before I save this handoff.",
"session_health_check", [], "verifier"),
# ===========================================================================
# CATEGORY 7: cascade (25 cases)
# Explicit first-step-of-chain patterns — model must pick the right FIRST tool.
# ===========================================================================
("Search our knowledge for gRPC patterns, then upvote the most relevant entry.",
"knowledge_search", ["query"], "cascade"),
("Load the indexing-service context, then search for any past notes on shard rebalancing.",
"session_load_context", ["project"], "cascade"),
("Check memory health, then compact the alerts project ledger if there are stale entries.",
"session_health_check", [], "cascade"),
("Export all memory to /tmp/archive, then set a 180-day retention policy on the archive project.",
"session_export_memory", ["output_path"], "cascade"),
("Search for what we decided about the event schema design, then save a handoff about it.",
"session_search_memory", ["query"], "cascade"),
("Save today's session notes for the pipeline project, then create a handoff for the next agent.",
"session_save_ledger", [], "cascade"),
("Should the local model handle this concurrency refactor? If cloud, stop there.",
"session_task_route", ["task_description"], "cascade"),
("Search knowledge for CQRS trade-offs, downvote anything recommending a single store.",
"knowledge_search", ["query"], "cascade"),
("Compact the ledger for the embeddings project, then synthesize the session edges.",
"session_compact_ledger", ["project"], "cascade"),
("Load the feature-flags project context, then log that we shipped the A/B framework.",
"session_load_context", ["project"], "cascade"),
("Run a health check first, then based on results decide whether to compact or export.",
"session_health_check", [], "cascade"),
("Search memory for past decisions about SSE vs WebSockets, then record what we found.",
"session_search_memory", ["query"], "cascade"),
("Backfill the missing links for the analytics project, then synthesize the edges.",
"session_backfill_links", ["project"], "cascade"),
("Load context for the tenant-management project, then search for any open migration tickets.",
"session_load_context", ["project"], "cascade"),
("Find what we know about zero-copy networking, then save a handoff with that context.",
"session_search_memory", ["query"], "cascade"),
("Export to /backups/weekly, then compact the media-processing ledger.",
"session_export_memory", ["output_path"], "cascade"),
("Search our knowledge base for Kubernetes resource quotas, then set a 60-day retention.",
"knowledge_search", ["query"], "cascade"),
("Save the experience: we eliminated 80% of unnecessary re-renders. Then route the next task.",
"session_save_experience", [], "cascade"),
("Synthesize edges for the audit-log project, then backfill any missing links.",
"session_synthesize_edges", ["project"], "cascade"),
("Load the risk-assessment project context and then search memory for past risk audit notes.",
"session_load_context", ["project"], "cascade"),
("Find our notes on the transaction saga pattern, then upvote the best entry.",
"session_search_memory", ["query"], "cascade"),
("Compact the metrics project ledger, then export it to /tmp/metrics-backup.",
"session_compact_ledger", ["project"], "cascade"),
("Route this task: implement distributed tracing with OpenTelemetry across five services.",
"session_task_route", ["task_description"], "cascade"),
("Save what we accomplished: added RBAC support to the admin API. Then synthesize edges.",
"session_save_ledger", [], "cascade"),
("Search knowledge for eventual consistency patterns, then forget the entries about using global locks.",
"knowledge_search", ["query"], "cascade"),
# ===========================================================================
# CATEGORY 8: param_extraction (25 cases)
# Params ARE mentioned in the prompt — test that model extracts them correctly.
# ===========================================================================
("Load the full context for the fraud-detection project at a deep level.",
"session_load_context", ["project"], "param_extraction"),
("Compact the session ledger for the user-identity project.",
"session_compact_ledger", ["project"], "param_extraction"),
("Save a handoff note for the supplier-portal project.",
"session_save_handoff", ["project"], "param_extraction"),
("Delete the memory entry with ID mem-fg33-hh. It has the wrong branch name.",
"session_forget_memory", ["memory_id"], "param_extraction"),
("Export all memory data to /exports/2026-q2 in JSON format.",
"session_export_memory", ["output_path", "format"], "param_extraction"),
("Set the retention policy for the experiment-runner project to 45 days.",
"knowledge_set_retention", ["project"], "param_extraction"),
("Search session memory for 'distributed tracing setup'.",
"session_search_memory", ["query"], "param_extraction"),
("Search the knowledge base for 'idempotency keys in payment APIs'.",
"knowledge_search", ["query"], "param_extraction"),
("Backfill the cross-session links for the warehouse-inventory project.",
"session_backfill_links", ["project"], "param_extraction"),
("Synthesize session edges for the logistics-optimizer project.",
"session_synthesize_edges", ["project"], "param_extraction"),
("Forget the knowledge entry with ID ki-cc44-gg — that approach is deprecated.",
"knowledge_forget", [], "param_extraction"),
("Upvote the knowledge entry with ID ki-tt55-rr. Really solid documentation.",
"knowledge_upvote", [], "param_extraction"),
("Downvote knowledge entry ki-uu99-qq — it recommends a vulnerable library.",
"knowledge_downvote", [], "param_extraction"),
("Configure an 80-day retention policy for the beta-features project's knowledge.",
"knowledge_set_retention", ["project"], "param_extraction"),
("Load context for the platform-core project.",
"session_load_context", ["project"], "param_extraction"),
("Export the archive to /data/long-term-backup in markdown format.",
"session_export_memory", ["output_path", "format"], "param_extraction"),
("Search for 'zero-downtime database migrations' in our session history.",
"session_search_memory", ["query"], "param_extraction"),
("Search knowledge for 'CQRS vs event sourcing trade-offs'.",
"knowledge_search", ["query"], "param_extraction"),
("Compact the ledger for the monitoring-stack project.",
"session_compact_ledger", ["project"], "param_extraction"),
("Delete memory entry mem-pp12-ss — wrong model version was recorded.",
"session_forget_memory", ["memory_id"], "param_extraction"),
("Save a handoff for the checkout-v4 project.",
"session_save_handoff", ["project"], "param_extraction"),
("Route this task: rewrite the message broker integration to use NATS instead of RabbitMQ.",
"session_task_route", ["task_description"], "param_extraction"),
("Synthesize edges for the ingestion-pipeline project.",
"session_synthesize_edges", ["project"], "param_extraction"),
("Backfill the missing session links in the content-catalog project.",
"session_backfill_links", ["project"], "param_extraction"),
("Set 120-day retention on the compliance-logs project's knowledge.",
"knowledge_set_retention", ["project"], "param_extraction"),
# ===========================================================================
# CATEGORY 9: abstention (20 cases)
# Greetings, capability questions, general CS — must return NO_TOOL.
# ===========================================================================
("Hi there!", "NO_TOOL", [], "abstention"),
("Good morning!", "NO_TOOL", [], "abstention"),
("Hey, quick question — what's your name?", "NO_TOOL", [], "abstention"),
("What tools do you have available?", "NO_TOOL", [], "abstention"),
("What are your capabilities?", "NO_TOOL", [], "abstention"),
("Can you explain what Prism Memory tools do?", "NO_TOOL", [], "abstention"),
("What programming languages do you know?", "NO_TOOL", [], "abstention"),
("Thanks, that's all for now!", "NO_TOOL", [], "abstention"),
("Great work today, goodbye.", "NO_TOOL", [], "abstention"),
("You're really helpful, thanks!", "NO_TOOL", [], "abstention"),
("What is the capital of France?", "NO_TOOL", [], "abstention"),
("Tell me a joke.", "NO_TOOL", [], "abstention"),
("How do you work?", "NO_TOOL", [], "abstention"),
("Are you GPT-4?", "NO_TOOL", [], "abstention"),
("Can you write me a poem?", "NO_TOOL", [], "abstention"),
("What's the weather like today?", "NO_TOOL", [], "abstention"),
("Can you recommend a good book?", "NO_TOOL", [], "abstention"),
("What's 2+2?", "NO_TOOL", [], "abstention"),
("Do you have feelings?", "NO_TOOL", [], "abstention"),
("What is machine learning?", "NO_TOOL", [], "abstention"),
]
# ---------------------------------------------------------------------------
# Sanity check: enforce exactly 300 cases and correct counts per category
# ---------------------------------------------------------------------------
_TARGET_COUNTS = {
"natural_phrasing": 50,
"adversarial_trap": 70,
"disambiguation": 40,
"edge_case": 25,
"multi_intent": 20,
"verifier": 25,
"cascade": 25,
"param_extraction": 25,
"abstention": 20,
}
_TOTAL_TARGET = 300
def _verify_test_counts():
from collections import Counter
counts = Counter(t[3] for t in TESTS)
errors = []
for cat, expected in _TARGET_COUNTS.items():
actual = counts.get(cat, 0)
if actual != expected:
errors.append(f" {cat}: expected {expected}, got {actual}")
if len(TESTS) != _TOTAL_TARGET:
errors.append(f" TOTAL: expected {_TOTAL_TARGET}, got {len(TESTS)}")
if errors:
print("WARNING: test count mismatches:")
for e in errors:
print(e)
return len(errors) == 0
# ---------------------------------------------------------------------------
# Layer 3: Inference-Time False-Positive Rejection + Remapping
# (Copied and merged from swe_bench_test.py — all current rules preserved)
# ---------------------------------------------------------------------------
GENERAL_PROGRAMMING_PATTERNS = [
# Python context managers
r'\bcontext\s+manager\b', r'\bcontextlib\b', r'\b__enter__\b', r'\b__exit__\b',
r'\basync\s+context\s+manager\b',
# ML / LSTM forget gates
r'\bforget\s+gate\b', r'\blstm\b', r'\bcatastrophic\s+forgetting\b',
r'\bforget\s+bias\b', r'\belastic\s+weight\s+consolidation\b',
# Web framework sessions
r'\bexpress\.js\b', r'\bdjango\b', r'\bflask\b', r'\bfastapi\b',
r'\bsession_start\(\)', r'\bsession\s+middleware\b', r'\bsession\s+affinity\b',
# General CS
r'\bgarbage\s+collection\b', r'\bgc\s+algorithm\b',
r'\bmemory\s+management\s+in\s+rust\b',
r'\bload\s+balanc', r'\bnginx\b', r'\bhaproxy\b',
r'\bcontext\s+switch',
r'\bsearch\s+algorithm\b',
r'\bsearch\s+functionality\s+with\s+elasticsearch\b',
r'\bhealth\s+check\s+endpoint\s+pattern\b',
r'\belasticsearch\b', r'\bsolr\b', r'\blucene\b',
r'\bretention\s+polic(?:y|ies)\s+(?:in|for|with)\s+(?:kafka|s3|aws|gcp|azure|cloud)',
r'\bpostgresql\b.*\bmongodb\b', r'\bmongodb\b.*\bpostgresql\b',
r'\bwrite\s+a\s+decorator\b', r'\bdecorator.*retries?\b',
r'\bci/cd\b', r'\bgithub\s+actions\b',
r'\bcors\b.*\bnode\.js\b', r'\bnode\.js\b.*\bcors\b',
r'\bcap\s+theorem\b', r'\bbinary\s+search\s+tree\b',
r'\bvirtual\s+dom\b', r'\breact\b.*\breconciliation\b',
r'\bdependency\s+injection\b',
r'\btcp\b.*\budp\b', r'\budp\b.*\btcp\b',
r'\btime\s+complexity\b', r'\bquicksort\b',
r'\bexponential\s+backoff\b', r'\bjitter\b.*\bretri', r'\bapi\s+retri',
r'\bcelery\b.*\bqueue', r'\broute\s+tasks?\s+in\s+celery\b',
r'\bknowledge\s+graph\b.*\b(?:function|search|algorithm|traversal)\b',
r'\b(?:function|write\s+a\s+function|implement)\b.*\bknowledge\s+graph\b',
r'\bsave\s+(?:user\s+)?preferences?\s+in\s+(?:react|redux|localstorage|a\s+database)\b',
r'\bexport\s+(?:data\s+)?from\s+(?:postgresql|mysql|sqlite|a\s+database)\b',
r'\bpostgresql\b.*\bcsv\b', r'\bcsv\b.*\bpostgresql\b',
# Additional patterns from bfcl_eval.py
r'\bgoroutine\b', r'\bwrite\s+barrier\b', r'\brust\b.*\bborrow\b',
r'\barc\b.*\bmutex\b', r'\bpin\b.*\bfuture\b',
r'\bwindow\s+function\b', r'\bmongodb\b', r'\bmongoexport\b',
r'\bdijkstra\b', r'\bdepth.first\s+search\b', r'\bconsistent\s+hashing\b',
r'\bb.tree\b', r'\bbloom\s+filter\b', r'\blru\s+cache\b', r'\bordereddic\b',
r'\bhorizontalpodautoscal', r'\bprometheus\b', r'\betcd\b', r'\braft\b',
r'\bzerolog\b', r'\belk\s+stack\b', r'\bopentelemetry\b',
r'\bcrdt\b', r'\bsaga\s+pattern\b',
r'\btrie\b', r'\bweakmap\b', r'\bpromise.based\s+queue\b',
r'\bcovering\s+index\b', r'\bmaterialized\s+view\b',
r'\btf-idf\b', r'\btrigram\b', r'\bfuzzy\s+search\b',
r'\btopological\s+sort\b', r'\bcycle\s+detection\b',
r'\bprim.s\b', r'\bkruskal.s\b', r'\bspanning\s+tree\b',
r'\bhot.module\s+replacement\b', r'\bvite\b',
r'\bserver\s+component\b', r'\bclient\s+component\b',
r'\bdocker(?:file)?\b', r'\bblue.green\s+deploy', r'\brolling\s+deploy',
r'\bsticky\s+session\b', r'\bsession\s+replication\b', r'\bsession\s+fixation\b',
r'\bjwt\b.*\bhttponly\b',
r'\bpaging\b.*\bmemory\b', r'\bmmap\b', r'\bstack\s+vs\s+heap\b',
r'\bv8\s+engine\b', r'\bgenerational\s+collection\b',
r'\boptimistic\s+lock', r'\bpessimistic\s+lock',
r'\bcrdt\b', r'\beventual\s+consistency\b.*\bdynamo',
# General knowledge / weather / math
r"what'?s\s+the\s+weather\b", r'\bforecast\b.*\btoday\b',
r'\bwrite\s+a\s+sql\s+query\b', r'\bsecond.highest\s+salary\b',
r'\bsql\s+query\s+(?:that|to)\b',
]
PRISM_INTENT_PATTERNS = [
r'\bprism\b', r'\bsession\s*ledger\b', r'\bhandoff\b', r'\bknowledge\s+base\b',
r'\bknowledge\s+items?\b', r'\bour\s+knowledge\b',
r'\bsave.*(?:session|ledger|handoff)\b', r'\bload\s+context\b',
r'\b(?:search|find).*(?:memory|sessions?|conversations?|notes)\b',
r'\bproject\b', r'\bwhat\s+(?:do\s+)?we\s+(?:know|have)\b',
r'\binstitutional\s+knowledge\b', r'\bdocumented\b', r'\bcurated\b',
r'\bmemory\s+entry\b', r'\bmemory\s+backend\b', r'\bdiagnostics\b',
r'\bledger\b', r'\bcompact\b.*(?:ledger|entries|session)\b',
r'\bexport.*(?:memory|backup)\b', r'\b(?:delete|nuke|wipe|remove).*(?:entry|memory|entries)\b',
r'\blog.*(?:what|accomplished|session)\b', r'\brecord.*(?:session|what)\b',
r'\bhand.*(?:off|over)\b', r'\bbring.*up\s+to\s+speed\b',
r'\bbug\s+fix.*(?:local\s+model|handle)\b', r'\broute.*(?:task|this)\b',
r'\bbackfill\b', r'\bsynthesize\b', r'\bsession\s+graph\b',
r'\bsession\s+links?\b', r'\bedges?\s+(?:up\s+to\s+date|consistent)\b',
r'\bgraph\s+integrit', r'\bdangling\b', r'\breconnect.*(?:session|links?|references?)\b',
r'\bpatch.*(?:links?|gaps?)\b', r'\bmissing\s+links?\b',
r'\bsave\s+experience\b', r'\brecord\s+(?:a\s+)?milestone\b',
r'\brecord\s+(?:a\s+)?success\b', r'\bupvote\b', r'\bdownvote\b',
r'\bretention\s+polic(?:y|ies)\b', r'\bauto.expir\b', r'\bttl\b',
r'\bknowledge\s+entry\b', r'\bknowledge\s+record\b',
]
def validate_tool_call(prompt, tool_name, tool_args):
"""Layer 3: reject obvious false-positive tool calls and remap semantic neighbors.
Copied from swe_bench_test.py with additions from bfcl_eval.py.
Returns (tool_name, tool_args) — possibly changed if rejected or remapped.
"""
prompt_lower = prompt.lower()
# Special NO_TOOL override: "confirm session link/graph consistency" → synthesize_edges
if tool_name in ("NO_TOOL", "ERROR"):
if re.search(r'\b(?:confirm|verify|validate|check|ensure)\b', prompt_lower):
if re.search(r'\bsession\s+(?:link|edge|graph)\s+(?:consistency|consistent)\b', prompt_lower):
proj_m = re.search(r'\b(?:for|on)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
return 'session_synthesize_edges', ({'project': proj_m.group(1)} if proj_m else {})
return tool_name, tool_args
# --- Group B remaps (before false-positive rejection) ---
# "reconnect/patch up/dangling links" → backfill_links
# But don't remap when "synthesize edges" is the explicit first action
if tool_name in ('session_synthesize_edges', 'session_reconnect'):
if re.search(r'\b(?:reconnect|backfill|patch\s+up|dangling|link\s+gaps?|missing\s+links?|fix\s+links?)\b', prompt_lower):
if not re.search(r'^synthesize\b', prompt_lower) and \
not re.search(r'\bsynthesiz\w+\s+edges?\s+for\b', prompt_lower):
return 'session_backfill_links', tool_args
# "verify/check/make sure session links/edges are consistent / graph integrity" → synthesize_edges
if tool_name in ('session_health_check', 'session_backfill_links'):
_has_verify_verb = re.search(
r'\b(?:verify|validate|check|make\s+sure|ensure|confirm)\b', prompt_lower
)
_has_consistent_edge = re.search(
r'\b(?:edges?|links?|graph)\b.*?\b(?:consistent|up\s+to\s+date|synthesized)\b'
r'|\bconsistent\b.*?\b(?:edges?|links?|graph)\b'
r'|\bsession\s+links?\b'
r'|\bgraph\s+integrit',
prompt_lower, re.DOTALL
)
if _has_verify_verb and _has_consistent_edge:
return 'session_synthesize_edges', tool_args
# "synthesize edges for X, then backfill" → synthesize_edges is the FIRST action
if tool_name == 'session_backfill_links':
if re.search(r'(?:^|\bfirst\b|\bstart\s+with)\s*synthesize\s+edges?\b', prompt_lower) or \
re.search(r'^synthesize\b', prompt_lower):
return 'session_synthesize_edges', tool_args
# "wipe/clear old entries from knowledge base" → knowledge_forget (not compact_ledger)
# BUT protect "session entries" / "session history" from this remap
if tool_name == 'session_compact_ledger':
if re.search(r'\bknowledge\b', prompt_lower) and re.search(r'\b(?:wipe|clear|delete|remove|entries)\b', prompt_lower):
if not re.search(r'\bsession\s+(?:entries|history|ledger)\b', prompt_lower):
return 'knowledge_forget', tool_args
# "prune/trim/archive old session entries" → session_compact_ledger (not forget_memory)
if tool_name in ('session_forget_memory', 'knowledge_forget'):
if re.search(r'\b(?:prune|trim|archive|compress)\b', prompt_lower) and re.search(r'\b(?:session|ledger)\s+(?:entries|history)?\b', prompt_lower):
proj_m = re.search(r'\b(?:for|on)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
return 'session_compact_ledger', ({'project': proj_m.group(1)} if proj_m else tool_args)
# "archive old entries" (without 'knowledge') → session_compact_ledger
if tool_name == 'session_forget_memory':
if re.search(r'\b(?:archive|prune|trim)\s+old\s+entries\b', prompt_lower):
if not re.search(r'\bknowledge\b', prompt_lower) and not re.search(r'\bmemory[_\s]id\b|mem-[a-z0-9]\b', prompt_lower):
return 'session_compact_ledger', tool_args
# "knowledge entries/items/records" + delete verbs → knowledge_forget (not session_forget_memory)
if tool_name == 'session_forget_memory':
if re.search(r'\bknowledge\s+(?:entr|items?|records?|base)\b', prompt_lower):
return 'knowledge_forget', tool_args
if re.search(r'\bknowledge\s+base\b', prompt_lower) and re.search(r'\b(?:entries|records|items)\b', prompt_lower):
return 'knowledge_forget', tool_args
# "delete/wipe entries from [project]" without a specific memory ID → knowledge_forget
if re.search(r'\b(?:entries|records|logs?)\b', prompt_lower) and re.search(r'\bproject\b', prompt_lower):
if not re.search(r'\bmemory[_\s]id\b|mem-[a-z0-9]|ID\s*[=:]\s*\S+', prompt):
if not re.search(r'\b(?:session|ledger)\b', prompt_lower):
proj_m = re.search(r'(?:for|from|in)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project', prompt_lower, re.I)
return 'knowledge_forget', {'project': proj_m.group(1) if proj_m else None}
# "where were we / bring me up to speed / catch me up" → session_load_context (not session_search_memory)
if tool_name == 'session_search_memory':
if re.search(r'\bwhere\s+were\s+we\b|\bbring\s+me\s+up\s+to\s+speed\b|\bcatch\s+me\s+up\b|\bwhat\s+were\s+we\s+(?:doing|working)', prompt_lower):
project_m = re.search(
r'\b(?:on|for|with|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
r'|(?:state\s+of\s+(?:the\s+)?)([a-zA-Z][a-zA-Z0-9_-]+)(?:\s+project)?\b',
prompt_lower
)
if project_m:
project = next((g for g in project_m.groups() if g and g not in ('the', 'a', 'this', 'that', 'my', 'our')), None)
else:
project = None
return 'session_load_context', {'project': project} if project else {}
# "accumulated documentation / knowledge base" → knowledge_search (not session_search_memory)
if tool_name == 'session_search_memory':
if re.search(r'\baccumulated\s+documentation\b|\bknowledge\s+base\b', prompt_lower):
return 'knowledge_search', tool_args
# "recent / past / last week / what we did" → session_search_memory (not knowledge_search)
if tool_name == 'knowledge_search':
session_hints = [
r'\brecent\b', r'\bpast\b', r'\blast\s+(?:week|month|session)',
r'\bwhat\s+we\s+(?:did|decided|worked)', r'\bdeployment\s+issues\b',
]
if any(re.search(p, prompt_lower) for p in session_hints):
return 'session_search_memory', tool_args
# "remind me / did we ever decide" → session_search_memory (not load_context)
if tool_name == 'session_load_context':
if re.search(r'\bremind\s+me\b|\bdid\s+we\s+ever\s+(?:decide|settle|choose|pick)\b|\bwhat\s+did\s+we\s+decide\b', prompt_lower):
if not re.search(r'\bbring\s+me\s+up\s+to\s+speed\b|\bwhere\s+were\s+we\b|\bcatch\s+me\s+up\b|\bload\s+.*\bcontext\b', prompt_lower):
return 'session_search_memory', {"query": prompt[:120]}
# "jot down / write down / make a note / log what just happened" → session_save_ledger
_LEDGER_TRIGGERS = re.compile(
r'\bjot\s+down\b|\bwrite\s+(?:it\s+)?down\b|\bwhat\s+we\s+accomplished\b'
r'|\bmake\s+sure\s+it.{0,10}written\b|\brecord\s+(?:this\s+session|what)\b'
r'|\bmake\s+(?:a\s+)?note\s+(?:that|of)\b|\blog\s+what\s+just\s+happened\b'
r'|\bwrite\s+down\s+everything\b|\bbefore\s+I\s+(?:close|head\s+out)\b',
re.IGNORECASE
)
# negative: milestone/achievement events that belong in save_experience
_EXPERIENCE_NEGATIVE = re.compile(
r'\b(?:successfully|milestone|achievement|deployed\s+the|shipped\s+the|launched\s+the'
r'|we\s+(?:fixed|built|completed|created|resolved|deployed|shipped|launched)\s+the'
r'|race\s+condition|solid\s+now|zero.downtime)\b'
)
# Unambiguous note-taking phrases bypass the milestone negative check
_NOTE_TRIGGERS = re.compile(
r'\bmake\s+(?:a\s+)?note\s+(?:that|of)\b|\bjot\s+down\b'
r'|\bwrite\s+(?:it\s+)?down\b|\blog\s+what\s+just\s+happened\b',
re.IGNORECASE
)
if tool_name in ('session_save_experience', 'session_task_route'):
if _LEDGER_TRIGGERS.search(prompt):
if _NOTE_TRIGGERS.search(prompt) or not _EXPERIENCE_NEGATIVE.search(prompt_lower):
if 'content' in tool_args and 'summary' not in tool_args:
tool_args = dict(tool_args)
tool_args['summary'] = tool_args.pop('content')
if 'summary' not in tool_args:
work_m = re.search(r'(?:we\s+)?((?:rewrote|fixed|refactored|built|deployed|updated|added|removed|finalized|completed|migrated)\s+.{10,120})', prompt, re.I)
if not work_m:
work_m = re.search(r'(?:make\s+a\s+note|log|note)\s+(?:that\s+)?(?:we\s+)?(completed|finished|did|wrote|refactored|migrated).{0,120}', prompt, re.I)
if work_m:
tool_args = dict(tool_args)
tool_args['summary'] = work_m.group(0).strip().rstrip('.')
return 'session_save_ledger', tool_args
# "record that we fixed/built/resolved [thing]" → session_save_experience (milestone)
if tool_name == 'session_save_ledger':
if re.search(r'\brecord\s+that\s+we\s+(?:fixed|built|completed|created|resolved|deployed|shipped|launched)\b', prompt_lower):
return 'session_save_experience', {"project": tool_args.get("project"), "event_type": "milestone"}
# content → summary normalization + inline extraction for session_save_ledger
if tool_name == 'session_save_ledger':
if 'content' in tool_args and 'summary' not in tool_args:
tool_args = dict(tool_args)
tool_args['summary'] = tool_args.pop('content')
if 'summary' not in tool_args:
work_m = re.search(r'(?:we\s+)?((?:rewrote|fixed|refactored|built|deployed|updated|added|removed|finalized|completed|migrated)\s+.{10,120})', prompt, re.I)
if not work_m:
work_m = re.search(r'(?:log|note|record)\s+(?:what\s+just\s+happened|this|that)\s*[:;]\s*(.{10,120})', prompt, re.I)
if work_m:
tool_args = dict(tool_args)
tool_args['summary'] = (work_m.group(1) if work_m.lastindex else work_m.group(0)).strip().rstrip('.')
# "log that we successfully deployed/shipped" → session_save_experience milestone (not save_ledger)
if tool_name == 'session_save_ledger':
if re.search(r'\blog\s+that\s+we\s+successfully\b|\bsuccessfully\s+deployed\b|\bsuccessfully\s+shipped\b|\bsuccessfully\s+launched\b', prompt_lower):
return 'session_save_experience', {"project": tool_args.get("project"), "event_type": "success"}
# "shift change / store current state for next agent" → session_save_handoff
if tool_name == 'session_save_ledger':
if re.search(r'\bshift\s+change\b|\bstore\s+(?:the\s+)?current\s+state\s+for\b|\bnext\s+(?:agent|person|developer)\s+can\s+continue\b|\bhand.*over\b|\bpick.*up\s+next\b', prompt_lower):
return 'session_save_handoff', tool_args
# Multi-intent: "Search/Find ... THEN upvote/downvote" → first action is search
if tool_name in ('knowledge_upvote', 'knowledge_downvote'):
if re.search(r'\bthen\s+(?:upvote|downvote|boost|rate\s+up|rate\s+down)\b', prompt_lower):
if re.search(r'^(?:search|find|look\s+up)\b', prompt_lower):
query_m = re.search(
r'^(?:search\s+(?:for\s+)?|find\s+(?:our\s+)?(?:notes?\s+on\s+)?|look\s+up\s+)(.+?)(?:,?\s*then\b)',
prompt, re.I
)
return 'session_search_memory', {"query": query_m.group(1).strip() if query_m else prompt[:120]}
# invalid tool name → try retention or upvote/downvote
if tool_name not in VALID_TOOLS:
if re.search(r'\b(?:auto.?expir|ttl\b|\d+\s*days?\s+(?:retention|expir)|\bretention\s*polic)', prompt_lower):
return 'knowledge_set_retention', tool_args
# fall through to upvote/downvote patterns below
# knowledge_forget / knowledge_set_retention → upvote/downvote protection
_UPVOTE_SET = {'knowledge_forget', 'knowledge_set_retention', 'session_forget_memory',
'session_task_route', 'session_search_memory'}
# Don't remap to upvote/downvote when primary intent is "search THEN upvote"
_is_search_then_vote = (
re.search(r'^(?:search|find|look\s+up)\b', prompt_lower) and
re.search(r'\bthen\s+(?:upvote|downvote|boost|rate\s+up|rate\s+down)\b', prompt_lower)
)
if (tool_name in _UPVOTE_SET or tool_name not in VALID_TOOLS) and not _is_search_then_vote:
_id_val = (tool_args.get("id") or tool_args.get("knowledge_id") or tool_args.get("entry_id")) if isinstance(tool_args, dict) else None
if re.search(r'\b(?:upvote|boost|increase\s+(?:the\s+|its\s+)?(?:rank|score|importance)|uprate|thumbs[\s-]?up|mark\s+(?:it\s+)?(?:up|helpful|useful|great|good)|importance\s+score)\b', prompt_lower):
return 'knowledge_upvote', {"id": _id_val}
if re.search(r'\b(?:downvote|lower\s+(?:the\s+|its\s+)?(?:rank|score)|not\s+useful|derank|thumbs[\s-]?down|reduce\s+(?:the\s+|its\s+)?(?:rank|score)|mark\s+(?:it\s+)?(?:down|bad|wrong|outdated|terrible))\b', prompt_lower):
return 'knowledge_downvote', {"id": _id_val}
# session_load_context: extract project from prompt if missing
if tool_name == 'session_load_context':
if not (isinstance(tool_args, dict) and tool_args.get('project')):
proj_m = re.search(
r'\b(?:on|for|of|with|in)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
r'|(?:state\s+of\s+(?:the\s+)?)([a-zA-Z][a-zA-Z0-9_-]+)(?:\s+project)?\b',
prompt_lower
)
if proj_m:
proj = next((g for g in proj_m.groups() if g), None)
if proj and proj not in ('the', 'a', 'this', 'that', 'my', 'our'):
tool_args = dict(tool_args) if isinstance(tool_args, dict) else {}
tool_args['project'] = proj
# session_compact_ledger: extract project if missing
if tool_name == 'session_compact_ledger':
if not (isinstance(tool_args, dict) and tool_args.get('project')):
proj_m = re.search(
r'\b(?:for|on|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+(?:project\s+)?ledger\b'
r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\s+ledger\b'
r'|\b(?:compact|trim|prune|compress|archive)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+(?:project|ledger)\b',
prompt_lower
)
if proj_m:
proj = next((g for g in proj_m.groups() if g), None)
if proj and proj not in ('the', 'a', 'this', 'that', 'my', 'our', 'old', 'stale'):
tool_args = dict(tool_args) if isinstance(tool_args, dict) else {}
tool_args['project'] = proj
# "is this something the local model can handle? / route this task" → session_task_route
if tool_name == 'session_search_memory':
if re.search(r'\b(?:local\s+(?:model|agent)\s+(?:can\s+handle|should\s+handle)|route\s+this\s+task|should\s+(?:I|the\s+local\s+model)\s+(?:tackle|handle)|is\s+this\s+(?:something|simple\s+enough)\s+(?:for\s+the\s+)?local)\b', prompt_lower):
return 'session_task_route', {"task_description": prompt}
# session_task_route: extract task_description from prompt
if tool_name == 'session_task_route':
if 'task_description' not in tool_args or not tool_args.get('task_description'):
tool_args = dict(tool_args)
tool_args['task_description'] = prompt
# session_export_memory: extract output_path from path patterns, format from keywords
if tool_name == 'session_export_memory':
if not isinstance(tool_args, dict):
tool_args = {}
tool_args = dict(tool_args)
if 'output_path' not in tool_args or not tool_args.get('output_path'):
path_m = re.search(
r'(?:save\s+to|(?:output|export|dump)\s+(?:to\s+)?["\']?|to\s+["\']?)(/[\w/.-]+|~/[\w/.-]+)',
prompt, re.I
)
if path_m:
tool_args['output_path'] = path_m.group(1)
if 'format' not in tool_args or not tool_args.get('format'):
fmt_m = re.search(r'\b(json|jsonl|markdown|csv|yaml)\b(?:\s+format)?', prompt_lower)
if fmt_m:
tool_args['format'] = fmt_m.group(1)
# session_compact_ledger: protect "session entries" from knowledge_forget remap
# (already handled above but ensure compact stays for session-specific prompts)
# "where did we leave off / what was the state" → session_load_context
if tool_name == 'session_search_memory':
if re.search(r'\bwhere\s+did\s+we\s+leave\s+off\b|\bwhat\s+was\s+the\s+state\s+of\b|\bget\s+me\s+(?:re-?oriented|up\s+to\s+speed)\b|\bpull\s+up\s+(?:whatever|the\s+(?:full\s+)?context)', prompt_lower):
project_m = re.search(r'\b(?:on|for|with|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
project = project_m.group(1) if project_m else None
return 'session_load_context', ({'project': project} if project else {})
# --- Social pleasantry rejection ---
SOCIAL_PATTERNS = [
r'^thanks', r'^thank you', r'^cheers', r'^goodbye', r'^bye',
r"that's all", r"we're done", r"all done", r"all set",
r'^ok\s+great', r'^perfect$', r'^nice$', r'^cool$',
r'^hi\b', r'^hey\b', r'^hello\b', r'^good\s+morning', r'^good\s+afternoon',
]
is_social = any(re.search(p, prompt_lower.strip()) for p in SOCIAL_PATTERNS)
if is_social and not any(w in prompt_lower for w in [
'save', 'export', 'search', 'load', 'record', 'log', 'run', 'check', 'find',
'compact', 'handoff', 'route', 'synthesize', 'backfill', 'forget', 'upvote', 'downvote',
]):
return "NO_TOOL", {}
# --- False-positive rejection (CS patterns) ---
is_general = any(re.search(p, prompt_lower) for p in GENERAL_PROGRAMMING_PATTERNS)
if not is_general:
return tool_name, tool_args
has_prism_intent = any(re.search(p, prompt_lower) for p in PRISM_INTENT_PATTERNS)
if has_prism_intent:
return tool_name, tool_args
return "NO_TOOL", {}
# ---------------------------------------------------------------------------
# Ollama Call
# ---------------------------------------------------------------------------
TOOL_CALL_NOPIPE_RE = re.compile(
r'\s*(\{.*?\})\s*(?:|$)',
re.DOTALL
)
TOOL_CALL_PIPE_RE = re.compile(
r'<\|tool_call\|>\s*(\{.*?\})',
re.DOTALL
)
BARE_JSON_RE = re.compile(
r'(\{[^{}]*"name"\s*:\s*"[^"]+?"[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
)
def call_ollama(prompt: str, timeout: int = 120) -> tuple:
"""Call Ollama REST API with a pre-formatted ChatML prompt.
Returns (raw_response, tool_name, tool_args, latency_secs).
"""
start = time.time()
try:
payload = json.dumps({
"model": MODEL,
"prompt": prompt,
"stream": False,
"raw": True,
"options": {"temperature": 0.0, "num_predict": 512},
}).encode("utf-8")
req = urllib.request.Request(
OLLAMA_API,
data=payload,
headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=timeout) as resp:
data = json.loads(resp.read().decode("utf-8"))
raw = data.get("response", "").strip()
except Exception as exc:
return (str(exc), "ERROR", {}, time.time() - start)
latency = time.time() - start
# Strip CoT blocks
clean = re.sub(
r'<\|synalux_think\|>.*?(?:\|synalux_think\|>|$)',
'', raw, flags=re.DOTALL
)
# Strategy 0: no-pipe … (v43 native format)
m = TOOL_CALL_NOPIPE_RE.search(clean)
if m:
try:
tj = json.loads(m.group(1))
return (raw, tj.get("name", tj.get("tool", "UNKNOWN")),
tj.get("arguments", tj.get("args", {})), latency)
except json.JSONDecodeError:
pass
# Strategy 1: piped <|tool_call|>
m = TOOL_CALL_PIPE_RE.search(clean)
if m:
try:
tj = json.loads(m.group(1))
return (raw, tj.get("name", tj.get("tool", "UNKNOWN")),
tj.get("arguments", tj.get("args", {})), latency)
except json.JSONDecodeError:
pass
# Strategy 2: bare JSON with "name" key
m = BARE_JSON_RE.search(clean)
if m:
try:
tj = json.loads(m.group(0))
return (raw, tj.get("name", "UNKNOWN"),
tj.get("arguments", tj.get("args", {})), latency)
except json.JSONDecodeError:
pass
return (raw, "NO_TOOL", {}, latency)
# ---------------------------------------------------------------------------
# Scoring
# ---------------------------------------------------------------------------
def evaluate_result(expected_tool, required_params, got_tool, got_args):
"""
Returns one of:
strict_pass — correct tool + all required_params present
partial_pass — correct tool + at least 1 required_param present but not all
wrong_tool — tool name is wrong (includes false positives / negatives)
false_positive — tool called when NO_TOOL expected
false_negative — NO_TOOL returned when tool expected
"""
if expected_tool == "NO_TOOL":
return "false_positive" if got_tool != "NO_TOOL" else "strict_pass"
if got_tool == "NO_TOOL":
return "false_negative"
# Accept either search tool for ambiguous prompts
tools_match = (got_tool == expected_tool) or (
expected_tool in ("session_search_memory", "knowledge_search") and
got_tool in ("session_search_memory", "knowledge_search")
)
if not tools_match:
return "wrong_tool"
if not required_params:
return "strict_pass"
if not isinstance(got_args, dict):
got_args = {}
present = [p for p in required_params if p in got_args and got_args[p] not in (None, "", [])]
if len(present) == len(required_params):
return "strict_pass"
if len(present) > 0:
return "partial_pass"
# Right tool, zero params matched
return "partial_pass"
def score(verdict):
if verdict == "strict_pass":
return 1.0
if verdict == "partial_pass":
return 0.5
return 0.0
# ---------------------------------------------------------------------------
# Main Eval
# ---------------------------------------------------------------------------
def run_once(tests, shuffle=False, run_label=""):
"""Run one full pass over test suite. Returns (results_list, category_stats)."""
indexed = list(enumerate(tests))
if shuffle:
random.shuffle(indexed)
results = [None] * len(tests)
category_stats = {}
for display_i, (orig_idx, (prompt, expected, req_params, category)) in enumerate(indexed, 1):
chatml = (
f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
f"<|im_start|>user\n{prompt}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
raw, got_tool, got_args, latency = call_ollama(chatml)
got_tool, got_args = validate_tool_call(prompt, got_tool, got_args)
verdict = evaluate_result(expected, req_params, got_tool, got_args)
icon = "OK" if verdict == "strict_pass" else ("~~" if verdict == "partial_pass" else "XX")
tag = f"#{orig_idx + 1:03d}"
short = prompt[:52]
run_info = f"[{run_label}] " if run_label else ""
print(
f" {run_info}[{display_i:3d}/{len(tests)}] {icon} {tag} "
f"expect={expected:30s} got={got_tool:30s} {latency:5.1f}s | {short}"
)
if verdict != "strict_pass":
if verdict == "partial_pass":
missing = [p for p in req_params if p not in got_args or got_args.get(p) in (None, "", [])]
print(f" -> partial: missing params {missing}")
elif verdict == "false_positive":
print(f" -> FALSE POSITIVE: called {got_tool} (expected NO_TOOL)")
elif verdict == "false_negative":
print(f" -> FALSE NEGATIVE: no tool called (expected {expected})")
elif verdict == "wrong_tool":
print(f" -> WRONG TOOL: expected {expected}, got {got_tool}")
results[orig_idx] = {
"id": orig_idx + 1,
"prompt": prompt,
"expected": expected,
"got": got_tool,
"got_args": got_args,
"verdict": verdict,
"latency": latency,
"category": category,
"points": score(verdict),
}
if category not in category_stats:
category_stats[category] = {"total": 0, "strict": 0, "partial": 0, "fail": 0, "points": 0.0}
cat = category_stats[category]
cat["total"] += 1
cat["points"] += score(verdict)
if verdict == "strict_pass":
cat["strict"] += 1
elif verdict == "partial_pass":
cat["partial"] += 1
else:
cat["fail"] += 1
return results, category_stats
def print_run_summary(results, category_stats, run_label=""):
strict = sum(1 for r in results if r["verdict"] == "strict_pass")
partial = sum(1 for r in results if r["verdict"] == "partial_pass")
fp = sum(1 for r in results if r["verdict"] == "false_positive")
fn = sum(1 for r in results if r["verdict"] == "false_negative")
wt = sum(1 for r in results if r["verdict"] == "wrong_tool")
total = len(results)
total_points = sum(r["points"] for r in results)
tool_tests = [r for r in results if r["expected"] != "NO_TOOL"]
no_tool_tests = [r for r in results if r["expected"] == "NO_TOOL"]
no_tool_correct = sum(1 for r in no_tool_tests if r["verdict"] == "strict_pass")
hallucinations = sum(1 for r in results if r["verdict"] == "false_positive")
avg_lat = sum(r["latency"] for r in results) / total if total else 0
lbl = f" (Run {run_label})" if run_label else ""
print()
print("=" * 80)
print(f" EVAL-300 RESULTS{lbl}")
print("=" * 80)
print(f" Strict Pass: {strict}/{total} = {strict / total * 100:.1f}%")
print(f" Partial Pass: {partial}/{total} = {partial / total * 100:.1f}%")
print(f" Wrong Tool: {wt}/{total}")
print(f" False Positives: {fp}/{total} (hallucinations)")
print(f" False Negatives: {fn}/{total}")
print(f" ---")
print(f" strict_pct (strict/total): {strict / total * 100:.1f}%")
print(f" weighted_pct (total_points/total): {total_points / total * 100:.1f}%")
print(f" Abstention accuracy: {no_tool_correct}/{len(no_tool_tests)} = {no_tool_correct / len(no_tool_tests) * 100:.1f}%")
print(f" Hallucinations: {hallucinations} (target = 0)")
print(f" Avg latency: {avg_lat:.1f}s")
print()
print(f" {'Category':<22} {'Strict':>7} {'Partial':>8} {'Fail':>5} {'Pts/Tot':>10} {'Pct':>6}")
print(f" {'-'*22} {'-'*7} {'-'*8} {'-'*5} {'-'*10} {'-'*6}")
for cat, s in sorted(category_stats.items()):
pts_pct = s["points"] / s["total"] * 100 if s["total"] else 0
print(f" {cat:<22} {s['strict']:>7} {s['partial']:>8} {s['fail']:>5} "
f"{s['points']:>5.1f}/{s['total']:<4} {pts_pct:>5.1f}%")
print("=" * 80)
return {
"strict": strict,
"partial": partial,
"wrong_tool": wt,
"false_positive": fp,
"false_negative": fn,
"total": total,
"total_points": total_points,
"strict_pct": strict / total,
"weighted_pct": total_points / total,
"abstention_rate": no_tool_correct / len(no_tool_tests) if no_tool_tests else 0,
"hallucinations": hallucinations,
"avg_latency": avg_lat,
"category_stats": category_stats,
}
def main():
parser = argparse.ArgumentParser(description="Eval-300: 300-case standard evaluation for prism-coder")
parser.add_argument("--model", type=str, default=None,
help="Ollama model tag to evaluate (default: prism-coder:4b-v43)")
parser.add_argument("--runs", type=int, default=1,
help="Number of eval runs (default: 1; use 3 for stability check)")
parser.add_argument("--shuffle", action="store_true",
help="Randomize test order each run")
parser.add_argument("--no-validate-layer3", action="store_true",
help="Disable Layer 3 false-positive rejection "
"(use during RFT/DPO so model sees true failures)")
args = parser.parse_args()
global MODEL, validate_tool_call
if args.model:
MODEL = args.model
if args.no_validate_layer3:
def validate_tool_call(prompt, tool_name, tool_args): # noqa: F811
return tool_name, tool_args
_verify_test_counts()
print("=" * 80)
print(f" EVAL-300 — prism-coder standard evaluation")
print(f" Model: {MODEL}")
print(f" Tests: {len(TESTS)}")
print(f" Runs: {args.runs}" + (" (RANDOMIZED ORDER each run)" if args.shuffle else ""))
print(f" Layer3: {'DISABLED' if args.no_validate_layer3 else 'enabled'}")
print("=" * 80)
all_run_summaries = []
all_run_results = []
for run_idx in range(args.runs):
run_label = str(run_idx + 1) if args.runs > 1 else ""
if args.runs > 1:
print(f"\n{'#' * 80}")
print(f" RUN {run_idx + 1} / {args.runs}" +
(f" (seed={random.randint(1000, 9999)})" if args.shuffle else ""))
print(f"{'#' * 80}")
results, cat_stats = run_once(TESTS, shuffle=args.shuffle, run_label=run_label)
summary = print_run_summary(results, cat_stats, run_label=run_label)
all_run_summaries.append(summary)
all_run_results.append(results)
# ---------------------------------------------------------------------------
# Multi-run aggregate
# ---------------------------------------------------------------------------
if args.runs > 1:
strict_scores = [s["strict"] for s in all_run_summaries]
weighted_pcts = [s["weighted_pct"] * 100 for s in all_run_summaries]
total = all_run_summaries[0]["total"]
halluc_counts = [s["hallucinations"] for s in all_run_summaries]
# Per-test stability
per_test_pass = [0] * len(TESTS)
per_test_fail_tools = [[] for _ in range(len(TESTS))]
for run_results in all_run_results:
for r in run_results:
idx = r["id"] - 1
if r["verdict"] == "strict_pass":
per_test_pass[idx] += 1
else:
per_test_fail_tools[idx].append(r.get("got", "???"))
med_strict = statistics.median(strict_scores)
avg_strict = statistics.mean(strict_scores)
med_weighted = statistics.median(weighted_pcts)
print(f"\n{'=' * 80}")
print(f" MULTI-RUN SUMMARY ({args.runs} runs x {total} tests)")
print(f"{'=' * 80}")
print(f" Strict scores: {' | '.join(f'{s}/{total}' for s in strict_scores)}")
print(f" Median strict: {med_strict}/{total} = {med_strict / total * 100:.1f}%")
print(f" Average strict: {avg_strict:.1f}/{total} = {avg_strict / total * 100:.1f}%")
print(f" Weighted pct: {' | '.join(f'{p:.1f}%' for p in weighted_pcts)} "
f"(median {med_weighted:.1f}%)")
print(f" Hallucinations: {' | '.join(str(h) for h in halluc_counts)} "
f"(target = 0 each run)")
print()
print(f" Flaky tests (< 100% pass rate across {args.runs} runs):")
flaky = []
for i, (prompt, expected, _, cat) in enumerate(TESTS):
rate = per_test_pass[i] / args.runs
if rate < 1.0:
fail_tools = per_test_fail_tools[i]
flaky.append((i + 1, rate, expected, set(fail_tools), cat, prompt[:60]))
if flaky:
for fid, rate, exp, fails, fcat, fshort in sorted(flaky, key=lambda x: x[1]):
print(f" [{fid:03d}] {rate * 100:3.0f}% | cat={fcat:<18s} | expect={exp:<28s} | fails->{','.join(fails):<20s} | {fshort}")
else:
print(" All tests passed consistently across all runs!")
print(f" Total flaky: {len(flaky)}/{total}")
print(f"{'=' * 80}")
# ---------------------------------------------------------------------------
# Save JSON report
# ---------------------------------------------------------------------------
os.makedirs("results", exist_ok=True)
report_path = "results/eval300_report.json"
final_summary = all_run_summaries[-1] if args.runs == 1 else {
"runs": args.runs,
"strict_scores": strict_scores,
"median_strict": statistics.median(strict_scores) / total,
"avg_strict": statistics.mean(strict_scores) / total,
"median_weighted_pct": statistics.median(weighted_pcts) / 100,
"hallucinations_per_run": halluc_counts,
"per_run_summaries": all_run_summaries,
} if args.runs > 1 else all_run_summaries[0]
report = {
"model": MODEL,
"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
"total_tests": len(TESTS),
"runs": args.runs,
"shuffle": args.shuffle,
"layer3_enabled": not args.no_validate_layer3,
"summary": final_summary,
"last_run_results": all_run_results[-1],
}
with open(report_path, "w") as f:
json.dump(report, f, indent=2, default=str)
print(f"\nReport saved: {report_path}")
# Exit code: fail if last run strict < 90%
last_strict_pct = all_run_summaries[-1]["strict_pct"] * 100
if last_strict_pct < 90.0:
print(f"FAIL: strict_pct {last_strict_pct:.1f}% is below 90% gate")
sys.exit(1)
else:
print(f"PASS: strict_pct {last_strict_pct:.1f}%")
sys.exit(0)
if __name__ == "__main__":
main()