#!/usr/bin/env python3
"""
eval_300.py — 300-Case Standard Evaluation for prism-coder:4b-v43

Replaces bfcl_eval.py (64 tests) and swe_bench_test.py (68 tests) with a single
~300-case blind eval. Designed to be run 3 times for statistical stability checks.

All test cases are NOVEL — never seen in any training data.

Categories:
  natural_phrasing   (50)  — casual/indirect phrasing that maps to a tool
  adversarial_trap   (70)  — CS/programming questions that must NOT call a tool
  disambiguation     (40)  — similar tools exist; must pick the correct one
  edge_case          (25)  — minimal / ambiguous prompts
  multi_intent       (20)  — multi-step prompts; score on first action only
  verifier           (25)  — synthesize_edges / backfill_links / health_check patterns
  cascade            (25)  — explicit first-step-of-chain patterns
  param_extraction   (25)  — params in the prompt text; test correct extraction
  abstention         (20)  — greetings / capability questions; must return NO_TOOL

Scoring:
  strict_pass  = correct tool + all required_params present → 1.0 point
  partial_pass = correct tool + at least 1 required_param but not all → 0.5 point
  wrong_tool   = wrong tool name → 0 points
  false_pos    = tool called when NO_TOOL expected → 0 points
  false_neg    = NO_TOOL when tool expected → 0 points

Usage:
  python3 eval_300.py
  python3 eval_300.py --runs 3 --shuffle
  python3 eval_300.py --model prism-coder:4b-v43 --runs 3
  python3 eval_300.py --no-validate-layer3
"""

import json
import os
import re
import sys
import time
import random
import statistics
import urllib.request
import argparse

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Model tag this evaluation targets (same tag as in the module docstring).
MODEL = "prism-coder:4b-v43"

# Local Ollama server; the full URL points at its /api/generate endpoint.
_OLLAMA_BASE_URL = "http://localhost:11434"
OLLAMA_API = _OLLAMA_BASE_URL + "/api/generate"

# Prism Memory tool names, in the order the prompt lists them.
_PROMPT_MEMORY_TOOLS = (
    "session_save_ledger",
    "session_load_context",
    "session_search_memory",
    "session_save_handoff",
    "session_forget_memory",
    "session_health_check",
    "session_compact_ledger",
    "session_export_memory",
    "session_task_route",
    "session_save_experience",
    "session_synthesize_edges",
    "session_backfill_links",
    "knowledge_search",
    "knowledge_forget",
    "knowledge_upvote",
    "knowledge_downvote",
    "knowledge_set_retention",
)

# Multimodal tool modules named in the prompt; their count is rendered
# into the prompt text via len().
_PROMPT_MODULES = (
    "image_gen",
    "office",
    "web_scraper",
    "browser",
    "tts",
    "ocr",
    "git",
    "terminal",
    "deps_scanner",
    "hipaa",
    "data_graph",
    "templates",
    "pdf_parser",
)

# System prompt text: assistant identity, tool inventory, the <tool_call>
# output format, and the abstention rule. Parts are joined by single spaces.
SYSTEM_PROMPT = " ".join((
    "You are Synalux, a memory-augmented coding and clinical reasoning assistant.",
    f"You have access to Prism Memory tools ({', '.join(_PROMPT_MEMORY_TOOLS)}) "
    f"and {len(_PROMPT_MODULES)} multimodal tool modules ({', '.join(_PROMPT_MODULES)}).",
    "Think step-by-step before answering.",
    "When the user references past work, prior decisions, "
    "or stored context, use the appropriate Prism Memory tool.",
    "Format tool calls inside <tool_call>...</tool_call> JSON blocks "
    "with fields 'name' and 'arguments'.",
    "If no tool is needed, answer directly in plain text.",
    "ABSTAIN for general programming questions, CS concepts, greetings, "
    "and capability questions.",
))

# session_* Prism Memory tools.
_SESSION_TOOLS = {
    "session_backfill_links",
    "session_compact_ledger",
    "session_export_memory",
    "session_forget_memory",
    "session_health_check",
    "session_load_context",
    "session_save_experience",
    "session_save_handoff",
    "session_save_ledger",
    "session_search_memory",
    "session_synthesize_edges",
    "session_task_route",
}

# knowledge_* Prism Memory tools.
_KNOWLEDGE_TOOLS = {
    "knowledge_downvote",
    "knowledge_forget",
    "knowledge_search",
    "knowledge_set_retention",
    "knowledge_upvote",
}

# Set of the 17 Prism Memory tool names (the same names SYSTEM_PROMPT lists).
VALID_TOOLS = _SESSION_TOOLS | _KNOWLEDGE_TOOLS

# ---------------------------------------------------------------------------
# Test Cases  (prompt, expected_tool_or_NO_TOOL, required_params, category)
# required_params: list of param keys that MUST appear in got_args
# ---------------------------------------------------------------------------

TESTS = [

    # ===========================================================================
    # CATEGORY 1: natural_phrasing (50 cases)
    # Casual / indirect user phrasing that maps to a specific Prism tool.
    # ===========================================================================

    # --- session_load_context ---
    ("Alright, kick things off. Pull up whatever we had on the checkout-service project.",
     "session_load_context", ["project"], "natural_phrasing"),

    ("I'm back from lunch. Get me re-oriented on the prism-aac project.",
     "session_load_context", ["project"], "natural_phrasing"),

    ("Fresh session here. Reconstruct everything we built for the notifications project.",
     "session_load_context", ["project"], "natural_phrasing"),

    ("Starting a new chat. Bring up the full context for the mobile-app project.",
     "session_load_context", ["project"], "natural_phrasing"),

    ("Where did we leave off with the auth-service work?",
     "session_load_context", [], "natural_phrasing"),

    ("Get me up to speed on the reporting-dashboard project.",
     "session_load_context", ["project"], "natural_phrasing"),

    ("Resume from where we were on the data-pipeline project.",
     "session_load_context", ["project"], "natural_phrasing"),

    ("Catch me up — what was the state of the subscription-api project?",
     "session_load_context", ["project"], "natural_phrasing"),

    # --- session_save_ledger ---
    ("We wrapped up for today. Make a note that we completed the database indexing overhaul.",
     "session_save_ledger", [], "natural_phrasing"),

    ("Log what just happened: we refactored the payment module and all tests pass.",
     "session_save_ledger", [], "natural_phrasing"),

    ("Record this session — we finalized the API contract for the mobile team.",
     "session_save_ledger", [], "natural_phrasing"),

    ("Write down everything we did today before I close this tab.",
     "session_save_ledger", [], "natural_phrasing"),

    ("Jot down our progress: three endpoints migrated, two more to go.",
     "session_save_ledger", [], "natural_phrasing"),

    ("Before I head out, save a summary of what we accomplished this afternoon.",
     "session_save_ledger", [], "natural_phrasing"),

    # --- session_save_handoff ---
    ("I'm handing this over. Leave a note for whoever picks this up next on the billing-portal project.",
     "session_save_handoff", ["project"], "natural_phrasing"),

    ("Pass the baton on the logistics-api project. Save the handoff so the next person knows where we are.",
     "session_save_handoff", ["project"], "natural_phrasing"),

    ("Shift change. Store the current state for the embedded-firmware project so the next agent can continue.",
     "session_save_handoff", ["project"], "natural_phrasing"),

    ("Create a handoff note for the trading-platform project — we got through feature flagging, still need A/B routing.",
     "session_save_handoff", ["project"], "natural_phrasing"),

    # --- session_search_memory ---
    ("Remind me — did we ever pick a caching strategy for the CDN layer?",
     "session_search_memory", ["query"], "natural_phrasing"),

    ("Did we discuss anything about Kafka consumer lag in previous sessions?",
     "session_search_memory", ["query"], "natural_phrasing"),

    ("Go back through our history and find anything about the CI pipeline refactor.",
     "session_search_memory", ["query"], "natural_phrasing"),

    ("What did we decide about webhook retry logic in past conversations?",
     "session_search_memory", ["query"], "natural_phrasing"),

    ("Dig up anything we recorded about the multi-tenant database design.",
     "session_search_memory", ["query"], "natural_phrasing"),

    ("Pull up any notes we saved about the gRPC migration.",
     "session_search_memory", ["query"], "natural_phrasing"),

    # --- session_forget_memory ---
    ("That entry we saved about using SQLite in production is totally wrong. Remove it.",
     "session_forget_memory", ["memory_id"], "natural_phrasing"),

    ("Delete the memory with ID mem-zx91-ff. It's stale.",
     "session_forget_memory", ["memory_id"], "natural_phrasing"),

    ("Wipe the incorrect ledger note that said we shipped v2.1 — we didn't.",
     "session_forget_memory", ["memory_id"], "natural_phrasing"),

    # --- session_health_check ---
    ("Something feels off. Can you run diagnostics on the memory backend?",
     "session_health_check", [], "natural_phrasing"),

    ("Before I trust these search results, verify the memory system is healthy.",
     "session_health_check", [], "natural_phrasing"),

    ("Give the memory infrastructure a quick checkup.",
     "session_health_check", [], "natural_phrasing"),

    # --- session_compact_ledger ---
    ("The session history for the event-sourcing project is getting massive. Trim and archive the old entries.",
     "session_compact_ledger", ["project"], "natural_phrasing"),

    ("Compress the ledger for the recommendation-engine project — too much noise in there.",
     "session_compact_ledger", ["project"], "natural_phrasing"),

    ("Prune out the old session entries for the analytics-backend project.",
     "session_compact_ledger", ["project"], "natural_phrasing"),

    # --- session_export_memory ---
    ("Dump a full backup of my memory to /data/exports in JSON format.",
     "session_export_memory", ["output_path", "format"], "natural_phrasing"),

    ("Export everything to /tmp/prism-dump so I can archive it.",
     "session_export_memory", ["output_path"], "natural_phrasing"),

    ("I need an offline copy of all session data. Export to /backup/weekly.",
     "session_export_memory", ["output_path"], "natural_phrasing"),

    # --- session_task_route ---
    ("Should I tackle this Rust async runtime bug locally or send it to a bigger model?",
     "session_task_route", ["task_description"], "natural_phrasing"),

    ("Is this image classification fine-tuning job something the local agent can handle?",
     "session_task_route", ["task_description"], "natural_phrasing"),

    ("Route this task: refactor the monorepo build system to support incremental compilation.",
     "session_task_route", ["task_description"], "natural_phrasing"),

    # --- session_save_experience ---
    ("Log a milestone: we successfully zero-downtime-deployed the new search index.",
     "session_save_experience", [], "natural_phrasing"),

    ("Record that we fixed the race condition in the WebSocket handler — took 4 hours but it's solid now.",
     "session_save_experience", [], "natural_phrasing"),

    # --- knowledge_search ---
    ("Any institutional knowledge on how we handle circuit breakers?",
     "knowledge_search", ["query"], "natural_phrasing"),

    ("What does our knowledge base say about rate limiting strategies?",
     "knowledge_search", ["query"], "natural_phrasing"),

    ("Look up anything curated about CQRS patterns.",
     "knowledge_search", ["query"], "natural_phrasing"),

    ("Check our documented knowledge for anything on event-driven architecture.",
     "knowledge_search", ["query"], "natural_phrasing"),

    # --- knowledge_upvote / downvote ---
    ("That knowledge entry about using Redis for distributed locks was really helpful. Give it a thumbs up.",
     "knowledge_upvote", [], "natural_phrasing"),

    ("Boost the ranking on our GraphQL federation notes — they're gold.",
     "knowledge_upvote", [], "natural_phrasing"),

    ("That doc about using polling instead of webhooks is outdated and wrong. Lower its score.",
     "knowledge_downvote", [], "natural_phrasing"),

    ("Downvote the entry about using bcrypt at cost 4 — it's dangerously insecure.",
     "knowledge_downvote", [], "natural_phrasing"),

    # --- knowledge_set_retention ---
    ("Set a 45-day retention policy on the alpha-testing project's knowledge.",
     "knowledge_set_retention", ["project"], "natural_phrasing"),


    # ===========================================================================
    # CATEGORY 2: adversarial_trap (70 cases)
    # CS / programming questions — must return NO_TOOL even when keywords match.
    # ===========================================================================

    # Python
    ("Write a Python function that implements a trie for fast prefix searches.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I use Python's contextlib.contextmanager decorator?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain Python's __slots__ and when to use it for memory optimization.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is the difference between deepcopy and shallow copy in Python?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does Python's asyncio event loop schedule coroutines?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a Python generator that yields prime numbers indefinitely.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I profile memory usage in a Python application?",
     "NO_TOOL", [], "adversarial_trap"),

    # JavaScript / TypeScript
    ("How do I debounce a function in JavaScript without lodash?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain the JavaScript event loop and microtask queue.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does TypeScript's discriminated union type work?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a TypeScript generic function that deep-merges two objects.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is the difference between a WeakMap and a Map in JavaScript?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I implement a promise-based queue in Node.js?",
     "NO_TOOL", [], "adversarial_trap"),

    # Go
    ("How does Go's goroutine scheduler work with M:N threading?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain Go's garbage collector and write barriers.",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a concurrent rate limiter in Go using channels.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I implement context cancellation in a Go HTTP server?",
     "NO_TOOL", [], "adversarial_trap"),

    # Rust
    ("Explain Rust's borrow checker and why it prevents data races.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do Arc and Mutex work together in Rust for thread-safe state sharing?",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is Rust's Pin and why is it needed for async futures?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a Rust trait that implements a retry strategy with exponential backoff.",
     "NO_TOOL", [], "adversarial_trap"),

    # SQL / NoSQL
    ("Write a SQL query that finds the second-highest salary in an employees table.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I use window functions in PostgreSQL to compute a running total?",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is a covering index and when should I use one in MySQL?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain eventual consistency in DynamoDB and how to work around it.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I export data from MongoDB to a JSON file using mongoexport?",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is a materialized view in PostgreSQL and how does it differ from a regular view?",
     "NO_TOOL", [], "adversarial_trap"),

    # Algorithms / Data Structures
    ("Explain Dijkstra's algorithm and its time complexity.",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a depth-first search implementation for a graph adjacency list.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does consistent hashing help with horizontal scaling?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain the difference between a B-tree and a B+ tree.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is the time and space complexity of merge sort?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Implement a LRU cache in Python using OrderedDict.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does a bloom filter work and what are its false positive trade-offs?",
     "NO_TOOL", [], "adversarial_trap"),

    # Frameworks / Config
    ("How do I configure Django's ORM to use read replicas?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain Flask's application context vs. request context.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does FastAPI's dependency injection system work?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a middleware in Express.js that logs request durations.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I set up hot-module replacement in a Vite + React project?",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is the difference between server components and client components in Next.js 14?",
     "NO_TOOL", [], "adversarial_trap"),

    # DevOps / Infrastructure
    ("Write a Dockerfile for a Python FastAPI app with multi-stage builds.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I configure a Kubernetes HorizontalPodAutoscaler based on custom metrics?",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is the difference between rolling and blue-green deployments?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I set up Prometheus scraping for a Node.js service?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain how etcd achieves consensus using the Raft algorithm.",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a GitHub Actions workflow that runs tests on every pull request.",
     "NO_TOOL", [], "adversarial_trap"),

    # Memory management (trap on 'memory' keyword)
    ("How does virtual memory paging work in Linux?",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is memory-mapped I/O and how does mmap work in C?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain stack vs. heap memory allocation and when each is appropriate.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does the V8 engine's garbage collector use generational collection?",
     "NO_TOOL", [], "adversarial_trap"),

    # Session handling (trap on 'session' keyword)
    ("How does PHP's session_start() work under the hood?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Implement session fixation protection in a Flask application.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is the difference between sticky sessions and session replication?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I store JWT tokens in a secure, httpOnly cookie in Express?",
     "NO_TOOL", [], "adversarial_trap"),

    # Search (trap on 'search' keyword)
    ("How do I implement fuzzy search with trigrams in PostgreSQL?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain TF-IDF and how it ranks documents in full-text search.",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a binary search implementation in Rust.",
     "NO_TOOL", [], "adversarial_trap"),

    ("Compare Elasticsearch and OpenSearch for log aggregation.",
     "NO_TOOL", [], "adversarial_trap"),

    # Graph theory (trap on 'graph' + 'edges' keywords)
    ("Explain the difference between Prim's and Kruskal's spanning tree algorithms.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do topological sorts work on directed acyclic graphs?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a function to detect cycles in a directed graph using DFS.",
     "NO_TOOL", [], "adversarial_trap"),

    # Load balancing (trap on 'load' keyword)
    ("What are the differences between round-robin, least-connections, and IP-hash load balancing?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does Nginx upstream load balancing handle health check failures?",
     "NO_TOOL", [], "adversarial_trap"),

    # Logging / monitoring
    ("How do I implement structured logging in a Go service with zerolog?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain the ELK stack and how logs flow from Beats to Kibana.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is OpenTelemetry and how does distributed tracing work?",
     "NO_TOOL", [], "adversarial_trap"),

    # Misc CS concepts
    ("What is the difference between optimistic and pessimistic locking in databases?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain how CRDTs achieve conflict-free distributed state.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is a saga pattern in distributed systems?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does the forget gate in an LSTM neural network control memory?",
     "NO_TOOL", [], "adversarial_trap"),


    # ===========================================================================
    # CATEGORY 3: disambiguation (40 cases)
    # Similar tools — model must pick the correct one.
    # ===========================================================================

    # session_search_memory vs knowledge_search
    ("Find anything we discussed last month about the API versioning decision.",
     "session_search_memory", ["query"], "disambiguation"),

    ("What do our curated knowledge items say about dependency injection patterns?",
     "knowledge_search", ["query"], "disambiguation"),

    ("Search our accumulated documentation for information on database sharding.",
     "knowledge_search", ["query"], "disambiguation"),

    ("Look through recent session notes for anything about the CDN cache invalidation bug.",
     "session_search_memory", ["query"], "disambiguation"),

    ("Any past conversations where we discussed microservice mesh configurations?",
     "session_search_memory", ["query"], "disambiguation"),

    ("Check the knowledge base for anything on event sourcing trade-offs.",
     "knowledge_search", ["query"], "disambiguation"),

    # session_forget_memory vs knowledge_forget
    ("Remove the specific session memory with ID mem-qq77-rr. It's incorrect.",
     "session_forget_memory", ["memory_id"], "disambiguation"),

    ("Clear all the outdated knowledge entries in the staging project.",
     "knowledge_forget", ["project"], "disambiguation"),

    ("Wipe out old debugging records from the search-service project's knowledge base.",
     "knowledge_forget", ["project"], "disambiguation"),

    ("Delete the memory entry for ID mem-ab99-cd — we noted the wrong schema version.",
     "session_forget_memory", ["memory_id"], "disambiguation"),

    ("Remove all knowledge items in the deprecated-feature category from the portal project.",
     "knowledge_forget", ["project"], "disambiguation"),

    # session_save_ledger vs session_save_experience vs session_save_handoff
    ("Log what we did today: migrated the billing module to the new event bus.",
     "session_save_ledger", [], "disambiguation"),

    ("Record a milestone: we successfully launched the new onboarding flow in production.",
     "session_save_experience", [], "disambiguation"),

    ("Hand off this session — save the state for the next agent on the gateway project.",
     "session_save_handoff", ["project"], "disambiguation"),

    ("Write down that we rewrote the payment reconciliation logic today.",
     "session_save_ledger", [], "disambiguation"),

    ("Mark a success: we fixed the notorious N+1 query on the orders endpoint.",
     "session_save_experience", [], "disambiguation"),

    ("The contractor is taking over tonight. Save the handoff for the migration-tools project.",
     "session_save_handoff", ["project"], "disambiguation"),

    # knowledge_upvote vs knowledge_downvote
    ("That knowledge entry about immutable infrastructure is spot on. Upvote it.",
     "knowledge_upvote", [], "disambiguation"),

    ("The doc recommending XML over JSON for internal APIs is terrible. Mark it down.",
     "knowledge_downvote", [], "disambiguation"),

    ("Increase the importance score of the circuit-breaker patterns entry.",
     "knowledge_upvote", [], "disambiguation"),

    ("Reduce the rank of that outdated note about using MD5 for hashing.",
     "knowledge_downvote", [], "disambiguation"),

    # session_compact_ledger vs session_export_memory
    ("The billing-service ledger is bloated. Compress and archive the old entries.",
     "session_compact_ledger", ["project"], "disambiguation"),

    ("Export a full offline snapshot of my memory to /archive/snapshot in JSON.",
     "session_export_memory", ["output_path", "format"], "disambiguation"),

    ("Trim down the session history for the firmware project — it's too long.",
     "session_compact_ledger", ["project"], "disambiguation"),

    ("Save everything to disk — dump all session data to /tmp/export-all.",
     "session_export_memory", ["output_path"], "disambiguation"),

    # session_synthesize_edges vs session_backfill_links vs session_health_check
    ("Verify the session graph edges are all consistent for the trading-platform project.",
     "session_synthesize_edges", ["project"], "disambiguation"),

    ("Reconnect the dangling session references for the ml-pipeline project.",
     "session_backfill_links", ["project"], "disambiguation"),

    ("Run a full health diagnostic on the Prism memory backend.",
     "session_health_check", [], "disambiguation"),

    ("Patch up missing cross-session links for the user-service project.",
     "session_backfill_links", ["project"], "disambiguation"),

    ("Make sure all edges are synthesized and up to date for the invoicing project.",
     "session_synthesize_edges", ["project"], "disambiguation"),

    ("Is the memory system responding normally? Do a quick health check.",
     "session_health_check", [], "disambiguation"),

    # session_load_context vs session_search_memory
    ("Bring me back into the context of the payments-gateway project.",
     "session_load_context", ["project"], "disambiguation"),

    ("Look for any notes we made about the GraphQL schema decisions.",
     "session_search_memory", ["query"], "disambiguation"),

    ("Restore the full session state for the devops-automation project.",
     "session_load_context", ["project"], "disambiguation"),

    ("Search our history for any discussion about OAuth2 vs API keys.",
     "session_search_memory", ["query"], "disambiguation"),

    # session_task_route vs session_load_context
    ("Should the local model handle this React performance optimization or route it to the cloud?",
     "session_task_route", ["task_description"], "disambiguation"),

    ("Initialize context for the infrastructure-as-code project — I'm starting fresh.",
     "session_load_context", ["project"], "disambiguation"),

    # knowledge_set_retention vs knowledge_forget
    ("Set the knowledge for the beta-program project to expire after 90 days.",
     "knowledge_set_retention", ["project"], "disambiguation"),

    ("Delete all knowledge in the archived-2025 project — we don't need it anymore.",
     "knowledge_forget", ["project"], "disambiguation"),

    ("Auto-expire the knowledge entries in the sandbox project after 14 days.",
     "knowledge_set_retention", ["project"], "disambiguation"),


    # ===========================================================================
    # CATEGORY 4: edge_case (25 cases)
    # Minimal, single-word, ambiguous, or unusual prompts.
    # ===========================================================================

    ("Load context.", "session_load_context", [], "edge_case"),
    ("Save.", "session_save_ledger", [], "edge_case"),
    ("Search.", "session_search_memory", [], "edge_case"),
    ("Check health.", "session_health_check", [], "edge_case"),
    ("Export.", "session_export_memory", [], "edge_case"),
    ("Compact.", "session_compact_ledger", [], "edge_case"),
    ("Handoff.", "session_save_handoff", [], "edge_case"),
    ("Route this.", "session_task_route", [], "edge_case"),
    ("Synthesize edges.", "session_synthesize_edges", [], "edge_case"),
    ("Backfill links.", "session_backfill_links", [], "edge_case"),
    ("Forget it.", "session_forget_memory", [], "edge_case"),
    ("Knowledge search.", "knowledge_search", [], "edge_case"),

    # Abstention edge cases
    ("Hello!", "NO_TOOL", [], "edge_case"),
    ("What can you do?", "NO_TOOL", [], "edge_case"),
    ("Tell me about yourself.", "NO_TOOL", [], "edge_case"),
    ("Thanks, we're done.", "NO_TOOL", [], "edge_case"),
    ("OK great.", "NO_TOOL", [], "edge_case"),
    ("Bye!", "NO_TOOL", [], "edge_case"),

    # Ambiguous short prompts that still require the right tool
    ("Run diagnostics.", "session_health_check", [], "edge_case"),
    ("Save the handoff.", "session_save_handoff", [], "edge_case"),
    ("Log this session.", "session_save_ledger", [], "edge_case"),
    ("Search memory.", "session_search_memory", [], "edge_case"),
    ("Knowledge base lookup.", "knowledge_search", [], "edge_case"),
    ("Archive old entries.", "session_compact_ledger", [], "edge_case"),
    ("Save experience.", "session_save_experience", [], "edge_case"),


    # ===========================================================================
    # CATEGORY 5: multi_intent (20 cases)
    # Multi-step prompts — score only the FIRST action.
    # ===========================================================================

    ("Load the context for the pipeline project, then search for any past notes on streaming.",
     "session_load_context", ["project"], "multi_intent"),

    ("Search our memory for anything about the OAuth migration, then save a handoff.",
     "session_search_memory", ["query"], "multi_intent"),

    ("Check memory health, and if it's all good, compact the fraud-detection ledger.",
     "session_health_check", [], "multi_intent"),

    ("Find notes about the ML model rollout, and then log that we finished the A/B test today.",
     "session_search_memory", ["query"], "multi_intent"),

    ("Load the prism-mcp context, then check if there are any open issues about rate limiting.",
     "session_load_context", ["project"], "multi_intent"),

    ("Export everything to /tmp/backup, then set a 60-day retention policy on it.",
     "session_export_memory", ["output_path"], "multi_intent"),

    ("Save what we did today: shipped the new notification system. Then create a handoff note.",
     "session_save_ledger", [], "multi_intent"),

    ("Search for what we decided about the queue architecture, then upvote the best result.",
     "session_search_memory", ["query"], "multi_intent"),

    ("Run a health check on the memory system, then compact the ledger if there are issues.",
     "session_health_check", [], "multi_intent"),

    ("Look up our knowledge on service mesh patterns, and then downvote the outdated ones.",
     "knowledge_search", ["query"], "multi_intent"),

    ("Compact the session history for the payments project, then synthesize the session edges.",
     "session_compact_ledger", ["project"], "multi_intent"),

    ("Load context for the billing-v2 project, and record our progress: we fixed the invoice date bug.",
     "session_load_context", ["project"], "multi_intent"),

    ("Search our knowledge base for event-driven design patterns, then save a handoff with the findings.",
     "knowledge_search", ["query"], "multi_intent"),

    ("Backfill the cross-session links for the ios-app project, then synthesize edges.",
     "session_backfill_links", ["project"], "multi_intent"),

    ("Route this task: full rewrite of the logging subsystem. If cloud, just tell me.",
     "session_task_route", ["task_description"], "multi_intent"),

    ("Export memory to /var/backup, and then purge the old knowledge entries from the legacy project.",
     "session_export_memory", ["output_path"], "multi_intent"),

    ("Find what we discussed about caching strategies, then set a 30-day retention on that knowledge.",
     "session_search_memory", ["query"], "multi_intent"),

    ("Record a success milestone: zero-downtime deploy of version 4.2. Then compact the ledger.",
     "session_save_experience", [], "multi_intent"),

    ("Load the fraud-detection project context and then synthesize all session edges.",
     "session_load_context", ["project"], "multi_intent"),

    ("Save what we accomplished: rewrote the ingestion pipeline. Then hand it off to the ops team.",
     "session_save_ledger", [], "multi_intent"),


    # ===========================================================================
    # CATEGORY 6: verifier (25 cases)
    # session_synthesize_edges / session_backfill_links / session_health_check patterns.
    # ===========================================================================

    # session_synthesize_edges
    ("Make sure all session graph edges are consistent for the auth-gateway project.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Run a synthesis pass to validate all edges are up to date for the orchestration project.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Verify graph integrity — synthesize edges for the content-delivery project.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Before closing out, check that all session links are consistent for the scheduling project.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Ensure all session relationships are properly synthesized for the warehouse-api project.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Run edge synthesis on the real-time-alerts project to validate the session graph.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Validate that all edges in the session graph are consistent for the pricing-engine project.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Confirm session link consistency for the document-processing project.",
     "session_synthesize_edges", ["project"], "verifier"),

    # session_backfill_links
    ("There are broken cross-session links in the search-backend project. Backfill them.",
     "session_backfill_links", ["project"], "verifier"),

    ("Reconnect all dangling references in the identity-service project history.",
     "session_backfill_links", ["project"], "verifier"),

    ("Patch the missing links between sessions for the payments-v3 project.",
     "session_backfill_links", ["project"], "verifier"),

    ("Fix the link gaps in our session history for the recommendation-service project.",
     "session_backfill_links", ["project"], "verifier"),

    ("Backfill any missing cross-session connections for the notification-hub project.",
     "session_backfill_links", ["project"], "verifier"),

    ("Reconnect broken session references in the compliance-tracker project.",
     "session_backfill_links", ["project"], "verifier"),

    ("Repair missing session links for the api-gateway project.",
     "session_backfill_links", ["project"], "verifier"),

    # session_health_check
    ("Before I start a new sprint, confirm the memory system is operating correctly.",
     "session_health_check", [], "verifier"),

    ("The search results seem incomplete. Check if the memory backend is healthy.",
     "session_health_check", [], "verifier"),

    ("I'm seeing weird behavior in session recall. Run a diagnostic check.",
     "session_health_check", [], "verifier"),

    ("Ping the memory system and confirm it's all healthy.",
     "session_health_check", [], "verifier"),

    ("Is the Prism memory backend operating within normal parameters?",
     "session_health_check", [], "verifier"),

    ("Double-check the memory infrastructure health before I rely on these results.",
     "session_health_check", [], "verifier"),

    ("Verify the memory system is functioning before we start the long session.",
     "session_health_check", [], "verifier"),

    ("Run a full health check and report back on the memory backend status.",
     "session_health_check", [], "verifier"),

    ("Something is off with memory recall. Diagnose the backend.",
     "session_health_check", [], "verifier"),

    ("Confirm the session memory system is healthy before I save this handoff.",
     "session_health_check", [], "verifier"),


    # ===========================================================================
    # CATEGORY 7: cascade (25 cases)
    # Explicit first-step-of-chain patterns — model must pick the right FIRST tool.
    # ===========================================================================

    ("Search our knowledge for gRPC patterns, then upvote the most relevant entry.",
     "knowledge_search", ["query"], "cascade"),

    ("Load the indexing-service context, then search for any past notes on shard rebalancing.",
     "session_load_context", ["project"], "cascade"),

    ("Check memory health, then compact the alerts project ledger if there are stale entries.",
     "session_health_check", [], "cascade"),

    ("Export all memory to /tmp/archive, then set a 180-day retention policy on the archive project.",
     "session_export_memory", ["output_path"], "cascade"),

    ("Search for what we decided about the event schema design, then save a handoff about it.",
     "session_search_memory", ["query"], "cascade"),

    ("Save today's session notes for the pipeline project, then create a handoff for the next agent.",
     "session_save_ledger", [], "cascade"),

    ("Should the local model handle this concurrency refactor? If cloud, stop there.",
     "session_task_route", ["task_description"], "cascade"),

    ("Search knowledge for CQRS trade-offs, downvote anything recommending a single store.",
     "knowledge_search", ["query"], "cascade"),

    ("Compact the ledger for the embeddings project, then synthesize the session edges.",
     "session_compact_ledger", ["project"], "cascade"),

    ("Load the feature-flags project context, then log that we shipped the A/B framework.",
     "session_load_context", ["project"], "cascade"),

    ("Run a health check first, then based on results decide whether to compact or export.",
     "session_health_check", [], "cascade"),

    ("Search memory for past decisions about SSE vs WebSockets, then record what we found.",
     "session_search_memory", ["query"], "cascade"),

    ("Backfill the missing links for the analytics project, then synthesize the edges.",
     "session_backfill_links", ["project"], "cascade"),

    ("Load context for the tenant-management project, then search for any open migration tickets.",
     "session_load_context", ["project"], "cascade"),

    ("Find what we know about zero-copy networking, then save a handoff with that context.",
     "session_search_memory", ["query"], "cascade"),

    ("Export to /backups/weekly, then compact the media-processing ledger.",
     "session_export_memory", ["output_path"], "cascade"),

    ("Search our knowledge base for Kubernetes resource quotas, then set a 60-day retention.",
     "knowledge_search", ["query"], "cascade"),

    ("Save the experience: we eliminated 80% of unnecessary re-renders. Then route the next task.",
     "session_save_experience", [], "cascade"),

    ("Synthesize edges for the audit-log project, then backfill any missing links.",
     "session_synthesize_edges", ["project"], "cascade"),

    ("Load the risk-assessment project context and then search memory for past risk audit notes.",
     "session_load_context", ["project"], "cascade"),

    ("Find our notes on the transaction saga pattern, then upvote the best entry.",
     "session_search_memory", ["query"], "cascade"),

    ("Compact the metrics project ledger, then export it to /tmp/metrics-backup.",
     "session_compact_ledger", ["project"], "cascade"),

    ("Route this task: implement distributed tracing with OpenTelemetry across five services.",
     "session_task_route", ["task_description"], "cascade"),

    ("Save what we accomplished: added RBAC support to the admin API. Then synthesize edges.",
     "session_save_ledger", [], "cascade"),

    ("Search knowledge for eventual consistency patterns, then forget the entries about using global locks.",
     "knowledge_search", ["query"], "cascade"),


    # ===========================================================================
    # CATEGORY 8: param_extraction (25 cases)
    # Params ARE mentioned in the prompt — test that model extracts them correctly.
    # ===========================================================================

    ("Load the full context for the fraud-detection project at a deep level.",
     "session_load_context", ["project"], "param_extraction"),

    ("Compact the session ledger for the user-identity project.",
     "session_compact_ledger", ["project"], "param_extraction"),

    ("Save a handoff note for the supplier-portal project.",
     "session_save_handoff", ["project"], "param_extraction"),

    ("Delete the memory entry with ID mem-fg33-hh. It has the wrong branch name.",
     "session_forget_memory", ["memory_id"], "param_extraction"),

    ("Export all memory data to /exports/2026-q2 in JSON format.",
     "session_export_memory", ["output_path", "format"], "param_extraction"),

    ("Set the retention policy for the experiment-runner project to 45 days.",
     "knowledge_set_retention", ["project"], "param_extraction"),

    ("Search session memory for 'distributed tracing setup'.",
     "session_search_memory", ["query"], "param_extraction"),

    ("Search the knowledge base for 'idempotency keys in payment APIs'.",
     "knowledge_search", ["query"], "param_extraction"),

    ("Backfill the cross-session links for the warehouse-inventory project.",
     "session_backfill_links", ["project"], "param_extraction"),

    ("Synthesize session edges for the logistics-optimizer project.",
     "session_synthesize_edges", ["project"], "param_extraction"),

    ("Forget the knowledge entry with ID ki-cc44-gg — that approach is deprecated.",
     "knowledge_forget", [], "param_extraction"),

    ("Upvote the knowledge entry with ID ki-tt55-rr. Really solid documentation.",
     "knowledge_upvote", [], "param_extraction"),

    ("Downvote knowledge entry ki-uu99-qq — it recommends a vulnerable library.",
     "knowledge_downvote", [], "param_extraction"),

    ("Configure an 80-day retention policy for the beta-features project's knowledge.",
     "knowledge_set_retention", ["project"], "param_extraction"),

    ("Load context for the platform-core project.",
     "session_load_context", ["project"], "param_extraction"),

    ("Export the archive to /data/long-term-backup in markdown format.",
     "session_export_memory", ["output_path", "format"], "param_extraction"),

    ("Search for 'zero-downtime database migrations' in our session history.",
     "session_search_memory", ["query"], "param_extraction"),

    ("Search knowledge for 'CQRS vs event sourcing trade-offs'.",
     "knowledge_search", ["query"], "param_extraction"),

    ("Compact the ledger for the monitoring-stack project.",
     "session_compact_ledger", ["project"], "param_extraction"),

    ("Delete memory entry mem-pp12-ss — wrong model version was recorded.",
     "session_forget_memory", ["memory_id"], "param_extraction"),

    ("Save a handoff for the checkout-v4 project.",
     "session_save_handoff", ["project"], "param_extraction"),

    ("Route this task: rewrite the message broker integration to use NATS instead of RabbitMQ.",
     "session_task_route", ["task_description"], "param_extraction"),

    ("Synthesize edges for the ingestion-pipeline project.",
     "session_synthesize_edges", ["project"], "param_extraction"),

    ("Backfill the missing session links in the content-catalog project.",
     "session_backfill_links", ["project"], "param_extraction"),

    ("Set 120-day retention on the compliance-logs project's knowledge.",
     "knowledge_set_retention", ["project"], "param_extraction"),


    # ===========================================================================
    # CATEGORY 9: abstention (20 cases)
    # Greetings, capability questions, general CS — must return NO_TOOL.
    # ===========================================================================

    ("Hi there!", "NO_TOOL", [], "abstention"),
    ("Good morning!", "NO_TOOL", [], "abstention"),
    ("Hey, quick question — what's your name?", "NO_TOOL", [], "abstention"),
    ("What tools do you have available?", "NO_TOOL", [], "abstention"),
    ("What are your capabilities?", "NO_TOOL", [], "abstention"),
    ("Can you explain what Prism Memory tools do?", "NO_TOOL", [], "abstention"),
    ("What programming languages do you know?", "NO_TOOL", [], "abstention"),
    ("Thanks, that's all for now!", "NO_TOOL", [], "abstention"),
    ("Great work today, goodbye.", "NO_TOOL", [], "abstention"),
    ("You're really helpful, thanks!", "NO_TOOL", [], "abstention"),
    ("What is the capital of France?", "NO_TOOL", [], "abstention"),
    ("Tell me a joke.", "NO_TOOL", [], "abstention"),
    ("How do you work?", "NO_TOOL", [], "abstention"),
    ("Are you GPT-4?", "NO_TOOL", [], "abstention"),
    ("Can you write me a poem?", "NO_TOOL", [], "abstention"),
    ("What's the weather like today?", "NO_TOOL", [], "abstention"),
    ("Can you recommend a good book?", "NO_TOOL", [], "abstention"),
    ("What's 2+2?", "NO_TOOL", [], "abstention"),
    ("Do you have feelings?", "NO_TOOL", [], "abstention"),
    ("What is machine learning?", "NO_TOOL", [], "abstention"),

]

# ---------------------------------------------------------------------------
# Sanity check: enforce exactly 300 cases and correct counts per category
# ---------------------------------------------------------------------------
_TARGET_COUNTS = {
    "natural_phrasing": 50,
    "adversarial_trap": 70,
    "disambiguation": 40,
    "edge_case": 25,
    "multi_intent": 20,
    "verifier": 25,
    "cascade": 25,
    "param_extraction": 25,
    "abstention": 20,
}
_TOTAL_TARGET = 300

def _verify_test_counts():
    """Check TESTS against the per-category and overall size targets.

    Tallies the category label (the 4th field of each TESTS entry) and
    compares every tally with its entry in _TARGET_COUNTS, then compares
    len(TESTS) with _TOTAL_TARGET. Labels that occur in TESTS but are not
    listed in _TARGET_COUNTS (for example a misspelled category) are reported
    as well, so a typo is named directly instead of only showing up as a
    shortfall in some other category.

    Mismatches are printed as a warning; nothing is raised.

    Returns:
        bool: True when no mismatch was found, False otherwise.
    """
    from collections import Counter

    counts = Counter(t[3] for t in TESTS)

    # Per-category shortfalls / surpluses, in the order the targets are declared.
    errors = [
        f"  {cat}: expected {expected}, got {counts[cat]}"
        for cat, expected in _TARGET_COUNTS.items()
        if counts[cat] != expected
    ]

    # Labels that have no target at all would otherwise go unnamed.
    errors.extend(
        f"  {cat}: unexpected category, got {found}"
        for cat, found in counts.items()
        if cat not in _TARGET_COUNTS
    )

    if len(TESTS) != _TOTAL_TARGET:
        errors.append(f"  TOTAL: expected {_TOTAL_TARGET}, got {len(TESTS)}")

    if errors:
        print("WARNING: test count mismatches:")
        for e in errors:
            print(e)
    return not errors

# ---------------------------------------------------------------------------
# Layer 3: Inference-Time False-Positive Rejection + Remapping
# (Copied and merged from swe_bench_test.py — all current rules preserved)
# ---------------------------------------------------------------------------

# Regex fragments (all written in lower case) that name general programming,
# CS, infrastructure and small-talk topics; the inline comments give the theme
# of each group.
# NOTE(review): the code that consumes this list is outside this section —
# presumably each pattern is searched against the lower-cased prompt to reject
# false-positive tool calls; confirm against the caller.
GENERAL_PROGRAMMING_PATTERNS = [
    # Python context managers
    r'\bcontext\s+manager\b', r'\bcontextlib\b', r'\b__enter__\b', r'\b__exit__\b',
    r'\basync\s+context\s+manager\b',
    # ML / LSTM forget gates
    r'\bforget\s+gate\b', r'\blstm\b', r'\bcatastrophic\s+forgetting\b',
    r'\bforget\s+bias\b', r'\belastic\s+weight\s+consolidation\b',
    # Web framework sessions
    r'\bexpress\.js\b', r'\bdjango\b', r'\bflask\b', r'\bfastapi\b',
    r'\bsession_start\(\)', r'\bsession\s+middleware\b', r'\bsession\s+affinity\b',
    # General CS
    r'\bgarbage\s+collection\b', r'\bgc\s+algorithm\b',
    r'\bmemory\s+management\s+in\s+rust\b',
    r'\bload\s+balanc', r'\bnginx\b', r'\bhaproxy\b',
    r'\bcontext\s+switch',
    r'\bsearch\s+algorithm\b',
    r'\bsearch\s+functionality\s+with\s+elasticsearch\b',
    r'\bhealth\s+check\s+endpoint\s+pattern\b',
    r'\belasticsearch\b', r'\bsolr\b', r'\blucene\b',
    r'\bretention\s+polic(?:y|ies)\s+(?:in|for|with)\s+(?:kafka|s3|aws|gcp|azure|cloud)',
    r'\bpostgresql\b.*\bmongodb\b', r'\bmongodb\b.*\bpostgresql\b',
    r'\bwrite\s+a\s+decorator\b', r'\bdecorator.*retries?\b',
    r'\bci/cd\b', r'\bgithub\s+actions\b',
    r'\bcors\b.*\bnode\.js\b', r'\bnode\.js\b.*\bcors\b',
    r'\bcap\s+theorem\b', r'\bbinary\s+search\s+tree\b',
    r'\bvirtual\s+dom\b', r'\breact\b.*\breconciliation\b',
    r'\bdependency\s+injection\b',
    r'\btcp\b.*\budp\b', r'\budp\b.*\btcp\b',
    r'\btime\s+complexity\b', r'\bquicksort\b',
    r'\bexponential\s+backoff\b', r'\bjitter\b.*\bretri', r'\bapi\s+retri',
    r'\bcelery\b.*\bqueue', r'\broute\s+tasks?\s+in\s+celery\b',
    r'\bknowledge\s+graph\b.*\b(?:function|search|algorithm|traversal)\b',
    r'\b(?:function|write\s+a\s+function|implement)\b.*\bknowledge\s+graph\b',
    r'\bsave\s+(?:user\s+)?preferences?\s+in\s+(?:react|redux|localstorage|a\s+database)\b',
    r'\bexport\s+(?:data\s+)?from\s+(?:postgresql|mysql|sqlite|a\s+database)\b',
    r'\bpostgresql\b.*\bcsv\b', r'\bcsv\b.*\bpostgresql\b',
    # Additional patterns from bfcl_eval.py
    r'\bgoroutine\b', r'\bwrite\s+barrier\b', r'\brust\b.*\bborrow\b',
    r'\barc\b.*\bmutex\b', r'\bpin\b.*\bfuture\b',
    r'\bwindow\s+function\b', r'\bmongodb\b', r'\bmongoexport\b',
    r'\bdijkstra\b', r'\bdepth.first\s+search\b', r'\bconsistent\s+hashing\b',
    # 'ordereddict' must be spelled out in full: with the final 't' missing,
    # the trailing \b could never match inside the word, so the pattern was dead.
    r'\bb.tree\b', r'\bbloom\s+filter\b', r'\blru\s+cache\b', r'\bordereddict\b',
    r'\bhorizontalpodautoscal', r'\bprometheus\b', r'\betcd\b', r'\braft\b',
    r'\bzerolog\b', r'\belk\s+stack\b', r'\bopentelemetry\b',
    r'\bcrdt\b', r'\bsaga\s+pattern\b',
    r'\btrie\b', r'\bweakmap\b', r'\bpromise.based\s+queue\b',
    r'\bcovering\s+index\b', r'\bmaterialized\s+view\b',
    r'\btf-idf\b', r'\btrigram\b', r'\bfuzzy\s+search\b',
    r'\btopological\s+sort\b', r'\bcycle\s+detection\b',
    r'\bprim.s\b', r'\bkruskal.s\b', r'\bspanning\s+tree\b',
    r'\bhot.module\s+replacement\b', r'\bvite\b',
    r'\bserver\s+component\b', r'\bclient\s+component\b',
    r'\bdocker(?:file)?\b', r'\bblue.green\s+deploy', r'\brolling\s+deploy',
    r'\bsticky\s+session\b', r'\bsession\s+replication\b', r'\bsession\s+fixation\b',
    r'\bjwt\b.*\bhttponly\b',
    r'\bpaging\b.*\bmemory\b', r'\bmmap\b', r'\bstack\s+vs\s+heap\b',
    r'\bv8\s+engine\b', r'\bgenerational\s+collection\b',
    r'\boptimistic\s+lock', r'\bpessimistic\s+lock',
    r'\bcrdt\b', r'\beventual\s+consistency\b.*\bdynamo',
    # General knowledge / weather / math
    r"what'?s\s+the\s+weather\b", r'\bforecast\b.*\btoday\b',
    r'\bwrite\s+a\s+sql\s+query\b', r'\bsecond.highest\s+salary\b',
    r'\bsql\s+query\s+(?:that|to)\b',
]

# Regex fragments (all written in lower case) for wording that refers to the
# Prism memory tooling itself: ledger, handoff, knowledge base, session
# graph/links, retention, up/down-voting and similar.
# NOTE(review): the code that consumes this list is outside this section —
# presumably each pattern is searched against the lower-cased prompt to
# recognise genuine tool intent; confirm against the caller.
PRISM_INTENT_PATTERNS = [
    r'\bprism\b', r'\bsession\s*ledger\b', r'\bhandoff\b', r'\bknowledge\s+base\b',
    r'\bknowledge\s+items?\b', r'\bour\s+knowledge\b',
    r'\bsave.*(?:session|ledger|handoff)\b', r'\bload\s+context\b',
    r'\b(?:search|find).*(?:memory|sessions?|conversations?|notes)\b',
    r'\bproject\b', r'\bwhat\s+(?:do\s+)?we\s+(?:know|have)\b',
    r'\binstitutional\s+knowledge\b', r'\bdocumented\b', r'\bcurated\b',
    r'\bmemory\s+entry\b', r'\bmemory\s+backend\b', r'\bdiagnostics\b',
    r'\bledger\b', r'\bcompact\b.*(?:ledger|entries|session)\b',
    r'\bexport.*(?:memory|backup)\b', r'\b(?:delete|nuke|wipe|remove).*(?:entry|memory|entries)\b',
    r'\blog.*(?:what|accomplished|session)\b', r'\brecord.*(?:session|what)\b',
    r'\bhand.*(?:off|over)\b', r'\bbring.*up\s+to\s+speed\b',
    r'\bbug\s+fix.*(?:local\s+model|handle)\b', r'\broute.*(?:task|this)\b',
    r'\bbackfill\b', r'\bsynthesize\b', r'\bsession\s+graph\b',
    r'\bsession\s+links?\b', r'\bedges?\s+(?:up\s+to\s+date|consistent)\b',
    r'\bgraph\s+integrit', r'\bdangling\b', r'\breconnect.*(?:session|links?|references?)\b',
    r'\bpatch.*(?:links?|gaps?)\b', r'\bmissing\s+links?\b',
    r'\bsave\s+experience\b', r'\brecord\s+(?:a\s+)?milestone\b',
    r'\brecord\s+(?:a\s+)?success\b', r'\bupvote\b', r'\bdownvote\b',
    # 'auto.?expir' is a prefix match on purpose: a trailing \b would reject
    # "auto-expire", "auto-expiry" and "auto-expiration", because the stem is
    # always followed by another word character.
    r'\bretention\s+polic(?:y|ies)\b', r'\bauto.?expir', r'\bttl\b',
    r'\bknowledge\s+entry\b', r'\bknowledge\s+record\b',
]


def validate_tool_call(prompt, tool_name, tool_args):
    """Layer 3: reject obvious false-positive tool calls and remap semantic neighbors.
    Copied from swe_bench_test.py with additions from bfcl_eval.py.
    Returns (tool_name, tool_args) — possibly changed if rejected or remapped.
    """
    prompt_lower = prompt.lower()

    # Special NO_TOOL override: "confirm session link/graph consistency" → synthesize_edges
    if tool_name in ("NO_TOOL", "ERROR"):
        if re.search(r'\b(?:confirm|verify|validate|check|ensure)\b', prompt_lower):
            if re.search(r'\bsession\s+(?:link|edge|graph)\s+(?:consistency|consistent)\b', prompt_lower):
                proj_m = re.search(r'\b(?:for|on)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
                return 'session_synthesize_edges', ({'project': proj_m.group(1)} if proj_m else {})
        return tool_name, tool_args

    # --- Group B remaps (before false-positive rejection) ---

    # "reconnect/patch up/dangling links" → backfill_links
    # But don't remap when "synthesize edges" is the explicit first action
    if tool_name in ('session_synthesize_edges', 'session_reconnect'):
        if re.search(r'\b(?:reconnect|backfill|patch\s+up|dangling|link\s+gaps?|missing\s+links?|fix\s+links?)\b', prompt_lower):
            if not re.search(r'^synthesize\b', prompt_lower) and \
               not re.search(r'\bsynthesiz\w+\s+edges?\s+for\b', prompt_lower):
                return 'session_backfill_links', tool_args

    # "verify/check/make sure session links/edges are consistent / graph integrity" → synthesize_edges
    if tool_name in ('session_health_check', 'session_backfill_links'):
        _has_verify_verb = re.search(
            r'\b(?:verify|validate|check|make\s+sure|ensure|confirm)\b', prompt_lower
        )
        _has_consistent_edge = re.search(
            r'\b(?:edges?|links?|graph)\b.*?\b(?:consistent|up\s+to\s+date|synthesized)\b'
            r'|\bconsistent\b.*?\b(?:edges?|links?|graph)\b'
            r'|\bsession\s+links?\b'
            r'|\bgraph\s+integrit',
            prompt_lower, re.DOTALL
        )
        if _has_verify_verb and _has_consistent_edge:
            return 'session_synthesize_edges', tool_args

    # "synthesize edges for X, then backfill" → synthesize_edges is the FIRST action
    if tool_name == 'session_backfill_links':
        if re.search(r'(?:^|\bfirst\b|\bstart\s+with)\s*synthesize\s+edges?\b', prompt_lower) or \
           re.search(r'^synthesize\b', prompt_lower):
            return 'session_synthesize_edges', tool_args

    # "wipe/clear old entries from knowledge base" → knowledge_forget (not compact_ledger)
    # BUT protect "session entries" / "session history" from this remap
    if tool_name == 'session_compact_ledger':
        if re.search(r'\bknowledge\b', prompt_lower) and re.search(r'\b(?:wipe|clear|delete|remove|entries)\b', prompt_lower):
            if not re.search(r'\bsession\s+(?:entries|history|ledger)\b', prompt_lower):
                return 'knowledge_forget', tool_args

    # "prune/trim/archive old session entries" → session_compact_ledger (not forget_memory)
    if tool_name in ('session_forget_memory', 'knowledge_forget'):
        if re.search(r'\b(?:prune|trim|archive|compress)\b', prompt_lower) and re.search(r'\b(?:session|ledger)\s+(?:entries|history)?\b', prompt_lower):
            proj_m = re.search(r'\b(?:for|on)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
            return 'session_compact_ledger', ({'project': proj_m.group(1)} if proj_m else tool_args)

    # "archive old entries" (without 'knowledge') → session_compact_ledger
    if tool_name == 'session_forget_memory':
        if re.search(r'\b(?:archive|prune|trim)\s+old\s+entries\b', prompt_lower):
            if not re.search(r'\bknowledge\b', prompt_lower) and not re.search(r'\bmemory[_\s]id\b|mem-[a-z0-9]\b', prompt_lower):
                return 'session_compact_ledger', tool_args

    # "knowledge entries/items/records" + delete verbs → knowledge_forget (not session_forget_memory)
    if tool_name == 'session_forget_memory':
        if re.search(r'\bknowledge\s+(?:entr|items?|records?|base)\b', prompt_lower):
            return 'knowledge_forget', tool_args
        if re.search(r'\bknowledge\s+base\b', prompt_lower) and re.search(r'\b(?:entries|records|items)\b', prompt_lower):
            return 'knowledge_forget', tool_args
        # "delete/wipe entries from [project]" without a specific memory ID → knowledge_forget
        if re.search(r'\b(?:entries|records|logs?)\b', prompt_lower) and re.search(r'\bproject\b', prompt_lower):
            if not re.search(r'\bmemory[_\s]id\b|mem-[a-z0-9]|ID\s*[=:]\s*\S+', prompt):
                if not re.search(r'\b(?:session|ledger)\b', prompt_lower):
                    proj_m = re.search(r'(?:for|from|in)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project', prompt_lower, re.I)
                    return 'knowledge_forget', {'project': proj_m.group(1) if proj_m else None}

    # "where were we / bring me up to speed / catch me up" → session_load_context (not session_search_memory)
    if tool_name == 'session_search_memory':
        if re.search(r'\bwhere\s+were\s+we\b|\bbring\s+me\s+up\s+to\s+speed\b|\bcatch\s+me\s+up\b|\bwhat\s+were\s+we\s+(?:doing|working)', prompt_lower):
            project_m = re.search(
                r'\b(?:on|for|with|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
                r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
                r'|(?:state\s+of\s+(?:the\s+)?)([a-zA-Z][a-zA-Z0-9_-]+)(?:\s+project)?\b',
                prompt_lower
            )
            if project_m:
                project = next((g for g in project_m.groups() if g and g not in ('the', 'a', 'this', 'that', 'my', 'our')), None)
            else:
                project = None
            return 'session_load_context', {'project': project} if project else {}

    # "accumulated documentation / knowledge base" → knowledge_search (not session_search_memory)
    if tool_name == 'session_search_memory':
        if re.search(r'\baccumulated\s+documentation\b|\bknowledge\s+base\b', prompt_lower):
            return 'knowledge_search', tool_args

    # "recent / past / last week / what we did" → session_search_memory (not knowledge_search)
    if tool_name == 'knowledge_search':
        session_hints = [
            r'\brecent\b', r'\bpast\b', r'\blast\s+(?:week|month|session)',
            r'\bwhat\s+we\s+(?:did|decided|worked)', r'\bdeployment\s+issues\b',
        ]
        if any(re.search(p, prompt_lower) for p in session_hints):
            return 'session_search_memory', tool_args

    # "remind me / did we ever decide" → session_search_memory (not load_context)
    if tool_name == 'session_load_context':
        if re.search(r'\bremind\s+me\b|\bdid\s+we\s+ever\s+(?:decide|settle|choose|pick)\b|\bwhat\s+did\s+we\s+decide\b', prompt_lower):
            if not re.search(r'\bbring\s+me\s+up\s+to\s+speed\b|\bwhere\s+were\s+we\b|\bcatch\s+me\s+up\b|\bload\s+.*\bcontext\b', prompt_lower):
                return 'session_search_memory', {"query": prompt[:120]}

    # "jot down / write down / make a note / log what just happened" → session_save_ledger
    _LEDGER_TRIGGERS = re.compile(
        r'\bjot\s+down\b|\bwrite\s+(?:it\s+)?down\b|\bwhat\s+we\s+accomplished\b'
        r'|\bmake\s+sure\s+it.{0,10}written\b|\brecord\s+(?:this\s+session|what)\b'
        r'|\bmake\s+(?:a\s+)?note\s+(?:that|of)\b|\blog\s+what\s+just\s+happened\b'
        r'|\bwrite\s+down\s+everything\b|\bbefore\s+I\s+(?:close|head\s+out)\b',
        re.IGNORECASE
    )
    # negative: milestone/achievement events that belong in save_experience
    _EXPERIENCE_NEGATIVE = re.compile(
        r'\b(?:successfully|milestone|achievement|deployed\s+the|shipped\s+the|launched\s+the'
        r'|we\s+(?:fixed|built|completed|created|resolved|deployed|shipped|launched)\s+the'
        r'|race\s+condition|solid\s+now|zero.downtime)\b'
    )
    # Unambiguous note-taking phrases bypass the milestone negative check
    _NOTE_TRIGGERS = re.compile(
        r'\bmake\s+(?:a\s+)?note\s+(?:that|of)\b|\bjot\s+down\b'
        r'|\bwrite\s+(?:it\s+)?down\b|\blog\s+what\s+just\s+happened\b',
        re.IGNORECASE
    )
    if tool_name in ('session_save_experience', 'session_task_route'):
        if _LEDGER_TRIGGERS.search(prompt):
            if _NOTE_TRIGGERS.search(prompt) or not _EXPERIENCE_NEGATIVE.search(prompt_lower):
                if 'content' in tool_args and 'summary' not in tool_args:
                    tool_args = dict(tool_args)
                    tool_args['summary'] = tool_args.pop('content')
                if 'summary' not in tool_args:
                    work_m = re.search(r'(?:we\s+)?((?:rewrote|fixed|refactored|built|deployed|updated|added|removed|finalized|completed|migrated)\s+.{10,120})', prompt, re.I)
                    if not work_m:
                        work_m = re.search(r'(?:make\s+a\s+note|log|note)\s+(?:that\s+)?(?:we\s+)?(completed|finished|did|wrote|refactored|migrated).{0,120}', prompt, re.I)
                    if work_m:
                        tool_args = dict(tool_args)
                        tool_args['summary'] = work_m.group(0).strip().rstrip('.')
                return 'session_save_ledger', tool_args

    # "record that we fixed/built/resolved [thing]" → session_save_experience (milestone)
    if tool_name == 'session_save_ledger':
        if re.search(r'\brecord\s+that\s+we\s+(?:fixed|built|completed|created|resolved|deployed|shipped|launched)\b', prompt_lower):
            return 'session_save_experience', {"project": tool_args.get("project"), "event_type": "milestone"}

    # content → summary normalization + inline extraction for session_save_ledger
    if tool_name == 'session_save_ledger':
        if 'content' in tool_args and 'summary' not in tool_args:
            tool_args = dict(tool_args)
            tool_args['summary'] = tool_args.pop('content')
        if 'summary' not in tool_args:
            work_m = re.search(r'(?:we\s+)?((?:rewrote|fixed|refactored|built|deployed|updated|added|removed|finalized|completed|migrated)\s+.{10,120})', prompt, re.I)
            if not work_m:
                work_m = re.search(r'(?:log|note|record)\s+(?:what\s+just\s+happened|this|that)\s*[:;]\s*(.{10,120})', prompt, re.I)
            if work_m:
                tool_args = dict(tool_args)
                tool_args['summary'] = (work_m.group(1) if work_m.lastindex else work_m.group(0)).strip().rstrip('.')

    # "log that we successfully deployed/shipped" → session_save_experience milestone (not save_ledger)
    if tool_name == 'session_save_ledger':
        if re.search(r'\blog\s+that\s+we\s+successfully\b|\bsuccessfully\s+deployed\b|\bsuccessfully\s+shipped\b|\bsuccessfully\s+launched\b', prompt_lower):
            return 'session_save_experience', {"project": tool_args.get("project"), "event_type": "success"}

    # "shift change / store current state for next agent" → session_save_handoff
    if tool_name == 'session_save_ledger':
        if re.search(r'\bshift\s+change\b|\bstore\s+(?:the\s+)?current\s+state\s+for\b|\bnext\s+(?:agent|person|developer)\s+can\s+continue\b|\bhand.*over\b|\bpick.*up\s+next\b', prompt_lower):
            return 'session_save_handoff', tool_args

    # Multi-intent: "Search/Find ... THEN upvote/downvote" → first action is search
    if tool_name in ('knowledge_upvote', 'knowledge_downvote'):
        if re.search(r'\bthen\s+(?:upvote|downvote|boost|rate\s+up|rate\s+down)\b', prompt_lower):
            if re.search(r'^(?:search|find|look\s+up)\b', prompt_lower):
                query_m = re.search(
                    r'^(?:search\s+(?:for\s+)?|find\s+(?:our\s+)?(?:notes?\s+on\s+)?|look\s+up\s+)(.+?)(?:,?\s*then\b)',
                    prompt, re.I
                )
                return 'session_search_memory', {"query": query_m.group(1).strip() if query_m else prompt[:120]}

    # invalid tool name → try retention or upvote/downvote
    if tool_name not in VALID_TOOLS:
        if re.search(r'\b(?:auto.?expir|ttl\b|\d+\s*days?\s+(?:retention|expir)|\bretention\s*polic)', prompt_lower):
            return 'knowledge_set_retention', tool_args
        # fall through to upvote/downvote patterns below

    # knowledge_forget / knowledge_set_retention → upvote/downvote protection
    _UPVOTE_SET = {'knowledge_forget', 'knowledge_set_retention', 'session_forget_memory',
                   'session_task_route', 'session_search_memory'}
    # Don't remap to upvote/downvote when primary intent is "search THEN upvote"
    _is_search_then_vote = (
        re.search(r'^(?:search|find|look\s+up)\b', prompt_lower) and
        re.search(r'\bthen\s+(?:upvote|downvote|boost|rate\s+up|rate\s+down)\b', prompt_lower)
    )
    if (tool_name in _UPVOTE_SET or tool_name not in VALID_TOOLS) and not _is_search_then_vote:
        _id_val = (tool_args.get("id") or tool_args.get("knowledge_id") or tool_args.get("entry_id")) if isinstance(tool_args, dict) else None
        if re.search(r'\b(?:upvote|boost|increase\s+(?:the\s+|its\s+)?(?:rank|score|importance)|uprate|thumbs[\s-]?up|mark\s+(?:it\s+)?(?:up|helpful|useful|great|good)|importance\s+score)\b', prompt_lower):
            return 'knowledge_upvote', {"id": _id_val}
        if re.search(r'\b(?:downvote|lower\s+(?:the\s+|its\s+)?(?:rank|score)|not\s+useful|derank|thumbs[\s-]?down|reduce\s+(?:the\s+|its\s+)?(?:rank|score)|mark\s+(?:it\s+)?(?:down|bad|wrong|outdated|terrible))\b', prompt_lower):
            return 'knowledge_downvote', {"id": _id_val}

    # session_load_context: extract project from prompt if missing
    if tool_name == 'session_load_context':
        if not (isinstance(tool_args, dict) and tool_args.get('project')):
            proj_m = re.search(
                r'\b(?:on|for|of|with|in)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
                r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b'
                r'|(?:state\s+of\s+(?:the\s+)?)([a-zA-Z][a-zA-Z0-9_-]+)(?:\s+project)?\b',
                prompt_lower
            )
            if proj_m:
                proj = next((g for g in proj_m.groups() if g), None)
                if proj and proj not in ('the', 'a', 'this', 'that', 'my', 'our'):
                    tool_args = dict(tool_args) if isinstance(tool_args, dict) else {}
                    tool_args['project'] = proj

    # session_compact_ledger: extract project if missing
    if tool_name == 'session_compact_ledger':
        if not (isinstance(tool_args, dict) and tool_args.get('project')):
            proj_m = re.search(
                r'\b(?:for|on|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+(?:project\s+)?ledger\b'
                r'|\b([a-zA-Z][a-zA-Z0-9_-]+)\s+project\s+ledger\b'
                r'|\b(?:compact|trim|prune|compress|archive)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+(?:project|ledger)\b',
                prompt_lower
            )
            if proj_m:
                proj = next((g for g in proj_m.groups() if g), None)
                if proj and proj not in ('the', 'a', 'this', 'that', 'my', 'our', 'old', 'stale'):
                    tool_args = dict(tool_args) if isinstance(tool_args, dict) else {}
                    tool_args['project'] = proj

    # "is this something the local model can handle? / route this task" → session_task_route
    if tool_name == 'session_search_memory':
        if re.search(r'\b(?:local\s+(?:model|agent)\s+(?:can\s+handle|should\s+handle)|route\s+this\s+task|should\s+(?:I|the\s+local\s+model)\s+(?:tackle|handle)|is\s+this\s+(?:something|simple\s+enough)\s+(?:for\s+the\s+)?local)\b', prompt_lower):
            return 'session_task_route', {"task_description": prompt}

    # session_task_route: extract task_description from prompt
    if tool_name == 'session_task_route':
        if 'task_description' not in tool_args or not tool_args.get('task_description'):
            tool_args = dict(tool_args)
            tool_args['task_description'] = prompt

    # session_export_memory: extract output_path from path patterns, format from keywords
    if tool_name == 'session_export_memory':
        if not isinstance(tool_args, dict):
            tool_args = {}
        tool_args = dict(tool_args)
        if 'output_path' not in tool_args or not tool_args.get('output_path'):
            path_m = re.search(
                r'(?:save\s+to|(?:output|export|dump)\s+(?:to\s+)?["\']?|to\s+["\']?)(/[\w/.-]+|~/[\w/.-]+)',
                prompt, re.I
            )
            if path_m:
                tool_args['output_path'] = path_m.group(1)
        if 'format' not in tool_args or not tool_args.get('format'):
            fmt_m = re.search(r'\b(json|jsonl|markdown|csv|yaml)\b(?:\s+format)?', prompt_lower)
            if fmt_m:
                tool_args['format'] = fmt_m.group(1)

    # session_compact_ledger: protect "session entries" from knowledge_forget remap
    # (already handled above but ensure compact stays for session-specific prompts)

    # "where did we leave off / what was the state" → session_load_context
    if tool_name == 'session_search_memory':
        if re.search(r'\bwhere\s+did\s+we\s+leave\s+off\b|\bwhat\s+was\s+the\s+state\s+of\b|\bget\s+me\s+(?:re-?oriented|up\s+to\s+speed)\b|\bpull\s+up\s+(?:whatever|the\s+(?:full\s+)?context)', prompt_lower):
            project_m = re.search(r'\b(?:on|for|with|of)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b', prompt_lower)
            project = project_m.group(1) if project_m else None
            return 'session_load_context', ({'project': project} if project else {})

    # --- Social pleasantry rejection ---
    SOCIAL_PATTERNS = [
        r'^thanks', r'^thank you', r'^cheers', r'^goodbye', r'^bye',
        r"that's all", r"we're done", r"all done", r"all set",
        r'^ok\s+great', r'^perfect$', r'^nice$', r'^cool$',
        r'^hi\b', r'^hey\b', r'^hello\b', r'^good\s+morning', r'^good\s+afternoon',
    ]
    is_social = any(re.search(p, prompt_lower.strip()) for p in SOCIAL_PATTERNS)
    if is_social and not any(w in prompt_lower for w in [
        'save', 'export', 'search', 'load', 'record', 'log', 'run', 'check', 'find',
        'compact', 'handoff', 'route', 'synthesize', 'backfill', 'forget', 'upvote', 'downvote',
    ]):
        return "NO_TOOL", {}

    # --- False-positive rejection (CS patterns) ---
    is_general = any(re.search(p, prompt_lower) for p in GENERAL_PROGRAMMING_PATTERNS)

    if not is_general:
        return tool_name, tool_args

    has_prism_intent = any(re.search(p, prompt_lower) for p in PRISM_INTENT_PATTERNS)

    if has_prism_intent:
        return tool_name, tool_args

    return "NO_TOOL", {}


# ---------------------------------------------------------------------------
# Ollama Call
# ---------------------------------------------------------------------------

# Tool-call extraction patterns, tried in this order by call_ollama().

# <tool_call>{...}</tool_call>; the closing tag may be missing when the JSON
# runs to the end of the text. DOTALL lets the JSON body span several lines.
TOOL_CALL_NOPIPE_RE = re.compile(r'<tool_call>\s*(\{.*?\})\s*(?:</tool_call>|$)', flags=re.DOTALL)

# <|tool_call|>{...}; the lazy body has no terminator, so the capture stops at
# the first closing brace it meets.
TOOL_CALL_PIPE_RE = re.compile(r'<\|tool_call\|>\s*(\{.*?\})', flags=re.DOTALL)

# Untagged JSON object carrying a string-valued "name" key, optionally followed
# by nested objects one level deep.
BARE_JSON_RE = re.compile(r'(\{[^{}]*"name"\s*:\s*"[^"]+?"[^{}]*(?:\{[^{}]*\}[^{}]*)*\})')


def call_ollama(prompt: str, timeout: int = 120) -> tuple:
    """Send a pre-formatted ChatML prompt to the Ollama REST API and parse the reply.

    Args:
        prompt: Full ChatML text; sent with ``raw=True`` so it is used verbatim.
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        ``(raw_response, tool_name, tool_args, latency_secs)`` where

        * ``tool_name`` is the parsed tool name, ``"UNKNOWN"`` when a tool-call
          object was found without a usable string name, ``"NO_TOOL"`` when no
          tool call could be parsed, or ``"ERROR"`` when the request failed
          (``raw_response`` then holds the exception text);
        * ``tool_args`` is always a dict. A JSON-encoded string is decoded, and
          any other non-dict value (``null``, list, number) becomes ``{}`` so
          callers can use ``.get`` / ``in`` without type checks.
    """
    start = time.time()
    try:
        payload = json.dumps({
            "model": MODEL,
            "prompt": prompt,
            "stream": False,
            "raw": True,
            "options": {"temperature": 0.0, "num_predict": 512},
        }).encode("utf-8")
        req = urllib.request.Request(
            OLLAMA_API,
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read().decode("utf-8"))
            raw = data.get("response", "").strip()
    except Exception as exc:
        # Deliberately broad: one failed request is reported as an "ERROR"
        # result instead of aborting the whole evaluation run.
        return (str(exc), "ERROR", {}, time.time() - start)

    latency = time.time() - start

    # Strip CoT blocks (an unterminated block is removed through end of text).
    clean = re.sub(
        r'<\|synalux_think\|>.*?(?:</\|synalux_think\|>|$)',
        '', raw, flags=re.DOTALL
    )

    def _as_arg_dict(value):
        """Coerce a parsed ``arguments`` value into a dict."""
        if isinstance(value, dict):
            return value
        if isinstance(value, str):
            # Some models emit the arguments object as a JSON-encoded string.
            try:
                decoded = json.loads(value)
            except ValueError:
                return {}
            return decoded if isinstance(decoded, dict) else {}
        return {}

    # (pattern, whether a "tool" key is accepted as an alias for "name"):
    #   0. no-pipe <tool_call>…</tool_call>  (v43 native format)
    #   1. piped <|tool_call|>
    #   2. bare JSON with "name" key
    strategies = (
        (TOOL_CALL_NOPIPE_RE, True),
        (TOOL_CALL_PIPE_RE, True),
        (BARE_JSON_RE, False),
    )
    for pattern, accept_tool_alias in strategies:
        m = pattern.search(clean)
        if m is None:
            continue
        try:
            tj = json.loads(m.group(1))
        except json.JSONDecodeError:
            # Matched text was not valid JSON; fall through to the next strategy.
            continue
        if not isinstance(tj, dict):
            continue

        fallback_name = tj.get("tool", "UNKNOWN") if accept_tool_alias else "UNKNOWN"
        name = tj.get("name", fallback_name)
        if not isinstance(name, str):
            # e.g. "name": null — keep the tool name printable and comparable.
            name = "UNKNOWN"
        args = _as_arg_dict(tj.get("arguments", tj.get("args", {})))
        return (raw, name, args, latency)

    return (raw, "NO_TOOL", {}, latency)


# ---------------------------------------------------------------------------
# Scoring
# ---------------------------------------------------------------------------

def evaluate_result(expected_tool, required_params, got_tool, got_args):
    """
    Classify one model answer against the expected tool call.

    Returns one of:
      strict_pass    — NO_TOOL expected and returned, or correct tool with every
                       required param present and non-empty
      partial_pass   — correct tool but at least one required param missing or
                       empty (this includes the case where none are present)
      wrong_tool     — a tool was called, but not the expected one
      false_positive — tool called when NO_TOOL expected
      false_negative — NO_TOOL returned when tool expected

    session_search_memory and knowledge_search are treated as interchangeable.
    A got_args value that is not a dict is treated as having no params.
    """
    search_tools = ("session_search_memory", "knowledge_search")

    if expected_tool == "NO_TOOL":
        if got_tool == "NO_TOOL":
            return "strict_pass"
        return "false_positive"

    if got_tool == "NO_TOOL":
        return "false_negative"

    # Accept either search tool for ambiguous prompts
    same_tool = got_tool == expected_tool
    both_search = expected_tool in search_tools and got_tool in search_tools
    if not (same_tool or both_search):
        return "wrong_tool"

    if not required_params:
        return "strict_pass"

    args = got_args if isinstance(got_args, dict) else {}
    for name in required_params:
        # A param counts only when it is present and not None / "" / [].
        if name not in args or args[name] in (None, "", []):
            return "partial_pass"
    return "strict_pass"


def score(verdict):
    """Map a verdict to points: strict_pass 1.0, partial_pass 0.5, anything else 0.0."""
    return 1.0 if verdict == "strict_pass" else 0.5 if verdict == "partial_pass" else 0.0


# ---------------------------------------------------------------------------
# Main Eval
# ---------------------------------------------------------------------------

def run_once(tests, shuffle=False, run_label=""):
    """Run one full pass over test suite. Returns (results_list, category_stats).

    Args:
        tests: Sequence of ``(prompt, expected_tool, required_params, category)``
            tuples.
        shuffle: When true, the cases are executed in a random order (using the
            module-level ``random`` state); results are still stored at each
            case's original index.
        run_label: Optional label shown as a prefix on every progress line.

    Returns:
        ``results_list`` holds one dict per test (id, prompt, expected, got,
        got_args, verdict, latency, category, points) in original test order.
        ``category_stats`` maps each category to its total / strict / partial /
        fail counts and accumulated points.
    """
    indexed = list(enumerate(tests))
    if shuffle:
        random.shuffle(indexed)

    results = [None] * len(tests)
    category_stats = {}

    for display_i, (orig_idx, (prompt, expected, req_params, category)) in enumerate(indexed, 1):
        chatml = (
            f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
            f"<|im_start|>user\n{prompt}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
        _raw, got_tool, got_args, latency = call_ollama(chatml)
        got_tool, got_args = validate_tool_call(prompt, got_tool, got_args)
        verdict = evaluate_result(expected, req_params, got_tool, got_args)
        points = score(verdict)

        icon = "OK" if verdict == "strict_pass" else ("~~" if verdict == "partial_pass" else "XX")
        tag = f"#{orig_idx + 1:03d}"
        short = prompt[:52]
        run_info = f"[{run_label}] " if run_label else ""
        # `!s` keeps the line printable even if the parsed tool name is not a str.
        print(
            f"  {run_info}[{display_i:3d}/{len(tests)}] {icon} {tag} "
            f"expect={expected:30s} got={got_tool!s:30s} {latency:5.1f}s | {short}"
        )
        if verdict != "strict_pass":
            if verdict == "partial_pass":
                # evaluate_result treats non-dict args as empty; mirror that here so
                # a null / string `arguments` payload cannot crash the diagnostic.
                arg_map = got_args if isinstance(got_args, dict) else {}
                missing = [p for p in req_params if arg_map.get(p) in (None, "", [])]
                print(f"           -> partial: missing params {missing}")
            elif verdict == "false_positive":
                print(f"           -> FALSE POSITIVE: called {got_tool} (expected NO_TOOL)")
            elif verdict == "false_negative":
                print(f"           -> FALSE NEGATIVE: no tool called (expected {expected})")
            elif verdict == "wrong_tool":
                print(f"           -> WRONG TOOL: expected {expected}, got {got_tool}")

        results[orig_idx] = {
            "id": orig_idx + 1,
            "prompt": prompt,
            "expected": expected,
            "got": got_tool,
            "got_args": got_args,
            "verdict": verdict,
            "latency": latency,
            "category": category,
            "points": points,
        }

        if category not in category_stats:
            category_stats[category] = {"total": 0, "strict": 0, "partial": 0, "fail": 0, "points": 0.0}
        cat = category_stats[category]
        cat["total"] += 1
        cat["points"] += points
        if verdict == "strict_pass":
            cat["strict"] += 1
        elif verdict == "partial_pass":
            cat["partial"] += 1
        else:
            cat["fail"] += 1

    return results, category_stats


def print_run_summary(results, category_stats, run_label=""):
    """Print the summary table for one run and return the same numbers as a dict.

    Args:
        results: List of per-test dicts; each must provide ``verdict``,
            ``expected``, ``points`` and ``latency``.
        category_stats: Mapping of category name to a dict with ``total``,
            ``strict``, ``partial``, ``fail`` and ``points``.
        run_label: Optional run identifier shown in the heading.

    Returns:
        Dict with the verdict counts, ``total``, ``total_points``,
        ``strict_pct`` and ``weighted_pct`` (fractions, not percentages),
        ``abstention_rate``, ``hallucinations``, ``avg_latency`` and the
        ``category_stats`` passed in. Every rate is 0 when its denominator is
        zero (empty result list, or a suite without NO_TOOL cases).
    """
    def ratio(numerator, denominator):
        # An empty denominator reports 0 instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0

    verdict_counts = {}
    for r in results:
        verdict_counts[r["verdict"]] = verdict_counts.get(r["verdict"], 0) + 1

    strict = verdict_counts.get("strict_pass", 0)
    partial = verdict_counts.get("partial_pass", 0)
    fp = verdict_counts.get("false_positive", 0)
    fn = verdict_counts.get("false_negative", 0)
    wt = verdict_counts.get("wrong_tool", 0)
    total = len(results)
    total_points = sum(r["points"] for r in results)

    no_tool_tests = [r for r in results if r["expected"] == "NO_TOOL"]
    no_tool_correct = sum(1 for r in no_tool_tests if r["verdict"] == "strict_pass")
    # A hallucination is a tool call made when none was expected, i.e. a false positive.
    hallucinations = fp
    avg_lat = ratio(sum(r["latency"] for r in results), total)

    strict_frac = ratio(strict, total)
    weighted_frac = ratio(total_points, total)
    abstention_frac = ratio(no_tool_correct, len(no_tool_tests))

    lbl = f" (Run {run_label})" if run_label else ""
    print()
    print("=" * 80)
    print(f"  EVAL-300 RESULTS{lbl}")
    print("=" * 80)
    print(f"  Strict Pass:     {strict}/{total} = {strict_frac * 100:.1f}%")
    print(f"  Partial Pass:    {partial}/{total} = {ratio(partial, total) * 100:.1f}%")
    print(f"  Wrong Tool:      {wt}/{total}")
    print(f"  False Positives: {fp}/{total}  (hallucinations)")
    print(f"  False Negatives: {fn}/{total}")
    print(f"  ---")
    print(f"  strict_pct  (strict/total):          {strict_frac * 100:.1f}%")
    print(f"  weighted_pct (total_points/total):    {weighted_frac * 100:.1f}%")
    print(f"  Abstention accuracy:                  {no_tool_correct}/{len(no_tool_tests)} = {abstention_frac * 100:.1f}%")
    print(f"  Hallucinations:                       {hallucinations}  (target = 0)")
    print(f"  Avg latency:                          {avg_lat:.1f}s")
    print()
    print(f"  {'Category':<22} {'Strict':>7} {'Partial':>8} {'Fail':>5} {'Pts/Tot':>10}  {'Pct':>6}")
    print(f"  {'-'*22} {'-'*7} {'-'*8} {'-'*5} {'-'*10}  {'-'*6}")
    for cat, s in sorted(category_stats.items()):
        pts_pct = ratio(s["points"], s["total"]) * 100
        print(f"  {cat:<22} {s['strict']:>7} {s['partial']:>8} {s['fail']:>5} "
              f"{s['points']:>5.1f}/{s['total']:<4}  {pts_pct:>5.1f}%")
    print("=" * 80)
    return {
        "strict": strict,
        "partial": partial,
        "wrong_tool": wt,
        "false_positive": fp,
        "false_negative": fn,
        "total": total,
        "total_points": total_points,
        "strict_pct": strict_frac,
        "weighted_pct": weighted_frac,
        "abstention_rate": abstention_frac,
        "hallucinations": hallucinations,
        "avg_latency": avg_lat,
        "category_stats": category_stats,
    }


def main():
    """Parse CLI options, run the evaluation, print summaries and write the report.

    Runs the TESTS suite ``--runs`` times, prints a per-run summary and, for
    more than one run, a multi-run stability summary. The outcome is written to
    ``results/eval300_report.json``. The process exits with status 0 when the
    last run's strict pass rate is at least 90%, otherwise with status 1.

    With ``--shuffle`` the RNG is seeded before every run and the seed is
    printed and stored in the report, so a given order can be reproduced by
    passing that value back through ``--seed``.
    """
    parser = argparse.ArgumentParser(description="Eval-300: 300-case standard evaluation for prism-coder")
    parser.add_argument("--model", type=str, default=None,
                        help="Ollama model tag to evaluate (default: prism-coder:4b-v43)")
    parser.add_argument("--runs", type=int, default=1,
                        help="Number of eval runs (default: 1; use 3 for stability check)")
    parser.add_argument("--shuffle", action="store_true",
                        help="Randomize test order each run")
    parser.add_argument("--seed", type=int, default=None,
                        help="Base RNG seed for --shuffle; run N uses seed + N - 1 "
                             "(default: a fresh random seed per run)")
    parser.add_argument("--no-validate-layer3", action="store_true",
                        help="Disable Layer 3 false-positive rejection "
                             "(use during RFT/DPO so model sees true failures)")
    args = parser.parse_args()
    if args.runs < 1:
        # With zero runs there is no summary to report; fail fast with a usage error.
        parser.error("--runs must be at least 1")

    global MODEL, validate_tool_call
    if args.model:
        MODEL = args.model

    if args.no_validate_layer3:
        # Rebind the module-level validator to a pass-through so run_once()
        # scores the model's raw tool call.
        def validate_tool_call(prompt, tool_name, tool_args):  # noqa: F811
            return tool_name, tool_args

    _verify_test_counts()

    print("=" * 80)
    print(f"  EVAL-300 — prism-coder standard evaluation")
    print(f"  Model:  {MODEL}")
    print(f"  Tests:  {len(TESTS)}")
    print(f"  Runs:   {args.runs}" + (" (RANDOMIZED ORDER each run)" if args.shuffle else ""))
    print(f"  Layer3: {'DISABLED' if args.no_validate_layer3 else 'enabled'}")
    print("=" * 80)

    all_run_summaries = []
    all_run_results = []
    run_seeds = []

    for run_idx in range(args.runs):
        run_label = str(run_idx + 1) if args.runs > 1 else ""

        run_seed = None
        if args.shuffle:
            # Seed the RNG with the value that gets printed, so the reported
            # seed really determines this run's order.
            if args.seed is not None:
                run_seed = args.seed + run_idx
            else:
                run_seed = random.randint(1000, 9999)
            random.seed(run_seed)
        run_seeds.append(run_seed)

        if args.runs > 1:
            print(f"\n{'#' * 80}")
            print(f"  RUN {run_idx + 1} / {args.runs}" +
                  (f"  (seed={run_seed})" if args.shuffle else ""))
            print(f"{'#' * 80}")
        elif args.shuffle:
            print(f"  Shuffle seed: {run_seed}")

        results, cat_stats = run_once(TESTS, shuffle=args.shuffle, run_label=run_label)
        summary = print_run_summary(results, cat_stats, run_label=run_label)
        all_run_summaries.append(summary)
        all_run_results.append(results)

    # Single run: the report summary is that run's summary. Replaced below by
    # the aggregate when several runs were made.
    final_summary = all_run_summaries[-1]

    # ---------------------------------------------------------------------------
    # Multi-run aggregate
    # ---------------------------------------------------------------------------
    if args.runs > 1:
        strict_scores = [s["strict"] for s in all_run_summaries]
        weighted_pcts = [s["weighted_pct"] * 100 for s in all_run_summaries]
        total = all_run_summaries[0]["total"]
        halluc_counts = [s["hallucinations"] for s in all_run_summaries]

        # Per-test stability
        per_test_pass = [0] * len(TESTS)
        per_test_fail_tools = [[] for _ in range(len(TESTS))]
        for run_results in all_run_results:
            for r in run_results:
                idx = r["id"] - 1
                if r["verdict"] == "strict_pass":
                    per_test_pass[idx] += 1
                else:
                    per_test_fail_tools[idx].append(r.get("got", "???"))

        med_strict = statistics.median(strict_scores)
        avg_strict = statistics.mean(strict_scores)
        med_weighted = statistics.median(weighted_pcts)

        print(f"\n{'=' * 80}")
        print(f"  MULTI-RUN SUMMARY ({args.runs} runs x {total} tests)")
        print(f"{'=' * 80}")
        print(f"  Strict scores:  {' | '.join(f'{s}/{total}' for s in strict_scores)}")
        print(f"  Median strict:  {med_strict}/{total} = {med_strict / total * 100:.1f}%")
        print(f"  Average strict: {avg_strict:.1f}/{total} = {avg_strict / total * 100:.1f}%")
        print(f"  Weighted pct:   {' | '.join(f'{p:.1f}%' for p in weighted_pcts)}  "
              f"(median {med_weighted:.1f}%)")
        print(f"  Hallucinations: {' | '.join(str(h) for h in halluc_counts)}  "
              f"(target = 0 each run)")
        print()
        print(f"  Flaky tests (< 100% pass rate across {args.runs} runs):")
        flaky = []
        for i, (prompt, expected, _, cat) in enumerate(TESTS):
            rate = per_test_pass[i] / args.runs
            if rate < 1.0:
                # Sorted, stringified tool names give a stable, always-joinable listing.
                fail_tools = sorted({str(t) for t in per_test_fail_tools[i]})
                flaky.append((i + 1, rate, expected, fail_tools, cat, prompt[:60]))
        if flaky:
            for fid, rate, exp, fails, fcat, fshort in sorted(flaky, key=lambda x: x[1]):
                print(f"    [{fid:03d}] {rate * 100:3.0f}% | cat={fcat:<18s} | expect={exp:<28s} | fails->{','.join(fails):<20s} | {fshort}")
        else:
            print("    All tests passed consistently across all runs!")
        print(f"  Total flaky: {len(flaky)}/{total}")
        print(f"{'=' * 80}")

        final_summary = {
            "runs": args.runs,
            "strict_scores": strict_scores,
            "median_strict": med_strict / total,
            "avg_strict": avg_strict / total,
            "median_weighted_pct": med_weighted / 100,
            "hallucinations_per_run": halluc_counts,
            "per_run_summaries": all_run_summaries,
        }

    # ---------------------------------------------------------------------------
    # Save JSON report
    # ---------------------------------------------------------------------------
    os.makedirs("results", exist_ok=True)
    report_path = "results/eval300_report.json"

    report = {
        "model": MODEL,
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "total_tests": len(TESTS),
        "runs": args.runs,
        "shuffle": args.shuffle,
        "seeds": run_seeds,
        "layer3_enabled": not args.no_validate_layer3,
        "summary": final_summary,
        "last_run_results": all_run_results[-1],
    }
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, default=str)
    print(f"\nReport saved: {report_path}")

    # Exit code: fail if last run strict < 90%
    last_strict_pct = all_run_summaries[-1]["strict_pct"] * 100
    if last_strict_pct < 90.0:
        print(f"FAIL: strict_pct {last_strict_pct:.1f}% is below 90% gate")
        sys.exit(1)
    else:
        print(f"PASS: strict_pct {last_strict_pct:.1f}%")
        sys.exit(0)


# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()