#!/usr/bin/env python3 """ analyze_swe_failures.py — Auto-generate surgical patch from swe_bench_report.json. Usage: python3 analyze_swe_failures.py \ --report results/swe_bench_report.json \ --version s2 \ --out /tmp/4b_swe_s2_data Reads the report, groups failures by type, generates 8-15 targeted variations per unique failure pattern, and writes train.jsonl + valid.jsonl. """ import argparse, json, random, re from pathlib import Path random.seed(None) # different seed each run SYS_PROMPT = ( "You are Synalux, a memory-augmented coding and clinical reasoning assistant. " "You have access to Prism Memory tools (session_save_ledger, session_load_context, " "session_search_memory, session_save_handoff, session_forget_memory, session_health_check, " "session_compact_ledger, session_export_memory, session_task_route, session_save_experience, " "session_synthesize_edges, session_backfill_links, knowledge_search, knowledge_forget, " "knowledge_upvote, knowledge_downvote, knowledge_set_retention) and 13 multimodal tool " "modules (image_gen, office, web_scraper, browser, tts, ocr, git, terminal, deps_scanner, " "hipaa, data_graph, templates, pdf_parser). " "Think step-by-step before answering. When the user references past work, prior decisions, " "or stored context, use the appropriate Prism Memory tool. " "Format tool calls inside ... JSON blocks with fields 'name' and 'arguments'. " "If no tool is needed, answer directly in plain text. " "ABSTAIN for general programming questions, CS concepts, greetings, and capability questions." ) PROJS = ["portal", "analytics", "billing", "auth-service", "dashboard", "prism-training", "api-gateway", "synalux", "ios-app", "prism-mcp"] def ex(user, tool, args): a = json.dumps(args, ensure_ascii=False) return {"text": ( f"<|im_start|>system\n{SYS_PROMPT}<|im_end|>\n" f"<|im_start|>user\n{user}<|im_end|>\n" f"<|im_start|>assistant\n" f"\n{{\"name\": \"{tool}\", \"arguments\": {a}}}\n\n<|im_end|>" )} def ex_abstain(user, reply): return {"text": ( f"<|im_start|>system\n{SYS_PROMPT}<|im_end|>\n" f"<|im_start|>user\n{user}<|im_end|>\n" f"<|im_start|>assistant\n{reply}<|im_end|>" )} # --------------------------------------------------------------------------- # Per-tool variation generators # --------------------------------------------------------------------------- VARIATIONS = { "session_task_route": [ "Should I handle this {task} locally or route to cloud?", "Is the on-device model capable of doing this {task}?", "Route this: {task}.", "Can the local agent handle {task}, or escalate to cloud?", "Local or cloud for this {task}?", "Is {task} too complex for the small model?", "Handle or escalate — {task}?", "Should I send this {task} to the bigger model?", "Which model should I use for {task}?", "Is this {task} within local model capacity?", ], "session_save_ledger": [ "Log what we accomplished today — {summary}.", "Jot down our session progress: {summary}.", "Write this down: {summary}.", "Save today's work to the ledger — {summary}.", "Note our progress for this session: {summary}.", "Record what we did: {summary}.", "Document today's session — {summary}.", "Keep a record: {summary}.", "Before I sign off, save that we {summary}.", "Session complete. Log: {summary}.", ], "session_search_memory": [ "Remind me — what did we decide about {topic}?", "Did we ever settle on {topic}?", "What was our conclusion on {topic}?", "Can you recall what we discussed about {topic}?", "Search my past sessions for notes on {topic}.", "Did we document our stance on {topic}?", "What did we agree on regarding {topic}?", "Find our previous discussion about {topic}.", "Do we have past notes on {topic}?", "What was decided about {topic} in a prior session?", ], "session_synthesize_edges": [ "Verify all session links are consistent for {proj}.", "Run a consistency check on session edges for {proj}.", "Synthesize the session graph for {proj}.", "Rebuild semantic connections between sessions in {proj}.", "Make sure the session edge graph is coherent for {proj}.", "Build fresh edge connections across {proj} sessions.", "Check that session node links are valid for {proj}.", "Verify session graph integrity for {proj}.", "Run a synthesis pass to update {proj} session edges.", "Before closing out, verify all links are consistent for {proj}.", ], "session_backfill_links": [ "Reconnect dangling session references for {proj}.", "Patch up link gaps in the {proj} session history.", "Fill missing cross-session references for {proj}.", "Backfill the missing links in {proj} session graph.", "Fix broken references in {proj} session history.", "There are orphaned sessions in {proj} — reconnect them.", "Some {proj} sessions are unlinked — patch them.", "Repair the dangling session refs for {proj}.", "Connect the disconnected sessions in {proj}.", "Fill in the missing session links for {proj}.", "Repair missing session links for {proj}.", "Fix the missing link references in {proj} sessions.", "Patch up missing refs in {proj} session history.", ], "session_health_check": [ "Is the Prism memory system healthy?", "Is the session database running correctly?", "Check if Prism MCP is up and responsive.", "Run a health check on the memory store.", "Ping the session DB — is it working?", "Verify the memory system is operational.", "Is the knowledge store responding?", "Do a status check on Prism.", "Is the Prism backend healthy?", "Health check: is session storage OK?", ], "knowledge_forget": [ "Delete all {category} knowledge entries for {proj}.", "Remove the knowledge base entries about {topic}.", "Wipe knowledge entries tagged '{category}'.", "Purge old {category} docs from the knowledge base.", "Clear {proj} entries from my knowledge store.", "Forget all documentation about {topic}.", "Remove stale knowledge about {topic}.", "Delete the accumulated notes on {topic} from the KB.", "Wipe out old {category} entries from the knowledge base.", "Remove knowledge entries for the deprecated {topic}.", "Forget the knowledge entry with ID {mid} — that approach is deprecated.", "Delete the knowledge base entry with ID {mid}.", "Remove knowledge entry {mid} from the KB.", ], "knowledge_search": [ "What do I know about {topic}?", "Search my knowledge base for {topic}.", "What's in my stored docs about {topic}?", "Look up {topic} in the knowledge base.", "Find my knowledge base entries on {topic}.", "Query the knowledge store for {topic}.", "What does the knowledge base say about {topic}?", "Check my stored documentation on {topic}.", "Pull up knowledge base notes on {topic}.", "Search the KB for {topic}.", "What knowledge do I have about {topic}?", "Retrieve knowledge base entries about {topic}.", "KB lookup: {topic}.", "What's stored in the knowledge base about {topic}?", "Find knowledge entries related to {topic}.", ], "session_compact_ledger": [ "The session ledger is too long — compact it.", "Shrink the ledger for {proj}.", "Archive old session entries from the ledger.", "Trim the session log — too many entries.", "Prune the {proj} session ledger.", "Compress the current ledger to save space.", "The {proj} ledger is bloated — archive it.", "Run a compaction on the session log.", "Clean up old entries in the ledger.", "Compact and archive old {proj} session entries.", ], "session_export_memory": [ "Export all memory to {path}.", "Dump everything to {path} as a backup.", "Save a memory backup to {path}.", "Archive my sessions to {path} in JSON format.", "Export {proj} project memory to {path}.", "Backup my Prism data to {path}.", "Write all sessions to {path} so I can restore later.", "Export a snapshot of memory to {path}.", "Dump the {proj} sessions to {path}.", "Create a full memory export at {path}.", ], "session_forget_memory": [ "Delete memory entry {mid}.", "Remove the entry with ID {mid}.", "Forget memory ID {mid} — it's wrong.", "Wipe memory entry {mid}.", "That entry {mid} is outdated — remove it.", "Delete the specific session entry: {mid}.", "Erase memory record {mid}.", "The entry ID {mid} was saved by mistake — delete it.", "Remove wrong entry {mid} from memory.", "Clear out entry {mid}.", ], "session_save_experience": [ "Record a milestone: {summary}.", "Mark this specific event: {summary}.", "Capture this breakthrough — {summary}.", "Add an experience entry: {summary}.", "Journal this success: {summary}.", "Log this correction: {summary}.", "Note this as a completed milestone for {proj}: {summary}.", "Record this key insight: {summary}.", "Save this specific event for {proj}: {summary}.", "Capture: we just {summary}.", "Log a milestone: {summary}.", "Record that we {summary}.", "Mark a success: {summary}.", "Save this moment — we {summary}.", "Document this specific achievement: {summary}.", "That was a win — {summary}.", "Checkpoint: we {summary}.", "Record this win for the team: {summary}.", "Mark this achievement: {summary}.", ], "session_save_handoff": [ "Shift change. Save the state for {proj} so the next agent can continue.", "Package up {proj} context for the incoming agent.", "Handoff notes for {proj}: {summary}.", "Before rotating out, save current {proj} state.", "The next agent needs context for {proj} — save it.", "End of shift — store handoff for {proj}.", "Store a handoff package for {proj}: {summary}.", "Another agent is taking over {proj} — preserve state.", "Handoff prep: summarize {proj} for the next session.", "Save session state for {proj} — agent rotation.", ], "session_load_context": [ "Load my previous work on {proj}.", "Recover context for {proj}.", "Where did we leave off with {proj}?", "Resume the {proj} session.", "Pull up everything we had on {proj}.", "Start a new session for {proj} — load context.", "Get me back up to speed on {proj}.", "Reload context for {proj} — continuing work.", "Restore {proj} session state.", "Initialize {proj} context — continuing from last time.", ], "knowledge_upvote": [ "Give a thumbs up to the {topic} knowledge entry.", "Boost the ranking on {topic} notes — they're accurate.", "The {topic} documentation is spot on — upvote it.", "Increase the importance score of the {topic} entry.", "Mark {topic} docs as highly useful.", "Rank up the {topic} entry — it's correct.", "Flag the {topic} knowledge as high quality.", "Upvote the stored doc on {topic}.", "The {topic} notes are gold — boost them.", "Promote the {topic} entry — it's been validated.", ], "knowledge_downvote": [ "Lower the ranking on the {topic} doc — it's wrong.", "The {topic} entry is outdated — mark it down.", "That {topic} knowledge is misleading — downvote it.", "Reduce the score on the {topic} notes.", "Flag the {topic} entry as inaccurate.", "Decrease importance of the {topic} knowledge entry.", "That {topic} doc is dangerous — give it a negative rating.", "The {topic} notes are terrible — lower their rank.", "Mark {topic} documentation as low quality.", "Demote the {topic} entry — it's incorrect.", "Reduce the rank of that outdated note about {topic}.", "That {topic} knowledge entry is wrong — use knowledge_downvote.", "Downvote the knowledge base entry about {topic}.", ], "knowledge_set_retention": [ "Set {proj} knowledge to expire after {days} days.", "Auto-expire {proj} docs in {days} days.", "Configure retention for {proj} knowledge: {days} days.", "Set a {days}-day TTL on {proj} knowledge entries.", "Make {proj} knowledge entries expire in {days} days.", "Schedule {proj} knowledge cleanup after {days} days.", "Configure {proj} knowledge to auto-purge in {days} days.", "Set knowledge retention policy for {proj}: {days} days.", "TTL setting: {proj} knowledge → {days} days.", "Auto-clean {proj} knowledge after {days} days.", ], "NO_TOOL": { # CS abstain variations - categorized by trigger word "save_pref": [ "How do I save user preferences in {lang}?", "What's the best way to persist user settings in {lang}?", "How do I store app preferences in {lang}?", "Saving state between sessions in {lang} — what approach?", "Best practice for saving user config in {lang}?", ], "search_algo": [ "Write a function to search {struct} using {algo}.", "Implement {algo} search on a {struct}.", "How do I search through a {struct} in Python?", "What's the fastest way to search a {struct}?", "Code a {struct} search using {algo}.", ], "export_db": [ "How do I export data from {db} to CSV?", "Export a {db} table to a file?", "How to dump {db} data to disk?", "Best way to export {db} records?", "How do I backup {db} data?", ], "route_framework": [ "How do I route tasks in {framework}?", "How does routing work in {framework}?", "Set up {framework} routing for different handlers.", "What's the routing mechanism in {framework}?", "How to configure {framework} task routing?", ], "explain_concept": [ "Explain Python's __slots__ and when to use it for memory optimization.", "What is the difference between deepcopy and shallow copy in Python?", "Explain the JavaScript event loop and microtask queue.", "Explain Go's garbage collector and write barriers.", "What is Rust's Pin and why is it needed for async futures?", "Explain stack vs. heap memory allocation and when each is appropriate.", "Explain how CRDTs achieve conflict-free distributed state.", "Explain eventual consistency in distributed systems.", "What is the time and space complexity of merge sort?", "How do topological sorts work on directed acyclic graphs?", "What is the difference between server components and client components in React?", "Explain the difference between concurrency and parallelism.", "What is the CAP theorem and how does it affect distributed system design?", "Explain how garbage collection works in the JVM.", "What are the SOLID principles in object-oriented design?", ], "language_feature": [ "How do I profile memory usage in a Python application?", "How do I debounce a function in JavaScript without lodash?", "How do I implement context cancellation in a Go HTTP server?", "How do I set up hot-module replacement in a Vite + React app?", "How do I use window functions in {db} to compute running totals?", "How do I implement memoization in Python?", "How do I use async generators in Python?", "How do I implement a custom React hook for data fetching?", "How do I handle errors in a Go goroutine?", "How do I use TypeScript generics with constraints?", "How do I implement dependency injection in Python?", "How do I use the Rust borrow checker correctly with lifetimes?", "How do I implement rate limiting in an Express.js server?", ], "implement_algo": [ "Write a function to detect cycles in a directed graph using DFS.", "Implement a least-recently-used (LRU) cache in Python.", "Write a binary search tree insertion function.", "Implement Dijkstra's shortest path algorithm.", "Write a function to serialize and deserialize a binary tree.", "Implement a thread-safe queue in Python.", "Write a function to find all permutations of a string.", "Implement a sliding window maximum algorithm.", "Write a function to check if two strings are anagrams.", "Implement a simple pub-sub event system in JavaScript.", ], "database_concept": [ "How do I use window functions in {db} to compute running totals?", "What is the difference between a clustered and non-clustered index?", "How does a {db} query planner choose an index?", "What is a covering index and when should I use one?", "How do I avoid N+1 queries in an ORM?", "What is the difference between INNER JOIN and LEFT JOIN?", "Explain ACID properties in database transactions.", "What is optimistic locking and when should I use it?", "How does {db} handle deadlocks?", "What is the difference between WHERE and HAVING in SQL?", ], }, } TASKS = ["TypeScript refactor", "SQL optimization", "test suite generation", "documentation pass", "regex patterns", "CSS layout fix", "GraphQL migration", "API endpoint review", "performance profiling", "3000-line code review", "linting across 200 files", "unit test writing", "auth middleware rewrite", "schema migration"] TOPICS = ["Redis vs Memcached", "auth token expiry", "logging library choice", "API error handling", "pagination strategy", "TypeScript strict mode", "database ORM choice", "caching strategy", "WebSocket reconnect", "rate limiting approach", "monolith vs microservices", "deploy strategy", "CI/CD pipeline choice"] CATEGORIES = ["debugging", "deprecated", "outdated", "old-deploy", "legacy", "v1-api"] PATHS = ["/tmp/prism-backup", "/tmp/mem-export", "/var/exports", "/tmp/backup", "/home/user/prism-data", "/tmp/sessions", "/tmp/prism-export"] SUMMARIES = ["fixed the OAuth flow", "completed the database migration", "finished the API refactor", "resolved the memory leak", "shipped the billing integration", "added retry logic", "completed TypeScript migration", "fixed the race condition", "deployed the hotfix", "migrated the auth module"] LANGS = ["React Native", "Flutter", "Swift", "Kotlin", "Electron"] STRUCTS = ["binary tree", "linked list", "hash map", "graph", "trie", "sorted array"] ALGOS = ["BFS", "DFS", "binary search", "A*", "Dijkstra's"] DBS = ["PostgreSQL", "MySQL", "SQLite", "MongoDB", "Redis"] FRAMEWORKS = ["Celery", "RQ", "Kafka", "Bull queue", "Sidekiq"] MIDS = ["mem-abc-123", "mem-portal-007", "sess-42-bad", "exp-001", "MEM-2025-0042", "err-cfg-88", "old-deploy-19", "ent-2024-991"] DAYS = [7, 14, 30, 45, 60, 90, 180] def gen_variations(tool, count, seed, failures): rng = random.Random(seed) rows = [] if tool == "NO_TOOL": cs_replies = { "save_pref": "Use AsyncStorage / SharedPreferences / localStorage for user prefs. This is a standard mobile/web storage pattern — no Prism tool needed.", "search_algo": "Implement the algorithm with standard code. No Prism tool call required for CS/algorithmic questions.", "export_db": "Use the database's native export command (e.g., `COPY ... TO '/tmp/out.csv'`). This is a database operation — no Prism session export.", "route_framework": "Configure routing in the framework's config file (e.g., `CELERY_TASK_ROUTES`). This is a framework configuration question — no Prism task routing.", "explain_concept": "This is a CS/programming concept question. Answer directly with technical knowledge — no Prism tool needed.", "language_feature": "This is a language-specific programming question. Answer from CS knowledge — no Prism tool call required.", "implement_algo": "Implement this with standard code. Pure coding task — no Prism memory tool needed.", "database_concept": "This is a database/SQL concept question. Answer directly — no Prism session or memory tool needed.", } # Include exact failing prompts as abstain training — oversampled 2× for targeted correction _abstain_reply = ( "This is a programming/CS question — answer directly with technical knowledge. " "No Prism memory tool needed for general coding and concept questions." ) for f in failures: if f.get("verdict") == "false_positive": exact_prompt = f["prompt"] for _ in range(12): rows.append(ex_abstain(exact_prompt, _abstain_reply)) # Also generate template-based variations for generalization for cat, tmpls in VARIATIONS["NO_TOOL"].items(): for _ in range(max(2, count // 4)): tmpl = rng.choice(tmpls) if "{lang}" in tmpl: msg = tmpl.format(lang=rng.choice(LANGS)) elif "{struct}" in tmpl and "{algo}" in tmpl: msg = tmpl.format(struct=rng.choice(STRUCTS), algo=rng.choice(ALGOS)) elif "{struct}" in tmpl: msg = tmpl.format(struct=rng.choice(STRUCTS)) elif "{db}" in tmpl: msg = tmpl.format(db=rng.choice(DBS)) elif "{framework}" in tmpl: msg = tmpl.format(framework=rng.choice(FRAMEWORKS)) else: msg = tmpl rows.append(ex_abstain(msg, cs_replies[cat])) return rows tmpls = VARIATIONS.get(tool, []) if not tmpls: return rows # Also include the exact failing prompts as training examples for f in failures: if f["expected"] == tool: exact_prompt = f["prompt"] args = _infer_args(tool, exact_prompt) rows.append(ex(exact_prompt, tool, args)) for i in range(count): tmpl = rng.choice(tmpls) proj = rng.choice(PROJS) task = rng.choice(TASKS) topic = rng.choice(TOPICS) cat = rng.choice(CATEGORIES) path = rng.choice(PATHS) summary = rng.choice(SUMMARIES) mid = rng.choice(MIDS) msg = (tmpl .replace("{proj}", proj) .replace("{task}", task) .replace("{topic}", topic) .replace("{category}", cat) .replace("{path}", path) .replace("{summary}", summary) .replace("{mid}", mid)) days = rng.choice(DAYS) msg = msg.replace("{days}", str(days)) args = _build_args(tool, proj, task, topic, cat, path, summary, mid, days) rows.append(ex(msg, tool, args)) return rows def _build_args(tool, proj, task, topic, cat, path, summary, mid, days=30): if tool == "session_task_route": return {"task_description": task} if tool == "session_save_ledger": return {"project": proj, "content": summary} if tool == "session_save_experience": return {"description": summary} if tool == "session_save_handoff": return {"project": proj, "summary": summary} if tool == "session_search_memory": return {"query": topic} if tool == "session_load_context": return {"project": proj} if tool == "session_synthesize_edges": return {"project": proj} if tool == "session_backfill_links": return {"project": proj} if tool == "session_health_check": return {} if tool == "session_compact_ledger": return {"project": proj} if tool == "session_export_memory": return {"output_dir": path, "format": "json"} if tool == "session_forget_memory": return {"memory_id": mid} if tool == "knowledge_search": return {"query": topic} if tool == "knowledge_forget": return {"project": proj, "category": cat, "entry_id": mid} if tool == "knowledge_upvote": return {} if tool == "knowledge_downvote": return {} if tool == "knowledge_set_retention": return {"project": proj, "retention_days": days} return {} def _infer_args(tool, prompt): """Best-effort argument extraction from prompt text for exact failure examples.""" args = _build_args(tool, random.choice(PROJS), random.choice(TASKS), random.choice(TOPICS), random.choice(CATEGORIES), random.choice(PATHS), random.choice(SUMMARIES), random.choice(MIDS), random.choice(DAYS)) # Try to extract project name proj_tools = ("session_synthesize_edges", "session_backfill_links", "session_compact_ledger", "session_load_context", "session_export_memory", "knowledge_forget") m = re.search(r'\bfor\s+(?:the\s+)?(\w[\w-]*)\s+project', prompt, re.I) if m and tool in proj_tools: args["project"] = m.group(1) # Also match "the X project ledger/log/history" (e.g. "metrics project ledger") m2 = re.search(r'\b(?:the\s+)?(\w[\w-]*)\s+project\s+(?:ledger|log|history)', prompt, re.I) if m2 and tool in proj_tools and "project" not in args: args["project"] = m2.group(1) # Extract memory ID m_id = re.search(r'\b(mem-[a-z0-9-]+|ent-\d+-\w+|sess-\d+-\w+|exp-\d+|MEM-\d+-\d+)', prompt) if m_id and tool == "session_forget_memory": args["memory_id"] = m_id.group(1) # Extract path m_path = re.search(r'(/tmp/[^\s,.]+|/var/[^\s,.]+|/home/[^\s,.]+)', prompt) if m_path and tool == "session_export_memory": args["output_dir"] = m_path.group(1) return args def main(): ap = argparse.ArgumentParser() ap.add_argument("--report", default="results/swe_bench_report.json") ap.add_argument("--version", default="s2") ap.add_argument("--out", default=None) ap.add_argument("--seed", type=int, default=None) args = ap.parse_args() seed = args.seed or random.randint(1000, 9999) random.seed(seed) print(f"Seed: {seed}") report = json.loads(Path(args.report).read_text()) # Support both swe_bench_report.json (report["results"]) # and eval300_report.json (report["last_run_results"]) if "last_run_results" in report: results = report["last_run_results"] else: results = report["results"] out_dir = Path(args.out or f"/tmp/4b_swe_{args.version}_data") out_dir.mkdir(parents=True, exist_ok=True) # Group failures by correct tool wrong_tool = {} # expected_tool → list of failures fp_count = 0 # false positives (should be NO_TOOL) fn_count = 0 # false negatives (called NO_TOOL but shouldn't) partial = {} # expected_tool → count for r in results: verdict = r["verdict"] expected = r["expected"] if verdict == "false_positive": fp_count += 1 elif verdict == "false_negative": t = r["expected"] wrong_tool.setdefault(t, []).append(r) elif verdict == "wrong_tool": t = r["expected"] wrong_tool.setdefault(t, []).append(r) elif verdict == "partial_pass": partial.setdefault(expected, []) partial[expected].append(r) total_failures = sum(len(v) for v in wrong_tool.values()) + fp_count + fn_count print(f"\nFailure analysis:") print(f" False positives (over-trigger): {fp_count}") print(f" Wrong tool: {sum(len(v) for v in wrong_tool.values())}") print(f" Partial (missing params): {sum(len(v) for v in partial.values())}") print(f" Tool-specific: {sorted(wrong_tool.keys())}") rows = [] rng_seed = seed # 1. False positives → more CS abstain examples if fp_count > 0: n = max(12, fp_count * 5) rows.extend(gen_variations("NO_TOOL", n, rng_seed, [r for r in results if r["verdict"] == "false_positive"])) rng_seed += 1 print(f" Generated {n} NO_TOOL abstain rows") # 2. Wrong tool → targeted tool examples for tool, failures in wrong_tool.items(): n = max(10, len(failures) * 8) new_rows = gen_variations(tool, n, rng_seed, failures) rows.extend(new_rows) rng_seed += 1 print(f" Generated {len(new_rows)} rows for {tool} ({len(failures)} failures)") # 2b. Wrong tool exact-prompt inclusion — 3× each to drive exact-match correction for tool, failures in wrong_tool.items(): for f in failures: exact_prompt = f["prompt"] inferred_args = _infer_args(tool, exact_prompt) for _ in range(3): rows.append(ex(exact_prompt, tool, inferred_args)) # 3. Partial passes → more examples with correct params for tool, failures in partial.items(): n = max(6, len(failures) * 4) new_rows = gen_variations(tool, n, rng_seed, failures) rows.extend(new_rows) rng_seed += 1 print(f" Generated {len(new_rows)} param-complete rows for {tool}") # 4. Always include a small reinforcement set for commonly confused tools for tool in ["session_synthesize_edges", "session_backfill_links", "session_health_check", "session_save_experience", "session_save_handoff", "session_load_context", "knowledge_upvote", "knowledge_downvote", "knowledge_set_retention", "knowledge_search"]: rows.extend(gen_variations(tool, 5, rng_seed, [])) rng_seed += 1 # 5. BFCL anchoring — broad routing coverage to prevent regression # Each round of narrow SWE patches can shift weights; anchors keep overall routing stable. bfcl_anchors = [ # ── save_ledger ── ex("Note: fixed the routing bug in prism-mcp", "session_save_ledger", {"project": "prism-mcp", "summary": "Fixed routing bug in prism-mcp"}), ex("Jot down: BFCL eval passed 94% on 4b-swe1", "session_save_ledger", {"project": "prism-training", "summary": "BFCL passed 94% on 4b-swe1"}), ex("Log: merged 4B swe1 adapter successfully", "session_save_ledger", {"project": "prism-training", "summary": "Merged 4B swe1 adapter"}), ex("Capture this: authentication service migrated to JWT", "session_save_ledger", {"project": "auth-service", "summary": "Authentication service migrated to JWT"}), # ── search_memory ── ex("What did we discuss about training last session?", "session_search_memory", {"query": "training"}), ex("Find past notes about the iOS llama.cpp integration", "session_search_memory", {"query": "iOS llama.cpp integration"}), ex("What have I previously recorded about BFCL failures?", "session_search_memory", {"query": "BFCL failures"}), ex("Show me my past session notes on cascade design", "session_search_memory", {"query": "cascade design"}), # ── load_context ── ex("Load context for prism-mcp", "session_load_context", {"project": "prism-mcp"}), ex("Fetch session context for prism-training", "session_load_context", {"project": "prism-training"}), # ── save_handoff ── ex("Handoff to next agent: 4b-swe1 passed BFCL 94%, proceed to round 2", "session_save_handoff", {"project": "prism-training", "summary": "4b-swe1 passed BFCL 94%, proceed to round 2"}), ex("Next session prep: iOS llama.cpp integration pending", "session_save_handoff", {"project": "prism-aac", "summary": "iOS llama.cpp integration pending"}), # ── compact_ledger ── ex("Compact the ledger for prism-mcp", "session_compact_ledger", {"project": "prism-mcp"}), ex("Prune the session ledger for prism-training", "session_compact_ledger", {"project": "prism-training"}), # ── knowledge_search ── ex("What do I know about BFCL evaluation?", "knowledge_search", {"query": "BFCL evaluation"}), ex("Search my knowledge base for MLX training notes", "knowledge_search", {"query": "MLX training notes"}), ex("What's in my stored docs about cascade fallback?", "knowledge_search", {"query": "cascade fallback"}), # ── Disambiguation pairs: synthesize_edges vs backfill_links ── ex("Verify all session graph edges are consistent for auth-service", "session_synthesize_edges", {"project": "auth-service"}), ex("Reconnect the dangling session references for the portal project", "session_backfill_links", {"project": "portal"}), ex("Make sure the session graph is fully coherent for prism-mcp", "session_synthesize_edges", {"project": "prism-mcp"}), ex("Patch up missing cross-session links for the billing project", "session_backfill_links", {"project": "billing"}), # ── Disambiguation pairs: save_ledger vs save_experience vs save_handoff ── ex("Log what we did today: shipped the billing module", "session_save_ledger", {"project": "billing", "content": "shipped the billing module"}), ex("Record a milestone: we launched the new auth flow in production", "session_save_experience", {"description": "launched new auth flow in production"}), ex("Shift change — save state for api-gateway so next agent can pick up", "session_save_handoff", {"project": "api-gateway", "summary": "in progress"}), ex("Write down that we finished the payment reconciliation refactor", "session_save_ledger", {"project": "portal", "content": "finished payment reconciliation refactor"}), ex("Mark a success: fixed the notorious N+1 query on orders endpoint", "session_save_experience", {"description": "fixed N+1 query on orders endpoint"}), # ── Disambiguation pairs: load_context vs search_memory ── ex("Bring me back into the context of the payments-gateway project", "session_load_context", {"project": "payments-gateway"}), ex("Look for any notes we made about the GraphQL schema decisions", "session_search_memory", {"query": "GraphQL schema decisions"}), # ── Disambiguation pairs: knowledge_upvote vs downvote vs set_retention vs forget ── ex("That Redis distributed locks doc is spot on — upvote it", "knowledge_upvote", {}), ex("The XML-over-JSON note is wrong — mark it down", "knowledge_downvote", {}), ex("Increase the importance of the circuit-breaker patterns entry", "knowledge_upvote", {}), ex("Lower the rank of that outdated MD5 hashing note", "knowledge_downvote", {}), ex("Auto-expire the sandbox project knowledge after 30 days", "knowledge_set_retention", {"project": "sandbox", "retention_days": 30}), ex("Delete all deprecated-api entries from the portal knowledge base", "knowledge_forget", {"project": "portal", "category": "deprecated-api"}), # ── Disambiguation: compact_ledger vs session_forget_memory ── # compact_ledger = shrink/prune/archive the ENTIRE ledger ex("The billing-service ledger is bloated — prune old entries", "session_compact_ledger", {"project": "billing-service"}), ex("Archive old entries for the analytics-backend", "session_compact_ledger", {"project": "analytics-backend"}), # forget_memory = wipe/delete a SPECIFIC WRONG ENTRY from the ledger ex("Wipe the incorrect ledger note that said we shipped v2.1 — we didn't.", "session_forget_memory", {"memory_id": "mem-portal-007"}), ex("Delete the wrong ledger entry — it says the API is deprecated but it isn't.", "session_forget_memory", {"memory_id": "mem-abc-123"}), ex("Remove that incorrect note from the ledger — the deployment never happened.", "session_forget_memory", {"memory_id": "err-cfg-88"}), ex("That ledger note is wrong — delete it: it says v3 was released but we're still on v2.", "session_forget_memory", {"memory_id": "old-deploy-19"}), # ── NO_TOOL: CS/programming abstains ── ex_abstain("How do you implement a binary search tree in Python?", "A binary search tree stores values in nodes where left < node < right. " "Here's a minimal Python implementation:\n```python\nclass Node:\n def __init__(self, val):\n self.val, self.left, self.right = val, None, None\n```"), ex_abstain("What is the time complexity of quicksort?", "Quicksort has O(n log n) average and O(n²) worst-case time complexity."), ex_abstain("Explain the difference between REST and GraphQL", "REST uses fixed endpoints per resource; GraphQL uses a single endpoint " "with a query language letting clients specify exactly what data they need."), ex_abstain("Write a Python function to flatten a nested list", "```python\ndef flatten(lst):\n return [x for sub in lst for x in (flatten(sub) if isinstance(sub, list) else [sub])]\n```"), ex_abstain("What is dependency injection?", "Dependency injection passes dependencies (services, objects) into a class " "from the outside rather than creating them internally, improving testability."), ex_abstain("How does session management work in web apps?", "Session management stores user state between requests. Common approaches: " "cookie-based sessions, JWT tokens, or server-side session stores like Redis."), # ── NO_TOOL: weather, SQL, introspection, closing phrase guards ── ex_abstain("What's the weather like today?", "I don't have access to weather data or a weather tool. " "Please check a weather service or app for current conditions."), ex_abstain("Write a SQL query that finds the second-highest salary in an employees table.", "```sql\nSELECT MAX(salary) FROM employees\nWHERE salary < (SELECT MAX(salary) FROM employees);\n```\n" "This is a pure SQL question — no Prism tool needed."), ex_abstain("What is the current stock price of Apple?", "I don't have real-time market data. Check a financial app or website for current prices."), ex_abstain("What time is it in Tokyo right now?", "I don't have real-time clock access. UTC+9 (JST) — add 9 hours to UTC."), ex_abstain("Explain stack vs. heap memory allocation and when each is appropriate.", "Stack: fast, auto-managed, LIFO, stores local variables and function frames. " "Heap: dynamic, larger, GC-managed or manual. Use stack for small/short-lived data, heap for large/long-lived objects."), ex_abstain("What tools do you have available?", "I have access to Prism Memory tools: session_save_ledger, session_load_context, " "session_search_memory, session_save_handoff, session_compact_ledger, " "knowledge_search, knowledge_forget, knowledge_upvote, knowledge_downvote, " "knowledge_set_retention, and others. Plus 13 multimodal modules."), ex_abstain("What are your capabilities?", "I'm Synalux — a memory-augmented coding and clinical reasoning assistant. " "I can save/load session state, search session history and knowledge base, " "route tasks, and use 13 multimodal tool modules."), ex_abstain("Great work today, goodbye.", "Thanks — have a great session! Let me know if you'd like to save a ledger note before you go."), ex_abstain("Thanks, that's all for now.", "You're welcome! Session ends here unless you want to save progress to the ledger."), # ── MASTER DISAMBIGUATION SECTION (always included every round) ── # GROUP A: save_experience vs knowledge_upvote # KEY PATTERN: "Mark a success: we [verb] [thing]" = team achievement → save_experience # "Mark [doc/entry/guide] as [quality]" = rate document → knowledge_upvote # ID 136 stubborn case: "Mark a success: we fixed the notorious N+1 query on the orders endpoint." ex("Mark a success: we fixed the notorious N+1 query on the orders endpoint.", "session_save_experience", {"description": "fixed notorious N+1 query on orders endpoint"}), ex("Mark a success: we tracked down the N+1 query hitting the orders endpoint.", "session_save_experience", {"description": "tracked down N+1 query on orders endpoint"}), ex("Mark a success: we eliminated the N+1 query causing slowdowns on orders.", "session_save_experience", {"description": "eliminated N+1 query causing slowdowns on orders"}), ex("Mark a success: we resolved the memory leak in the billing service.", "session_save_experience", {"description": "resolved memory leak in billing service"}), ex("Mark a success: we eliminated the race condition in the payment processor.", "session_save_experience", {"description": "eliminated race condition in payment processor"}), ex("Mark a success: we shipped dark mode after 4 sprints of effort.", "session_save_experience", {"description": "shipped dark mode after 4 sprints"}), ex("Mark a success: we resolved the data corruption bug in prod.", "session_save_experience", {"description": "resolved data corruption bug in prod"}), ex("Mark a win: we deployed the new microservice without downtime.", "session_save_experience", {"description": "deployed new microservice without downtime"}), ex("Mark a success: we achieved zero-downtime migration of the database.", "session_save_experience", {"description": "zero-downtime database migration achieved"}), ex("Mark a success: we fixed the authentication bypass vulnerability.", "session_save_experience", {"description": "fixed authentication bypass vulnerability"}), # Contrast: "Mark [doc/article/guide/entry] as [quality]" → knowledge_upvote (NOT save_experience) ex("Mark this knowledge entry as highly accurate — it's spot on.", "knowledge_upvote", {}), ex("Mark the Redis distributed locks doc as excellent quality.", "knowledge_upvote", {}), ex("That circuit-breaker patterns entry is gold — mark it as highly useful.", "knowledge_upvote", {}), ex("Mark the retry-with-jitter doc as a top-quality resource.", "knowledge_upvote", {}), ex("Mark the N+1 query anti-pattern guide as an excellent resource.", "knowledge_upvote", {}), ex("That N+1 query detection article is spot on — mark it as high quality.", "knowledge_upvote", {}), # GROUP B: synthesize_edges vs backfill_links (cascade first step) # "A then B" → first action = what to call ex("Synthesize edges for the audit-log project, then backfill any missing links.", "session_synthesize_edges", {"project": "audit-log"}), ex("Synthesize session edges for billing, then patch up any dangling refs.", "session_synthesize_edges", {"project": "billing"}), ex("Run edge synthesis for portal project, then backfill whatever is missing.", "session_synthesize_edges", {"project": "portal"}), # Contrast: backfill first → backfill_links ex("Backfill any missing links for the audit-log project, then synthesize edges.", "session_backfill_links", {"project": "audit-log"}), ex("Patch up dangling refs for portal first, then synthesize session edges.", "session_backfill_links", {"project": "portal"}), # GROUP C: forget_memory vs compact_ledger (specific entry vs bulk operation) # forget_memory = delete ONE specific wrong/incorrect entry # compact_ledger = shrink/archive the whole ledger ex("Wipe the incorrect ledger note that said we shipped v2.1 — we didn't.", "session_forget_memory", {"memory_id": "mem-portal-007"}), ex("Delete the wrong ledger entry that says the migration is complete.", "session_forget_memory", {"memory_id": "mem-abc-123"}), ex("Remove that incorrect note from the ledger — it has a false claim.", "session_forget_memory", {"memory_id": "err-cfg-88"}), ex("That ledger note is wrong — delete it: it says v3 shipped but we're still v2.", "session_forget_memory", {"memory_id": "old-deploy-19"}), # Contrast: bulk ledger cleanup → compact_ledger (NOT forget_memory) ex("The billing project ledger is bloated — compact it.", "session_compact_ledger", {"project": "billing"}), ex("Prune old entries from the portal project ledger.", "session_compact_ledger", {"project": "portal"}), # ── save_experience vs save_ledger disambiguation ── ex("Log a milestone: we zero-downtime deployed the new search index.", "session_save_experience", {"description": "zero-downtime deployed the new search index"}), ex("Record that we fixed the race condition in the WebSocket handler.", "session_save_experience", {"description": "fixed race condition in WebSocket handler"}), ex("Mark a success: completed the billing module migration.", "session_save_experience", {"description": "completed billing module migration"}), ex("Write down our session progress: three endpoints migrated.", "session_save_ledger", {"project": "api-gateway", "content": "three endpoints migrated"}), # ── forget_memory for specific wrong entries ── ex("Wipe the incorrect ledger note that said we shipped v2.1 — we didn't.", "session_forget_memory", {"memory_id": "mem-portal-007"}), ex("Delete the entry that incorrectly says the migration is complete.", "session_forget_memory", {"memory_id": "mem-abc-123"}), # ── backfill_links: repair + missing + session links ── ex("Repair missing session links for the api-gateway project.", "session_backfill_links", {"project": "api-gateway"}), ex("Fix the missing link references in billing session history.", "session_backfill_links", {"project": "billing"}), ex("Repair missing session links for the billing project.", "session_backfill_links", {"project": "billing"}), # ── synthesize_edges: must use FULL tool name session_synthesize_edges ── ex("Run a synthesis pass to validate all edges are up to date for the orchestration project.", "session_synthesize_edges", {"project": "orchestration"}), ex("Synthesize edges for the audit-log project.", "session_synthesize_edges", {"project": "audit-log"}), ex("Run a synthesis pass to update portal session edges.", "session_synthesize_edges", {"project": "portal"}), # ── save_experience: Log a milestone / Record that we ── ex("Log a milestone: we successfully zero-downtime-deployed the new search index.", "session_save_experience", {"description": "zero-downtime deployed new search index"}), ex("Record that we fixed the race condition in the WebSocket handler.", "session_save_experience", {"description": "fixed race condition in WebSocket handler"}), ex("Log a milestone: completed the auth migration without downtime.", "session_save_experience", {"description": "completed auth migration without downtime"}), # ── search_memory: pull up notes → search, not load_context ── ex("Pull up any notes we saved about the gRPC migration.", "session_search_memory", {"query": "gRPC migration"}), ex("Pull up our notes on the database sharding strategy.", "session_search_memory", {"query": "database sharding strategy"}), # ── knowledge_downvote: must use knowledge_ prefix, not session_ ── ex("Reduce the rank of that outdated note about using MD5 for hashing.", "knowledge_downvote", {}), ex("The advice on using MD5 for passwords is dangerous — downvote it.", "knowledge_downvote", {}), ex("Mark down the knowledge entry recommending polling over webhooks.", "knowledge_downvote", {}), # ── knowledge_forget: ID-based delete of KNOWLEDGE entries ── ex("Forget the knowledge entry with ID ki-cc44-gg — that approach is deprecated.", "knowledge_forget", {"project": "portal", "category": "deprecated", "entry_id": "ki-cc44-gg"}), ex("Delete the knowledge base entry with ID kb-old-007.", "knowledge_forget", {"project": "portal", "category": "deprecated", "entry_id": "kb-old-007"}), # ── NO_TOOL: general recommendation / book questions ── ex_abstain("Can you recommend a good book?", "I don't have a reading recommendation tool. Some popular picks: Clean Code (Martin), " "Designing Data-Intensive Applications (Kleppmann), The Pragmatic Programmer (Hunt & Thomas)."), ex_abstain("What is a good programming book for beginners?", "For beginners: 'Automate the Boring Stuff with Python' by Al Sweigart (free online) " "or 'Python Crash Course' by Eric Matthes. No tool needed — this is a general question."), ex_abstain("Can you suggest a movie to watch?", "I don't have movie recommendation capabilities. No tool to call — this is a general request."), # ── ROUND 8/9 EXACT FAILURE ANCHORS ── # ID 41: "Record that we fixed X — took N hours" → save_experience (NOT save_ledger) # KEY: "Record that we [past-tense achievement] — effort comment" = milestone event = save_experience ex("Record that we fixed the race condition in the WebSocket handler — took 4 hours but it's solid now.", "session_save_experience", {"description": "fixed race condition in WebSocket handler — took 4 hours"}), ex("Record that we squashed the deadlock in the job queue — 6 hours of debugging but it's resolved.", "session_save_experience", {"description": "squashed deadlock in job queue after 6 hours of debugging"}), ex("Record that we patched the SSRF vulnerability in the proxy — critical fix, took 2 days.", "session_save_experience", {"description": "patched SSRF vulnerability in proxy after 2 days"}), ex("Record that we fixed the memory leak in the image processor — production is stable now.", "session_save_experience", {"description": "fixed memory leak in image processor"}), ex("Record that we solved the flaky test — the root cause was a timing issue.", "session_save_experience", {"description": "solved flaky test root cause timing issue"}), ex("Note that we fixed the authentication bypass — security incident resolved.", "session_save_experience", {"description": "fixed authentication bypass security incident"}), # Contrast: general progress summary → save_ledger (NOT save_experience) ex("Record what we did today: reviewed PR stack and merged 3 PRs.", "session_save_ledger", {"project": "portal", "content": "reviewed PR stack and merged 3 PRs"}), ex("Log our session progress: migrated 4 of 8 endpoints so far.", "session_save_ledger", {"project": "api-gateway", "content": "migrated 4 of 8 endpoints"}), ex("Jot down what we accomplished: wrote unit tests for the auth module.", "session_save_ledger", {"project": "auth-service", "content": "wrote unit tests for auth module"}), # ID 136: "Mark a success: we fixed..." → save_experience (NOT knowledge_upvote) ex("Mark a success: we fixed the notorious N+1 query on the orders endpoint.", "session_save_experience", {"description": "fixed notorious N+1 query on orders endpoint"}), ex("Mark a success: we eliminated the flaky test suite after 2 weeks of effort.", "session_save_experience", {"description": "eliminated flaky test suite after 2 weeks of effort"}), ex("Mark a success: the cold-start latency dropped from 4s to 800ms.", "session_save_experience", {"description": "cold-start latency dropped from 4s to 800ms"}), ex("Mark a success: we shipped the dark mode feature after 4 sprints.", "session_save_experience", {"description": "shipped dark mode feature after 4 sprints"}), # Contrast: knowledge quality (entry in KB) → upvote (NOT save_experience) ex("The circuit-breaker patterns entry in the KB is accurate — boost its importance.", "knowledge_upvote", {}), ex("That note about retry-with-jitter is gold — upvote it in the knowledge base.", "knowledge_upvote", {}), ex("The microservices best practices doc is excellent — mark it highly useful.", "knowledge_upvote", {}), # ID 24: "Pull up any notes we saved about X" → search_memory (NOT load_context) ex("Pull up any notes we saved about the gRPC migration.", "session_search_memory", {"query": "gRPC migration"}), ex("Pull up any notes we saved about the API rate limiting decisions.", "session_search_memory", {"query": "API rate limiting decisions"}), ex("Pull up our notes on the database sharding strategy.", "session_search_memory", {"query": "database sharding strategy"}), ex("Pull up what we discussed about the WebSocket reconnect logic.", "session_search_memory", {"query": "WebSocket reconnect logic"}), ex("Pull up our saved notes on the caching architecture.", "session_search_memory", {"query": "caching architecture"}), # Contrast: load project context (not specific notes) → load_context ex("Load the context for the billing-service project.", "session_load_context", {"project": "billing-service"}), ex("Get me back up to speed on the API gateway project.", "session_load_context", {"project": "api-gateway"}), ex("Resume the analytics project — load context.", "session_load_context", {"project": "analytics"}), # ID 249: cascade "Synthesize then backfill" → FIRST step = synthesize_edges # CRITICAL: "synthesize edges for X, then backfill" → session_synthesize_edges (first action wins) ex("Synthesize edges for the audit-log project, then backfill any missing links.", "session_synthesize_edges", {"project": "audit-log"}), ex("Synthesize edges for the audit-log project, then backfill any missing links.", "session_synthesize_edges", {"project": "audit-log"}), ex("Synthesize edges for the audit-log project, then backfill any missing links.", "session_synthesize_edges", {"project": "audit-log"}), ex("Rebuild session graph edges for analytics first, then fill in any dangling refs.", "session_synthesize_edges", {"project": "analytics"}), ex("First synthesize session edges for billing, then patch missing cross-refs.", "session_synthesize_edges", {"project": "billing"}), ex("Run a synthesis pass on the portal project edges, then backfill whatever is missing.", "session_synthesize_edges", {"project": "portal"}), ex("Verify edge consistency for the auth-service, then reconnect any dangling refs.", "session_synthesize_edges", {"project": "auth-service"}), ex("Synthesize the session graph for prism-mcp, then fill in missing link refs.", "session_synthesize_edges", {"project": "prism-mcp"}), ex("Run edge synthesis for the dashboard project, then backfill any gaps.", "session_synthesize_edges", {"project": "dashboard"}), ex("Synthesize all session edges for api-gateway, then patch up any dangling links.", "session_synthesize_edges", {"project": "api-gateway"}), ex("First run session_synthesize_edges for the billing project, then backfill.", "session_synthesize_edges", {"project": "billing"}), # Contrast: backfill is explicitly the first step → backfill_links ex("Backfill missing links for dashboard first, then run a health check.", "session_backfill_links", {"project": "dashboard"}), ex("Fill in dangling session refs for prism-mcp, then synthesize edges.", "session_backfill_links", {"project": "prism-mcp"}), ex("Reconnect broken session links for analytics, then verify edge consistency.", "session_backfill_links", {"project": "analytics"}), # CRITICAL CONTRASTIVE PAIR for ID 249: # "Synthesize edges for X, then backfill" → synthesize (first action) # "Backfill links for X, then synthesize" → backfill (first action) ex("Synthesize edges for the audit-log project, then backfill any missing links.", "session_synthesize_edges", {"project": "audit-log"}), ex("Backfill any missing links for the audit-log project, then synthesize edges.", "session_backfill_links", {"project": "audit-log"}), ex("Synthesize session edges for analytics, then backfill any missing cross-refs.", "session_synthesize_edges", {"project": "analytics"}), ex("Backfill any missing cross-refs for analytics, then synthesize session edges.", "session_backfill_links", {"project": "analytics"}), ex("Synthesize edges for billing first, then patch up any dangling links.", "session_synthesize_edges", {"project": "billing"}), ex("Patch up any dangling links for billing first, then synthesize edges.", "session_backfill_links", {"project": "billing"}), # ID 252: "compact the X project ledger" → compact_ledger with project="X" ex("Compact the metrics project ledger, then export it to /tmp/metrics-backup.", "session_compact_ledger", {"project": "metrics"}), ex("Compact the billing project ledger, then dump a backup to /tmp/billing-export.", "session_compact_ledger", {"project": "billing"}), ex("Prune the analytics project ledger — it's getting too long.", "session_compact_ledger", {"project": "analytics"}), ex("Shrink the api-gateway project ledger.", "session_compact_ledger", {"project": "api-gateway"}), ex("Archive old entries in the prism-training project ledger.", "session_compact_ledger", {"project": "prism-training"}), # ── GROUP D: Regression fixes (IDs 35, 168, 204) ── # ID 35: "Export everything to /tmp/..." → session_export_memory (not abstain) # "Export" + filesystem path = export session data, even with "everything" ex("Export everything to /tmp/prism-dump so I can archive it.", "session_export_memory", {"path": "/tmp/prism-dump"}), ex("Export everything to /tmp/backup/ for archiving.", "session_export_memory", {"path": "/tmp/backup"}), ex("Dump all session data to /tmp/export-20240601 for safekeeping.", "session_export_memory", {"path": "/tmp/export-20240601"}), ex("Export the full session to /tmp/prism-export.json so I can back it up.", "session_export_memory", {"path": "/tmp/prism-export.json"}), # ID 168: "Route this." → session_task_route (short imperative, not save_handoff) # "Route" alone = decide local vs cloud routing; NOT handing off to next session ex("Route this.", "session_task_route", {}), ex("Route this task.", "session_task_route", {}), ex("Route this request.", "session_task_route", {}), ex("Route it.", "session_task_route", {}), # Contrast: save_handoff = save state for NEXT session, not route a task ex("Save a handoff note for the next session so they know where we left off.", "session_save_handoff", {"project": "api-gateway", "summary": "in progress"}), ex("Create a handoff so the next agent knows the current state.", "session_save_handoff", {"project": "portal", "summary": "handoff state"}), # ── GROUP E: Adversarial trap / capability questions → ALWAYS abstain ── # CS/programming questions with technical jargon: NEVER call a tool ex_abstain("How does TypeScript's discriminated union type work?", "TypeScript discriminated unions use a shared literal type property as a discriminant. " "This is a programming concept — no Prism tool needed."), ex_abstain("Explain how CRDTs achieve conflict-free distributed state.", "CRDTs (Conflict-free Replicated Data Types) use mathematical structures that guarantee " "convergence regardless of operation order. This is a CS concept — no tool needed."), ex_abstain("What is a discriminated union in TypeScript?", "A TypeScript type narrowing pattern using a shared literal property. Pure CS knowledge."), ex_abstain("Explain eventual consistency in distributed systems.", "Eventual consistency means replicas converge to the same state eventually. CS concept — abstain."), # Capability questions: explain what the system CAN DO → abstain (no tool) ex_abstain("What are your capabilities?", "I can help with coding, clinical reasoning, and memory management via Prism tools. " "No tool needed to answer a capability question."), ex_abstain("Can you explain what Prism Memory tools do?", "Prism Memory tools let you save, search, and manage session context and knowledge. " "No tool needed — this is a capability question."), ex_abstain("What can you do?", "I'm a memory-augmented assistant supporting coding, ABA clinical tasks, and Prism memory ops. " "No tool needed — capability question."), ex_abstain("What tools do you have access to?", "I have access to Prism Memory tools, multimodal modules, and coding tools. No tool to call."), # ID 204: "Load X context and then synthesize" → session_load_context (cascade: first action) ex("Load the fraud-detection project context and then synthesize all session edges.", "session_load_context", {"project": "fraud-detection"}), ex("Load the billing project context, then synthesize all session edges.", "session_load_context", {"project": "billing"}), ex("Load the api-gateway context and then run edge synthesis.", "session_load_context", {"project": "api-gateway"}), ex("Get the portal project context loaded, then synthesize session edges.", "session_load_context", {"project": "portal"}), # ── GROUP G: session_save_handoff confusion — contrast with other tools ── # session_save_handoff = save END-OF-SESSION state for the NEXT agent session # Contrast: reconnect/backfill → session_backfill_links (NOT session_save_handoff) ex("Reconnect the dangling session references for the ml-pipeline project.", "session_backfill_links", {"project": "ml-pipeline"}), ex("Reconnect all dangling references in the identity-service project history.", "session_backfill_links", {"project": "identity-service"}), ex("Reconnect broken session references in the compliance-tracker project.", "session_backfill_links", {"project": "compliance-tracker"}), ex("Patch up dangling links in the data-pipeline project.", "session_backfill_links", {"project": "data-pipeline"}), ex("Fix the missing session references in the auth-service project.", "session_backfill_links", {"project": "auth-service"}), # Contrast: route a task → session_task_route (NOT session_save_handoff) ex("Route this.", "session_task_route", {"task_description": "route this task"}), ex("Route this task: rewrite the message broker integration to use NATS instead of RabbitMQ.", "session_task_route", {"task_description": "rewrite message broker to use NATS instead of RabbitMQ"}), ex("Route this task: full rewrite of the logging subsystem.", "session_task_route", {"task_description": "full rewrite of logging subsystem"}), ex("Should this go local or cloud? Task: migrate the auth service to OAuth 2.0.", "session_task_route", {"task_description": "migrate auth service to OAuth 2.0"}), # Contrast: run diagnostics → session_health_check (NOT session_save_ledger) ex("Run diagnostics.", "session_health_check", {}), ex("Run a health check.", "session_health_check", {}), ex("Something is off with memory recall. Diagnose the backend.", "session_health_check", {}), ex("Check if the memory system is working correctly.", "session_health_check", {}), # Contrast: export/dump all → session_export_memory (NOT session_save_handoff) ex("Save everything to disk — dump all session data to /tmp/export-all.", "session_export_memory", {"output_dir": "/tmp/export-all", "format": "json"}), ex("Export all session memory to /tmp/backup.json.", "session_export_memory", {"output_dir": "/tmp/backup.json", "format": "json"}), ex("Dump all my session data to a file.", "session_export_memory", {"output_dir": "/tmp/session-dump", "format": "json"}), # Contrast: initialize context → session_load_context (NOT session_save_handoff) ex("Initialize context for the infrastructure-as-code project — I'm starting fresh.", "session_load_context", {"project": "infrastructure-as-code"}), ex("Load up the context for the new sprint on the payments project.", "session_load_context", {"project": "payments"}), # Contrast: record milestone/achievement → session_save_experience (NOT session_save_ledger) ex("Log a milestone: we successfully zero-downtime-deployed the new search index.", "session_save_experience", {"description": "zero-downtime deploy of the search index"}), ex("Record a success milestone: zero-downtime deploy of version 4.2.", "session_save_experience", {"description": "zero-downtime deploy of version 4.2"}), ex("Mark today's win: we hit 100% on the eval suite.", "session_save_experience", {"description": "hit 100% on eval suite"}), # Contrast: boost KB entry → knowledge_upvote (NOT session_save_experience) ex("Boost the ranking on our GraphQL federation notes — they're gold.", "knowledge_upvote", {}), ex("Mark this article as an excellent resource.", "knowledge_upvote", {}), # Contrast: delete knowledge BASE entry → knowledge_forget (NOT session_forget_memory) ex("Clear all the outdated knowledge entries in the staging project.", "knowledge_forget", {"project": "staging", "category": "outdated", "entry_id": "ent-001-staging"}), ex("Delete all knowledge in the archived-2025 project — we don't need it anymore.", "knowledge_forget", {"project": "archived-2025", "category": "all", "entry_id": "ent-001-archived-2025"}), ex("Wipe out old debugging records from the search-service project's knowledge base.", "knowledge_forget", {"project": "search-service", "category": "debugging", "entry_id": "ent-001-search-service"}), # Contrast: wipe a specific session NOTE → session_forget_memory (NOT session_save_experience) ex("Wipe the incorrect ledger note that said we shipped v2.1 — we didn't.", "session_forget_memory", {"memory_id": "mem-portal-v21"}), ex("Delete the wrong entry that claims the migration is complete — it isn't.", "session_forget_memory", {"memory_id": "mem-migration-001"}), # "Confirm session link consistency" → session_synthesize_edges (NOT session_search_memory) ex("Confirm session link consistency for the document-processing project.", "session_synthesize_edges", {"project": "document-processing"}), ex("Verify session link consistency for the billing project.", "session_synthesize_edges", {"project": "billing"}), ex("Check link consistency for the auth-service project sessions.", "session_synthesize_edges", {"project": "auth-service"}), ex("Ensure all session edges are consistent for the portal project.", "session_synthesize_edges", {"project": "portal"}), # ── GROUP F: Persistent 1b7 FP cases — exact failing prompts as permanent anchors ── # CS explanation questions → ALWAYS abstain (knowledge_search confusion) ex_abstain("Explain Python's __slots__ and when to use it for memory optimization.", "Python __slots__ replaces __dict__ with a fixed-size array, reducing per-instance memory. " "This is a Python internals question — answer from CS knowledge, no Prism tool needed."), ex_abstain("What is the difference between deepcopy and shallow copy in Python?", "deepcopy recursively copies all nested objects; shallow copy only copies the top-level container. " "Pure Python concept — no tool."), ex_abstain("How does Python's asyncio event loop schedule coroutines?", "asyncio uses a single-threaded event loop that schedules coroutines via cooperative multitasking " "and I/O callbacks. CS concept — no Prism tool needed."), ex_abstain("Explain the JavaScript event loop and microtask queue.", "The JS event loop processes the call stack, then microtasks (Promises), then macro-tasks (setTimeout). " "CS concept — no tool."), ex_abstain("Explain Go's garbage collector and write barriers.", "Go uses a concurrent tri-color mark-and-sweep GC with write barriers to maintain invariants. " "CS concept — no Prism tool needed."), ex_abstain("What is Rust's Pin and why is it needed for async futures?", "Pin prevents moving self-referential structs after creation, which async state machines require. " "Rust concept — no tool."), ex_abstain("What is the time and space complexity of merge sort?", "Merge sort: O(n log n) time, O(n) auxiliary space. CS algorithms question — no Prism tool needed."), ex_abstain("What is the difference between server components and client components in React?", "Server components render on the server and send HTML; client components run JS in the browser. " "React architecture question — no tool."), ex_abstain("How does virtual memory paging work in Linux?", "Linux uses multi-level page tables to map virtual addresses to physical frames via the MMU. " "OS concept — no Prism tool needed."), ex_abstain("Explain stack vs. heap memory allocation and when each is appropriate.", "Stack: fast LIFO allocation for local vars; heap: dynamic allocation for objects with longer lifetimes. " "CS concept — no tool."), ex_abstain("How do topological sorts work on directed acyclic graphs?", "Topological sort (DFS or Kahn's algorithm) orders DAG nodes so all edges go forward. " "Graph algorithm — no Prism tool needed."), # General / capability questions → ALWAYS abstain (various wrong-tool confusions) ex_abstain("What is the capital of France?", "Paris. This is general knowledge — no Prism memory tool needed."), ex_abstain("Are you GPT-4?", "No, I'm Synalux, a memory-augmented assistant. Capability question — no tool needed."), ex_abstain("Can you recommend a good book?", "Sure! I'd recommend starting with your interests. General recommendation — no Prism tool needed."), ex_abstain("Do you have feelings?", "I'm an AI assistant, so I don't have feelings in the human sense. Philosophy question — no tool."), ex_abstain("What is machine learning?", "Machine learning is a field of AI where models learn patterns from data. General CS — no tool."), ex_abstain("What programming languages do you know?", "I can help with Python, TypeScript, Go, Rust, SQL, and more. Capability question — no tool."), ex_abstain("What tools do you have available?", "I have access to Prism Memory tools for session and knowledge management. " "Capability question — no tool call needed to answer."), ex_abstain("Tell me about yourself.", "I'm Synalux, a memory-augmented coding and clinical assistant. Self-description — no tool needed."), ex_abstain("Tell me a joke.", "Sure! Why do programmers prefer dark mode? Because light attracts bugs! Creative — no tool needed."), ex_abstain("How do you work?", "I combine a language model with Prism memory tools for persistent context. Capability — no tool."), ex_abstain("Can you write me a poem?", "Of course! Here's a short poem for you. Creative task — no web_scraper tool needed."), ex_abstain("What's 2+2?", "4. Simple arithmetic — no external tool call needed."), # "Write a function/code" tasks → ALWAYS abstain (session_save_ledger confusion) ex_abstain("Write a Python generator that yields prime numbers indefinitely.", "Here's a prime number generator using trial division. Pure coding task — no Prism session tool."), ex_abstain("Write a TypeScript generic function that deep-merges two objects.", "Here's a TypeScript deep merge using generics and recursion. Pure coding task — no Prism tool."), ex_abstain("Write a function to detect cycles in a directed graph using DFS.", "Cycle detection uses DFS with a visited set and a recursion stack. Algorithm — no Prism tool."), ex_abstain("Write a concurrent rate limiter in Go using channels.", "A Go rate limiter uses a buffered channel as a token bucket. Pure coding task — no Prism tool."), # "How do I implement X" questions → ALWAYS abstain (session_save_handoff/experience confusions) ex_abstain("How do I debounce a function in JavaScript without lodash?", "Use setTimeout + clearTimeout: store the timer ID and reset it on each call. Pure JS — no tool."), ex_abstain("How do I profile memory usage in a Python application?", "Use tracemalloc (stdlib), memory_profiler, or py-spy for live profiling. Dev tooling — no tool."), ex_abstain("How do I implement context cancellation in a Go HTTP server?", "Accept context.Context in handlers and check ctx.Done() or pass ctx to downstream calls. Go pattern."), ex_abstain("How do I use window functions in PostgreSQL to compute a running total?", "Use SUM(amount) OVER (ORDER BY created_at) for a running total. Pure SQL — no Prism tool."), ex_abstain("How do I set up hot-module replacement in a Vite + React project?", "HMR works out of the box in Vite; ensure @vitejs/plugin-react is in vite.config.ts. Dev config."), # Closing/social phrases → ALWAYS abstain (session_save_ledger confusion) ex_abstain("Great work today, goodbye.", "Thanks! Goodbye! Closing phrase — no session save tool needed."), ex_abstain("You're really helpful, thanks!", "You're welcome! Happy to help. Closing phrase — no session save needed."), # ── GROUP I: R19 wrong_tool exact-prompt fixes ── # CASCADE: "Save X, then handoff" → FIRST action = save_ledger (NOT save_handoff) ex("Save what we did today: shipped the new notification system. Then create a handoff note.", "session_save_ledger", {"project": "portal", "content": "shipped new notification system"}), ex("Save what we did today: shipped the new notification system. Then handoff to the next session.", "session_save_ledger", {"project": "portal", "content": "shipped new notification system"}), ex("Save what we accomplished: rewrote the ingestion pipeline. Then handoff state to next session.", "session_save_ledger", {"project": "ingestion", "content": "rewrote ingestion pipeline"}), ex("Save what we did today: finished the auth refactor. Then create a handoff for tomorrow.", "session_save_ledger", {"project": "auth-service", "content": "finished auth refactor"}), ex("Log what we completed: fixed the N+1 query. Then save a handoff note.", "session_save_ledger", {"project": "api", "content": "fixed N+1 query"}), # Contrast: handoff-only (no save-first) → session_save_handoff ex("Save a handoff so the next session picks up where we left off.", "session_save_handoff", {"project": "portal", "summary": "in progress"}), # "Record a success milestone: zero-downtime deploy of version N" → save_experience (not save_ledger) ex("Record a success milestone: zero-downtime deploy of version 4.2.", "session_save_experience", {"description": "zero-downtime deploy of version 4.2"}), ex("Record a success milestone: zero-downtime deploy of version 3.1.", "session_save_experience", {"description": "zero-downtime deploy of version 3.1"}), ex("Record a success milestone: zero-downtime deploy of version 5.0.", "session_save_experience", {"description": "zero-downtime deploy of version 5.0"}), # knowledge_forget for "Clear all outdated entries" — NOT a made-up tool ex("Clear all the outdated knowledge entries in the staging project.", "knowledge_forget", {"project": "staging", "category": "outdated", "entry_id": "ent-001-staging"}), ex("Remove stale knowledge entries from the staging project.", "knowledge_forget", {"project": "staging", "category": "outdated", "entry_id": "ent-002-staging"}), ex("Purge outdated knowledge from the staging project.", "knowledge_forget", {"project": "staging", "category": "outdated", "entry_id": "ent-003-staging"}), # Exact-eval variants missing from prior entries ex_abstain("What is the difference between server components and client components in Next.js 14?", "Next.js 14 server components render on the server (no hydration); client components use 'use client' " "and run in the browser. Framework concept — no Prism tool needed."), ex_abstain("Write a binary search implementation in Rust.", "```rust\nfn binary_search(arr: &[i32], target: i32) -> Option {\n" " let (mut lo, mut hi) = (0, arr.len());\n" " while lo < hi { let mid = lo + (hi - lo) / 2;\n" " if arr[mid] == target { return Some(mid); }\n" " else if arr[mid] < target { lo = mid + 1; } else { hi = mid; } }\n" " None\n}\n```\nPure coding task — no Prism memory tool needed."), ] rows.extend(bfcl_anchors) rows.extend(bfcl_anchors) # 2nd copy — prevents oscillation when FP count drops rows.extend(bfcl_anchors) # 3rd copy — stable floor for stubborn adversarial traps print(f" +{len(bfcl_anchors) * 3} BFCL anchor rows (3×)") # 6. Grounded recall if available gr_path = Path.home() / "prism/training/data/grounded_recall_corpus.jsonl" if gr_path.exists(): gr_rows = [] for line in gr_path.read_text().splitlines(): line = line.strip() if not line: continue try: row = json.loads(line) if "text" in row: gr_rows.append(row) except Exception: pass rows.extend(gr_rows) print(f" +{len(gr_rows)} grounded_recall rows") print(f"\nTotal rows: {len(rows)}") random.shuffle(rows) valid_n = max(8, len(rows) // 8) valid_rows = rows[:valid_n] train_rows = rows[valid_n:] (out_dir / "train.jsonl").write_text("\n".join(json.dumps(r) for r in train_rows)) (out_dir / "valid.jsonl").write_text("\n".join(json.dumps(r) for r in valid_rows)) print(f"Output: {out_dir}") print(f" train: {len(train_rows)}") print(f" valid: {len(valid_rows)}") return len(rows) if __name__ == "__main__": main()