#!/usr/bin/env python3
"""Build the Patch4 training corpus for prism-coder:4b-v43.

Targets swe-bench failures not fixable by Layer 3:

- Group C: session_task_route false negatives (implicit routing questions)
- Group D: param extraction from natural/informal phrasing

Output is JSON Lines: one object per line with a single ``"text"`` key whose
value is a rendered ChatML conversation (system / user / assistant), written
to ``OUT``.
"""
import json
import pathlib

# Destination of the generated JSONL corpus.
OUT = pathlib.Path("/tmp/4b_v43_patch4.jsonl")

# System prompt placed at the top of every rendered conversation.
SYS = (
    "You are Synalux, a memory-augmented coding and clinical reasoning assistant. "
    "You have access to Prism Memory tools (session_save_ledger, session_load_context, "
    "session_search_memory, session_save_handoff, session_forget_memory, session_health_check, "
    "session_compact_ledger, session_export_memory, session_task_route, session_save_experience, "
    "session_synthesize_edges, session_backfill_links, knowledge_search, knowledge_forget, "
    "knowledge_upvote, knowledge_downvote, knowledge_set_retention) and 13 multimodal tool "
    "modules (image_gen, office, web_scraper, browser, tts, ocr, git, terminal, deps_scanner, "
    "hipaa, data_graph, templates, pdf_parser). "
    "Think step-by-step before answering. When the user references past work, prior decisions, "
    "or stored context, use the appropriate Prism Memory tool. "
    "Format tool calls inside ... JSON blocks with fields 'name' and 'arguments'. "
    "If no tool is needed, answer directly in plain text. "
    "ABSTAIN for general programming questions, CS concepts, greetings, and capability questions."
)

# Delimiters wrapped around the assistant's reasoning preamble.
# NOTE(review): THINK_CLOSE is empty in the reviewed source, so the reasoning
# segment is never explicitly closed; this may be markup lost in transit --
# confirm against the authoritative copy before regenerating the corpus.
THINK_OPEN = "<|synalux_think|>"
THINK_CLOSE = ""


def render_chatml(user_text, assistant_text):
    """Return a three-turn ChatML string (system, user, assistant).

    Args:
        user_text: Content of the user turn.
        assistant_text: Content of the assistant turn, inserted verbatim.

    Returns:
        The concatenated ChatML text; the system turn always carries ``SYS``.
    """
    return (
        f"<|im_start|>system\n{SYS}<|im_end|>\n"
        f"<|im_start|>user\n{user_text}<|im_end|>\n"
        f"<|im_start|>assistant\n{assistant_text}<|im_end|>"
    )


def msg(user, tool_name, args):
    """Build one training row in which the assistant calls a tool.

    Args:
        user: The user prompt.
        tool_name: Name of the tool being called (underscores are replaced by
            spaces in the reasoning preamble).
        args: JSON-serialisable dict used as the call's ``arguments``.

    Returns:
        ``{"text": <rendered ChatML>}``.
    """
    call = json.dumps({"name": tool_name, "arguments": args})
    think = (
        f"{THINK_OPEN}The user wants {tool_name.replace('_', ' ')}. "
        f"I'll call the tool with the right parameters.{THINK_CLOSE}"
    )
    assistant = f"{think}\n\n{call}\n"
    return {"text": render_chatml(user, assistant)}


def no_tool_msg(user, reply):
    """Build one training row in which the assistant answers without a tool.

    Args:
        user: The user prompt.
        reply: Plain-text answer placed after the reasoning preamble.

    Returns:
        ``{"text": <rendered ChatML>}``.
    """
    think = (
        f"{THINK_OPEN}This is a general programming/CS question. "
        f"I should answer directly without calling any Prism tool.{THINK_CLOSE}"
    )
    assistant = f"{think}\n{reply}"
    return {"text": render_chatml(user, assistant)}


# ── GROUP C: session_task_route — implicit "should I/local/cloud?" questions ──
task_route_cases = [
    ("Should I handle this CSS grid refactor myself or punt it to the cloud model?",
     {"task_description": "CSS grid refactor", "prefer_local": False}),
    ("Is this bug fix simple enough for the local model to tackle?",
     {"task_description": "bug fix"}),
    ("Can the on-device model handle parsing this 50MB XML dump, or should I escalate to cloud?",
     {"task_description": "parsing 50MB XML dump", "prefer_local": False}),
    ("Should I let the local model handle this TypeScript refactor or route it to the bigger model?",
     {"task_description": "TypeScript refactor", "prefer_local": False}),
    ("Is writing a full test suite for the billing module something the local model can do well?",
     {"task_description": "writing test suite for billing module"}),
    ("This SQL query optimization seems complex — local or cloud?",
     {"task_description": "SQL query optimization"}),
    ("Route this: rewriting the authentication middleware from scratch.",
     {"task_description": "rewriting authentication middleware from scratch"}),
    ("Should the local agent handle this performance profiling task?",
     {"task_description": "performance profiling task"}),
    ("Is a full-text semantic search implementation doable on-device?",
     {"task_description": "full-text semantic search implementation"}),
    ("Local or cloud for this multi-file refactor of the payments module?",
     {"task_description": "multi-file refactor of payments module", "prefer_local": False}),
    ("Can the on-device model handle a 3000-line code review?",
     {"task_description": "3000-line code review"}),
    ("Route this debugging task: tracing intermittent race conditions in async code.",
     {"task_description": "tracing intermittent race conditions in async code"}),
    ("Should I send this architecture review to the cloud model or keep it local?",
     {"task_description": "architecture review", "prefer_local": False}),
    ("Is this data pipeline migration too complex for local inference?",
     {"task_description": "data pipeline migration"}),
    ("Handle or escalate? — writing API documentation for 40 endpoints.",
     {"task_description": "writing API documentation for 40 endpoints"}),
]

# ── GROUP D1: session_export_memory — extract output_path from casual phrasing ──
export_cases = [
    ("Dump everything to /tmp/backup so I can archive it. JSON format.",
     {"output_path": "/tmp/backup", "format": "json"}),
    ("Back up all my memories to /var/exports before I wipe the session.",
     {"output_path": "/var/exports"}),
    ("Export the prism-mcp project memory to /tmp/prism-export in markdown format.",
     {"project": "prism-mcp", "output_path": "/tmp/prism-export", "format": "markdown"}),
    ("I want to export a backup and then compact the old entries — save it to /tmp/mem-backup.",
     {"output_path": "/tmp/mem-backup"}),
    ("Export everything from the billing project to /tmp/billing-backup.json.",
     {"project": "billing", "output_path": "/tmp/billing-backup.json"}),
    ("Can you dump the analytics project sessions to /home/user/exports/analytics?",
     {"project": "analytics", "output_path": "/home/user/exports/analytics"}),
    ("Save a memory backup to /tmp/export before we start the migration.",
     {"output_path": "/tmp/export"}),
    ("Archive the prism-training project to /tmp/training-archive in JSON.",
     {"project": "prism-training", "output_path": "/tmp/training-archive", "format": "json"}),
]

# ── GROUP D2: session_forget_memory — extract memory_id from prompt ──
forget_cases = [
    ("Delete the specific memory entry with ID mem-abc-123.",
     {"memory_id": "mem-abc-123"}),
    ("Remove memory entry 7f3a-bc21-d4e5 — it's completely wrong.",
     {"memory_id": "7f3a-bc21-d4e5"}),
    ("Get rid of entry ID: ent-2024-991. It was saved by mistake.",
     {"memory_id": "ent-2024-991"}),
    ("Forget memory ID mem-portal-007.",
     {"memory_id": "mem-portal-007"}),
    ("Delete that specific memory entry with ID 'sess-42-bad'.",
     {"memory_id": "sess-42-bad"}),
    ("The entry tagged as 'exp-001' is outdated. Please remove it.",
     {"memory_id": "exp-001"}),
    ("Wipe memory entry ID = abc123xyz.",
     {"memory_id": "abc123xyz"}),
    ("Remove the wrong entry — its ID is MEM-2025-0042.",
     {"memory_id": "MEM-2025-0042"}),
    ("That entry we saved earlier about the broken migration config — the ID is err-cfg-88, delete it.",
     {"memory_id": "err-cfg-88"}),
    ("Clear out memory ID: old-deploy-notes-19.",
     {"memory_id": "old-deploy-notes-19"}),
]

# ── GROUP D3: session_task_route — extract task_description from implicit phrasing ──
task_route_desc_cases = [
    ("Route this refactoring task — if local, proceed; if cloud, just tell me.",
     {"task_description": "refactoring task"}),
    ("Is the on-device model good enough for a full GraphQL schema migration?",
     {"task_description": "GraphQL schema migration"}),
    ("This linting fix across 200 files — local or cloud?",
     {"task_description": "linting fix across 200 files"}),
    ("Route: generating embeddings for 10k documents.",
     {"task_description": "generating embeddings for 10k documents"}),
    ("Should the local model handle writing unit tests for the auth module?",
     {"task_description": "writing unit tests for auth module"}),
]

# ── GROUP D4: session_save_ledger — extract project + summary from natural phrasing ──
ledger_natural_cases = [
    ("We just finished a big refactor of the billing module. Make sure it's written down. "
     "Project: billing, conv: conv-bill-2024.",
     {"project": "billing", "conversation_id": "conv-bill-2024",
      "summary": "Completed big refactor of billing module"}),
    ("Before I hand off, save what we did today: fixed the OAuth flow and updated tests. "
     "Project is portal.",
     {"project": "portal", "summary": "Fixed OAuth flow and updated tests"}),
    ("We're done for the day. Log what we accomplished on the analytics project.",
     {"project": "analytics", "summary": "Session progress logged"}),
    ("Jot this down for the prism-training project: we built and validated the v43 corpus.",
     {"project": "prism-training", "summary": "Built and validated v43 corpus"}),
    ("Log session for ios-app project: completed llama.cpp Swift integration.",
     {"project": "ios-app", "summary": "Completed llama.cpp Swift integration"}),
    ("Write it down — we migrated auth to OAuth2 in the portal project. conv-id: sess-99.",
     {"project": "portal", "conversation_id": "sess-99", "summary": "Migrated auth to OAuth2"}),
    ("End of session for prism-mcp. We deployed v3 MCP server and ran smoke tests.",
     {"project": "prism-mcp", "summary": "Deployed v3 MCP server and ran smoke tests"}),
    ("Record this session: we fixed the race condition in the payments module. project=payments.",
     {"project": "payments", "summary": "Fixed race condition in payments module"}),
]

# ── GROUP D5: backfill_links + synthesize_edges — verifier distinctions ──
# Unlike the other tables, each entry here names its own tool.
verifier_cases = [
    ("Backfill the missing cross-session links for the analytics project.",
     "session_backfill_links", {"project": "analytics"}),
    ("Reconnect the dangling session references for the billing project.",
     "session_backfill_links", {"project": "billing"}),
    ("Patch up the link gaps in our session history for prism-training.",
     "session_backfill_links", {"project": "prism-training"}),
    ("Fix the broken links between sessions in the portal project.",
     "session_backfill_links", {"project": "portal"}),
    ("There are dangling references in ios-app sessions. Backfill them.",
     "session_backfill_links", {"project": "ios-app"}),
    ("Reconnect missing session cross-references for prism-mcp.",
     "session_backfill_links", {"project": "prism-mcp"}),
    ("Run a synthesis pass on the prism-mcp project to make sure all edges are up to date.",
     "session_synthesize_edges", {"project": "prism-mcp"}),
    ("Before we close out, verify all the session links are consistent for the portal project.",
     "session_synthesize_edges", {"project": "portal"}),
    ("Synthesize the session graph for analytics to build fresh edge connections.",
     "session_synthesize_edges", {"project": "analytics"}),
    ("Verify graph integrity — synthesize edges for the ios-app project.",
     "session_synthesize_edges", {"project": "ios-app"}),
    ("Do a graph synthesis on the billing sessions to update the relationship map.",
     "session_synthesize_edges", {"project": "billing"}),
    ("Build the session edge graph for prism-training.",
     "session_synthesize_edges", {"project": "prism-training"}),
]

# ── GROUP D6: session_search_memory — "remind me / did we ever decide" phrasing ──
remind_search_cases = [
    ("Remind me — did we ever decide between Redis and Memcached for caching?",
     {"query": "Redis vs Memcached caching decision"}),
    ("What did we decide about the database schema for the billing module?",
     {"query": "database schema billing module decision"}),
    ("Did we settle on which logging library to use for the portal?",
     {"query": "logging library portal decision"}),
    ("Remind me what we concluded about microservices vs monolith.",
     {"query": "microservices vs monolith conclusion"}),
    ("What was our decision on the auth token expiry?",
     {"query": "auth token expiry decision"}),
    ("Did we ever document our stance on using TypeScript strict mode?",
     {"query": "TypeScript strict mode decision"}),
    ("Can you remind me what we figured out about the WebSocket reconnection strategy?",
     {"query": "WebSocket reconnection strategy"}),
    ("What did we decide to do about the rate limiting approach?",
     {"query": "rate limiting approach decision"}),
    ("Remind me of our agreed-on approach for handling API errors.",
     {"query": "API error handling approach"}),
    ("Did we ever land on a pagination strategy for the search API?",
     {"query": "pagination strategy search API"}),
]


def build_rows(verbose=True):
    """Render every case table into training rows, in group order C, D1..D6.

    Args:
        verbose: When true, print the running row count after each group
            (the same progress lines the script has always emitted).

    Returns:
        A list of ``{"text": ...}`` dicts, one per case.
    """
    rows = []

    def with_tool(tool, cases):
        # Promote (user, args) pairs to (user, tool, args) triples so every
        # group can go through the same code path as verifier_cases.
        return [(user, tool, args) for user, args in cases]

    def add_group(label, triples):
        rows.extend(msg(user, tool, args) for user, tool, args in triples)
        if verbose:
            print(f"After {label}: {len(rows)} rows")

    add_group("GROUP C (task_route)", with_tool("session_task_route", task_route_cases))
    add_group("GROUP D1 (export)", with_tool("session_export_memory", export_cases))
    add_group("GROUP D2 (forget+id)", with_tool("session_forget_memory", forget_cases))
    add_group("GROUP D3 (task_route desc)", with_tool("session_task_route", task_route_desc_cases))
    add_group("GROUP D4 (ledger natural)", with_tool("session_save_ledger", ledger_natural_cases))
    add_group("GROUP D5 (verifier)", verifier_cases)
    add_group("GROUP D6 (remind/search)", with_tool("session_search_memory", remind_search_cases))
    return rows


def write_jsonl(rows, path=OUT):
    """Write ``rows`` to ``path`` as JSON Lines, replacing any existing file.

    Args:
        rows: Iterable of JSON-serialisable objects, one per output line.
        path: Destination ``pathlib.Path``; defaults to ``OUT``.
    """
    # json.dumps escapes non-ASCII by default, but pin the encoding anyway so
    # the file does not depend on the platform's locale.
    with path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(row) + "\n" for row in rows)


def verify_output(path=OUT):
    """Re-read the first row of ``path`` and print format diagnostics.

    Args:
        path: JSONL file to inspect; defaults to ``OUT``.

    Raises:
        json.JSONDecodeError: If the first line is empty or not valid JSON.
    """
    with path.open(encoding="utf-8") as f:
        first = json.loads(f.readline())
    print(f"First row keys: {list(first.keys())}")
    print(f"Text starts with: {first['text'][:50]}")
    # Test against the constant rather than a duplicated literal so the check
    # cannot drift from what msg() actually emits.
    print(f"Think close tag present: {THINK_CLOSE in first['text']}")
    # NOTE(review): the reviewed source ends with one more diagnostic print
    # ("Bad think close absent: ...") whose expression is cut off mid-literal.
    # It is intentionally not reconstructed here; restore it from the
    # authoritative copy if the check is still wanted.


def main():
    """Generate the corpus, write it to ``OUT``, and print a format check."""
    rows = build_rows()
    write_jsonl(rows, OUT)
    print(f"\nWrote {len(rows)} patch4 rows to {OUT}")
    verify_output(OUT)


if __name__ == "__main__":
    main()