File size: 14,181 Bytes
d8c2900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/env python3
"""
Patch4 corpus for prism-coder:4b-v43.
Targets swe-bench failures not fixable by Layer 3:
  - Group C: session_task_route false negatives (implicit routing questions)
  - Group D: param extraction from natural/informal phrasing

Output format: {"text": "<ChatML string>"} to match existing train.jsonl
"""
import json, pathlib

# Destination file for the generated rows; written further down as JSONL
# (one JSON object per line).
OUT = pathlib.Path("/tmp/4b_v43_patch4.jsonl")

# System prompt inserted verbatim into the system turn of every rendered
# ChatML example. It is runtime data: adjacent literals are concatenated, so
# the trailing space inside each fragment is significant.
SYS = (
    "You are Synalux, a memory-augmented coding and clinical reasoning assistant. "
    "You have access to Prism Memory tools (session_save_ledger, session_load_context, "
    "session_search_memory, session_save_handoff, session_forget_memory, session_health_check, "
    "session_compact_ledger, session_export_memory, session_task_route, session_save_experience, "
    "session_synthesize_edges, session_backfill_links, knowledge_search, knowledge_forget, "
    "knowledge_upvote, knowledge_downvote, knowledge_set_retention) and 13 multimodal tool "
    "modules (image_gen, office, web_scraper, browser, tts, ocr, git, terminal, deps_scanner, "
    "hipaa, data_graph, templates, pdf_parser). "
    "Think step-by-step before answering. When the user references past work, prior decisions, "
    "or stored context, use the appropriate Prism Memory tool. "
    "Format tool calls inside <tool_call>...</tool_call> JSON blocks with fields 'name' and 'arguments'. "
    "If no tool is needed, answer directly in plain text. "
    "ABSTAIN for general programming questions, CS concepts, greetings, and capability questions."
)

# Delimiters placed around the short reasoning preamble that opens each
# assistant turn built by msg() and no_tool_msg().
THINK_OPEN = "<|synalux_think|>"
THINK_CLOSE = "</|synalux_think|>"


def render_chatml(user_text, assistant_text, *, system_text=None):
    """Render one system/user/assistant exchange as a single ChatML string.

    Args:
        user_text: Content of the user turn.
        assistant_text: Content of the assistant turn.
        system_text: Optional system prompt. When omitted (``None``) the
            module-level ``SYS`` prompt is used, which is the original
            behavior; an empty string is honored as an empty system turn.

    Returns:
        The three turns, each wrapped in ``<|im_start|>role`` / ``<|im_end|>``
        and joined by newlines. There is no trailing newline.
    """
    # Resolve the default lazily so callers can override the prompt without
    # touching the module constant.
    if system_text is None:
        system_text = SYS
    return (
        f"<|im_start|>system\n{system_text}<|im_end|>\n"
        f"<|im_start|>user\n{user_text}<|im_end|>\n"
        f"<|im_start|>assistant\n{assistant_text}<|im_end|>"
    )


def msg(user, tool_name, args):
    """Build one training row whose assistant turn is a single tool call.

    Args:
        user: The user prompt text.
        tool_name: Name of the tool to call; underscores are replaced by
            spaces when it is mentioned in the reasoning preamble.
        args: JSON-serializable mapping used as the tool call's ``arguments``.

    Returns:
        ``{"text": <ChatML string>}`` where the assistant turn is the
        reasoning preamble followed by a ``<tool_call>`` block containing
        the JSON payload.
    """
    # ensure_ascii=False keeps any non-ASCII argument text as real characters
    # in the training target instead of literal \uXXXX escape sequences.
    # For pure-ASCII arguments the output is unchanged.
    call = json.dumps({"name": tool_name, "arguments": args}, ensure_ascii=False)
    think = f"{THINK_OPEN}The user wants {tool_name.replace('_', ' ')}. I'll call the tool with the right parameters.{THINK_CLOSE}"
    assistant = f"{think}\n<tool_call>\n{call}\n</tool_call>"
    return {"text": render_chatml(user, assistant)}


def no_tool_msg(user, reply):
    """Build one training row whose assistant turn answers without a tool.

    Args:
        user: The user prompt text.
        reply: The plain-text answer placed after the reasoning preamble.

    Returns:
        ``{"text": <ChatML string>}`` with the preamble and reply separated
        by a newline in the assistant turn.
    """
    preamble = "".join((
        THINK_OPEN,
        "This is a general programming/CS question. I should answer directly without calling any Prism tool.",
        THINK_CLOSE,
    ))
    rendered = render_chatml(user, f"{preamble}\n{reply}")
    return {"text": rendered}


# Accumulates every generated training row, in group order.
rows = []

# ── GROUP C: session_task_route — implicit "should I/local/cloud?" questions ──
# Each case is (user prompt, expected tool arguments).
# NOTE(review): "prefer_local": False is attached to some "local or cloud?"
# prompts but not to other near-identical ones — confirm the intended labelling rule.
task_route_cases = [
    ("Should I handle this CSS grid refactor myself or punt it to the cloud model?",
     {"task_description": "CSS grid refactor", "prefer_local": False}),
    ("Is this bug fix simple enough for the local model to tackle?",
     {"task_description": "bug fix"}),
    ("Can the on-device model handle parsing this 50MB XML dump, or should I escalate to cloud?",
     {"task_description": "parsing 50MB XML dump", "prefer_local": False}),
    ("Should I let the local model handle this TypeScript refactor or route it to the bigger model?",
     {"task_description": "TypeScript refactor", "prefer_local": False}),
    ("Is writing a full test suite for the billing module something the local model can do well?",
     {"task_description": "writing test suite for billing module"}),
    ("This SQL query optimization seems complex — local or cloud?",
     {"task_description": "SQL query optimization"}),
    ("Route this: rewriting the authentication middleware from scratch.",
     {"task_description": "rewriting authentication middleware from scratch"}),
    ("Should the local agent handle this performance profiling task?",
     {"task_description": "performance profiling task"}),
    ("Is a full-text semantic search implementation doable on-device?",
     {"task_description": "full-text semantic search implementation"}),
    ("Local or cloud for this multi-file refactor of the payments module?",
     {"task_description": "multi-file refactor of payments module", "prefer_local": False}),
    ("Can the on-device model handle a 3000-line code review?",
     {"task_description": "3000-line code review"}),
    ("Route this debugging task: tracing intermittent race conditions in async code.",
     {"task_description": "tracing intermittent race conditions in async code"}),
    ("Should I send this architecture review to the cloud model or keep it local?",
     {"task_description": "architecture review", "prefer_local": False}),
    ("Is this data pipeline migration too complex for local inference?",
     {"task_description": "data pipeline migration"}),
    ("Handle or escalate? — writing API documentation for 40 endpoints.",
     {"task_description": "writing API documentation for 40 endpoints"}),
]
for user, args in task_route_cases:
    rows.append(msg(user, "session_task_route", args))

print(f"After GROUP C (task_route): {len(rows)} rows")

# ── GROUP D1: session_export_memory — extract the output location from casual phrasing ──
# Each case is (user prompt, expected tool arguments).
# NOTE(review): this header originally said `output_dir`, while every row
# below uses the `output_path` key — confirm which name the
# session_export_memory schema actually expects.
export_cases = [
    ("Dump everything to /tmp/backup so I can archive it. JSON format.",
     {"output_path": "/tmp/backup", "format": "json"}),
    ("Back up all my memories to /var/exports before I wipe the session.",
     {"output_path": "/var/exports"}),
    ("Export the prism-mcp project memory to /tmp/prism-export in markdown format.",
     {"project": "prism-mcp", "output_path": "/tmp/prism-export", "format": "markdown"}),
    ("I want to export a backup and then compact the old entries — save it to /tmp/mem-backup.",
     {"output_path": "/tmp/mem-backup"}),
    ("Export everything from the billing project to /tmp/billing-backup.json.",
     {"project": "billing", "output_path": "/tmp/billing-backup.json"}),
    ("Can you dump the analytics project sessions to /home/user/exports/analytics?",
     {"project": "analytics", "output_path": "/home/user/exports/analytics"}),
    ("Save a memory backup to /tmp/export before we start the migration.",
     {"output_path": "/tmp/export"}),
    ("Archive the prism-training project to /tmp/training-archive in JSON.",
     {"project": "prism-training", "output_path": "/tmp/training-archive", "format": "json"}),
]
for user, args in export_cases:
    rows.append(msg(user, "session_export_memory", args))

print(f"After GROUP D1 (export): {len(rows)} rows")

# ── GROUP D2: session_forget_memory — extract memory_id from prompt ──
# Each case is (user prompt, expected tool arguments); the ID appears in the
# prompt in varied surface forms (quoted, after "ID:", after "=", mid-sentence).
forget_cases = [
    ("Delete the specific memory entry with ID mem-abc-123.",
     {"memory_id": "mem-abc-123"}),
    ("Remove memory entry 7f3a-bc21-d4e5 — it's completely wrong.",
     {"memory_id": "7f3a-bc21-d4e5"}),
    ("Get rid of entry ID: ent-2024-991. It was saved by mistake.",
     {"memory_id": "ent-2024-991"}),
    ("Forget memory ID mem-portal-007.",
     {"memory_id": "mem-portal-007"}),
    ("Delete that specific memory entry with ID 'sess-42-bad'.",
     {"memory_id": "sess-42-bad"}),
    ("The entry tagged as 'exp-001' is outdated. Please remove it.",
     {"memory_id": "exp-001"}),
    ("Wipe memory entry ID = abc123xyz.",
     {"memory_id": "abc123xyz"}),
    ("Remove the wrong entry — its ID is MEM-2025-0042.",
     {"memory_id": "MEM-2025-0042"}),
    ("That entry we saved earlier about the broken migration config — the ID is err-cfg-88, delete it.",
     {"memory_id": "err-cfg-88"}),
    ("Clear out memory ID: old-deploy-notes-19.",
     {"memory_id": "old-deploy-notes-19"}),
]
for user, args in forget_cases:
    rows.append(msg(user, "session_forget_memory", args))

print(f"After GROUP D2 (forget+id): {len(rows)} rows")

# ── GROUP D3: session_task_route — extract task_description from implicit phrasing ──
# Each case is (user prompt, expected tool arguments).
task_route_desc_cases = [
    ("Route this refactoring task — if local, proceed; if cloud, just tell me.",
     {"task_description": "refactoring task"}),
    ("Is the on-device model good enough for a full GraphQL schema migration?",
     {"task_description": "GraphQL schema migration"}),
    ("This linting fix across 200 files — local or cloud?",
     {"task_description": "linting fix across 200 files"}),
    ("Route: generating embeddings for 10k documents.",
     {"task_description": "generating embeddings for 10k documents"}),
    ("Should the local model handle writing unit tests for the auth module?",
     {"task_description": "writing unit tests for auth module"}),
]
for user, args in task_route_desc_cases:
    rows.append(msg(user, "session_task_route", args))

print(f"After GROUP D3 (task_route desc): {len(rows)} rows")

# ── GROUP D4: session_save_ledger — extract project + summary from natural phrasing ──
# Each case is (user prompt, expected tool arguments); conversation_id is
# included only when the prompt states one.
ledger_natural_cases = [
    ("We just finished a big refactor of the billing module. Make sure it's written down. Project: billing, conv: conv-bill-2024.",
     {"project": "billing", "conversation_id": "conv-bill-2024", "summary": "Completed big refactor of billing module"}),
    ("Before I hand off, save what we did today: fixed the OAuth flow and updated tests. Project is portal.",
     {"project": "portal", "summary": "Fixed OAuth flow and updated tests"}),
    ("We're done for the day. Log what we accomplished on the analytics project.",
     {"project": "analytics", "summary": "Session progress logged"}),
    ("Jot this down for the prism-training project: we built and validated the v43 corpus.",
     {"project": "prism-training", "summary": "Built and validated v43 corpus"}),
    ("Log session for ios-app project: completed llama.cpp Swift integration.",
     {"project": "ios-app", "summary": "Completed llama.cpp Swift integration"}),
    ("Write it down — we migrated auth to OAuth2 in the portal project. conv-id: sess-99.",
     {"project": "portal", "conversation_id": "sess-99", "summary": "Migrated auth to OAuth2"}),
    ("End of session for prism-mcp. We deployed v3 MCP server and ran smoke tests.",
     {"project": "prism-mcp", "summary": "Deployed v3 MCP server and ran smoke tests"}),
    ("Record this session: we fixed the race condition in the payments module. project=payments.",
     {"project": "payments", "summary": "Fixed race condition in payments module"}),
]
for user, args in ledger_natural_cases:
    rows.append(msg(user, "session_save_ledger", args))

print(f"After GROUP D4 (ledger natural): {len(rows)} rows")

# ── GROUP D5: backfill_links + synthesize_edges — verifier distinctions ──
# Unlike the other groups, each case carries its own tool name:
# (user prompt, tool name, expected tool arguments).
verifier_cases = [
    ("Backfill the missing cross-session links for the analytics project.",
     "session_backfill_links", {"project": "analytics"}),
    ("Reconnect the dangling session references for the billing project.",
     "session_backfill_links", {"project": "billing"}),
    ("Patch up the link gaps in our session history for prism-training.",
     "session_backfill_links", {"project": "prism-training"}),
    ("Fix the broken links between sessions in the portal project.",
     "session_backfill_links", {"project": "portal"}),
    ("There are dangling references in ios-app sessions. Backfill them.",
     "session_backfill_links", {"project": "ios-app"}),
    ("Reconnect missing session cross-references for prism-mcp.",
     "session_backfill_links", {"project": "prism-mcp"}),
    ("Run a synthesis pass on the prism-mcp project to make sure all edges are up to date.",
     "session_synthesize_edges", {"project": "prism-mcp"}),
    ("Before we close out, verify all the session links are consistent for the portal project.",
     "session_synthesize_edges", {"project": "portal"}),
    ("Synthesize the session graph for analytics to build fresh edge connections.",
     "session_synthesize_edges", {"project": "analytics"}),
    ("Verify graph integrity — synthesize edges for the ios-app project.",
     "session_synthesize_edges", {"project": "ios-app"}),
    ("Do a graph synthesis on the billing sessions to update the relationship map.",
     "session_synthesize_edges", {"project": "billing"}),
    ("Build the session edge graph for prism-training.",
     "session_synthesize_edges", {"project": "prism-training"}),
]
for user, tool, args in verifier_cases:
    rows.append(msg(user, tool, args))

print(f"After GROUP D5 (verifier): {len(rows)} rows")

# ── GROUP D6: session_search_memory — "remind me / did we ever decide" phrasing ──
# Each case is (user prompt, expected tool arguments); the query is a
# keyword-style condensation of the question.
remind_search_cases = [
    ("Remind me — did we ever decide between Redis and Memcached for caching?",
     {"query": "Redis vs Memcached caching decision"}),
    ("What did we decide about the database schema for the billing module?",
     {"query": "database schema billing module decision"}),
    ("Did we settle on which logging library to use for the portal?",
     {"query": "logging library portal decision"}),
    ("Remind me what we concluded about microservices vs monolith.",
     {"query": "microservices vs monolith conclusion"}),
    ("What was our decision on the auth token expiry?",
     {"query": "auth token expiry decision"}),
    ("Did we ever document our stance on using TypeScript strict mode?",
     {"query": "TypeScript strict mode decision"}),
    ("Can you remind me what we figured out about the WebSocket reconnection strategy?",
     {"query": "WebSocket reconnection strategy"}),
    ("What did we decide to do about the rate limiting approach?",
     {"query": "rate limiting approach decision"}),
    ("Remind me of our agreed-on approach for handling API errors.",
     {"query": "API error handling approach"}),
    ("Did we ever land on a pagination strategy for the search API?",
     {"query": "pagination strategy search API"}),
]
for user, args in remind_search_cases:
    rows.append(msg(user, "session_search_memory", args))

print(f"After GROUP D6 (remind/search): {len(rows)} rows")

# Write output as JSONL: one JSON object per line.
# The encoding is pinned so the file does not depend on the platform's
# locale default.
with OUT.open("w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

print(f"\nWrote {len(rows)} patch4 rows to {OUT}")

# Verify format by reading the first row back and printing a few sanity
# checks (only the first row is inspected).
with OUT.open(encoding="utf-8") as f:
    first = json.loads(f.readline())
print(f"First row keys: {list(first.keys())}")
print(f"Text starts with: {first['text'][:50]}")
print(f"Think close tag present: {'</|synalux_think|>' in first['text']}")
print(f"Bad think close absent: {'</{' not in first['text']}")