dcostenco committed on
Commit
d8c2900
·
verified ·
1 Parent(s): 30020c5

Add training/build_4b_v43_patch4.py

Browse files
Files changed (1) hide show
  1. training/build_4b_v43_patch4.py +262 -0
training/build_4b_v43_patch4.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Patch4 corpus for prism-coder:4b-v43.
4
+ Targets swe-bench failures not fixable by Layer 3:
5
+ - Group C: session_task_route false negatives (implicit routing questions)
6
+ - Group D: param extraction from natural/informal phrasing
7
+
8
+ Output format: {"text": "<ChatML string>"} to match existing train.jsonl
9
+ """
10
+ import json, pathlib
11
+
12
# Destination file for the generated corpus; the script writes one JSON
# object per line to it.
OUT = pathlib.Path("/tmp/4b_v43_patch4.jsonl")

# Prism Memory tool names listed in the system prompt, in prompt order.
_MEMORY_TOOLS = (
    "session_save_ledger",
    "session_load_context",
    "session_search_memory",
    "session_save_handoff",
    "session_forget_memory",
    "session_health_check",
    "session_compact_ledger",
    "session_export_memory",
    "session_task_route",
    "session_save_experience",
    "session_synthesize_edges",
    "session_backfill_links",
    "knowledge_search",
    "knowledge_forget",
    "knowledge_upvote",
    "knowledge_downvote",
    "knowledge_set_retention",
)

# Multimodal tool modules listed in the system prompt, in prompt order.
_MULTIMODAL_MODULES = (
    "image_gen",
    "office",
    "web_scraper",
    "browser",
    "tts",
    "ocr",
    "git",
    "terminal",
    "deps_scanner",
    "hipaa",
    "data_graph",
    "templates",
    "pdf_parser",
)

# System prompt placed in the system turn of every rendered row. The module
# count in the text is derived from the tuple above so the two cannot drift.
SYS = "".join(
    (
        "You are Synalux, a memory-augmented coding and clinical reasoning assistant. ",
        "You have access to Prism Memory tools (",
        ", ".join(_MEMORY_TOOLS),
        f") and {len(_MULTIMODAL_MODULES)} multimodal tool modules (",
        ", ".join(_MULTIMODAL_MODULES),
        "). ",
        "Think step-by-step before answering. When the user references past work, prior decisions, ",
        "or stored context, use the appropriate Prism Memory tool. ",
        "Format tool calls inside <tool_call>...</tool_call> JSON blocks with fields 'name' and 'arguments'. ",
        "If no tool is needed, answer directly in plain text. ",
        "ABSTAIN for general programming questions, CS concepts, greetings, and capability questions.",
    )
)

# Delimiters wrapped around the assistant's reasoning preamble.
THINK_OPEN = "<|synalux_think|>"
THINK_CLOSE = "</|synalux_think|>"
32
+
33
+
34
def render_chatml(user_text, assistant_text):
    """Render one system/user/assistant exchange as a single ChatML string.

    The system turn is filled with the module-level ``SYS`` prompt. Each turn
    has the form ``<|im_start|>ROLE\\nCONTENT<|im_end|>``; turns are separated
    by one newline and nothing follows the final ``<|im_end|>``.

    Args:
        user_text: Content of the user turn.
        assistant_text: Content of the assistant turn.

    Returns:
        The concatenated three-turn ChatML string.
    """
    turns = (
        ("system", SYS),
        ("user", user_text),
        ("assistant", assistant_text),
    )
    return "\n".join(
        f"<|im_start|>{role}\n{content}<|im_end|>" for role, content in turns
    )
40
+
41
+
42
def msg(user, tool_name, args):
    """Build a training row whose assistant turn is a single tool call.

    The assistant turn consists of a templated reasoning preamble wrapped in
    ``THINK_OPEN``/``THINK_CLOSE`` followed by a ``<tool_call>`` block holding
    the JSON-encoded ``{"name": ..., "arguments": ...}`` object.

    Args:
        user: The user prompt.
        tool_name: Name of the tool to call; underscores are shown as spaces
            in the reasoning preamble.
        args: JSON-serialisable arguments for the tool call.

    Returns:
        ``{"text": <ChatML string>}``.
    """
    payload = json.dumps({"name": tool_name, "arguments": args})
    readable_tool = tool_name.replace("_", " ")
    reasoning = (
        f"{THINK_OPEN}The user wants {readable_tool}. "
        f"I'll call the tool with the right parameters.{THINK_CLOSE}"
    )
    reply = "\n".join((reasoning, "<tool_call>", payload, "</tool_call>"))
    return {"text": render_chatml(user, reply)}
47
+
48
+
49
def no_tool_msg(user, reply):
    """Build a training row whose assistant turn answers without any tool.

    The assistant turn is a fixed reasoning preamble (wrapped in
    ``THINK_OPEN``/``THINK_CLOSE``), a newline, then ``reply`` verbatim.

    Args:
        user: The user prompt.
        reply: The plain-text answer the assistant should give.

    Returns:
        ``{"text": <ChatML string>}``.
    """
    # NOTE(review): no call to this helper appears in the visible script.
    reasoning = (
        f"{THINK_OPEN}This is a general programming/CS question. "
        f"I should answer directly without calling any Prism tool.{THINK_CLOSE}"
    )
    return {"text": render_chatml(user, "\n".join((reasoning, reply)))}
53
+
54
+
55
# Accumulates every generated {"text": ...} row across all groups below.
rows = []

# ── GROUP C: session_task_route — implicit "should I/local/cloud?" questions ──
# Each case is (user prompt, expected tool-call arguments). The em dashes in
# the prompts are real U+2014 characters, not mis-decoded byte sequences.
task_route_cases = [
    ("Should I handle this CSS grid refactor myself or punt it to the cloud model?",
     {"task_description": "CSS grid refactor", "prefer_local": False}),
    ("Is this bug fix simple enough for the local model to tackle?",
     {"task_description": "bug fix"}),
    ("Can the on-device model handle parsing this 50MB XML dump, or should I escalate to cloud?",
     {"task_description": "parsing 50MB XML dump", "prefer_local": False}),
    ("Should I let the local model handle this TypeScript refactor or route it to the bigger model?",
     {"task_description": "TypeScript refactor", "prefer_local": False}),
    ("Is writing a full test suite for the billing module something the local model can do well?",
     {"task_description": "writing test suite for billing module"}),
    ("This SQL query optimization seems complex — local or cloud?",
     {"task_description": "SQL query optimization"}),
    ("Route this: rewriting the authentication middleware from scratch.",
     {"task_description": "rewriting authentication middleware from scratch"}),
    ("Should the local agent handle this performance profiling task?",
     {"task_description": "performance profiling task"}),
    ("Is a full-text semantic search implementation doable on-device?",
     {"task_description": "full-text semantic search implementation"}),
    ("Local or cloud for this multi-file refactor of the payments module?",
     {"task_description": "multi-file refactor of payments module", "prefer_local": False}),
    ("Can the on-device model handle a 3000-line code review?",
     {"task_description": "3000-line code review"}),
    ("Route this debugging task: tracing intermittent race conditions in async code.",
     {"task_description": "tracing intermittent race conditions in async code"}),
    ("Should I send this architecture review to the cloud model or keep it local?",
     {"task_description": "architecture review", "prefer_local": False}),
    ("Is this data pipeline migration too complex for local inference?",
     {"task_description": "data pipeline migration"}),
    ("Handle or escalate? — writing API documentation for 40 endpoints.",
     {"task_description": "writing API documentation for 40 endpoints"}),
]
rows.extend(
    msg(prompt, "session_task_route", call_args)
    for prompt, call_args in task_route_cases
)

print(f"After GROUP C (task_route): {len(rows)} rows")
94
+
95
# ── GROUP D1: session_export_memory — extract output_dir from casual phrasing ──
# Each case is (user prompt, expected tool-call arguments).
# NOTE(review): this header names the parameter "output_dir", but every row
# below emits the key "output_path". Confirm which name the tool schema uses
# before training on these rows.
export_cases = [
    ("Dump everything to /tmp/backup so I can archive it. JSON format.",
     {"output_path": "/tmp/backup", "format": "json"}),
    ("Back up all my memories to /var/exports before I wipe the session.",
     {"output_path": "/var/exports"}),
    ("Export the prism-mcp project memory to /tmp/prism-export in markdown format.",
     {"project": "prism-mcp", "output_path": "/tmp/prism-export", "format": "markdown"}),
    ("I want to export a backup and then compact the old entries — save it to /tmp/mem-backup.",
     {"output_path": "/tmp/mem-backup"}),
    ("Export everything from the billing project to /tmp/billing-backup.json.",
     {"project": "billing", "output_path": "/tmp/billing-backup.json"}),
    ("Can you dump the analytics project sessions to /home/user/exports/analytics?",
     {"project": "analytics", "output_path": "/home/user/exports/analytics"}),
    ("Save a memory backup to /tmp/export before we start the migration.",
     {"output_path": "/tmp/export"}),
    ("Archive the prism-training project to /tmp/training-archive in JSON.",
     {"project": "prism-training", "output_path": "/tmp/training-archive", "format": "json"}),
]
rows.extend(
    msg(prompt, "session_export_memory", call_args)
    for prompt, call_args in export_cases
)

print(f"After GROUP D1 (export): {len(rows)} rows")
118
+
119
# ── GROUP D2: session_forget_memory — extract memory_id from prompt ──
# Each case is (user prompt, expected tool-call arguments); the expected
# memory_id is the identifier exactly as it appears in the prompt.
forget_cases = [
    ("Delete the specific memory entry with ID mem-abc-123.",
     {"memory_id": "mem-abc-123"}),
    ("Remove memory entry 7f3a-bc21-d4e5 — it's completely wrong.",
     {"memory_id": "7f3a-bc21-d4e5"}),
    ("Get rid of entry ID: ent-2024-991. It was saved by mistake.",
     {"memory_id": "ent-2024-991"}),
    ("Forget memory ID mem-portal-007.",
     {"memory_id": "mem-portal-007"}),
    ("Delete that specific memory entry with ID 'sess-42-bad'.",
     {"memory_id": "sess-42-bad"}),
    ("The entry tagged as 'exp-001' is outdated. Please remove it.",
     {"memory_id": "exp-001"}),
    ("Wipe memory entry ID = abc123xyz.",
     {"memory_id": "abc123xyz"}),
    ("Remove the wrong entry — its ID is MEM-2025-0042.",
     {"memory_id": "MEM-2025-0042"}),
    ("That entry we saved earlier about the broken migration config — the ID is err-cfg-88, delete it.",
     {"memory_id": "err-cfg-88"}),
    ("Clear out memory ID: old-deploy-notes-19.",
     {"memory_id": "old-deploy-notes-19"}),
]
rows.extend(
    msg(prompt, "session_forget_memory", call_args)
    for prompt, call_args in forget_cases
)

print(f"After GROUP D2 (forget+id): {len(rows)} rows")
146
+
147
# ── GROUP D3: session_task_route — extract task_description from implicit phrasing ──
# Each case is (user prompt, expected tool-call arguments).
task_route_desc_cases = [
    ("Route this refactoring task — if local, proceed; if cloud, just tell me.",
     {"task_description": "refactoring task"}),
    ("Is the on-device model good enough for a full GraphQL schema migration?",
     {"task_description": "GraphQL schema migration"}),
    ("This linting fix across 200 files — local or cloud?",
     {"task_description": "linting fix across 200 files"}),
    ("Route: generating embeddings for 10k documents.",
     {"task_description": "generating embeddings for 10k documents"}),
    ("Should the local model handle writing unit tests for the auth module?",
     {"task_description": "writing unit tests for auth module"}),
]
rows.extend(
    msg(prompt, "session_task_route", call_args)
    for prompt, call_args in task_route_desc_cases
)

print(f"After GROUP D3 (task_route desc): {len(rows)} rows")
164
+
165
# ── GROUP D4: session_save_ledger — extract project + summary from natural phrasing ──
# Each case is (user prompt, expected tool-call arguments). conversation_id is
# included only for prompts that state one.
ledger_natural_cases = [
    ("We just finished a big refactor of the billing module. Make sure it's written down. Project: billing, conv: conv-bill-2024.",
     {"project": "billing", "conversation_id": "conv-bill-2024", "summary": "Completed big refactor of billing module"}),
    ("Before I hand off, save what we did today: fixed the OAuth flow and updated tests. Project is portal.",
     {"project": "portal", "summary": "Fixed OAuth flow and updated tests"}),
    ("We're done for the day. Log what we accomplished on the analytics project.",
     {"project": "analytics", "summary": "Session progress logged"}),
    ("Jot this down for the prism-training project: we built and validated the v43 corpus.",
     {"project": "prism-training", "summary": "Built and validated v43 corpus"}),
    ("Log session for ios-app project: completed llama.cpp Swift integration.",
     {"project": "ios-app", "summary": "Completed llama.cpp Swift integration"}),
    ("Write it down — we migrated auth to OAuth2 in the portal project. conv-id: sess-99.",
     {"project": "portal", "conversation_id": "sess-99", "summary": "Migrated auth to OAuth2"}),
    ("End of session for prism-mcp. We deployed v3 MCP server and ran smoke tests.",
     {"project": "prism-mcp", "summary": "Deployed v3 MCP server and ran smoke tests"}),
    ("Record this session: we fixed the race condition in the payments module. project=payments.",
     {"project": "payments", "summary": "Fixed race condition in payments module"}),
]
rows.extend(
    msg(prompt, "session_save_ledger", call_args)
    for prompt, call_args in ledger_natural_cases
)

print(f"After GROUP D4 (ledger natural): {len(rows)} rows")
188
+
189
# ── GROUP D5: backfill_links + synthesize_edges — verifier distinctions ──
# Each case is (user prompt, tool name, expected tool-call arguments); unlike
# the other groups the tool varies per row, so it is carried in the tuple.
verifier_cases = [
    ("Backfill the missing cross-session links for the analytics project.",
     "session_backfill_links", {"project": "analytics"}),
    ("Reconnect the dangling session references for the billing project.",
     "session_backfill_links", {"project": "billing"}),
    ("Patch up the link gaps in our session history for prism-training.",
     "session_backfill_links", {"project": "prism-training"}),
    ("Fix the broken links between sessions in the portal project.",
     "session_backfill_links", {"project": "portal"}),
    ("There are dangling references in ios-app sessions. Backfill them.",
     "session_backfill_links", {"project": "ios-app"}),
    ("Reconnect missing session cross-references for prism-mcp.",
     "session_backfill_links", {"project": "prism-mcp"}),
    ("Run a synthesis pass on the prism-mcp project to make sure all edges are up to date.",
     "session_synthesize_edges", {"project": "prism-mcp"}),
    ("Before we close out, verify all the session links are consistent for the portal project.",
     "session_synthesize_edges", {"project": "portal"}),
    ("Synthesize the session graph for analytics to build fresh edge connections.",
     "session_synthesize_edges", {"project": "analytics"}),
    ("Verify graph integrity — synthesize edges for the ios-app project.",
     "session_synthesize_edges", {"project": "ios-app"}),
    ("Do a graph synthesis on the billing sessions to update the relationship map.",
     "session_synthesize_edges", {"project": "billing"}),
    ("Build the session edge graph for prism-training.",
     "session_synthesize_edges", {"project": "prism-training"}),
]
rows.extend(
    msg(prompt, tool, call_args)
    for prompt, tool, call_args in verifier_cases
)

print(f"After GROUP D5 (verifier): {len(rows)} rows")
220
+
221
# ── GROUP D6: session_search_memory — "remind me / did we ever decide" phrasing ──
# Each case is (user prompt, expected tool-call arguments); the expected query
# is a keyword condensation of the question.
remind_search_cases = [
    ("Remind me — did we ever decide between Redis and Memcached for caching?",
     {"query": "Redis vs Memcached caching decision"}),
    ("What did we decide about the database schema for the billing module?",
     {"query": "database schema billing module decision"}),
    ("Did we settle on which logging library to use for the portal?",
     {"query": "logging library portal decision"}),
    ("Remind me what we concluded about microservices vs monolith.",
     {"query": "microservices vs monolith conclusion"}),
    ("What was our decision on the auth token expiry?",
     {"query": "auth token expiry decision"}),
    ("Did we ever document our stance on using TypeScript strict mode?",
     {"query": "TypeScript strict mode decision"}),
    ("Can you remind me what we figured out about the WebSocket reconnection strategy?",
     {"query": "WebSocket reconnection strategy"}),
    ("What did we decide to do about the rate limiting approach?",
     {"query": "rate limiting approach decision"}),
    ("Remind me of our agreed-on approach for handling API errors.",
     {"query": "API error handling approach"}),
    ("Did we ever land on a pagination strategy for the search API?",
     {"query": "pagination strategy search API"}),
]
rows.extend(
    msg(prompt, "session_search_memory", call_args)
    for prompt, call_args in remind_search_cases
)

print(f"After GROUP D6 (remind/search): {len(rows)} rows")
248
+
249
# Write output: serialise every row as one JSON object per line (JSONL).
with OUT.open("w") as sink:
    sink.writelines(json.dumps(row) + "\n" for row in rows)

print(f"\nWrote {len(rows)} patch4 rows to {OUT}")
255
+
256
# Verify format: re-read the first written row and print a few sanity checks.
with OUT.open() as source:
    first_line = source.readline()

first = json.loads(first_line)
sample_text = first["text"]

# Evaluate the tag checks up front so the f-strings below stay simple.
has_think_close = "</|synalux_think|>" in sample_text
lacks_bad_close = "</{" not in sample_text

print(f"First row keys: {list(first.keys())}")
print(f"Text starts with: {sample_text[:50]}")
print(f"Think close tag present: {has_think_close}")
print(f"Bad think close absent: {lacks_bad_close}")