dcostenco commited on
Commit
09252eb
·
verified ·
1 Parent(s): 324921d

Add training/swe_bench_test.py

Browse files
Files changed (1) hide show
  1. training/swe_bench_test.py +747 -0
training/swe_bench_test.py ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ SWE-Bench Inspired Blind Evaluation for prism-coder (the exact model tag is set by MODEL below)
4
+
5
+ Unlike the real-life test (which had training overlap), these prompts are:
6
+ 1. Completely novel — never seen in any training data
7
+ 2. Realistic — mimic actual user interactions
8
+ 3. Ambiguous — some have keyword traps or context-dependent meanings
9
+ 4. Multi-intent — some require the model to pick the most appropriate tool
10
+ 5. Adversarial — designed to confuse tool vs reasoning boundaries
11
+
12
+ Scoring follows SWE-bench methodology:
13
+ - Strict match: correct tool name + all required params present
14
+ - Partial match: correct tool name + some params
15
+ - Wrong tool: incorrect tool name (regardless of params)
16
+ - False positive: tool called when none should be
17
+ - False negative: no tool called when one should be
18
+ """
19
+ import subprocess
20
+ import json
21
+ import re
22
+ import time
23
+ import sys
24
+ import random
25
+ import urllib.request
26
+ import statistics
27
+
28
# Ollama model tag sent as the "model" field of every generate request.
# NOTE(review): the banner printed by main() hard-codes "prism-coder:7b",
# which does not match this tag — confirm which one is intended.
MODEL = "prism-coder:4b-v43"
# Ollama REST endpoint that call_ollama() POSTs to (non-streaming generate).
OLLAMA_API = "http://localhost:11434/api/generate"
30
+
31
# === BLIND TEST CASES (never in training data) ===
# Format: (prompt, expected_tool_or_NO_TOOL, required_params, category)
#   prompt          - user message sent to the model
#   expected tool   - tool name the model should call, or "NO_TOOL" to abstain
#   required_params - argument names that must be present for a strict pass
#   category        - label used for the per-category breakdown in main()
BLIND_TESTS = [
    # ====== CATEGORY 1: Natural user phrasing — tool needed (15 tests) ======
    ("Hey, I want to start a new session. Pull up everything we had on the synalux project.",
     "session_load_context", ["project"], "natural_phrasing"),

    ("Can you jot down what we accomplished? We rewrote the webhook handler and fixed 3 edge cases.",
     "session_save_ledger", ["summary"], "natural_phrasing"),

    ("I'm handing this off to the night shift. Make sure they know where we left off on prism-mcp.",
     "session_save_handoff", ["project"], "natural_phrasing"),

    ("Remind me — did we ever decide between Redis and Memcached for the session store?",
     "session_search_memory", ["query"], "natural_phrasing"),

    ("That memory entry about the old deployment script is totally wrong. Nuke it.",
     "session_forget_memory", ["memory_id"], "natural_phrasing"),

    ("Is everything OK with the memory backend? Run diagnostics.",
     "session_health_check", [], "natural_phrasing"),

    ("Any institutional knowledge about how we handle rate limiting?",
     "knowledge_search", ["query"], "natural_phrasing"),

    ("The ledger is getting huge. Summarize and archive the old stuff for billing-portal.",
     "session_compact_ledger", ["project"], "natural_phrasing"),

    ("Dump everything to a file so I can back it up. JSON format, save to /tmp/prism-backup.",
     "session_export_memory", ["output_path", "format"], "natural_phrasing"),

    ("Should I handle this CSS grid refactor myself or punt it to the local model?",
     "session_task_route", ["task_description"], "natural_phrasing"),

    # Additional natural phrasing (indirect/conversational)
    ("Where were we on the portal project? Bring me up to speed.",
     "session_load_context", ["project"], "natural_phrasing"),

    ("We just finished a big refactor. Make sure it's written down for posterity.",
     "session_save_ledger", [], "natural_phrasing"),

    ("Go look through our old conversations and find anything about the payment gateway.",
     "session_search_memory", ["query"], "natural_phrasing"),

    ("Get rid of that wrong entry we saved about the broken migration.",
     "session_forget_memory", ["memory_id"], "natural_phrasing"),

    ("Is this bug fix simple enough for the local model to handle?",
     "session_task_route", ["task_description"], "natural_phrasing"),

    # ====== CATEGORY 2: Adversarial keyword traps — NO tool (15 tests) ======
    ("How do I implement a session manager in Express.js with Redis as the backing store?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain the concept of memory management in Rust — borrowing, ownership, and lifetimes.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What's the best way to save user preferences in a React Native app?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write a function that searches through a knowledge graph using BFS.",
     "NO_TOOL", [], "adversarial_trap"),

    ("How does garbage collection work in Go vs Java?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Can you explain the compact representation of sparse matrices?",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is the health check endpoint pattern in microservices?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I export data from PostgreSQL to a CSV file?",
     "NO_TOOL", [], "adversarial_trap"),

    # NEW adversarial traps — high-risk keywords
    ("How do I create a session in PHP using session_start()?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Write me a Python context manager for database connections.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What's the difference between saving to disk vs saving to memory in SQLite?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I implement search functionality with Elasticsearch?",
     "NO_TOOL", [], "adversarial_trap"),

    ("Explain how to load balance across multiple Node.js processes.",
     "NO_TOOL", [], "adversarial_trap"),

    ("What is the forget gate in an LSTM neural network?",
     "NO_TOOL", [], "adversarial_trap"),

    ("How do I route tasks in Celery to different queues?",
     "NO_TOOL", [], "adversarial_trap"),

    # ====== CATEGORY 3: Disambiguation — correct tool choice (8 tests) ======
    ("Search for anything we discussed about the authentication overhaul last month.",
     "session_search_memory", ["query"], "disambiguation"),

    ("I need to know if our knowledge base has anything on Kubernetes pod autoscaling.",
     "knowledge_search", ["query"], "disambiguation"),

    # NEW: forget tool disambiguation
    ("Delete the specific memory entry with ID mem-abc-123.",
     "session_forget_memory", ["memory_id"], "disambiguation"),

    ("Wipe out all old debugging entries from the prism-mcp project.",
     "knowledge_forget", ["project"], "disambiguation"),

    # NEW: save tool disambiguation
    ("We're done for the day. Log what we accomplished.",
     "session_save_ledger", [], "disambiguation"),

    ("Pass this project to the next developer. Save the handoff state.",
     "session_save_handoff", ["project"], "disambiguation"),

    # NEW: search tool disambiguation
    ("What do our curated knowledge items say about error handling best practices?",
     "knowledge_search", ["query"], "disambiguation"),

    ("Did we discuss anything about caching in our recent sessions?",
     "session_search_memory", ["query"], "disambiguation"),

    # ====== CATEGORY 4: Edge cases (8 tests) ======
    ("Load context.",
     "session_load_context", [], "edge_case"),

    ("Save.",
     "session_save_ledger", [], "edge_case"),

    ("What tools do you have available?",
     "NO_TOOL", [], "edge_case"),

    ("Tell me about yourself.",
     "NO_TOOL", [], "edge_case"),

    # NEW edge cases
    ("Hello!",
     "NO_TOOL", [], "edge_case"),

    ("Thanks, that's all for now.",
     "NO_TOOL", [], "edge_case"),

    ("Search.",
     "session_search_memory", ["query"], "edge_case"),

    ("Check health.",
     "session_health_check", [], "edge_case"),

    # ====== CATEGORY 5: Multi-tool / complex intent (4 tests) ======
    # Only the first tool of the request is scored.
    ("Find all our past notes about the billing API redesign and check if the memory DB is healthy.",
     "session_search_memory", ["query"], "multi_intent"),

    ("Load the prism project context and then save a note that we started the migration.",
     "session_load_context", ["project"], "multi_intent"),

    ("Before I hand off, save what we did today: fixed the OAuth flow and updated tests.",
     "session_save_ledger", ["summary"], "multi_intent"),

    ("I want to export a backup and then compact the old entries.",
     "session_export_memory", [], "multi_intent"),

    # ====== CATEGORY 6: Verifier patterns (8 tests) ======
    # Verifier = synthesize_edges, backfill_links, health_check used to verify/validate state

    ("Before we close out, verify all the session links are consistent for the portal project.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Run a synthesis pass on the prism-mcp project to make sure all edges are up to date.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Backfill the missing cross-session links for the analytics project.",
     "session_backfill_links", ["project"], "verifier"),

    ("Reconnect the dangling session references for the billing project.",
     "session_backfill_links", ["project"], "verifier"),

    ("Make sure the memory system is healthy before I start a new session.",
     "session_health_check", [], "verifier"),

    ("Verify graph integrity — synthesize edges for the ios-app project.",
     "session_synthesize_edges", ["project"], "verifier"),

    ("Is the memory backend responding correctly?",
     "session_health_check", [], "verifier"),

    ("Patch up the link gaps in our session history for prism-training.",
     "session_backfill_links", ["project"], "verifier"),

    # ====== CATEGORY 7: Cascade patterns (10 tests) ======
    # Cascade = first step of a multi-step chain — model must pick the right first tool

    ("Search our knowledge base for Redis caching patterns, then upvote the best result.",
     "knowledge_search", ["query"], "cascade"),

    ("Load context for the portal project, search for any open issues, then save a handoff.",
     "session_load_context", ["project"], "cascade"),

    ("Check memory health, then compact the ledger if there are stale entries.",
     "session_health_check", [], "cascade"),

    ("Export everything from the billing project, then set a 60-day retention policy on it.",
     "session_export_memory", ["project"], "cascade"),  # output_path not in prompt, only project

    ("Search for what we decided about authentication, then save a handoff note about it.",
     "session_search_memory", ["query"], "cascade"),

    ("Save this session's progress, then create a handoff for the next agent.",
     "session_save_ledger", [], "cascade"),

    ("Route this refactoring task — if local, proceed; if cloud, just tell me.",
     "session_task_route", ["task_description"], "cascade"),

    ("Search knowledge for WebSocket patterns, downvote anything about long-polling.",
     "knowledge_search", ["query"], "cascade"),

    ("Compact the prism-mcp ledger and then synthesize the session edges.",
     "session_compact_ledger", [], "cascade"),

    ("Load the analytics project context and then log that we shipped the v4 dashboard.",
     "session_load_context", ["project"], "cascade"),
]
255
+
256
+ TOOL_CALL_RE = re.compile(
257
+ r'<\|tool_call\|>\s*(\{.*\})',
258
+ re.DOTALL
259
+ )
260
+ # v43 model uses <tool_call> (no pipes) — strip CoT first, then match
261
+ NO_PIPE_TOOL_CALL_RE = re.compile(
262
+ r'<tool_call>\s*(\{.*?\})\s*(?:</tool_call>|$)',
263
+ re.DOTALL
264
+ )
265
+
266
+ def call_ollama(prompt: str, timeout: int = 120) -> tuple:
267
+ """Call ollama REST API and return (raw_response, parsed_tool_name, parsed_args, latency)."""
268
+ start = time.time()
269
+ try:
270
+ payload = json.dumps({
271
+ "model": MODEL,
272
+ "prompt": prompt,
273
+ "stream": False,
274
+ "raw": True,
275
+ "options": {"temperature": 0.0, "num_predict": 512}
276
+ }).encode("utf-8")
277
+
278
+ req = urllib.request.Request(
279
+ OLLAMA_API,
280
+ data=payload,
281
+ headers={"Content-Type": "application/json"}
282
+ )
283
+
284
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
285
+ data = json.loads(resp.read().decode("utf-8"))
286
+ raw = data.get("response", "").strip()
287
+ except Exception as e:
288
+ return (str(e), "ERROR", {}, time.time() - start)
289
+
290
+ latency = time.time() - start
291
+
292
+ # Strip CoT blocks before parsing
293
+ clean_raw = re.sub(r'<\|synalux_think\|>.*?(?:</\|synalux_think\|>|$)', '', raw, flags=re.DOTALL)
294
+
295
+ # Strategy 0: no-pipe <tool_call> format (v43 model)
296
+ no_pipe_match = NO_PIPE_TOOL_CALL_RE.search(clean_raw)
297
+ if no_pipe_match:
298
+ try:
299
+ tool_json = json.loads(no_pipe_match.group(1))
300
+ tool_name = tool_json.get("name", tool_json.get("tool", "UNKNOWN"))
301
+ tool_args = tool_json.get("arguments", tool_json.get("args", {}))
302
+ return (raw, tool_name, tool_args, latency)
303
+ except json.JSONDecodeError:
304
+ pass
305
+
306
+ # Strategy 1: piped <|tool_call|> format
307
+ match = TOOL_CALL_RE.search(clean_raw)
308
+ if match:
309
+ try:
310
+ tool_json = json.loads(match.group(1))
311
+ tool_name = tool_json.get("name", tool_json.get("tool", "UNKNOWN"))
312
+ tool_args = tool_json.get("arguments", tool_json.get("args", {}))
313
+ return (raw, tool_name, tool_args, latency)
314
+ except json.JSONDecodeError:
315
+ pass
316
+
317
+ # Fallback: try to find JSON with "name" key containing nested braces
318
+ json_re = re.search(r'(\{[^{}]*"name"\s*:\s*"[^"]+?"[^{}]*(?:\{[^{}]*\}[^{}]*)*\})', raw)
319
+ if json_re:
320
+ try:
321
+ tool_json = json.loads(json_re.group(0))
322
+ tool_name = tool_json.get("name", "UNKNOWN")
323
+ tool_args = tool_json.get("arguments", tool_json.get("args", {}))
324
+ return (raw, tool_name, tool_args, latency)
325
+ except json.JSONDecodeError:
326
+ pass
327
+
328
+ return (raw, "NO_TOOL", {}, latency)
329
+
330
+
331
# === LAYER 3: Inference-Time False Positive Rejection ===
# Catches cases where the model hallucinates a tool call on general programming prompts.
# The rejection step is a lightweight heuristic — it only rejects, never adds tool calls.
# validate_tool_call() additionally remaps a few commonly confused tools.

# Patterns that strongly indicate a general programming question (NOT Prism)
GENERAL_PROGRAMMING_PATTERNS = [
    # Python context managers — not Prism context loading
    r'\bcontext\s+manager\b', r'\bcontextlib\b', r'\b__enter__\b', r'\b__exit__\b',
    r'\basync\s+context\s+manager\b',
    # ML/LSTM forget gates — not Prism memory deletion
    r'\bforget\s+gate\b', r'\blstm\b', r'\bcatastrophic\s+forgetting\b',
    r'\bforget\s+bias\b', r'\belastic\s+weight\s+consolidation\b',
    # Web framework sessions — not Prism sessions
    r'\bexpress\.js\b', r'\bdjango\b', r'\bflask\b', r'\bsession_start\(\)',
    r'\bsession\s+middleware\b', r'\bsession\s+affinity\b',
    # General CS concepts that overlap with tool names
    r'\bgarbage\s+collection\b', r'\bmemory\s+management\s+in\s+rust\b',
    r'\bload\s+balanc', r'\bcontext\s+switch',
    r'\bsearch\s+algorithm\b', r'\bsearch\s+functionality\s+with\s+elasticsearch\b',
    r'\bhealth\s+check\s+endpoint\s+pattern\b',
    # Group A: swe-bench false positives
    r'\bcelery\b.*\bqueue', r'\broute\s+tasks?\s+in\s+celery\b',
    r'\bknowledge\s+graph\b.*\b(?:function|search|algorithm|traversal)\b',
    r'\b(?:function|write\s+a\s+function|implement)\b.*\bknowledge\s+graph\b',
    r'\bsave\s+(?:user\s+)?preferences?\s+in\s+(?:react|redux|localstorage|a\s+database)\b',
    r'\bexport\s+(?:data\s+)?from\s+(?:postgresql|mysql|sqlite|a\s+database)\b',
    r'\bpostgresql\b.*\bcsv\b', r'\bcsv\b.*\bpostgresql\b',
]

# Patterns that confirm Prism-specific intent (overrides rejection)
PRISM_INTENT_PATTERNS = [
    r'\bprism\b', r'\bsession\s*ledger\b', r'\bhandoff\b', r'\bknowledge\s+base\b',
    r'\bknowledge\s+items?\b', r'\bour\s+knowledge\b',
    r'\bsave.*(?:session|ledger|handoff)\b', r'\bload\s+context\b',
    r'\b(?:search|find).*(?:memory|sessions?|conversations?|notes)\b',
    r'\bproject\b', r'\bwhat\s+(?:do\s+)?we\s+(?:know|have)\b',
    r'\binstitutional\s+knowledge\b', r'\bdocumented\b', r'\bcurated\b',
    r'\bmemory\s+entry\b', r'\bmemory\s+backend\b', r'\bdiagnostics\b',
    r'\bledger\b', r'\bcompact\b.*(?:ledger|entries|session)\b',
    r'\bexport.*(?:memory|backup)\b', r'\b(?:delete|nuke|wipe|remove).*(?:entry|memory|entries)\b',
    r'\blog.*(?:what|accomplished|session)\b', r'\brecord.*(?:session|what)\b',
    r'\bhand.*(?:off|over)\b', r'\bbring.*up\s+to\s+speed\b',
    r'\bbug\s+fix.*(?:local\s+model|handle)\b', r'\broute.*(?:task|this)\b',
]

# --- Compiled helpers for validate_tool_call (module level so they compile once) ---

# Wording that points at link repair rather than edge synthesis.
_BACKFILL_HINT_RE = re.compile(
    r'\b(?:reconnect|backfill|patch\s+up|dangling|link\s+gaps?|missing\s+links?|fix\s+links?)\b'
)
# "verify/check that session links are consistent" style requests.
_VERIFY_CONSISTENT_RE = re.compile(
    r'\b(?:verify|validate|check)\b.{0,40}\b(?:links?\s+(?:are\s+)?consistent|edges?\s+up\s+to\s+date|graph\s+integrity|session\s+links?)\b',
    re.DOTALL
)
# A concrete memory id in the prompt (matched against the original-case prompt).
_MEMORY_ID_RE = re.compile(r'\bmemory[_\s]id\b|mem-[a-z0-9]|ID\s*[=:]\s*\S+')
# "<prep> [the] <name> project" extraction, one variant per remap.
_FORGET_PROJECT_RE = re.compile(r'(?:for|from|in)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project')
_LOAD_PROJECT_RE = re.compile(r'\b(?:on|for|with)\s+(?:the\s+)?([a-zA-Z][a-zA-Z0-9_-]+)\s+project\b')
# "resume where we left off" phrasing.
_RESUME_RE = re.compile(
    r'\bwhere\s+were\s+we\b|\bbring\s+me\s+up\s+to\s+speed\b|\bcatch\s+me\s+up\b|\bwhat\s+were\s+we\s+(?:doing|working)'
)
_UPVOTE_RE = re.compile(
    r'\b(?:upvote|boost|increase\s+(?:its\s+)?(?:rank|score|importance)|uprate|thumbs[\s-]?up)\b'
)
_DOWNVOTE_RE = re.compile(
    r'\b(?:downvote|lower\s+(?:its\s+)?(?:rank|score)|not\s+useful|derank|thumbs[\s-]?down|reduce\s+(?:its\s+)?(?:rank|score))\b'
)
_RECALL_RE = re.compile(
    r'\bremind\s+me\b|\bdid\s+we\s+ever\s+(?:decide|settle|choose|pick)\b|\bwhat\s+did\s+we\s+decide\b'
)
_LOAD_CONTEXT_PHRASE_RE = re.compile(
    r'\bbring\s+me\s+up\s+to\s+speed\b|\bwhere\s+were\s+we\b|\bcatch\s+me\s+up\b|\bload\s+.*\bcontext\b'
)
# Summary extraction: an intro phrase ("jot down what we did: ...") or a past-tense work verb.
_SUMMARY_INTRO_RE = re.compile(
    r'(?:jot\s+down|log|record|write\s+down|note)\s+(?:what\s+we\s+)?(?:accomplished|did|completed|finished)?\s*[:;]?\s*'
    r'(?:we\s+)?(.{10,120})',
    re.I
)
_SUMMARY_VERB_RE = re.compile(
    r'(?:we\s+)?((?:rewrote|fixed|refactored|built|deployed|updated|added|removed)\s+.{10,120})',
    re.I
)
# Export target path. The whitespace before the path applies to every verb form,
# so "save to /tmp/x" matches as well as "export to /tmp/x".
_EXPORT_PATH_RE = re.compile(
    r'(?:save\s+to|(?:output|export)\s+(?:to|dir(?:ectory)?))\s+["\']?(/[\w/.-]+|~/[\w/.-]+|\.\/[\w/.-]+)',
    re.I
)
_EXPORT_FORMAT_RE = re.compile(r'\b(json|jsonl|markdown|csv|yaml)\b(?:\s+format)?\b')
_LEDGER_PHRASE_RE = re.compile(
    r'\bjot\s+down\b|\bwrite\s+(?:it\s+)?down\b|\bwhat\s+we\s+accomplished\b|\bmake\s+sure\s+it.{0,10}written\b|\brecord\s+(?:this|what)\b'
)
_MILESTONE_RE = re.compile(
    r'\b(?:successfully|milestone|achievement|deployed|shipped|launched|fixed\s+the)\b'
)


def _normalize_ledger_args(prompt, tool_args, summary_patterns):
    """Return ledger args with 'content' renamed to 'summary' and, when the
    summary is still missing, filled from the first matching pattern."""
    if 'content' in tool_args and 'summary' not in tool_args:
        tool_args = dict(tool_args)
        tool_args['summary'] = tool_args.pop('content')
    if 'summary' not in tool_args:
        for pattern in summary_patterns:
            found = pattern.search(prompt)
            if found:
                tool_args = dict(tool_args)
                # Drop punctuation left over from the sentence boundary
                # ("...accomplished? We rewrote" captures "? We rewrote").
                tool_args['summary'] = found.group(1).strip().lstrip('?!:;,. ').rstrip('.')
                break
    return tool_args


def _vote_args(tool_args):
    """Build the {'id': ...} argument for an upvote/downvote remap."""
    return {"id": tool_args.get("id") or tool_args.get("knowledge_id") or tool_args.get("entry_id")}


def validate_tool_call(prompt, tool_name, tool_args):
    """Layer 3: reject obvious false positive tool calls and remap semantic neighbors.

    Args:
        prompt: The user prompt (without the system/chat template).
        tool_name: Tool name parsed from the model output, "NO_TOOL", or "ERROR".
        tool_args: Parsed arguments; anything that is not a dict is treated as {}.

    Returns:
        (tool_name, tool_args) — possibly changed if rejected or remapped.
        "NO_TOOL" and "ERROR" results are returned untouched, so a failed
        request is never rewritten into an abstention.
    """
    if tool_name in ("NO_TOOL", "ERROR"):
        return tool_name, tool_args
    if not isinstance(tool_args, dict):
        tool_args = {}

    prompt_lower = prompt.lower()

    # --- Group B remaps (before false-positive rejection) ---

    # "reconnect/patch up/dangling links" → backfill_links
    if tool_name in ('session_synthesize_edges', 'session_reconnect'):
        if _BACKFILL_HINT_RE.search(prompt_lower):
            return 'session_backfill_links', tool_args

    # "verify/check that session links are consistent" → synthesize_edges
    # Covers both health_check and backfill_links false routes
    if tool_name in ('session_health_check', 'session_backfill_links'):
        if _VERIFY_CONSISTENT_RE.search(prompt_lower):
            return 'session_synthesize_edges', tool_args

    # "wipe/clear old entries from knowledge base" → knowledge_forget (not compact_ledger)
    if tool_name == 'session_compact_ledger':
        if re.search(r'\bknowledge\b', prompt_lower) and re.search(r'\b(?:wipe|clear|delete|remove|entries)\b', prompt_lower):
            return 'knowledge_forget', tool_args

    if tool_name == 'session_forget_memory':
        # Knowledge entries/items/records/base → knowledge_forget (not session_forget_memory)
        if re.search(r'\bknowledge\s+(?:entr(?:y|ies)|items?|records?|base)\b', prompt_lower):
            return 'knowledge_forget', tool_args
        # "delete/wipe entries from [project]" without a specific memory ID → knowledge_forget
        if re.search(r'\b(?:entries|records|logs?)\b', prompt_lower) and re.search(r'\bproject\b', prompt_lower):
            if not _MEMORY_ID_RE.search(prompt):
                project_m = _FORGET_PROJECT_RE.search(prompt_lower)
                # Omit the key when no project name was found, so a missing
                # value is not counted as a supplied parameter.
                return 'knowledge_forget', ({'project': project_m.group(1)} if project_m else {})

    # "where were we / bring me up to speed" → session_load_context (not session_search_memory)
    if tool_name == 'session_search_memory':
        if _RESUME_RE.search(prompt_lower):
            project_m = _LOAD_PROJECT_RE.search(prompt_lower)
            return 'session_load_context', ({'project': project_m.group(1)} if project_m else {})

    # knowledge_forget / knowledge_set_retention → upvote/downvote protection
    if tool_name in ('knowledge_forget', 'knowledge_set_retention'):
        if _UPVOTE_RE.search(prompt_lower):
            return 'knowledge_upvote', _vote_args(tool_args)
        if _DOWNVOTE_RE.search(prompt_lower):
            return 'knowledge_downvote', _vote_args(tool_args)

    # "remind me / did we ever decide" → session_search_memory (not load_context)
    # Exclude "bring me up to speed / where were we" which is a load_context pattern
    if tool_name == 'session_load_context':
        if _RECALL_RE.search(prompt_lower) and not _LOAD_CONTEXT_PHRASE_RE.search(prompt_lower):
            return 'session_search_memory', {"query": prompt[:120]}

    # Normalize param aliases (model uses alternate field names) and fill a
    # missing summary from explicit completed-work wording in the prompt.
    if tool_name == 'session_save_ledger':
        tool_args = _normalize_ledger_args(prompt, tool_args, (_SUMMARY_INTRO_RE, _SUMMARY_VERB_RE))

    # session_export_memory: extract output_path from path patterns, format from keywords
    if tool_name == 'session_export_memory':
        if not tool_args.get('output_path'):
            path_m = _EXPORT_PATH_RE.search(prompt)
            if path_m:
                tool_args = dict(tool_args)
                # The path character class includes '.', so a sentence-final
                # period gets captured; strip it.
                tool_args['output_path'] = path_m.group(1).rstrip('.')
        if not tool_args.get('format'):
            fmt_m = _EXPORT_FORMAT_RE.search(prompt_lower)
            if fmt_m:
                tool_args = dict(tool_args)
                tool_args['format'] = fmt_m.group(1)

    # "jot down / write down / make sure it's written down" → session_save_ledger (not save_experience)
    if tool_name == 'session_save_experience':
        if _LEDGER_PHRASE_RE.search(prompt_lower) and not _MILESTONE_RE.search(prompt_lower):
            # Same alias normalization as the save_ledger block above, but the
            # summary is only filled from the past-tense work-verb pattern.
            return 'session_save_ledger', _normalize_ledger_args(prompt, tool_args, (_SUMMARY_VERB_RE,))

    # --- False-positive rejection (CS patterns) ---
    is_general = any(re.search(p, prompt_lower) for p in GENERAL_PROGRAMMING_PATTERNS)
    if not is_general:
        return tool_name, tool_args

    # Prism-specific wording overrides the rejection.
    has_prism_intent = any(re.search(p, prompt_lower) for p in PRISM_INTENT_PATTERNS)
    if has_prism_intent:
        return tool_name, tool_args

    return "NO_TOOL", {}
498
+
499
+
500
+
501
def evaluate_result(expected_tool, required_params, got_tool, got_args):
    """
    SWE-bench scoring:
    - strict_pass: correct tool + all required params (or correct abstention)
    - partial_pass: correct tool + one or more required params missing
      (including the case where none of them is present)
    - wrong_tool: different tool called
    - false_positive: tool called when none should be
    - false_negative: no tool called when one should be

    session_search_memory and knowledge_search are accepted interchangeably.

    Args:
        expected_tool: Expected tool name, or "NO_TOOL".
        required_params: Argument names that must be present for a strict pass.
        got_tool: Tool name produced by the model, or "NO_TOOL".
        got_args: Arguments produced by the model. Anything that is not a dict
            is treated as having no arguments.

    Returns:
        One of the verdict strings listed above.
    """
    if expected_tool == "NO_TOOL":
        return "strict_pass" if got_tool == "NO_TOOL" else "false_positive"

    if got_tool == "NO_TOOL":
        return "false_negative"

    if got_tool != expected_tool:
        # Special case: accept session_search_memory OR knowledge_search for search queries
        search_tools = ("session_search_memory", "knowledge_search")
        if not (expected_tool in search_tools and got_tool in search_tools):
            return "wrong_tool"

    # Check required params
    if not required_params:
        return "strict_pass"

    # Guard against None / string / list arguments: `in` on a string would do
    # substring matching and on None would raise TypeError.
    args = got_args if isinstance(got_args, dict) else {}
    if all(param in args for param in required_params):
        return "strict_pass"
    return "partial_pass"  # Got the tool right but missing params
536
+
537
+
538
+ def main(shuffle=False, no_validate_layer3=False):
539
+ print("=" * 70)
540
+ print("SWE-BENCH INSPIRED BLIND EVALUATION — prism-coder:7b")
541
+ print("=" * 70)
542
+ print(f"Model: {MODEL}")
543
+ print(f"Tests: {len(BLIND_TESTS)} (all novel, never in training data)")
544
+ print(f"Order: {'RANDOMIZED' if shuffle else 'sequential'}")
545
+ print(f"Categories: natural_phrasing, adversarial_trap, disambiguation, edge_case, multi_intent")
546
+ print()
547
+
548
+ # Build indexed test list and optionally shuffle
549
+ indexed_tests = list(enumerate(BLIND_TESTS))
550
+ if shuffle:
551
+ random.shuffle(indexed_tests)
552
+
553
+ results = [None] * len(BLIND_TESTS) # store by original index
554
+ category_stats = {}
555
+
556
+ # Use training-compatible system prompt (matches v43 <tool_call> no-pipe format)
557
+ _sys_prompt = (
558
+ "You are Synalux, a memory-augmented coding and clinical reasoning assistant. "
559
+ "You have access to Prism Memory tools (session_save_ledger, session_load_context, "
560
+ "session_search_memory, session_save_handoff, session_forget_memory, session_health_check, "
561
+ "session_compact_ledger, session_export_memory, session_task_route, session_save_experience, "
562
+ "session_synthesize_edges, session_backfill_links, knowledge_search, knowledge_forget, "
563
+ "knowledge_upvote, knowledge_downvote, knowledge_set_retention) and 13 multimodal tool "
564
+ "modules (image_gen, office, web_scraper, browser, tts, ocr, git, terminal, deps_scanner, "
565
+ "hipaa, data_graph, templates, pdf_parser). "
566
+ "Think step-by-step before answering. When the user references past work, prior decisions, "
567
+ "or stored context, use the appropriate Prism Memory tool. "
568
+ "Format tool calls inside <tool_call>...</tool_call> JSON blocks with fields 'name' and 'arguments'. "
569
+ "If no tool is needed, answer directly in plain text. "
570
+ "ABSTAIN for general programming questions, CS concepts, greetings, and capability questions."
571
+ )
572
+
573
+ for display_i, (orig_idx, (prompt, expected, required_params, category)) in enumerate(indexed_tests, 1):
574
+ full_prompt = f"<|im_start|>system\n{_sys_prompt}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
575
+ raw, got_tool, got_args, latency = call_ollama(full_prompt)
576
+ # Layer 3: reject false positive tool calls on general programming prompts
577
+ # Disabled during training benchmarks so RFT/DPO sees true model failures.
578
+ if not no_validate_layer3:
579
+ got_tool, got_args = validate_tool_call(prompt, got_tool, got_args)
580
+ verdict = evaluate_result(expected, required_params, got_tool, got_args)
581
+
582
+ is_pass = verdict in ("strict_pass", "partial_pass")
583
+ icon = "✅" if verdict == "strict_pass" else ("⚠️" if verdict == "partial_pass" else "❌")
584
+
585
+ # Truncate prompt for display
586
+ short_prompt = prompt[:55]
587
+ tag = f"#{orig_idx+1}"
588
+ print(f" [{display_i:2d}/{len(BLIND_TESTS)}] {icon} {tag:4s}| expect={expected:28s} got={got_tool:28s} | {latency:5.1f}s | {short_prompt}")
589
+ if verdict not in ("strict_pass",):
590
+ if verdict == "partial_pass":
591
+ missing = [p for p in required_params if p not in got_args]
592
+ print(f" ↳ missing params: {missing}")
593
+ elif verdict == "false_positive":
594
+ print(f" ↳ FALSE POSITIVE: called {got_tool} when no tool expected")
595
+ elif verdict == "false_negative":
596
+ print(f" ↳ FALSE NEGATIVE: no tool called when {expected} expected")
597
+ elif verdict == "wrong_tool":
598
+ print(f" ↳ WRONG TOOL: expected {expected}, got {got_tool}")
599
+
600
+ results[orig_idx] = {
601
+ "id": orig_idx + 1,
602
+ "prompt": prompt,
603
+ "expected": expected,
604
+ "got": got_tool,
605
+ "got_args": got_args,
606
+ "verdict": verdict,
607
+ "latency": latency,
608
+ "category": category
609
+ }
610
+
611
+ # Category tracking
612
+ if category not in category_stats:
613
+ category_stats[category] = {"total": 0, "strict": 0, "partial": 0, "fail": 0}
614
+ category_stats[category]["total"] += 1
615
+ if verdict == "strict_pass":
616
+ category_stats[category]["strict"] += 1
617
+ elif verdict == "partial_pass":
618
+ category_stats[category]["partial"] += 1
619
+ else:
620
+ category_stats[category]["fail"] += 1
621
+
622
+ # Summary
623
+ strict = sum(1 for r in results if r["verdict"] == "strict_pass")
624
+ partial = sum(1 for r in results if r["verdict"] == "partial_pass")
625
+ fails = sum(1 for r in results if r["verdict"] not in ("strict_pass", "partial_pass"))
626
+ total = len(results)
627
+
628
+ tool_tests = [r for r in results if r["expected"] != "NO_TOOL"]
629
+ no_tool_tests = [r for r in results if r["expected"] == "NO_TOOL"]
630
+
631
+ tool_strict = sum(1 for r in tool_tests if r["verdict"] == "strict_pass")
632
+ tool_partial = sum(1 for r in tool_tests if r["verdict"] == "partial_pass")
633
+ no_tool_pass = sum(1 for r in no_tool_tests if r["verdict"] == "strict_pass")
634
+
635
+ avg_latency = sum(r["latency"] for r in results) / total
636
+
637
+ print()
638
+ print("=" * 70)
639
+ print("SWE-BENCH RESULTS (Blind Evaluation)")
640
+ print("=" * 70)
641
+ print(f" Strict Pass: {strict}/{total} = {strict/total*100:.0f}%")
642
+ print(f" Partial Pass: {partial}/{total} = {partial/total*100:.0f}%")
643
+ print(f" Total Pass: {strict+partial}/{total} = {(strict+partial)/total*100:.0f}%")
644
+ print(f" Fail: {fails}/{total} = {fails/total*100:.0f}%")
645
+ print(f" ---")
646
+ print(f" Tool Strict: {tool_strict}/{len(tool_tests)} = {tool_strict/len(tool_tests)*100:.0f}%")
647
+ print(f" Tool Partial: {tool_partial}/{len(tool_tests)} = {tool_partial/len(tool_tests)*100:.0f}%")
648
+ print(f" Abstention: {no_tool_pass}/{len(no_tool_tests)} = {no_tool_pass/len(no_tool_tests)*100:.0f}%")
649
+ print(f" Avg latency: {avg_latency:.1f}s")
650
+ print()
651
+ print(" Category Breakdown:")
652
+ for cat, stats in sorted(category_stats.items()):
653
+ pct = (stats["strict"] + stats["partial"]) / stats["total"] * 100
654
+ print(f" {cat:20s}: {stats['strict']}/{stats['total']} strict, {stats['partial']} partial, {stats['fail']} fail ({pct:.0f}%)")
655
+ print("=" * 70)
656
+
657
+ # Save report
658
+ report = {
659
+ "model": MODEL,
660
+ "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
661
+ "total_tests": total,
662
+ "strict_pass": strict,
663
+ "partial_pass": partial,
664
+ "fails": fails,
665
+ "strict_rate": strict / total,
666
+ "total_pass_rate": (strict + partial) / total,
667
+ "tool_strict_rate": tool_strict / len(tool_tests),
668
+ "abstention_rate": no_tool_pass / len(no_tool_tests),
669
+ "avg_latency": avg_latency,
670
+ "category_stats": category_stats,
671
+ "results": results
672
+ }
673
+
674
+ os.makedirs("results", exist_ok=True)
675
+ with open("results/swe_bench_report.json", "w") as f:
676
+ json.dump(report, f, indent=2, default=str)
677
+ print(f"\nReport saved: results/swe_bench_report.json")
678
+
679
+ return strict, total, results
680
+
681
+ import os
682
+ import argparse
683
+
684
if __name__ == "__main__":
    # Command-line entry point.
    #
    # With --runs 1 (the default) a single evaluation pass is executed.
    # With --runs N (N > 1) the evaluation is repeated N times, followed by
    # an aggregate score summary and a per-test consistency report.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default=None, help="Ollama model tag to evaluate (overrides MODEL constant)")
    parser.add_argument("--runs", type=int, default=1, help="Number of eval runs for statistical validation")
    parser.add_argument("--shuffle", action="store_true", help="Randomize test order each run")
    parser.add_argument("--no-validate-layer3", action="store_true",
                        help="Disable Layer 3 false-positive rejection (use during training benchmarks "
                             "so RFT/DPO sees true model failures, not heuristic-corrected results)")
    args = parser.parse_args()

    # Reject zero/negative run counts up front: with no runs the summary
    # below would call statistics.median() on an empty list and read an
    # unbound `total`.
    if args.runs < 1:
        parser.error("--runs must be a positive integer")

    # Module-scope assignment: rebinds the global MODEL that main() reads.
    if args.model:
        MODEL = args.model

    if args.runs == 1:
        main(shuffle=args.shuffle, no_validate_layer3=args.no_validate_layer3)
    else:
        all_scores = []
        # Both lists are indexed by original test position; main() stores
        # each result at results[orig_idx], so enumerate(results) lines up
        # with BLIND_TESTS even when the execution order is shuffled.
        per_test_pass = [0] * len(BLIND_TESTS)
        per_test_fail_tools = [[] for _ in BLIND_TESTS]

        for run_idx in range(args.runs):
            # Draw an explicit seed per run so the value printed in the run
            # header is the one the global `random` generator is seeded with.
            seed = random.randint(0, 9999) if args.shuffle else None
            # `is not None` (not truthiness) so that a drawn seed of 0 is
            # still reported.
            seed_note = f" (seed={seed})" if seed is not None else ""
            print(f"\n{'#'*70}")
            print(f" RUN {run_idx+1}/{args.runs}" + seed_note)
            print(f"{'#'*70}")
            if seed is not None:
                random.seed(seed)
            strict, total, results = main(shuffle=args.shuffle, no_validate_layer3=args.no_validate_layer3)
            all_scores.append(strict)
            for i, r in enumerate(results):
                if r["verdict"] == "strict_pass":
                    per_test_pass[i] += 1
                else:
                    # Remember what the model produced instead, for the
                    # consistency report.
                    per_test_fail_tools[i].append(r.get("got", "???"))

        # Multi-run summary (`total` comes from the last run).
        med = statistics.median(all_scores)
        avg = sum(all_scores) / len(all_scores)
        print(f"\n{'='*70}")
        print(f" MULTI-RUN SUMMARY ({args.runs} runs × {total} tests" + (" — RANDOMIZED ORDER" if args.shuffle else "") + ")")
        print(f"{'='*70}")
        print(f" Scores: {' | '.join(f'{s}/{total}' for s in all_scores)}")
        print(f" Median: {med}/{total} = {med/total*100:.1f}%")
        print(f" Average: {avg:.1f}/{total} = {avg/total*100:.1f}%")
        print(f" Min: {min(all_scores)}/{total} = {min(all_scores)/total*100:.0f}%")
        print(f" Max: {max(all_scores)}/{total} = {max(all_scores)/total*100:.0f}%")

        # Per-test consistency: list every test that was not a strict pass
        # in all runs, together with the distinct outputs seen on failure.
        print(f"\n Per-Test Consistency (N={args.runs} runs):")
        flaky_count = 0
        for i, (prompt, expected, _, _) in enumerate(BLIND_TESTS):
            rate = per_test_pass[i] / args.runs
            if rate < 1.0:
                flaky_count += 1
                # Sorted so the report is stable between invocations; str()
                # guards the join against a non-string "got" value.
                fail_tools = ','.join(sorted({str(t) for t in per_test_fail_tools[i]}))
                print(f" ⚠️ [{i+1:2d}] {rate*100:3.0f}% pass | expect={expected:25s} | fails→{fail_tools:20s} | {prompt[:55]}")

        if not flaky_count:
            print(" ✅ All tests passed consistently across all runs!")
        else:
            print(f"\n Flaky tests: {flaky_count}/{total}")
        print(f"{'='*70}")
747
+