narcolepticchicken committed on
Commit
8b7d080
·
verified ·
1 Parent(s): e2518ba

Delete router_models/*, eval/*, aco/benchmarks/*, examples/*

Browse files
aco/benchmarks/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- """Benchmarks module for Agent Cost Optimizer."""
2
- from .benchmark_suite import BenchmarkSuite, BenchmarkResult, BenchmarkConfig
3
-
4
- __all__ = ["BenchmarkSuite", "BenchmarkResult", "BenchmarkConfig"]
 
 
 
 
 
aco/benchmarks/benchmark_suite.py DELETED
@@ -1,506 +0,0 @@
1
- """Benchmark Suite for Agent Cost Optimizer.
2
-
3
- Benchmarks:
4
- A. Coding Agent Tasks
5
- B. Research Agent Tasks
6
- C. Tool-Use Tasks
7
- D. Document / Contract / QA Tasks
8
- E. Long-Horizon Agent Tasks
9
-
10
- Baselines:
11
- A. always frontier model
12
- B. always cheap model
13
- C. static model routing
14
- D. prompt-only router
15
- E. rules-only optimizer
16
- F. learned model router
17
- G. learned router + context budgeter
18
- H. learned router + context + verifier budgeter
19
- I. full Agent Cost Optimizer
20
-
21
- Metrics:
22
- - task success
23
- - cost per successful task
24
- - cost reduction at iso-quality
25
- - latency
26
- - token usage
27
- - model calls
28
- - tool calls
29
- - verifier calls
30
- - retries
31
- - cache hit rate
32
- - context tokens
33
- - false-DONE rate
34
- - unsafe cheap-model miss rate
35
- - missed escalation rate
36
- - user correction rate
37
- - regression rate
38
- - quality/cost frontier
39
- """
40
-
41
- import json
42
- import time
43
- from typing import Dict, List, Any, Optional
44
- from dataclasses import dataclass, field
45
- from collections import defaultdict
46
-
47
- from aco.optimizer import AgentCostOptimizer, OptimizationResult
48
- from aco.config import ACOConfig, ModelConfig, ToolConfig, VerifierConfig, RoutingPolicy
49
- from aco.trace_schema import AgentTrace, TraceStep, ModelCall, ToolCall, VerifierCall, TaskType, Outcome, FailureTag
50
- from aco.datasets.synthetic_traces import SyntheticTraceGenerator
51
-
52
-
53
- @dataclass
54
- class BenchmarkConfig:
55
- name: str
56
- task_types: List[TaskType]
57
- num_tasks: int
58
- routing_mode: str = "cascade"
59
- enable_modules: Dict[str, bool] = field(default_factory=dict)
60
- baseline_name: str = ""
61
-
62
-
63
- @dataclass
64
- class BenchmarkResult:
65
- benchmark_name: str
66
- baseline_name: str
67
- num_tasks: int
68
- num_success: int
69
- num_partial: int
70
- num_failure: int
71
- num_false_done: int
72
- num_blocked: int
73
- total_cost: float
74
- avg_cost_success: float
75
- avg_latency_ms: float
76
- total_tool_calls: int
77
- total_verifier_calls: int
78
- total_retries: int
79
- avg_cache_hit_rate: float
80
- total_context_tokens: int
81
- cost_reduction_vs_frontier: float
82
- false_done_rate: float
83
- unsafe_cheap_miss_rate: float
84
- missed_escalation_rate: float
85
- regression_rate: float
86
- quality_cost_frontier: List[Dict[str, float]] = field(default_factory=list)
87
- per_task_results: List[Dict[str, Any]] = field(default_factory=list)
88
-
89
-
90
- class BenchmarkSuite:
91
- """Runs ACO benchmarks across tasks and baselines."""
92
-
93
- def __init__(self, config: Optional[ACOConfig] = None):
94
- self.config = config or self._default_config()
95
-
96
- def _default_config(self) -> ACOConfig:
97
- models = {
98
- "tiny_local": ModelConfig("tiny_local", "local", 0.0001, 0.0002, latency_ms_estimate=200, strength_tier=1),
99
- "cheap_cloud": ModelConfig("cheap_cloud", "cloud", 0.0005, 0.001, latency_ms_estimate=500, strength_tier=2),
100
- "medium": ModelConfig("medium", "cloud", 0.003, 0.006, latency_ms_estimate=800, strength_tier=3),
101
- "frontier": ModelConfig("frontier", "cloud", 0.01, 0.03, latency_ms_estimate=1500, strength_tier=4),
102
- "specialist": ModelConfig("specialist", "cloud", 0.015, 0.045, latency_ms_estimate=2000, strength_tier=5),
103
- }
104
- tools = {
105
- "search": ToolConfig("search", 0.002, 500),
106
- "retrieve": ToolConfig("retrieve", 0.001, 300),
107
- "code_execution": ToolConfig("code_execution", 0.005, 1000),
108
- "linter": ToolConfig("linter", 0.001, 200),
109
- "file_read": ToolConfig("file_read", 0.0005, 100),
110
- "compliance_check": ToolConfig("compliance_check", 0.01, 1500),
111
- "summarize": ToolConfig("summarize", 0.002, 400),
112
- }
113
- verifiers = {
114
- "verifier_medium": VerifierConfig("verifier_medium", 0.005, 800, 0.8),
115
- }
116
- return ACOConfig(
117
- project_name="aco-benchmark",
118
- models=models,
119
- tools=tools,
120
- verifiers=verifiers,
121
- routing_policy=RoutingPolicy("benchmark"),
122
- )
123
-
124
- def generate_benchmark_data(self, n: int = 1000, seed: int = 42) -> List[AgentTrace]:
125
- """Generate synthetic traces for benchmarking."""
126
- gen = SyntheticTraceGenerator(seed=seed)
127
- return gen.generate(n)
128
-
129
- def run_baseline(
130
- self,
131
- traces: List[AgentTrace],
132
- baseline_name: str,
133
- ) -> BenchmarkResult:
134
- """Run a single baseline over the benchmark traces."""
135
-
136
- # Configure optimizer for baseline
137
- mode_map = {
138
- "always_frontier": "always_frontier",
139
- "always_cheap": "always_frontier", # overridden below
140
- "static": "static",
141
- "prompt_only": "prompt_only",
142
- "learned": "learned",
143
- "learned_verifier": "learned_verifier",
144
- "cascade": "cascade",
145
- "rules_only": "cascade", # uses cascade routing with rules-based modules
146
- "full": "cascade",
147
- }
148
-
149
- # Adjust config based on baseline
150
- config = self._default_config()
151
-
152
- if baseline_name == "always_frontier":
153
- config.enable_router = False
154
- elif baseline_name == "always_cheap":
155
- config.enable_router = False
156
- # Override all models to cheap tier in simulation by using special handling
157
- elif baseline_name == "static":
158
- pass # default static routing
159
- elif baseline_name == "prompt_only":
160
- pass # prompt heuristic routing
161
- elif baseline_name == "rules_only":
162
- config.enable_classifier = True
163
- config.enable_router = True
164
- config.enable_context_budgeter = True
165
- config.enable_cache_layout = True
166
- config.enable_tool_gate = True
167
- config.enable_verifier_budgeter = True
168
- config.enable_retry_optimizer = True
169
- config.enable_meta_tool_miner = False
170
- config.enable_early_termination = True
171
- elif baseline_name == "full":
172
- pass # all enabled
173
-
174
- # For ablations, disable specific modules
175
- if baseline_name.startswith("no_"):
176
- module_name = baseline_name.replace("no_", "")
177
- if hasattr(config, f"enable_{module_name}"):
178
- setattr(config, f"enable_{module_name}", False)
179
-
180
- optimizer = AgentCostOptimizer(config)
181
-
182
- results = []
183
- total_cost = 0.0
184
- total_latency = 0.0
185
- total_tools = 0
186
- total_verifiers = 0
187
- total_retries = 0
188
- total_context = 0
189
- cache_rates = []
190
-
191
- success_count = 0
192
- partial_count = 0
193
- failure_count = 0
194
- false_done_count = 0
195
- blocked_count = 0
196
-
197
- cheap_misses = 0
198
- escalation_misses = 0
199
- regression_count = 0
200
-
201
- frontier_costs = []
202
- actual_costs = []
203
-
204
- for trace in traces:
205
- # Run optimization on this trace's request
206
- run_state = {
207
- "trace_id": trace.trace_id,
208
- "routing_mode": mode_map.get(baseline_name, "cascade"),
209
- "current_cost": 0.0,
210
- "planned_tools": [
211
- (tc.tool_name, tc.tool_input)
212
- for step in trace.steps
213
- for tc in step.tool_calls
214
- ],
215
- "previous_tool_calls": [
216
- tc for step in trace.steps for tc in step.tool_calls
217
- ],
218
- "step_number": len(trace.steps),
219
- "total_steps": len(trace.steps),
220
- "is_irreversible": trace.task_type == TaskType.LEGAL_REGULATED,
221
- }
222
-
223
- result = optimizer.optimize(trace.user_request, run_state)
224
-
225
- # Simulate execution based on optimization decisions
226
- sim_cost, sim_latency, sim_success = self._simulate(trace, result, baseline_name)
227
-
228
- total_cost += sim_cost
229
- total_latency += sim_latency
230
- total_tools += len(result.tool_decisions)
231
- if result.verifier_decision:
232
- total_verifiers += 1
233
- total_retries += sum(1 for d in result.tool_decisions if d.decision.value == "skip")
234
- total_context += sum(s.context_size_tokens for s in trace.steps)
235
-
236
- frontier_cost = sum(
237
- s.model_call.total_cost if s.model_call else 0
238
- for s in trace.steps
239
- ) if trace.metadata.get("scenario") == "frontier_unnecessary" else trace.total_cost * 2
240
- frontier_costs.append(frontier_cost)
241
- actual_costs.append(sim_cost)
242
-
243
- outcome = trace.final_outcome
244
- if sim_success:
245
- if outcome == Outcome.SUCCESS:
246
- success_count += 1
247
- elif outcome == Outcome.PARTIAL_SUCCESS:
248
- partial_count += 1
249
- else:
250
- regression_count += 1
251
- else:
252
- if outcome == Outcome.FALSE_DONE:
253
- false_done_count += 1
254
- elif outcome == Outcome.BLOCKED:
255
- blocked_count += 1
256
- else:
257
- failure_count += 1
258
-
259
- # Check for cheap model misses
260
- if trace.metadata.get("scenario") == "cheap_failure" and result.routing_decision.tier <= 2:
261
- cheap_misses += 1
262
-
263
- # Check for missed escalation
264
- if trace.metadata.get("scenario") in ("cheap_failure", "tool_underuse") and result.routing_decision.tier < 3:
265
- escalation_misses += 1
266
-
267
- cache_rates.append(trace.cache_hit_rate)
268
-
269
- results.append({
270
- "trace_id": trace.trace_id,
271
- "task_type": trace.task_type.value,
272
- "scenario": trace.metadata.get("scenario", "normal"),
273
- "simulated_cost": sim_cost,
274
- "simulated_success": sim_success,
275
- "routing_tier": result.routing_decision.tier,
276
- "model_id": result.routing_decision.model_id,
277
- "tool_count": len(result.tool_decisions),
278
- "verifier_used": result.verifier_decision is not None,
279
- })
280
-
281
- n = len(traces)
282
- avg_cost_success = total_cost / max(success_count + partial_count, 1)
283
-
284
- # Cost reduction vs frontier baseline
285
- cost_reduction = (sum(frontier_costs) - sum(actual_costs)) / max(sum(frontier_costs), 1)
286
-
287
- return BenchmarkResult(
288
- benchmark_name="synthetic_benchmark",
289
- baseline_name=baseline_name,
290
- num_tasks=n,
291
- num_success=success_count,
292
- num_partial=partial_count,
293
- num_failure=failure_count,
294
- num_false_done=false_done_count,
295
- num_blocked=blocked_count,
296
- total_cost=total_cost,
297
- avg_cost_success=avg_cost_success,
298
- avg_latency_ms=total_latency / n,
299
- total_tool_calls=total_tools,
300
- total_verifier_calls=total_verifiers,
301
- total_retries=total_retries,
302
- avg_cache_hit_rate=sum(cache_rates) / n,
303
- total_context_tokens=total_context,
304
- cost_reduction_vs_frontier=cost_reduction,
305
- false_done_rate=false_done_count / n,
306
- unsafe_cheap_miss_rate=cheap_misses / n,
307
- missed_escalation_rate=escalation_misses / n,
308
- regression_rate=regression_count / n,
309
- quality_cost_frontier=[
310
- {"cost": c, "success": 1.0 if s else 0.0}
311
- for c, s in zip(actual_costs, [r["simulated_success"] for r in results])
312
- ],
313
- per_task_results=results,
314
- )
315
-
316
- def _simulate(self, trace: AgentTrace, result: OptimizationResult, baseline: str) -> tuple:
317
- """Simulate execution based on optimizer decisions."""
318
-
319
- # Base cost from the trace
320
- base_cost = trace.total_cost_computed
321
-
322
- # Adjust cost based on routing decision
323
- tier = result.routing_decision.tier
324
- cost_mult = {
325
- 1: 0.05, 2: 0.25, 3: 0.75, 4: 1.0, 5: 1.5,
326
- }.get(tier, 1.0)
327
-
328
- # Override for always_cheap baseline
329
- if baseline == "always_cheap":
330
- cost_mult = 0.25
331
- tier = 2
332
-
333
- # Override for always_frontier baseline
334
- if baseline == "always_frontier":
335
- cost_mult = 1.0
336
- tier = 4
337
-
338
- # Apply tool gate savings
339
- tools_skipped = sum(1 for d in result.tool_decisions if d.decision.value in ("skip", "use_cache"))
340
- tool_savings = tools_skipped * 0.005
341
-
342
- # Apply cache savings
343
- cache_savings = 0.0
344
- if result.prompt_layout:
345
- cache_savings = result.prompt_layout.cache_discount
346
-
347
- sim_cost = base_cost * cost_mult - tool_savings - cache_savings
348
- sim_cost = max(sim_cost, 0.001)
349
-
350
- # Simulate latency
351
- sim_latency = trace.total_latency_ms * cost_mult * 0.8
352
-
353
- # Simulate success probability
354
- scenario = trace.metadata.get("scenario", "normal")
355
-
356
- # Base success rate by tier and scenario
357
- success_prob = 0.95 if tier >= 3 else 0.7
358
- if scenario == "cheap_failure":
359
- success_prob = 0.3 if tier <= 2 else 0.85
360
- elif scenario == "tool_underuse":
361
- success_prob = 0.6 if tools_skipped > 0 else 0.8
362
- elif scenario == "retry_loop":
363
- success_prob = 0.2
364
- elif scenario == "frontier_unnecessary":
365
- success_prob = 0.95
366
- elif scenario == "meta_tool_success":
367
- success_prob = 0.9
368
- elif scenario == "meta_tool_bad":
369
- success_prob = 0.4
370
- elif scenario == "false_done":
371
- success_prob = 0.1
372
- elif scenario == "blocked" or scenario == "stopped_doom":
373
- success_prob = 0.0
374
- elif scenario == "human_escalation":
375
- success_prob = 0.5
376
-
377
- # Verifier improves success for high-risk tasks
378
- if result.verifier_decision and result.verifier_decision.decision.value == "call_verifier":
379
- success_prob += 0.05
380
-
381
- # Meta-tool success bonus
382
- if result.meta_tool_match:
383
- success_prob += 0.03
384
-
385
- sim_success = success_prob > 0.5 # simplified threshold
386
-
387
- return sim_cost, sim_latency, sim_success
388
-
389
- def run_all_baselines(self, traces: List[AgentTrace]) -> Dict[str, BenchmarkResult]:
390
- """Run all baseline configurations."""
391
- baselines = [
392
- "always_frontier",
393
- "always_cheap",
394
- "static",
395
- "prompt_only",
396
- "cascade",
397
- "rules_only",
398
- "full",
399
- ]
400
-
401
- results = {}
402
- for baseline in baselines:
403
- print(f"Running baseline: {baseline}...")
404
- results[baseline] = self.run_baseline(traces, baseline)
405
-
406
- return results
407
-
408
- def run_ablations(self, traces: List[AgentTrace]) -> Dict[str, BenchmarkResult]:
409
- """Run ablation study disabling each module."""
410
- ablations = [
411
- "no_router",
412
- "no_context_budgeter",
413
- "no_cache_layout",
414
- "no_tool_gate",
415
- "no_verifier_budgeter",
416
- "no_retry_optimizer",
417
- "no_meta_tool_miner",
418
- "no_early_termination",
419
- ]
420
-
421
- results = {}
422
- for ablation in ablations:
423
- print(f"Running ablation: {ablation}...")
424
- results[ablation] = self.run_baseline(traces, ablation)
425
-
426
- return results
427
-
428
- def report(self, results: Dict[str, BenchmarkResult]) -> str:
429
- """Generate formatted benchmark report."""
430
- lines = ["=" * 80, "AGENT COST OPTIMIZER BENCHMARK REPORT", "=" * 80, ""]
431
-
432
- headers = ["Baseline", "Success", "Partial", "Fail", "Blocked", "False-DONE",
433
- "Total Cost", "Avg Cost/Succ", "Latency(ms)", "Tools", "Verifiers",
434
- "Retries", "Cache Hit", "Cost Reduction", "Regression"]
435
-
436
- lines.append(" | ".join(headers))
437
- lines.append("-" * 120)
438
-
439
- for name, result in results.items():
440
- row = [
441
- name[:20].ljust(20),
442
- f"{result.num_success / result.num_tasks:.1%}",
443
- f"{result.num_partial / result.num_tasks:.1%}",
444
- f"{result.num_failure / result.num_tasks:.1%}",
445
- f"{result.num_blocked / result.num_tasks:.1%}",
446
- f"{result.false_done_rate:.1%}",
447
- f"${result.total_cost:.2f}",
448
- f"${result.avg_cost_success:.4f}",
449
- f"{result.avg_latency_ms:.0f}",
450
- str(result.total_tool_calls),
451
- str(result.total_verifier_calls),
452
- str(result.total_retries),
453
- f"{result.avg_cache_hit_rate:.1%}",
454
- f"{result.cost_reduction_vs_frontier:.1%}",
455
- f"{result.regression_rate:.1%}",
456
- ]
457
- lines.append(" | ".join(row))
458
-
459
- lines.append("")
460
- lines.append("=" * 80)
461
-
462
- # Find best cost/success tradeoff
463
- best_score = -float("inf")
464
- best_name = ""
465
- for name, result in results.items():
466
- success_rate = (result.num_success + result.num_partial) / result.num_tasks
467
- score = success_rate * 10 - result.avg_cost_success * 100 - result.regression_rate * 50
468
- if score > best_score:
469
- best_score = score
470
- best_name = name
471
-
472
- lines.append(f"BEST OVERALL: {best_name} (score={best_score:.2f})")
473
- lines.append("")
474
-
475
- return "\n".join(lines)
476
-
477
- def export(self, results: Dict[str, BenchmarkResult], path: str) -> None:
478
- """Export results to JSON."""
479
- export_data = {}
480
- for name, result in results.items():
481
- export_data[name] = {
482
- "benchmark_name": result.benchmark_name,
483
- "baseline_name": result.baseline_name,
484
- "num_tasks": result.num_tasks,
485
- "num_success": result.num_success,
486
- "num_partial": result.num_partial,
487
- "num_failure": result.num_failure,
488
- "num_false_done": result.num_false_done,
489
- "num_blocked": result.num_blocked,
490
- "total_cost": result.total_cost,
491
- "avg_cost_success": result.avg_cost_success,
492
- "avg_latency_ms": result.avg_latency_ms,
493
- "total_tool_calls": result.total_tool_calls,
494
- "total_verifier_calls": result.total_verifier_calls,
495
- "total_retries": result.total_retries,
496
- "avg_cache_hit_rate": result.avg_cache_hit_rate,
497
- "total_context_tokens": result.total_context_tokens,
498
- "cost_reduction_vs_frontier": result.cost_reduction_vs_frontier,
499
- "false_done_rate": result.false_done_rate,
500
- "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
501
- "missed_escalation_rate": result.missed_escalation_rate,
502
- "regression_rate": result.regression_rate,
503
- }
504
-
505
- with open(path, "w") as f:
506
- json.dump(export_data, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval/bert_vs_xgboost_results.json DELETED
File without changes
examples/integration_example.py DELETED
@@ -1,118 +0,0 @@
1
- """Example integration of Agent Cost Optimizer with a hypothetical agent harness."""
2
-
3
- from aco import AgentCostOptimizer
4
- from aco.config import ACOConfig
5
-
6
-
7
- def example_agent_harness():
8
- """Example of how to bolt ACO onto any agent harness."""
9
-
10
- # Initialize optimizer
11
- config = ACOConfig.from_yaml("config.yaml")
12
- optimizer = AgentCostOptimizer(config)
13
-
14
- # Incoming user request
15
- user_request = "Write a Python script to fetch data from an API and cache it in Redis"
16
-
17
- # Build run state from current agent state
18
- run_state = {
19
- "trace_id": "agent-run-12345",
20
- "current_cost": 0.0,
21
- "planned_tools": [
22
- ("search", {"query": "redis python client"}),
23
- ("fetch", {"url": "https://api.example.com/docs"}),
24
- ("code_execution", {"code": "test script"}),
25
- ],
26
- "previous_tool_calls": [],
27
- "step_number": 1,
28
- "total_steps": 3,
29
- "is_irreversible": False,
30
- "context_pieces": {
31
- "system_rules": "You are a coding assistant.",
32
- "tool_descriptions": "Available tools: search, fetch, code_execution",
33
- "user_preferences": "Prefer Python 3.11+, type hints, async where possible",
34
- "recent_messages": "User: Write a Python script...",
35
- },
36
- "retrieved_docs": [],
37
- "routing_mode": "cascade",
38
- }
39
-
40
- # Call optimizer before executing
41
- decision = optimizer.optimize(user_request, run_state)
42
-
43
- print(f"Trace ID: {decision.trace_id}")
44
- print(f"Selected Model: {decision.routing_decision.model_id} (tier {decision.routing_decision.tier})")
45
- print(f"Estimated Cost: ${decision.estimated_cost:.4f}")
46
- print(f"Estimated Latency: {decision.estimated_latency_ms:.0f}ms")
47
- print(f"Confidence: {decision.confidence:.2f}")
48
- print()
49
-
50
- # Apply tool gate decisions
51
- print("Tool Decisions:")
52
- for td in decision.tool_decisions:
53
- print(f" {td.tool_name}: {td.decision.value} (reason: {td.reasoning})")
54
-
55
- # Apply context budget
56
- if decision.context_budget:
57
- print(f"\nContext Budget: {decision.context_budget.total_budget_tokens} tokens")
58
- print(f" Cache prefix: {decision.context_budget.cache_prefix_tokens} tokens")
59
- print(f" Dynamic suffix: {decision.context_budget.dynamic_suffix_tokens} tokens")
60
- if decision.context_budget.omitted_sources:
61
- print(f" Omitted: {[s.name for s in decision.context_budget.omitted_sources]}")
62
-
63
- # Apply cache layout
64
- if decision.prompt_layout:
65
- print(f"\nCache Layout:")
66
- print(f" Cold cost: ${decision.prompt_layout.estimated_cold_cost:.4f}")
67
- print(f" Warm cost: ${decision.prompt_layout.estimated_warm_cost:.4f}")
68
- print(f" Cache discount: ${decision.prompt_layout.cache_discount:.4f}")
69
-
70
- # Check meta-tool
71
- if decision.meta_tool_match:
72
- print(f"\nMeta-Tool Match: {decision.meta_tool_match['meta_tool_id']}")
73
- print(f" Estimated savings: ${decision.meta_tool_match['estimated_cost_savings']:.4f}")
74
-
75
- # Check doom assessment
76
- if decision.doom_assessment:
77
- print(f"\nDoom Assessment: {decision.doom_assessment.action.value}")
78
- print(f" Confidence: {decision.doom_assessment.confidence:.2f}")
79
- print(f" Signals: {decision.doom_assessment.signals_triggered}")
80
-
81
- # Check verifier
82
- if decision.verifier_decision:
83
- print(f"\nVerifier: {decision.verifier_decision.decision.value}")
84
- print(f" Checks: {decision.verifier_decision.checks}")
85
- print(f" Cost: ${decision.verifier_decision.estimated_verifier_cost:.4f}")
86
-
87
- # After execution, record step and finalize
88
- from aco.trace_schema import ModelCall, Outcome
89
-
90
- model_call = ModelCall(
91
- model_id=decision.routing_decision.model_id,
92
- provider="cloud",
93
- input_tokens=2048,
94
- output_tokens=512,
95
- cost_per_1k_input=0.003,
96
- cost_per_1k_output=0.006,
97
- )
98
-
99
- optimizer.record_step(
100
- trace_id=decision.trace_id,
101
- model_call=model_call,
102
- context_size_tokens=2048,
103
- step_outcome=Outcome.SUCCESS,
104
- )
105
-
106
- # Finalize
107
- trace = optimizer.finalize_trace(
108
- trace_id=decision.trace_id,
109
- outcome=Outcome.SUCCESS,
110
- user_satisfaction=0.95,
111
- )
112
-
113
- print(f"\nTrace finalized. Total cost: ${trace.total_cost_computed:.4f}")
114
- print(f"Cost saved vs frontier: ${trace.total_cost_saved_vs_frontier:.4f}")
115
-
116
-
117
- if __name__ == "__main__":
118
- example_agent_harness()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
router_models/baar_bundle.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d9f1b2ad58746c97634be6e882cef198fc7ead51c62579d604fcc36c4b15e3e
3
- size 5483316
 
 
 
 
router_models/production_bundle.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:30e4e5548187842da8b73a96879a783f5e9b7f7d83d6be38f51be6d222585b64
3
- size 1649115
 
 
 
 
router_models/router_bundle.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b032dfc47b7db2d37ef29c0016d71a6f69212cd127c57c51c76db1b0568be14e
3
- size 1259385