narcolepticchicken committed on
Commit
944b77c
·
verified ·
1 Parent(s): b2c7131

Upload notebook_walkthrough.ipynb

Browse files
Files changed (1) hide show
  1. notebook_walkthrough.ipynb +220 -0
notebook_walkthrough.ipynb ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# OCC Stack Walkthrough\n",
8
+ "\n",
9
+ "This notebook demonstrates the Oracle-Credit-Compute (OCC) stack for agentic compute allocation."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import sys\n",
19
+ "from pathlib import Path\n",
20
+ "sys.path.insert(0, str(Path.cwd()))\n",
21
+ "\n",
22
+ "from oracle.oracle import ImpactOracle\n",
23
+ "from ledger.ledger import CreditLedger\n",
24
+ "from broker.broker import ResourceBroker, Decision\n",
25
+ "from rl.reward import RewardHook, OfflineComparator"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "markdown",
30
+ "metadata": {},
31
+ "source": [
32
+ "## 1. Impact Oracle\n",
33
+ "\n",
34
+ "The oracle scores whether an action produced measurable marginal value."
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "oracle = ImpactOracle(compute_budget=1e5)\n",
44
+ "\n",
45
+ "# Score a code attempt\n",
46
+ "result = oracle.score(\n",
47
+ " mode=\"code\",\n",
48
+ " action={\"tokens_used\": 50},\n",
49
+ " context={\"previous_passed\": False},\n",
50
+ " result={\"passed\": True, \"hidden_passed\": True, \"compute_cost\": 50},\n",
51
+ " agent_id=\"agent_1\"\n",
52
+ ")\n",
53
+ "print(f\"Raw score: {result.raw_score}\")\n",
54
+ "print(f\"Cost-adjusted: {result.cost_adjusted_score}\")\n",
55
+ "print(f\"Reward: {result.reward_value}\")\n",
56
+ "print(f\"Reason: {result.reason}\")"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "markdown",
61
+ "metadata": {},
62
+ "source": [
63
+ "## 2. Credit Ledger\n",
64
+ "\n",
65
+ "Credits are non-transferable, decaying, and capability-scoped."
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "ledger = CreditLedger(decay_lambda=0.05)\n",
75
+ "\n",
76
+ "# Agent earns credits\n",
77
+ "ledger.earn(\n",
78
+ " agent_id=\"agent_1\",\n",
79
+ " task_id=\"task_1\",\n",
80
+ " action_id=\"attempt_1\",\n",
81
+ " amount=10.0,\n",
82
+ " oracle_score=1.0,\n",
83
+ " compute_cost=50.0,\n",
84
+ " reason=\"pass_hidden_test\"\n",
85
+ ")\n",
86
+ "\n",
87
+ "print(f\"Balance: {ledger.balance('agent_1')}\")\n",
88
+ "\n",
89
+ "# Try to transfer (blocked)\n",
90
+ "success = ledger.transfer(\"agent_1\", \"agent_2\", 5.0)\n",
91
+ "print(f\"Transfer succeeded: {success}\")\n",
92
+ "\n",
93
+ "# Spend credits\n",
94
+ "ok, entry = ledger.spend(\"agent_1\", \"task_1\", \"action_2\", 3.0, reason=\"retrieval_call\")\n",
95
+ "print(f\"Spend succeeded: {ok}, remaining: {ledger.balance('agent_1')}\")"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "markdown",
100
+ "metadata": {},
101
+ "source": [
102
+ "## 3. Resource Broker\n",
103
+ "\n",
104
+ "The broker grants capability-based rights based on credit balance and risk."
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "broker = ResourceBroker()\n",
114
+ "\n",
115
+ "# Low credit -> deny\n",
116
+ "dec = broker.request(\"model_call_large\", \"agent_1\", 1.0)\n",
117
+ "print(f\"Low credit: {dec.decision.value} - {dec.reason}\")\n",
118
+ "\n",
119
+ "# High credit -> allow (with approval for high-risk)\n",
120
+ "dec = broker.request(\"model_call_large\", \"agent_1\", 50.0)\n",
121
+ "print(f\"High credit: {dec.decision.value} - {dec.reason}\")\n",
122
+ "\n",
123
+ "# Gaming detected -> escalate\n",
124
+ "dec = broker.request(\"file_write\", \"agent_1\", 100.0, agent_flags={\"gaming_score\": 0.6})\n",
125
+ "print(f\"Gaming: {dec.decision.value} - {dec.reason}\")"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "markdown",
130
+ "metadata": {},
131
+ "source": [
132
+ "## 4. GRPO Reward Hook\n",
133
+ "\n",
134
+ "Connects the oracle to RL reward computation."
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "hook = RewardHook(oracle, ledger, broker, mode=\"code\", agent_id=\"rl_agent\")\n",
144
+ "\n",
145
+ "prompts = [\"def add(a, b):\\n return\"] * 3\n",
146
+ "completions = [\"a + b\", \"a * b\", \"a + b + 0\"]\n",
147
+ "oracle_inputs = [\n",
148
+ " {\"action\": {}, \"context\": {}, \"result\": {\"passed\": True, \"hidden_passed\": True, \"compute_cost\": 5}, \"task_id\": \"t1\", \"action_id\": \"a1\"},\n",
149
+ " {\"action\": {}, \"context\": {}, \"result\": {\"passed\": False, \"hidden_passed\": False, \"compute_cost\": 5}, \"task_id\": \"t1\", \"action_id\": \"a2\"},\n",
150
+ " {\"action\": {}, \"context\": {}, \"result\": {\"passed\": True, \"hidden_passed\": True, \"compute_cost\": 5}, \"task_id\": \"t1\", \"action_id\": \"a3\"},\n",
151
+ "]\n",
152
+ "\n",
153
+ "rewards = hook.compute_rewards(prompts, completions, oracle_inputs)\n",
154
+ "print(\"Rewards:\", rewards)"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "markdown",
159
+ "metadata": {},
160
+ "source": [
161
+ "## 5. Code Benchmark\n",
162
+ "\n",
163
+ "Run the compute allocation benchmark."
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "from benchmarks.benchmark_code import CodeBenchmark\n",
173
+ "\n",
174
+ "bench = CodeBenchmark(max_problems=50, seed=42)\n",
175
+ "bench.load_data()\n",
176
+ "results = bench.run_all()\n",
177
+ "\n",
178
+ "for label, res in results.items():\n",
179
+ " print(f\"{label:20s}: pass@1={res['pass@1']:.3f}, compute/problem={res['compute_per_problem']:.0f}\")"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "markdown",
184
+ "metadata": {},
185
+ "source": [
186
+ "## 6. Ablation Study\n",
187
+ "\n",
188
+ "Compare OCC with ablated configurations."
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "from eval_runner import AblationRunner\n",
198
+ "\n",
199
+ "runner = AblationRunner(seed=42)\n",
200
+ "code_ablations = runner.ablation_code()\n",
201
+ "\n",
202
+ "for k, v in code_ablations.items():  # a missing metric prints as N/A instead of raising on the float format spec\n",
203
+ "    print(f\"{k:20s}: pass@1={format(v['pass@1'], '.3f') if 'pass@1' in v else 'N/A'}, compute={format(v['total_compute'], '.0f') if 'total_compute' in v else 'N/A'}\")"
204
+ ]
205
+ }
206
+ ],
207
+ "metadata": {
208
+ "kernelspec": {
209
+ "display_name": "Python 3",
210
+ "language": "python",
211
+ "name": "python3"
212
+ },
213
+ "language_info": {
214
+ "name": "python",
215
+ "version": "3.10.0"
216
+ }
217
+ },
218
+ "nbformat": 4,
219
+ "nbformat_minor": 4
220
+ }