sakthivarshans committed on
Commit
5a37ff6
·
1 Parent(s): f72012b

Initial BEACON environment

Browse files
Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+ WORKDIR /app
3
+ COPY requirements.txt .
4
+ RUN pip install --no-cache-dir -r requirements.txt
5
+ COPY . .
6
+ EXPOSE 7860
7
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/environment.cpython-311.pyc ADDED
Binary file (14.8 kB). View file
 
__pycache__/graders.cpython-311.pyc ADDED
Binary file (9.93 kB). View file
 
__pycache__/models.cpython-311.pyc ADDED
Binary file (2.97 kB). View file
 
baseline.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ baseline.py — Groq LLM baseline agent for the BEACON RL environment.
3
+
4
+ Runs a Llama 3 model (via Groq) as a zero-shot budget-allocation agent
5
+ against all three BEACON tasks and prints reproducible episode scores.
6
+
7
+ Usage:
8
+ export GROQ_API_KEY="your-key-here"
9
+ python baseline.py
10
+
11
+ Requirements:
12
+ pip install openai
13
+ """
14
+
15
+ import json
16
+ import os
17
+
18
+ from openai import OpenAI
19
+
20
+ from environment import BEACONEnvironment
21
+ from models import Action
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Groq client — OpenAI-compatible endpoint
26
+ # ---------------------------------------------------------------------------
27
+
28
+ client = OpenAI(
29
+ api_key=os.environ.get("GROQ_API_KEY"),
30
+ base_url="https://api.groq.com/openai/v1",
31
+ )
32
+
33
+ MODEL = "llama3-8b-8192"
34
+ TEMPERATURE = 0 # deterministic completions for reproducibility
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Prompt builder
39
+ # ---------------------------------------------------------------------------
40
+
41
+ def _build_prompt(obs, step_num: int) -> str:
42
+ """
43
+ Construct a structured natural-language prompt from the current Observation.
44
+
45
+ The prompt instructs the LLM to return ONLY a valid JSON object with
46
+ `allocations` and `savings_contribution` fields. No prose, no markdown.
47
+
48
+ Args:
49
+ obs: The current Observation from the environment.
50
+ step_num: 1-indexed step number within the episode (for context).
51
+
52
+ Returns:
53
+ A formatted prompt string.
54
+ """
55
+ # Format category budgets and spent as a readable table
56
+ budget_lines = "\n".join(
57
+ f" {cat}: allocated={obs.category_budgets[cat]:.2f}, "
58
+ f"spent={obs.category_spent[cat]:.2f}"
59
+ for cat in obs.category_budgets
60
+ )
61
+
62
+ shocks_text = (
63
+ ", ".join(obs.active_shocks) if obs.active_shocks else "none"
64
+ )
65
+
66
+ prompt = f"""You are a financial planning agent managing a {obs.mode} budget.
67
+
68
+ Current state (Period {obs.period} of {obs.period + obs.periods_remaining - 1}):
69
+ - Periods remaining (including this one): {obs.periods_remaining}
70
+ - Total income available this period: {obs.total_income:.2f}
71
+ - Savings balance: {obs.savings_balance:.2f}
72
+ - Savings goal: {obs.savings_goal:.2f}
73
+ - Active financial shocks: {shocks_text}
74
+
75
+ Category budgets and spending so far:
76
+ {budget_lines}
77
+
78
+ Your task:
79
+ Allocate this period's income across all categories and decide how much to save.
80
+ The total of all allocations + savings_contribution must NOT exceed {obs.total_income:.2f}.
81
+ Prioritise essential categories first (avoid allocating 0 to any necessary category).
82
+ Try to make progress toward the savings goal each period.
83
+
84
+ Respond with ONLY a valid JSON object — no explanation, no markdown, no extra text:
85
+ {{
86
+ "allocations": {{
87
+ {", ".join(f'"{cat}": <float>' for cat in obs.category_budgets)}
88
+ }},
89
+ "savings_contribution": <float>
90
+ }}"""
91
+
92
+ return prompt
93
+
94
+
95
+ # ---------------------------------------------------------------------------
96
+ # Fallback action
97
+ # ---------------------------------------------------------------------------
98
+
99
+ def _fallback_action(obs) -> Action:
100
+ """
101
+ Build a safe fallback Action using exact minimum required allocations.
102
+
103
+ Used when the LLM response cannot be parsed as valid JSON. Allocates
104
+ exactly the minimum fraction of income to each category and puts any
105
+ remaining income into savings.
106
+
107
+ Args:
108
+ obs: The current Observation (provides income and mode context).
109
+
110
+ Returns:
111
+ A valid Action that satisfies all essential category minimums.
112
+ """
113
+ minimums = BEACONEnvironment.MIN_REQUIREMENTS[obs.mode]
114
+ income = obs.total_income
115
+
116
+ allocations = {cat: frac * income for cat, frac in minimums.items()}
117
+ total_bills = sum(allocations.values())
118
+
119
+ # Sweep remaining income into savings after covering bills
120
+ savings_contribution = max(0.0, income - total_bills)
121
+
122
+ return Action(
123
+ allocations=allocations,
124
+ savings_contribution=savings_contribution,
125
+ )
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # LLM action parser
130
+ # ---------------------------------------------------------------------------
131
+
132
+ def _parse_action(response_text: str, obs) -> Action:
133
+ """
134
+ Parse the LLM's JSON response into a valid Action.
135
+
136
+ Applies two safety guards after parsing:
137
+ 1. Clamps all allocation values to non-negative floats.
138
+ 2. Scales the entire action down proportionally if total spend would
139
+ exceed total_income, ensuring the agent never overspends.
140
+
141
+ Falls back to minimum allocations if the response is not valid JSON.
142
+
143
+ Args:
144
+ response_text: Raw text returned by the LLM.
145
+ obs: Current Observation (used for income and fallback).
146
+
147
+ Returns:
148
+ A valid Action ready to pass to env.step().
149
+ """
150
+ try:
151
+ # Strip surrounding whitespace/newlines before parsing
152
+ data = json.loads(response_text.strip())
153
+
154
+ allocations = {
155
+ cat: max(0.0, float(v))
156
+ for cat, v in data["allocations"].items()
157
+ }
158
+ savings_contribution = max(0.0, float(data["savings_contribution"]))
159
+
160
+ # Safety clamp: scale down if total spend exceeds income
161
+ total_requested = sum(allocations.values()) + savings_contribution
162
+ if total_requested > obs.total_income and total_requested > 0:
163
+ scale = obs.total_income / total_requested
164
+ allocations = {cat: amt * scale for cat, amt in allocations.items()}
165
+ savings_contribution *= scale
166
+
167
+ return Action(
168
+ allocations=allocations,
169
+ savings_contribution=savings_contribution,
170
+ )
171
+
172
+ except (json.JSONDecodeError, KeyError, TypeError, ValueError) as exc:
173
+ print(f" [WARN] Could not parse LLM response ({type(exc).__name__}: {exc}). "
174
+ f"Using fallback minimum allocations.")
175
+ return _fallback_action(obs)
176
+
177
+
178
+ # ---------------------------------------------------------------------------
179
+ # Core episode runner
180
+ # ---------------------------------------------------------------------------
181
+
182
+ def run_agent_episode(mode: str, total_periods: int, seed: int) -> float:
183
+ """
184
+ Run a full BEACON episode with the Groq LLM agent and return the
185
+ average reward across all periods.
186
+
187
+ At each step the agent receives a natural-language prompt describing
188
+ the current budget state, responds with a JSON allocation plan, and
189
+ the environment returns a structured Reward. If the LLM produces
190
+ unparseable output, a safe minimum-allocation fallback is used.
191
+
192
+ Args:
193
+ mode: BEACON mode — "household" or "corporate".
194
+ total_periods: Number of budget periods in the episode.
195
+ seed: Random seed for environment reproducibility.
196
+
197
+ Returns:
198
+ Mean reward.total across all completed periods (float in [-1.0, 1.0]).
199
+ """
200
+ # --- Initialise environment ----------------------------------------------
201
+ env = BEACONEnvironment(mode=mode, total_periods=total_periods, seed=seed)
202
+ obs = env.reset()
203
+
204
+ period_rewards: list[float] = []
205
+
206
+ system_prompt = (
207
+ "You are a precise financial planning agent. "
208
+ "You always respond with ONLY valid JSON — no prose, no markdown fences, "
209
+ "no explanation. Every numeric value must be a plain float."
210
+ )
211
+
212
+ # --- Episode loop --------------------------------------------------------
213
+ for step_num in range(1, total_periods + 1):
214
+ user_prompt = _build_prompt(obs, step_num)
215
+
216
+ # --- Query the LLM ---------------------------------------------------
217
+ try:
218
+ response = client.chat.completions.create(
219
+ model=MODEL,
220
+ temperature=TEMPERATURE,
221
+ messages=[
222
+ {"role": "system", "content": system_prompt},
223
+ {"role": "user", "content": user_prompt},
224
+ ],
225
+ )
226
+ raw_text = response.choices[0].message.content or ""
227
+ except Exception as exc:
228
+ print(f" [WARN] LLM API call failed (step {step_num}): {exc}. "
229
+ f"Using fallback action.")
230
+ raw_text = "" # triggers fallback in _parse_action
231
+
232
+ # --- Parse response into an Action -----------------------------------
233
+ action = _parse_action(raw_text, obs)
234
+
235
+ # --- Step the environment --------------------------------------------
236
+ obs, reward, done, _info = env.step(action)
237
+ period_rewards.append(reward.total)
238
+
239
+ if done:
240
+ break
241
+
242
+ # --- Average reward across all periods -----------------------------------
243
+ avg_reward = sum(period_rewards) / len(period_rewards) if period_rewards else 0.0
244
+ return avg_reward
245
+
246
+
247
+ # ---------------------------------------------------------------------------
248
+ # Top-level baseline runner
249
+ # ---------------------------------------------------------------------------
250
+
251
+ def run_baseline() -> dict[str, float]:
252
+ """
253
+ Run all three BEACON tasks with the Groq LLM agent and report scores.
254
+
255
+ Tasks:
256
+ Task 1 — Easy: household mode, 1 period, seed=42
257
+ Task 2 — Medium: household mode, 3 periods, seed=99
258
+ Task 3 — Hard: corporate mode, 6 periods, seed=7
259
+
260
+ Each task returns the mean reward across all periods, printed to 2
261
+ decimal places.
262
+
263
+ Returns:
264
+ dict with keys "task1", "task2", "task3" mapping to float scores.
265
+ """
266
+ print("Running BEACON baseline...")
267
+ print(f" Model : {MODEL}")
268
+ print(f" Temp : {TEMPERATURE}")
269
+ print()
270
+
271
+ # --- Task 1: Easy — Bill Coverage (1 period, household) ------------------
272
+ print("Task 1 (Easy — Bill Coverage)...")
273
+ score1 = run_agent_episode(mode="household", total_periods=1, seed=42)
274
+ print(f"Task 1: {score1:.2f}")
275
+ print()
276
+
277
+ # --- Task 2: Medium — Shock Absorption (3 periods, household) ------------
278
+ print("Task 2 (Medium — Shock Absorption)...")
279
+ score2 = run_agent_episode(mode="household", total_periods=3, seed=99)
280
+ print(f"Task 2: {score2:.2f}")
281
+ print()
282
+
283
+ # --- Task 3: Hard — 6-Month Goal Planning (6 periods, corporate) ---------
284
+ print("Task 3 (Hard — 6-Month Goal Planning)...")
285
+ score3 = run_agent_episode(mode="corporate", total_periods=6, seed=7)
286
+ print(f"Task 3: {score3:.2f}")
287
+ print()
288
+
289
+ return {
290
+ "task1": score1,
291
+ "task2": score2,
292
+ "task3": score3,
293
+ }
294
+
295
+
296
+ # ---------------------------------------------------------------------------
297
+ # Entry point
298
+ # ---------------------------------------------------------------------------
299
+
300
+ if __name__ == "__main__":
301
+ run_baseline()
environment.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ environment.py — BEACON reinforcement learning environment.
3
+
4
+ BEACON (Budget Environment for Agent Control and Optimization of Needs) is a
5
+ dual-scale budget management environment with two operating modes:
6
+ - "household": personal finance simulation (income in Indian Rupees)
7
+ - "corporate": organisational finance simulation
8
+ """
9
+
10
+ import random
11
+ from models import Observation, Action, Reward
12
+
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Module-level configuration constants
16
+ # ---------------------------------------------------------------------------
17
+
18
+ MODES = ("household", "corporate")
19
+
20
+ # Spending categories available in each mode
21
+ CATEGORIES: dict[str, list[str]] = {
22
+ "household": [
23
+ "rent", "food", "utilities", "transport",
24
+ "education", "medical", "discretionary",
25
+ ],
26
+ "corporate": [
27
+ "payroll", "operations", "marketing", "logistics",
28
+ "capex", "reserves", "miscellaneous",
29
+ ],
30
+ }
31
+
32
+ # Income sampling range (inclusive) per mode — household values in Indian Rupees
33
+ INCOME_RANGE: dict[str, tuple[float, float]] = {
34
+ "household": (30_000.0, 100_000.0),
35
+ "corporate": (1_000_000.0, 50_000_000.0),
36
+ }
37
+
38
+ # Unexpected financial events that can hit the agent mid-episode
39
+ SHOCKS: dict[str, list[str]] = {
40
+ "household": [
41
+ "medical_emergency",
42
+ "appliance_repair",
43
+ "school_fee_spike",
44
+ "utility_surge",
45
+ ],
46
+ "corporate": [
47
+ "vendor_default",
48
+ "regulatory_fine",
49
+ "equipment_failure",
50
+ "key_employee_exit",
51
+ ],
52
+ }
53
+
54
+ # Each shock costs between 10% and 25% of total_income (sampled uniformly)
55
+ SHOCK_COST_RANGE: tuple[float, float] = (0.10, 0.25)
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Environment class
60
+ # ---------------------------------------------------------------------------
61
+
62
+ class BEACONEnvironment:
63
+ """
64
+ BEACON: Budget Environment for Agent Control and Optimization of Needs.
65
+
66
+ An OpenEnv-compatible, dual-scale budget management RL environment.
67
+ The agent manages a budget over `total_periods` steps, allocating funds
68
+ across spending categories, growing savings, and weathering random
69
+ financial shocks.
70
+
71
+ Episode flow:
72
+ obs = env.reset()
73
+ while True:
74
+ action = agent.act(obs)
75
+ obs, reward, done, info = env.step(action)
76
+ if done:
77
+ break
78
+ """
79
+
80
+ # ------------------------------------------------------------------
81
+ # Minimum category allocations as a fraction of total_income.
82
+ # Categories with 0.0 are non-essential (no penalty for zero spend).
83
+ # ------------------------------------------------------------------
84
+ MIN_REQUIREMENTS: dict[str, dict[str, float]] = {
85
+ "household": {
86
+ "rent": 0.25,
87
+ "food": 0.20,
88
+ "utilities": 0.08,
89
+ "transport": 0.05,
90
+ "education": 0.10,
91
+ "medical": 0.05,
92
+ "discretionary": 0.00, # non-essential
93
+ },
94
+ "corporate": {
95
+ "payroll": 0.35,
96
+ "operations": 0.20,
97
+ "marketing": 0.05,
98
+ "logistics": 0.08,
99
+ "capex": 0.05,
100
+ "reserves": 0.10,
101
+ "miscellaneous": 0.00, # non-essential
102
+ },
103
+ }
104
+
105
+ def __init__(
106
+ self,
107
+ mode: str = "household",
108
+ total_periods: int = 6,
109
+ seed: int = 42,
110
+ ) -> None:
111
+ """
112
+ Initialise the BEACON environment.
113
+
114
+ Args:
115
+ mode: Simulation mode — "household" or "corporate".
116
+ total_periods: Number of budget periods in one episode.
117
+ seed: Random seed for full reproducibility.
118
+
119
+ Raises:
120
+ ValueError: If an unrecognised mode is supplied.
121
+ """
122
+ if mode not in MODES:
123
+ raise ValueError(
124
+ f"Invalid mode '{mode}'. Choose one of {MODES}."
125
+ )
126
+
127
+ self.mode = mode
128
+ self.total_periods = total_periods
129
+ self.seed = seed
130
+
131
+ # Isolated RNG — does not pollute global random state
132
+ self._rng = random.Random(seed)
133
+
134
+ # Internal state fields — initialised properly inside reset()
135
+ self._period: int = 1
136
+ self._total_income: float = 0.0
137
+ self._savings_balance: float = 0.0
138
+ self._savings_goal: float = 0.0
139
+ self._category_budgets: dict[str, float] = {}
140
+ self._category_spent: dict[str, float] = {}
141
+ self._active_shocks: list[str] = []
142
+ self._shock_costs: dict[str, float] = {} # shock → cost amount
143
+
144
+ # Start the first episode immediately
145
+ self.reset()
146
+
147
+ # ------------------------------------------------------------------
148
+ # Core API
149
+ # ------------------------------------------------------------------
150
+
151
+ def reset(self) -> Observation:
152
+ """
153
+ Reset the environment and begin a new episode.
154
+
155
+ Re-seeds the internal RNG so that consecutive reset() calls always
156
+ produce the same starting state (deterministic reproducibility).
157
+ Randomly activates zero or one shock at episode start.
158
+
159
+ Returns:
160
+ The initial Observation for the new episode.
161
+ """
162
+ # Fresh RNG from the same seed → identical episode starts every call
163
+ self._rng = random.Random(self.seed)
164
+
165
+ # --- Sample income -----------------------------------------------
166
+ lo, hi = INCOME_RANGE[self.mode]
167
+ self._total_income = self._rng.uniform(lo, hi)
168
+
169
+ # --- Savings goal = 20% of projected total income ----------------
170
+ self._savings_goal = 0.20 * self._total_income * self.total_periods
171
+
172
+ # --- Zero-initialise all category tracking -----------------------
173
+ categories = CATEGORIES[self.mode]
174
+ self._category_budgets = {cat: 0.0 for cat in categories}
175
+ self._category_spent = {cat: 0.0 for cat in categories}
176
+
177
+ # --- Reset savings and time counters -----------------------------
178
+ self._savings_balance = 0.0
179
+ self._period = 1
180
+
181
+ # --- Clear shock state, then optionally seed one starting shock --
182
+ self._active_shocks = []
183
+ self._shock_costs = {}
184
+ if self._rng.random() < 0.50: # 50% chance of a starting shock
185
+ self._activate_random_shock()
186
+
187
+ return self._make_observation()
188
+
189
+ def step(self, action: Action) -> tuple[Observation, Reward, bool, dict]:
190
+ """
191
+ Execute one budget period using the agent's action.
192
+
193
+ Steps performed:
194
+ 1. Apply category allocations → update budgets and spent amounts.
195
+ 2. Add savings contribution → update savings balance.
196
+ 3. Calculate the multi-component reward signal.
197
+ 4. Advance the period counter.
198
+ 5. Randomly activate a new shock (30% probability).
199
+ 6. Determine episode termination.
200
+
201
+ Args:
202
+ action: The Action submitted by the agent for this period.
203
+
204
+ Returns:
205
+ observation: New environment state after the step.
206
+ reward: Structured Reward for this period.
207
+ done: True when the episode has ended.
208
+ info: Auxiliary diagnostic data (plain dict).
209
+ """
210
+ # ---- 1. Apply category allocations ------------------------------
211
+ for cat, amount in action.allocations.items():
212
+ if cat in self._category_budgets:
213
+ # Treat the allocation as the amount budgeted and spent
214
+ self._category_budgets[cat] = amount
215
+ self._category_spent[cat] = amount
216
+
217
+ # ---- 2. Update savings balance ----------------------------------
218
+ self._savings_balance += action.savings_contribution
219
+
220
+ # ---- 3. Total spending = all allocations + savings this period --
221
+ total_spent = sum(action.allocations.values()) + action.savings_contribution
222
+
223
+ # ---- 4. Compute reward ------------------------------------------
224
+ reward = self._calculate_reward(action, total_spent)
225
+
226
+ # ---- 5. Advance time period -------------------------------------
227
+ self._period += 1
228
+
229
+ # ---- 6. Randomly activate a new shock (30% probability) ---------
230
+ if self._rng.random() < 0.30:
231
+ self._activate_random_shock()
232
+
233
+ # ---- 7. Episode is done when no periods remain ------------------
234
+ done = self.periods_remaining == 0
235
+
236
+ # ---- 8. Diagnostic info dict ------------------------------------
237
+ info: dict = {
238
+ "period_completed": self._period - 1,
239
+ "total_spent": total_spent,
240
+ "total_income": self._total_income,
241
+ "overspent": total_spent > self._total_income,
242
+ "active_shocks": list(self._active_shocks),
243
+ "shock_costs": dict(self._shock_costs),
244
+ "savings_balance": self._savings_balance,
245
+ "savings_goal": self._savings_goal,
246
+ "periods_remaining": self.periods_remaining,
247
+ }
248
+
249
+ return self._make_observation(), reward, done, info
250
+
251
+ def state(self) -> dict:
252
+ """
253
+ Return the complete current environment state as a plain dictionary.
254
+
255
+ Useful for logging, checkpointing, or external serialisation without
256
+ constructing Pydantic models.
257
+
258
+ Returns:
259
+ A flat dict containing all internal state fields.
260
+ """
261
+ return {
262
+ "mode": self.mode,
263
+ "period": self._period,
264
+ "total_periods": self.total_periods,
265
+ "periods_remaining": self.periods_remaining,
266
+ "total_income": self._total_income,
267
+ "savings_balance": self._savings_balance,
268
+ "savings_goal": self._savings_goal,
269
+ "category_budgets": dict(self._category_budgets),
270
+ "category_spent": dict(self._category_spent),
271
+ "active_shocks": list(self._active_shocks),
272
+ "shock_costs": dict(self._shock_costs),
273
+ "seed": self.seed,
274
+ }
275
+
276
+ # ------------------------------------------------------------------
277
+ # Properties
278
+ # ------------------------------------------------------------------
279
+
280
+ @property
281
+ def periods_remaining(self) -> int:
282
+ """Number of budget periods still remaining in the current episode."""
283
+ return max(0, self.total_periods - self._period + 1)
284
+
285
+ # ------------------------------------------------------------------
286
+ # Private helpers
287
+ # ------------------------------------------------------------------
288
+
289
+ def _make_observation(self) -> Observation:
290
+ """Build and return an Observation from the current internal state."""
291
+ return Observation(
292
+ mode=self.mode,
293
+ period=self._period,
294
+ total_income=self._total_income,
295
+ category_budgets=dict(self._category_budgets),
296
+ category_spent=dict(self._category_spent),
297
+ savings_balance=self._savings_balance,
298
+ savings_goal=self._savings_goal,
299
+ active_shocks=list(self._active_shocks),
300
+ periods_remaining=self.periods_remaining,
301
+ )
302
+
303
+ def _activate_random_shock(self) -> None:
304
+ """
305
+ Select and activate one random shock from the mode's shock pool.
306
+
307
+ Prefers shocks not currently active. If all shocks are already active,
308
+ one is reselected and its cost is refreshed.
309
+
310
+ Cost is sampled uniformly in [10%, 25%] of total_income.
311
+ """
312
+ available = SHOCKS[self.mode]
313
+
314
+ # Prefer shocks not yet active to diversify events
315
+ inactive = [s for s in available if s not in self._active_shocks]
316
+ shock = self._rng.choice(inactive if inactive else available)
317
+
318
+ # Sample a cost fraction and convert to absolute amount
319
+ cost_fraction = self._rng.uniform(*SHOCK_COST_RANGE)
320
+ shock_cost = cost_fraction * self._total_income
321
+
322
+ # Add to active list only if not already present
323
+ if shock not in self._active_shocks:
324
+ self._active_shocks.append(shock)
325
+
326
+ # Always update/refresh the cost (covers re-roll of existing shocks)
327
+ self._shock_costs[shock] = shock_cost
328
+
329
+ def _calculate_reward(self, action: Action, total_spent: float) -> Reward:
330
+ """
331
+ Compute the structured Reward for the current period.
332
+
333
+ Component breakdown:
334
+ bills_paid_score ∈ [0.0, 0.4]
335
+ Fraction of essential categories that received ≥ 80% of
336
+ their minimum requirement, scaled by 0.4.
337
+
338
+ savings_progress_score ∈ [0.0, 0.3]
339
+ (savings_balance / savings_goal) × 0.3, capped at 0.3.
340
+
341
+ efficiency_score ∈ {0.0, 0.2}
342
+ 0.2 if total_spent ≤ total_income, else 0.0.
343
+
344
+ shock_resilience_bonus ∈ {0.0, 0.1}
345
+ 0.1 if shocks are active AND total_spent covers all shock
346
+ costs, else 0.0.
347
+
348
+ penalties ∈ (-∞, 0.0]
349
+ −0.3 per essential category with zero allocation.
350
+ −0.1 if total_spent > total_income.
351
+
352
+ total = sum of all components, clipped to [−1.0, 1.0].
353
+
354
+ Args:
355
+ action: Agent's action for this period.
356
+ total_spent: Total funds deployed (allocations + savings).
357
+
358
+ Returns:
359
+ A fully populated Reward model.
360
+ """
361
+ minimums = self.MIN_REQUIREMENTS[self.mode]
362
+
363
+ # Essential categories are those with a non-zero minimum requirement
364
+ essential_cats = {
365
+ cat: frac
366
+ for cat, frac in minimums.items()
367
+ if frac > 0.0
368
+ }
369
+ total_essential = len(essential_cats)
370
+
371
+ # --- bills_paid_score --- (max 0.4) --------------------------------
372
+ categories_covered = 0
373
+ zero_alloc_essentials = 0 # count for penalty calculation
374
+
375
+ for cat, min_fraction in essential_cats.items():
376
+ min_required = min_fraction * self._total_income
377
+ allocated = action.allocations.get(cat, 0.0)
378
+
379
+ if allocated == 0.0:
380
+ # Completely skipped an essential category → penalty later
381
+ zero_alloc_essentials += 1
382
+ elif allocated >= 0.80 * min_required:
383
+ # Covered at least 80% of the minimum → category is satisfied
384
+ categories_covered += 1
385
+
386
+ bills_paid_score = (
387
+ (categories_covered / total_essential) * 0.4
388
+ if total_essential > 0
389
+ else 0.4
390
+ )
391
+
392
+ # --- savings_progress_score --- (max 0.3) --------------------------
393
+ if self._savings_goal > 0:
394
+ raw_savings_score = (self._savings_balance / self._savings_goal) * 0.3
395
+ savings_progress_score = min(raw_savings_score, 0.3)
396
+ else:
397
+ savings_progress_score = 0.0
398
+
399
+ # --- efficiency_score --- (0.2 if within budget, else 0.0) ---------
400
+ efficiency_score = 0.2 if total_spent <= self._total_income else 0.0
401
+
402
+ # --- shock_resilience_bonus --- (0.1 or 0.0) ----------------------
403
+ # Awarded when active shocks exist AND the agent's spending covers
404
+ # the combined shock cost (demonstrating financial resilience)
405
+ shock_resilience_bonus = 0.0
406
+ if self._active_shocks:
407
+ total_shock_cost = sum(self._shock_costs.values())
408
+ if total_spent >= total_shock_cost:
409
+ shock_resilience_bonus = 0.1
410
+
411
+ # --- penalties --- (negative values) ------------------------------
412
+ penalties = 0.0
413
+
414
+ # Hard penalty for each essential category left completely unfunded
415
+ penalties -= 0.3 * zero_alloc_essentials
416
+
417
+ # Penalty for exceeding total available income
418
+ if total_spent > self._total_income:
419
+ penalties -= 0.1
420
+
421
+ # --- total reward --- clipped to [-1.0, 1.0] ----------------------
422
+ total = (
423
+ bills_paid_score
424
+ + savings_progress_score
425
+ + efficiency_score
426
+ + shock_resilience_bonus
427
+ + penalties
428
+ )
429
+ total = max(-1.0, min(1.0, total))
430
+
431
+ return Reward(
432
+ total=total,
433
+ bills_paid_score=bills_paid_score,
434
+ savings_progress_score=savings_progress_score,
435
+ efficiency_score=efficiency_score,
436
+ shock_resilience_bonus=shock_resilience_bonus,
437
+ penalties=penalties,
438
+ )
graders.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ graders.py — Evaluation graders for the BEACON reinforcement learning environment.
3
+
4
+ Each grader runs one complete, fully deterministic episode and returns a
5
+ normalised float score in [0.0, 1.0].
6
+
7
+ Graders:
8
+ grade_task1() — Easy: Bill Coverage (household, 1 period)
9
+ grade_task2() — Medium: Shock Absorption (household, 3 periods)
10
+ grade_task3() — Hard: 6-Month Goal Planning (corporate, 6 periods)
11
+
12
+ run_all_graders() runs all three, prints results, and returns a summary dict.
13
+ """
14
+
15
+ from environment import BEACONEnvironment
16
+ from models import Action
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # GRADER 1 — Easy: Bill Coverage
21
+ # ---------------------------------------------------------------------------
22
+
23
def grade_task1() -> float:
    """
    Easy grader: Bill Coverage.

    Plays one household period (seed 42) with a policy that funds every
    category at exactly its minimum required fraction of income and routes
    whatever is left over into savings, then reports how well the essential
    bills were covered.

    Returns:
        reward.bills_paid_score rescaled from [0.0, 0.4] to [0.0, 1.0],
        rounded to 4 decimal places.
    """
    env = BEACONEnvironment(mode="household", total_periods=1, seed=42)
    first_obs = env.reset()

    income = first_obs.total_income
    min_fractions = BEACONEnvironment.MIN_REQUIREMENTS["household"]

    # Fund each category at exactly its minimum fraction of income.
    # Discretionary categories carry a zero fraction and thus get nothing.
    spend_plan = {name: share * income for name, share in min_fractions.items()}

    # Whatever is not needed for bills becomes the savings contribution.
    leftover = income - sum(spend_plan.values())
    plan = Action(
        allocations=spend_plan,
        savings_contribution=max(0.0, leftover),
    )

    # Single step — the episode is only one period long.
    _, reward, _, _ = env.step(plan)

    # bills_paid_score maxes out at 0.4 — normalise onto [0.0, 1.0].
    return round(reward.bills_paid_score / 0.4, 4)
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # GRADER 2 — Medium: Shock Absorption
73
+ # ---------------------------------------------------------------------------
74
+
75
def grade_task2() -> float:
    """
    Medium grader: Shock Absorption.

    Plays a 3-period household episode (seed 99). The policy always funds
    every category at its minimum fraction; in the first two periods the
    discretionary category soaks up the cost of any active shocks, and in
    the final period all remaining headroom is channelled into savings.

    Returns:
        The mean per-step reward.total mapped from [-1.0, 1.0] onto
        [0.0, 1.0], rounded to 4 decimal places.
    """
    env = BEACONEnvironment(mode="household", total_periods=3, seed=99)
    env.reset()

    # Guarantee at least one shock is active so resilience is exercised.
    if not env._active_shocks:
        env._active_shocks = ["medical_emergency"]
        env._shock_costs = {"medical_emergency": 0.15 * env._total_income}

    min_fractions = BEACONEnvironment.MIN_REQUIREMENTS["household"]
    step_totals: list[float] = []

    for period in (1, 2, 3):
        income = env._total_income
        active_cost = sum(env._shock_costs.values()) if env._active_shocks else 0.0

        # Headroom = income left after every essential minimum is funded.
        essentials = sum(
            share * income for share in min_fractions.values() if share > 0.0
        )
        headroom = income - essentials

        # Baseline: minimum allocation for every category.
        allocations = {name: share * income for name, share in min_fractions.items()}

        if period < 3:
            # Periods 1-2: discretionary spending absorbs the shock cost,
            # capped by the available headroom; the rest goes to savings.
            absorbed = min(active_cost, max(0.0, headroom))
            allocations["discretionary"] = absorbed
            to_savings = max(0.0, headroom - absorbed)
        else:
            # Period 3 (recovery): zero discretionary, maximise savings to
            # push savings_progress_score up.
            allocations["discretionary"] = 0.0
            to_savings = max(0.0, headroom)

        _, reward, _, _ = env.step(
            Action(allocations=allocations, savings_contribution=to_savings)
        )
        step_totals.append(reward.total)

    # Map the mean reward from [-1.0, 1.0] onto [0.0, 1.0].
    mean_total = sum(step_totals) / len(step_totals)
    return round((mean_total + 1.0) / 2.0, 4)
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # GRADER 3 — Hard: 6-Month Goal Planning
169
+ # ---------------------------------------------------------------------------
170
+
171
def grade_task3() -> float:
    """
    Hard grader: 6-Month Goal Planning.

    Plays a 6-period corporate episode (seed 7) with a fixed policy: fund
    every category at its minimum required fraction and contribute 15% of
    income to savings each period, keeping total spend within income.

    Returns:
        0.6 × (fraction of the savings goal reached, capped at 1.0)
        + 0.4 × (1.0 if no essential category was ever left at zero
                 allocation, else 0.5), rounded to 4 decimal places.
    """
    env = BEACONEnvironment(mode="corporate", total_periods=6, seed=7)
    env.reset()

    min_fractions = BEACONEnvironment.MIN_REQUIREMENTS["corporate"]

    # True if any essential category ever receives a zero allocation.
    missed_essential = False

    for _ in range(6):
        income = env._total_income

        # Exactly the minimum amount for every category.
        allocations: dict[str, float] = {
            name: share * income for name, share in min_fractions.items()
        }

        # Record whether any essential category would receive nothing.
        if any(
            share > 0.0 and allocations.get(name, 0.0) == 0.0
            for name, share in min_fractions.items()
        ):
            missed_essential = True

        _, _, finished, _ = env.step(
            Action(allocations=allocations, savings_contribution=0.15 * income)
        )
        if finished:
            break

    # Fraction of the savings goal achieved, capped at 1.0.
    goal = env._savings_goal
    goal_fraction = min(env._savings_balance / goal, 1.0) if goal > 0 else 0.0

    # Full credit only when every essential was funded in every step.
    coverage_factor = 0.5 if missed_essential else 1.0

    return round(goal_fraction * 0.6 + coverage_factor * 0.4, 4)
240
+
241
+
242
+ # ---------------------------------------------------------------------------
243
+ # Aggregate runner
244
+ # ---------------------------------------------------------------------------
245
+
246
def run_all_graders() -> dict[str, float]:
    """
    Execute every BEACON grader once, print the scores, and return them.

    All three graders use fixed seeds, so repeated calls produce identical
    results.

    Returns:
        Mapping of "task1", "task2", "task3" to their float scores.
    """
    t1, t2, t3 = grade_task1(), grade_task2(), grade_task3()

    print(f"Task 1: {t1:.2f}")
    print(f"Task 2: {t2:.2f}")
    print(f"Task 3: {t3:.2f}")

    return {"task1": t1, "task2": t2, "task3": t3}
269
+
270
+
271
+ # ---------------------------------------------------------------------------
272
+ # Entry point
273
+ # ---------------------------------------------------------------------------
274
+
275
if __name__ == "__main__":
    # Allow `python graders.py` to run the full evaluation suite directly.
    run_all_graders()
models.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models.py — Pydantic v2 data models for the BEACON reinforcement learning environment.
3
+
4
+ BEACON (Budget Environment for Agent Control and Optimization of Needs) is a dual-scale
5
+ budget management environment supporting "household" and "corporate" simulation modes.
6
+ """
7
+
8
+ from pydantic import BaseModel
9
+
10
+
11
class Observation(BaseModel):
    """
    Per-step observation handed to the agent.

    A snapshot of the current budget period: income, per-category budgets
    and spending, savings progress, any active economic shocks, and how
    much of the episode remains.
    """

    # Simulation mode — either 'household' or 'corporate'.
    mode: str
    # Current time period, starting from 1.
    period: int
    # Total income available for the current period.
    total_income: float
    # Category name → amount allocated for that category.
    category_budgets: dict[str, float]
    # Category name → amount already spent this period.
    category_spent: dict[str, float]
    # Current accumulated savings balance.
    savings_balance: float
    # Target savings balance the agent should aim to reach.
    savings_goal: float
    # Names of unexpected financial events currently affecting the environment.
    active_shocks: list[str]
    # Number of time periods left before the episode ends.
    periods_remaining: int
46
+
47
+
48
class Action(BaseModel):
    """
    Action submitted by the agent for a single time period.

    Specifies the per-category spending plan and the amount of income to
    divert into savings.
    """

    # Category name → amount the agent allocates this period.
    allocations: dict[str, float]
    # Amount the agent chooses to add to savings this period.
    savings_contribution: float
61
+
62
+
63
class Reward(BaseModel):
    """
    Reward signal returned to the agent after each step.

    The total is a scalar in [-1.0, 1.0] built from sub-scores for bill
    coverage, savings trajectory, spending efficiency, and shock
    resilience, minus any penalties for constraint violations.
    """

    # Final scalar reward for the step, in the range [-1.0, 1.0].
    total: float
    # Whether all essential bills and obligations were covered.
    bills_paid_score: float
    # Progress toward the savings goal.
    savings_progress_score: float
    # How efficiently income was allocated with minimal waste.
    efficiency_score: float
    # Bonus awarded for successfully absorbing active economic shocks.
    shock_resilience_bonus: float
    # Cumulative penalty subtracted for constraint violations (e.g., overspending).
    penalties: float
openenv.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: BEACON
2
+ version: "1.0.0"
3
+ description: >
4
+ Dual-scale budget management environment where agents
5
+ learn to allocate income across household and corporate
6
+ financial categories under constraints and economic shocks.
7
+ author: your_name
8
+ tags: [finance, budgeting, planning, dual-scale]
9
+ modes: [household, corporate]
10
+ tasks:
11
+ - task1
12
+ - task2
13
+ - task3
14
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pydantic
4
+ openai
5
+ pyyaml
6
+ groq
server/__pycache__/app.cpython-311.pyc ADDED
Binary file (9.57 kB). View file
 
server/app.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — FastAPI server for the BEACON reinforcement learning environment.
3
+
4
+ Exposes the BEACON environment as a REST API so that agents, dashboards,
5
+ and evaluation pipelines can interact with it over HTTP.
6
+
7
+ Endpoints:
8
+ POST /reset — initialise / reset the environment
9
+ POST /step — submit an action and advance one period
10
+ GET /state — inspect the full current environment state
11
+ GET /tasks — list all available evaluation tasks
12
+ POST /grader — run a specific grader and get a score
13
+ GET /baseline — run all graders and return all scores
14
+ GET /health — liveness check
15
+
16
+ Usage:
17
+ python app.py
18
+ # or
19
+ uvicorn server.app:app --reload
20
+ """
21
+
22
+ import os
23
+ import sys
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Ensure parent directory (d:/meta) is on the Python path so that
27
+ # environment.py, models.py, and graders.py can be imported as top-level
28
+ # modules from this subdirectory.
29
+ # ---------------------------------------------------------------------------
30
+ _PARENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
31
+ if _PARENT_DIR not in sys.path:
32
+ sys.path.insert(0, _PARENT_DIR)
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # BEACON imports (resolved via sys.path above)
36
+ # ---------------------------------------------------------------------------
37
+ from environment import BEACONEnvironment # noqa: E402
38
+ from models import Action # noqa: E402
39
+ from graders import ( # noqa: E402
40
+ grade_task1,
41
+ grade_task2,
42
+ grade_task3,
43
+ run_all_graders,
44
+ )
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # FastAPI imports
48
+ # ---------------------------------------------------------------------------
49
+ import uvicorn
50
+ from fastapi import FastAPI, HTTPException
51
+ from fastapi.middleware.cors import CORSMiddleware
52
+ from pydantic import BaseModel, Field
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # App setup
57
+ # ---------------------------------------------------------------------------
58
+
59
+ app = FastAPI(
60
+ title="BEACON Environment API",
61
+ description=(
62
+ "REST API for the BEACON dual-scale budget management "
63
+ "reinforcement learning environment."
64
+ ),
65
+ version="1.0.0",
66
+ )
67
+
68
+ # Allow all origins so browser-based agents and dashboards can connect freely
69
+ app.add_middleware(
70
+ CORSMiddleware,
71
+ allow_origins=["*"],
72
+ allow_credentials=True,
73
+ allow_methods=["*"],
74
+ allow_headers=["*"],
75
+ )
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Global environment instance
80
+ # Starts as None; created / replaced on the first POST /reset call.
81
+ # A default instance is also created at startup so GET endpoints work
82
+ # immediately without requiring a prior reset.
83
+ # ---------------------------------------------------------------------------
84
+
85
+ _env: BEACONEnvironment = BEACONEnvironment(mode="household", seed=42)
86
+
87
+
88
def _require_env() -> BEACONEnvironment:
    """Return the shared environment; raise 503 when it has not been set up."""
    env = _env
    if env is None:
        raise HTTPException(
            status_code=503,
            detail="Environment not initialised. Call POST /reset first.",
        )
    return env
96
+
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # Request / response schemas
100
+ # ---------------------------------------------------------------------------
101
+
102
class ResetRequest(BaseModel):
    """Request body for POST /reset."""
    # NOTE: the Field descriptions below surface in the OpenAPI schema.
    mode: str = Field(default="household", description="'household' or 'corporate'")
    seed: int = Field(default=42, description="Random seed for reproducibility")
    total_periods: int = Field(default=6, description="Number of budget periods per episode")
107
+
108
+
109
class GraderRequest(BaseModel):
    """Request body for POST /grader."""
    # Selects which grader to execute; validated against _GRADER_MAP in /grader.
    task_id: str = Field(description="One of: 'task1', 'task2', 'task3'")
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # Task catalogue (static metadata)
116
+ # ---------------------------------------------------------------------------
117
+
118
+ ACTION_SCHEMA = {
119
+ "allocations": "dict[str, float]",
120
+ "savings_contribution": "float",
121
+ }
122
+
123
+ TASK_CATALOGUE = [
124
+ {
125
+ "task_id": "task1",
126
+ "name": "Bill Coverage",
127
+ "difficulty": "easy",
128
+ "description": "Allocate income to cover all essential bills in a single period.",
129
+ "mode": "household",
130
+ "periods": 1,
131
+ "seed": 42,
132
+ "action_schema": ACTION_SCHEMA,
133
+ },
134
+ {
135
+ "task_id": "task2",
136
+ "name": "Shock Absorption",
137
+ "difficulty": "medium",
138
+ "description": (
139
+ "Maintain essential spending while absorbing unexpected "
140
+ "financial shocks across 3 periods."
141
+ ),
142
+ "mode": "household",
143
+ "periods": 3,
144
+ "seed": 99,
145
+ "action_schema": ACTION_SCHEMA,
146
+ },
147
+ {
148
+ "task_id": "task3",
149
+ "name": "6-Month Goal Planning",
150
+ "difficulty": "hard",
151
+ "description": (
152
+ "Manage a corporate budget over 6 periods, covering all "
153
+ "essential categories while reaching the savings goal."
154
+ ),
155
+ "mode": "corporate",
156
+ "periods": 6,
157
+ "seed": 7,
158
+ "action_schema": ACTION_SCHEMA,
159
+ },
160
+ ]
161
+
162
+ # Map task_id → grader function for quick lookup
163
+ _GRADER_MAP = {
164
+ "task1": grade_task1,
165
+ "task2": grade_task2,
166
+ "task3": grade_task3,
167
+ }
168
+
169
+
170
+ # ---------------------------------------------------------------------------
171
+ # Endpoints
172
+ # ---------------------------------------------------------------------------
173
+
174
@app.get("/health", summary="Liveness check")
def health():
    """
    Returns a simple status object confirming the service is running.
    """
    # Static payload — no environment access, so this never fails.
    payload = {"status": "ok", "environment": "BEACON"}
    return payload
180
+
181
+
182
@app.post("/reset", summary="Initialise or reset the environment")
def reset(body: ResetRequest = ResetRequest()):
    """
    Create a fresh BEACONEnvironment with the given parameters and call
    reset(). Returns the initial Observation as JSON.

    - **mode**: `"household"` or `"corporate"` (default: `"household"`)
    - **seed**: random seed for reproducibility (default: `42`)
    - **total_periods**: episode length (default: `6`)

    Raises a 400 error when the environment rejects the parameters
    (e.g. an unknown mode).
    """
    # NOTE(review): the default `ResetRequest()` instance is created once at
    # import time and shared across requests. Safe here because it is only
    # read, never mutated — but per-request construction would be sturdier.
    global _env
    try:
        _env = BEACONEnvironment(
            mode=body.mode,
            total_periods=body.total_periods,
            seed=body.seed,
        )
        obs = _env.reset()
    except ValueError as exc:
        # Chain the original ValueError so server logs keep the root cause
        # (the bare `raise` here previously discarded the exception chain).
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return obs.model_dump()
204
+
205
+
206
@app.post("/step", summary="Submit an action and advance one period")
def step(action: Action):
    """
    Apply the agent's Action to the current environment and advance by one
    budget period.

    Returns the resulting Observation, Reward, done flag, and info dict.

    - **allocations**: `{category: amount, ...}` — must cover all categories
    - **savings_contribution**: amount added to savings this period
    """
    env = _require_env()
    observation, reward, done, info = env.step(action)

    # Serialise the pydantic models; `done` and `info` pass through as-is.
    response = {
        "observation": observation.model_dump(),
        "reward": reward.model_dump(),
        "done": done,
        "info": info,
    }
    return response
226
+
227
+
228
@app.get("/state", summary="Inspect the current environment state")
def state():
    """
    Return the full internal state of the current environment as a plain
    dictionary. Does not advance the episode.
    """
    # Read-only inspection — delegates straight to the environment.
    return _require_env().state()
236
+
237
+
238
@app.get("/tasks", summary="List all available evaluation tasks")
def tasks():
    """
    Return metadata for all three BEACON evaluation tasks, including their
    difficulty, mode, episode length, and expected action schema.
    """
    # Static module-level catalogue; returned directly (clients treat it
    # as read-only).
    return TASK_CATALOGUE
245
+
246
+
247
@app.post("/grader", summary="Run a specific grader and return its score")
def grader(body: GraderRequest):
    """
    Execute the grader for the requested task and return the normalised
    score in [0.0, 1.0].

    - **task_id**: one of `"task1"`, `"task2"`, `"task3"`
    """
    # Look up the grader callable; unknown ids become a 404.
    selected = _GRADER_MAP.get(body.task_id)
    if selected is None:
        raise HTTPException(
            status_code=404,
            detail=f"Unknown task_id '{body.task_id}'. "
            f"Valid options: {list(_GRADER_MAP.keys())}",
        )

    return {"task_id": body.task_id, "score": selected()}
265
+
266
+
267
@app.get("/baseline", summary="Run all graders and return all scores")
def baseline():
    """
    Execute all three BEACON graders sequentially and return their scores.

    This endpoint is deterministic — scores are identical on every call.
    """
    # run_all_graders() already returns the {"task1": ..., ...} mapping.
    return run_all_graders()
276
+
277
+
278
+ # ---------------------------------------------------------------------------
279
+ # Entry point
280
+ # ---------------------------------------------------------------------------
281
+
282
if __name__ == "__main__":
    # Allow `python server/app.py` to serve directly on the same port the
    # Dockerfile exposes (7860).
    uvicorn.run(app, host="0.0.0.0", port=7860)