Spaces:
Sleeping
Sleeping
Commit ·
5a37ff6
1
Parent(s): f72012b
Initial BEACON environment
Browse files- Dockerfile +7 -0
- __pycache__/environment.cpython-311.pyc +0 -0
- __pycache__/graders.cpython-311.pyc +0 -0
- __pycache__/models.cpython-311.pyc +0 -0
- baseline.py +301 -0
- environment.py +438 -0
- graders.py +276 -0
- models.py +89 -0
- openenv.yaml +23 -0
- requirements.txt +6 -0
- server/__pycache__/app.cpython-311.pyc +0 -0
- server/app.py +283 -0
Dockerfile
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
COPY requirements.txt .
|
| 4 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 5 |
+
COPY . .
|
| 6 |
+
EXPOSE 7860
|
| 7 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
__pycache__/environment.cpython-311.pyc
ADDED
|
Binary file (14.8 kB). View file
|
|
|
__pycache__/graders.cpython-311.pyc
ADDED
|
Binary file (9.93 kB). View file
|
|
|
__pycache__/models.cpython-311.pyc
ADDED
|
Binary file (2.97 kB). View file
|
|
|
baseline.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
baseline.py — Groq LLM baseline agent for the BEACON RL environment.
|
| 3 |
+
|
| 4 |
+
Runs a Llama 3 model (via Groq) as a zero-shot budget-allocation agent
|
| 5 |
+
against all three BEACON tasks and prints reproducible episode scores.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
export GROQ_API_KEY="your-key-here"
|
| 9 |
+
python baseline.py
|
| 10 |
+
|
| 11 |
+
Requirements:
|
| 12 |
+
pip install openai
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
|
| 18 |
+
from openai import OpenAI
|
| 19 |
+
|
| 20 |
+
from environment import BEACONEnvironment
|
| 21 |
+
from models import Action
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Groq client — OpenAI-compatible endpoint
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
|
| 28 |
+
client = OpenAI(
|
| 29 |
+
api_key=os.environ.get("GROQ_API_KEY"),
|
| 30 |
+
base_url="https://api.groq.com/openai/v1",
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
MODEL = "llama3-8b-8192"
|
| 34 |
+
TEMPERATURE = 0 # deterministic completions for reproducibility
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
# Prompt builder
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
def _build_prompt(obs, step_num: int) -> str:
|
| 42 |
+
"""
|
| 43 |
+
Construct a structured natural-language prompt from the current Observation.
|
| 44 |
+
|
| 45 |
+
The prompt instructs the LLM to return ONLY a valid JSON object with
|
| 46 |
+
`allocations` and `savings_contribution` fields. No prose, no markdown.
|
| 47 |
+
|
| 48 |
+
Args:
|
| 49 |
+
obs: The current Observation from the environment.
|
| 50 |
+
step_num: 1-indexed step number within the episode (for context).
|
| 51 |
+
|
| 52 |
+
Returns:
|
| 53 |
+
A formatted prompt string.
|
| 54 |
+
"""
|
| 55 |
+
# Format category budgets and spent as a readable table
|
| 56 |
+
budget_lines = "\n".join(
|
| 57 |
+
f" {cat}: allocated={obs.category_budgets[cat]:.2f}, "
|
| 58 |
+
f"spent={obs.category_spent[cat]:.2f}"
|
| 59 |
+
for cat in obs.category_budgets
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
shocks_text = (
|
| 63 |
+
", ".join(obs.active_shocks) if obs.active_shocks else "none"
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
prompt = f"""You are a financial planning agent managing a {obs.mode} budget.
|
| 67 |
+
|
| 68 |
+
Current state (Period {obs.period} of {obs.period + obs.periods_remaining - 1}):
|
| 69 |
+
- Periods remaining (including this one): {obs.periods_remaining}
|
| 70 |
+
- Total income available this period: {obs.total_income:.2f}
|
| 71 |
+
- Savings balance: {obs.savings_balance:.2f}
|
| 72 |
+
- Savings goal: {obs.savings_goal:.2f}
|
| 73 |
+
- Active financial shocks: {shocks_text}
|
| 74 |
+
|
| 75 |
+
Category budgets and spending so far:
|
| 76 |
+
{budget_lines}
|
| 77 |
+
|
| 78 |
+
Your task:
|
| 79 |
+
Allocate this period's income across all categories and decide how much to save.
|
| 80 |
+
The total of all allocations + savings_contribution must NOT exceed {obs.total_income:.2f}.
|
| 81 |
+
Prioritise essential categories first (avoid allocating 0 to any necessary category).
|
| 82 |
+
Try to make progress toward the savings goal each period.
|
| 83 |
+
|
| 84 |
+
Respond with ONLY a valid JSON object — no explanation, no markdown, no extra text:
|
| 85 |
+
{{
|
| 86 |
+
"allocations": {{
|
| 87 |
+
{", ".join(f'"{cat}": <float>' for cat in obs.category_budgets)}
|
| 88 |
+
}},
|
| 89 |
+
"savings_contribution": <float>
|
| 90 |
+
}}"""
|
| 91 |
+
|
| 92 |
+
return prompt
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ---------------------------------------------------------------------------
|
| 96 |
+
# Fallback action
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
|
| 99 |
+
def _fallback_action(obs) -> Action:
    """
    Construct a safe default Action covering exact category minimums.

    Invoked when the LLM reply cannot be parsed as valid JSON. Each
    category receives precisely its minimum required fraction of income;
    whatever income is left over is routed into savings.

    Args:
        obs: The current Observation (supplies income and mode context).

    Returns:
        A valid Action that satisfies all essential category minimums.
    """
    income = obs.total_income
    required = BEACONEnvironment.MIN_REQUIREMENTS[obs.mode]

    # Fund every category at exactly its minimum fraction of income.
    allocations = {}
    for category, fraction in required.items():
        allocations[category] = fraction * income

    # Anything not needed for bills goes straight into savings.
    leftover = income - sum(allocations.values())

    return Action(
        allocations=allocations,
        savings_contribution=max(0.0, leftover),
    )
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# ---------------------------------------------------------------------------
|
| 129 |
+
# LLM action parser
|
| 130 |
+
# ---------------------------------------------------------------------------
|
| 131 |
+
|
| 132 |
+
def _parse_action(response_text: str, obs) -> Action:
    """
    Parse the LLM's JSON response into a valid Action.

    Robustness measures applied:
      1. Strips markdown code fences (``` or ```json) that models often
         emit despite being instructed not to — previously such replies
         always fell back to minimum allocations.
      2. Clamps all allocation values to non-negative floats.
      3. Scales the entire action down proportionally if total spend would
         exceed total_income, ensuring the agent never overspends.

    Falls back to minimum allocations if the response is not valid JSON.

    Args:
        response_text: Raw text returned by the LLM.
        obs: Current Observation (used for income and fallback).

    Returns:
        A valid Action ready to pass to env.step().
    """
    try:
        text = response_text.strip()

        # Tolerate fenced replies: ```json\n{...}\n``` or ```\n{...}\n```
        if text.startswith("```"):
            # Drop the opening fence line (which may carry a language tag).
            text = text.split("\n", 1)[1] if "\n" in text else ""
            stripped = text.rstrip()
            if stripped.endswith("```"):
                text = stripped[:-3]

        data = json.loads(text.strip())

        allocations = {
            cat: max(0.0, float(v))
            for cat, v in data["allocations"].items()
        }
        savings_contribution = max(0.0, float(data["savings_contribution"]))

        # Safety clamp: scale down if total spend exceeds income
        total_requested = sum(allocations.values()) + savings_contribution
        if total_requested > obs.total_income and total_requested > 0:
            scale = obs.total_income / total_requested
            allocations = {cat: amt * scale for cat, amt in allocations.items()}
            savings_contribution *= scale

        return Action(
            allocations=allocations,
            savings_contribution=savings_contribution,
        )

    except (json.JSONDecodeError, KeyError, TypeError, ValueError) as exc:
        print(f" [WARN] Could not parse LLM response ({type(exc).__name__}: {exc}). "
              f"Using fallback minimum allocations.")
        return _fallback_action(obs)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# ---------------------------------------------------------------------------
|
| 179 |
+
# Core episode runner
|
| 180 |
+
# ---------------------------------------------------------------------------
|
| 181 |
+
|
| 182 |
+
def run_agent_episode(mode: str, total_periods: int, seed: int) -> float:
    """
    Play one full BEACON episode with the Groq LLM agent.

    Each period the agent is shown a natural-language summary of the
    budget state, replies with a JSON allocation plan, and the
    environment returns a structured Reward. Unparseable LLM output is
    replaced by a safe minimum-allocation fallback.

    Args:
        mode: BEACON mode — "household" or "corporate".
        total_periods: Number of budget periods in the episode.
        seed: Random seed for environment reproducibility.

    Returns:
        Mean reward.total across all completed periods (float in [-1.0, 1.0]).
    """
    env = BEACONEnvironment(mode=mode, total_periods=total_periods, seed=seed)
    obs = env.reset()

    system_prompt = (
        "You are a precise financial planning agent. "
        "You always respond with ONLY valid JSON — no prose, no markdown fences, "
        "no explanation. Every numeric value must be a plain float."
    )

    rewards: list[float] = []

    for step_num in range(1, total_periods + 1):
        # Ask the model for this period's allocation plan.
        try:
            completion = client.chat.completions.create(
                model=MODEL,
                temperature=TEMPERATURE,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": _build_prompt(obs, step_num)},
                ],
            )
            raw_text = completion.choices[0].message.content or ""
        except Exception as exc:
            print(f" [WARN] LLM API call failed (step {step_num}): {exc}. "
                  f"Using fallback action.")
            raw_text = ""  # triggers fallback in _parse_action

        # Convert the reply into an Action and advance the environment.
        action = _parse_action(raw_text, obs)
        obs, reward, done, _info = env.step(action)
        rewards.append(reward.total)

        if done:
            break

    # Mean reward over completed periods; 0.0 for the degenerate case.
    return sum(rewards) / len(rewards) if rewards else 0.0
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
# ---------------------------------------------------------------------------
|
| 248 |
+
# Top-level baseline runner
|
| 249 |
+
# ---------------------------------------------------------------------------
|
| 250 |
+
|
| 251 |
+
def run_baseline() -> dict[str, float]:
    """
    Run all three BEACON tasks with the Groq LLM agent and report scores.

    Tasks:
        Task 1 — Easy: household mode, 1 period, seed=42
        Task 2 — Medium: household mode, 3 periods, seed=99
        Task 3 — Hard: corporate mode, 6 periods, seed=7

    Each task's mean per-period reward is printed to 2 decimal places.

    Returns:
        dict with keys "task1", "task2", "task3" mapping to float scores.
    """
    print("Running BEACON baseline...")
    print(f" Model : {MODEL}")
    print(f" Temp : {TEMPERATURE}")
    print()

    # (result key, printed label, description, mode, periods, seed)
    task_specs = [
        ("task1", "Task 1", "Easy — Bill Coverage", "household", 1, 42),
        ("task2", "Task 2", "Medium — Shock Absorption", "household", 3, 99),
        ("task3", "Task 3", "Hard — 6-Month Goal Planning", "corporate", 6, 7),
    ]

    results: dict[str, float] = {}
    for key, label, desc, mode, periods, seed in task_specs:
        print(f"{label} ({desc})...")
        score = run_agent_episode(mode=mode, total_periods=periods, seed=seed)
        print(f"{label}: {score:.2f}")
        print()
        results[key] = score

    return results


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    run_baseline()
|
environment.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
environment.py — BEACON reinforcement learning environment.
|
| 3 |
+
|
| 4 |
+
BEACON (Budget Environment for Agent Control and Optimization of Needs) is a
|
| 5 |
+
dual-scale budget management environment with two operating modes:
|
| 6 |
+
- "household": personal finance simulation (income in Indian Rupees)
|
| 7 |
+
- "corporate": organisational finance simulation
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import random
|
| 11 |
+
from models import Observation, Action, Reward
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# ---------------------------------------------------------------------------
|
| 15 |
+
# Module-level configuration constants
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
|
| 18 |
+
MODES = ("household", "corporate")
|
| 19 |
+
|
| 20 |
+
# Spending categories available in each mode
|
| 21 |
+
CATEGORIES: dict[str, list[str]] = {
|
| 22 |
+
"household": [
|
| 23 |
+
"rent", "food", "utilities", "transport",
|
| 24 |
+
"education", "medical", "discretionary",
|
| 25 |
+
],
|
| 26 |
+
"corporate": [
|
| 27 |
+
"payroll", "operations", "marketing", "logistics",
|
| 28 |
+
"capex", "reserves", "miscellaneous",
|
| 29 |
+
],
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
# Income sampling range (inclusive) per mode — household values in Indian Rupees
|
| 33 |
+
INCOME_RANGE: dict[str, tuple[float, float]] = {
|
| 34 |
+
"household": (30_000.0, 100_000.0),
|
| 35 |
+
"corporate": (1_000_000.0, 50_000_000.0),
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
# Unexpected financial events that can hit the agent mid-episode
|
| 39 |
+
SHOCKS: dict[str, list[str]] = {
|
| 40 |
+
"household": [
|
| 41 |
+
"medical_emergency",
|
| 42 |
+
"appliance_repair",
|
| 43 |
+
"school_fee_spike",
|
| 44 |
+
"utility_surge",
|
| 45 |
+
],
|
| 46 |
+
"corporate": [
|
| 47 |
+
"vendor_default",
|
| 48 |
+
"regulatory_fine",
|
| 49 |
+
"equipment_failure",
|
| 50 |
+
"key_employee_exit",
|
| 51 |
+
],
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
# Each shock costs between 10% and 25% of total_income (sampled uniformly)
|
| 55 |
+
SHOCK_COST_RANGE: tuple[float, float] = (0.10, 0.25)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# Environment class
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
|
| 62 |
+
class BEACONEnvironment:
|
| 63 |
+
"""
|
| 64 |
+
BEACON: Budget Environment for Agent Control and Optimization of Needs.
|
| 65 |
+
|
| 66 |
+
An OpenEnv-compatible, dual-scale budget management RL environment.
|
| 67 |
+
The agent manages a budget over `total_periods` steps, allocating funds
|
| 68 |
+
across spending categories, growing savings, and weathering random
|
| 69 |
+
financial shocks.
|
| 70 |
+
|
| 71 |
+
Episode flow:
|
| 72 |
+
obs = env.reset()
|
| 73 |
+
while True:
|
| 74 |
+
action = agent.act(obs)
|
| 75 |
+
obs, reward, done, info = env.step(action)
|
| 76 |
+
if done:
|
| 77 |
+
break
|
| 78 |
+
"""
|
| 79 |
+
|
| 80 |
+
# ------------------------------------------------------------------
|
| 81 |
+
# Minimum category allocations as a fraction of total_income.
|
| 82 |
+
# Categories with 0.0 are non-essential (no penalty for zero spend).
|
| 83 |
+
# ------------------------------------------------------------------
|
| 84 |
+
MIN_REQUIREMENTS: dict[str, dict[str, float]] = {
|
| 85 |
+
"household": {
|
| 86 |
+
"rent": 0.25,
|
| 87 |
+
"food": 0.20,
|
| 88 |
+
"utilities": 0.08,
|
| 89 |
+
"transport": 0.05,
|
| 90 |
+
"education": 0.10,
|
| 91 |
+
"medical": 0.05,
|
| 92 |
+
"discretionary": 0.00, # non-essential
|
| 93 |
+
},
|
| 94 |
+
"corporate": {
|
| 95 |
+
"payroll": 0.35,
|
| 96 |
+
"operations": 0.20,
|
| 97 |
+
"marketing": 0.05,
|
| 98 |
+
"logistics": 0.08,
|
| 99 |
+
"capex": 0.05,
|
| 100 |
+
"reserves": 0.10,
|
| 101 |
+
"miscellaneous": 0.00, # non-essential
|
| 102 |
+
},
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
def __init__(
|
| 106 |
+
self,
|
| 107 |
+
mode: str = "household",
|
| 108 |
+
total_periods: int = 6,
|
| 109 |
+
seed: int = 42,
|
| 110 |
+
) -> None:
|
| 111 |
+
"""
|
| 112 |
+
Initialise the BEACON environment.
|
| 113 |
+
|
| 114 |
+
Args:
|
| 115 |
+
mode: Simulation mode — "household" or "corporate".
|
| 116 |
+
total_periods: Number of budget periods in one episode.
|
| 117 |
+
seed: Random seed for full reproducibility.
|
| 118 |
+
|
| 119 |
+
Raises:
|
| 120 |
+
ValueError: If an unrecognised mode is supplied.
|
| 121 |
+
"""
|
| 122 |
+
if mode not in MODES:
|
| 123 |
+
raise ValueError(
|
| 124 |
+
f"Invalid mode '{mode}'. Choose one of {MODES}."
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
self.mode = mode
|
| 128 |
+
self.total_periods = total_periods
|
| 129 |
+
self.seed = seed
|
| 130 |
+
|
| 131 |
+
# Isolated RNG — does not pollute global random state
|
| 132 |
+
self._rng = random.Random(seed)
|
| 133 |
+
|
| 134 |
+
# Internal state fields — initialised properly inside reset()
|
| 135 |
+
self._period: int = 1
|
| 136 |
+
self._total_income: float = 0.0
|
| 137 |
+
self._savings_balance: float = 0.0
|
| 138 |
+
self._savings_goal: float = 0.0
|
| 139 |
+
self._category_budgets: dict[str, float] = {}
|
| 140 |
+
self._category_spent: dict[str, float] = {}
|
| 141 |
+
self._active_shocks: list[str] = []
|
| 142 |
+
self._shock_costs: dict[str, float] = {} # shock → cost amount
|
| 143 |
+
|
| 144 |
+
# Start the first episode immediately
|
| 145 |
+
self.reset()
|
| 146 |
+
|
| 147 |
+
# ------------------------------------------------------------------
|
| 148 |
+
# Core API
|
| 149 |
+
# ------------------------------------------------------------------
|
| 150 |
+
|
| 151 |
+
def reset(self) -> Observation:
|
| 152 |
+
"""
|
| 153 |
+
Reset the environment and begin a new episode.
|
| 154 |
+
|
| 155 |
+
Re-seeds the internal RNG so that consecutive reset() calls always
|
| 156 |
+
produce the same starting state (deterministic reproducibility).
|
| 157 |
+
Randomly activates zero or one shock at episode start.
|
| 158 |
+
|
| 159 |
+
Returns:
|
| 160 |
+
The initial Observation for the new episode.
|
| 161 |
+
"""
|
| 162 |
+
# Fresh RNG from the same seed → identical episode starts every call
|
| 163 |
+
self._rng = random.Random(self.seed)
|
| 164 |
+
|
| 165 |
+
# --- Sample income -----------------------------------------------
|
| 166 |
+
lo, hi = INCOME_RANGE[self.mode]
|
| 167 |
+
self._total_income = self._rng.uniform(lo, hi)
|
| 168 |
+
|
| 169 |
+
# --- Savings goal = 20% of projected total income ----------------
|
| 170 |
+
self._savings_goal = 0.20 * self._total_income * self.total_periods
|
| 171 |
+
|
| 172 |
+
# --- Zero-initialise all category tracking -----------------------
|
| 173 |
+
categories = CATEGORIES[self.mode]
|
| 174 |
+
self._category_budgets = {cat: 0.0 for cat in categories}
|
| 175 |
+
self._category_spent = {cat: 0.0 for cat in categories}
|
| 176 |
+
|
| 177 |
+
# --- Reset savings and time counters -----------------------------
|
| 178 |
+
self._savings_balance = 0.0
|
| 179 |
+
self._period = 1
|
| 180 |
+
|
| 181 |
+
# --- Clear shock state, then optionally seed one starting shock --
|
| 182 |
+
self._active_shocks = []
|
| 183 |
+
self._shock_costs = {}
|
| 184 |
+
if self._rng.random() < 0.50: # 50% chance of a starting shock
|
| 185 |
+
self._activate_random_shock()
|
| 186 |
+
|
| 187 |
+
return self._make_observation()
|
| 188 |
+
|
| 189 |
+
def step(self, action: Action) -> tuple[Observation, Reward, bool, dict]:
|
| 190 |
+
"""
|
| 191 |
+
Execute one budget period using the agent's action.
|
| 192 |
+
|
| 193 |
+
Steps performed:
|
| 194 |
+
1. Apply category allocations → update budgets and spent amounts.
|
| 195 |
+
2. Add savings contribution → update savings balance.
|
| 196 |
+
3. Calculate the multi-component reward signal.
|
| 197 |
+
4. Advance the period counter.
|
| 198 |
+
5. Randomly activate a new shock (30% probability).
|
| 199 |
+
6. Determine episode termination.
|
| 200 |
+
|
| 201 |
+
Args:
|
| 202 |
+
action: The Action submitted by the agent for this period.
|
| 203 |
+
|
| 204 |
+
Returns:
|
| 205 |
+
observation: New environment state after the step.
|
| 206 |
+
reward: Structured Reward for this period.
|
| 207 |
+
done: True when the episode has ended.
|
| 208 |
+
info: Auxiliary diagnostic data (plain dict).
|
| 209 |
+
"""
|
| 210 |
+
# ---- 1. Apply category allocations ------------------------------
|
| 211 |
+
for cat, amount in action.allocations.items():
|
| 212 |
+
if cat in self._category_budgets:
|
| 213 |
+
# Treat the allocation as the amount budgeted and spent
|
| 214 |
+
self._category_budgets[cat] = amount
|
| 215 |
+
self._category_spent[cat] = amount
|
| 216 |
+
|
| 217 |
+
# ---- 2. Update savings balance ----------------------------------
|
| 218 |
+
self._savings_balance += action.savings_contribution
|
| 219 |
+
|
| 220 |
+
# ---- 3. Total spending = all allocations + savings this period --
|
| 221 |
+
total_spent = sum(action.allocations.values()) + action.savings_contribution
|
| 222 |
+
|
| 223 |
+
# ---- 4. Compute reward ------------------------------------------
|
| 224 |
+
reward = self._calculate_reward(action, total_spent)
|
| 225 |
+
|
| 226 |
+
# ---- 5. Advance time period -------------------------------------
|
| 227 |
+
self._period += 1
|
| 228 |
+
|
| 229 |
+
# ---- 6. Randomly activate a new shock (30% probability) ---------
|
| 230 |
+
if self._rng.random() < 0.30:
|
| 231 |
+
self._activate_random_shock()
|
| 232 |
+
|
| 233 |
+
# ---- 7. Episode is done when no periods remain ------------------
|
| 234 |
+
done = self.periods_remaining == 0
|
| 235 |
+
|
| 236 |
+
# ---- 8. Diagnostic info dict ------------------------------------
|
| 237 |
+
info: dict = {
|
| 238 |
+
"period_completed": self._period - 1,
|
| 239 |
+
"total_spent": total_spent,
|
| 240 |
+
"total_income": self._total_income,
|
| 241 |
+
"overspent": total_spent > self._total_income,
|
| 242 |
+
"active_shocks": list(self._active_shocks),
|
| 243 |
+
"shock_costs": dict(self._shock_costs),
|
| 244 |
+
"savings_balance": self._savings_balance,
|
| 245 |
+
"savings_goal": self._savings_goal,
|
| 246 |
+
"periods_remaining": self.periods_remaining,
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
return self._make_observation(), reward, done, info
|
| 250 |
+
|
| 251 |
+
def state(self) -> dict:
|
| 252 |
+
"""
|
| 253 |
+
Return the complete current environment state as a plain dictionary.
|
| 254 |
+
|
| 255 |
+
Useful for logging, checkpointing, or external serialisation without
|
| 256 |
+
constructing Pydantic models.
|
| 257 |
+
|
| 258 |
+
Returns:
|
| 259 |
+
A flat dict containing all internal state fields.
|
| 260 |
+
"""
|
| 261 |
+
return {
|
| 262 |
+
"mode": self.mode,
|
| 263 |
+
"period": self._period,
|
| 264 |
+
"total_periods": self.total_periods,
|
| 265 |
+
"periods_remaining": self.periods_remaining,
|
| 266 |
+
"total_income": self._total_income,
|
| 267 |
+
"savings_balance": self._savings_balance,
|
| 268 |
+
"savings_goal": self._savings_goal,
|
| 269 |
+
"category_budgets": dict(self._category_budgets),
|
| 270 |
+
"category_spent": dict(self._category_spent),
|
| 271 |
+
"active_shocks": list(self._active_shocks),
|
| 272 |
+
"shock_costs": dict(self._shock_costs),
|
| 273 |
+
"seed": self.seed,
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
# ------------------------------------------------------------------
|
| 277 |
+
# Properties
|
| 278 |
+
# ------------------------------------------------------------------
|
| 279 |
+
|
| 280 |
+
@property
|
| 281 |
+
def periods_remaining(self) -> int:
|
| 282 |
+
"""Number of budget periods still remaining in the current episode."""
|
| 283 |
+
return max(0, self.total_periods - self._period + 1)
|
| 284 |
+
|
| 285 |
+
# ------------------------------------------------------------------
|
| 286 |
+
# Private helpers
|
| 287 |
+
# ------------------------------------------------------------------
|
| 288 |
+
|
| 289 |
+
def _make_observation(self) -> Observation:
|
| 290 |
+
"""Build and return an Observation from the current internal state."""
|
| 291 |
+
return Observation(
|
| 292 |
+
mode=self.mode,
|
| 293 |
+
period=self._period,
|
| 294 |
+
total_income=self._total_income,
|
| 295 |
+
category_budgets=dict(self._category_budgets),
|
| 296 |
+
category_spent=dict(self._category_spent),
|
| 297 |
+
savings_balance=self._savings_balance,
|
| 298 |
+
savings_goal=self._savings_goal,
|
| 299 |
+
active_shocks=list(self._active_shocks),
|
| 300 |
+
periods_remaining=self.periods_remaining,
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
def _activate_random_shock(self) -> None:
|
| 304 |
+
"""
|
| 305 |
+
Select and activate one random shock from the mode's shock pool.
|
| 306 |
+
|
| 307 |
+
Prefers shocks not currently active. If all shocks are already active,
|
| 308 |
+
one is reselected and its cost is refreshed.
|
| 309 |
+
|
| 310 |
+
Cost is sampled uniformly in [10%, 25%] of total_income.
|
| 311 |
+
"""
|
| 312 |
+
available = SHOCKS[self.mode]
|
| 313 |
+
|
| 314 |
+
# Prefer shocks not yet active to diversify events
|
| 315 |
+
inactive = [s for s in available if s not in self._active_shocks]
|
| 316 |
+
shock = self._rng.choice(inactive if inactive else available)
|
| 317 |
+
|
| 318 |
+
# Sample a cost fraction and convert to absolute amount
|
| 319 |
+
cost_fraction = self._rng.uniform(*SHOCK_COST_RANGE)
|
| 320 |
+
shock_cost = cost_fraction * self._total_income
|
| 321 |
+
|
| 322 |
+
# Add to active list only if not already present
|
| 323 |
+
if shock not in self._active_shocks:
|
| 324 |
+
self._active_shocks.append(shock)
|
| 325 |
+
|
| 326 |
+
# Always update/refresh the cost (covers re-roll of existing shocks)
|
| 327 |
+
self._shock_costs[shock] = shock_cost
|
| 328 |
+
|
| 329 |
+
def _calculate_reward(self, action: Action, total_spent: float) -> Reward:
    """
    Score the agent's budgeting performance for the current period.

    Components (summed, then clipped to [-1.0, 1.0]):

    bills_paid_score ∈ [0.0, 0.4]
        Share of essential categories funded to at least 80% of their
        minimum requirement, scaled by 0.4 (full 0.4 if none exist).

    savings_progress_score ∈ [0.0, 0.3]
        (savings_balance / savings_goal) × 0.3, capped at 0.3.

    efficiency_score ∈ {0.0, 0.2}
        0.2 when total_spent ≤ total_income, else 0.0.

    shock_resilience_bonus ∈ {0.0, 0.1}
        0.1 when shocks are active AND total_spent covers their
        combined cost.

    penalties ∈ (-∞, 0.0]
        −0.3 per essential category with zero allocation,
        −0.1 for spending beyond total income.

    Args:
        action: Agent's action for this period.
        total_spent: Total funds deployed (allocations + savings).

    Returns:
        A fully populated Reward model.
    """
    required = self.MIN_REQUIREMENTS[self.mode]

    # Essential categories carry a strictly positive minimum fraction.
    essentials = [(cat, frac) for cat, frac in required.items() if frac > 0.0]

    # --- bills_paid_score (max 0.4) -----------------------------------
    covered = 0
    unfunded = 0  # essentials with zero allocation → penalised below
    for cat, frac in essentials:
        amount = action.allocations.get(cat, 0.0)
        if amount == 0.0:
            unfunded += 1
        elif amount >= 0.80 * (frac * self._total_income):
            # Funding ≥80% of the minimum counts the category as paid.
            covered += 1

    bills_paid_score = (
        0.4 if not essentials else (covered / len(essentials)) * 0.4
    )

    # --- savings_progress_score (max 0.3) -----------------------------
    if self._savings_goal > 0:
        savings_progress_score = min(
            (self._savings_balance / self._savings_goal) * 0.3, 0.3
        )
    else:
        savings_progress_score = 0.0

    # --- efficiency_score (0.2 iff within budget) ---------------------
    within_budget = total_spent <= self._total_income
    efficiency_score = 0.2 if within_budget else 0.0

    # --- shock_resilience_bonus (0.1 or 0.0) --------------------------
    # Granted when spending covers the combined cost of live shocks.
    shock_resilience_bonus = 0.0
    if self._active_shocks and total_spent >= sum(self._shock_costs.values()):
        shock_resilience_bonus = 0.1

    # --- penalties (≤ 0) ----------------------------------------------
    penalties = 0.0
    penalties -= 0.3 * unfunded  # each skipped essential category
    if not within_budget:
        penalties -= 0.1         # overspent available income

    # --- total, clipped to [-1.0, 1.0] --------------------------------
    total = (
        bills_paid_score
        + savings_progress_score
        + efficiency_score
        + shock_resilience_bonus
        + penalties
    )
    total = max(-1.0, min(1.0, total))

    return Reward(
        total=total,
        bills_paid_score=bills_paid_score,
        savings_progress_score=savings_progress_score,
        efficiency_score=efficiency_score,
        shock_resilience_bonus=shock_resilience_bonus,
        penalties=penalties,
    )
|
graders.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders.py — Evaluation graders for the BEACON reinforcement learning environment.
|
| 3 |
+
|
| 4 |
+
Each grader runs one complete, fully deterministic episode and returns a
|
| 5 |
+
normalised float score in [0.0, 1.0].
|
| 6 |
+
|
| 7 |
+
Graders:
|
| 8 |
+
grade_task1() — Easy: Bill Coverage (household, 1 period)
|
| 9 |
+
grade_task2() — Medium: Shock Absorption (household, 3 periods)
|
| 10 |
+
grade_task3() — Hard: 6-Month Goal Planning (corporate, 6 periods)
|
| 11 |
+
|
| 12 |
+
run_all_graders() runs all three, prints results, and returns a summary dict.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from environment import BEACONEnvironment
|
| 16 |
+
from models import Action
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# GRADER 1 — Easy: Bill Coverage
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
def grade_task1() -> float:
    """
    Easy grader — Bill Coverage.

    Runs one deterministic household period (seed 42): allocate the exact
    minimum to every category, bank the leftover income as savings, then
    rescale reward.bills_paid_score from [0.0, 0.4] to [0.0, 1.0].

    Returns:
        A float in [0.0, 1.0] representing bill-coverage performance.
    """
    env = BEACONEnvironment(mode="household", total_periods=1, seed=42)
    obs = env.reset()

    income = obs.total_income
    minimums = BEACONEnvironment.MIN_REQUIREMENTS["household"]

    # Exact-minimum allocation for every category; categories with a
    # zero minimum fraction (discretionary) simply receive 0.0.
    allocations = {cat: frac * income for cat, frac in minimums.items()}
    spent_on_bills = sum(allocations.values())

    # Whatever remains after the bills is contributed to savings.
    leftover = max(0.0, income - spent_on_bills)

    _obs, reward, _done, _info = env.step(
        Action(allocations=allocations, savings_contribution=leftover)
    )

    # bills_paid_score tops out at 0.4 → normalise into [0, 1].
    return round(reward.bills_paid_score / 0.4, 4)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
# GRADER 2 — Medium: Shock Absorption
|
| 73 |
+
# ---------------------------------------------------------------------------
|
| 74 |
+
|
| 75 |
+
def grade_task2() -> float:
    """
    Medium grader — Shock Absorption (household, 3 periods, seed 99).

    Policy per period:
        Periods 1–2: fund every minimum and route as much of the live
            shock cost as headroom allows through discretionary; the
            remainder goes to savings.
        Period 3: fund minimums only, zero discretionary, and sweep all
            remaining headroom into savings for recovery.

    Scoring:
        mean(reward.total) over the 3 steps lies in [-1.0, 1.0] and is
        rescaled to [0.0, 1.0].

    Returns:
        A float in [0.0, 1.0] representing shock-resilience performance.
    """
    env = BEACONEnvironment(mode="household", total_periods=3, seed=99)
    env.reset()

    # Guarantee at least one live shock so the scenario is meaningful.
    if not env._active_shocks:
        env._active_shocks = ["medical_emergency"]
        env._shock_costs = {"medical_emergency": 0.15 * env._total_income}

    minimums = BEACONEnvironment.MIN_REQUIREMENTS["household"]
    rewards: list[float] = []

    for step_num in (1, 2, 3):
        income = env._total_income

        # Essential baseline: every positive minimum, priced at income.
        essential_spend = sum(
            frac * income for frac in minimums.values() if frac > 0.0
        )
        headroom = income - essential_spend

        allocations = {cat: frac * income for cat, frac in minimums.items()}

        if step_num < 3:
            # Absorb the current shock cost (up to headroom) through the
            # discretionary category; bank whatever headroom remains.
            live_shock = (
                sum(env._shock_costs.values()) if env._active_shocks else 0.0
            )
            absorbed = min(live_shock, max(0.0, headroom))
            allocations["discretionary"] = absorbed
            savings_contribution = max(0.0, headroom - absorbed)
        else:
            # Recovery period: nothing discretionary, maximise savings.
            allocations["discretionary"] = 0.0
            savings_contribution = max(0.0, headroom)

        _obs, reward, _done, _info = env.step(
            Action(
                allocations=allocations,
                savings_contribution=savings_contribution,
            )
        )
        rewards.append(reward.total)

    # Rescale the mean reward from [-1.0, 1.0] into [0.0, 1.0].
    mean_reward = sum(rewards) / len(rewards)
    return round((mean_reward + 1.0) / 2.0, 4)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
# GRADER 3 — Hard: 6-Month Goal Planning
|
| 169 |
+
# ---------------------------------------------------------------------------
|
| 170 |
+
|
| 171 |
+
def grade_task3() -> float:
    """
    Hard grader — 6-Month Goal Planning (corporate, 6 periods, seed 7).

    Policy (each period): allocate exactly the minimum to every category
    and contribute a flat 15% of income to savings, keeping total spend
    within income.

    Scoring:
        goal_reached = min(savings_balance / savings_goal, 1.0)
        no_misses    = 1.0 if no essential category was ever unfunded,
                       else 0.5
        score        = goal_reached × 0.6 + no_misses × 0.4

    Returns:
        A float in [0.0, 1.0] representing long-term planning performance.
    """
    env = BEACONEnvironment(mode="corporate", total_periods=6, seed=7)
    env.reset()

    minimums = BEACONEnvironment.MIN_REQUIREMENTS["corporate"]
    missed_essential = False

    for _ in range(6):
        income = env._total_income

        # Minimum funding everywhere, plus a fixed 15% savings rate.
        allocations: dict[str, float] = {
            cat: frac * income for cat, frac in minimums.items()
        }
        savings_contribution = 0.15 * income

        # Note any essential category that would go unfunded this period.
        if any(
            frac > 0.0 and allocations.get(cat, 0.0) == 0.0
            for cat, frac in minimums.items()
        ):
            missed_essential = True

        _obs, _reward, done, _info = env.step(
            Action(
                allocations=allocations,
                savings_contribution=savings_contribution,
            )
        )
        if done:
            break

    goal = env._savings_goal
    goal_reached = min(env._savings_balance / goal, 1.0) if goal > 0 else 0.0
    no_misses = 0.5 if missed_essential else 1.0

    return round(goal_reached * 0.6 + no_misses * 0.4, 4)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
# ---------------------------------------------------------------------------
|
| 243 |
+
# Aggregate runner
|
| 244 |
+
# ---------------------------------------------------------------------------
|
| 245 |
+
|
| 246 |
+
def run_all_graders() -> dict[str, float]:
    """
    Execute every BEACON grader, echo each score to stdout, and return them.

    All graders are seeded and deterministic, so repeated runs produce
    identical scores.

    Returns:
        dict mapping "task1", "task2", "task3" to their float scores.
    """
    scores = {
        "task1": grade_task1(),
        "task2": grade_task2(),
        "task3": grade_task3(),
    }

    for label, value in scores.items():
        print(f"Task {label[-1]}: {value:.2f}")

    return scores
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
# ---------------------------------------------------------------------------
|
| 272 |
+
# Entry point
|
| 273 |
+
# ---------------------------------------------------------------------------
|
| 274 |
+
|
| 275 |
+
if __name__ == "__main__":
|
| 276 |
+
run_all_graders()
|
models.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
models.py — Pydantic v2 data models for the BEACON reinforcement learning environment.
|
| 3 |
+
|
| 4 |
+
BEACON (Budget Environment for Agent Control and Optimization of Needs) is a dual-scale
|
| 5 |
+
budget management environment supporting "household" and "corporate" simulation modes.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Observation(BaseModel):
    """
    Represents the observation returned to the agent at each environment step.

    Contains the full state of the current budget period, including income,
    category-level spending, savings progress, any active economic shocks,
    and how many periods are left in the episode.
    """

    # NOTE(review): the bare strings after each field are conventional
    # "attribute docstrings" — they are evaluated-and-discarded statements,
    # not part of the pydantic schema; kept as written for tooling that
    # reads them from the AST.
    mode: str
    """Simulation mode — either 'household' or 'corporate'."""

    period: int
    """Current time period, starting from 1."""

    total_income: float
    """Total income available for the current period."""

    category_budgets: dict[str, float]
    """Mapping of category name to the amount allocated for that category."""

    category_spent: dict[str, float]
    """Mapping of category name to the amount already spent this period."""

    savings_balance: float
    """Current accumulated savings balance."""

    savings_goal: float
    """Target savings balance the agent should aim to reach."""

    active_shocks: list[str]
    """Names of unexpected financial events currently affecting the environment."""

    periods_remaining: int
    """Number of time periods left before the episode ends."""
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class Action(BaseModel):
    """
    Represents the action submitted by the agent for a given time period.

    The agent specifies how much to allocate to each spending category and
    how much to contribute to savings from the available income.
    """

    # NOTE(review): categories missing from `allocations` are treated as
    # zero-allocated by the reward calculation (`allocations.get(cat, 0.0)`).
    allocations: dict[str, float]
    """Mapping of category name to the amount the agent allocates this period."""

    savings_contribution: float
    """Amount the agent chooses to add to savings this period."""
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class Reward(BaseModel):
    """
    Represents the reward signal returned to the agent after each step.

    The total reward is a scalar in [-1.0, 1.0] composed of several sub-scores
    that reflect different aspects of budgeting performance: bill coverage,
    savings trajectory, spending efficiency, and resilience to shocks.
    Penalties are subtracted for constraint violations.
    """

    total: float
    """Final scalar reward for the step, in the range [-1.0, 1.0]."""

    bills_paid_score: float
    """Score reflecting whether all essential bills and obligations were covered."""

    savings_progress_score: float
    """Score reflecting progress toward the savings goal."""

    efficiency_score: float
    """Score reflecting how efficiently income was allocated with minimal waste."""

    shock_resilience_bonus: float
    """Bonus awarded for successfully absorbing active economic shocks."""

    # Stored as a non-positive value; already subtracted into `total`.
    penalties: float
    """Cumulative penalty subtracted for constraint violations (e.g., overspending)."""
|
openenv.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: BEACON
version: "1.0.0"
description: >
  Dual-scale budget management environment where agents
  learn to allocate income across household and corporate
  financial categories under constraints and economic shocks.
author: your_name
tags: [finance, budgeting, planning, dual-scale]
modes: [household, corporate]
tasks:
  - task1
  - task2
  - task3
# NOTE(review): removed an accidentally committed markdown code fence,
# the prose line "**`requirements.txt`** — paste this:", and a duplicated
# copy of requirements.txt that followed — they made this file invalid
# YAML. The real dependency list lives in requirements.txt.
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
pydantic
|
| 4 |
+
openai
|
| 5 |
+
pyyaml
|
| 6 |
+
groq
|
server/__pycache__/app.cpython-311.pyc
ADDED
|
Binary file (9.57 kB). View file
|
|
|
server/app.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py — FastAPI server for the BEACON reinforcement learning environment.
|
| 3 |
+
|
| 4 |
+
Exposes the BEACON environment as a REST API so that agents, dashboards,
|
| 5 |
+
and evaluation pipelines can interact with it over HTTP.
|
| 6 |
+
|
| 7 |
+
Endpoints:
|
| 8 |
+
POST /reset — initialise / reset the environment
|
| 9 |
+
POST /step — submit an action and advance one period
|
| 10 |
+
GET /state — inspect the full current environment state
|
| 11 |
+
GET /tasks — list all available evaluation tasks
|
| 12 |
+
POST /grader — run a specific grader and get a score
|
| 13 |
+
GET /baseline — run all graders and return all scores
|
| 14 |
+
GET /health — liveness check
|
| 15 |
+
|
| 16 |
+
Usage:
|
| 17 |
+
python app.py
|
| 18 |
+
# or
|
| 19 |
+
uvicorn beacon_env.app:app --reload
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import os
|
| 23 |
+
import sys
|
| 24 |
+
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
# Ensure parent directory (d:/meta) is on the Python path so that
|
| 27 |
+
# environment.py, models.py, and graders.py can be imported as top-level
|
| 28 |
+
# modules from this subdirectory.
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
_PARENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 31 |
+
if _PARENT_DIR not in sys.path:
|
| 32 |
+
sys.path.insert(0, _PARENT_DIR)
|
| 33 |
+
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
# BEACON imports (resolved via sys.path above)
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
from environment import BEACONEnvironment # noqa: E402
|
| 38 |
+
from models import Action # noqa: E402
|
| 39 |
+
from graders import ( # noqa: E402
|
| 40 |
+
grade_task1,
|
| 41 |
+
grade_task2,
|
| 42 |
+
grade_task3,
|
| 43 |
+
run_all_graders,
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
# FastAPI imports
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
import uvicorn
|
| 50 |
+
from fastapi import FastAPI, HTTPException
|
| 51 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 52 |
+
from pydantic import BaseModel, Field
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
# App setup
|
| 57 |
+
# ---------------------------------------------------------------------------
|
| 58 |
+
|
| 59 |
+
app = FastAPI(
|
| 60 |
+
title="BEACON Environment API",
|
| 61 |
+
description=(
|
| 62 |
+
"REST API for the BEACON dual-scale budget management "
|
| 63 |
+
"reinforcement learning environment."
|
| 64 |
+
),
|
| 65 |
+
version="1.0.0",
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
# Allow all origins so browser-based agents and dashboards can connect freely
|
| 69 |
+
app.add_middleware(
|
| 70 |
+
CORSMiddleware,
|
| 71 |
+
allow_origins=["*"],
|
| 72 |
+
allow_credentials=True,
|
| 73 |
+
allow_methods=["*"],
|
| 74 |
+
allow_headers=["*"],
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
# Global environment instance
|
| 80 |
+
# Starts as None; created / replaced on the first POST /reset call.
|
| 81 |
+
# A default instance is also created at startup so GET endpoints work
|
| 82 |
+
# immediately without requiring a prior reset.
|
| 83 |
+
# ---------------------------------------------------------------------------
|
| 84 |
+
|
| 85 |
+
_env: BEACONEnvironment = BEACONEnvironment(mode="household", seed=42)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _require_env() -> BEACONEnvironment:
    """Return the shared environment instance, or fail with HTTP 503."""
    if _env is not None:
        return _env
    # Guard clause: no environment means no prior successful /reset.
    raise HTTPException(
        status_code=503,
        detail="Environment not initialised. Call POST /reset first.",
    )
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
# Request / response schemas
|
| 100 |
+
# ---------------------------------------------------------------------------
|
| 101 |
+
|
| 102 |
+
class ResetRequest(BaseModel):
    """Request body for POST /reset."""
    # Defaults mirror the module-level default environment (household, seed 42).
    mode: str = Field(default="household", description="'household' or 'corporate'")
    seed: int = Field(default=42, description="Random seed for reproducibility")
    total_periods: int = Field(default=6, description="Number of budget periods per episode")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class GraderRequest(BaseModel):
    """Request body for POST /grader."""
    # Presumably validated against _GRADER_MAP by the /grader handler —
    # TODO confirm (handler not visible in this chunk).
    task_id: str = Field(description="One of: 'task1', 'task2', 'task3'")
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# ---------------------------------------------------------------------------
|
| 115 |
+
# Task catalogue (static metadata)
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
|
| 118 |
+
ACTION_SCHEMA = {
|
| 119 |
+
"allocations": "dict[str, float]",
|
| 120 |
+
"savings_contribution": "float",
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
TASK_CATALOGUE = [
|
| 124 |
+
{
|
| 125 |
+
"task_id": "task1",
|
| 126 |
+
"name": "Bill Coverage",
|
| 127 |
+
"difficulty": "easy",
|
| 128 |
+
"description": "Allocate income to cover all essential bills in a single period.",
|
| 129 |
+
"mode": "household",
|
| 130 |
+
"periods": 1,
|
| 131 |
+
"seed": 42,
|
| 132 |
+
"action_schema": ACTION_SCHEMA,
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"task_id": "task2",
|
| 136 |
+
"name": "Shock Absorption",
|
| 137 |
+
"difficulty": "medium",
|
| 138 |
+
"description": (
|
| 139 |
+
"Maintain essential spending while absorbing unexpected "
|
| 140 |
+
"financial shocks across 3 periods."
|
| 141 |
+
),
|
| 142 |
+
"mode": "household",
|
| 143 |
+
"periods": 3,
|
| 144 |
+
"seed": 99,
|
| 145 |
+
"action_schema": ACTION_SCHEMA,
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"task_id": "task3",
|
| 149 |
+
"name": "6-Month Goal Planning",
|
| 150 |
+
"difficulty": "hard",
|
| 151 |
+
"description": (
|
| 152 |
+
"Manage a corporate budget over 6 periods, covering all "
|
| 153 |
+
"essential categories while reaching the savings goal."
|
| 154 |
+
),
|
| 155 |
+
"mode": "corporate",
|
| 156 |
+
"periods": 6,
|
| 157 |
+
"seed": 7,
|
| 158 |
+
"action_schema": ACTION_SCHEMA,
|
| 159 |
+
},
|
| 160 |
+
]
|
| 161 |
+
|
| 162 |
+
# Map task_id → grader function for quick lookup
|
| 163 |
+
_GRADER_MAP = {
|
| 164 |
+
"task1": grade_task1,
|
| 165 |
+
"task2": grade_task2,
|
| 166 |
+
"task3": grade_task3,
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ---------------------------------------------------------------------------
|
| 171 |
+
# Endpoints
|
| 172 |
+
# ---------------------------------------------------------------------------
|
| 173 |
+
|
| 174 |
+
@app.get("/health", summary="Liveness check")
def health():
    """Report liveness: a small status payload confirming the service runs."""
    payload = {"status": "ok", "environment": "BEACON"}
    return payload


@app.post("/reset", summary="Initialise or reset the environment")
def reset(body: ResetRequest = ResetRequest()):
    """
    Create a fresh BEACONEnvironment with the given parameters and call
    reset(). Returns the initial Observation as JSON.

    - **mode**: `"household"` or `"corporate"` (default: `"household"`)
    - **seed**: random seed for reproducibility (default: `42`)
    - **total_periods**: episode length (default: `6`)

    Responds with HTTP 400 when the environment constructor or reset()
    rejects the parameters with a ValueError.
    """
    global _env
    try:
        _env = BEACONEnvironment(
            mode=body.mode,
            total_periods=body.total_periods,
            seed=body.seed,
        )
        obs = _env.reset()
    except ValueError as exc:
        # Chain the original ValueError (`from exc`) so the root cause is
        # preserved in tracebacks/logs instead of being reported as an
        # unrelated error raised "during handling" of the first one.
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return obs.model_dump()


@app.post("/step", summary="Submit an action and advance one period")
def step(action: Action):
    """
    Apply the agent's Action to the active environment and advance the
    episode by one budget period.

    - **allocations**: `{category: amount, ...}` — must cover all categories
    - **savings_contribution**: amount added to savings this period

    Returns the new Observation, the Reward breakdown, the done flag, and
    an auxiliary info dict.
    """
    env = _require_env()
    observation, reward, done, info = env.step(action)

    response = {
        "observation": observation.model_dump(),
        "reward": reward.model_dump(),
        "done": done,
        "info": info,
    }
    return response


@app.get("/state", summary="Inspect the current environment state")
def state():
    """
    Expose the environment's full internal state as a plain dictionary.

    Read-only: calling this endpoint never advances the episode.
    """
    return _require_env().state()


@app.get("/tasks", summary="List all available evaluation tasks")
def tasks():
    """
    List metadata for every BEACON evaluation task: difficulty, mode,
    episode length, and the expected action schema.
    """
    return TASK_CATALOGUE


@app.post("/grader", summary="Run a specific grader and return its score")
def grader(body: GraderRequest):
    """
    Run the grader registered for the requested task and return its
    normalised score in [0.0, 1.0].

    - **task_id**: one of `"task1"`, `"task2"`, `"task3"`

    Responds with HTTP 404 for an unknown task_id.
    """
    grader_fn = _GRADER_MAP.get(body.task_id)
    if grader_fn is None:
        # list(dict) yields the keys, same repr as list(dict.keys()).
        valid_ids = list(_GRADER_MAP)
        raise HTTPException(
            status_code=404,
            detail=(
                f"Unknown task_id '{body.task_id}'. "
                f"Valid options: {valid_ids}"
            ),
        )

    return {"task_id": body.task_id, "score": grader_fn()}


@app.get("/baseline", summary="Run all graders and return all scores")
def baseline():
    """
    Run every BEACON grader in sequence and return all of their scores.

    Deterministic: repeated calls always yield identical scores.
    """
    return run_all_graders()


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

# Local development entry point: serve the ASGI app directly via uvicorn.
# (The Docker image instead launches the server through the `uvicorn` CLI.)
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)