sakthivarshans committed on
Commit
5a37ff6
·
1 Parent(s): f72012b

Initial BEACON environment

Browse files
Dockerfile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+ WORKDIR /app
3
+ COPY requirements.txt .
4
+ RUN pip install --no-cache-dir -r requirements.txt
5
+ COPY . .
6
+ EXPOSE 7860
7
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/environment.cpython-311.pyc ADDED
Binary file (14.8 kB). View file
 
__pycache__/graders.cpython-311.pyc ADDED
Binary file (9.93 kB). View file
 
__pycache__/models.cpython-311.pyc ADDED
Binary file (2.97 kB). View file
 
baseline.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ baseline.py — Groq LLM baseline agent for the BEACON RL environment.
3
+
4
+ Runs a Llama 3 model (via Groq) as a zero-shot budget-allocation agent
5
+ against all three BEACON tasks and prints reproducible episode scores.
6
+
7
+ Usage:
8
+ export GROQ_API_KEY="your-key-here"
9
+ python baseline.py
10
+
11
+ Requirements:
12
+ pip install openai
13
+ """
14
+
15
+ import json
16
+ import os
17
+
18
+ from openai import OpenAI
19
+
20
+ from environment import BEACONEnvironment
21
+ from models import Action
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Groq client — OpenAI-compatible endpoint
26
+ # ---------------------------------------------------------------------------
27
+
28
+ client = OpenAI(
29
+ api_key=os.environ.get("GROQ_API_KEY"),
30
+ base_url="https://api.groq.com/openai/v1",
31
+ )
32
+
33
+ MODEL = "llama3-8b-8192"
34
+ TEMPERATURE = 0 # deterministic completions for reproducibility
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Prompt builder
39
+ # ---------------------------------------------------------------------------
40
+
41
+ def _build_prompt(obs, step_num: int) -> str:
42
+ """
43
+ Construct a structured natural-language prompt from the current Observation.
44
+
45
+ The prompt instructs the LLM to return ONLY a valid JSON object with
46
+ `allocations` and `savings_contribution` fields. No prose, no markdown.
47
+
48
+ Args:
49
+ obs: The current Observation from the environment.
50
+ step_num: 1-indexed step number within the episode (for context).
51
+
52
+ Returns:
53
+ A formatted prompt string.
54
+ """
55
+ # Format category budgets and spent as a readable table
56
+ budget_lines = "\n".join(
57
+ f" {cat}: allocated={obs.category_budgets[cat]:.2f}, "
58
+ f"spent={obs.category_spent[cat]:.2f}"
59
+ for cat in obs.category_budgets
60
+ )
61
+
62
+ shocks_text = (
63
+ ", ".join(obs.active_shocks) if obs.active_shocks else "none"
64
+ )
65
+
66
+ prompt = f"""You are a financial planning agent managing a {obs.mode} budget.
67
+
68
+ Current state (Period {obs.period} of {obs.period + obs.periods_remaining - 1}):
69
+ - Periods remaining (including this one): {obs.periods_remaining}
70
+ - Total income available this period: {obs.total_income:.2f}
71
+ - Savings balance: {obs.savings_balance:.2f}
72
+ - Savings goal: {obs.savings_goal:.2f}
73
+ - Active financial shocks: {shocks_text}
74
+
75
+ Category budgets and spending so far:
76
+ {budget_lines}
77
+
78
+ Your task:
79
+ Allocate this period's income across all categories and decide how much to save.
80
+ The total of all allocations + savings_contribution must NOT exceed {obs.total_income:.2f}.
81
+ Prioritise essential categories first (avoid allocating 0 to any necessary category).
82
+ Try to make progress toward the savings goal each period.
83
+
84
+ Respond with ONLY a valid JSON object — no explanation, no markdown, no extra text:
85
+ {{
86
+ "allocations": {{
87
+ {", ".join(f'"{cat}": <float>' for cat in obs.category_budgets)}
88
+ }},
89
+ "savings_contribution": <float>
90
+ }}"""
91
+
92
+ return prompt
93
+
94
+
95
+ # ---------------------------------------------------------------------------
96
+ # Fallback action
97
+ # ---------------------------------------------------------------------------
98
+
99
+ def _fallback_action(obs) -> Action:
100
+ """
101
+ Build a safe fallback Action using exact minimum required allocations.
102
+
103
+ Used when the LLM response cannot be parsed as valid JSON. Allocates
104
+ exactly the minimum fraction of income to each category and puts any
105
+ remaining income into savings.
106
+
107
+ Args:
108
+ obs: The current Observation (provides income and mode context).
109
+
110
+ Returns:
111
+ A valid Action that satisfies all essential category minimums.
112
+ """
113
+ minimums = BEACONEnvironment.MIN_REQUIREMENTS[obs.mode]
114
+ income = obs.total_income
115
+
116
+ allocations = {cat: frac * income for cat, frac in minimums.items()}
117
+ total_bills = sum(allocations.values())
118
+
119
+ # Sweep remaining income into savings after covering bills
120
+ savings_contribution = max(0.0, income - total_bills)
121
+
122
+ return Action(
123
+ allocations=allocations,
124
+ savings_contribution=savings_contribution,
125
+ )
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # LLM action parser
130
+ # ---------------------------------------------------------------------------
131
+
132
+ def _parse_action(response_text: str, obs) -> Action:
133
+ """
134
+ Parse the LLM's JSON response into a valid Action.
135
+
136
+ Applies two safety guards after parsing:
137
+ 1. Clamps all allocation values to non-negative floats.
138
+ 2. Scales the entire action down proportionally if total spend would
139
+ exceed total_income, ensuring the agent never overspends.
140
+
141
+ Falls back to minimum allocations if the response is not valid JSON.
142
+
143
+ Args:
144
+ response_text: Raw text returned by the LLM.
145
+ obs: Current Observation (used for income and fallback).
146
+
147
+ Returns:
148
+ A valid Action ready to pass to env.step().
149
+ """
150
+ try:
151
+ # Strip surrounding whitespace/newlines before parsing
152
+ data = json.loads(response_text.strip())
153
+
154
+ allocations = {
155
+ cat: max(0.0, float(v))
156
+ for cat, v in data["allocations"].items()
157
+ }
158
+ savings_contribution = max(0.0, float(data["savings_contribution"]))
159
+
160
+ # Safety clamp: scale down if total spend exceeds income
161
+ total_requested = sum(allocations.values()) + savings_contribution
162
+ if total_requested > obs.total_income and total_requested > 0:
163
+ scale = obs.total_income / total_requested
164
+ allocations = {cat: amt * scale for cat, amt in allocations.items()}
165
+ savings_contribution *= scale
166
+
167
+ return Action(
168
+ allocations=allocations,
169
+ savings_contribution=savings_contribution,
170
+ )
171
+
172
+ except (json.JSONDecodeError, KeyError, TypeError, ValueError) as exc:
173
+ print(f" [WARN] Could not parse LLM response ({type(exc).__name__}: {exc}). "
174
+ f"Using fallback minimum allocations.")
175
+ return _fallback_action(obs)
176
+
177
+
178
+ # ---------------------------------------------------------------------------
179
+ # Core episode runner
180
+ # ---------------------------------------------------------------------------
181
+
182
+ def run_agent_episode(mode: str, total_periods: int, seed: int) -> float:
183
+ """
184
+ Run a full BEACON episode with the Groq LLM agent and return the
185
+ average reward across all periods.
186
+
187
+ At each step the agent receives a natural-language prompt describing
188
+ the current budget state, responds with a JSON allocation plan, and
189
+ the environment returns a structured Reward. If the LLM produces
190
+ unparseable output, a safe minimum-allocation fallback is used.
191
+
192
+ Args:
193
+ mode: BEACON mode — "household" or "corporate".
194
+ total_periods: Number of budget periods in the episode.
195
+ seed: Random seed for environment reproducibility.
196
+
197
+ Returns:
198
+ Mean reward.total across all completed periods (float in [-1.0, 1.0]).
199
+ """
200
+ # --- Initialise environment ----------------------------------------------
201
+ env = BEACONEnvironment(mode=mode, total_periods=total_periods, seed=seed)
202
+ obs = env.reset()
203
+
204
+ period_rewards: list[float] = []
205
+
206
+ system_prompt = (
207
+ "You are a precise financial planning agent. "
208
+ "You always respond with ONLY valid JSON — no prose, no markdown fences, "
209
+ "no explanation. Every numeric value must be a plain float."
210
+ )
211
+
212
+ # --- Episode loop --------------------------------------------------------
213
+ for step_num in range(1, total_periods + 1):
214
+ user_prompt = _build_prompt(obs, step_num)
215
+
216
+ # --- Query the LLM ---------------------------------------------------
217
+ try:
218
+ response = client.chat.completions.create(
219
+ model=MODEL,
220
+ temperature=TEMPERATURE,
221
+ messages=[
222
+ {"role": "system", "content": system_prompt},
223
+ {"role": "user", "content": user_prompt},
224
+ ],
225
+ )
226
+ raw_text = response.choices[0].message.content or ""
227
+ except Exception as exc:
228
+ print(f" [WARN] LLM API call failed (step {step_num}): {exc}. "
229
+ f"Using fallback action.")
230
+ raw_text = "" # triggers fallback in _parse_action
231
+
232
+ # --- Parse response into an Action -----------------------------------
233
+ action = _parse_action(raw_text, obs)
234
+
235
+ # --- Step the environment --------------------------------------------
236
+ obs, reward, done, _info = env.step(action)
237
+ period_rewards.append(reward.total)
238
+
239
+ if done:
240
+ break
241
+
242
+ # --- Average reward across all periods -----------------------------------
243
+ avg_reward = sum(period_rewards) / len(period_rewards) if period_rewards else 0.0
244
+ return avg_reward
245
+
246
+
247
+ # ---------------------------------------------------------------------------
248
+ # Top-level baseline runner
249
+ # ---------------------------------------------------------------------------
250
+
251
+ def run_baseline() -> dict[str, float]:
252
+ """
253
+ Run all three BEACON tasks with the Groq LLM agent and report scores.
254
+
255
+ Tasks:
256
+ Task 1 — Easy: household mode, 1 period, seed=42
257
+ Task 2 — Medium: household mode, 3 periods, seed=99
258
+ Task 3 — Hard: corporate mode, 6 periods, seed=7
259
+
260
+ Each task returns the mean reward across all periods, printed to 2
261
+ decimal places.
262
+
263
+ Returns:
264
+ dict with keys "task1", "task2", "task3" mapping to float scores.
265
+ """
266
+ print("Running BEACON baseline...")
267
+ print(f" Model : {MODEL}")
268
+ print(f" Temp : {TEMPERATURE}")
269
+ print()
270
+
271
+ # --- Task 1: Easy — Bill Coverage (1 period, household) ------------------
272
+ print("Task 1 (Easy — Bill Coverage)...")
273
+ score1 = run_agent_episode(mode="household", total_periods=1, seed=42)
274
+ print(f"Task 1: {score1:.2f}")
275
+ print()
276
+
277
+ # --- Task 2: Medium — Shock Absorption (3 periods, household) ------------
278
+ print("Task 2 (Medium — Shock Absorption)...")
279
+ score2 = run_agent_episode(mode="household", total_periods=3, seed=99)
280
+ print(f"Task 2: {score2:.2f}")
281
+ print()
282
+
283
+ # --- Task 3: Hard — 6-Month Goal Planning (6 periods, corporate) ---------
284
+ print("Task 3 (Hard — 6-Month Goal Planning)...")
285
+ score3 = run_agent_episode(mode="corporate", total_periods=6, seed=7)
286
+ print(f"Task 3: {score3:.2f}")
287
+ print()
288
+
289
+ return {
290
+ "task1": score1,
291
+ "task2": score2,
292
+ "task3": score3,
293
+ }
294
+
295
+
296
+ # ---------------------------------------------------------------------------
297
+ # Entry point
298
+ # ---------------------------------------------------------------------------
299
+
300
+ if __name__ == "__main__":
301
+ run_baseline()
environment.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ environment.py — BEACON reinforcement learning environment.
3
+
4
+ BEACON (Budget Environment for Agent Control and Optimization of Needs) is a
5
+ dual-scale budget management environment with two operating modes:
6
+ - "household": personal finance simulation (income in Indian Rupees)
7
+ - "corporate": organisational finance simulation
8
+ """
9
+
10
+ import random
11
+ from models import Observation, Action, Reward
12
+
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Module-level configuration constants
16
+ # ---------------------------------------------------------------------------
17
+
18
+ MODES = ("household", "corporate")
19
+
20
+ # Spending categories available in each mode
21
+ CATEGORIES: dict[str, list[str]] = {
22
+ "household": [
23
+ "rent", "food", "utilities", "transport",
24
+ "education", "medical", "discretionary",
25
+ ],
26
+ "corporate": [
27
+ "payroll", "operations", "marketing", "logistics",
28
+ "capex", "reserves", "miscellaneous",
29
+ ],
30
+ }
31
+
32
+ # Income sampling range (inclusive) per mode — household values in Indian Rupees
33
+ INCOME_RANGE: dict[str, tuple[float, float]] = {
34
+ "household": (30_000.0, 100_000.0),
35
+ "corporate": (1_000_000.0, 50_000_000.0),
36
+ }
37
+
38
+ # Unexpected financial events that can hit the agent mid-episode
39
+ SHOCKS: dict[str, list[str]] = {
40
+ "household": [
41
+ "medical_emergency",
42
+ "appliance_repair",
43
+ "school_fee_spike",
44
+ "utility_surge",
45
+ ],
46
+ "corporate": [
47
+ "vendor_default",
48
+ "regulatory_fine",
49
+ "equipment_failure",
50
+ "key_employee_exit",
51
+ ],
52
+ }
53
+
54
+ # Each shock costs between 10% and 25% of total_income (sampled uniformly)
55
+ SHOCK_COST_RANGE: tuple[float, float] = (0.10, 0.25)
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Environment class
60
+ # ---------------------------------------------------------------------------
61
+
62
+ class BEACONEnvironment:
63
+ """
64
+ BEACON: Budget Environment for Agent Control and Optimization of Needs.
65
+
66
+ An OpenEnv-compatible, dual-scale budget management RL environment.
67
+ The agent manages a budget over `total_periods` steps, allocating funds
68
+ across spending categories, growing savings, and weathering random
69
+ financial shocks.
70
+
71
+ Episode flow:
72
+ obs = env.reset()
73
+ while True:
74
+ action = agent.act(obs)
75
+ obs, reward, done, info = env.step(action)
76
+ if done:
77
+ break
78
+ """
79
+
80
+ # ------------------------------------------------------------------
81
+ # Minimum category allocations as a fraction of total_income.
82
+ # Categories with 0.0 are non-essential (no penalty for zero spend).
83
+ # ------------------------------------------------------------------
84
+ MIN_REQUIREMENTS: dict[str, dict[str, float]] = {
85
+ "household": {
86
+ "rent": 0.25,
87
+ "food": 0.20,
88
+ "utilities": 0.08,
89
+ "transport": 0.05,
90
+ "education": 0.10,
91
+ "medical": 0.05,
92
+ "discretionary": 0.00, # non-essential
93
+ },
94
+ "corporate": {
95
+ "payroll": 0.35,
96
+ "operations": 0.20,
97
+ "marketing": 0.05,
98
+ "logistics": 0.08,
99
+ "capex": 0.05,
100
+ "reserves": 0.10,
101
+ "miscellaneous": 0.00, # non-essential
102
+ },
103
+ }
104
+
105
+ def __init__(
106
+ self,
107
+ mode: str = "household",
108
+ total_periods: int = 6,
109
+ seed: int = 42,
110
+ ) -> None:
111
+ """
112
+ Initialise the BEACON environment.
113
+
114
+ Args:
115
+ mode: Simulation mode — "household" or "corporate".
116
+ total_periods: Number of budget periods in one episode.
117
+ seed: Random seed for full reproducibility.
118
+
119
+ Raises:
120
+ ValueError: If an unrecognised mode is supplied.
121
+ """
122
+ if mode not in MODES:
123
+ raise ValueError(
124
+ f"Invalid mode '{mode}'. Choose one of {MODES}."
125
+ )
126
+
127
+ self.mode = mode
128
+ self.total_periods = total_periods
129
+ self.seed = seed
130
+
131
+ # Isolated RNG — does not pollute global random state
132
+ self._rng = random.Random(seed)
133
+
134
+ # Internal state fields — initialised properly inside reset()
135
+ self._period: int = 1
136
+ self._total_income: float = 0.0
137
+ self._savings_balance: float = 0.0
138
+ self._savings_goal: float = 0.0
139
+ self._category_budgets: dict[str, float] = {}
140
+ self._category_spent: dict[str, float] = {}
141
+ self._active_shocks: list[str] = []
142
+ self._shock_costs: dict[str, float] = {} # shock → cost amount
143
+
144
+ # Start the first episode immediately
145
+ self.reset()
146
+
147
+ # ------------------------------------------------------------------
148
+ # Core API
149
+ # ------------------------------------------------------------------
150
+
151
+ def reset(self) -> Observation:
152
+ """
153
+ Reset the environment and begin a new episode.
154
+
155
+ Re-seeds the internal RNG so that consecutive reset() calls always
156
+ produce the same starting state (deterministic reproducibility).
157
+ Randomly activates zero or one shock at episode start.
158
+
159
+ Returns:
160
+ The initial Observation for the new episode.
161
+ """
162
+ # Fresh RNG from the same seed → identical episode starts every call
163
+ self._rng = random.Random(self.seed)
164
+
165
+ # --- Sample income -----------------------------------------------
166
+ lo, hi = INCOME_RANGE[self.mode]
167
+ self._total_income = self._rng.uniform(lo, hi)
168
+
169
+ # --- Savings goal = 20% of projected total income ----------------
170
+ self._savings_goal = 0.20 * self._total_income * self.total_periods
171
+
172
+ # --- Zero-initialise all category tracking -----------------------
173
+ categories = CATEGORIES[self.mode]
174
+ self._category_budgets = {cat: 0.0 for cat in categories}
175
+ self._category_spent = {cat: 0.0 for cat in categories}
176
+
177
+ # --- Reset savings and time counters -----------------------------
178
+ self._savings_balance = 0.0
179
+ self._period = 1
180
+
181
+ # --- Clear shock state, then optionally seed one starting shock --
182
+ self._active_shocks = []
183
+ self._shock_costs = {}
184
+ if self._rng.random() < 0.50: # 50% chance of a starting shock
185
+ self._activate_random_shock()
186
+
187
+ return self._make_observation()
188
+
189
+ def step(self, action: Action) -> tuple[Observation, Reward, bool, dict]:
190
+ """
191
+ Execute one budget period using the agent's action.
192
+
193
+ Steps performed:
194
+ 1. Apply category allocations → update budgets and spent amounts.
195
+ 2. Add savings contribution → update savings balance.
196
+ 3. Calculate the multi-component reward signal.
197
+ 4. Advance the period counter.
198
+ 5. Randomly activate a new shock (30% probability).
199
+ 6. Determine episode termination.
200
+
201
+ Args:
202
+ action: The Action submitted by the agent for this period.
203
+
204
+ Returns:
205
+ observation: New environment state after the step.
206
+ reward: Structured Reward for this period.
207
+ done: True when the episode has ended.
208
+ info: Auxiliary diagnostic data (plain dict).
209
+ """
210
+ # ---- 1. Apply category allocations ------------------------------
211
+ for cat, amount in action.allocations.items():
212
+ if cat in self._category_budgets:
213
+ # Treat the allocation as the amount budgeted and spent
214
+ self._category_budgets[cat] = amount
215
+ self._category_spent[cat] = amount
216
+
217
+ # ---- 2. Update savings balance ----------------------------------
218
+ self._savings_balance += action.savings_contribution
219
+
220
+ # ---- 3. Total spending = all allocations + savings this period --
221
+ total_spent = sum(action.allocations.values()) + action.savings_contribution
222
+
223
+ # ---- 4. Compute reward ------------------------------------------
224
+ reward = self._calculate_reward(action, total_spent)
225
+
226
+ # ---- 5. Advance time period -------------------------------------
227
+ self._period += 1
228
+
229
+ # ---- 6. Randomly activate a new shock (30% probability) ---------
230
+ if self._rng.random() < 0.30:
231
+ self._activate_random_shock()
232
+
233
+ # ---- 7. Episode is done when no periods remain ------------------
234
+ done = self.periods_remaining == 0
235
+
236
+ # ---- 8. Diagnostic info dict ------------------------------------
237
+ info: dict = {
238
+ "period_completed": self._period - 1,
239
+ "total_spent": total_spent,
240
+ "total_income": self._total_income,
241
+ "overspent": total_spent > self._total_income,
242
+ "active_shocks": list(self._active_shocks),
243
+ "shock_costs": dict(self._shock_costs),
244
+ "savings_balance": self._savings_balance,
245
+ "savings_goal": self._savings_goal,
246
+ "periods_remaining": self.periods_remaining,
247
+ }
248
+
249
+ return self._make_observation(), reward, done, info
250
+
251
+ def state(self) -> dict:
252
+ """
253
+ Return the complete current environment state as a plain dictionary.
254
+
255
+ Useful for logging, checkpointing, or external serialisation without
256
+ constructing Pydantic models.
257
+
258
+ Returns:
259
+ A flat dict containing all internal state fields.
260
+ """
261
+ return {
262
+ "mode": self.mode,
263
+ "period": self._period,
264
+ "total_periods": self.total_periods,
265
+ "periods_remaining": self.periods_remaining,
266
+ "total_income": self._total_income,
267
+ "savings_balance": self._savings_balance,
268
+ "savings_goal": self._savings_goal,
269
+ "category_budgets": dict(self._category_budgets),
270
+ "category_spent": dict(self._category_spent),
271
+ "active_shocks": list(self._active_shocks),
272
+ "shock_costs": dict(self._shock_costs),
273
+ "seed": self.seed,
274
+ }
275
+
276
+ # ------------------------------------------------------------------
277
+ # Properties
278
+ # ------------------------------------------------------------------
279
+
280
+ @property
281
+ def periods_remaining(self) -> int:
282
+ """Number of budget periods still remaining in the current episode."""
283
+ return max(0, self.total_periods - self._period + 1)
284
+
285
+ # ------------------------------------------------------------------
286
+ # Private helpers
287
+ # ------------------------------------------------------------------
288
+
289
+ def _make_observation(self) -> Observation:
290
+ """Build and return an Observation from the current internal state."""
291
+ return Observation(
292
+ mode=self.mode,
293
+ period=self._period,
294
+ total_income=self._total_income,
295
+ category_budgets=dict(self._category_budgets),
296
+ category_spent=dict(self._category_spent),
297
+ savings_balance=self._savings_balance,
298
+ savings_goal=self._savings_goal,
299
+ active_shocks=list(self._active_shocks),
300
+ periods_remaining=self.periods_remaining,
301
+ )
302
+
303
+ def _activate_random_shock(self) -> None:
304
+ """
305
+ Select and activate one random shock from the mode's shock pool.
306
+
307
+ Prefers shocks not currently active. If all shocks are already active,
308
+ one is reselected and its cost is refreshed.
309
+
310
+ Cost is sampled uniformly in [10%, 25%] of total_income.
311
+ """
312
+ available = SHOCKS[self.mode]
313
+
314
+ # Prefer shocks not yet active to diversify events
315
+ inactive = [s for s in available if s not in self._active_shocks]
316
+ shock = self._rng.choice(inactive if inactive else available)
317
+
318
+ # Sample a cost fraction and convert to absolute amount
319
+ cost_fraction = self._rng.uniform(*SHOCK_COST_RANGE)
320
+ shock_cost = cost_fraction * self._total_income
321
+
322
+ # Add to active list only if not already present
323
+ if shock not in self._active_shocks:
324
+ self._active_shocks.append(shock)
325
+
326
+ # Always update/refresh the cost (covers re-roll of existing shocks)
327
+ self._shock_costs[shock] = shock_cost
328
+
329
+ def _calculate_reward(self, action: Action, total_spent: float) -> Reward:
330
+ """
331
+ Compute the structured Reward for the current period.
332
+
333
+ Component breakdown:
334
+ bills_paid_score ∈ [0.0, 0.4]
335
+ Fraction of essential categories that received ≥ 80% of
336
+ their minimum requirement, scaled by 0.4.
337
+
338
+ savings_progress_score ∈ [0.0, 0.3]
339
+ (savings_balance / savings_goal) × 0.3, capped at 0.3.
340
+
341
+ efficiency_score ∈ {0.0, 0.2}
342
+ 0.2 if total_spent ≤ total_income, else 0.0.
343
+
344
+ shock_resilience_bonus ∈ {0.0, 0.1}
345
+ 0.1 if shocks are active AND total_spent covers all shock
346
+ costs, else 0.0.
347
+
348
+ penalties ∈ (-∞, 0.0]
349
+ −0.3 per essential category with zero allocation.
350
+ −0.1 if total_spent > total_income.
351
+
352
+ total = sum of all components, clipped to [−1.0, 1.0].
353
+
354
+ Args:
355
+ action: Agent's action for this period.
356
+ total_spent: Total funds deployed (allocations + savings).
357
+
358
+ Returns:
359
+ A fully populated Reward model.
360
+ """
361
+ minimums = self.MIN_REQUIREMENTS[self.mode]
362
+
363
+ # Essential categories are those with a non-zero minimum requirement
364
+ essential_cats = {
365
+ cat: frac
366
+ for cat, frac in minimums.items()
367
+ if frac > 0.0
368
+ }
369
+ total_essential = len(essential_cats)
370
+
371
+ # --- bills_paid_score --- (max 0.4) --------------------------------
372
+ categories_covered = 0
373
+ zero_alloc_essentials = 0 # count for penalty calculation
374
+
375
+ for cat, min_fraction in essential_cats.items():
376
+ min_required = min_fraction * self._total_income
377
+ allocated = action.allocations.get(cat, 0.0)
378
+
379
+ if allocated == 0.0:
380
+ # Completely skipped an essential category → penalty later
381
+ zero_alloc_essentials += 1
382
+ elif allocated >= 0.80 * min_required:
383
+ # Covered at least 80% of the minimum → category is satisfied
384
+ categories_covered += 1
385
+
386
+ bills_paid_score = (
387
+ (categories_covered / total_essential) * 0.4
388
+ if total_essential > 0
389
+ else 0.4
390
+ )
391
+
392
+ # --- savings_progress_score --- (max 0.3) --------------------------
393
+ if self._savings_goal > 0:
394
+ raw_savings_score = (self._savings_balance / self._savings_goal) * 0.3
395
+ savings_progress_score = min(raw_savings_score, 0.3)
396
+ else:
397
+ savings_progress_score = 0.0
398
+
399
+ # --- efficiency_score --- (0.2 if within budget, else 0.0) ---------
400
+ efficiency_score = 0.2 if total_spent <= self._total_income else 0.0
401
+
402
+ # --- shock_resilience_bonus --- (0.1 or 0.0) ----------------------
403
+ # Awarded when active shocks exist AND the agent's spending covers
404
+ # the combined shock cost (demonstrating financial resilience)
405
+ shock_resilience_bonus = 0.0
406
+ if self._active_shocks:
407
+ total_shock_cost = sum(self._shock_costs.values())
408
+ if total_spent >= total_shock_cost:
409
+ shock_resilience_bonus = 0.1
410
+
411
+ # --- penalties --- (negative values) ------------------------------
412
+ penalties = 0.0
413
+
414
+ # Hard penalty for each essential category left completely unfunded
415
+ penalties -= 0.3 * zero_alloc_essentials
416
+
417
+ # Penalty for exceeding total available income
418
+ if total_spent > self._total_income:
419
+ penalties -= 0.1
420
+
421
+ # --- total reward --- clipped to [-1.0, 1.0] ----------------------
422
+ total = (
423
+ bills_paid_score
424
+ + savings_progress_score
425
+ + efficiency_score
426
+ + shock_resilience_bonus
427
+ + penalties
428
+ )
429
+ total = max(-1.0, min(1.0, total))
430
+
431
+ return Reward(
432
+ total=total,
433
+ bills_paid_score=bills_paid_score,
434
+ savings_progress_score=savings_progress_score,
435
+ efficiency_score=efficiency_score,
436
+ shock_resilience_bonus=shock_resilience_bonus,
437
+ penalties=penalties,
438
+ )
graders.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ graders.py — Evaluation graders for the BEACON reinforcement learning environment.
3
+
4
+ Each grader runs one complete, fully deterministic episode and returns a
5
+ normalised float score in [0.0, 1.0].
6
+
7
+ Graders:
8
+ grade_task1() — Easy: Bill Coverage (household, 1 period)
9
+ grade_task2() — Medium: Shock Absorption (household, 3 periods)
10
+ grade_task3() — Hard: 6-Month Goal Planning (corporate, 6 periods)
11
+
12
+ run_all_graders() runs all three, prints results, and returns a summary dict.
13
+ """
14
+
15
+ from environment import BEACONEnvironment
16
+ from models import Action
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # GRADER 1 — Easy: Bill Coverage
21
+ # ---------------------------------------------------------------------------
22
+
23
def grade_task1() -> float:
    """
    Easy grader: Bill Coverage.

    Plays one household period (seed 42) with a policy that funds every
    category at exactly its minimum required fraction of income and routes
    whatever is left over into savings, then reports how well the essential
    bills were covered.

    Returns:
        reward.bills_paid_score rescaled from [0.0, 0.4] to [0.0, 1.0],
        rounded to 4 decimal places.
    """
    env = BEACONEnvironment(mode="household", total_periods=1, seed=42)
    first_obs = env.reset()

    income = first_obs.total_income
    min_fractions = BEACONEnvironment.MIN_REQUIREMENTS["household"]

    # Fund each category at exactly its minimum fraction of income.
    # Discretionary categories carry a zero fraction and thus get nothing.
    spend_plan = {name: share * income for name, share in min_fractions.items()}

    # Whatever is not needed for bills becomes the savings contribution.
    leftover = income - sum(spend_plan.values())
    plan = Action(
        allocations=spend_plan,
        savings_contribution=max(0.0, leftover),
    )

    # Single step — the episode is only one period long.
    _, reward, _, _ = env.step(plan)

    # bills_paid_score maxes out at 0.4 — normalise onto [0.0, 1.0].
    return round(reward.bills_paid_score / 0.4, 4)
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # GRADER 2 — Medium: Shock Absorption
73
+ # ---------------------------------------------------------------------------
74
+
75
def grade_task2() -> float:
    """
    Medium grader: Shock Absorption.

    Plays a 3-period household episode (seed 99). The policy always funds
    every category at its minimum fraction; in the first two periods the
    discretionary category soaks up the cost of any active shocks, and in
    the final period all remaining headroom is channelled into savings.

    Returns:
        The mean per-step reward.total mapped from [-1.0, 1.0] onto
        [0.0, 1.0], rounded to 4 decimal places.
    """
    env = BEACONEnvironment(mode="household", total_periods=3, seed=99)
    env.reset()

    # Guarantee at least one shock is active so resilience is exercised.
    if not env._active_shocks:
        env._active_shocks = ["medical_emergency"]
        env._shock_costs = {"medical_emergency": 0.15 * env._total_income}

    min_fractions = BEACONEnvironment.MIN_REQUIREMENTS["household"]
    step_totals: list[float] = []

    for period in (1, 2, 3):
        income = env._total_income
        active_cost = sum(env._shock_costs.values()) if env._active_shocks else 0.0

        # Headroom = income left after every essential minimum is funded.
        essentials = sum(
            share * income for share in min_fractions.values() if share > 0.0
        )
        headroom = income - essentials

        # Baseline: minimum allocation for every category.
        allocations = {name: share * income for name, share in min_fractions.items()}

        if period < 3:
            # Periods 1-2: discretionary spending absorbs the shock cost,
            # capped by the available headroom; the rest goes to savings.
            absorbed = min(active_cost, max(0.0, headroom))
            allocations["discretionary"] = absorbed
            to_savings = max(0.0, headroom - absorbed)
        else:
            # Period 3 (recovery): zero discretionary, maximise savings to
            # push savings_progress_score up.
            allocations["discretionary"] = 0.0
            to_savings = max(0.0, headroom)

        _, reward, _, _ = env.step(
            Action(allocations=allocations, savings_contribution=to_savings)
        )
        step_totals.append(reward.total)

    # Map the mean reward from [-1.0, 1.0] onto [0.0, 1.0].
    mean_total = sum(step_totals) / len(step_totals)
    return round((mean_total + 1.0) / 2.0, 4)
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # GRADER 3 — Hard: 6-Month Goal Planning
169
+ # ---------------------------------------------------------------------------
170
+
171
def grade_task3() -> float:
    """
    Hard grader: 6-Month Goal Planning.

    Plays a 6-period corporate episode (seed 7) with a fixed policy: fund
    every category at its minimum required fraction and contribute 15% of
    income to savings each period, keeping total spend within income.

    Returns:
        0.6 × (fraction of the savings goal reached, capped at 1.0)
        + 0.4 × (1.0 if no essential category was ever left at zero
                 allocation, else 0.5), rounded to 4 decimal places.
    """
    env = BEACONEnvironment(mode="corporate", total_periods=6, seed=7)
    env.reset()

    min_fractions = BEACONEnvironment.MIN_REQUIREMENTS["corporate"]

    # True if any essential category ever receives a zero allocation.
    missed_essential = False

    for _ in range(6):
        income = env._total_income

        # Exactly the minimum amount for every category.
        allocations: dict[str, float] = {
            name: share * income for name, share in min_fractions.items()
        }

        # Record whether any essential category would receive nothing.
        if any(
            share > 0.0 and allocations.get(name, 0.0) == 0.0
            for name, share in min_fractions.items()
        ):
            missed_essential = True

        _, _, finished, _ = env.step(
            Action(allocations=allocations, savings_contribution=0.15 * income)
        )
        if finished:
            break

    # Fraction of the savings goal achieved, capped at 1.0.
    goal = env._savings_goal
    goal_fraction = min(env._savings_balance / goal, 1.0) if goal > 0 else 0.0

    # Full credit only when every essential was funded in every step.
    coverage_factor = 0.5 if missed_essential else 1.0

    return round(goal_fraction * 0.6 + coverage_factor * 0.4, 4)
240
+
241
+
242
+ # ---------------------------------------------------------------------------
243
+ # Aggregate runner
244
+ # ---------------------------------------------------------------------------
245
+
246
def run_all_graders() -> dict[str, float]:
    """
    Execute every BEACON grader once, print the scores, and return them.

    All three graders use fixed seeds, so repeated calls produce identical
    results.

    Returns:
        Mapping of "task1", "task2", "task3" to their float scores.
    """
    t1, t2, t3 = grade_task1(), grade_task2(), grade_task3()

    print(f"Task 1: {t1:.2f}")
    print(f"Task 2: {t2:.2f}")
    print(f"Task 3: {t3:.2f}")

    return {"task1": t1, "task2": t2, "task3": t3}
269
+
270
+
271
+ # ---------------------------------------------------------------------------
272
+ # Entry point
273
+ # ---------------------------------------------------------------------------
274
+
275
if __name__ == "__main__":
    # Allow `python graders.py` to run the full evaluation suite directly.
    run_all_graders()
models.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ models.py — Pydantic v2 data models for the BEACON reinforcement learning environment.
3
+
4
+ BEACON (Budget Environment for Agent Control and Optimization of Needs) is a dual-scale
5
+ budget management environment supporting "household" and "corporate" simulation modes.
6
+ """
7
+
8
+ from pydantic import BaseModel
9
+
10
+
11
class Observation(BaseModel):
    """
    Per-step observation handed to the agent.

    A snapshot of the current budget period: income, per-category budgets
    and spending, savings progress, any active economic shocks, and how
    much of the episode remains.
    """

    # Simulation mode — either 'household' or 'corporate'.
    mode: str
    # Current time period, starting from 1.
    period: int
    # Total income available for the current period.
    total_income: float
    # Category name → amount allocated for that category.
    category_budgets: dict[str, float]
    # Category name → amount already spent this period.
    category_spent: dict[str, float]
    # Current accumulated savings balance.
    savings_balance: float
    # Target savings balance the agent should aim to reach.
    savings_goal: float
    # Names of unexpected financial events currently affecting the environment.
    active_shocks: list[str]
    # Number of time periods left before the episode ends.
    periods_remaining: int
46
+
47
+
48
class Action(BaseModel):
    """
    Action submitted by the agent for a single time period.

    Specifies the per-category spending plan and the amount of income to
    divert into savings.
    """

    # Category name → amount the agent allocates this period.
    allocations: dict[str, float]
    # Amount the agent chooses to add to savings this period.
    savings_contribution: float
61
+
62
+
63
class Reward(BaseModel):
    """
    Reward signal returned to the agent after each step.

    The total is a scalar in [-1.0, 1.0] built from sub-scores for bill
    coverage, savings trajectory, spending efficiency, and shock
    resilience, minus any penalties for constraint violations.
    """

    # Final scalar reward for the step, in the range [-1.0, 1.0].
    total: float
    # Whether all essential bills and obligations were covered.
    bills_paid_score: float
    # Progress toward the savings goal.
    savings_progress_score: float
    # How efficiently income was allocated with minimal waste.
    efficiency_score: float
    # Bonus awarded for successfully absorbing active economic shocks.
    shock_resilience_bonus: float
    # Cumulative penalty subtracted for constraint violations (e.g., overspending).
    penalties: float
openenv.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: BEACON
2
+ version: "1.0.0"
3
+ description: >
4
+ Dual-scale budget management environment where agents
5
+ learn to allocate income across household and corporate
6
+ financial categories under constraints and economic shocks.
7
+ author: your_name
8
+ tags: [finance, budgeting, planning, dual-scale]
9
+ modes: [household, corporate]
10
+ tasks:
11
+ - task1
12
+ - task2
13
+ - task3
14
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pydantic
4
+ openai
5
+ pyyaml
6
+ groq
server/__pycache__/app.cpython-311.pyc ADDED
Binary file (9.57 kB). View file
 
server/app.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — FastAPI server for the BEACON reinforcement learning environment.
3
+
4
+ Exposes the BEACON environment as a REST API so that agents, dashboards,
5
+ and evaluation pipelines can interact with it over HTTP.
6
+
7
+ Endpoints:
8
+ POST /reset — initialise / reset the environment
9
+ POST /step — submit an action and advance one period
10
+ GET /state — inspect the full current environment state
11
+ GET /tasks — list all available evaluation tasks
12
+ POST /grader — run a specific grader and get a score
13
+ GET /baseline — run all graders and return all scores
14
+ GET /health — liveness check
15
+
16
+ Usage:
17
+ python app.py
18
+ # or
19
+ uvicorn server.app:app --reload
20
+ """
21
+
22
+ import os
23
+ import sys
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Ensure parent directory (d:/meta) is on the Python path so that
27
+ # environment.py, models.py, and graders.py can be imported as top-level
28
+ # modules from this subdirectory.
29
+ # ---------------------------------------------------------------------------
30
+ _PARENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
31
+ if _PARENT_DIR not in sys.path:
32
+ sys.path.insert(0, _PARENT_DIR)
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # BEACON imports (resolved via sys.path above)
36
+ # ---------------------------------------------------------------------------
37
+ from environment import BEACONEnvironment # noqa: E402
38
+ from models import Action # noqa: E402
39
+ from graders import ( # noqa: E402
40
+ grade_task1,
41
+ grade_task2,
42
+ grade_task3,
43
+ run_all_graders,
44
+ )
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # FastAPI imports
48
+ # ---------------------------------------------------------------------------
49
+ import uvicorn
50
+ from fastapi import FastAPI, HTTPException
51
+ from fastapi.middleware.cors import CORSMiddleware
52
+ from pydantic import BaseModel, Field
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # App setup
57
+ # ---------------------------------------------------------------------------
58
+
59
+ app = FastAPI(
60
+ title="BEACON Environment API",
61
+ description=(
62
+ "REST API for the BEACON dual-scale budget management "
63
+ "reinforcement learning environment."
64
+ ),
65
+ version="1.0.0",
66
+ )
67
+
68
+ # Allow all origins so browser-based agents and dashboards can connect freely
69
+ app.add_middleware(
70
+ CORSMiddleware,
71
+ allow_origins=["*"],
72
+ allow_credentials=True,
73
+ allow_methods=["*"],
74
+ allow_headers=["*"],
75
+ )
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Global environment instance
80
+ # Starts as None; created / replaced on the first POST /reset call.
81
+ # A default instance is also created at startup so GET endpoints work
82
+ # immediately without requiring a prior reset.
83
+ # ---------------------------------------------------------------------------
84
+
85
+ _env: BEACONEnvironment = BEACONEnvironment(mode="household", seed=42)
86
+
87
+
88
def _require_env() -> BEACONEnvironment:
    """Return the shared environment; raise 503 when it has not been set up."""
    env = _env
    if env is None:
        raise HTTPException(
            status_code=503,
            detail="Environment not initialised. Call POST /reset first.",
        )
    return env
96
+
97
+
98
+ # ---------------------------------------------------------------------------
99
+ # Request / response schemas
100
+ # ---------------------------------------------------------------------------
101
+
102
class ResetRequest(BaseModel):
    """Request body for POST /reset."""
    # NOTE: the Field descriptions below surface in the OpenAPI schema.
    mode: str = Field(default="household", description="'household' or 'corporate'")
    seed: int = Field(default=42, description="Random seed for reproducibility")
    total_periods: int = Field(default=6, description="Number of budget periods per episode")
107
+
108
+
109
class GraderRequest(BaseModel):
    """Request body for POST /grader."""
    # Selects which grader to execute; validated against _GRADER_MAP in /grader.
    task_id: str = Field(description="One of: 'task1', 'task2', 'task3'")
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # Task catalogue (static metadata)
116
+ # ---------------------------------------------------------------------------
117
+
118
+ ACTION_SCHEMA = {
119
+ "allocations": "dict[str, float]",
120
+ "savings_contribution": "float",
121
+ }
122
+
123
+ TASK_CATALOGUE = [
124
+ {
125
+ "task_id": "task1",
126
+ "name": "Bill Coverage",
127
+ "difficulty": "easy",
128
+ "description": "Allocate income to cover all essential bills in a single period.",
129
+ "mode": "household",
130
+ "periods": 1,
131
+ "seed": 42,
132
+ "action_schema": ACTION_SCHEMA,
133
+ },
134
+ {
135
+ "task_id": "task2",
136
+ "name": "Shock Absorption",
137
+ "difficulty": "medium",
138
+ "description": (
139
+ "Maintain essential spending while absorbing unexpected "
140
+ "financial shocks across 3 periods."
141
+ ),
142
+ "mode": "household",
143
+ "periods": 3,
144
+ "seed": 99,
145
+ "action_schema": ACTION_SCHEMA,
146
+ },
147
+ {
148
+ "task_id": "task3",
149
+ "name": "6-Month Goal Planning",
150
+ "difficulty": "hard",
151
+ "description": (
152
+ "Manage a corporate budget over 6 periods, covering all "
153
+ "essential categories while reaching the savings goal."
154
+ ),
155
+ "mode": "corporate",
156
+ "periods": 6,
157
+ "seed": 7,
158
+ "action_schema": ACTION_SCHEMA,
159
+ },
160
+ ]
161
+
162
+ # Map task_id → grader function for quick lookup
163
+ _GRADER_MAP = {
164
+ "task1": grade_task1,
165
+ "task2": grade_task2,
166
+ "task3": grade_task3,
167
+ }
168
+
169
+
170
+ # ---------------------------------------------------------------------------
171
+ # Endpoints
172
+ # ---------------------------------------------------------------------------
173
+
174
@app.get("/health", summary="Liveness check")
def health():
    """
    Returns a simple status object confirming the service is running.
    """
    # Static payload — no environment access, so this never fails.
    payload = {"status": "ok", "environment": "BEACON"}
    return payload
180
+
181
+
182
@app.post("/reset", summary="Initialise or reset the environment")
def reset(body: ResetRequest = ResetRequest()):
    """
    Create a fresh BEACONEnvironment with the given parameters and call
    reset(). Returns the initial Observation as JSON.

    - **mode**: `"household"` or `"corporate"` (default: `"household"`)
    - **seed**: random seed for reproducibility (default: `42`)
    - **total_periods**: episode length (default: `6`)

    Raises a 400 error when the environment rejects the parameters
    (e.g. an unknown mode).
    """
    # NOTE(review): the default `ResetRequest()` instance is created once at
    # import time and shared across requests. Safe here because it is only
    # read, never mutated — but per-request construction would be sturdier.
    global _env
    try:
        _env = BEACONEnvironment(
            mode=body.mode,
            total_periods=body.total_periods,
            seed=body.seed,
        )
        obs = _env.reset()
    except ValueError as exc:
        # Chain the original ValueError so server logs keep the root cause
        # (the bare `raise` here previously discarded the exception chain).
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return obs.model_dump()
204
+
205
+
206
@app.post("/step", summary="Submit an action and advance one period")
def step(action: Action):
    """
    Apply the agent's Action to the current environment and advance by one
    budget period.

    Returns the resulting Observation, Reward, done flag, and info dict.

    - **allocations**: `{category: amount, ...}` — must cover all categories
    - **savings_contribution**: amount added to savings this period
    """
    env = _require_env()
    observation, reward, done, info = env.step(action)

    # Serialise the pydantic models; `done` and `info` pass through as-is.
    response = {
        "observation": observation.model_dump(),
        "reward": reward.model_dump(),
        "done": done,
        "info": info,
    }
    return response
226
+
227
+
228
@app.get("/state", summary="Inspect the current environment state")
def state():
    """
    Return the full internal state of the current environment as a plain
    dictionary. Does not advance the episode.
    """
    # Read-only inspection — delegates straight to the environment.
    return _require_env().state()
236
+
237
+
238
@app.get("/tasks", summary="List all available evaluation tasks")
def tasks():
    """
    Return metadata for all three BEACON evaluation tasks, including their
    difficulty, mode, episode length, and expected action schema.
    """
    # Static module-level catalogue; returned directly (clients treat it
    # as read-only).
    return TASK_CATALOGUE
245
+
246
+
247
@app.post("/grader", summary="Run a specific grader and return its score")
def grader(body: GraderRequest):
    """
    Execute the grader for the requested task and return the normalised
    score in [0.0, 1.0].

    - **task_id**: one of `"task1"`, `"task2"`, `"task3"`
    """
    # Look up the grader callable; unknown ids become a 404.
    selected = _GRADER_MAP.get(body.task_id)
    if selected is None:
        raise HTTPException(
            status_code=404,
            detail=f"Unknown task_id '{body.task_id}'. "
            f"Valid options: {list(_GRADER_MAP.keys())}",
        )

    return {"task_id": body.task_id, "score": selected()}
265
+
266
+
267
@app.get("/baseline", summary="Run all graders and return all scores")
def baseline():
    """
    Execute all three BEACON graders sequentially and return their scores.

    This endpoint is deterministic — scores are identical on every call.
    """
    # run_all_graders() already returns the {"task1": ..., ...} mapping.
    return run_all_graders()
276
+
277
+
278
+ # ---------------------------------------------------------------------------
279
+ # Entry point
280
+ # ---------------------------------------------------------------------------
281
+
282
if __name__ == "__main__":
    # Allow `python server/app.py` to serve directly on the same port the
    # Dockerfile exposes (7860).
    uvicorn.run(app, host="0.0.0.0", port=7860)