Spaces:

ARKAISW
/

QuantHive

Running

File size: 40,461 Bytes

aec0295

"""
Multi-Agent Trading Environment using PettingZoo AEC API.

Three independent RL agents operate in a decentralized governance framework:
  - risk_manager_0:    Rewarded for restricting dangerous trades. Penalized when Trader loses.
  - portfolio_manager_0: Oversees capital allocation. Rewarded for portfolio growth + drawdown control.
  - trader_0:          Rewarded purely for PnL. Sees Risk/PM constraints as observations.

The AEC (Agent-Environment Cycle) loop alternates agent turns each step.
Agent Negotiation: Each agent's *output message* (constraints, allocations) becomes
part of the next agent's observation, creating an emergent negotiation dynamic.
"""

from __future__ import annotations

import functools
from typing import Dict, List, Optional, Tuple, Any

import numpy as np
import pandas as pd
from gymnasium import spaces

from pettingzoo import AECEnv
try:
    # PettingZoo 1.25.0+ exposes the selector class as AgentSelector.
    from pettingzoo.utils import AgentSelector
except ImportError:
    # Older releases expose agent_selector directly, while some transitional
    # layouts expose a module with AgentSelector inside it.
    from pettingzoo.utils import agent_selector as _agent_selector

    AgentSelector = getattr(_agent_selector, "AgentSelector", _agent_selector)

from env.state import MarketState, PortfolioState, RiskState, get_observation
from env.reward import compute_raw_reward, normalize_reward, compute_grade
from utils.indicators import compute_indicators


# â”€â”€â”€ Agent IDs â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
RISK_MANAGER    = "risk_manager_0"
PORTFOLIO_MGR   = "portfolio_manager_0"
TRADER          = "trader_0"
ALL_AGENTS      = [RISK_MANAGER, PORTFOLIO_MGR, TRADER]

# â”€â”€â”€ Observation Sizes â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
# Base market+portfolio+risk obs size: 14 + 5 + 5 = 24
BASE_OBS_SIZE = 24
# Risk Manager message appended to PM and Trader observations: [size_limit, allow_new, force_reduce]
RM_MSG_SIZE = 3
# PM message appended to Trader observations: [cap_allocation, is_override_signaled]
PM_MSG_SIZE = 2


class MultiAgentTradingEnv(AECEnv):
    """
    A PettingZoo AEC environment for decentralized multi-agent trading governance.

    Turn order per step: risk_manager_0 â†’ portfolio_manager_0 â†’ trader_0
    On each full cycle, the market advances by one candle.

    Observations:
      risk_manager_0:   base_obs (24,)
      portfolio_mgr_0:  base_obs + rm_message (24 + 3 = 27,)
      trader_0:         base_obs + rm_message + pm_message (24 + 3 + 2 = 29,)

    Actions:
      risk_manager_0:   Box(3,) â€” [size_limit, allow_new_positions, force_reduce] â€” continuous
      portfolio_mgr_0:  Box(2,) â€” [capital_allocation_fraction, override_flag] â€” continuous
      trader_0:         Dict â€” direction (Discrete 3), size (Box 1), sl (Box 1), tp (Box 1)
    """

    metadata = {
        "render_modes": ["human", "ansi"],
        "name": "multi_agent_trading_v1",
        "is_parallelizable": False,
    }

    def __init__(
        self,
        df: Optional[pd.DataFrame] = None,
        initial_cash: float = 100_000.0,
        ticker: str = "default",
        commission: float = 0.001,
        max_steps: Optional[int] = None,
        difficulty: str = "hard",
    ):
        super().__init__()

        self.difficulty = difficulty
        if df is None:
            df = self._make_dummy_data(difficulty=difficulty)
        self.raw_df = df.copy()
        self.df = compute_indicators(df)
        self.ticker = ticker
        self.initial_cash = initial_cash
        self.commission = commission
        self.max_steps = max_steps or (len(self.df) - 1)

        # â”€â”€ PettingZoo required attributes â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        self.agents = ALL_AGENTS[:]
        self.possible_agents = ALL_AGENTS[:]

        # â”€â”€ Observation spaces â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        self.observation_spaces = {
            RISK_MANAGER:   spaces.Box(low=-np.inf, high=np.inf,
                                       shape=(BASE_OBS_SIZE,), dtype=np.float32),
            PORTFOLIO_MGR:  spaces.Box(low=-np.inf, high=np.inf,
                                       shape=(BASE_OBS_SIZE + RM_MSG_SIZE,), dtype=np.float32),
            TRADER:         spaces.Box(low=-np.inf, high=np.inf,
                                       shape=(BASE_OBS_SIZE + RM_MSG_SIZE + PM_MSG_SIZE,), dtype=np.float32),
        }

        # â”€â”€ Action spaces â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        self.action_spaces = {
            RISK_MANAGER:  spaces.Box(low=np.array([0.01, 0.0, 0.0], dtype=np.float32),
                                      high=np.array([1.0, 1.0, 1.0], dtype=np.float32),
                                      shape=(3,), dtype=np.float32),
            PORTFOLIO_MGR: spaces.Box(low=np.array([0.0, 0.0], dtype=np.float32),
                                      high=np.array([1.0, 1.0], dtype=np.float32),
                                      shape=(2,), dtype=np.float32),
            TRADER:        spaces.Dict({
                "direction": spaces.Discrete(3),          # 0=Hold, 1=Buy, 2=Sell/Short
                "size":      spaces.Box(0.0, 1.0, shape=(1,), dtype=np.float32),
                "sl":        spaces.Box(0.0, np.inf, shape=(1,), dtype=np.float32),
                "tp":        spaces.Box(0.0, np.inf, shape=(1,), dtype=np.float32),
            }),
        }

        # â”€â”€ Internal state (reset before first use) â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        self._agent_selector = AgentSelector(ALL_AGENTS)
        self._reset_internal_state()

    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
    # PettingZoo required API
    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
        if seed is not None:
            np.random.seed(seed)

        self.agents = ALL_AGENTS[:]
        self._agent_selector.reinit(ALL_AGENTS)

        self._reset_internal_state()
        self._generate_observations()

        self.agent_selection = self._agent_selector.reset()

        # Zero-fill all rewards/terminations/truncations/infos for PZ compliance
        self.rewards         = {ag: 0.0 for ag in self.agents}
        self._cumulative_rewards = {ag: 0.0 for ag in self.agents}
        self.terminations    = {ag: False for ag in self.agents}
        self.truncations     = {ag: False for ag in self.agents}
        self.infos           = {ag: {} for ag in self.agents}

    def step(self, action):
        """Process one agent's action in the AEC turn order."""
        agent = self.agent_selection

        if self.terminations[agent] or self.truncations[agent]:
            # Dead-step: PZ compliance requires we handle this
            self._was_dead_step(action)
            return
        # The current agent's cumulative reward was already returned by last().
        # Reset its accumulation window before processing a fresh action.
        self._cumulative_rewards[agent] = 0.0
        self._clear_rewards()

        # â”€â”€ Route action to the correct handler â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        if agent == RISK_MANAGER:
            self._step_risk_manager(action)
        elif agent == PORTFOLIO_MGR:
            self._step_portfolio_manager(action)
        elif agent == TRADER:
            self._step_trader(action)
            # After the trader acts, the market cycle is complete â†’ advance step
            self._advance_market()

        # Advance to next agent
        self._accumulate_rewards()
        self.agent_selection = self._agent_selector.next()

    def observe(self, agent: str) -> np.ndarray:
        return self._observations[agent]

    def observation_space(self, agent: str) -> spaces.Space:
        return self.observation_spaces[agent]

    def action_space(self, agent: str) -> spaces.Space:
        return self.action_spaces[agent]

    def render(self):
        price = self._market.current_price()
        val   = self._portfolio.total_value(price, self.ticker)
        print(
            f"Step {self._current_step:4d} | "
            f"Price: {price:10,.2f} | "
            f"Value: {val:12,.2f} | "
            f"Agent: {self.agent_selection}"
        )

    def close(self):
        pass

    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
    # Per-Agent Step Handlers
    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

    def _step_risk_manager(self, action: np.ndarray):
        """
        Risk Manager decides governance constraints.
        action = [size_limit (0-1), allow_new_positions (0-1), force_reduce (0-1)]

        Reward logic (adversarial):
          +0.2  for restricting a dangerous action (high drawdown â†’ low size_limit)
          -0.3  for each $ portfolio value LOST since it last acted (it shares downside pain)
          +0.05 for being compliant (not overriding a healthy portfolio)
        """
        size_limit, allow_new_raw, force_reduce_raw = float(action[0]), float(action[1]), float(action[2])
        allow_new  = allow_new_raw  > 0.5
        force_reduce = force_reduce_raw > 0.5

        # Store message to pass to PM and Trader
        self._rm_message = np.array(
            [size_limit, float(allow_new), float(force_reduce)], dtype=np.float32
        )

        # Compute RM's step reward
        drawdown = self._risk.current_drawdown
        rm_reward = 0.0

        # Rewarded for restricting size when portfolio is underwater
        if drawdown > 0.10 and size_limit < 0.30:
            rm_reward += 0.20   # RM correctly capped risk during drawdown

        if force_reduce and drawdown > 0.20:
            rm_reward += 0.15   # Correct force-reduce under severe drawdown

        # Penalize for allowing reckless sizing when at risk
        if drawdown > 0.15 and size_limit > 0.70:
            rm_reward -= 0.20   # RM being reckless during drawdown

        # Shared downside: RM suffers when portfolio loses money this step
        prev_val = self._prev_portfolio_value
        curr_price = self._market.current_price()
        curr_val   = self._portfolio.total_value(curr_price, self.ticker)
        portfolio_delta_pct = (curr_val - prev_val) / (self.initial_cash + 1e-10)
        rm_reward += min(portfolio_delta_pct * 0.5, 0.0)  # Only downside pain

        # Defer emission until the Trader finishes the cycle so PettingZoo sees
        # one reward publication per cycle.
        self._rm_cycle_reward = float(rm_reward)

    def _step_portfolio_manager(self, action: np.ndarray):
        """
        Portfolio Manager decides capital allocation and optionally signals override.
        action = [capital_allocation (0-1), override_strength (0-1)]

        Reward logic:
          Aligned with overall portfolio performance (grade-based).
          Penalized for excessive overrides that don't improve outcomes.
        """
        cap_alloc  = float(np.clip(action[0], 0.0, 1.0))
        override_s = float(action[1])

        self._pm_message = np.array([cap_alloc, override_s], dtype=np.float32)
        self._pm_capital_allocation = cap_alloc
        self._pm_override_strength  = override_s

        # PM reward deferred to after trader executes (knows the outcome)
        # PM reward is deferred until after the trader executes and the outcome is known.

    def _step_trader(self, action: Dict):
        """
        Trader proposes a trade using the constrained action space.
        Receives both RM and PM guidance in its observation.

        Reward logic (adversarial):
          Rewarded purely on PnL.
          Penalized when governance overrides (RM size cap, PM force-close) are triggered.
          Bonus for proposing compliant actions that need no governance intervention.
        """
        direction = int(action["direction"])
        size_raw  = float(action["size"][0]) if hasattr(action["size"], "__len__") else float(action["size"])
        sl_input  = float(action["sl"][0])   if hasattr(action["sl"],   "__len__") else float(action.get("sl", 0.0))
        tp_input  = float(action["tp"][0])   if hasattr(action["tp"],   "__len__") else float(action.get("tp", 0.0))

        size = float(np.clip(size_raw, 0.0, 1.0))

        # â”€â”€ Apply Risk Manager constraints â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        rm_size_limit  = float(self._rm_message[0])
        rm_allow_new   = bool(self._rm_message[1] > 0.5)
        rm_force_reduce = bool(self._rm_message[2] > 0.5)

        interventions: List[Dict] = []

        if direction != 0 and size > rm_size_limit:
            interventions.append({
                "agent": "RiskManager",
                "type":  "size_clamp",
                "original_size":  size,
                "enforced_size":  rm_size_limit,
            })
            size = rm_size_limit

        if direction in (1, 2) and not rm_allow_new:
            interventions.append({
                "agent": "RiskManager",
                "type":  "no_new_positions",
                "reason": "RM blocked new positions during drawdown",
            })
            direction = 0  # Force hold

        if rm_force_reduce and direction == 1:
            interventions.append({
                "agent": "RiskManager",
                "type":  "force_reduce",
                "reason": "RM signaling to reduce longs",
            })
            direction = 2  # Flip to reduce

        # â”€â”€ Apply Portfolio Manager override â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        cap_alloc  = self._pm_capital_allocation
        if direction != 0 and size > cap_alloc:
            interventions.append({
                "agent": "PortfolioManager",
                "type":  "capital_cap",
                "original_size": size,
                "enforced_size": cap_alloc,
            })
            size = min(size, cap_alloc)

        # PM strong override_strength >0.7 means PM wants to force hold
        if self._pm_override_strength > 0.7 and direction != 0:
            interventions.append({
                "agent": "PortfolioManager",
                "type":  "pm_veto",
                "reason": "PM vetoed trade (insufficient conviction signal)",
            })
            direction = 0

        # â”€â”€ Auto SL/TP (governance baseline) â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        current_price = self._market.current_price()
        DEFAULT_SL = 0.02
        if direction != 0 and sl_input <= 0:
            if direction == 1:
                sl_input = current_price * (1 - DEFAULT_SL)
            else:
                sl_input = current_price * (1 + DEFAULT_SL)
            interventions.append({"agent": "RiskManager", "type": "auto_sl"})
        if direction != 0 and tp_input <= 0 and sl_input > 0:
            sl_dist = abs(current_price - sl_input)
            tp_input = (current_price + sl_dist * 2.0) if direction == 1 else (current_price - sl_dist * 2.0)
            interventions.append({"agent": "RiskManager", "type": "auto_tp"})

        # Store pending trade for market advance
        self._pending_trade = {
            "direction": direction,
            "size": size,
            "sl": sl_input,
            "tp": tp_input,
            "interventions": interventions,
            "original_direction": int(action["direction"]),
            "original_size": size_raw,
        }

        # Compliance reward/penalty â€” will be finalized after market moves
        n_interventions = len(interventions)
        compliance_bonus = 0.15 if (n_interventions == 0 and direction != 0) else (-0.05 * n_interventions)
        self._trader_compliance_bonus = compliance_bonus

    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
    # Market Advance (called after Trader acts)
    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

    def _advance_market(self):
        """Execute the pending trade, advance market, compute final rewards."""
        if not hasattr(self, "_pending_trade") or self._pending_trade is None:
            # No trade was staged (edge case)
            self._pending_trade = {"direction": 0, "size": 0.0, "sl": 0.0, "tp": 0.0,
                                   "interventions": [], "original_direction": 0, "original_size": 0.0}

        trade = self._pending_trade
        direction = trade["direction"]
        size      = trade["size"]
        sl_input  = trade["sl"]
        tp_input  = trade["tp"]

        current_price = self._market.current_price()
        prev_value    = self._portfolio.total_value(current_price, self.ticker)

        # Check SL/TP before executing new action
        self._check_sl_tp(current_price)

        # Execute trade in portfolio state
        traded = self._execute_trade(direction, size, sl_input, tp_input, current_price)

        # Advance market step
        self._current_step += 1
        self._market.current_step = self._current_step

        # Update risk state
        new_price = self._market.current_price() if self._current_step < len(self.df) else current_price
        new_value = self._portfolio.total_value(new_price, self.ticker)
        self._risk.update(new_value)
        self._episode_values.append(new_value)

        # Compute portfolio delta
        profit = (new_value - prev_value) / (self.initial_cash + 1e-10)
        price_trend = (new_price - current_price) / (current_price + 1e-10)

        raw_r = compute_raw_reward(
            profit=profit,
            drawdown=self._risk.current_drawdown,
            volatility=self._risk.return_volatility(),
            sharpe=self._risk.sharpe_ratio(),
            trade_count=int(traded),
            direction=direction,
            price_trend=price_trend,
        )

        # â”€â”€ Trader reward â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        trader_reward = normalize_reward(raw_r + self._trader_compliance_bonus)
        self.rewards[TRADER] = float(trader_reward)
        self._episode_rewards.append(trader_reward)

        # â”€â”€ PM reward: grade-based portfolio performance â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        normalized_profit  = float(np.clip((profit + 1.0) / 2.0, 0.0, 1.0))
        normalized_sharpe  = float(np.clip((self._risk.sharpe_ratio() + 2.0) / 4.0, 0.0, 1.0))
        consistency = float(np.mean(np.diff(np.array(self._episode_values)) > 0)) if len(self._episode_values) > 2 else 0.5
        grade = float(compute_grade({
            "profit": normalized_profit,
            "sharpe": normalized_sharpe,
            "drawdown": float(self._risk.max_drawdown),
            "consistency": consistency,
        }))
        pm_reward = (grade - 0.5) * 0.4   # Grade in [0,1] â†’ centered reward
        if self._risk.max_drawdown > 0.20:
            pm_reward -= 0.15              # PM penalized for deep drawdown
        self.rewards[PORTFOLIO_MGR] = float(pm_reward)

        # â”€â”€ RM: shared downside with final portfolio value â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        # We ADD to whatever penalty was already set in _step_risk_manager
        rm_pain = min(profit * 0.5, 0.0)   # Only share downside
        self.rewards[RISK_MANAGER] = float(self._rm_cycle_reward + rm_pain)

        # â”€â”€ Termination Check â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        terminated = (
            self._current_step >= self.max_steps or
            new_value < self.initial_cash * 0.10   # Blowup condition
        )
        if terminated:
            for ag in self.agents:
                self.terminations[ag] = True

        # Rebuild observations for the next cycle
        self._generate_observations()

        # Update governance log
        gov_record = {
            "step": self._current_step,
            "proposed": {"direction": trade["original_direction"], "size": trade["original_size"]},
            "executed": {"direction": direction, "size": size, "sl": sl_input, "tp": tp_input},
            "interventions": trade["interventions"],
            "was_compliant": len(trade["interventions"]) == 0,
            "rm_message": self._rm_message.tolist(),
            "pm_message": self._pm_message.tolist(),
        }
        self._governance_log.append(gov_record)

        # Expose info for the Trader (most info-rich agent)
        self.infos[TRADER] = {
            "step": self._current_step,
            "portfolio_value": float(new_value),
            "cash": float(self._portfolio.cash),
            "pnl": float(new_value - self.initial_cash),
            "pnl_pct": float(profit),
            "max_drawdown": float(self._risk.max_drawdown),
            "sharpe_ratio": float(self._risk.sharpe_ratio()),
            "grade": grade,
            "governance": gov_record,
            "rewards": dict(self.rewards),
        }
        self.infos[RISK_MANAGER]  = {"step": self._current_step, "drawdown": float(self._risk.max_drawdown)}
        self.infos[PORTFOLIO_MGR] = {"step": self._current_step, "grade": grade}

        self._prev_portfolio_value = new_value
        self._pending_trade = None
        self._rm_cycle_reward = 0.0

    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
    # Observation Generation
    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

    def _generate_observations(self):
        base_obs = get_observation(self._market, self._portfolio, self._risk, self.ticker)
        self._observations = {
            RISK_MANAGER:  base_obs.copy(),
            PORTFOLIO_MGR: np.concatenate([base_obs, self._rm_message]),
            TRADER:        np.concatenate([base_obs, self._rm_message, self._pm_message]),
        }

    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
    # Internal Helpers
    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

    def _reset_internal_state(self):
        self._market    = MarketState(prices=self.df, current_step=0)
        self._portfolio = PortfolioState(initial_cash=self.initial_cash, cash=self.initial_cash)
        self._risk      = RiskState(peak_value=self.initial_cash)
        self._current_step = 0

        # Inter-agent messages (start neutral)
        self._rm_message = np.array([0.5, 1.0, 0.0], dtype=np.float32)  # [size_limit=50%, allow=yes, force_reduce=no]
        self._pm_message = np.array([0.5, 0.0], dtype=np.float32)        # [cap_alloc=50%, override_strength=0]
        self._pm_capital_allocation = 0.5
        self._pm_override_strength  = 0.0

        self._pending_trade  = None
        self._rm_cycle_reward = 0.0
        self._trader_compliance_bonus = 0.0

        self._episode_values  = [self.initial_cash]
        self._episode_rewards = []
        self._governance_log: List[Dict] = []
        self._prev_portfolio_value = self.initial_cash

        # PZ state dictionaries
        self._observations = {ag: np.zeros(self.observation_spaces[ag].shape, dtype=np.float32)
                              for ag in ALL_AGENTS}

    def _accumulate_rewards(self):
        """Add the current step rewards into PettingZoo cumulative tracking."""
        for ag in self.agents:
            self._cumulative_rewards[ag] += self.rewards[ag]

    def _execute_trade(
        self, direction: int, size: float, sl: float, tp: float, current_price: float
    ) -> bool:
        """Execute trade on portfolio state. Returns True if a trade was made."""
        traded = False

        if direction == 1:  # BUY / Cover Short
            pos = self._portfolio.positions.get(self.ticker, 0.0)
            if pos < 0:
                # Cover short
                abs_qty = abs(pos)
                cover_cost = abs_qty * current_price * (1 + self.commission)
                margin_return = abs_qty * self._portfolio.avg_costs.get(self.ticker, current_price)
                self._portfolio.cash += margin_return - cover_cost
                self._portfolio.positions[self.ticker] = 0.0
                self._portfolio.avg_costs[self.ticker] = 0.0
                self._portfolio.stop_losses[self.ticker] = None
                self._portfolio.take_profits[self.ticker] = None
                traded = True
            else:
                trade_qty = (self._portfolio.cash * size) / (current_price * (1 + self.commission) + 1e-10)
                if trade_qty > 1e-8:
                    cost = trade_qty * current_price * (1 + self.commission)
                    self._portfolio.cash -= cost
                    prev_qty = pos
                    prev_avg  = self._portfolio.avg_costs.get(self.ticker, 0.0)
                    new_qty  = prev_qty + trade_qty
                    new_avg  = ((prev_qty * prev_avg) + (trade_qty * current_price)) / (new_qty + 1e-10)
                    self._portfolio.positions[self.ticker]   = new_qty
                    self._portfolio.avg_costs[self.ticker]   = new_avg
                    if sl > 0: self._portfolio.stop_losses[self.ticker]  = sl
                    if tp > 0: self._portfolio.take_profits[self.ticker] = tp
                    traded = True

        elif direction == 2:  # SELL / Short
            pos = self._portfolio.positions.get(self.ticker, 0.0)
            if pos > 0:
                sell_qty = min(pos, pos * size)
                if sell_qty > 1e-8:
                    revenue = sell_qty * current_price * (1 - self.commission)
                    self._portfolio.cash += revenue
                    remaining = pos - sell_qty
                    self._portfolio.positions[self.ticker] = max(remaining, 0.0)
                    if remaining <= 1e-8:
                        self._portfolio.avg_costs[self.ticker] = 0.0
                        self._portfolio.stop_losses[self.ticker] = None
                        self._portfolio.take_profits[self.ticker] = None
                    traded = True
            else:
                margin = self._portfolio.cash * size
                short_qty = margin / (current_price * (1 + self.commission) + 1e-10)
                if short_qty > 1e-8:
                    self._portfolio.cash -= short_qty * current_price
                    prev_qty  = abs(pos)
                    prev_avg  = self._portfolio.avg_costs.get(self.ticker, 0.0)
                    new_qty   = prev_qty + short_qty
                    new_avg   = ((prev_qty * prev_avg) + (short_qty * current_price)) / (new_qty + 1e-10)
                    self._portfolio.positions[self.ticker]   = -new_qty
                    self._portfolio.avg_costs[self.ticker]   = new_avg
                    if sl > 0: self._portfolio.stop_losses[self.ticker]  = sl
                    if tp > 0: self._portfolio.take_profits[self.ticker] = tp
                    traded = True

        if traded:
            self._risk.trade_count += 1
        return traded

    def _check_sl_tp(self, current_price: float):
        """Check and execute SL/TP orders."""
        ticker  = self.ticker
        pos_qty = self._portfolio.positions.get(ticker, 0.0)
        sl      = self._portfolio.stop_losses.get(ticker)
        tp      = self._portfolio.take_profits.get(ticker)
        if abs(pos_qty) < 1e-8:
            return

        hit = False
        if pos_qty > 0:
            if sl and current_price <= sl: hit = True
            if tp and current_price >= tp: hit = True
            if hit:
                revenue = pos_qty * current_price * (1 - self.commission)
                self._portfolio.cash += revenue
                self._portfolio.positions[ticker] = 0.0
                self._portfolio.avg_costs[ticker] = 0.0
                self._portfolio.stop_losses[ticker] = None
                self._portfolio.take_profits[ticker] = None
                self._risk.trade_count += 1
        elif pos_qty < 0:
            abs_qty = abs(pos_qty)
            if sl and current_price >= sl: hit = True
            if tp and current_price <= tp: hit = True
            if hit:
                avg_cost   = self._portfolio.avg_costs.get(ticker, current_price)
                cover_cost = abs_qty * current_price * (1 + self.commission)
                margin_ret = abs_qty * avg_cost
                self._portfolio.cash += margin_ret - cover_cost
                self._portfolio.positions[ticker] = 0.0
                self._portfolio.avg_costs[ticker] = 0.0
                self._portfolio.stop_losses[ticker] = None
                self._portfolio.take_profits[ticker] = None
                self._risk.trade_count += 1

    def _make_dummy_data(self, n: int = 500, difficulty: str = "hard") -> pd.DataFrame:
        """Delegate to TradingEnv's proven synthetic data generator."""
        from env.trading_env import TradingEnv
        tmp = TradingEnv.__new__(TradingEnv)
        return tmp._generate_market_data(n=n, difficulty=difficulty)

    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
    # Convenience
    # â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

    @functools.lru_cache(maxsize=None)
    def _obs_space(self, agent: str) -> spaces.Space:
        return self.observation_spaces[agent]

    @functools.lru_cache(maxsize=None)
    def _act_space(self, agent: str) -> spaces.Space:
        return self.action_spaces[agent]

    def state(self) -> Dict:
        """Return the full shared environment state (for visualization)."""
        price = self._market.current_price()
        return {
            "step":            self._current_step,
            "price":           float(price),
            "portfolio_value": float(self._portfolio.total_value(price, self.ticker)),
            "cash":            float(self._portfolio.cash),
            "positions":       {k: float(v) for k, v in self._portfolio.positions.items()},
            "max_drawdown":    float(self._risk.max_drawdown),
            "sharpe_ratio":    float(self._risk.sharpe_ratio()),
            "trade_count":     self._risk.trade_count,
            "rm_message":      self._rm_message.tolist(),
            "pm_message":      self._pm_message.tolist(),
            "governance_log":  self._governance_log[-10:],
        }