Spaces:

ARKAISW
/

QuantHive

Running

App Files Files Community

QuantHive / _tmp_old_env_test /env /multi_agent_env.py

ARKAISW

Update latest changes

aec0295 about 2 months ago

raw

history blame contribute delete

39.8 kB

	"""
	Multi-Agent Trading Environment using PettingZoo AEC API.

	Three independent RL agents operate in a decentralized governance framework:
	- risk_manager_0: Rewarded for restricting dangerous trades. Penalized when Trader loses.
	- portfolio_manager_0: Oversees capital allocation. Rewarded for portfolio growth + drawdown control.
	- trader_0: Rewarded purely for PnL. Sees Risk/PM constraints as observations.

	The AEC (Agent-Environment Cycle) loop alternates agent turns each step.
	Agent Negotiation: Each agent's output message (constraints, allocations) becomes
	part of the next agent's observation, creating an emergent negotiation dynamic.
	"""

	from __future__ import annotations

	import functools
	from typing import Dict, List, Optional, Tuple, Any

	import numpy as np
	import pandas as pd
	from gymnasium import spaces

	from pettingzoo import AECEnv
	from pettingzoo.utils import agent_selector

	from env.state import MarketState, PortfolioState, RiskState, get_observation
	from env.reward import compute_raw_reward, normalize_reward, compute_grade
	from utils.indicators import compute_indicators


	# ─── Agent IDs ──────────────────────────────────────────────────────────────
	RISK_MANAGER = "risk_manager_0"
	PORTFOLIO_MGR = "portfolio_manager_0"
	TRADER = "trader_0"
	# Listed in the order the agents take their turns within one cycle.
	ALL_AGENTS = [RISK_MANAGER, PORTFOLIO_MGR, TRADER]

	# ─── Observation Sizes ──────────────────────────────────────────────────────
	# Base observation: market (14) + portfolio (5) + risk (5) features.
	BASE_OBS_SIZE = 14 + 5 + 5
	# Risk Manager message, appended to the PM and Trader observations:
	# [size_limit, allow_new, force_reduce]
	RM_MSG_SIZE = 3
	# Portfolio Manager message, appended to the Trader observation:
	# [cap_allocation, is_override_signaled]
	PM_MSG_SIZE = 2


	class MultiAgentTradingEnv(AECEnv):
	"""
	A PettingZoo AEC environment for decentralized multi-agent trading governance.

	Turn order per step: risk_manager_0 → portfolio_manager_0 → trader_0
	On each full cycle (i.e. after the trader has acted), the market advances
	by one candle.

	Observations:
	risk_manager_0: base_obs (24,)
	portfolio_manager_0: base_obs + rm_message (24 + 3 = 27,)
	trader_0: base_obs + rm_message + pm_message (24 + 3 + 2 = 29,)

	Actions:
	risk_manager_0: Box(3,) — [size_limit, allow_new_positions, force_reduce] — continuous
	portfolio_manager_0: Box(2,) — [capital_allocation_fraction, override_flag] — continuous
	trader_0: Dict — direction (Discrete 3), size (Box 1), sl (Box 1), tp (Box 1)
	"""

	# Environment metadata exposed as a class attribute.
	metadata = {
	"render_modes": ["human", "ansi"],
	"name": "multi_agent_trading_v1",
	"is_parallelizable": False,
	}

	def __init__(
		self,
		df: Optional[pd.DataFrame] = None,
		initial_cash: float = 100_000.0,
		ticker: str = "default",
		commission: float = 0.001,
		max_steps: Optional[int] = None,
		difficulty: str = "hard",
	):
		"""
		Build the environment, its per-agent spaces and its initial state.

		Args:
			df: Price data. When ``None``, synthetic data is generated via
				``_make_dummy_data`` for the given ``difficulty``.
			initial_cash: Starting cash balance of the shared portfolio.
			ticker: Key under which the single traded instrument is tracked.
			commission: Proportional commission used by trade execution.
			max_steps: Episode length in market steps. Falsy values mean
				"use all data"; larger values are clamped to the last
				available candle so the episode cannot run past the data.
			difficulty: Forwarded to the synthetic data generator.
		"""
		super().__init__()

		self.difficulty = difficulty
		if df is None:
			df = self._make_dummy_data(difficulty=difficulty)
		self.raw_df = df.copy()
		self.df = compute_indicators(df)
		self.ticker = ticker
		self.initial_cash = initial_cash
		self.commission = commission

		# Never allow more steps than there are candles to step onto.
		last_index = len(self.df) - 1
		self.max_steps = min(max_steps, last_index) if max_steps else last_index

		# ── PettingZoo required attributes ──────────────────────────────────
		self.agents = ALL_AGENTS[:]
		self.possible_agents = ALL_AGENTS[:]

		# ── Observation spaces ──────────────────────────────────────────────
		self.observation_spaces = {
			RISK_MANAGER: spaces.Box(
				low=-np.inf, high=np.inf,
				shape=(BASE_OBS_SIZE,), dtype=np.float32),
			PORTFOLIO_MGR: spaces.Box(
				low=-np.inf, high=np.inf,
				shape=(BASE_OBS_SIZE + RM_MSG_SIZE,), dtype=np.float32),
			TRADER: spaces.Box(
				low=-np.inf, high=np.inf,
				shape=(BASE_OBS_SIZE + RM_MSG_SIZE + PM_MSG_SIZE,), dtype=np.float32),
		}

		# ── Action spaces ───────────────────────────────────────────────────
		self.action_spaces = {
			RISK_MANAGER: spaces.Box(
				low=np.array([0.01, 0.0, 0.0], dtype=np.float32),
				high=np.array([1.0, 1.0, 1.0], dtype=np.float32),
				shape=(3,), dtype=np.float32),
			PORTFOLIO_MGR: spaces.Box(
				low=np.array([0.0, 0.0], dtype=np.float32),
				high=np.array([1.0, 1.0], dtype=np.float32),
				shape=(2,), dtype=np.float32),
			TRADER: spaces.Dict({
				"direction": spaces.Discrete(3),  # 0=Hold, 1=Buy, 2=Sell/Short
				"size": spaces.Box(0.0, 1.0, shape=(1,), dtype=np.float32),
				"sl": spaces.Box(0.0, np.inf, shape=(1,), dtype=np.float32),
				"tp": spaces.Box(0.0, np.inf, shape=(1,), dtype=np.float32),
			}),
		}

		# ── Internal state (reset before first use) ─────────────────────────
		self._agent_selector = agent_selector(ALL_AGENTS)
		self._reset_internal_state()

	# ───────────────────────────────────────────────────────────────────────────
	# PettingZoo required API
	# ───────────────────────────────────────────────────────────────────────────

	def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
		"""
		Begin a new episode.

		Seeds NumPy's global RNG when ``seed`` is given, restores the agent
		roster and turn order, rebuilds the internal state and observations,
		and zero-fills the per-agent bookkeeping dictionaries. ``options`` is
		accepted but not used.
		"""
		if seed is not None:
			np.random.seed(seed)

		self.agents = ALL_AGENTS[:]
		self._agent_selector.reinit(ALL_AGENTS)

		self._reset_internal_state()
		self._generate_observations()

		self.agent_selection = self._agent_selector.reset()

		# Fresh per-agent bookkeeping for the new episode.
		self.rewards = dict.fromkeys(self.agents, 0.0)
		self._cumulative_rewards = dict.fromkeys(self.agents, 0.0)
		self.terminations = dict.fromkeys(self.agents, False)
		self.truncations = dict.fromkeys(self.agents, False)
		self.infos = {name: {} for name in self.agents}

	def step(self, action):
		"""
		Process the action of the currently selected agent.

		The risk manager and portfolio manager only publish their messages;
		once the trader has acted the market cycle is complete, the market
		advances and the rewards of the whole cycle are credited exactly once.
		"""
		agent = self.agent_selection

		if self.terminations[agent] or self.truncations[agent]:
			# Dead-step: PZ compliance requires we handle this
			self._was_dead_step(action)
			return

		# The acting agent has already been handed its accumulated reward,
		# so its running total starts again from zero.
		self._cumulative_rewards[agent] = 0.0

		# ── Route action to the correct handler ─────────────────────────────
		if agent == RISK_MANAGER:
			self._step_risk_manager(action)
		elif agent == PORTFOLIO_MGR:
			self._step_portfolio_manager(action)
		elif agent == TRADER:
			self._step_trader(action)
			# After the trader acts, the market cycle is complete → advance step
			self._advance_market()

		if agent == TRADER:
			# Rewards are final only after the market moved: credit them once.
			self._accumulate_rewards()
		else:
			# Mid-cycle turns yield no reward. Re-adding the pending values
			# here would count the previous cycle's rewards again.
			self.rewards = {ag: 0.0 for ag in self.agents}

		# Advance to next agent
		self.agent_selection = self._agent_selector.next()

	def observe(self, agent: str) -> np.ndarray:
		"""Return the most recently generated observation for ``agent``."""
		latest = self._observations
		return latest[agent]

	def observation_space(self, agent: str) -> spaces.Space:
		"""Return the observation space registered for ``agent``."""
		registry = self.observation_spaces
		return registry[agent]

	def action_space(self, agent: str) -> spaces.Space:
		"""Return the action space registered for ``agent``."""
		registry = self.action_spaces
		return registry[agent]

	def render(self):
		"""
		Print a one-line summary of the current step.

		The line shows the step counter, the current price, the portfolio
		value at that price and the agent whose turn it is. Fields are
		separated by a plain ``|`` (the previous ``\\|`` was an invalid escape
		sequence that printed a stray backslash).
		"""
		price = self._market.current_price()
		val = self._portfolio.total_value(price, self.ticker)
		print(
			f"Step {self._current_step:4d} | "
			f"Price: {price:10,.2f} | "
			f"Value: {val:12,.2f} | "
			f"Agent: {self.agent_selection}"
		)

	def close(self):
		"""Release resources; this environment holds none, so it is a no-op."""
		return None

	# ───────────────────────────────────────────────────────────────────────────
	# Per-Agent Step Handlers
	# ───────────────────────────────────────────────────────────────────────────

	def _step_risk_manager(self, action: np.ndarray):
		"""
		Risk Manager publishes governance constraints for this cycle.

		action = [size_limit (0.01-1), allow_new_positions (0-1), force_reduce (0-1)]
		``size_limit`` is clamped to its declared bounds; the two flags are
		thresholded at 0.5. The resulting message is stored for the PM and
		Trader observations.

		Reward logic, as implemented here:
		+0.20 if drawdown > 10% and size_limit < 0.30 (capped risk while underwater)
		+0.15 if force_reduce is set and drawdown > 20%
		-0.20 if drawdown > 15% and size_limit > 0.70 (reckless sizing)
		plus half of any negative change in portfolio value since the previous
		recorded value, expressed as a fraction of the initial cash.
		"""
		# Clamp to the action-space bounds, mirroring the PM handler, so an
		# out-of-range policy output cannot produce a negative size cap.
		size_limit = float(np.clip(action[0], 0.01, 1.0))
		allow_new = float(action[1]) > 0.5
		force_reduce = float(action[2]) > 0.5

		# Store message to pass to PM and Trader
		self._rm_message = np.array(
			[size_limit, float(allow_new), float(force_reduce)], dtype=np.float32
		)

		# Compute RM's step reward
		drawdown = self._risk.current_drawdown
		rm_reward = 0.0

		# Rewarded for restricting size when portfolio is underwater
		if drawdown > 0.10 and size_limit < 0.30:
			rm_reward += 0.20

		# Correct force-reduce under severe drawdown
		if force_reduce and drawdown > 0.20:
			rm_reward += 0.15

		# Penalize for allowing reckless sizing when at risk
		if drawdown > 0.15 and size_limit > 0.70:
			rm_reward -= 0.20

		# Shared downside: only losses are passed on, gains are ignored.
		prev_val = self._prev_portfolio_value
		curr_price = self._market.current_price()
		curr_val = self._portfolio.total_value(curr_price, self.ticker)
		portfolio_delta_pct = (curr_val - prev_val) / (self.initial_cash + 1e-10)
		rm_reward += min(portfolio_delta_pct * 0.5, 0.0)

		self._pending_rewards[RISK_MANAGER] = rm_reward

	def _step_portfolio_manager(self, action: np.ndarray):
		"""
		Portfolio Manager publishes its capital allocation and override signal.

		action = [capital_allocation (0-1), override_strength (0-1)]
		The allocation is clamped to [0, 1]; the override strength is stored
		as given. Both are kept as scalars and as the message appended to the
		Trader observation.

		The PM's reward is provisionally zero here; the real value is written
		by ``_advance_market`` once the outcome of the cycle is known.
		"""
		allocation = float(np.clip(action[0], 0.0, 1.0))
		override_strength = float(action[1])

		self._pm_capital_allocation = allocation
		self._pm_override_strength = override_strength
		self._pm_message = np.array([allocation, override_strength], dtype=np.float32)

		# Placeholder until the market has moved.
		self._pending_rewards[PORTFOLIO_MGR] = 0.0

	def _step_trader(self, action: Dict):
	"""
	Trader proposes a trade using the constrained action space.
	Receives both RM and PM guidance in its observation.

	Reward logic (adversarial):
	Rewarded purely on PnL.
	Penalized when governance overrides (RM size cap, PM force-close) are triggered.
	Bonus for proposing compliant actions that need no governance intervention.
	"""
	direction = int(action["direction"])
	size_raw = float(action["size"][0]) if hasattr(action["size"], "__len__") else float(action["size"])
	sl_input = float(action["sl"][0]) if hasattr(action["sl"], "__len__") else float(action.get("sl", 0.0))
	tp_input = float(action["tp"][0]) if hasattr(action["tp"], "__len__") else float(action.get("tp", 0.0))

	size = float(np.clip(size_raw, 0.0, 1.0))

	# â”€â”€ Apply Risk Manager constraints â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
	rm_size_limit = float(self._rm_message[0])
	rm_allow_new = bool(self._rm_message[1] > 0.5)
	rm_force_reduce = bool(self._rm_message[2] > 0.5)

	interventions: List[Dict] = []

	if direction != 0 and size > rm_size_limit:
	interventions.append({
	"agent": "RiskManager",
	"type": "size_clamp",
	"original_size": size,
	"enforced_size": rm_size_limit,
	})
	size = rm_size_limit

	if direction in (1, 2) and not rm_allow_new:
	interventions.append({
	"agent": "RiskManager",
	"type": "no_new_positions",
	"reason": "RM blocked new positions during drawdown",
	})
	direction = 0 # Force hold

	if rm_force_reduce and direction == 1:
	interventions.append({
	"agent": "RiskManager",
	"type": "force_reduce",
	"reason": "RM signaling to reduce longs",
	})
	direction = 2 # Flip to reduce

	# â”€â”€ Apply Portfolio Manager override â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
	cap_alloc = self._pm_capital_allocation
	if direction != 0 and size > cap_alloc:
	interventions.append({
	"agent": "PortfolioManager",
	"type": "capital_cap",
	"original_size": size,
	"enforced_size": cap_alloc,
	})
	size = min(size, cap_alloc)

	# PM strong override_strength >0.7 means PM wants to force hold
	if self._pm_override_strength > 0.7 and direction != 0:
	interventions.append({
	"agent": "PortfolioManager",
	"type": "pm_veto",
	"reason": "PM vetoed trade (insufficient conviction signal)",
	})
	direction = 0

	# â”€â”€ Auto SL/TP (governance baseline) â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
	current_price = self._market.current_price()
	DEFAULT_SL = 0.02
	if direction != 0 and sl_input <= 0:
	if direction == 1:
	sl_input = current_price * (1 - DEFAULT_SL)
	else:
	sl_input = current_price * (1 + DEFAULT_SL)
	interventions.append({"agent": "RiskManager", "type": "auto_sl"})
	if direction != 0 and tp_input <= 0 and sl_input > 0:
	sl_dist = abs(current_price - sl_input)
	tp_input = (current_price + sl_dist * 2.0) if direction == 1 else (current_price - sl_dist * 2.0)
	interventions.append({"agent": "RiskManager", "type": "auto_tp"})

	# Store pending trade for market advance
	self._pending_trade = {
	"direction": direction,
	"size": size,
	"sl": sl_input,
	"tp": tp_input,
	"interventions": interventions,
	"original_direction": int(action["direction"]),
	"original_size": size_raw,
	}

	# Compliance reward/penalty â€” will be finalized after market moves
	n_interventions = len(interventions)
	compliance_bonus = 0.15 if (n_interventions == 0 and direction != 0) else (-0.05 * n_interventions)
	self._trader_compliance_bonus = compliance_bonus

	# ───────────────────────────────────────────────────────────────────────────
	# Market Advance (called after Trader acts)
	# ───────────────────────────────────────────────────────────────────────────

	def _advance_market(self):
		"""
		Close out one market cycle.

		Runs the stop-loss/take-profit check and the staged trade at the
		current price, moves the market one step forward, updates the risk
		state, writes the pending rewards of all three agents, flags
		termination, rebuilds the observations and records the governance
		log entry and per-agent infos.
		"""
		staged = getattr(self, "_pending_trade", None)
		if staged is None:
			# No trade was staged (edge case): fall back to a hold.
			staged = {
				"direction": 0, "size": 0.0, "sl": 0.0, "tp": 0.0,
				"interventions": [], "original_direction": 0, "original_size": 0.0,
			}
			self._pending_trade = staged

		direction, size = staged["direction"], staged["size"]
		sl_input, tp_input = staged["sl"], staged["tp"]

		price_before = self._market.current_price()
		value_before = self._portfolio.total_value(price_before, self.ticker)

		# Existing protective orders are checked before the new action runs.
		self._check_sl_tp(price_before)
		traded = self._execute_trade(direction, size, sl_input, tp_input, price_before)

		# Move to the next candle.
		self._current_step += 1
		self._market.current_step = self._current_step

		# Past the end of the data the last known price is reused.
		if self._current_step < len(self.df):
			price_after = self._market.current_price()
		else:
			price_after = price_before
		value_after = self._portfolio.total_value(price_after, self.ticker)
		self._risk.update(value_after)
		self._episode_values.append(value_after)

		# Step profit relative to the initial cash, and the price move itself.
		profit = (value_after - value_before) / (self.initial_cash + 1e-10)
		price_trend = (price_after - price_before) / (price_before + 1e-10)

		raw_r = compute_raw_reward(
			profit=profit,
			drawdown=self._risk.current_drawdown,
			volatility=self._risk.return_volatility(),
			sharpe=self._risk.sharpe_ratio(),
			trade_count=int(traded),
			direction=direction,
			price_trend=price_trend,
		)

		# ── Trader reward ───────────────────────────────────────────────────
		trader_reward = normalize_reward(raw_r + self._trader_compliance_bonus)
		self._pending_rewards[TRADER] = float(trader_reward)
		self._episode_rewards.append(trader_reward)

		# ── PM reward: grade-based portfolio performance ────────────────────
		normalized_profit = float(np.clip((profit + 1.0) / 2.0, 0.0, 1.0))
		normalized_sharpe = float(np.clip((self._risk.sharpe_ratio() + 2.0) / 4.0, 0.0, 1.0))
		if len(self._episode_values) > 2:
			# Share of steps on which the portfolio value went up.
			consistency = float(np.mean(np.diff(np.array(self._episode_values)) > 0))
		else:
			consistency = 0.5
		grade = float(compute_grade({
			"profit": normalized_profit,
			"sharpe": normalized_sharpe,
			"drawdown": float(self._risk.max_drawdown),
			"consistency": consistency,
		}))
		pm_reward = (grade - 0.5) * 0.4  # centre the grade around zero
		if self._risk.max_drawdown > 0.20:
			pm_reward -= 0.15  # PM penalized for deep drawdown
		self._pending_rewards[PORTFOLIO_MGR] = float(pm_reward)

		# ── RM: shared downside with final portfolio value ──────────────────
		# Added on top of whatever _step_risk_manager already stored.
		rm_pain = min(profit * 0.5, 0.0)
		rm_so_far = self._pending_rewards.get(RISK_MANAGER, 0.0)
		self._pending_rewards[RISK_MANAGER] = float(rm_so_far + rm_pain)

		# ── Termination Check ───────────────────────────────────────────────
		out_of_steps = self._current_step >= self.max_steps
		blown_up = value_after < self.initial_cash * 0.10
		if out_of_steps or blown_up:
			for name in self.agents:
				self.terminations[name] = True

		# Rebuild observations for the next cycle
		self._generate_observations()

		# Update governance log
		applied = staged["interventions"]
		gov_record = {
			"step": self._current_step,
			"proposed": {
				"direction": staged["original_direction"],
				"size": staged["original_size"],
			},
			"executed": {"direction": direction, "size": size, "sl": sl_input, "tp": tp_input},
			"interventions": applied,
			"was_compliant": len(applied) == 0,
			"rm_message": self._rm_message.tolist(),
			"pm_message": self._pm_message.tolist(),
		}
		self._governance_log.append(gov_record)

		# The Trader gets the richest info; RM and PM get a short summary.
		self.infos[TRADER] = {
			"step": self._current_step,
			"portfolio_value": float(value_after),
			"cash": float(self._portfolio.cash),
			"pnl": float(value_after - self.initial_cash),
			"pnl_pct": float(profit),
			"max_drawdown": float(self._risk.max_drawdown),
			"sharpe_ratio": float(self._risk.sharpe_ratio()),
			"grade": grade,
			"governance": gov_record,
			"rewards": dict(self._pending_rewards),
		}
		self.infos[RISK_MANAGER] = {
			"step": self._current_step,
			"drawdown": float(self._risk.max_drawdown),
		}
		self.infos[PORTFOLIO_MGR] = {"step": self._current_step, "grade": grade}

		self._prev_portfolio_value = value_after
		self._pending_trade = None

	# ───────────────────────────────────────────────────────────────────────────
	# Observation Generation
	# ───────────────────────────────────────────────────────────────────────────

	def _generate_observations(self):
		"""
		Rebuild every agent's observation from the current state.

		The risk manager sees the base observation only; the portfolio
		manager additionally sees the RM message; the trader sees the base
		observation followed by the RM and PM messages.
		"""
		shared = get_observation(self._market, self._portfolio, self._risk, self.ticker)
		rm_msg, pm_msg = self._rm_message, self._pm_message

		fresh = {}
		fresh[RISK_MANAGER] = shared.copy()
		fresh[PORTFOLIO_MGR] = np.concatenate([shared, rm_msg])
		fresh[TRADER] = np.concatenate([shared, rm_msg, pm_msg])
		self._observations = fresh

	# ───────────────────────────────────────────────────────────────────────────
	# Internal Helpers
	# ───────────────────────────────────────────────────────────────────────────

	def _reset_internal_state(self):
		"""
		Recreate all per-episode state.

		Builds fresh market, portfolio and risk state objects at step 0,
		restores the neutral inter-agent messages, clears the staged trade,
		pending rewards, episode history and governance log, and zero-fills
		the observation buffers.
		"""
		starting_cash = self.initial_cash

		self._market = MarketState(prices=self.df, current_step=0)
		self._portfolio = PortfolioState(initial_cash=starting_cash, cash=starting_cash)
		self._risk = RiskState(peak_value=starting_cash)
		self._current_step = 0

		# Neutral messages: RM = [size_limit 50%, allow new, no force-reduce],
		# PM = [cap_alloc 50%, override strength 0].
		self._rm_message = np.array([0.5, 1.0, 0.0], dtype=np.float32)
		self._pm_message = np.array([0.5, 0.0], dtype=np.float32)
		self._pm_capital_allocation, self._pm_override_strength = 0.5, 0.0

		self._pending_trade = None
		self._pending_rewards = dict.fromkeys(ALL_AGENTS, 0.0)
		self._trader_compliance_bonus = 0.0

		self._episode_values = [starting_cash]
		self._episode_rewards = []
		self._governance_log: List[Dict] = []
		self._prev_portfolio_value = starting_cash

		# Zeroed observation buffers, one per agent, shaped like its space.
		self._observations = {}
		for name in ALL_AGENTS:
			shape = self.observation_spaces[name].shape
			self._observations[name] = np.zeros(shape, dtype=np.float32)

	def _accumulate_rewards(self):
	"""Move pending rewards into PZ cumulative reward tracking."""
	for ag in self.agents:
	self.rewards[ag] = self._pending_rewards.get(ag, 0.0)
	self._cumulative_rewards[ag] += self.rewards[ag]

	def _execute_trade(
	self, direction: int, size: float, sl: float, tp: float, current_price: float
	) -> bool:
	"""Execute trade on portfolio state. Returns True if a trade was made."""
	traded = False

	if direction == 1: # BUY / Cover Short
	pos = self._portfolio.positions.get(self.ticker, 0.0)
	if pos < 0:
	# Cover short
	abs_qty = abs(pos)
	cover_cost = abs_qty * current_price * (1 + self.commission)
	margin_return = abs_qty * self._portfolio.avg_costs.get(self.ticker, current_price)
	self._portfolio.cash += margin_return - cover_cost
	self._portfolio.positions[self.ticker] = 0.0
	self._portfolio.avg_costs[self.ticker] = 0.0
	self._portfolio.stop_losses[self.ticker] = None
	self._portfolio.take_profits[self.ticker] = None
	traded = True
	else:
	trade_qty = (self._portfolio.cash * size) / (current_price * (1 + self.commission) + 1e-10)
	if trade_qty > 1e-8:
	cost = trade_qty * current_price * (1 + self.commission)
	self._portfolio.cash -= cost
	prev_qty = pos
	prev_avg = self._portfolio.avg_costs.get(self.ticker, 0.0)
	new_qty = prev_qty + trade_qty
	new_avg = ((prev_qty * prev_avg) + (trade_qty * current_price)) / (new_qty + 1e-10)
	self._portfolio.positions[self.ticker] = new_qty
	self._portfolio.avg_costs[self.ticker] = new_avg
	if sl > 0: self._portfolio.stop_losses[self.ticker] = sl
	if tp > 0: self._portfolio.take_profits[self.ticker] = tp
	traded = True

	elif direction == 2: # SELL / Short
	pos = self._portfolio.positions.get(self.ticker, 0.0)
	if pos > 0:
	sell_qty = min(pos, pos * size)
	if sell_qty > 1e-8:
	revenue = sell_qty * current_price * (1 - self.commission)
	self._portfolio.cash += revenue
	remaining = pos - sell_qty
	self._portfolio.positions[self.ticker] = max(remaining, 0.0)
	if remaining <= 1e-8:
	self._portfolio.avg_costs[self.ticker] = 0.0
	self._portfolio.stop_losses[self.ticker] = None
	self._portfolio.take_profits[self.ticker] = None
	traded = True
	else:
	margin = self._portfolio.cash * size
	short_qty = margin / (current_price * (1 + self.commission) + 1e-10)
	if short_qty > 1e-8:
	self._portfolio.cash -= short_qty * current_price
	prev_qty = abs(pos)
	prev_avg = self._portfolio.avg_costs.get(self.ticker, 0.0)
	new_qty = prev_qty + short_qty
	new_avg = ((prev_qty * prev_avg) + (short_qty * current_price)) / (new_qty + 1e-10)
	self._portfolio.positions[self.ticker] = -new_qty
	self._portfolio.avg_costs[self.ticker] = new_avg
	if sl > 0: self._portfolio.stop_losses[self.ticker] = sl
	if tp > 0: self._portfolio.take_profits[self.ticker] = tp
	traded = True

	if traded:
	self._risk.trade_count += 1
	return traded

	def _check_sl_tp(self, current_price: float):
	"""Check and execute SL/TP orders."""
	ticker = self.ticker
	pos_qty = self._portfolio.positions.get(ticker, 0.0)
	sl = self._portfolio.stop_losses.get(ticker)
	tp = self._portfolio.take_profits.get(ticker)
	if abs(pos_qty) < 1e-8:
	return

	hit = False
	if pos_qty > 0:
	if sl and current_price <= sl: hit = True
	if tp and current_price >= tp: hit = True
	if hit:
	revenue = pos_qty * current_price * (1 - self.commission)
	self._portfolio.cash += revenue
	self._portfolio.positions[ticker] = 0.0
	self._portfolio.avg_costs[ticker] = 0.0
	self._portfolio.stop_losses[ticker] = None
	self._portfolio.take_profits[ticker] = None
	self._risk.trade_count += 1
	elif pos_qty < 0:
	abs_qty = abs(pos_qty)
	if sl and current_price >= sl: hit = True
	if tp and current_price <= tp: hit = True
	if hit:
	avg_cost = self._portfolio.avg_costs.get(ticker, current_price)
	cover_cost = abs_qty * current_price * (1 + self.commission)
	margin_ret = abs_qty * avg_cost
	self._portfolio.cash += margin_ret - cover_cost
	self._portfolio.positions[ticker] = 0.0
	self._portfolio.avg_costs[ticker] = 0.0
	self._portfolio.stop_losses[ticker] = None
	self._portfolio.take_profits[ticker] = None
	self._risk.trade_count += 1

	def _make_dummy_data(self, n: int = 500, difficulty: str = "hard") -> pd.DataFrame:
		"""
		Produce synthetic market data via ``TradingEnv._generate_market_data``.

		A ``TradingEnv`` object is allocated without running its constructor,
		purely to reach that generator method.
		"""
		from env.trading_env import TradingEnv

		generator = TradingEnv.__new__(TradingEnv)
		frame = generator._generate_market_data(n=n, difficulty=difficulty)
		return frame

	# ───────────────────────────────────────────────────────────────────────────
	# Convenience
	# ───────────────────────────────────────────────────────────────────────────

	@functools.lru_cache(maxsize=None)
	def _obs_space(self, agent: str) -> spaces.Space:
	return self.observation_spaces[agent]

	@functools.lru_cache(maxsize=None)
	def _act_space(self, agent: str) -> spaces.Space:
	return self.action_spaces[agent]

	def state(self) -> Dict:
		"""
		Return a plain-Python snapshot of the shared environment state.

		Contains the step counter, current price, portfolio value and cash,
		all positions, headline risk figures, the latest RM and PM messages
		and the ten most recent governance log entries.
		"""
		price = self._market.current_price()
		portfolio = self._portfolio
		risk = self._risk

		snapshot = {"step": self._current_step, "price": float(price)}
		snapshot["portfolio_value"] = float(portfolio.total_value(price, self.ticker))
		snapshot["cash"] = float(portfolio.cash)
		snapshot["positions"] = {
			symbol: float(qty) for symbol, qty in portfolio.positions.items()
		}
		snapshot["max_drawdown"] = float(risk.max_drawdown)
		snapshot["sharpe_ratio"] = float(risk.sharpe_ratio())
		snapshot["trade_count"] = risk.trade_count
		snapshot["rm_message"] = self._rm_message.tolist()
		snapshot["pm_message"] = self._pm_message.tolist()
		snapshot["governance_log"] = self._governance_log[-10:]
		return snapshot