# finalRLEnv/env/action_space.py
"""
Hierarchical Factored Action Space.
4 heads decoded sequentially at each step:
Head 1: Meta-action — what high-level thing to do?
Head 2: Specialist selection — which specialist(s) to call?
Head 3: Delegation mode — how to call them?
Head 4: Mode parameters — how many rounds, threshold, etc.?
Design: Sequential decomposition keeps each head's distribution
tractable for PPO. The policy outputs a single flat action vector, but
training uses the factored structure.
"""
from __future__ import annotations
from dataclasses import dataclass
from enum import IntEnum
from typing import Optional
import numpy as np


class MetaAction(IntEnum):
"""Top-level orchestrator decisions."""
CALL_SPECIALIST = 0 # Call one or more specialists
STOP = 1 # Stop delegation, synthesize output
CALL_MEDIATOR = 2 # Call conflict mediator
CLARIFY_TASK = 3 # Request task clarification (if ambiguous)
DELEGATE_SUBTASK = 4 # Delegate a sub-problem (2nd level)
RETRY_FAILED = 5 # Retry a failed specialist with fallback
PARALLEL_SPAWN = 6 # Spawn parallel specialists
SPAWN_SPECIALIST = 7 # Policy requests a new specialist be created


class DelegationMode(IntEnum):
"""How to execute the selected specialists."""
SEQUENTIAL = 0 # A → B → C (each sees previous output)
PARALLEL = 1 # A, B, C all run simultaneously
FAN_OUT_REDUCE = 2 # A, B, C run → mediator reduces output
ITERATIVE = 3 # Run specialist, check output, loop until threshold
CONDITIONAL = 4 # Run A; if condition met, run B, else C
PRIORITY_QUEUE = 5 # Run in priority order, stop when threshold met
BROADCAST = 6 # Send to all specialists, take first to complete


@dataclass
class FactoredAction:
"""
The complete action decoded from all 4 heads.
This is what gets passed to the environment's step() function.
"""
meta_action: MetaAction
specialist_ids: list[str] # Which specialists to call
delegation_mode: DelegationMode # How to call them
mode_params: dict # Mode-specific parameters
raw_action: Optional[np.ndarray] = None # Raw policy output (for logging)
def is_terminal(self) -> bool:
"""Returns True if this action ends the episode."""
return self.meta_action == MetaAction.STOP
def to_log_dict(self) -> dict:
return {
"meta_action": self.meta_action.name,
"specialists": self.specialist_ids,
"mode": self.delegation_mode.name,
"params": self.mode_params,
}


class ActionDecoder:
"""
Decodes a flat action vector from the policy into a FactoredAction.
    Action vector layout:
        [0]                       : meta_action index (int, 0–7)
        [1 : 1+max_specialists]   : specialist selection (multi-hot float)
        [1+max_specialists]       : delegation_mode index (int, 0–6)
        [2+max_specialists : end] : mode_params (continuous, 4 floats)
    Total action dim = 1 + max_specialists + 1 + 4 = max_specialists + 6
"""
NUM_META_ACTIONS = len(MetaAction)
NUM_DELEGATION_MODES = len(DelegationMode)
NUM_MODE_PARAMS = 4
def __init__(self, specialist_ids: list[str], max_specialists: int = 8):
self.specialist_ids = specialist_ids
self.max_specialists = min(len(specialist_ids), max_specialists)
self.action_dim = self.max_specialists + 6
def decode(
self,
action_vector: np.ndarray,
valid_specialist_mask: Optional[np.ndarray] = None,
) -> FactoredAction:
"""
Decode a flat action vector into a FactoredAction.
Args:
action_vector: Flat numpy array from the policy
valid_specialist_mask: Binary mask, 1 = valid, 0 = masked out
(enforces DAG constraints)
"""
action_vector = np.asarray(action_vector, dtype=np.float32)
# Head 1: Meta-action
meta_idx = int(np.clip(round(action_vector[0]), 0, self.NUM_META_ACTIONS - 1))
meta_action = MetaAction(meta_idx)
# Head 2: Specialist selection (multi-hot)
spec_logits = action_vector[1: 1 + self.max_specialists]
        if valid_specialist_mask is not None:
            # Use -inf (rather than multiplying by 0) so the argmax fallback
            # below can never pick a masked-out specialist.
            mask = valid_specialist_mask[: self.max_specialists]
            spec_logits = np.where(mask > 0, spec_logits, -np.inf)
        selected_indices = np.where(spec_logits > 0.0)[0]
        if len(selected_indices) == 0 and meta_action == MetaAction.CALL_SPECIALIST:
            # Fallback: select the highest-scoring specialist
            selected_indices = [int(np.argmax(spec_logits))]
selected_ids = [
self.specialist_ids[i]
for i in selected_indices
if i < len(self.specialist_ids)
]
# Head 3: Delegation mode
mode_idx = int(np.clip(
round(action_vector[1 + self.max_specialists]),
0, self.NUM_DELEGATION_MODES - 1
))
delegation_mode = DelegationMode(mode_idx)
# Head 4: Mode parameters
param_start = 2 + self.max_specialists
raw_params = action_vector[param_start: param_start + self.NUM_MODE_PARAMS]
mode_params = self._decode_mode_params(delegation_mode, raw_params)
return FactoredAction(
meta_action=meta_action,
specialist_ids=selected_ids,
delegation_mode=delegation_mode,
mode_params=mode_params,
raw_action=action_vector,
)
def _decode_mode_params(
self, mode: DelegationMode, raw_params: np.ndarray
) -> dict:
"""Decode mode-specific parameters from the raw continuous params."""
p = np.clip(raw_params, 0.0, 1.0)
if mode == DelegationMode.ITERATIVE:
return {
"max_rounds": int(1 + round(p[0] * 4)), # 1–5 rounds
"quality_threshold": float(0.5 + p[1] * 0.5), # 0.5–1.0
}
elif mode == DelegationMode.PRIORITY_QUEUE:
return {
"stop_threshold": float(0.6 + p[0] * 0.4), # 0.6–1.0
}
elif mode == DelegationMode.CONDITIONAL:
return {
"condition_threshold": float(0.4 + p[0] * 0.6), # 0.4–1.0
}
else:
            return {"parallel_budget_ms": int(2000 + p[0] * 6000)}  # 2000–8000 ms
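
    # Worked example (illustrative): for ITERATIVE mode, raw params starting
    # with [0.5, 0.8, ...] decode to max_rounds = 1 + round(0.5 * 4) = 3 and
    # quality_threshold = 0.5 + 0.8 * 0.5 ≈ 0.9.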
def get_action_dim(self) -> int:
return self.action_dim
def build_specialist_mask(
self, valid_specialist_ids: list[str]
) -> np.ndarray:
"""Build a binary mask for valid specialist selections."""
mask = np.zeros(self.max_specialists, dtype=np.float32)
valid_set = set(valid_specialist_ids)
for i, sid in enumerate(self.specialist_ids[: self.max_specialists]):
if sid in valid_set:
mask[i] = 1.0
return mask
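

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; the specialist ids, the mask, and
# the sample action vector below are made-up values, not part of the original
# module).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    decoder = ActionDecoder(specialist_ids=["research", "code", "review"])
    # action_dim = max_specialists + 6 = 3 + 6 = 9
    # layout:   [meta, s0,  s1,   s2,  mode, p0,  p1,  p2,  p3]
    sample = np.array([0.0, 0.9, -0.2, 0.7, 3.0, 0.5, 0.8, 0.0, 0.0])

    # Suppose the DAG constraints only allow "research" and "code" this step.
    mask = decoder.build_specialist_mask(["research", "code"])

    action = decoder.decode(sample, valid_specialist_mask=mask)
    # Expected: CALL_SPECIALIST in ITERATIVE mode on ["research"] with
    # max_rounds = 3 and quality_threshold ≈ 0.9.
    print(action.to_log_dict())
    print("terminal:", action.is_terminal())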