Spaces:

WitchedWick
/

JaamCTRL-OpenEnv

Sleeping

Akshara

Fix: Add JSON serialization for numpy arrays in reset/step/state methods

203a9f2 2 months ago

23.8 kB

	"""
	───────────────────────────────────────────────────────────────────────────────
	JaamCTRLTrafficEnv — core Gymnasium environment loop.

	This class owns:
	- SUMO subprocess lifecycle (_launch_sumo, _close_sumo)
	- Action application and yellow-phase safety enforcement
	- Delegating to observation.py, reward.py, incident_manager.py
	- Episode metrics accumulation and success checking
	- The three Gymnasium contract methods: reset(), step(), state()

	It does NOT define task configs, reward coefficients, or observation math —
	those all live in their respective modules.
	───────────────────────────────────────────────────────────────────────────────
	"""

	from __future__ import annotations

	import json
	import logging
	import os
	import subprocess
	import sys
	import time
	from collections import deque
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple

	import numpy as np
	import gymnasium as gym
	from gymnasium import spaces

	# ── TraCI import ─────────────────────────────────────────────────────────────
	if "SUMO_HOME" in os.environ:
	sys.path.append(os.path.join(os.environ["SUMO_HOME"], "tools"))
	try:
	import traci
	TRACI_AVAILABLE = True
	except ImportError:
	TRACI_AVAILABLE = False

	# ── Intra-package imports ─────────────────────────────────────────────────────
	from env import (
	TASK_CONFIGS,
	YELLOW_PHASES,
	YELLOW_DURATION,
	MIN_GREEN_S,
	)
	from env.reward import compute_reward, reward_breakdown
	from env.observation import (
	collect_telemetry,
	mock_telemetry,
	build_obs,
	OBS_DIM,
	)
	from env.incident_manager import IncidentManager

	logger = logging.getLogger("JaamCTRL.BaseEnv")


	class JaamCTRLTrafficEnv(gym.Env):
	"""
	Gymnasium-compatible adaptive traffic signal control environment.

	Supports 3 progressive difficulty tasks via `task_id`.
	Designed for use with the OpenEnv judging harness and stable-baselines3.

	Parameters
	----------
	task_id : 1 = Easy, 2 = Medium, 3 = Hard
	sumo_cfg_path : path to .sumocfg (default "sumo/corridor.sumocfg")
	use_gui : launch sumo-gui (True) or headless sumo (False)
	port : TraCI port; 0 = auto-select a free port
	seed : RNG seed for full reproducibility
	mock_sumo : skip SUMO entirely, use synthetic observations
	(useful for CI / unit tests without SUMO installed)
	"""

	metadata = {"render_modes": ["human", "none"], "render_fps": 10}

	# ── Construction ─────────────────────────────────────────────────────────

	def __init__(
	self,
	task_id: int = 1,
	sumo_cfg_path: str = "sumo/corridor.sumocfg",
	use_gui: bool = False,
	port: int = 0,
	seed: Optional[int] = None,
	mock_sumo: bool = False,
	) -> None:
	super().__init__()

	assert task_id in (1, 2, 3), f"task_id must be 1, 2 or 3; got {task_id}"

	self.task_id = task_id
	self.cfg = TASK_CONFIGS[task_id]
	self.use_gui = use_gui
	self.port = port
	self.mock_sumo = mock_sumo or not TRACI_AVAILABLE
	self.sumo_cfg = Path(sumo_cfg_path)

	self._rng = np.random.default_rng(seed)
	self.n_tl = self.cfg["active_intersections"]

	# ── Spaces ────────────────────────────────────────────────────────
	# Action: phase index per active TL (0–3 each)
	self.action_space = spaces.MultiDiscrete([4] * self.n_tl)

	# Observation: Dict with a pre-flattened "flat" key for PPO
	self.observation_space = spaces.Dict({
	"queue_lengths": spaces.Box(0.0, 50.0, shape=(3, 4), dtype=np.float32),
	"current_phase": spaces.MultiDiscrete([4, 4, 4]),
	"phase_elapsed": spaces.Box(0.0, 120.0, shape=(3,), dtype=np.float32),
	"probe_density": spaces.Box(0.0, 1.0, shape=(3, 8), dtype=np.float32),
	"incident_flag": spaces.MultiBinary(3),
	"time_of_day_norm": spaces.Box(0.0, 1.0, shape=(1,), dtype=np.float32),
	"flat": spaces.Box(-1.0, 50.0, shape=(OBS_DIM,), dtype=np.float32),
	})

	# ── Internal state ────────────────────────────────────────────────
	self._step_count = 0
	self._sim_time_s = 0.0
	self._sumo_process = None
	self._phase_elapsed = np.zeros(3, dtype=np.float32)
	self._current_phases = np.zeros(3, dtype=np.int32)
	self._episode_throughput = np.zeros(3, dtype=np.float32)
	self._episode_delay_sum = 0.0
	self._episode_stops = 0
	self._overflow_events = 0
	self._last_telemetry: Dict[str, Any] = {}

	# Phase history for green-wave computation (observation.py)
	# Entries: (sim_time_s: float, tl_index: int, phase: int)
	self._phase_history: deque = deque(maxlen=200)

	# Baseline metrics for episode_summary comparison
	self._baseline_avg_delay: Optional[float] = None
	self._baseline_throughput: Optional[float] = None

	# Incident manager — owns all chaos events
	self._incident_mgr = IncidentManager(
	cfg=self.cfg,
	rng=self._rng,
	mock_sumo=self.mock_sumo,
	)

	logger.info(
	"JaamCTRLTrafficEnv \| task=%d \| %s \| n_tl=%d \| mock=%s",
	task_id, self.cfg["name"], self.n_tl, self.mock_sumo,
	)

	# ── Gymnasium API ─────────────────────────────────────────────────────────

	def _to_serializable(self, obj: Any) -> Any:
	"""Convert numpy arrays and other non-JSON types to native Python types."""
	if isinstance(obj, np.ndarray):
	return obj.tolist()
	elif isinstance(obj, (np.integer, np.floating)):
	return obj.item()
	elif isinstance(obj, dict):
	return {k: self._to_serializable(v) for k, v in obj.items()}
	elif isinstance(obj, (list, tuple)):
	return [self._to_serializable(v) for v in obj]
	return obj

	def reset(
	self,
	*,
	seed: Optional[int] = None,
	options: Optional[Dict[str, Any]] = None,
	) -> Tuple[Dict[str, Any], Dict[str, Any]]:
	"""
	Reset to the start of a new episode.

	`options` dict accepts:
	"task_id" / "difficulty" : int — switch task on reset
	"use_gui" : bool — override GUI flag

	Returns
	-------
	obs : dict (see observation_space)
	info : dict (task metadata + reset flag)
	"""
	if seed is not None:
	self._rng = np.random.default_rng(seed)

	# Optional task switch
	if options:
	new_task = options.get("task_id") or options.get("difficulty")
	if new_task and int(new_task) != self.task_id:
	self.task_id = int(new_task)
	self.cfg = TASK_CONFIGS[self.task_id]
	self.n_tl = self.cfg["active_intersections"]
	self.action_space = spaces.MultiDiscrete([4] * self.n_tl)
	self._incident_mgr = IncidentManager(
	cfg=self.cfg, rng=self._rng, mock_sumo=self.mock_sumo
	)
	logger.info("Task switched to %d on reset.", self.task_id)
	if "use_gui" in options:
	self.use_gui = bool(options["use_gui"])

	self._close_sumo()

	# Reset episode counters
	self._step_count = 0
	self._sim_time_s = 0.0
	self._phase_elapsed[:] = 0.0
	self._current_phases[:] = 0
	self._episode_throughput[:] = 0.0
	self._episode_delay_sum = 0.0
	self._episode_stops = 0
	self._overflow_events = 0
	self._last_telemetry = {}
	self._phase_history.clear()
	self._incident_mgr.reset()

	if not self.mock_sumo:
	self._launch_sumo()

	# Generate first observation via mock telemetry (no sim steps yet)
	tel = (
	mock_telemetry(self._rng, self.n_tl, self.cfg["probe_noise_sigma"])
	if self.mock_sumo
	else self._fetch_telemetry()
	)
	self._last_telemetry = tel

	obs = build_obs(
	telemetry=tel,
	current_phases=self._current_phases,
	phase_elapsed=self._phase_elapsed,
	active_incidents=self._incident_mgr.active_incidents,
	step_count=self._step_count,
	max_steps=self.cfg["max_steps"],
	n_tl=self.n_tl,
	)
	info = self._build_info(reward=0.0, terminated=False, truncated=False)
	info["reset"] = True
	return self._to_serializable(obs), self._to_serializable(info)

	def step(
	self,
	action: np.ndarray,
	) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
	"""
	Apply phase actions and advance the simulation by one decision step.

	Parameters
	----------
	action : np.ndarray shape (n_tl,) dtype int
	Phase index (0–3) for each active TL.

	Returns
	-------
	obs, reward, terminated, truncated, info
	"""
	action_arr = np.asarray(action, dtype=np.int64).flatten()
	padded_action = self._pad_action(action_arr)
	thrash_count = 0

	# ── Apply phase actions ───────────────────────────────────────────
	for i in range(self.n_tl):
	desired = int(padded_action[i])
	current = int(self._current_phases[i])

	# Thrash guard: ignore switch if minimum green not yet elapsed
	if (
	desired != current
	and desired not in YELLOW_PHASES
	and self._phase_elapsed[i] < MIN_GREEN_S
	):
	thrash_count += 1
	desired = current # keep current phase

	# Auto-insert yellow transition between green phases
	if (
	current in (0, 2)
	and desired in (0, 2)
	and desired != current
	):
	yellow = current + 1 # 0→1 (NS) or 2→3 (EW)
	self._set_phase(i, yellow)
	self._advance_sim(YELLOW_DURATION[yellow])

	self._set_phase(i, desired)
	self._phase_history.append((self._sim_time_s, i, desired))

	# ── Advance simulation ────────────────────────────────────────────
	self._advance_sim(self.cfg["decision_interval_s"])
	self._step_count += 1

	# ── Incident tick ─────────────────────────────────────────────────
	traci_ref = traci if not self.mock_sumo else None
	self._incident_mgr.tick(
	step=self._step_count,
	n_tl=self.n_tl,
	traci=traci_ref,
	)

	# ── Telemetry ─────────────────────────────────────────────────────
	tel = (
	mock_telemetry(self._rng, self.n_tl, self.cfg["probe_noise_sigma"])
	if self.mock_sumo
	else self._fetch_telemetry()
	)
	tel["thrash_count"] = thrash_count

	# Incident clearance check (updates incident_mgr internal flag)
	self._incident_mgr.check_clearance(tel["queue_lengths"])
	tel["incident_cleared"] = self._incident_mgr.incident_cleared

	self._last_telemetry = tel

	# ── Update phase elapsed ──────────────────────────────────────────
	for i in range(self.n_tl):
	if int(self._current_phases[i]) == int(padded_action[i]):
	self._phase_elapsed[i] += self.cfg["decision_interval_s"]
	else:
	self._phase_elapsed[i] = 0.0

	# ── Build obs ─────────────────────────────────────────────────────
	obs = build_obs(
	telemetry=tel,
	current_phases=self._current_phases,
	phase_elapsed=self._phase_elapsed,
	active_incidents=self._incident_mgr.active_incidents,
	step_count=self._step_count,
	max_steps=self.cfg["max_steps"],
	n_tl=self.n_tl,
	)

	# ── Reward ────────────────────────────────────────────────────────
	reward = compute_reward(
	telemetry=tel,
	cfg=self.cfg,
	step_count=self._step_count,
	n_tl=self.n_tl,
	)

	# ── Accumulate episode metrics ────────────────────────────────────
	self._episode_throughput += tel["throughput"]
	self._episode_delay_sum += tel["total_waiting_time_s"]
	self._episode_stops += int(tel["new_stops"].sum())
	if tel["overflow_lanes"] > 0:
	self._overflow_events += 1

	# ── Termination ───────────────────────────────────────────────────
	truncated = self._step_count >= self.cfg["max_steps"]
	terminated = False # no early-success termination

	info = self._build_info(reward, terminated, truncated, tel)
	if truncated or terminated:
	info["episode_summary"] = self._build_episode_summary()

	return self._to_serializable(obs), reward, terminated, truncated, self._to_serializable(info)

	def state(self) -> Dict[str, Any]:
	"""
	Return a fully JSON-serialisable snapshot of current env state.
	Called by the OpenEnv grader after each episode.
	"""
	state_dict = {
	"task_id": self.task_id,
	"task_name": self.cfg["name"],
	"step": int(self._step_count),
	"sim_time_s": float(self._sim_time_s),
	"current_phases": self._current_phases.tolist(),
	"phase_elapsed_s": self._phase_elapsed.tolist(),
	"active_incidents": self._incident_mgr.active_incidents,
	"episode_throughput": self._episode_throughput.tolist(),
	"episode_delay_sum_s": float(self._episode_delay_sum),
	"episode_stops": int(self._episode_stops),
	"overflow_events": int(self._overflow_events),
	"incident_cleared": bool(self._incident_mgr.incident_cleared),
	"success_thresholds": self.cfg["success_thresholds"],
	}
	return self._to_serializable(state_dict)

	def render(self, mode: str = "none") -> None:
	"""SUMO-GUI handles rendering; this is a no-op for headless mode."""

	def close(self) -> None:
	self._close_sumo()

	# ── SUMO lifecycle ────────────────────────────────────────────────────────

	def _launch_sumo(self) -> None:
	"""Start a SUMO subprocess and connect via TraCI."""
	binary = "sumo-gui" if self.use_gui else "sumo"

	if self.port == 0:
	import socket
	with socket.socket() as s:
	s.bind(("", 0))
	self.port = s.getsockname()[1]

	cmd = [
	binary,
	"--configuration-file", str(self.sumo_cfg),
	"--route-files", str(self.cfg["route_file"]),
	"--remote-port", str(self.port),
	"--no-step-log", "true",
	"--no-warnings", "true",
	"--collision.action", "remove",
	"--seed", str(int(self._rng.integers(0, 99_999))),
	]
	logger.debug("SUMO cmd: %s", " ".join(cmd))

	self._sumo_process = subprocess.Popen(
	cmd,
	stdout=subprocess.DEVNULL,
	stderr=subprocess.DEVNULL,
	)
	time.sleep(1.0)
	traci.init(self.port)
	logger.debug("TraCI connected on port %d.", self.port)

	def _close_sumo(self) -> None:
	try:
	if TRACI_AVAILABLE:
	traci.close()
	except Exception:
	pass
	if self._sumo_process is not None:
	self._sumo_process.terminate()
	try:
	self._sumo_process.wait(timeout=5)
	except subprocess.TimeoutExpired:
	self._sumo_process.kill()
	self._sumo_process = None

	def _advance_sim(self, seconds: float) -> None:
	if self.mock_sumo:
	self._sim_time_s += seconds
	return
	for _ in range(max(1, int(seconds))):
	traci.simulationStep()
	self._sim_time_s += seconds

	def _set_phase(self, tl_index: int, phase: int) -> None:
	self._current_phases[tl_index] = phase
	if not self.mock_sumo:
	traci.trafficlight.setPhase(self.cfg["tl_ids"][tl_index], phase)

	# ── Telemetry via TraCI ───────────────────────────────────────────────────

	def _fetch_telemetry(self) -> Dict[str, Any]:
	"""Collect metrics from live TraCI connection."""
	return collect_telemetry(
	traci=traci,
	cfg=self.cfg,
	n_tl=self.n_tl,
	rng=self._rng,
	phase_history=self._phase_history,
	active_incidents=self._incident_mgr.active_incidents,
	incident_cleared_flag=self._incident_mgr.incident_cleared,
	)

	# ── Helpers ───────────────────────────────────────────────────────────────

	def _pad_action(self, action: np.ndarray) -> np.ndarray:
	"""Zero-pad a task-sized action to length 3 for internal loops."""
	padded = np.zeros(3, dtype=np.int64)
	padded[:len(action)] = action[:3]
	return padded

	def _build_info(
	self,
	reward: float,
	terminated: bool,
	truncated: bool,
	telemetry: Optional[Dict] = None,
	) -> Dict[str, Any]:
	info: Dict[str, Any] = {
	"task_id": self.task_id,
	"step": int(self._step_count),
	"sim_time_s": float(self._sim_time_s),
	"reward": float(reward),
	"terminated": terminated,
	"truncated": truncated,
	}
	if telemetry:
	info.update({
	"queue_total": float(telemetry["queue_lengths"][:self.n_tl].sum()),
	"throughput_total": float(telemetry["throughput"][:self.n_tl].sum()),
	"overflow_lanes": int(telemetry["overflow_lanes"]),
	"long_wait_count": int(telemetry["long_wait_count"]),
	"green_wave_hits": int(telemetry["green_wave_hits"]),
	"incident_cleared": bool(telemetry["incident_cleared"]),
	"active_incidents": self._incident_mgr.active_incidents,
	"reward_breakdown": reward_breakdown(
	telemetry, self.cfg, self._step_count, self.n_tl
	),
	})
	return info

	def _build_episode_summary(self) -> Dict[str, Any]:
	"""Compute end-of-episode metrics and check success thresholds."""
	total_steps = max(1, self._step_count)
	avg_delay = self._episode_delay_sum / total_steps
	total_through = float(self._episode_throughput[:self.n_tl].sum())

	# Fall back to heuristic baseline if set_baseline() was never called
	baseline_delay = self._baseline_avg_delay or max(1e-6, avg_delay * 1.30)
	baseline_throughput = self._baseline_throughput or max(1e-6, total_through * 0.80)

	delay_reduction_pct = (
	100.0 * (baseline_delay - avg_delay) / baseline_delay
	)
	throughput_improvement_pct = (
	100.0 * (total_through - baseline_throughput) / baseline_throughput
	)

	t = self.cfg["success_thresholds"]
	passed = all([
	delay_reduction_pct >= t.get("delay_reduction_pct", 0.0),
	throughput_improvement_pct >= t.get("throughput_improvement_pct", 0.0),
	self._overflow_events <= t.get("overflow_events", 9999),
	])

	summary = {
	"task_id": self.task_id,
	"total_steps": total_steps,
	"avg_delay_s": round(float(avg_delay), 4),
	"total_throughput": round(float(total_through), 4),
	"delay_reduction_pct": round(float(delay_reduction_pct), 4),
	"throughput_improvement_pct": round(float(throughput_improvement_pct), 4),
	"overflow_events": int(self._overflow_events),
	"success": bool(passed),
	"thresholds": t,
	}
	logger.info("Episode done \| %s", json.dumps(
	{k: v for k, v in summary.items() if k != "thresholds"}
	))
	return summary

	def set_baseline(self, avg_delay: float, throughput: float) -> None:
	"""
	Inject fixed-time baseline metrics for episode_summary comparison.
	Called by inference.py after running a fixed-time reference episode.
	"""
	self._baseline_avg_delay = float(avg_delay)
	self._baseline_throughput = float(throughput)

	def reward_breakdown_last(self) -> Dict[str, float]:
	"""Return per-term reward breakdown for the most recent step."""
	if not self._last_telemetry:
	return {}
	return reward_breakdown(
	self._last_telemetry, self.cfg, self._step_count, self.n_tl
	)

	def __repr__(self) -> str:
	return (
	f"JaamCTRLTrafficEnv("
	f"task={self.task_id}, "
	f"n_tl={self.n_tl}, "
	f"step={self._step_count}/{self.cfg['max_steps']})"
	)