# Provenance note: uploaded to the Varaha OpenEnv Docker Space (user atin5551, commit cb70a7d).
"""Varaha — wildfire logistics simulation environment.
A drone must deliver supplies to responder zones near wildfire hazards in
California-like terrain. The environment uses lightweight 3D kinematics with
local metre-based coordinates and an optional lat/lon conversion helper for
later Cesium visualisation.
"""
import math
import random
from dataclasses import dataclass
from typing import Any, Optional
from sim_types import (
Vec3,
DroneState,
BaseStation,
DeliveryTarget,
HazardRegion,
ObstacleVolume,
CylindricalObstacle,
ResponderUnit,
ScheduledEvent,
RESPONDER_STATUSES,
INTEL_TYPES,
StepInfo,
TracePoint,
MissionInstruction,
)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
@dataclass
class VarahaConfig:
    """All tunable environment parameters live here.

    Pass a customised instance to ``VarahaEnv`` to change world scale,
    drone physics, reward shaping, or instruction-mode behaviour.
    """
    # World bounds (metres) — 5 km × 5 km operational area
    world_x: float = 5000.0
    world_y: float = 5000.0
    world_z: float = 200.0  # altitude ceiling (metres)
    # Drone physics
    battery_capacity: float = 300.0  # abstract battery units at full charge
    max_speed: float = 25.0  # m/s
    max_acceleration: float = 8.0  # m/s²
    dt: float = 0.5  # seconds per step
    # Episode
    max_episode_steps: int = 2000
    # Battery drain coefficients (tuned for 5 km scale)
    drain_per_meter: float = 0.008  # per metre of travel
    drain_elevation_factor: float = 0.02  # extra cost per metre of climb/descent
    drain_idle_per_step: float = 0.005  # fixed hover cost charged every step
    recharge_rate: float = 5.0  # battery units restored per recharge step
    # Reward knobs
    delivery_reward: float = 200.0
    return_bonus: float = 100.0
    step_penalty: float = 0.05
    battery_cost_factor: float = 0.3
    collision_penalty: float = 500.0
    hazard_penalty: float = 5.0
    failure_penalty: float = 200.0
    distance_shaping_factor: float = 0.05
    obstacle_proximity_penalty: float = 1.5
    obstacle_proximity_radius: float = 80.0  # metres; presumably the proximity-penalty range — confirm in _compute_reward
    # Long-horizon instruction mode (LLM-oriented)
    instruction_mode: bool = False
    instruction_count: int = 60  # <= 0 derives the program length from the target count
    sparse_reward_mode: bool = False
    instruction_completion_reward: float = 0.5
    instruction_terminal_success_bonus: float = 2200.0
    instruction_terminal_progress_bonus: float = 800.0
    instruction_violation_penalty: float = 120.0
    instruction_unfinished_penalty: float = 10.0
    # Tool names accepted by VarahaEnv._execute_tool_call.
    available_tools: tuple[str, ...] = (
        "request_intel",
        "battery_forecast",
        "mission_report",
    )
    # California origin anchor (near Sacramento — wildfire-relevant);
    # used by local_to_latlon for the flat-earth conversion.
    origin_lat: float = 38.55
    origin_lon: float = -121.47
# ---------------------------------------------------------------------------
# Random world generator for domain randomization
# ---------------------------------------------------------------------------
def build_random_world(env: "VarahaEnv") -> None:
    """Legacy easy world gen — kept for backward compatibility.

    Historically generated a simpler layout; now just delegates to
    :func:`build_hardcore_world` at default difficulty.
    """
    build_hardcore_world(env)
def _hdist(a: Vec3, b: Vec3) -> float:
    """Horizontal (XY-plane) Euclidean distance between two points, ignoring z.

    Uses :func:`math.hypot`, which is the idiomatic form and avoids
    intermediate overflow/underflow of the squared terms.
    """
    return math.hypot(a.x - b.x, a.y - b.y)
def build_hardcore_world(env: "VarahaEnv", ultra_hard: bool = False) -> None:
    """Generate an extremely challenging randomized world for serious RL training.

    Features template-based obstacle placement (urban grid, dense forest,
    corridor maze, river valley, fortress, mixed), cylindrical obstacles,
    responder units with dynamic events, and adversarial target placement.
    When ultra_hard=True: denser obstacles, more hazards, more targets, longer episodes.

    Mutates ``env`` in place, assigning ``base``, ``targets``, ``hazards``,
    ``obstacles``, ``cylinders`` and ``responders``.
    """
    cfg = env.cfg
    rng = random  # module-level RNG; seeded globally via VarahaEnv.reset(seed)
    wx, wy, wz = cfg.world_x, cfg.world_y, cfg.world_z
    margin = 200.0  # keep generated geometry this far from the world edges

    def _rpos(z_lo=10.0, z_hi=60.0):
        # Uniform random point inside the margins, altitude in [z_lo, z_hi].
        return Vec3(rng.uniform(margin, wx - margin),
                    rng.uniform(margin, wy - margin),
                    rng.uniform(z_lo, z_hi))

    def _rpos_ground():
        # Uniform random ground-level (z = 0) point inside the margins.
        return Vec3(rng.uniform(margin, wx - margin),
                    rng.uniform(margin, wy - margin), 0.0)

    # --- Base station ---
    base_pos = Vec3(rng.uniform(100, wx - 100), rng.uniform(100, wy - 100), 0.0)
    env.base = BaseStation(position=base_pos, recharge_radius=rng.uniform(60, 100))

    # --- Targets (2-5 normal, 3-6 ultra) ---
    if ultra_hard:
        n_targets = rng.choices([3, 4, 5, 6], weights=[0.15, 0.35, 0.35, 0.15])[0]
    else:
        n_targets = rng.choices([2, 3, 4, 5], weights=[0.15, 0.40, 0.30, 0.15])[0]
    targets = []
    for i in range(n_targets):
        # Rejection-sample a spot >= 500 m from base and > 400 m from other
        # targets; after 120 failed attempts the last candidate is kept anyway.
        for _ in range(120):
            pos = _rpos(z_lo=5.0, z_hi=60.0)
            if _hdist(pos, base_pos) < 500:
                continue
            if all(_hdist(pos, t.position) > 400 for t in targets):
                break
        targets.append(DeliveryTarget(
            id=f"T{i+1}", position=pos,
            urgency=rng.uniform(0.3, 1.0),
            delivery_radius=rng.uniform(70.0, 130.0),
        ))
    env.targets = targets

    # --- Hazards (3-8 normal, 5-10 ultra) with wild variety ---
    if ultra_hard:
        n_hazards = rng.choices([5, 6, 7, 8, 9, 10], weights=[0.10, 0.20, 0.25, 0.25, 0.15, 0.05])[0]
    else:
        n_hazards = rng.choices([3, 4, 5, 6, 7, 8], weights=[0.10, 0.20, 0.25, 0.25, 0.15, 0.05])[0]
    hazards = []
    for i in range(n_hazards):
        center = _rpos_ground()
        # Each archetype fixes (radius, severity, height, growth_rate) ranges.
        fire_type = rng.choice(["tiny_intense", "massive_low", "tall_mid", "standard"])
        if fire_type == "tiny_intense":
            r, sev, ht, gr = rng.uniform(80, 200), rng.uniform(0.9, 1.0), rng.uniform(140, 195), rng.uniform(0.012, 0.025)
        elif fire_type == "massive_low":
            r, sev, ht, gr = rng.uniform(500, 1000), rng.uniform(0.3, 0.5), rng.uniform(25, 50), rng.uniform(0.001, 0.004)
        elif fire_type == "tall_mid":
            r, sev, ht, gr = rng.uniform(250, 500), rng.uniform(0.7, 0.95), rng.uniform(100, 180), rng.uniform(0.008, 0.015)
        else:
            r, sev, ht, gr = rng.uniform(200, 600), rng.uniform(0.4, 0.9), rng.uniform(40, 120), rng.uniform(0.003, 0.012)
        hazards.append(HazardRegion(id=f"H{i+1}", center=center,
                                    radius=r, severity=sev, height=ht, growth_rate=gr))
    env.hazards = hazards

    # --- Obstacle templates ---
    obstacles: list[ObstacleVolume] = []
    cylinders: list[CylindricalObstacle] = []
    oid = [0]  # single-element list so the closures below can mutate the counter

    def _next_oid(prefix="O"):
        # Sequential ids shared by boxes ("O…") and cylinders ("C…").
        oid[0] += 1
        return f"{prefix}{oid[0]}"

    def _add_box(cx, cy, w, h, zt, kind="building"):
        # Axis-aligned box centred at (cx, cy): footprint w × h, top at z = zt.
        obstacles.append(ObstacleVolume(
            id=_next_oid(), kind=kind,
            min_corner=Vec3(cx - w / 2, cy - h / 2, 0.0),
            max_corner=Vec3(cx + w / 2, cy + h / 2, zt),
        ))

    def _add_cyl(cx, cy, radius, height, kind="tree"):
        # Ground-based vertical cylinder (trees, poles, pillars).
        cylinders.append(CylindricalObstacle(
            id=_next_oid("C"), kind=kind,
            center=Vec3(cx, cy, 0.0), radius=radius, height=height,
        ))

    if ultra_hard:
        # Ultra mode biases heavily toward the compound "mixed" template.
        template = rng.choices(["urban_grid", "dense_forest", "corridor_maze",
                                "river_valley", "fortress", "mixed"],
                               weights=[0.08, 0.12, 0.12, 0.10, 0.10, 0.48])[0]
    else:
        template = rng.choice(["urban_grid", "dense_forest", "corridor_maze",
                               "river_valley", "fortress", "mixed"])

    # ---- URBAN GRID: rows and columns of buildings ----
    if template == "urban_grid" or template == "mixed":
        ox = rng.uniform(500, 1500)
        oy = rng.uniform(500, 1500)
        rows = rng.randint(2, 5) if ultra_hard else rng.randint(2, 4)
        cols = rng.randint(3, 6) if ultra_hard else rng.randint(3, 5)
        spacing = rng.uniform(300, 550) if ultra_hard else rng.uniform(350, 600)
        for r in range(rows):
            for c in range(cols):
                bx = ox + c * spacing + rng.uniform(-80, 80)
                by = oy + r * spacing + rng.uniform(-80, 80)
                if bx < margin or bx > wx - margin or by < margin or by > wy - margin:
                    continue  # skip cells jittered outside the safe area
                bw = rng.uniform(80, 300)
                bh = rng.uniform(80, 300)
                # Bimodal height: low-rise or tower.
                bzt = rng.choice([rng.uniform(30, 60), rng.uniform(100, 195)])
                _add_box(bx, by, bw, bh, bzt)
                # Occasionally attach a slightly shorter annex wing.
                if rng.random() < (0.45 if ultra_hard else 0.3):
                    arm_dir = rng.choice(["east", "north"])
                    if arm_dir == "east":
                        _add_box(bx + bw / 2 + 40, by, 80, bh * 0.6, bzt * 0.9)
                    else:
                        _add_box(bx, by + bh / 2 + 40, bw * 0.6, 80, bzt * 0.9)

    # ---- DENSE FOREST: many cylindrical trees ----
    if template == "dense_forest" or template == "mixed":
        forest_cx = rng.uniform(800, wx - 800)
        forest_cy = rng.uniform(800, wy - 800)
        n_trees = rng.randint(25, 60) if ultra_hard else rng.randint(15, 40)
        for _ in range(n_trees):
            # Gaussian scatter around the forest centre, clamped to the margins.
            tx = forest_cx + rng.gauss(0, 600)
            ty = forest_cy + rng.gauss(0, 600)
            tx = max(margin, min(wx - margin, tx))
            ty = max(margin, min(wy - margin, ty))
            tree_type = rng.choice(["pine", "oak", "palm", "dead"])
            if tree_type == "pine":
                _add_cyl(tx, ty, rng.uniform(8, 20), rng.uniform(40, 100), "tree_pine")
            elif tree_type == "oak":
                _add_cyl(tx, ty, rng.uniform(15, 40), rng.uniform(25, 60), "tree_oak")
            elif tree_type == "palm":
                _add_cyl(tx, ty, rng.uniform(5, 12), rng.uniform(30, 80), "tree_palm")
            else:
                _add_cyl(tx, ty, rng.uniform(10, 25), rng.uniform(20, 50), "tree_dead")

    # ---- CORRIDOR MAZE: parallel walls with gaps ----
    if template == "corridor_maze" or template == "mixed":
        maze_ox = rng.uniform(400, wx / 2)
        maze_oy = rng.uniform(400, wy / 2)
        n_walls = rng.randint(6, 12) if ultra_hard else rng.randint(4, 8)
        wall_dir = rng.choice(["horizontal", "vertical"])
        spacing = rng.uniform(200, 500)
        for w in range(n_walls):
            wl = rng.uniform(400, 1500)
            wt = rng.uniform(40, 80)
            wzt = rng.uniform(100, 195)
            if wall_dir == "horizontal":
                wy_pos = maze_oy + w * spacing
                if wy_pos > wy - margin:
                    continue
                _add_box(maze_ox + wl / 2, wy_pos, wl, wt, wzt, "wall")
                # Zero-height "gap" marker box: filtered out below (z <= 1.0),
                # so it never collides — presumably a placeholder for a
                # renderable opening. TODO confirm intent.
                gap_x = maze_ox + rng.uniform(0.2, 0.8) * wl
                _add_box(gap_x, wy_pos, rng.uniform(80, 200), wt, 0, "gap")
            else:
                wx_pos = maze_ox + w * spacing
                if wx_pos > wx - margin:
                    continue
                # NOTE(review): vertical walls get no gap marker, unlike horizontal.
                _add_box(wx_pos, maze_oy + wl / 2, wt, wl, wzt, "wall")

    # ---- RIVER VALLEY: chain of low flat boxes + scattered trees ----
    if template == "river_valley" or (template == "mixed" and rng.random() < (0.7 if ultra_hard else 0.5)):
        river_start_x = rng.uniform(margin, wx / 3)
        river_y = rng.uniform(wy * 0.3, wy * 0.7)
        n_segs = rng.randint(10, 18) if ultra_hard else rng.randint(6, 12)
        for seg in range(n_segs):
            seg_x = river_start_x + seg * rng.uniform(200, 400)
            seg_y = river_y + rng.gauss(0, 150)
            if seg_x > wx - margin:
                break  # river ran off the east edge
            seg_y = max(margin, min(wy - margin, seg_y))
            _add_box(seg_x, seg_y, rng.uniform(200, 400), rng.uniform(60, 150),
                     rng.uniform(3, 10), "river")
            # Bank trees offset to one side of the river segment.
            for _ in range(rng.randint(2, 6) if ultra_hard else rng.randint(1, 4)):
                bank_offset = rng.choice([-1, 1]) * rng.uniform(100, 300)
                _add_cyl(seg_x + rng.uniform(-100, 100),
                         seg_y + bank_offset,
                         rng.uniform(8, 20), rng.uniform(30, 80), "tree_bank")

    # ---- FORTRESS: walls surrounding a target area ----
    if template == "fortress" or (template == "mixed" and rng.random() < (0.6 if ultra_hard else 0.4)):
        if targets:
            fort_target = rng.choice(targets)
            ftx, fty = fort_target.position.x, fort_target.position.y
            wall_half = rng.uniform(250, 500)
            wall_zt = rng.uniform(120, 190)
            wall_thick = rng.uniform(50, 80)
            # Four walls boxing in the chosen target — no doorway is added.
            _add_box(ftx, fty - wall_half, wall_half * 2, wall_thick, wall_zt, "fortress_wall")
            _add_box(ftx, fty + wall_half, wall_half * 2, wall_thick, wall_zt, "fortress_wall")
            _add_box(ftx - wall_half, fty, wall_thick, wall_half * 2, wall_zt, "fortress_wall")
            _add_box(ftx + wall_half, fty, wall_thick, wall_half * 2, wall_zt, "fortress_wall")

    # ---- Always scatter some light poles and random pillars ----
    n_poles = rng.randint(6, 18) if ultra_hard else rng.randint(3, 10)
    for _ in range(n_poles):
        px = rng.uniform(margin, wx - margin)
        py = rng.uniform(margin, wy - margin)
        _add_cyl(px, py, rng.uniform(2, 6), rng.uniform(30, 80), "light_pole")
    n_pillars = rng.randint(4, 12) if ultra_hard else rng.randint(2, 6)
    for _ in range(n_pillars):
        px = rng.uniform(margin, wx - margin)
        py = rng.uniform(margin, wy - margin)
        _add_cyl(px, py, rng.uniform(15, 50), rng.uniform(80, 195), "pillar")

    # Drop degenerate boxes (including the zero-height "gap" markers).
    obstacles = [o for o in obstacles if o.max_corner.z > 1.0]
    env.obstacles = obstacles
    env.cylinders = cylinders

    # --- Responder units (1 per target, up to 5 in ultra) ---
    responders = []
    max_resp = 5 if ultra_hard else 4
    for i, tgt in enumerate(targets[:max_resp]):
        r = ResponderUnit(
            id=f"R{i+1}",
            position=Vec3(tgt.position.x + rng.uniform(-50, 50),
                          tgt.position.y + rng.uniform(-50, 50), 0.0),
            linked_target_id=tgt.id,
            status="stable",
            current_need=rng.choice(["supplies", "medical", "evacuation", "water"]),
            can_update_dropzone=rng.random() < 0.5,
            active=True,
        )
        # Pre-schedule stochastic mid-episode events for this responder.
        events = []
        if rng.random() < 0.7:
            events.append(ScheduledEvent(
                step=rng.randint(100, 600),
                event_type="urgency_update",
                payload={"new_urgency": rng.uniform(0.5, 1.0)},
            ))
        if r.can_update_dropzone and rng.random() < 0.5:
            events.append(ScheduledEvent(
                step=rng.randint(200, 800),
                event_type="dropzone_relocation",
                payload={"dx": rng.uniform(-200, 200), "dy": rng.uniform(-200, 200)},
            ))
        if rng.random() < 0.6:
            intel = rng.choice([
                "blocked_north", "blocked_south", "blocked_east", "blocked_west",
                "safe_north", "safe_south", "safe_east", "safe_west",
                "fire_expanded", "fire_receded",
            ])
            events.append(ScheduledEvent(
                step=rng.randint(50, 500),
                event_type="hazard_intel",
                payload={"intel": intel, "severity": rng.uniform(0.3, 1.0)},
            ))
        r.scheduled_events = events
        responders.append(r)
    env.responders = responders
def build_hardcore_world_v2(env: "VarahaEnv") -> None:
    """Ultra-hard variant: denser obstacles, more hazards, more targets.

    Thin wrapper that runs :func:`build_hardcore_world` with
    ``ultra_hard=True``.
    """
    build_hardcore_world(env, ultra_hard=True)
# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
class VarahaEnv:
"""Core wildfire logistics simulation.
Action format (dict)::
{
"ax": float, # desired acceleration x (m/s²)
"ay": float, # desired acceleration y
"az": float, # desired acceleration z
"deliver": bool, # attempt delivery if near a target
"recharge": bool, # attempt recharge if near base
"tool_call": str, # optional: request_intel | battery_forecast | mission_report
}
Returns ``(obs_dict, reward, done, info_dict)`` per OpenAI-gym convention.
"""
    def __init__(self, config: Optional[VarahaConfig] = None,
                 world_fn: Optional[Any] = None) -> None:
        """Create the environment.

        Args:
            config: Tunable parameters; defaults to a fresh ``VarahaConfig``.
            world_fn: Optional world generator ``fn(env) -> None`` (e.g.
                ``build_hardcore_world``); when ``None`` the hardcoded demo
                world is used.
        """
        self.cfg = config or VarahaConfig()
        self._world_fn = world_fn
        # World geometry — populated by the builder in _rebuild_world().
        self.base: BaseStation
        self.drone: DroneState  # assigned in reset()
        self.targets: list[DeliveryTarget] = []
        self.hazards: list[HazardRegion] = []
        self.obstacles: list[ObstacleVolume] = []
        self.cylinders: list[CylindricalObstacle] = []
        self.responders: list[ResponderUnit] = []
        # Episode bookkeeping.
        self.step_count: int = 0
        self.cumulative_reward: float = 0.0
        self.done: bool = False
        self.trace: list[TracePoint] = []
        self._prev_nearest_dist: float = 0.0
        # Baseline hazard stats captured after world build; reset() jitters
        # each episode's hazards around these values.
        self._hazard_base_heights: list[float] = []
        self._hazard_base_severities: list[float] = []
        # Long-horizon instruction-mode state.
        self.instructions: list[MissionInstruction] = []
        self._instruction_cursor: int = 0
        self._instruction_violations: int = 0
        self._tool_history: list[str] = []
        self._last_tool_result: dict[str, Any] = {}
        self._instruction_progress_reward: float = 0.0
        # Build the world last so every attribute above exists for the builder.
        self._rebuild_world()
def _rebuild_world(self):
if self._world_fn is not None:
self._world_fn(self)
else:
self._build_demo_world()
self._hazard_base_heights = [h.height for h in self.hazards]
self._hazard_base_severities = [h.severity for h in self.hazards]
# ------------------------------------------------------------------
# World setup
# ------------------------------------------------------------------
    def _build_demo_world(self) -> None:
        """Hardcoded 5 km demo scenario.

        Layout (top-down, +x → east, +y → north, 5 km × 5 km)::

            T3 (1000,4200)
            ·
            H2 (900,3200)      O2 [500-1500, 2600-3000]
            ·
            ·                  T2 (4100,2900) ← inside H1 fringe
            ·                  H1 (3800,2600)
            ·
            ·        O1 [2200-2800, 1000-2200]
            ·
            ·        T1 (1800,600)
            ·
            Base (250,250)

        - T2 sits inside the fringe of hazard H1 → brief hazard exposure required
        - T3 is behind obstacle O2 and near hazard H2
        - O1 blocks direct mid-map routing from T1 to T2
        - Drone can fly over obstacles if altitude > obstacle height
        - Total route ≈ 12 km, battery budget ≈ 300 units
        """
        # Depot in the south-west corner.
        self.base = BaseStation(position=Vec3(250.0, 250.0, 0.0), recharge_radius=80.0)
        self.targets = [
            DeliveryTarget(
                id="T1", position=Vec3(1800.0, 600.0, 30.0),
                urgency=0.6, delivery_radius=80.0,
            ),
            # Highest urgency; sits in hazard H1's fringe (see docstring).
            DeliveryTarget(
                id="T2", position=Vec3(4100.0, 2900.0, 50.0),
                urgency=1.0, delivery_radius=120.0,
            ),
            DeliveryTarget(
                id="T3", position=Vec3(1000.0, 4200.0, 20.0),
                urgency=0.8, delivery_radius=100.0,
            ),
        ]
        self.hazards = [
            HazardRegion(
                id="H1", center=Vec3(3800.0, 2600.0, 0.0),
                radius=500.0, severity=0.9,
                height=70.0, growth_rate=0.005,
            ),
            HazardRegion(
                id="H2", center=Vec3(900.0, 3200.0, 0.0),
                radius=400.0, severity=0.7,
                height=55.0, growth_rate=0.008,
            ),
        ]
        self.obstacles = [
            # O1: tall mid-map block (120 m); forces a detour or a high climb.
            ObstacleVolume(
                id="O1",
                min_corner=Vec3(2200.0, 1000.0, 0.0),
                max_corner=Vec3(2800.0, 2200.0, 120.0),
            ),
            # O2: 90 m slab shielding T3.
            ObstacleVolume(
                id="O2",
                min_corner=Vec3(500.0, 2600.0, 0.0),
                max_corner=Vec3(1500.0, 3000.0, 90.0),
            ),
        ]
# ------------------------------------------------------------------
# Core API
# ------------------------------------------------------------------
    def reset(self, seed: Optional[int] = None) -> dict[str, Any]:
        """Reset the environment and return the initial observation.

        Args:
            seed: Optional seed. NOTE: this seeds the module-level ``random``
                RNG, which is process-global.
        """
        if seed is not None:
            random.seed(seed)
        # Only injected (randomized) worlds are regenerated per episode; the
        # demo world built in __init__ is static.
        if self._world_fn is not None:
            self._rebuild_world()
        # Drone starts grounded at the base with a full battery and payload.
        self.drone = DroneState(
            position=Vec3(self.base.position.x, self.base.position.y, 0.0),
            velocity=Vec3(0.0, 0.0, 0.0),
            battery=self.cfg.battery_capacity,
            carrying_payload=True,
            alive=True,
        )
        for t in self.targets:
            t.delivered = False
        # Domain randomisation: jitter each hazard around its baseline stats.
        for i, h in enumerate(self.hazards):
            h.height = self._hazard_base_heights[i] * random.uniform(0.85, 1.15)
            h.severity = max(0.3, min(1.0, self._hazard_base_severities[i] + random.uniform(-0.1, 0.1)))
            h.reset()
        # Re-arm responders and their scheduled events.
        for r in self.responders:
            r.active = True
            r.status = "stable"
            r.latest_intel = "none"
            r.intel_severity = 0.0
            r.message = ""
            for ev in r.scheduled_events:
                ev.fired = False
        # Snapshot spawn positions — presumably referenced by dropzone
        # relocation events; TODO confirm against _tick_responders.
        self._target_base_positions = {
            t.id: Vec3(t.position.x, t.position.y, t.position.z)
            for t in self.targets
        }
        self._build_instruction_program()
        self._instruction_progress_reward = 0.0
        self._last_tool_result = {}
        self._tool_history = []
        self.step_count = 0
        self.cumulative_reward = 0.0
        self.done = False
        self.trace = []
        # Baseline for distance-based reward shaping (cf. distance_shaping_factor).
        self._prev_nearest_dist = self._nearest_target_dist()
        obs = self.get_observation()
        # Record the initial state as step 0 of the trace.
        self.trace.append(TracePoint(
            step=0,
            position=Vec3(self.drone.position.x, self.drone.position.y, self.drone.position.z),
            velocity=Vec3(0.0, 0.0, 0.0),
            battery=self.drone.battery,
            reward=0.0,
            cumulative_reward=0.0,
            events=["reset"],
            observation=obs,
        ))
        return obs
    def step(self, action: dict[str, Any]) -> tuple[dict, float, bool, dict]:
        """Advance the simulation by one timestep.

        Args:
            action: Dict with optional keys ``ax``/``ay``/``az`` (desired
                acceleration, m/s²), ``deliver`` (bool), ``recharge`` (bool)
                and ``tool_call`` (str, e.g. ``"request_intel:T2"``).

        Returns ``(observation, reward, done, info)``.
        """
        # Terminal episodes are inert: repeat the observation at zero reward.
        if self.done:
            return self.get_observation(), 0.0, True, StepInfo().to_dict()
        self.step_count += 1
        # --- parse & clamp acceleration ---
        accel = Vec3(
            float(action.get("ax", 0.0)),
            float(action.get("ay", 0.0)),
            float(action.get("az", 0.0)),
        ).clamp_magnitude(self.cfg.max_acceleration)
        # --- kinematics (Euler integration) ---
        self.drone.velocity = (
            self.drone.velocity + accel.scale(self.cfg.dt)
        ).clamp_magnitude(self.cfg.max_speed)
        old_pos = Vec3(self.drone.position.x, self.drone.position.y, self.drone.position.z)
        self.drone.position = self.drone.position + self.drone.velocity.scale(self.cfg.dt)
        # clamp to world bounds
        self.drone.position.x = max(0.0, min(self.cfg.world_x, self.drone.position.x))
        self.drone.position.y = max(0.0, min(self.cfg.world_y, self.drone.position.y))
        self.drone.position.z = max(0.0, min(self.cfg.world_z, self.drone.position.z))
        # Distances measured on the post-clamp position.
        dist_traveled = old_pos.distance_to(self.drone.position)
        elevation_change = abs(self.drone.position.z - old_pos.z)
        # --- battery ---
        drain = self._compute_battery_drain(dist_traveled, elevation_change)
        self.drone.battery -= drain
        # --- advance dynamic hazards ---
        for h in self.hazards:
            h.tick()
        # --- advance responder events ---
        self._tick_responders()
        # --- world interactions ---
        collision = self._check_collisions()
        in_hazard, hazard_sev = self._check_hazards()
        # Tool calls run before delivery/instruction bookkeeping below.
        tool_call = ""
        tool_result: dict[str, Any] = {}
        raw_tool_call = action.get("tool_call")
        if raw_tool_call is not None and str(raw_tool_call).strip():
            tool_call, tool_result = self._execute_tool_call(str(raw_tool_call).strip())
        prev_instruction_cursor = self._instruction_cursor
        delivered_ids: list[str] = []
        if action.get("deliver", False):
            delivered_ids = self._deliver_targets()
        # Horizontal-only proximity check to the base recharge disc.
        reached_base = (
            ((self.drone.position.x - self.base.position.x) ** 2
             + (self.drone.position.y - self.base.position.y) ** 2) ** 0.5
            <= self.base.recharge_radius
        )
        if action.get("recharge", False) and reached_base:
            self.drone.battery = min(
                self.cfg.battery_capacity,
                self.drone.battery + self.cfg.recharge_rate,
            )
        self._update_instruction_progress(
            delivered_ids=delivered_ids,
            reached_base=reached_base,
            tool_call=tool_call,
        )
        # Number of instructions completed this step (for trace events).
        completed_now = max(0, self._instruction_cursor - prev_instruction_cursor)
        if self._all_delivered():
            self.drone.carrying_payload = False
        # --- reward ---
        info = StepInfo(
            collision=collision,
            delivered_target_ids=delivered_ids,
            in_hazard=in_hazard,
            hazard_severity=hazard_sev,
            reached_base=reached_base,
            distance_traveled=dist_traveled,
            tool_call=tool_call,
            tool_result=tool_result,
            instruction_completed=self._instruction_cursor,
            instruction_total=len(self.instructions),
            instruction_violations=self._instruction_violations,
        )
        reward, breakdown = self._compute_reward(info)
        info.reward_breakdown = breakdown
        self.cumulative_reward += reward
        # --- termination (precedence: collision > battery > success > timeout) ---
        if collision:
            self.drone.alive = False
            self.done = True
        elif self.drone.battery <= 0.0:
            self.drone.battery = 0.0
            self.drone.alive = False
            self.done = True
        elif self._is_success():
            self.done = True
        elif self.step_count >= self.cfg.max_episode_steps:
            self.done = True
        # record trace
        events: list[str] = []
        for tid in delivered_ids:
            events.append(f"delivered_{tid}")
        if collision:
            events.append("collision")
        if in_hazard:
            events.append(f"hazard_{hazard_sev:.2f}")
        if self.drone.battery <= 0.0 and not collision:
            events.append("battery_dead")
        if self._is_success():
            events.append("success")
        if tool_call:
            events.append(f"tool_{tool_call}")
        if completed_now > 0:
            events.append(f"instruction+{completed_now}")
        obs = self.get_observation()
        self.trace.append(TracePoint(
            step=self.step_count,
            position=Vec3(self.drone.position.x, self.drone.position.y, self.drone.position.z),
            velocity=Vec3(self.drone.velocity.x, self.drone.velocity.y, self.drone.velocity.z),
            battery=self.drone.battery,
            reward=reward,
            cumulative_reward=self.cumulative_reward,
            events=events,
            observation=obs,
        ))
        return obs, reward, self.done, info.to_dict()
# ------------------------------------------------------------------
# Observation / render
# ------------------------------------------------------------------
def get_observation(self) -> dict[str, Any]:
"""Compact, RL-friendly observation dict."""
dp = self.drone.position
targets_obs = []
for t in self.targets:
rel = t.position - dp
targets_obs.append({
"id": t.id,
"relative_position": rel.to_dict(),
"urgency": t.urgency,
"delivered": t.delivered,
})
hazards_obs = []
for h in self.hazards:
rel = h.center - dp
hazards_obs.append({
"id": h.id,
"relative_position": rel.to_dict(),
"current_height": h._current_height,
"severity": h.severity,
})
obstacles_obs = []
for obs in self.obstacles:
c = obs.center
hs = obs.half_size
rel = c - dp
dist = dp.horizontal_distance_to(c)
obstacles_obs.append({
"type": "box",
"relative_position": rel.to_dict(),
"height": obs.height,
"size_x": hs.x * 2,
"size_y": hs.y * 2,
"distance": dist,
"kind": obs.kind,
})
for cyl in self.cylinders:
rel = cyl.center - dp
dist = dp.horizontal_distance_to(cyl.center)
obstacles_obs.append({
"type": "cylinder",
"relative_position": rel.to_dict(),
"height": cyl.height,
"size_x": cyl.radius * 2,
"size_y": cyl.radius * 2,
"distance": dist,
"kind": cyl.kind,
})
obstacles_obs.sort(key=lambda o: o["distance"])
responders_obs = []
for r in self.responders:
if not r.active:
continue
rel = r.position - dp
intel_dir = r.intel_direction()
responders_obs.append({
"id": r.id,
"relative_position": rel.to_dict(),
"linked_target_id": r.linked_target_id,
"status": r.status,
"status_code": r.status_code(),
"latest_intel": r.latest_intel,
"intel_direction": {"x": intel_dir[0], "y": intel_dir[1]},
"intel_severity": r.intel_severity,
})
mission_obs = self._instruction_snapshot()
return {
"drone_position": dp.to_dict(),
"drone_velocity": self.drone.velocity.to_dict(),
"battery": round(self.drone.battery, 4),
"carrying_payload": self.drone.carrying_payload,
"alive": self.drone.alive,
"targets": targets_obs,
"hazards": hazards_obs,
"obstacles": obstacles_obs,
"responders": responders_obs,
"mission": mission_obs,
"last_tool_result": self._last_tool_result,
"step": self.step_count,
"max_steps": self.cfg.max_episode_steps,
}
def render_state(self) -> dict[str, Any]:
"""Rich state dict for future Cesium / frontend rendering."""
return {
"base_station": self.base.to_dict(),
"drone": self.drone.to_dict(),
"targets": [t.to_dict() for t in self.targets],
"hazards": [h.to_dict() for h in self.hazards],
"obstacles": [o.to_dict() for o in self.obstacles],
"cylinders": [c.to_dict() for c in self.cylinders],
"responders": [r.to_dict() for r in self.responders],
"mission": self._instruction_snapshot(include_full=True),
"tool_history": list(self._tool_history),
"step": self.step_count,
"max_steps": self.cfg.max_episode_steps,
"cumulative_reward": round(self.cumulative_reward, 4),
"done": self.done,
}
def get_trace(self) -> dict[str, Any]:
"""Full episode trace for replay / visualisation."""
return {
"world": {
"bounds": {"x": self.cfg.world_x, "y": self.cfg.world_y, "z": self.cfg.world_z},
"base_station": self.base.to_dict(),
"targets": [t.to_dict() for t in self.targets],
"hazards": [h.to_dict() for h in self.hazards],
"obstacles": [o.to_dict() for o in self.obstacles],
"cylinders": [c.to_dict() for c in self.cylinders],
"responders": [r.to_dict() for r in self.responders],
"mission": self._instruction_snapshot(include_full=True),
},
"trace": [tp.to_dict() for tp in self.trace],
"summary": {
"total_steps": self.step_count,
"cumulative_reward": round(self.cumulative_reward, 4),
"delivered": [t.id for t in self.targets if t.delivered],
"alive": self.drone.alive,
"final_battery": round(self.drone.battery, 4),
"success": self._is_success(),
"instruction_completed": self._instruction_cursor,
"instruction_total": len(self.instructions),
"instruction_violations": self._instruction_violations,
"tool_calls": list(self._tool_history),
},
}
# ------------------------------------------------------------------
# Long-horizon instruction mode
# ------------------------------------------------------------------
    def _build_instruction_program(self) -> None:
        """Construct the linear instruction program for instruction mode.

        Layout: repeated "cycles" over targets sorted by descending urgency
        (ties broken by id); each cycle emits one ``deliver_target``
        instruction per target plus one ``tool_call`` instruction, capped at
        ``desired_len - 1`` entries. A final ``return_base`` instruction
        always closes the program. No-op (empty program) when instruction
        mode is off or there are no targets.
        """
        self.instructions = []
        self._instruction_cursor = 0
        self._instruction_violations = 0
        if not self.cfg.instruction_mode or not self.targets:
            return
        ordered_targets = sorted(self.targets, key=lambda t: (-t.urgency, t.id))
        target_count = len(ordered_targets)
        # instruction_count <= 0 means "derive a sensible default length";
        # always long enough for two passes over the targets plus return_base.
        desired_len = self.cfg.instruction_count if self.cfg.instruction_count > 0 else (target_count * 3 + 1)
        desired_len = max(desired_len, target_count * 2 + 1)
        instructions: list[MissionInstruction] = []
        inst_idx = 1
        cycle = 0
        while len(instructions) < max(desired_len - 1, 1):
            for tgt in ordered_targets:
                if len(instructions) >= max(desired_len - 1, 1):
                    break
                instructions.append(
                    MissionInstruction(
                        id=f"I{inst_idx}",
                        kind="deliver_target",
                        description=f"Cycle {cycle + 1}: deliver to {tgt.id} in order.",
                        target_id=tgt.id,
                    )
                )
                inst_idx += 1
            if len(instructions) >= max(desired_len - 1, 1):
                break
            # Alternate the requested tool per cycle. NOTE: `tgt` is the last
            # target from the loop above, so the tool call is scoped to it.
            tool = "request_intel" if (cycle % 2 == 0) else "battery_forecast"
            instructions.append(
                MissionInstruction(
                    id=f"I{inst_idx}",
                    kind="tool_call",
                    description=f"Call {tool} after servicing {tgt.id}.",
                    target_id=tgt.id,
                    tool_name=tool,
                )
            )
            inst_idx += 1
            cycle += 1
        instructions.append(
            MissionInstruction(
                id=f"I{inst_idx}",
                kind="return_base",
                description="Return to base only after all deliveries are completed.",
            )
        )
        self.instructions = instructions
def _current_instruction(self) -> Optional[MissionInstruction]:
if self._instruction_cursor >= len(self.instructions):
return None
return self.instructions[self._instruction_cursor]
def _instruction_snapshot(self, include_full: bool = False) -> dict[str, Any]:
total = len(self.instructions)
completed = min(self._instruction_cursor, total)
next_instruction = self._current_instruction()
out: dict[str, Any] = {
"enabled": self.cfg.instruction_mode,
"total": total,
"completed": completed,
"remaining": max(total - completed, 0),
"progress": (completed / total) if total > 0 else 1.0,
"violations": self._instruction_violations,
"next_instruction": next_instruction.to_dict() if next_instruction else None,
}
if include_full:
out["instructions"] = [inst.to_dict() for inst in self.instructions]
return out
def _complete_current_instruction(self) -> None:
inst = self._current_instruction()
if inst is None:
return
inst.completed = True
self._instruction_cursor += 1
self._instruction_progress_reward += self.cfg.instruction_completion_reward
def _record_instruction_violation(self) -> None:
self._instruction_violations += 1
inst = self._current_instruction()
if inst is not None:
inst.violated = True
def _tool_matches_instruction(self, tool_call: str, inst: MissionInstruction) -> bool:
base, _, arg = tool_call.partition(":")
if base != inst.tool_name:
return False
if inst.target_id and arg and arg != inst.target_id:
return False
return True
    def _update_instruction_progress(
        self,
        delivered_ids: list[str],
        reached_base: bool,
        tool_call: str,
    ) -> None:
        """Advance the instruction cursor based on this step's outcomes.

        A delivery to any target other than the one currently required counts
        as a violation. The loop then consumes as many consecutive
        instructions as this step satisfies: multiple ``deliver_target``
        entries can complete at once, while ``tool_call`` and ``return_base``
        complete at most one each.
        """
        if not self.cfg.instruction_mode or not self.instructions:
            return
        # Out-of-order delivery check against the instruction that was active
        # *before* any cursor advancement.
        inst = self._current_instruction()
        if inst and inst.kind == "deliver_target":
            for tid in delivered_ids:
                if tid != inst.target_id:
                    self._record_instruction_violation()
        while True:
            inst = self._current_instruction()
            if inst is None:
                break  # program finished
            if inst.kind == "deliver_target":
                if inst.target_id in delivered_ids:
                    self._complete_current_instruction()
                    continue  # the next instruction may also be satisfied
                break
            if inst.kind == "tool_call":
                if not tool_call:
                    break  # no tool used this step; instruction stays pending
                if self._tool_matches_instruction(tool_call, inst):
                    self._complete_current_instruction()
                else:
                    # A tool was called, but the wrong one for this slot.
                    self._record_instruction_violation()
                break  # at most one tool instruction resolves per step
            if inst.kind == "return_base":
                if reached_base and self._all_delivered():
                    self._complete_current_instruction()
                break
            break  # unknown instruction kind: stall rather than crash
    def _execute_tool_call(self, tool_call: str) -> tuple[str, dict[str, Any]]:
        """Run one agent tool call of the form ``"name"`` or ``"name:arg"``.

        The call is lowercased/normalised, appended to ``_tool_history``, and
        its result cached in ``_last_tool_result`` (surfaced in the next
        observation). Returns ``(normalized_call, result_dict)``. Unknown
        tools yield ``{"ok": False, ...}`` rather than raising.
        """
        raw = tool_call.strip().lower()
        if not raw:
            return "", {}
        tool_name, _, arg = raw.partition(":")
        normalized_call = f"{tool_name}:{arg}" if arg else tool_name
        if tool_name not in self.cfg.available_tools:
            result = {"ok": False, "error": f"unsupported_tool:{tool_name}"}
            self._tool_history.append(normalized_call)
            self._last_tool_result = result
            return normalized_call, result
        if tool_name == "request_intel":
            # Prefer the responder linked to the requested target; otherwise
            # fall back to the first active responder.
            responder = None
            if arg:
                responder = next(
                    (r for r in self.responders if r.active and r.linked_target_id.lower() == arg.lower()),
                    None,
                )
            if responder is None:
                responder = next((r for r in self.responders if r.active), None)
            if responder is None:
                result = {"ok": True, "intel": "none", "message": "no_active_responders"}
            else:
                result = {
                    "ok": True,
                    "intel": responder.latest_intel,
                    "intel_severity": round(responder.intel_severity, 3),
                    "responder_id": responder.id,
                    "target_id": responder.linked_target_id,
                    "message": responder.message,
                }
        elif tool_name == "battery_forecast":
            # Optimistic range estimate: level-flight drain only (ignores
            # climb and idle costs); guard against a zero drain coefficient.
            burn = max(self.cfg.drain_per_meter, 1e-6)
            est_range = self.drone.battery / burn
            result = {
                "ok": True,
                "battery": round(self.drone.battery, 3),
                "estimated_range_m": round(est_range, 1),
            }
        else:  # mission_report
            result = {
                "ok": True,
                "delivered": [t.id for t in self.targets if t.delivered],
                "remaining": [t.id for t in self.targets if not t.delivered],
                "instruction_progress": round(self._instruction_snapshot()["progress"], 3),
                "violations": self._instruction_violations,
            }
        self._tool_history.append(normalized_call)
        self._last_tool_result = result
        return normalized_call, result
# ------------------------------------------------------------------
# Coordinate conversion
# ------------------------------------------------------------------
def local_to_latlon(self, vec: Vec3) -> tuple[float, float, float]:
"""Convert local (x, y, z) metres to (lat, lon, alt).
Uses a flat-earth approximation centred on ``cfg.origin_lat/lon``.
Accurate enough for small areas (~tens of km) and Cesium plotting.
"""
meters_per_deg_lat = 111_320.0
meters_per_deg_lon = 111_320.0 * math.cos(math.radians(self.cfg.origin_lat))
lat = self.cfg.origin_lat + vec.y / meters_per_deg_lat
lon = self.cfg.origin_lon + vec.x / meters_per_deg_lon
alt = vec.z
return (round(lat, 7), round(lon, 7), round(alt, 2))
# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------
def _compute_battery_drain(self, dist: float, elevation_change: float) -> float:
return (
dist * self.cfg.drain_per_meter
+ elevation_change * self.cfg.drain_elevation_factor
+ self.cfg.drain_idle_per_step
)
def _check_collisions(self) -> bool:
for obs in self.obstacles:
if obs.contains(self.drone.position):
return True
for cyl in self.cylinders:
if cyl.contains(self.drone.position):
return True
return False
def _check_hazards(self) -> tuple[bool, float]:
max_sev = 0.0
in_hazard = False
for h in self.hazards:
df = h.danger_factor(self.drone.position)
if df > 0.0:
in_hazard = True
max_sev = max(max_sev, df)
return in_hazard, max_sev
def _deliver_targets(self) -> list[str]:
"""Cylindrical delivery check — drone must be within horizontal radius
and above the target (within a generous altitude window for drops)."""
delivered: list[str] = []
for t in self.targets:
if t.delivered:
continue
dx = self.drone.position.x - t.position.x
dy = self.drone.position.y - t.position.y
horiz_dist = (dx * dx + dy * dy) ** 0.5
alt_above = self.drone.position.z - t.position.z
if horiz_dist <= t.delivery_radius and -10.0 <= alt_above <= t.delivery_radius * 2:
t.delivered = True
delivered.append(t.id)
return delivered
def _all_delivered(self) -> bool:
return all(t.delivered for t in self.targets)
def _is_success(self) -> bool:
hdist = ((self.drone.position.x - self.base.position.x) ** 2
+ (self.drone.position.y - self.base.position.y) ** 2) ** 0.5
return self._all_delivered() and hdist <= self.base.recharge_radius
def _nearest_target_dist(self) -> float:
"""Horizontal distance to closest undelivered target, or to base if all done."""
dists = [
((self.drone.position.x - t.position.x) ** 2
+ (self.drone.position.y - t.position.y) ** 2) ** 0.5
for t in self.targets
if not t.delivered
]
if not dists:
return ((self.drone.position.x - self.base.position.x) ** 2
+ (self.drone.position.y - self.base.position.y) ** 2) ** 0.5
return min(dists)
def _tick_responders(self) -> None:
    """Process scheduled responder events that fire on the current step.

    Three event types are handled:

    * ``urgency_update`` — rescales the linked target's urgency (clamped
      to [0.1, 1.0]) and derives the responder status from it.
    * ``dropzone_relocation`` — shifts the linked target (and responder)
      by the event's (dx, dy), clamped 50 m inside the world bounds, then
      re-baselines the distance-shaping reference.
    * ``hazard_intel`` — stores new intel text/severity on the responder.

    Each event fires at most once (``ev.fired``) and only on its
    scheduled step.
    """
    for r in self.responders:
        if not r.active:
            continue
        for ev in r.scheduled_events:
            # Skip events already consumed or not scheduled for this step.
            if ev.fired or ev.step != self.step_count:
                continue
            ev.fired = True
            etype = ev.event_type
            if etype == "urgency_update":
                tgt = self._find_target(r.linked_target_id)
                if tgt and not tgt.delivered:
                    # Clamp into [0.1, 1.0]; a missing payload keeps the old urgency.
                    tgt.urgency = max(0.1, min(1.0, ev.payload.get("new_urgency", tgt.urgency)))
                    r.status = "critical" if tgt.urgency >= 0.9 else "urgent" if tgt.urgency >= 0.6 else "stable"
                    r.message = f"urgency->{tgt.urgency:.1f}"
            elif etype == "dropzone_relocation":
                tgt = self._find_target(r.linked_target_id)
                if tgt and not tgt.delivered and r.can_update_dropzone:
                    dx = ev.payload.get("dx", 0.0)
                    dy = ev.payload.get("dy", 0.0)
                    # Keep the relocated dropzone at least 50 m inside the world edge.
                    tgt.position.x = max(50, min(self.cfg.world_x - 50, tgt.position.x + dx))
                    tgt.position.y = max(50, min(self.cfg.world_y - 50, tgt.position.y + dy))
                    r.position = Vec3(tgt.position.x, tgt.position.y, 0.0)
                    r.message = f"dropzone moved ({dx:+.0f},{dy:+.0f})"
                    # Re-baseline distance shaping so the teleported target does
                    # not produce a spurious shaping reward or penalty next step.
                    self._prev_nearest_dist = self._nearest_target_dist()
            elif etype == "hazard_intel":
                r.latest_intel = ev.payload.get("intel", "none")
                r.intel_severity = ev.payload.get("severity", 0.5)
                r.message = f"intel: {r.latest_intel}"
def _find_target(self, tid: str) -> Optional[DeliveryTarget]:
for t in self.targets:
if t.id == tid:
return t
return None
def _obstacle_proximity_penalty(self) -> float:
"""Graduated penalty for flying close to any obstacle surface."""
min_dist = float("inf")
pos = self.drone.position
for obs in self.obstacles:
d = obs.nearest_surface_dist(pos)
if d < min_dist:
min_dist = d
for cyl in self.cylinders:
d = cyl.nearest_surface_dist(pos)
if d < min_dist:
min_dist = d
if min_dist >= self.cfg.obstacle_proximity_radius:
return 0.0
factor = 1.0 - min_dist / self.cfg.obstacle_proximity_radius
return self.cfg.obstacle_proximity_penalty * factor * factor
def _compute_reward(self, info: StepInfo) -> tuple[float, dict[str, float]]:
    """Dense shaped reward for one environment step.

    Returns ``(total, breakdown)`` where ``breakdown`` maps each component
    name to its signed contribution and ends with a ``"total"`` entry.
    Delegates to the sparse variant when instruction mode runs with
    sparse rewards enabled.

    Side effects: consumes (zeroes) ``self._instruction_progress_reward``
    and refreshes ``self._prev_nearest_dist`` for next step's shaping.
    """
    if self.cfg.instruction_mode and self.cfg.sparse_reward_mode:
        return self._compute_sparse_instruction_reward(info)
    bd: dict[str, float] = {}
    total = 0.0
    # per-step cost of time
    bd["step_penalty"] = -self.cfg.step_penalty
    total += bd["step_penalty"]
    # battery usage cost (proportional to energy spent)
    bd["battery_cost"] = -(
        info.distance_traveled * self.cfg.drain_per_meter * self.cfg.battery_cost_factor
    )
    total += bd["battery_cost"]
    # one-shot bonus accrued by the instruction tracker since the last step;
    # always reset so it cannot be paid twice
    if self._instruction_progress_reward > 0.0:
        bd["instruction_progress"] = self._instruction_progress_reward
        total += bd["instruction_progress"]
    self._instruction_progress_reward = 0.0
    # delivery rewards (scaled by urgency) + progress bonus
    for tid in info.delivered_target_ids:
        tgt = next(t for t in self.targets if t.id == tid)
        r = self.cfg.delivery_reward * (1.0 + tgt.urgency)
        bd[f"delivery_{tid}"] = r
        total += r
    if info.delivered_target_ids:
        # bonus grows with the fraction of targets completed so far
        n_remaining = sum(1 for t in self.targets if not t.delivered)
        progress_bonus = 50.0 * (1.0 - n_remaining / len(self.targets))
        bd["progress_bonus"] = progress_bonus
        total += progress_bonus
    # collision
    if info.collision:
        bd["collision"] = -self.cfg.collision_penalty
        total += bd["collision"]
    # hazard exposure (severity-weighted)
    if info.in_hazard:
        bd["hazard"] = -self.cfg.hazard_penalty * info.hazard_severity
        total += bd["hazard"]
    # safe return bonus
    if info.reached_base and self._all_delivered():
        bd["return_bonus"] = self.cfg.return_bonus
        total += bd["return_bonus"]
    # distance shaping — nudge toward nearest undelivered target (or base)
    # Skip shaping on delivery steps to avoid a huge negative spike
    # when the nearest-target reference jumps to a farther target.
    # Double the factor when heading home after all deliveries.
    curr_dist = self._nearest_target_dist()
    if info.delivered_target_ids:
        bd["distance_shaping"] = 0.0
        self._prev_nearest_dist = curr_dist
    else:
        factor = self.cfg.distance_shaping_factor
        if self._all_delivered():
            factor *= 2.0
        shaping = (self._prev_nearest_dist - curr_dist) * factor
        bd["distance_shaping"] = shaping
        total += shaping
        self._prev_nearest_dist = curr_dist
    # obstacle proximity (graduated — discourages flying close)
    prox = self._obstacle_proximity_penalty()
    if prox > 0:
        bd["obstacle_proximity"] = -prox
        total -= prox
    # failure (battery depletion; collision already penalised above)
    if self.drone.battery <= 0.0 and not info.collision:
        bd["failure"] = -self.cfg.failure_penalty
        total += bd["failure"]
    bd["total"] = total
    return total, bd
def _compute_sparse_instruction_reward(self, info: StepInfo) -> tuple[float, dict[str, float]]:
    """Sparse reward variant for instruction-following episodes.

    Per-step signal is deliberately tiny (quarter step penalty, reduced
    hazard cost, instruction-progress pulses); the bulk of the reward is
    paid once at a terminal step — collision, empty battery, success, or
    the step limit — based on instruction progress, success/failure,
    unfinished instructions, and accumulated violations.
    Returns ``(total, breakdown)`` like ``_compute_reward``.
    """
    bd: dict[str, float] = {}
    total = 0.0
    # Keep shaping intentionally small in sparse mode.
    bd["step_penalty"] = -(self.cfg.step_penalty * 0.25)
    total += bd["step_penalty"]
    # one-shot instruction bonus; always reset so it cannot be paid twice
    if self._instruction_progress_reward > 0.0:
        bd["instruction_progress"] = self._instruction_progress_reward
        total += bd["instruction_progress"]
    self._instruction_progress_reward = 0.0
    if info.in_hazard:
        bd["hazard"] = -(self.cfg.hazard_penalty * 0.2 * info.hazard_severity)
        total += bd["hazard"]
    # NOTE(review): terminal detection mirrors the episode-end conditions
    # (collision / dead battery / success / step limit) — confirm against step().
    terminal = (
        info.collision
        or self.drone.battery <= 0.0
        or self._is_success()
        or self.step_count >= self.cfg.max_episode_steps
    )
    if terminal:
        total_instr = len(self.instructions)
        # fraction of the instruction script completed (1.0 when empty)
        progress = (self._instruction_cursor / total_instr) if total_instr > 0 else 1.0
        bd["terminal_progress"] = self.cfg.instruction_terminal_progress_bonus * progress
        total += bd["terminal_progress"]
        if self._is_success():
            bd["terminal_success"] = self.cfg.instruction_terminal_success_bonus
            total += bd["terminal_success"]
        else:
            bd["terminal_failure"] = -self.cfg.failure_penalty
            total += bd["terminal_failure"]
            # extra charge per instruction left unfinished at failure
            remaining = max(total_instr - self._instruction_cursor, 0)
            if remaining > 0:
                bd["unfinished_penalty"] = -remaining * self.cfg.instruction_unfinished_penalty
                total += bd["unfinished_penalty"]
        if self._instruction_violations > 0:
            bd["instruction_violations"] = (
                -self._instruction_violations * self.cfg.instruction_violation_penalty
            )
            total += bd["instruction_violations"]
        if info.collision:
            bd["collision"] = -self.cfg.collision_penalty
            total += bd["collision"]
    bd["total"] = total
    return total, bd