atin5551 committed on
Commit
cb70a7d
·
1 Parent(s): 7e43148

Deploy Varaha OpenEnv Docker Space

Browse files
openenv_wrapper/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Varaha OpenEnv package — public API re-exports."""
2
+
3
+ from openenv_wrapper.models import VarahaAction, VarahaObservation, VarahaState
4
+ from openenv_wrapper.varaha_environment import VarahaEnvironment
5
+ from openenv_wrapper.client import VarahaEnvClient
6
+
7
+ __all__ = [
8
+ "VarahaAction",
9
+ "VarahaObservation",
10
+ "VarahaState",
11
+ "VarahaEnvironment",
12
+ "VarahaEnvClient",
13
+ ]
openenv_wrapper/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (514 Bytes). View file
 
openenv_wrapper/__pycache__/client.cpython-313.pyc ADDED
Binary file (1.97 kB). View file
 
openenv_wrapper/__pycache__/models.cpython-313.pyc ADDED
Binary file (4.21 kB). View file
 
openenv_wrapper/__pycache__/varaha_environment.cpython-313.pyc ADDED
Binary file (6.77 kB). View file
 
openenv_wrapper/client.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """WebSocket client for the Varaha OpenEnv server."""
2
+
3
+ from typing import Any, Dict
4
+
5
+ from openenv.core.env_client import EnvClient
6
+ from openenv.core.client_types import StepResult
7
+
8
+ from openenv_wrapper.models import VarahaAction, VarahaObservation, VarahaState
9
+
10
+
11
class VarahaEnvClient(EnvClient[VarahaAction, VarahaObservation, VarahaState]):
    """Typed client that speaks to a running Varaha OpenEnv server."""

    def _step_payload(self, action: VarahaAction) -> Dict[str, Any]:
        # Serialise the action for transport, dropping the metadata field.
        return action.model_dump(exclude={"metadata"})

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[VarahaObservation]:
        # The server may nest the observation under "observation" or "data";
        # failing both, the payload itself is treated as the observation dict.
        if "observation" in payload:
            obs_data = payload["observation"]
        elif "data" in payload:
            obs_data = payload["data"]
        else:
            obs_data = payload
        observation = VarahaObservation(**obs_data)
        # Prefer top-level reward/done; fall back to the observation's copies.
        reward = payload.get("reward", observation.reward)
        done = payload.get("done", observation.done)
        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: Dict[str, Any]) -> VarahaState:
        # The state endpoint returns a flat dict matching VarahaState's fields.
        return VarahaState(**payload)
openenv_wrapper/models.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic models for the Varaha OpenEnv environment."""
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from pydantic import Field
6
+ from openenv.core.env_server.types import Action, Observation, State
7
+
8
+
9
class VarahaAction(Action):
    """Drone acceleration command with automatic delivery/recharge.

    Field values are forwarded verbatim to the underlying simulation's
    ``step()`` as a plain dict (see ``VarahaEnvironment.step``).
    """

    ax: float = Field(0.0, description="Desired acceleration along x-axis (m/s^2)")
    ay: float = Field(0.0, description="Desired acceleration along y-axis (m/s^2)")
    az: float = Field(0.0, description="Desired acceleration along z-axis (m/s^2)")
    # deliver/recharge default to True so delivery/recharge is attempted
    # automatically whenever the drone is in range.
    deliver: bool = Field(True, description="Attempt delivery when near a target")
    recharge: bool = Field(True, description="Attempt recharge when near base station")
    tool_call: str = Field(
        "",
        description="Optional tool call: request_intel[:target_id] | battery_forecast | mission_report",
    )
21
+
22
+
23
class VarahaObservation(Observation):
    """Full observation returned after each step/reset.

    Populated by ``VarahaEnvironment._build_observation`` from the
    simulation's observation dict; ``reward``/``done``/``metadata`` come
    from the ``Observation`` base class.
    """

    drone_position: Dict[str, float] = Field(
        default_factory=dict, description="Drone {x, y, z} in local metres"
    )
    drone_velocity: Dict[str, float] = Field(
        default_factory=dict, description="Drone velocity {x, y, z} in m/s"
    )
    battery: float = Field(0.0, description="Remaining battery units")
    carrying_payload: bool = Field(True, description="Whether the drone still carries payload")
    alive: bool = Field(True, description="Whether the drone is still operational")
    targets: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Per-target relative position, urgency, delivered status",
    )
    hazards: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Per-hazard relative position, current height, severity",
    )
    step_num: int = Field(0, description="Current step number in the episode")
    max_steps: int = Field(2000, description="Maximum allowed steps")
    reward_breakdown: Dict[str, float] = Field(
        default_factory=dict, description="Itemised reward components from the last step"
    )
    mission: Dict[str, Any] = Field(
        default_factory=dict,
        description="Instruction-mode progress, next instruction, and violation counters",
    )
    last_tool_result: Dict[str, Any] = Field(
        default_factory=dict,
        description="Result payload from the most recent tool call",
    )
    success: bool = Field(False, description="Whether the mission is successfully completed")
    # Only filled in by the wrapper when done=True (see _build_observation).
    trace: Optional[Dict[str, Any]] = Field(
        None, description="Full episode trace (only populated on the final step)"
    )
60
+
61
+
62
class VarahaState(State):
    """Internal environment state exposed via the state property.

    Built fresh on each access by ``VarahaEnvironment.state``; the
    ``episode_id`` and ``step_count`` fields it is constructed with come
    from the ``State`` base class.
    """

    cumulative_reward: float = Field(0.0, description="Total accumulated reward")
    deliveries_completed: int = Field(0, description="Number of targets delivered so far")
    total_targets: int = Field(0, description="Total number of targets in the episode")
    battery: float = Field(0.0, description="Current battery level")
    success: bool = Field(False, description="Whether the mission is complete")
openenv_wrapper/server/__init__.py ADDED
File without changes
openenv_wrapper/server/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (145 Bytes). View file
 
openenv_wrapper/server/__pycache__/app.cpython-313.pyc ADDED
Binary file (786 Bytes). View file
 
openenv_wrapper/server/app.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FastAPI application for the Varaha OpenEnv environment."""

import sys
import os

# Make the repository root importable so sibling top-level modules
# (e.g. varaha_env.py, imported by varaha_environment) resolve when this
# module is loaded as an ASGI target from inside the package.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from openenv.core.env_server import create_app

from openenv_wrapper.models import VarahaAction, VarahaObservation
from openenv_wrapper.varaha_environment import VarahaEnvironment

# ASGI entry point: create_app wires VarahaEnvironment behind the standard
# OpenEnv HTTP routes with typed action/observation (de)serialisation.
app = create_app(
    VarahaEnvironment,
    VarahaAction,
    VarahaObservation,
    env_name="varaha",
)
openenv_wrapper/varaha_environment.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenEnv-compatible Varaha wildfire drone environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ import os
7
+ import uuid
8
+ from typing import Any, Callable, Optional
9
+
10
+ from openenv.core.env_server.interfaces import Environment
11
+ from openenv.core.env_server.types import EnvironmentMetadata
12
+
13
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
14
+
15
+ from varaha_env import VarahaConfig, VarahaEnv, build_random_world
16
+ from openenv_wrapper.models import VarahaAction, VarahaObservation, VarahaState
17
+
18
+
19
class VarahaEnvironment(Environment[VarahaAction, VarahaObservation, VarahaState]):
    """Wildfire logistics drone environment wrapped for OpenEnv.

    Each episode the drone must deliver supplies to responder zones near
    wildfire hazards, then return to base. Supports domain-randomised
    worlds when ``world_fn`` is provided.
    """

    def __init__(
        self,
        config: Optional[VarahaConfig] = None,
        world_fn: Optional[Callable[..., None]] = None,
    ) -> None:
        """Create the wrapped simulation.

        Args:
            config: Tuning parameters; defaults to a fresh VarahaConfig.
            world_fn: Optional world-builder called by VarahaEnv (e.g.
                build_random_world) for domain randomisation.
        """
        super().__init__()
        self._config = config or VarahaConfig()
        self._world_fn = world_fn
        self._env = VarahaEnv(config=self._config, world_fn=self._world_fn)
        self._episode_id = str(uuid.uuid4())
        # Raw info dict from the most recent step; cleared on reset.
        self._last_info: dict[str, Any] = {}

    # ------------------------------------------------------------------
    # OpenEnv abstract interface
    # ------------------------------------------------------------------

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> VarahaObservation:
        """Start a new episode and return the initial observation."""
        # Generate an id if the caller did not provide one.
        self._episode_id = episode_id or str(uuid.uuid4())
        obs_dict = self._env.reset(seed=seed)
        self._last_info = {}
        return self._build_observation(obs_dict, reward=0.0, done=False)

    def step(
        self,
        action: VarahaAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> VarahaObservation:
        """Apply one action and return the resulting observation.

        Note: ``timeout_s`` is accepted for interface compatibility but
        not used by the synchronous simulation.
        """
        # Flatten the typed action into the plain dict VarahaEnv expects.
        action_dict = {
            "ax": action.ax,
            "ay": action.ay,
            "az": action.az,
            "deliver": action.deliver,
            "recharge": action.recharge,
            "tool_call": action.tool_call,
        }
        obs_dict, reward, done, info = self._env.step(action_dict)
        self._last_info = info
        return self._build_observation(obs_dict, reward=reward, done=done, info=info)

    @property
    def state(self) -> VarahaState:
        """Snapshot of episode progress, rebuilt on every access."""
        delivered = sum(1 for t in self._env.targets if t.delivered)
        return VarahaState(
            episode_id=self._episode_id,
            step_count=self._env.step_count,
            cumulative_reward=round(self._env.cumulative_reward, 4),
            deliveries_completed=delivered,
            total_targets=len(self._env.targets),
            battery=round(self._env.drone.battery, 4),
            success=self._env._is_success(),
        )

    # ------------------------------------------------------------------
    # Optional overrides
    # ------------------------------------------------------------------

    def get_metadata(self) -> EnvironmentMetadata:
        """Static descriptive metadata served to clients."""
        return EnvironmentMetadata(
            name="Varaha Wildfire Logistics",
            description=(
                "A 3D drone delivery environment where an agent must navigate "
                "wildfire hazards and obstacles to deliver supplies to responder "
                "zones, then return to base."
            ),
            version="1.0.0",
            author="Varaha Team",
        )

    def close(self) -> None:
        # The simulation holds no external resources; nothing to release.
        pass

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _build_observation(
        self,
        obs_dict: dict[str, Any],
        *,
        reward: float,
        done: bool,
        info: dict[str, Any] | None = None,
    ) -> VarahaObservation:
        """Convert a raw simulation observation dict into the pydantic model."""
        info = info or {}
        # The full trajectory trace is expensive; attach it only on the
        # terminal step.
        trace = self._env.get_trace() if done else None
        return VarahaObservation(
            done=done,
            reward=round(reward, 4),
            metadata={"info": info},
            drone_position=obs_dict["drone_position"],
            drone_velocity=obs_dict["drone_velocity"],
            battery=obs_dict["battery"],
            carrying_payload=obs_dict["carrying_payload"],
            alive=obs_dict["alive"],
            targets=obs_dict["targets"],
            hazards=obs_dict.get("hazards", []),
            mission=obs_dict.get("mission", {}),
            last_tool_result=obs_dict.get("last_tool_result", {}),
            step_num=obs_dict["step"],
            max_steps=obs_dict["max_steps"],
            reward_breakdown=info.get("reward_breakdown", {}),
            success=self._env._is_success(),
            trace=trace,
        )
sim_types.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Varaha simulation types — core data structures for the wildfire logistics environment."""
2
+
3
+ import math
4
+ from dataclasses import dataclass, field
5
+ from typing import Any
6
+
7
+
8
+ # ---------------------------------------------------------------------------
9
+ # Vec3
10
+ # ---------------------------------------------------------------------------
11
+
12
@dataclass
class Vec3:
    """Minimal 3-component vector with the arithmetic the simulation needs."""

    x: float = 0.0
    y: float = 0.0
    z: float = 0.0

    # --- arithmetic ---

    def __add__(self, other: "Vec3") -> "Vec3":
        return Vec3(self.x + other.x, self.y + other.y, self.z + other.z)

    def __sub__(self, other: "Vec3") -> "Vec3":
        return Vec3(self.x - other.x, self.y - other.y, self.z - other.z)

    def scale(self, s: float) -> "Vec3":
        """Return this vector multiplied component-wise by scalar *s*."""
        return Vec3(s * self.x, s * self.y, s * self.z)

    # --- magnitude ---

    def norm(self) -> float:
        """Euclidean length of the vector."""
        return math.sqrt(sum(c ** 2 for c in (self.x, self.y, self.z)))

    def normalized(self) -> "Vec3":
        """Unit vector in this direction; the zero vector for tiny magnitudes."""
        magnitude = self.norm()
        if magnitude < 1e-9:
            return Vec3(0.0, 0.0, 0.0)
        return self.scale(1.0 / magnitude)

    def clamp_magnitude(self, max_mag: float) -> "Vec3":
        """Return a copy whose length is capped at *max_mag*."""
        magnitude = self.norm()
        needs_clamp = magnitude > max_mag and magnitude > 1e-9
        if needs_clamp:
            return self.scale(max_mag / magnitude)
        return Vec3(self.x, self.y, self.z)

    # --- distance ---

    def distance_to(self, other: "Vec3") -> float:
        delta = self - other
        return delta.norm()

    def horizontal_distance_to(self, other: "Vec3") -> float:
        """Distance in the x-y plane only (altitude ignored)."""
        dx = self.x - other.x
        dy = self.y - other.y
        return math.sqrt(dx * dx + dy * dy)

    # --- serialization ---

    def to_dict(self) -> dict[str, float]:
        # Round for compact, stable serialised output.
        return {axis: round(value, 4)
                for axis, value in (("x", self.x), ("y", self.y), ("z", self.z))}

    def __repr__(self) -> str:
        return "Vec3({:.2f}, {:.2f}, {:.2f})".format(self.x, self.y, self.z)
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Drone
69
+ # ---------------------------------------------------------------------------
70
+
71
@dataclass
class DroneState:
    """Full kinematic + status state of the drone."""

    position: Vec3 = field(default_factory=Vec3)
    velocity: Vec3 = field(default_factory=Vec3)
    battery: float = 100.0
    carrying_payload: bool = True
    alive: bool = True

    def to_dict(self) -> dict[str, Any]:
        """Serialise the drone state; floats rounded for compact output."""
        return dict(
            position=self.position.to_dict(),
            velocity=self.velocity.to_dict(),
            battery=round(self.battery, 4),
            carrying_payload=self.carrying_payload,
            alive=self.alive,
        )
89
+
90
+
91
+ # ---------------------------------------------------------------------------
92
+ # World entities
93
+ # ---------------------------------------------------------------------------
94
+
95
@dataclass
class BaseStation:
    """Home base where the drone launches, lands, and recharges."""

    position: Vec3 = field(default_factory=Vec3)
    # Horizontal distance within which recharging is possible.
    recharge_radius: float = 20.0

    def to_dict(self) -> dict[str, Any]:
        """Serialise the station for observations/traces."""
        return dict(
            position=self.position.to_dict(),
            recharge_radius=self.recharge_radius,
        )
107
+
108
+
109
@dataclass
class DeliveryTarget:
    """A responder zone requiring supply delivery."""

    id: str = ""
    position: Vec3 = field(default_factory=Vec3)
    urgency: float = 0.5
    delivered: bool = False
    # Radius within which a delivery attempt counts.
    delivery_radius: float = 15.0

    def to_dict(self) -> dict[str, Any]:
        """Serialise the target; urgency rounded for compact output."""
        return dict(
            id=self.id,
            position=self.position.to_dict(),
            urgency=round(self.urgency, 4),
            delivered=self.delivered,
            delivery_radius=self.delivery_radius,
        )
127
+
128
+
129
@dataclass
class HazardRegion:
    """Wildfire danger zone modeled as a ground-level dome.

    Danger is nonzero only inside a cylinder of ``radius`` around the
    center that extends ``_current_height`` metres above it — drones can
    overfly a fire at sufficient altitude. Inside the dome, danger grows
    toward the ground-level center both horizontally and vertically.

    ``growth_rate`` is the per-step height increase (metres/step) used to
    simulate fire growth over an episode.
    """

    id: str = ""
    center: Vec3 = field(default_factory=Vec3)
    radius: float = 50.0
    severity: float = 0.5
    height: float = 80.0
    growth_rate: float = 0.0
    # Dynamic height, reinitialised from `height` each episode.
    _current_height: float = field(default=0.0, init=False, repr=False)

    def __post_init__(self):
        self._current_height = self.height

    def reset(self):
        """Reset dynamic state for a new episode."""
        self._current_height = self.height

    def tick(self):
        """Advance one timestep — grow the fire."""
        if self.growth_rate > 0:
            self._current_height = self._current_height + self.growth_rate

    def contains(self, pos: Vec3) -> bool:
        """True when *pos* lies inside the hazard dome."""
        dx = pos.x - self.center.x
        dy = pos.y - self.center.y
        horiz = (dx ** 2 + dy ** 2) ** 0.5
        alt = pos.z - self.center.z
        return horiz <= self.radius and 0 <= alt < self._current_height

    def danger_factor(self, pos: Vec3) -> float:
        """0 outside the dome, scales up toward the ground-level center."""
        dx = pos.x - self.center.x
        dy = pos.y - self.center.y
        horiz = (dx ** 2 + dy ** 2) ** 0.5
        if horiz >= self.radius:
            return 0.0
        alt = pos.z - self.center.z
        if alt >= self._current_height or alt < 0:
            return 0.0
        # Linear falloff in both the radial and vertical directions.
        horiz_factor = 1.0 - horiz / self.radius
        vert_factor = 1.0 - alt / self._current_height
        return self.severity * horiz_factor * vert_factor

    def to_dict(self) -> dict[str, Any]:
        """Serialise both static config and current dynamic height."""
        return dict(
            id=self.id,
            center=self.center.to_dict(),
            radius=self.radius,
            severity=self.severity,
            height=self.height,
            current_height=round(self._current_height, 2),
            growth_rate=self.growth_rate,
        )
189
+
190
+
191
@dataclass
class ObstacleVolume:
    """Axis-aligned 3D box that the drone must not enter."""

    id: str = ""
    min_corner: Vec3 = field(default_factory=Vec3)
    max_corner: Vec3 = field(default_factory=Vec3)
    kind: str = "building"

    def contains(self, pos: Vec3) -> bool:
        """True when *pos* lies inside (or on the faces of) the box."""
        within_x = self.min_corner.x <= pos.x <= self.max_corner.x
        within_y = self.min_corner.y <= pos.y <= self.max_corner.y
        within_z = self.min_corner.z <= pos.z <= self.max_corner.z
        return within_x and within_y and within_z

    @property
    def center(self) -> Vec3:
        """Geometric center of the box."""
        return (self.min_corner + self.max_corner).scale(0.5)

    @property
    def half_size(self) -> Vec3:
        """Half-extent of the box along each axis."""
        return (self.max_corner - self.min_corner).scale(0.5)

    @property
    def height(self) -> float:
        """Top face altitude (box roofs sit at max_corner.z)."""
        return self.max_corner.z

    def nearest_surface_dist(self, pos: Vec3) -> float:
        """Clamped distance from *pos* to the box surface (0.0 when inside)."""
        dx = max(abs(pos.x - self.center.x) - self.half_size.x, 0.0)
        dy = max(abs(pos.y - self.center.y) - self.half_size.y, 0.0)
        # At most one of the two vertical gaps is nonzero for a valid box.
        dz_below = max(self.min_corner.z - pos.z, 0.0)
        dz_above = max(pos.z - self.max_corner.z, 0.0)
        return math.sqrt(dx * dx + dy * dy + (dz_below + dz_above) ** 2)

    def to_dict(self) -> dict[str, Any]:
        """Serialise the box for observations/traces."""
        return dict(
            id=self.id,
            min_corner=self.min_corner.to_dict(),
            max_corner=self.max_corner.to_dict(),
            kind=self.kind,
        )
244
+
245
+
246
@dataclass
class CylindricalObstacle:
    """Vertical cylinder obstacle — trees, poles, pillars, tanks."""

    id: str = ""
    center: Vec3 = field(default_factory=Vec3)
    radius: float = 10.0
    height: float = 50.0
    kind: str = "tree"

    def _horiz_dist(self, pos: Vec3) -> float:
        """Horizontal distance from *pos* to the cylinder's axis."""
        dx = pos.x - self.center.x
        dy = pos.y - self.center.y
        return math.sqrt(dx * dx + dy * dy)

    def contains(self, pos: Vec3) -> bool:
        """True when *pos* is within the cylinder (base at z=0)."""
        return self._horiz_dist(pos) <= self.radius and 0 <= pos.z <= self.height

    def nearest_surface_dist(self, pos: Vec3) -> float:
        """Clamped distance from *pos* to the cylinder surface (0.0 inside)."""
        radial_gap = max(self._horiz_dist(pos) - self.radius, 0.0)
        # Vertical gap above the top, or below ground level, whichever applies.
        vert_gap = max(pos.z - self.height, 0.0) if pos.z > self.height else max(-pos.z, 0.0)
        return math.sqrt(radial_gap ** 2 + vert_gap ** 2)

    def to_dict(self) -> dict[str, Any]:
        """Serialise the cylinder; dimensions rounded for compact output."""
        return dict(
            id=self.id,
            center=self.center.to_dict(),
            radius=round(self.radius, 2),
            height=round(self.height, 2),
            kind=self.kind,
        )
278
+
279
+
280
+ # ---------------------------------------------------------------------------
281
+ # Responder units — dynamic actors that alter mission conditions mid-episode
282
+ # ---------------------------------------------------------------------------
283
+
284
# Status levels a responder can report, with their numeric encoding
# (consumed by ResponderUnit.status_code).
RESPONDER_STATUSES = ("stable", "urgent", "critical")
RESPONDER_STATUS_MAP = {"stable": 0.0, "urgent": 0.5, "critical": 1.0}

# Kinds of hazard intel a responder can broadcast.
INTEL_TYPES = (
    "none",
    "blocked_north", "blocked_south", "blocked_east", "blocked_west",
    "safe_north", "safe_south", "safe_east", "safe_west",
    "fire_expanded", "fire_receded",
)

# Unit (x, y) direction hint per intel type; (0, 0) means non-directional
# intel (consumed by ResponderUnit.intel_direction).
INTEL_DIRECTION_VECS = {
    "none": (0.0, 0.0),
    "blocked_north": (0.0, 1.0), "blocked_south": (0.0, -1.0),
    "blocked_east": (1.0, 0.0), "blocked_west": (-1.0, 0.0),
    "safe_north": (0.0, 1.0), "safe_south": (0.0, -1.0),
    "safe_east": (1.0, 0.0), "safe_west": (-1.0, 0.0),
    "fire_expanded": (0.0, 0.0), "fire_receded": (0.0, 0.0),
}
302
+
303
+
304
@dataclass
class ScheduledEvent:
    """A future event a responder will trigger at a specific step."""

    step: int = 0                 # episode step at which the event fires
    event_type: str = ""          # event discriminator
    payload: dict[str, Any] = field(default_factory=dict)
    fired: bool = False           # set once the event has been applied
311
+
312
+
313
@dataclass
class ResponderUnit:
    """First responder on the ground linked to a delivery target.

    Can dynamically alter mission conditions mid-episode:
    1. Update urgency of their linked target
    2. Relocate the drop-zone (move target position)
    3. Broadcast hazard intel (structured approach guidance)
    """

    id: str = ""
    position: Vec3 = field(default_factory=Vec3)
    linked_target_id: str = ""
    status: str = "stable"
    current_need: str = "supplies"
    message: str = ""
    can_update_dropzone: bool = False
    active: bool = True

    # Most recent broadcast intel and its severity.
    latest_intel: str = "none"
    intel_severity: float = 0.0

    # Pending mid-episode events this responder will fire.
    scheduled_events: list[ScheduledEvent] = field(default_factory=list)

    def status_code(self) -> float:
        """Numeric encoding of status; unknown statuses map to 0.0."""
        try:
            return RESPONDER_STATUS_MAP[self.status]
        except KeyError:
            return 0.0

    def intel_direction(self) -> tuple[float, float]:
        """(x, y) direction hint for the latest intel; (0, 0) if none."""
        try:
            return INTEL_DIRECTION_VECS[self.latest_intel]
        except KeyError:
            return (0.0, 0.0)

    def to_dict(self) -> dict[str, Any]:
        """Serialise responder state (scheduled events are kept internal)."""
        return dict(
            id=self.id,
            position=self.position.to_dict(),
            linked_target_id=self.linked_target_id,
            status=self.status,
            current_need=self.current_need,
            message=self.message,
            can_update_dropzone=self.can_update_dropzone,
            active=self.active,
            latest_intel=self.latest_intel,
            intel_severity=round(self.intel_severity, 4),
        )
356
+
357
+
358
+ # ---------------------------------------------------------------------------
359
+ # Observation & step diagnostics
360
+ # ---------------------------------------------------------------------------
361
+
362
@dataclass
class VarahaObservation:
    """Structured observation returned to the agent each step.

    Kept as a dataclass for documentation; the env also offers a plain-dict
    path via ``get_observation()`` for maximum serialisation flexibility.
    """

    drone_position: Vec3 = field(default_factory=Vec3)
    drone_velocity: Vec3 = field(default_factory=Vec3)
    battery: float = 100.0
    carrying_payload: bool = True
    alive: bool = True
    targets: list[dict[str, Any]] = field(default_factory=list)
    step: int = 0
    max_steps: int = 500

    def to_dict(self) -> dict[str, Any]:
        """Flatten to the plain dict shape the wrapper/server expect."""
        return dict(
            drone_position=self.drone_position.to_dict(),
            drone_velocity=self.drone_velocity.to_dict(),
            battery=round(self.battery, 4),
            carrying_payload=self.carrying_payload,
            alive=self.alive,
            targets=self.targets,
            step=self.step,
            max_steps=self.max_steps,
        )
390
+
391
+
392
@dataclass
class MissionInstruction:
    """Single mission instruction used for long-horizon planning mode."""

    id: str = ""
    kind: str = ""                # instruction category
    description: str = ""         # human/LLM-readable text
    target_id: str = ""           # delivery target this refers to, if any
    tool_name: str = ""           # tool this instruction asks for, if any
    completed: bool = False
    violated: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Serialise the instruction for observations/traces."""
        return dict(
            id=self.id,
            kind=self.kind,
            description=self.description,
            target_id=self.target_id,
            tool_name=self.tool_name,
            completed=self.completed,
            violated=self.violated,
        )
414
+
415
+
416
@dataclass
class TracePoint:
    """Single frame of the drone's recorded trajectory."""

    step: int = 0
    position: Vec3 = field(default_factory=Vec3)
    velocity: Vec3 = field(default_factory=Vec3)
    battery: float = 100.0
    reward: float = 0.0
    cumulative_reward: float = 0.0
    events: list[str] = field(default_factory=list)
    observation: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialise one frame; copies `events` so callers can't mutate it."""
        return dict(
            step=self.step,
            position=self.position.to_dict(),
            velocity=self.velocity.to_dict(),
            battery=round(self.battery, 4),
            reward=round(self.reward, 4),
            cumulative_reward=round(self.cumulative_reward, 4),
            events=list(self.events),
            observation=self.observation,
        )
440
+
441
+
442
@dataclass
class StepInfo:
    """Per-step diagnostic info returned alongside the reward."""

    collision: bool = False
    delivered_target_ids: list[str] = field(default_factory=list)
    in_hazard: bool = False
    hazard_severity: float = 0.0
    reached_base: bool = False
    distance_traveled: float = 0.0
    tool_call: str = ""
    tool_result: dict[str, Any] = field(default_factory=dict)
    instruction_completed: int = 0
    instruction_total: int = 0
    instruction_violations: int = 0
    reward_breakdown: dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialise diagnostics; floats rounded, lists copied defensively."""
        rounded_breakdown = {
            name: round(value, 4) for name, value in self.reward_breakdown.items()
        }
        return dict(
            collision=self.collision,
            delivered_target_ids=list(self.delivered_target_ids),
            in_hazard=self.in_hazard,
            hazard_severity=round(self.hazard_severity, 4),
            reached_base=self.reached_base,
            distance_traveled=round(self.distance_traveled, 4),
            tool_call=self.tool_call,
            tool_result=self.tool_result,
            instruction_completed=self.instruction_completed,
            instruction_total=self.instruction_total,
            instruction_violations=self.instruction_violations,
            reward_breakdown=rounded_breakdown,
        )
varaha_env.py ADDED
@@ -0,0 +1,1323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Varaha — wildfire logistics simulation environment.
2
+
3
+ A drone must deliver supplies to responder zones near wildfire hazards in
4
+ California-like terrain. The environment uses lightweight 3D kinematics with
5
+ local metre-based coordinates and an optional lat/lon conversion helper for
6
+ later Cesium visualisation.
7
+ """
8
+
9
+ import math
10
+ import random
11
+ from dataclasses import dataclass
12
+ from typing import Any, Optional
13
+
14
+ from sim_types import (
15
+ Vec3,
16
+ DroneState,
17
+ BaseStation,
18
+ DeliveryTarget,
19
+ HazardRegion,
20
+ ObstacleVolume,
21
+ CylindricalObstacle,
22
+ ResponderUnit,
23
+ ScheduledEvent,
24
+ RESPONDER_STATUSES,
25
+ INTEL_TYPES,
26
+ StepInfo,
27
+ TracePoint,
28
+ MissionInstruction,
29
+ )
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Configuration
34
+ # ---------------------------------------------------------------------------
35
+
36
@dataclass
class VarahaConfig:
    """All tunable environment parameters live here."""

    # World bounds (metres) — 5 km × 5 km operational area
    world_x: float = 5000.0
    world_y: float = 5000.0
    world_z: float = 200.0

    # Drone physics
    battery_capacity: float = 300.0
    max_speed: float = 25.0  # m/s
    max_acceleration: float = 8.0  # m/s²
    dt: float = 0.5  # seconds per step

    # Episode
    max_episode_steps: int = 2000

    # Battery drain coefficients (tuned for 5 km scale)
    drain_per_meter: float = 0.008
    drain_elevation_factor: float = 0.02
    drain_idle_per_step: float = 0.005
    recharge_rate: float = 5.0  # battery units restored per recharge step

    # Reward knobs
    delivery_reward: float = 200.0
    return_bonus: float = 100.0
    step_penalty: float = 0.05
    battery_cost_factor: float = 0.3
    collision_penalty: float = 500.0
    hazard_penalty: float = 5.0
    failure_penalty: float = 200.0
    distance_shaping_factor: float = 0.05
    obstacle_proximity_penalty: float = 1.5
    obstacle_proximity_radius: float = 80.0

    # Long-horizon instruction mode (LLM-oriented)
    instruction_mode: bool = False
    instruction_count: int = 60
    sparse_reward_mode: bool = False
    instruction_completion_reward: float = 0.5
    instruction_terminal_success_bonus: float = 2200.0
    instruction_terminal_progress_bonus: float = 800.0
    instruction_violation_penalty: float = 120.0
    instruction_unfinished_penalty: float = 10.0
    # Tool names accepted in VarahaAction.tool_call.
    available_tools: tuple[str, ...] = (
        "request_intel",
        "battery_forecast",
        "mission_report",
    )

    # California origin anchor (near Sacramento — wildfire-relevant)
    origin_lat: float = 38.55
    origin_lon: float = -121.47
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # Random world generator for domain randomization
94
+ # ---------------------------------------------------------------------------
95
+
96
def build_random_world(env: "VarahaEnv") -> None:
    """Backward-compatible alias: delegates to the hardcore generator.

    Kept so older callers that expect the original "easy" world builder
    keep working; it now produces the same worlds as build_hardcore_world.
    """
    build_hardcore_world(env)
99
+
100
+
101
def _hdist(a: "Vec3", b: "Vec3") -> float:
    """Return the horizontal (XY-plane) distance between *a* and *b*.

    The z component is deliberately ignored — world-gen spacing rules are
    defined in ground-plane terms. Uses math.hypot, which is clearer and
    numerically more robust than the manual sqrt-of-sum-of-squares form.
    (String annotations match the file's existing "VarahaEnv" forward-ref style.)
    """
    return math.hypot(a.x - b.x, a.y - b.y)
103
+
104
+
105
def build_hardcore_world(env: "VarahaEnv", ultra_hard: bool = False) -> None:
    """Generate an extremely challenging randomized world for serious RL training.

    Features template-based obstacle placement (urban grid, dense forest,
    corridor maze, river valley, fortress, mixed), cylindrical obstacles,
    responder units with dynamic events, and adversarial target placement.

    When ultra_hard=True: denser obstacles, more hazards, more targets, longer episodes.

    Mutates ``env`` in place: sets base, targets, hazards, obstacles,
    cylinders and responders. Uses the module-level ``random`` RNG, so a
    prior ``random.seed(...)`` makes generation reproducible.
    """
    cfg = env.cfg
    # Alias the module-level RNG; every draw below goes through `rng`.
    rng = random

    # NOTE(review): wz is currently unused by the generator — kept for symmetry.
    wx, wy, wz = cfg.world_x, cfg.world_y, cfg.world_z
    margin = 200.0  # keep generated entities this far from the world edges (m)

    def _rpos(z_lo=10.0, z_hi=60.0):
        # Random in-bounds position with altitude drawn from [z_lo, z_hi].
        return Vec3(rng.uniform(margin, wx - margin),
                    rng.uniform(margin, wy - margin),
                    rng.uniform(z_lo, z_hi))

    def _rpos_ground():
        # Random in-bounds position pinned to ground level (z = 0).
        return Vec3(rng.uniform(margin, wx - margin),
                    rng.uniform(margin, wy - margin), 0.0)

    # --- Base station ---
    base_pos = Vec3(rng.uniform(100, wx - 100), rng.uniform(100, wy - 100), 0.0)
    env.base = BaseStation(position=base_pos, recharge_radius=rng.uniform(60, 100))

    # --- Targets (2-5 normal, 3-6 ultra) ---
    if ultra_hard:
        n_targets = rng.choices([3, 4, 5, 6], weights=[0.15, 0.35, 0.35, 0.15])[0]
    else:
        n_targets = rng.choices([2, 3, 4, 5], weights=[0.15, 0.40, 0.30, 0.15])[0]
    targets = []
    for i in range(n_targets):
        # Rejection-sample a position >500 m from base and >400 m from every
        # previously placed target. If all 120 attempts fail, the LAST
        # candidate is used as-is (spacing constraints are best-effort).
        for _ in range(120):
            pos = _rpos(z_lo=5.0, z_hi=60.0)
            if _hdist(pos, base_pos) < 500:
                continue
            if all(_hdist(pos, t.position) > 400 for t in targets):
                break
        targets.append(DeliveryTarget(
            id=f"T{i+1}", position=pos,
            urgency=rng.uniform(0.3, 1.0),
            delivery_radius=rng.uniform(70.0, 130.0),
        ))
    env.targets = targets

    # --- Hazards (3-8 normal, 5-10 ultra) with wild variety ---
    if ultra_hard:
        n_hazards = rng.choices([5, 6, 7, 8, 9, 10], weights=[0.10, 0.20, 0.25, 0.25, 0.15, 0.05])[0]
    else:
        n_hazards = rng.choices([3, 4, 5, 6, 7, 8], weights=[0.10, 0.20, 0.25, 0.25, 0.15, 0.05])[0]
    hazards = []
    for i in range(n_hazards):
        center = _rpos_ground()
        # Each fire archetype trades off radius (r), severity (sev),
        # plume height (ht) and growth rate (gr) differently.
        fire_type = rng.choice(["tiny_intense", "massive_low", "tall_mid", "standard"])
        if fire_type == "tiny_intense":
            r, sev, ht, gr = rng.uniform(80, 200), rng.uniform(0.9, 1.0), rng.uniform(140, 195), rng.uniform(0.012, 0.025)
        elif fire_type == "massive_low":
            r, sev, ht, gr = rng.uniform(500, 1000), rng.uniform(0.3, 0.5), rng.uniform(25, 50), rng.uniform(0.001, 0.004)
        elif fire_type == "tall_mid":
            r, sev, ht, gr = rng.uniform(250, 500), rng.uniform(0.7, 0.95), rng.uniform(100, 180), rng.uniform(0.008, 0.015)
        else:
            r, sev, ht, gr = rng.uniform(200, 600), rng.uniform(0.4, 0.9), rng.uniform(40, 120), rng.uniform(0.003, 0.012)
        hazards.append(HazardRegion(id=f"H{i+1}", center=center,
                                    radius=r, severity=sev, height=ht, growth_rate=gr))
    env.hazards = hazards

    # --- Obstacle templates ---
    obstacles: list[ObstacleVolume] = []
    cylinders: list[CylindricalObstacle] = []
    # Single-element list so the nested helper can mutate the counter.
    oid = [0]

    def _next_oid(prefix="O"):
        # Monotonic id generator shared by boxes ("O…") and cylinders ("C…").
        oid[0] += 1
        return f"{prefix}{oid[0]}"

    def _add_box(cx, cy, w, h, zt, kind="building"):
        # Axis-aligned box centred at (cx, cy), footprint w×h, top at zt.
        obstacles.append(ObstacleVolume(
            id=_next_oid(), kind=kind,
            min_corner=Vec3(cx - w / 2, cy - h / 2, 0.0),
            max_corner=Vec3(cx + w / 2, cy + h / 2, zt),
        ))

    def _add_cyl(cx, cy, radius, height, kind="tree"):
        # Vertical cylinder rooted at ground level.
        cylinders.append(CylindricalObstacle(
            id=_next_oid("C"), kind=kind,
            center=Vec3(cx, cy, 0.0), radius=radius, height=height,
        ))

    # Ultra-hard biases heavily toward the "mixed" template (several
    # patterns layered together); normal mode picks uniformly.
    if ultra_hard:
        template = rng.choices(["urban_grid", "dense_forest", "corridor_maze",
                                "river_valley", "fortress", "mixed"],
                               weights=[0.08, 0.12, 0.12, 0.10, 0.10, 0.48])[0]
    else:
        template = rng.choice(["urban_grid", "dense_forest", "corridor_maze",
                               "river_valley", "fortress", "mixed"])

    # ---- URBAN GRID: rows and columns of buildings ----
    if template == "urban_grid" or template == "mixed":
        ox = rng.uniform(500, 1500)
        oy = rng.uniform(500, 1500)
        rows = rng.randint(2, 5) if ultra_hard else rng.randint(2, 4)
        cols = rng.randint(3, 6) if ultra_hard else rng.randint(3, 5)
        spacing = rng.uniform(300, 550) if ultra_hard else rng.uniform(350, 600)
        for r in range(rows):
            for c in range(cols):
                bx = ox + c * spacing + rng.uniform(-80, 80)
                by = oy + r * spacing + rng.uniform(-80, 80)
                # Skip buildings whose jittered centre leaves the safe area.
                if bx < margin or bx > wx - margin or by < margin or by > wy - margin:
                    continue
                bw = rng.uniform(80, 300)
                bh = rng.uniform(80, 300)
                # Bimodal heights: low-rise or near the altitude ceiling.
                bzt = rng.choice([rng.uniform(30, 60), rng.uniform(100, 195)])
                _add_box(bx, by, bw, bh, bzt)
                # Occasionally attach a slightly shorter "arm" annex.
                if rng.random() < (0.45 if ultra_hard else 0.3):
                    arm_dir = rng.choice(["east", "north"])
                    if arm_dir == "east":
                        _add_box(bx + bw / 2 + 40, by, 80, bh * 0.6, bzt * 0.9)
                    else:
                        _add_box(bx, by + bh / 2 + 40, bw * 0.6, 80, bzt * 0.9)

    # ---- DENSE FOREST: many cylindrical trees ----
    if template == "dense_forest" or template == "mixed":
        forest_cx = rng.uniform(800, wx - 800)
        forest_cy = rng.uniform(800, wy - 800)
        n_trees = rng.randint(25, 60) if ultra_hard else rng.randint(15, 40)
        for _ in range(n_trees):
            # Gaussian scatter around the forest centre, clamped in-bounds.
            tx = forest_cx + rng.gauss(0, 600)
            ty = forest_cy + rng.gauss(0, 600)
            tx = max(margin, min(wx - margin, tx))
            ty = max(margin, min(wy - margin, ty))
            tree_type = rng.choice(["pine", "oak", "palm", "dead"])
            if tree_type == "pine":
                _add_cyl(tx, ty, rng.uniform(8, 20), rng.uniform(40, 100), "tree_pine")
            elif tree_type == "oak":
                _add_cyl(tx, ty, rng.uniform(15, 40), rng.uniform(25, 60), "tree_oak")
            elif tree_type == "palm":
                _add_cyl(tx, ty, rng.uniform(5, 12), rng.uniform(30, 80), "tree_palm")
            else:
                _add_cyl(tx, ty, rng.uniform(10, 25), rng.uniform(20, 50), "tree_dead")

    # ---- CORRIDOR MAZE: parallel walls with gaps ----
    if template == "corridor_maze" or template == "mixed":
        maze_ox = rng.uniform(400, wx / 2)
        maze_oy = rng.uniform(400, wy / 2)
        n_walls = rng.randint(6, 12) if ultra_hard else rng.randint(4, 8)
        wall_dir = rng.choice(["horizontal", "vertical"])
        spacing = rng.uniform(200, 500)
        for w in range(n_walls):
            wl = rng.uniform(400, 1500)
            wt = rng.uniform(40, 80)
            wzt = rng.uniform(100, 195)
            if wall_dir == "horizontal":
                wy_pos = maze_oy + w * spacing
                if wy_pos > wy - margin:
                    continue
                _add_box(maze_ox + wl / 2, wy_pos, wl, wt, wzt, "wall")
                # Zero-height "gap" marker box — filtered out below by the
                # max_corner.z > 1.0 cull, so it never becomes a collider.
                gap_x = maze_ox + rng.uniform(0.2, 0.8) * wl
                _add_box(gap_x, wy_pos, rng.uniform(80, 200), wt, 0, "gap")
            else:
                wx_pos = maze_ox + w * spacing
                if wx_pos > wx - margin:
                    continue
                # NOTE(review): vertical walls get no gap marker, unlike
                # horizontal ones — confirm whether that asymmetry is intended.
                _add_box(wx_pos, maze_oy + wl / 2, wt, wl, wzt, "wall")

    # ---- RIVER VALLEY: chain of low flat boxes + scattered trees ----
    if template == "river_valley" or (template == "mixed" and rng.random() < (0.7 if ultra_hard else 0.5)):
        river_start_x = rng.uniform(margin, wx / 3)
        river_y = rng.uniform(wy * 0.3, wy * 0.7)
        n_segs = rng.randint(10, 18) if ultra_hard else rng.randint(6, 12)
        for seg in range(n_segs):
            seg_x = river_start_x + seg * rng.uniform(200, 400)
            seg_y = river_y + rng.gauss(0, 150)
            if seg_x > wx - margin:
                break
            seg_y = max(margin, min(wy - margin, seg_y))
            # Low (3-10 m) flat boxes approximate the water surface.
            _add_box(seg_x, seg_y, rng.uniform(200, 400), rng.uniform(60, 150),
                     rng.uniform(3, 10), "river")
            # Trees along the banks of each segment.
            for _ in range(rng.randint(2, 6) if ultra_hard else rng.randint(1, 4)):
                bank_offset = rng.choice([-1, 1]) * rng.uniform(100, 300)
                _add_cyl(seg_x + rng.uniform(-100, 100),
                         seg_y + bank_offset,
                         rng.uniform(8, 20), rng.uniform(30, 80), "tree_bank")

    # ---- FORTRESS: walls surrounding a target area ----
    if template == "fortress" or (template == "mixed" and rng.random() < (0.6 if ultra_hard else 0.4)):
        if targets:
            # Ring one random delivery target with four tall walls.
            fort_target = rng.choice(targets)
            ftx, fty = fort_target.position.x, fort_target.position.y
            wall_half = rng.uniform(250, 500)
            wall_zt = rng.uniform(120, 190)
            wall_thick = rng.uniform(50, 80)
            _add_box(ftx, fty - wall_half, wall_half * 2, wall_thick, wall_zt, "fortress_wall")
            _add_box(ftx, fty + wall_half, wall_half * 2, wall_thick, wall_zt, "fortress_wall")
            _add_box(ftx - wall_half, fty, wall_thick, wall_half * 2, wall_zt, "fortress_wall")
            _add_box(ftx + wall_half, fty, wall_thick, wall_half * 2, wall_zt, "fortress_wall")

    # ---- Always scatter some light poles and random pillars ----
    n_poles = rng.randint(6, 18) if ultra_hard else rng.randint(3, 10)
    for _ in range(n_poles):
        px = rng.uniform(margin, wx - margin)
        py = rng.uniform(margin, wy - margin)
        _add_cyl(px, py, rng.uniform(2, 6), rng.uniform(30, 80), "light_pole")

    n_pillars = rng.randint(4, 12) if ultra_hard else rng.randint(2, 6)
    for _ in range(n_pillars):
        px = rng.uniform(margin, wx - margin)
        py = rng.uniform(margin, wy - margin)
        _add_cyl(px, py, rng.uniform(15, 50), rng.uniform(80, 195), "pillar")

    # Cull degenerate boxes (including the zero-height "gap" markers).
    obstacles = [o for o in obstacles if o.max_corner.z > 1.0]
    env.obstacles = obstacles
    env.cylinders = cylinders

    # --- Responder units (1 per target, up to 5 in ultra) ---
    responders = []
    max_resp = 5 if ultra_hard else 4
    for i, tgt in enumerate(targets[:max_resp]):
        r = ResponderUnit(
            id=f"R{i+1}",
            position=Vec3(tgt.position.x + rng.uniform(-50, 50),
                          tgt.position.y + rng.uniform(-50, 50), 0.0),
            linked_target_id=tgt.id,
            status="stable",
            current_need=rng.choice(["supplies", "medical", "evacuation", "water"]),
            can_update_dropzone=rng.random() < 0.5,
            active=True,
        )
        events = []

        # 70% chance the responder later escalates/changes urgency.
        if rng.random() < 0.7:
            events.append(ScheduledEvent(
                step=rng.randint(100, 600),
                event_type="urgency_update",
                payload={"new_urgency": rng.uniform(0.5, 1.0)},
            ))

        # Dropzone relocation only for responders flagged as able to do so.
        if r.can_update_dropzone and rng.random() < 0.5:
            events.append(ScheduledEvent(
                step=rng.randint(200, 800),
                event_type="dropzone_relocation",
                payload={"dx": rng.uniform(-200, 200), "dy": rng.uniform(-200, 200)},
            ))

        # 60% chance of a one-off hazard intel broadcast.
        if rng.random() < 0.6:
            intel = rng.choice([
                "blocked_north", "blocked_south", "blocked_east", "blocked_west",
                "safe_north", "safe_south", "safe_east", "safe_west",
                "fire_expanded", "fire_receded",
            ])
            events.append(ScheduledEvent(
                step=rng.randint(50, 500),
                event_type="hazard_intel",
                payload={"intel": intel, "severity": rng.uniform(0.3, 1.0)},
            ))

        r.scheduled_events = events
        responders.append(r)
    env.responders = responders
366
+
367
+
368
def build_hardcore_world_v2(env: "VarahaEnv") -> None:
    """Generate the ultra-hard world variant.

    Same generator as build_hardcore_world, with the ultra_hard flag set:
    denser obstacles, more hazards, and more delivery targets.
    """
    build_hardcore_world(env, ultra_hard=True)
371
+
372
+
373
+ # ---------------------------------------------------------------------------
374
+ # Environment
375
+ # ---------------------------------------------------------------------------
376
+
377
+ class VarahaEnv:
378
+ """Core wildfire logistics simulation.
379
+
380
+ Action format (dict)::
381
+
382
+ {
383
+ "ax": float, # desired acceleration x (m/s²)
384
+ "ay": float, # desired acceleration y
385
+ "az": float, # desired acceleration z
386
+ "deliver": bool, # attempt delivery if near a target
387
+ "recharge": bool, # attempt recharge if near base
388
+ "tool_call": str, # optional: request_intel | battery_forecast | mission_report
389
+ }
390
+
391
+ Returns ``(obs_dict, reward, done, info_dict)`` per OpenAI-gym convention.
392
+ """
393
+
394
    def __init__(self, config: Optional[VarahaConfig] = None,
                 world_fn: Optional[Any] = None) -> None:
        """Create the environment and build its initial world.

        Args:
            config: Tunable parameters; defaults to a fresh ``VarahaConfig``.
            world_fn: Optional callable ``world_fn(env)`` that populates the
                world (base, targets, hazards, obstacles, responders). When
                ``None``, the hardcoded demo world is used instead.
        """
        self.cfg = config or VarahaConfig()
        self._world_fn = world_fn

        # World entities — populated by _rebuild_world() at the end of init.
        self.base: BaseStation
        self.drone: DroneState
        self.targets: list[DeliveryTarget] = []
        self.hazards: list[HazardRegion] = []
        self.obstacles: list[ObstacleVolume] = []
        self.cylinders: list[CylindricalObstacle] = []
        self.responders: list[ResponderUnit] = []

        # Episode bookkeeping (reset() re-initialises these each episode).
        self.step_count: int = 0
        self.cumulative_reward: float = 0.0
        self.done: bool = False
        self.trace: list[TracePoint] = []

        # Distance-shaping cache and per-world hazard baselines used by reset().
        self._prev_nearest_dist: float = 0.0
        self._hazard_base_heights: list[float] = []
        self._hazard_base_severities: list[float] = []
        # Long-horizon instruction-mode state.
        self.instructions: list[MissionInstruction] = []
        self._instruction_cursor: int = 0
        self._instruction_violations: int = 0
        self._tool_history: list[str] = []
        self._last_tool_result: dict[str, Any] = {}
        self._instruction_progress_reward: float = 0.0

        self._rebuild_world()
423
+
424
+ def _rebuild_world(self):
425
+ if self._world_fn is not None:
426
+ self._world_fn(self)
427
+ else:
428
+ self._build_demo_world()
429
+ self._hazard_base_heights = [h.height for h in self.hazards]
430
+ self._hazard_base_severities = [h.severity for h in self.hazards]
431
+
432
+ # ------------------------------------------------------------------
433
+ # World setup
434
+ # ------------------------------------------------------------------
435
+
436
    def _build_demo_world(self) -> None:
        """Hardcoded 5 km demo scenario.

        Layout (top-down, +x → east, +y → north, 5 km × 5 km)::

            T3 (1000,4200)
                ·
            H2 (900,3200)   O2 [500-1500, 2600-3000]
                ·
                ·                      T2 (4100,2900) ← inside H1 fringe
                ·                  H1 (3800,2600)
                ·
                ·        O1 [2200-2800, 1000-2200]
                ·
                ·   T1 (1800,600)
                ·
            Base (250,250)

        - T2 sits inside the fringe of hazard H1 → brief hazard exposure required
        - T3 is behind obstacle O2 and near hazard H2
        - O1 blocks direct mid-map routing from T1 to T2
        - Drone can fly over obstacles if altitude > obstacle height
        - Total route ≈ 12 km, battery budget ≈ 300 units

        Note: this demo sets no cylinders or responders — those lists keep
        the empty values assigned in ``__init__``.
        """
        self.base = BaseStation(position=Vec3(250.0, 250.0, 0.0), recharge_radius=80.0)

        self.targets = [
            DeliveryTarget(
                id="T1", position=Vec3(1800.0, 600.0, 30.0),
                urgency=0.6, delivery_radius=80.0,
            ),
            DeliveryTarget(
                id="T2", position=Vec3(4100.0, 2900.0, 50.0),
                urgency=1.0, delivery_radius=120.0,
            ),
            DeliveryTarget(
                id="T3", position=Vec3(1000.0, 4200.0, 20.0),
                urgency=0.8, delivery_radius=100.0,
            ),
        ]

        self.hazards = [
            HazardRegion(
                id="H1", center=Vec3(3800.0, 2600.0, 0.0),
                radius=500.0, severity=0.9,
                height=70.0, growth_rate=0.005,
            ),
            HazardRegion(
                id="H2", center=Vec3(900.0, 3200.0, 0.0),
                radius=400.0, severity=0.7,
                height=55.0, growth_rate=0.008,
            ),
        ]

        self.obstacles = [
            # O1: 120 m tall block — overflyable only near the altitude ceiling.
            ObstacleVolume(
                id="O1",
                min_corner=Vec3(2200.0, 1000.0, 0.0),
                max_corner=Vec3(2800.0, 2200.0, 120.0),
            ),
            # O2: 90 m tall block shielding T3.
            ObstacleVolume(
                id="O2",
                min_corner=Vec3(500.0, 2600.0, 0.0),
                max_corner=Vec3(1500.0, 3000.0, 90.0),
            ),
        ]
502
+
503
+ # ------------------------------------------------------------------
504
+ # Core API
505
+ # ------------------------------------------------------------------
506
+
507
    def reset(self, seed: Optional[int] = None) -> dict[str, Any]:
        """Reset the environment and return the initial observation.

        Args:
            seed: Optional seed. NOTE: this seeds the *global* ``random``
                module RNG, which world generation also uses — so seeding
                here makes both reset jitter and (with a world_fn) the
                regenerated world reproducible, at the cost of affecting
                any other user of the global RNG.
        """
        if seed is not None:
            random.seed(seed)

        # Domain randomisation: only custom world functions regenerate the
        # world each episode; the demo world stays fixed.
        if self._world_fn is not None:
            self._rebuild_world()

        # Fresh drone at the base, fully charged, payload on board.
        self.drone = DroneState(
            position=Vec3(self.base.position.x, self.base.position.y, 0.0),
            velocity=Vec3(0.0, 0.0, 0.0),
            battery=self.cfg.battery_capacity,
            carrying_payload=True,
            alive=True,
        )

        for t in self.targets:
            t.delivered = False

        # Perturb each hazard around its generation-time baseline, then
        # let it re-derive its internal dynamic state.
        for i, h in enumerate(self.hazards):
            h.height = self._hazard_base_heights[i] * random.uniform(0.85, 1.15)
            h.severity = max(0.3, min(1.0, self._hazard_base_severities[i] + random.uniform(-0.1, 0.1)))
            h.reset()

        # Responders back to their quiescent state; re-arm scheduled events.
        for r in self.responders:
            r.active = True
            r.status = "stable"
            r.latest_intel = "none"
            r.intel_severity = 0.0
            r.message = ""
            for ev in r.scheduled_events:
                ev.fired = False

        # Remember original dropzone positions (events may relocate them).
        self._target_base_positions = {
            t.id: Vec3(t.position.x, t.position.y, t.position.z)
            for t in self.targets
        }
        self._build_instruction_program()
        self._instruction_progress_reward = 0.0
        self._last_tool_result = {}
        self._tool_history = []

        self.step_count = 0
        self.cumulative_reward = 0.0
        self.done = False
        self.trace = []
        # Seed the distance-shaping baseline for the first step's reward.
        self._prev_nearest_dist = self._nearest_target_dist()

        obs = self.get_observation()

        # Step-0 trace point so replays include the starting state.
        self.trace.append(TracePoint(
            step=0,
            position=Vec3(self.drone.position.x, self.drone.position.y, self.drone.position.z),
            velocity=Vec3(0.0, 0.0, 0.0),
            battery=self.drone.battery,
            reward=0.0,
            cumulative_reward=0.0,
            events=["reset"],
            observation=obs,
        ))

        return obs
569
+
570
    def step(self, action: dict[str, Any]) -> tuple[dict, float, bool, dict]:
        """Advance the simulation by one timestep.

        Args:
            action: Dict with optional keys ``ax``/``ay``/``az`` (desired
                acceleration, m/s²), ``deliver`` (bool), ``recharge`` (bool)
                and ``tool_call`` (str) — see the class docstring.

        Returns ``(observation, reward, done, info)``.

        Calling step() on a finished episode is a no-op that returns the
        current observation with zero reward.
        """
        if self.done:
            return self.get_observation(), 0.0, True, StepInfo().to_dict()

        self.step_count += 1

        # --- parse & clamp acceleration ---
        # Missing keys default to 0; magnitude is clamped, not per-axis.
        accel = Vec3(
            float(action.get("ax", 0.0)),
            float(action.get("ay", 0.0)),
            float(action.get("az", 0.0)),
        ).clamp_magnitude(self.cfg.max_acceleration)

        # --- kinematics (Euler integration) ---
        self.drone.velocity = (
            self.drone.velocity + accel.scale(self.cfg.dt)
        ).clamp_magnitude(self.cfg.max_speed)

        old_pos = Vec3(self.drone.position.x, self.drone.position.y, self.drone.position.z)
        self.drone.position = self.drone.position + self.drone.velocity.scale(self.cfg.dt)

        # clamp to world bounds
        self.drone.position.x = max(0.0, min(self.cfg.world_x, self.drone.position.x))
        self.drone.position.y = max(0.0, min(self.cfg.world_y, self.drone.position.y))
        self.drone.position.z = max(0.0, min(self.cfg.world_z, self.drone.position.z))

        # Distance is measured on the clamped path, so hitting a wall
        # shortens the effective travel (and battery drain) this step.
        dist_traveled = old_pos.distance_to(self.drone.position)
        elevation_change = abs(self.drone.position.z - old_pos.z)

        # --- battery ---
        drain = self._compute_battery_drain(dist_traveled, elevation_change)
        self.drone.battery -= drain

        # --- advance dynamic hazards ---
        # Hazards grow BEFORE collision/hazard checks for this step.
        for h in self.hazards:
            h.tick()

        # --- advance responder events ---
        self._tick_responders()

        # --- world interactions ---
        collision = self._check_collisions()
        in_hazard, hazard_sev = self._check_hazards()

        # Optional agent tool invocation (request_intel / battery_forecast /
        # mission_report); blank or missing strings are ignored.
        tool_call = ""
        tool_result: dict[str, Any] = {}
        raw_tool_call = action.get("tool_call")
        if raw_tool_call is not None and str(raw_tool_call).strip():
            tool_call, tool_result = self._execute_tool_call(str(raw_tool_call).strip())

        # Cursor snapshot lets us report how many instructions completed
        # within this single step.
        prev_instruction_cursor = self._instruction_cursor
        delivered_ids: list[str] = []
        if action.get("deliver", False):
            delivered_ids = self._deliver_targets()

        # Base proximity uses horizontal (XY) distance only — altitude is
        # irrelevant for recharging.
        reached_base = (
            ((self.drone.position.x - self.base.position.x) ** 2
             + (self.drone.position.y - self.base.position.y) ** 2) ** 0.5
            <= self.base.recharge_radius
        )
        if action.get("recharge", False) and reached_base:
            self.drone.battery = min(
                self.cfg.battery_capacity,
                self.drone.battery + self.cfg.recharge_rate,
            )

        # Instruction progress consumes this step's deliveries, tool call
        # and base status in one pass.
        self._update_instruction_progress(
            delivered_ids=delivered_ids,
            reached_base=reached_base,
            tool_call=tool_call,
        )
        completed_now = max(0, self._instruction_cursor - prev_instruction_cursor)

        if self._all_delivered():
            self.drone.carrying_payload = False

        # --- reward ---
        info = StepInfo(
            collision=collision,
            delivered_target_ids=delivered_ids,
            in_hazard=in_hazard,
            hazard_severity=hazard_sev,
            reached_base=reached_base,
            distance_traveled=dist_traveled,
            tool_call=tool_call,
            tool_result=tool_result,
            instruction_completed=self._instruction_cursor,
            instruction_total=len(self.instructions),
            instruction_violations=self._instruction_violations,
        )
        reward, breakdown = self._compute_reward(info)
        info.reward_breakdown = breakdown
        self.cumulative_reward += reward

        # --- termination ---
        # Order matters: collision dominates battery death, which dominates
        # success, which dominates the step-limit timeout.
        if collision:
            self.drone.alive = False
            self.done = True
        elif self.drone.battery <= 0.0:
            self.drone.battery = 0.0
            self.drone.alive = False
            self.done = True
        elif self._is_success():
            self.done = True
        elif self.step_count >= self.cfg.max_episode_steps:
            self.done = True

        # record trace
        events: list[str] = []
        for tid in delivered_ids:
            events.append(f"delivered_{tid}")
        if collision:
            events.append("collision")
        if in_hazard:
            events.append(f"hazard_{hazard_sev:.2f}")
        if self.drone.battery <= 0.0 and not collision:
            events.append("battery_dead")
        if self._is_success():
            events.append("success")
        if tool_call:
            events.append(f"tool_{tool_call}")
        if completed_now > 0:
            events.append(f"instruction+{completed_now}")

        obs = self.get_observation()

        self.trace.append(TracePoint(
            step=self.step_count,
            position=Vec3(self.drone.position.x, self.drone.position.y, self.drone.position.z),
            velocity=Vec3(self.drone.velocity.x, self.drone.velocity.y, self.drone.velocity.z),
            battery=self.drone.battery,
            reward=reward,
            cumulative_reward=self.cumulative_reward,
            events=events,
            observation=obs,
        ))

        return obs, reward, self.done, info.to_dict()
712
+
713
+ # ------------------------------------------------------------------
714
+ # Observation / render
715
+ # ------------------------------------------------------------------
716
+
717
    def get_observation(self) -> dict[str, Any]:
        """Compact, RL-friendly observation dict.

        All entity positions are reported RELATIVE to the drone so the
        policy sees translation-invariant features. Obstacles (boxes and
        cylinders merged) are sorted nearest-first.
        """
        dp = self.drone.position

        targets_obs = []
        for t in self.targets:
            rel = t.position - dp
            targets_obs.append({
                "id": t.id,
                "relative_position": rel.to_dict(),
                "urgency": t.urgency,
                "delivered": t.delivered,
            })

        hazards_obs = []
        for h in self.hazards:
            rel = h.center - dp
            hazards_obs.append({
                "id": h.id,
                "relative_position": rel.to_dict(),
                # NOTE(review): reaches into HazardRegion's private
                # _current_height (the dynamic, post-tick height) — a public
                # accessor on HazardRegion would be cleaner.
                "current_height": h._current_height,
                "severity": h.severity,
            })

        # Boxes and cylinders share one list, discriminated by "type";
        # "size_x"/"size_y" are footprint extents (diameter for cylinders).
        obstacles_obs = []
        for obs in self.obstacles:
            c = obs.center
            hs = obs.half_size
            rel = c - dp
            dist = dp.horizontal_distance_to(c)
            obstacles_obs.append({
                "type": "box",
                "relative_position": rel.to_dict(),
                "height": obs.height,
                "size_x": hs.x * 2,
                "size_y": hs.y * 2,
                "distance": dist,
                "kind": obs.kind,
            })
        for cyl in self.cylinders:
            rel = cyl.center - dp
            dist = dp.horizontal_distance_to(cyl.center)
            obstacles_obs.append({
                "type": "cylinder",
                "relative_position": rel.to_dict(),
                "height": cyl.height,
                "size_x": cyl.radius * 2,
                "size_y": cyl.radius * 2,
                "distance": dist,
                "kind": cyl.kind,
            })
        obstacles_obs.sort(key=lambda o: o["distance"])

        # Inactive responders are omitted entirely.
        responders_obs = []
        for r in self.responders:
            if not r.active:
                continue
            rel = r.position - dp
            intel_dir = r.intel_direction()
            responders_obs.append({
                "id": r.id,
                "relative_position": rel.to_dict(),
                "linked_target_id": r.linked_target_id,
                "status": r.status,
                "status_code": r.status_code(),
                "latest_intel": r.latest_intel,
                "intel_direction": {"x": intel_dir[0], "y": intel_dir[1]},
                "intel_severity": r.intel_severity,
            })

        mission_obs = self._instruction_snapshot()
        return {
            "drone_position": dp.to_dict(),
            "drone_velocity": self.drone.velocity.to_dict(),
            "battery": round(self.drone.battery, 4),
            "carrying_payload": self.drone.carrying_payload,
            "alive": self.drone.alive,
            "targets": targets_obs,
            "hazards": hazards_obs,
            "obstacles": obstacles_obs,
            "responders": responders_obs,
            "mission": mission_obs,
            "last_tool_result": self._last_tool_result,
            "step": self.step_count,
            "max_steps": self.cfg.max_episode_steps,
        }
803
+
804
+ def render_state(self) -> dict[str, Any]:
805
+ """Rich state dict for future Cesium / frontend rendering."""
806
+ return {
807
+ "base_station": self.base.to_dict(),
808
+ "drone": self.drone.to_dict(),
809
+ "targets": [t.to_dict() for t in self.targets],
810
+ "hazards": [h.to_dict() for h in self.hazards],
811
+ "obstacles": [o.to_dict() for o in self.obstacles],
812
+ "cylinders": [c.to_dict() for c in self.cylinders],
813
+ "responders": [r.to_dict() for r in self.responders],
814
+ "mission": self._instruction_snapshot(include_full=True),
815
+ "tool_history": list(self._tool_history),
816
+ "step": self.step_count,
817
+ "max_steps": self.cfg.max_episode_steps,
818
+ "cumulative_reward": round(self.cumulative_reward, 4),
819
+ "done": self.done,
820
+ }
821
+
822
    def get_trace(self) -> dict[str, Any]:
        """Full episode trace for replay / visualisation.

        Returns a dict with three sections: the static ``world`` layout,
        the per-step ``trace`` (one TracePoint per step, including step 0),
        and an episode ``summary``.
        """
        return {
            "world": {
                "bounds": {"x": self.cfg.world_x, "y": self.cfg.world_y, "z": self.cfg.world_z},
                "base_station": self.base.to_dict(),
                "targets": [t.to_dict() for t in self.targets],
                "hazards": [h.to_dict() for h in self.hazards],
                "obstacles": [o.to_dict() for o in self.obstacles],
                "cylinders": [c.to_dict() for c in self.cylinders],
                "responders": [r.to_dict() for r in self.responders],
                "mission": self._instruction_snapshot(include_full=True),
            },
            "trace": [tp.to_dict() for tp in self.trace],
            "summary": {
                "total_steps": self.step_count,
                "cumulative_reward": round(self.cumulative_reward, 4),
                "delivered": [t.id for t in self.targets if t.delivered],
                "alive": self.drone.alive,
                "final_battery": round(self.drone.battery, 4),
                "success": self._is_success(),
                "instruction_completed": self._instruction_cursor,
                "instruction_total": len(self.instructions),
                "instruction_violations": self._instruction_violations,
                "tool_calls": list(self._tool_history),
            },
        }
849
+
850
+ # ------------------------------------------------------------------
851
+ # Long-horizon instruction mode
852
+ # ------------------------------------------------------------------
853
+
854
    def _build_instruction_program(self) -> None:
        """Build the long-horizon instruction list for instruction mode.

        The program interleaves ``deliver_target`` and ``tool_call``
        instructions over the targets (ordered by urgency, tie-broken by
        id), repeating in cycles until the desired length is reached, and
        always ends with a single ``return_base`` instruction.
        No-op when instruction mode is off or there are no targets.
        """
        self.instructions = []
        self._instruction_cursor = 0
        self._instruction_violations = 0

        if not self.cfg.instruction_mode or not self.targets:
            return

        # Most urgent first; id breaks ties deterministically.
        ordered_targets = sorted(self.targets, key=lambda t: (-t.urgency, t.id))
        target_count = len(ordered_targets)
        # Desired TOTAL length incl. the final return_base; floor guarantees
        # at least one deliver+tool pair per target plus the return.
        desired_len = self.cfg.instruction_count if self.cfg.instruction_count > 0 else (target_count * 3 + 1)
        desired_len = max(desired_len, target_count * 2 + 1)

        instructions: list[MissionInstruction] = []
        inst_idx = 1
        cycle = 0
        # Fill up to desired_len - 1, leaving room for the return_base tail.
        while len(instructions) < max(desired_len - 1, 1):
            for tgt in ordered_targets:
                if len(instructions) >= max(desired_len - 1, 1):
                    break
                instructions.append(
                    MissionInstruction(
                        id=f"I{inst_idx}",
                        kind="deliver_target",
                        description=f"Cycle {cycle + 1}: deliver to {tgt.id} in order.",
                        target_id=tgt.id,
                    )
                )
                inst_idx += 1
                if len(instructions) >= max(desired_len - 1, 1):
                    break
                # Alternate the required tool each cycle.
                tool = "request_intel" if (cycle % 2 == 0) else "battery_forecast"
                instructions.append(
                    MissionInstruction(
                        id=f"I{inst_idx}",
                        kind="tool_call",
                        description=f"Call {tool} after servicing {tgt.id}.",
                        target_id=tgt.id,
                        tool_name=tool,
                    )
                )
                inst_idx += 1
            cycle += 1

        # Terminal instruction: come home once everything is delivered.
        instructions.append(
            MissionInstruction(
                id=f"I{inst_idx}",
                kind="return_base",
                description="Return to base only after all deliveries are completed.",
            )
        )
        self.instructions = instructions
906
+
907
+ def _current_instruction(self) -> Optional[MissionInstruction]:
908
+ if self._instruction_cursor >= len(self.instructions):
909
+ return None
910
+ return self.instructions[self._instruction_cursor]
911
+
912
+ def _instruction_snapshot(self, include_full: bool = False) -> dict[str, Any]:
913
+ total = len(self.instructions)
914
+ completed = min(self._instruction_cursor, total)
915
+ next_instruction = self._current_instruction()
916
+ out: dict[str, Any] = {
917
+ "enabled": self.cfg.instruction_mode,
918
+ "total": total,
919
+ "completed": completed,
920
+ "remaining": max(total - completed, 0),
921
+ "progress": (completed / total) if total > 0 else 1.0,
922
+ "violations": self._instruction_violations,
923
+ "next_instruction": next_instruction.to_dict() if next_instruction else None,
924
+ }
925
+ if include_full:
926
+ out["instructions"] = [inst.to_dict() for inst in self.instructions]
927
+ return out
928
+
929
+ def _complete_current_instruction(self) -> None:
930
+ inst = self._current_instruction()
931
+ if inst is None:
932
+ return
933
+ inst.completed = True
934
+ self._instruction_cursor += 1
935
+ self._instruction_progress_reward += self.cfg.instruction_completion_reward
936
+
937
+ def _record_instruction_violation(self) -> None:
938
+ self._instruction_violations += 1
939
+ inst = self._current_instruction()
940
+ if inst is not None:
941
+ inst.violated = True
942
+
943
+ def _tool_matches_instruction(self, tool_call: str, inst: MissionInstruction) -> bool:
944
+ base, _, arg = tool_call.partition(":")
945
+ if base != inst.tool_name:
946
+ return False
947
+ if inst.target_id and arg and arg != inst.target_id:
948
+ return False
949
+ return True
950
+
951
def _update_instruction_progress(
    self,
    delivered_ids: list[str],
    reached_base: bool,
    tool_call: str,
) -> None:
    """Advance the instruction cursor based on this step's outcomes.

    Args:
        delivered_ids: ids of targets delivered during this step.
        reached_base: whether the drone ended the step back at base.
        tool_call: normalized tool-call string ("" when no tool was used).

    No-op when instruction mode is off or no instructions exist.
    """
    if not self.cfg.instruction_mode or not self.instructions:
        return

    # Out-of-order deliveries while a deliver_target instruction is active
    # count as violations — one per wrongly-delivered target id.
    inst = self._current_instruction()
    if inst and inst.kind == "deliver_target":
        for tid in delivered_ids:
            if tid != inst.target_id:
                self._record_instruction_violation()

    # Consume as many consecutive instructions as this single step satisfies
    # (e.g. a delivery followed immediately by the matching tool call).
    while True:
        inst = self._current_instruction()
        if inst is None:
            break

        if inst.kind == "deliver_target":
            if inst.target_id in delivered_ids:
                self._complete_current_instruction()
                continue  # the next instruction may also be satisfied now
            break

        if inst.kind == "tool_call":
            if not tool_call:
                break
            if self._tool_matches_instruction(tool_call, inst):
                self._complete_current_instruction()
            else:
                self._record_instruction_violation()
            break  # at most one tool call per step — stop either way

        if inst.kind == "return_base":
            if reached_base and self._all_delivered():
                self._complete_current_instruction()
            break

        break  # unknown instruction kind: leave the cursor untouched
992
+
993
def _execute_tool_call(self, tool_call: str) -> tuple[str, dict[str, Any]]:
    """Execute one agent tool call and return ``(normalized_call, result)``.

    Supported tools (gated by ``cfg.available_tools``):
    ``request_intel[:target_id]``, ``battery_forecast``, ``mission_report``.
    Every non-empty call — even an unsupported one — is appended to
    ``_tool_history`` and cached in ``_last_tool_result``. An empty call is
    a no-op returning ``("", {})``.
    """
    # Normalise: trim and lower-case the whole call, including any argument.
    raw = tool_call.strip().lower()
    if not raw:
        return "", {}

    tool_name, _, arg = raw.partition(":")
    normalized_call = f"{tool_name}:{arg}" if arg else tool_name

    if tool_name not in self.cfg.available_tools:
        result = {"ok": False, "error": f"unsupported_tool:{tool_name}"}
        self._tool_history.append(normalized_call)
        self._last_tool_result = result
        return normalized_call, result

    if tool_name == "request_intel":
        # Prefer the active responder linked to the requested target;
        # otherwise fall back to any active responder.
        responder = None
        if arg:
            responder = next(
                (r for r in self.responders if r.active and r.linked_target_id.lower() == arg.lower()),
                None,
            )
        if responder is None:
            responder = next((r for r in self.responders if r.active), None)
        if responder is None:
            result = {"ok": True, "intel": "none", "message": "no_active_responders"}
        else:
            result = {
                "ok": True,
                "intel": responder.latest_intel,
                "intel_severity": round(responder.intel_severity, 3),
                "responder_id": responder.id,
                "target_id": responder.linked_target_id,
                "message": responder.message,
            }
    elif tool_name == "battery_forecast":
        # Range estimate from remaining battery at the per-metre burn rate;
        # clamp the rate away from zero to avoid division by zero.
        burn = max(self.cfg.drain_per_meter, 1e-6)
        est_range = self.drone.battery / burn
        result = {
            "ok": True,
            "battery": round(self.drone.battery, 3),
            "estimated_range_m": round(est_range, 1),
        }
    else:  # mission_report
        result = {
            "ok": True,
            "delivered": [t.id for t in self.targets if t.delivered],
            "remaining": [t.id for t in self.targets if not t.delivered],
            "instruction_progress": round(self._instruction_snapshot()["progress"], 3),
            "violations": self._instruction_violations,
        }

    self._tool_history.append(normalized_call)
    self._last_tool_result = result
    return normalized_call, result
1047
+
1048
+ # ------------------------------------------------------------------
1049
+ # Coordinate conversion
1050
+ # ------------------------------------------------------------------
1051
+
1052
def local_to_latlon(self, vec: Vec3) -> tuple[float, float, float]:
    """Convert local (x, y, z) metres to (lat, lon, alt).

    Uses a flat-earth approximation centred on ``cfg.origin_lat/lon``.
    Accurate enough for small areas (~tens of km) and Cesium plotting.
    """
    lat_scale = 111_320.0  # metres per degree of latitude
    lon_scale = lat_scale * math.cos(math.radians(self.cfg.origin_lat))
    return (
        round(self.cfg.origin_lat + vec.y / lat_scale, 7),
        round(self.cfg.origin_lon + vec.x / lon_scale, 7),
        round(vec.z, 2),
    )
1065
+
1066
+ # ------------------------------------------------------------------
1067
+ # Internal helpers
1068
+ # ------------------------------------------------------------------
1069
+
1070
+ def _compute_battery_drain(self, dist: float, elevation_change: float) -> float:
1071
+ return (
1072
+ dist * self.cfg.drain_per_meter
1073
+ + elevation_change * self.cfg.drain_elevation_factor
1074
+ + self.cfg.drain_idle_per_step
1075
+ )
1076
+
1077
+ def _check_collisions(self) -> bool:
1078
+ for obs in self.obstacles:
1079
+ if obs.contains(self.drone.position):
1080
+ return True
1081
+ for cyl in self.cylinders:
1082
+ if cyl.contains(self.drone.position):
1083
+ return True
1084
+ return False
1085
+
1086
+ def _check_hazards(self) -> tuple[bool, float]:
1087
+ max_sev = 0.0
1088
+ in_hazard = False
1089
+ for h in self.hazards:
1090
+ df = h.danger_factor(self.drone.position)
1091
+ if df > 0.0:
1092
+ in_hazard = True
1093
+ max_sev = max(max_sev, df)
1094
+ return in_hazard, max_sev
1095
+
1096
+ def _deliver_targets(self) -> list[str]:
1097
+ """Cylindrical delivery check — drone must be within horizontal radius
1098
+ and above the target (within a generous altitude window for drops)."""
1099
+ delivered: list[str] = []
1100
+ for t in self.targets:
1101
+ if t.delivered:
1102
+ continue
1103
+ dx = self.drone.position.x - t.position.x
1104
+ dy = self.drone.position.y - t.position.y
1105
+ horiz_dist = (dx * dx + dy * dy) ** 0.5
1106
+ alt_above = self.drone.position.z - t.position.z
1107
+ if horiz_dist <= t.delivery_radius and -10.0 <= alt_above <= t.delivery_radius * 2:
1108
+ t.delivered = True
1109
+ delivered.append(t.id)
1110
+ return delivered
1111
+
1112
+ def _all_delivered(self) -> bool:
1113
+ return all(t.delivered for t in self.targets)
1114
+
1115
+ def _is_success(self) -> bool:
1116
+ hdist = ((self.drone.position.x - self.base.position.x) ** 2
1117
+ + (self.drone.position.y - self.base.position.y) ** 2) ** 0.5
1118
+ return self._all_delivered() and hdist <= self.base.recharge_radius
1119
+
1120
+ def _nearest_target_dist(self) -> float:
1121
+ """Horizontal distance to closest undelivered target, or to base if all done."""
1122
+ dists = [
1123
+ ((self.drone.position.x - t.position.x) ** 2
1124
+ + (self.drone.position.y - t.position.y) ** 2) ** 0.5
1125
+ for t in self.targets
1126
+ if not t.delivered
1127
+ ]
1128
+ if not dists:
1129
+ return ((self.drone.position.x - self.base.position.x) ** 2
1130
+ + (self.drone.position.y - self.base.position.y) ** 2) ** 0.5
1131
+ return min(dists)
1132
+
1133
def _tick_responders(self) -> None:
    """Process scheduled responder events for the current step.

    Each event fires exactly once, on the step matching ``ev.step``.
    Handled kinds: ``urgency_update``, ``dropzone_relocation``,
    ``hazard_intel``; anything else is marked fired and ignored.
    """
    for r in self.responders:
        if not r.active:
            continue
        for ev in r.scheduled_events:
            # Skip already-fired events and those scheduled for other steps.
            if ev.fired or ev.step != self.step_count:
                continue
            ev.fired = True
            etype = ev.event_type

            if etype == "urgency_update":
                # Adjust the linked target's urgency (clamped to [0.1, 1.0])
                # and mirror the new level in the responder's status string.
                tgt = self._find_target(r.linked_target_id)
                if tgt and not tgt.delivered:
                    tgt.urgency = max(0.1, min(1.0, ev.payload.get("new_urgency", tgt.urgency)))
                    r.status = "critical" if tgt.urgency >= 0.9 else "urgent" if tgt.urgency >= 0.6 else "stable"
                    r.message = f"urgency->{tgt.urgency:.1f}"

            elif etype == "dropzone_relocation":
                # Shift the dropzone by (dx, dy), clamped 50 m inside the
                # world bounds, and move the responder marker with it.
                tgt = self._find_target(r.linked_target_id)
                if tgt and not tgt.delivered and r.can_update_dropzone:
                    dx = ev.payload.get("dx", 0.0)
                    dy = ev.payload.get("dy", 0.0)
                    tgt.position.x = max(50, min(self.cfg.world_x - 50, tgt.position.x + dx))
                    tgt.position.y = max(50, min(self.cfg.world_y - 50, tgt.position.y + dy))
                    r.position = Vec3(tgt.position.x, tgt.position.y, 0.0)
                    r.message = f"dropzone moved ({dx:+.0f},{dy:+.0f})"
                    # Re-baseline distance shaping so the relocation itself
                    # does not produce a spurious shaping reward/penalty.
                    self._prev_nearest_dist = self._nearest_target_dist()

            elif etype == "hazard_intel":
                # Cache intel on the responder for later request_intel calls.
                r.latest_intel = ev.payload.get("intel", "none")
                r.intel_severity = ev.payload.get("severity", 0.5)
                r.message = f"intel: {r.latest_intel}"
1167
+ def _find_target(self, tid: str) -> Optional[DeliveryTarget]:
1168
+ for t in self.targets:
1169
+ if t.id == tid:
1170
+ return t
1171
+ return None
1172
+
1173
+ def _obstacle_proximity_penalty(self) -> float:
1174
+ """Graduated penalty for flying close to any obstacle surface."""
1175
+ min_dist = float("inf")
1176
+ pos = self.drone.position
1177
+ for obs in self.obstacles:
1178
+ d = obs.nearest_surface_dist(pos)
1179
+ if d < min_dist:
1180
+ min_dist = d
1181
+ for cyl in self.cylinders:
1182
+ d = cyl.nearest_surface_dist(pos)
1183
+ if d < min_dist:
1184
+ min_dist = d
1185
+ if min_dist >= self.cfg.obstacle_proximity_radius:
1186
+ return 0.0
1187
+ factor = 1.0 - min_dist / self.cfg.obstacle_proximity_radius
1188
+ return self.cfg.obstacle_proximity_penalty * factor * factor
1189
+
1190
def _compute_reward(self, info: StepInfo) -> tuple[float, dict[str, float]]:
    """Dense shaped reward for one step.

    Returns ``(total, breakdown)`` where *breakdown* maps component name
    to its signed contribution (plus a ``"total"`` entry). Delegates to
    the sparse variant when instruction mode runs with sparse rewards.

    Side effects: consumes the banked ``_instruction_progress_reward`` and
    updates ``_prev_nearest_dist`` for the next step's distance shaping.
    """
    if self.cfg.instruction_mode and self.cfg.sparse_reward_mode:
        return self._compute_sparse_instruction_reward(info)

    bd: dict[str, float] = {}
    total = 0.0

    # per-step cost of time
    bd["step_penalty"] = -self.cfg.step_penalty
    total += bd["step_penalty"]

    # battery usage cost (proportional to energy spent)
    bd["battery_cost"] = -(
        info.distance_traveled * self.cfg.drain_per_meter * self.cfg.battery_cost_factor
    )
    total += bd["battery_cost"]

    # instruction completions banked since the last reward computation;
    # the accumulator is reset so each completion is paid exactly once
    if self._instruction_progress_reward > 0.0:
        bd["instruction_progress"] = self._instruction_progress_reward
        total += bd["instruction_progress"]
        self._instruction_progress_reward = 0.0

    # delivery rewards (scaled by urgency) + progress bonus
    for tid in info.delivered_target_ids:
        tgt = next(t for t in self.targets if t.id == tid)
        r = self.cfg.delivery_reward * (1.0 + tgt.urgency)
        bd[f"delivery_{tid}"] = r
        total += r

    if info.delivered_target_ids:
        n_remaining = sum(1 for t in self.targets if not t.delivered)
        progress_bonus = 50.0 * (1.0 - n_remaining / len(self.targets))
        bd["progress_bonus"] = progress_bonus
        total += progress_bonus

    # collision
    if info.collision:
        bd["collision"] = -self.cfg.collision_penalty
        total += bd["collision"]

    # hazard exposure (severity-weighted)
    if info.in_hazard:
        bd["hazard"] = -self.cfg.hazard_penalty * info.hazard_severity
        total += bd["hazard"]

    # safe return bonus
    if info.reached_base and self._all_delivered():
        bd["return_bonus"] = self.cfg.return_bonus
        total += bd["return_bonus"]

    # distance shaping — nudge toward nearest undelivered target (or base)
    # Skip shaping on delivery steps to avoid a huge negative spike
    # when the nearest-target reference jumps to a farther target.
    # Double the factor when heading home after all deliveries.
    curr_dist = self._nearest_target_dist()
    if info.delivered_target_ids:
        bd["distance_shaping"] = 0.0
        self._prev_nearest_dist = curr_dist
    else:
        factor = self.cfg.distance_shaping_factor
        if self._all_delivered():
            factor *= 2.0
        shaping = (self._prev_nearest_dist - curr_dist) * factor
        bd["distance_shaping"] = shaping
        total += shaping
        self._prev_nearest_dist = curr_dist

    # obstacle proximity (graduated — discourages flying close)
    prox = self._obstacle_proximity_penalty()
    if prox > 0:
        bd["obstacle_proximity"] = -prox
        total -= prox

    # failure (battery depletion; collision already penalised above)
    if self.drone.battery <= 0.0 and not info.collision:
        bd["failure"] = -self.cfg.failure_penalty
        total += bd["failure"]

    bd["total"] = total
    return total, bd
1270
+
1271
def _compute_sparse_instruction_reward(self, info: StepInfo) -> tuple[float, dict[str, float]]:
    """Sparse reward used when instruction mode + sparse mode are both on.

    Per-step signal is intentionally weak (reduced step penalty, reduced
    hazard penalty, banked instruction completions); the bulk of the
    reward/penalty lands on the terminal step. Returns ``(total,
    breakdown)`` with a ``"total"`` entry, like ``_compute_reward``.
    """
    bd: dict[str, float] = {}
    total = 0.0

    # Keep shaping intentionally small in sparse mode.
    bd["step_penalty"] = -(self.cfg.step_penalty * 0.25)
    total += bd["step_penalty"]

    # instruction completions banked since last computation; reset so each
    # completion is paid exactly once
    if self._instruction_progress_reward > 0.0:
        bd["instruction_progress"] = self._instruction_progress_reward
        total += bd["instruction_progress"]
        self._instruction_progress_reward = 0.0

    # hazard exposure at 20% of the dense-mode weight
    if info.in_hazard:
        bd["hazard"] = -(self.cfg.hazard_penalty * 0.2 * info.hazard_severity)
        total += bd["hazard"]

    # Terminal step: collision, dead battery, success, or step limit.
    terminal = (
        info.collision
        or self.drone.battery <= 0.0
        or self._is_success()
        or self.step_count >= self.cfg.max_episode_steps
    )
    if terminal:
        # Pay out proportional progress through the instruction list,
        # plus a success bonus or failure penalty.
        total_instr = len(self.instructions)
        progress = (self._instruction_cursor / total_instr) if total_instr > 0 else 1.0
        bd["terminal_progress"] = self.cfg.instruction_terminal_progress_bonus * progress
        total += bd["terminal_progress"]

        if self._is_success():
            bd["terminal_success"] = self.cfg.instruction_terminal_success_bonus
            total += bd["terminal_success"]
        else:
            bd["terminal_failure"] = -self.cfg.failure_penalty
            total += bd["terminal_failure"]

        # Penalise each instruction left unfinished at episode end.
        remaining = max(total_instr - self._instruction_cursor, 0)
        if remaining > 0:
            bd["unfinished_penalty"] = -remaining * self.cfg.instruction_unfinished_penalty
            total += bd["unfinished_penalty"]

        # Penalise accumulated instruction violations.
        if self._instruction_violations > 0:
            bd["instruction_violations"] = (
                -self._instruction_violations * self.cfg.instruction_violation_penalty
            )
            total += bd["instruction_violations"]

        if info.collision:
            bd["collision"] = -self.cfg.collision_penalty
            total += bd["collision"]

    bd["total"] = total
    return total, bd