Spaces:

atomic24
/

planetary-rover-navigation

Paused

App Files Files Community

Bhaskar committed on Apr 23

Commit

51bb0d4

1 Parent(s): d29a040

Round 2 Upgrade: Added GRPO train.py and vector-field reward shaping

Browse files

Files changed (5) hide show

.gitignore +3 -1
README.md +14 -12
main.py +132 -12
openenv.yaml +18 -2
train.py +523 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,5 @@
 env/
 __pycache__/
-.env

 env/
 __pycache__/
+.env
+*.egg-info/

README.md CHANGED Viewed

@@ -335,15 +335,16 @@ step_efficiency = 1.0 − (steps_taken / max_steps)
 The step reward returned by `/step` is used for online RL training. It is separate from the grader score.
-| Event | Reward |
-|---|---|
-| Every step | −0.01 (step penalty) |
-| Battery drain | −drain × 2.0 (efficiency incentive) |
-| Waypoint reached | +10.0 |
-| Obstacle collision | −5.0 |
-| Battery depleted | −20.0 |
-| Distance shaping | +max(0, (100 − dist) × 0.001) |
-| Episode complete in < 50% of budget | +5.0 (efficiency bonus) |
 ---
@@ -352,9 +353,10 @@ The step reward returned by `/step` is used for online RL training. It is separa
 ```
 planetary-rover-env/
 ├── openenv.yaml      # Typed observation + action space declarations
-├── main.py           # FastAPI server — physics engine + all routes (1507 lines)
-├── baseline.py       # Inference script — 3 task-specific agents + grader client
-├── requirements.txt  # 4 pinned dependencies
 ├── Dockerfile        # Two-stage optimised build, port 7860, non-root user
 └── README.md         # This file
 ```

 The step reward returned by `/step` is used for online RL training. It is separate from the grader score.
+| Event | Reward | Notes |
+|---|---|---|
+| Every step | −0.01 | Constant time-pressure penalty |
+| Battery drain | −drain × 2.0 | Efficiency incentive |
+| **Waypoint reached** | **+100.0** | Massive asymmetric reward to prevent early policy collapse |
+| Obstacle collision | −5.0 | Speed zeroed, micro battery penalty |
+| Battery depleted | −20.0 | Terminal penalty |
+| **Potential-based distance shaping** | `(prev_dist − curr_dist) / initial_dist` | Positive when closing distance; **zero when stationary** (defeats the "stand still" exploit) |
+| **Vector-field shaping (near obstacles)** | up to +0.3 | Active within 10 m of obstacles; rewards cosine similarity between rover heading and computed tangent vector (repulsive + attractive gradient blend) |
+| Episode complete in < 50% of budget | +5.0 | Efficiency bonus |
 ---
 ```
 planetary-rover-env/
 ├── openenv.yaml      # Typed observation + action space declarations
+├── main.py           # FastAPI server — physics engine + all routes (1632 lines)
+├── inference.py      # LLM-driven inference agent (HF Inference API)
+├── train.py          # GRPO training script (Unsloth 4-bit + TRL GRPOTrainer)
+├── requirements.txt  # Pinned runtime dependencies
 ├── Dockerfile        # Two-stage optimised build, port 7860, non-root user
 └── README.md         # This file
 ```

main.py CHANGED Viewed

@@ -567,6 +567,9 @@ class RoverSim:
     min_distance:     float = 0.0   # running minimum; drives partial-progress score
     collision_count:  int   = 0     # cumulative obstacle contacts
     # -------------------------------------------------------------------
     # Helpers
     # -------------------------------------------------------------------
@@ -674,20 +677,129 @@ class RoverSim:
     # Reward  (called by step)
     # -------------------------------------------------------------------
-    def _compute_reward(self, waypoint_hit: bool, collided: bool, drain: float) -> float:
-        r  = -0.01                    # step penalty
-        r -= drain * 2.0              # battery efficiency incentive
-        if waypoint_hit:  r += 10.0
-        if collided:      r -= 5.0
-        if self.battery <= 0.0: r -= 20.0
-        # Distance shaping toward active waypoint
         wp = self.active_waypoint
         if wp:
-            dist = math.hypot(wp[0] - self.px, wp[1] - self.py)
-            r += max(0.0, (100.0 - dist) * 0.001)
-        # Efficiency bonus: episode done in < 50% of budget
-        if self.waypoints_hit == len(self.waypoint_list) and self.steps < self.max_steps * 0.5:
             r += 5.0
         return r
     # -------------------------------------------------------------------
@@ -770,6 +882,10 @@ class RoverSim:
         self.steps += 1
         self._apply_kinematics(action)
         drain        = self._update_battery(action)
         collided, nd = self._check_collision()
@@ -781,6 +897,9 @@ class RoverSim:
         if current_dist < self.min_distance:
             self.min_distance = current_dist
         all_done     = self.waypoints_hit == len(self.waypoint_list)
         batt_dead    = self.battery <= 0.0
         self.done      = all_done or batt_dead
@@ -797,7 +916,7 @@ class RoverSim:
         else:
             termination_reason = "unknown"
-        reward = self._compute_reward(wp_hit, collided, drain)
         self.total_reward += reward
         obs = self._build_obs()
@@ -908,6 +1027,7 @@ def _make_sim(task_id: str, seed: int | None) -> RoverSim:
         done=False, truncated=False, waypoints_hit=0,
         initial_distance=initial_dist,
         min_distance=initial_dist,
         collision_count=0,
     )

     min_distance:     float = 0.0   # running minimum; drives partial-progress score
     collision_count:  int   = 0     # cumulative obstacle contacts
+    # Reward-shaping state — tracks distance at previous step for potential-based shaping
+    _prev_distance:   float = 0.0   # set equal to initial_distance at reset
     # -------------------------------------------------------------------
     # Helpers
     # -------------------------------------------------------------------
     # Reward  (called by step)
     # -------------------------------------------------------------------
+    def _compute_reward(
+        self,
+        waypoint_hit: bool,
+        collided: bool,
+        drain: float,
+        prev_dist: float,
+    ) -> float:
+        """
+        Upgraded reward with two anti-exploit shaping mechanisms:
+        1. **Potential-Based Reward Shaping (flat plains)**
+           Φ(s) = −distance_to_goal.  Shaping = γΦ(s') − Φ(s) ≈ prev_dist − curr_dist.
+           If the rover stands still, curr_dist == prev_dist → shaping = 0,
+           so the step penalty + battery drain yield a guaranteed net negative.
+        2. **Vector-Field Reward Shaping (craters / obstacles)**
+           When any obstacle is within 10 m, compute:
+             • Attractive gradient  g_a = normalise(goal − pos)
+             • Repulsive gradient   g_r = Σ (1/d² − 1/D²) · normalise(pos − obs)
+           Blend into a combined desired vector, take its orthogonal tangent
+           (so the rover flows *around* obstacles rather than into them),
+           and reward based on cosine similarity between the rover's actual
+           heading vector and the tangent vector.
+        The massive +100.0 asymmetric waypoint reward is preserved to
+        anchor the policy toward goal completion.
+        Args:
+            waypoint_hit: adds +100.0 when true.
+            collided: subtracts 5.0 when true.
+            drain: battery drain for this step; penalised as ``drain * 2.0``.
+            prev_dist: distance to the goal before this step, supplied by the caller.
+        Returns:
+            The scalar sum of all reward terms for this step.
+        """
+        r = 0.0
+        # ── 0. Constant step cost (time pressure) ──────────────────────
+        r -= 0.01
+        # ── 1. Battery efficiency penalty ──────────────────────────────
+        r -= drain * 2.0
+        if self.battery <= 0.0:
+            r -= 20.0
+        # ── 2. Collision penalty ───────────────────────────────────────
+        if collided:
+            r -= 5.0
+        # ── 3. Waypoint reached — massive asymmetric reward ───────────
+        if waypoint_hit:
+            r += 100.0
+        # ── 4. Potential-based distance shaping ────────────────────────
+        #   Φ(s) = −dist  →  F_shape = Φ(s') − Φ(s) = prev_dist − curr_dist
+        #   Stationary rover: curr == prev → shaping = 0 → net reward < 0
+        #   NOTE(review): prev_dist comes from the caller while curr_dist is
+        #   measured to the *current* active waypoint; if the active waypoint
+        #   changed since prev_dist was taken (e.g. on a waypoint hit) the two
+        #   distances refer to different targets — confirm against step().
         wp = self.active_waypoint
         if wp:
+            curr_dist = math.hypot(wp[0] - self.px, wp[1] - self.py)
+            # Scale by 1/initial_distance so shaping magnitude is
+            # independent of spawn distance (reward ∈ roughly [-1, +1])
+            scale = 1.0 / max(self.initial_distance, 1.0)
+            distance_shaping = (prev_dist - curr_dist) * scale
+            r += distance_shaping
+        else:
+            curr_dist = 0.0
+        # ── 5. Vector-field shaping near obstacles (within 10 m) ───────
+        INFLUENCE_RADIUS = 10.0
+        nearest_obs = self.obstacles.nearest_n(self.px, self.py, 8)
+        close_obstacles = [(dx, dy, d) for dx, dy, d in nearest_obs
+                           if d < INFLUENCE_RADIUS and d > 1e-6]
+        if close_obstacles and wp:
+            # 5a. Attractive gradient: unit vector toward goal
+            g_ax = wp[0] - self.px
+            g_ay = wp[1] - self.py
+            g_a_mag = math.hypot(g_ax, g_ay)
+            if g_a_mag > 1e-6:
+                g_ax /= g_a_mag
+                g_ay /= g_a_mag
+            else:
+                g_ax, g_ay = 0.0, 0.0
+            # 5b. Repulsive gradient: sum of inverse-square repulsions
+            #     g_r = Σ_i  (1/d_i² − 1/D²) · normalise(pos − obs_i)
+            D = INFLUENCE_RADIUS
+            g_rx, g_ry = 0.0, 0.0
+            for dx, dy, d in close_obstacles:
+                # dx, dy point FROM rover TO obstacle; we want FROM obstacle
+                repel_x, repel_y = -dx, -dy
+                rep_mag = math.hypot(repel_x, repel_y)
+                if rep_mag > 1e-6:
+                    repel_x /= rep_mag
+                    repel_y /= rep_mag
+                strength = (1.0 / (d * d)) - (1.0 / (D * D))
+                g_rx += strength * repel_x
+                g_ry += strength * repel_y
+            # 5c. Blend attractive + repulsive into desired vector
+            alpha = 0.5   # blending weight for repulsive component
+            blend_x = g_ax + alpha * g_rx
+            blend_y = g_ay + alpha * g_ry
+            # 5d. Compute tangent (90° CCW rotation of the blended vector)
+            #     so the rover is guided to flow *around* the obstacle field
+            tangent_x = -blend_y
+            tangent_y =  blend_x
+            t_mag = math.hypot(tangent_x, tangent_y)
+            if t_mag > 1e-6:
+                tangent_x /= t_mag
+                tangent_y /= t_mag
+                # 5e. Rover's actual heading unit vector
+                hx = math.cos(self.heading)
+                hy = math.sin(self.heading)
+                # 5f. Cosine similarity (absolute value — either tangent
+                #     direction is acceptable, clockwise or counter-clockwise)
+                cos_sim = abs(hx * tangent_x + hy * tangent_y)
+                # Scale reward by proximity urgency: closer → stronger signal
+                # NOTE(review): indexing [0] assumes nearest_n() returns
+                # obstacles in ascending distance order — TODO confirm.
+                min_d = close_obstacles[0][2]   # already sorted ascending
+                proximity_weight = 1.0 - (min_d / INFLUENCE_RADIUS)
+                # NOTE(review): this bonus depends only on heading and obstacle
+                # proximity, not on whether the rover moved this step, so it
+                # can be positive for a parked rover. Confirm that idle drain
+                # plus the step penalty outweigh it; otherwise it reopens the
+                # "stand still" exploit that the distance shaping closes.
+                r += 0.3 * cos_sim * proximity_weight
+        # ── 6. Efficiency bonus: episode done in < 50% of step budget ─
+        if (self.waypoints_hit == len(self.waypoint_list)
+                and self.steps < self.max_steps * 0.5):
             r += 5.0
         return r
     # -------------------------------------------------------------------
         self.steps += 1
+        # Snapshot distance BEFORE kinematics so potential-based shaping
+        # can compute Δd = prev_dist − curr_dist for this step.
+        prev_dist = self._prev_distance
         self._apply_kinematics(action)
         drain        = self._update_battery(action)
         collided, nd = self._check_collision()
         if current_dist < self.min_distance:
             self.min_distance = current_dist
+        # Update _prev_distance for the NEXT step's shaping computation
+        self._prev_distance = current_dist
         all_done     = self.waypoints_hit == len(self.waypoint_list)
         batt_dead    = self.battery <= 0.0
         self.done      = all_done or batt_dead
         else:
             termination_reason = "unknown"
+        reward = self._compute_reward(wp_hit, collided, drain, prev_dist)
         self.total_reward += reward
         obs = self._build_obs()
         done=False, truncated=False, waypoints_hit=0,
         initial_distance=initial_dist,
         min_distance=initial_dist,
+        _prev_distance=initial_dist,
         collision_count=0,
     )

openenv.yaml CHANGED Viewed

@@ -281,11 +281,13 @@ reward:
   description: >
     Step reward signal returned in the 'reward' field of step().
     The /grader endpoint computes the normalised episode score [0.0, 1.0]
-    from the full trajectory.
   components:
     waypoint_reached:
-      value: +10.0
       condition: "target_distance < 2.0 meters"
     step_penalty:
       value: -0.01
       condition: "every step"
@@ -295,6 +297,20 @@ reward:
     battery_depleted:
       value: -20.0
       condition: "battery_level == 0.0"
     efficiency_bonus:
       value: +5.0
       condition: "episode completed in < 50% of max_steps"

   description: >
     Step reward signal returned in the 'reward' field of step().
     The /grader endpoint computes the normalised episode score [0.0, 1.0]
+    from the full trajectory.  Reward shaping uses potential-based and
+    vector-field techniques to prevent the "stationary exploit".
   components:
     waypoint_reached:
+      value: +100.0
       condition: "target_distance < 2.0 meters"
+      note: "Massive asymmetric reward prevents early policy collapse."
     step_penalty:
       value: -0.01
       condition: "every step"
     battery_depleted:
       value: -20.0
       condition: "battery_level == 0.0"
+    potential_based_distance_shaping:
+      value: "(prev_dist - curr_dist) / initial_distance"
+      condition: "every step while waypoint is active"
+      note: >
+        Φ(s) = −distance.  Shaping = Φ(s') − Φ(s) = prev_dist − curr_dist.
+        Normalised by initial_distance for spawn-distance independence.
+        Standing still yields shaping = 0, so step penalty + drain = net negative.
+    vector_field_obstacle_shaping:
+      value: "up to +0.3"
+      condition: "any obstacle within 10 metres"
+      note: >
+        Computes attractive (goal) + repulsive (obstacles) gradient blend,
+        takes orthogonal tangent, rewards cosine similarity with rover heading.
+        Scaled by proximity urgency (closer obstacle = stronger signal).
     efficiency_bonus:
       value: +5.0
       condition: "episode completed in < 50% of max_steps"

train.py ADDED Viewed

	@@ -0,0 +1,523 @@

+"""
+train.py — GRPO Training Script for Planetary Rover Navigation
+================================================================
+Uses Unsloth's FastLanguageModel + TRL's GRPOTrainer to fine-tune
+meta-llama/Llama-3.2-1B-Instruct for autonomous rover navigation.
+Hardware target : NVIDIA RTX 3050 — strict 6 GB VRAM limit
+Quantisation    : 4-bit NF4 via Unsloth
+LoRA            : rank 16, attention + MLP projections
+GRPO group size : 4 generations per prompt (prevents OOM)
+Reward functions
+----------------
+  1. Format Gatekeeper — validates <action>JSON</action> structure
+  2. Environment Reward — POSTs parsed action to local physics server
+Prerequisites
+-------------
+  1. Local server running:
+       uvicorn main:app --host 0.0.0.0 --port 7860
+  2. Python packages:
+       pip install unsloth trl datasets peft accelerate requests
+"""
+from __future__ import annotations
+import json
+import math
+import os
+import re
+import sys
+import time
+import random
+import logging
+from typing import Any
+import requests
+import torch
+from datasets import Dataset
+# ---------------------------------------------------------------------------
+# Unsloth + TRL imports (regular module-level imports — NOT deferred, so both
+# libraries are loaded as soon as this file is imported or executed)
+# ---------------------------------------------------------------------------
+from unsloth import FastLanguageModel
+from trl import GRPOConfig, GRPOTrainer
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+MODEL_NAME      = "meta-llama/Llama-3.2-1B-Instruct"
+# 0.0.0.0 is the wildcard address a *server* binds to; it is not a portable
+# destination for a client (connecting to it fails on Windows).  The training
+# client therefore targets the loopback address of the locally running server.
+SERVER_URL      = "http://127.0.0.1:7860"
+OUTPUT_DIR      = "./grpo_rover_checkpoints"
+SEED            = 42
+# VRAM-safe parameters for RTX 3050 (6 GB)
+MAX_SEQ_LENGTH  = 512          # prompt + completion combined
+LORA_RANK       = 16
+LORA_ALPHA      = 32
+LORA_DROPOUT    = 0.0
+# Training hyperparameters
+NUM_TRAIN_EPISODES   = 150     # prompts per task × 3 tasks = total dataset
+MAX_PROMPT_LENGTH    = 256
+MAX_COMPLETION_LENGTH = 256
+NUM_GENERATIONS      = 4       # GRPO group size — critical for 6 GB
+LEARNING_RATE        = 1e-6
+KL_COEF              = 0.04    # β for KL penalty
+NUM_TRAIN_EPOCHS     = 2
+PER_DEVICE_BATCH     = 1       # keep at 1 for 6 GB
+GRAD_ACCUM_STEPS     = 4
+# Reward tuning
+FORMAT_REWARD_GOOD   = 1.0
+FORMAT_REWARD_BAD    = 0.0
+# Both verbosity values are compared against a whitespace-split word count
+# (see format_reward_fn), which only approximates the tokenizer's count.
+VERBOSITY_THRESHOLD  = 80      # tokens — a valid <action>{…}</action> is ~30-40
+VERBOSITY_PENALTY_K  = 200     # excess tokens before reward → 0
+# Logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+log = logging.getLogger("train")
+# =============================================================================
+# System prompt (compact — must fit within ~90 tokens so user prompt has room)
+# =============================================================================
+# The action schema spelled out here names the same four fields that
+# _ACTION_FIELDS validates (thrust, steering, brake, vertical_thruster).
+# The trailing backslashes keep the literal free of a leading and a
+# trailing newline.
+SYSTEM_PROMPT = """\
+You are a planetary rover navigation controller.
+Respond ONLY with your action inside <action></action> tags as valid JSON.
+Action schema:
+{"thrust": float[0,1], "steering": float[-1,1], "brake": 0|1, "vertical_thruster": float[-0.2,0.2]}
+Key physics:
+- heading_error = atan2(target_dy, target_dx) - rover_heading
+- steering ≈ clamp(heading_error * 2.5, -1, 1)
+- thrust=1.0 for progress; brake=0 unless overshooting
+- If nearest_obstacle < 10m, steer perpendicular to dodge\
+"""
+# =============================================================================
+# Compact observation prompt builder
+# =============================================================================
+def build_compact_prompt(
+    task_id:   str,
+    obs:       dict[str, Any],
+    step_num:  int,
+    max_steps: int,
+) -> str:
+    """
+    Render one observation as a short, fixed-layout user message.
+    The heading error (wrapped into (-pi, pi]) and a proportional steering
+    hint clamped to [-1, 1] are computed here so the model is handed the
+    numbers instead of having to do the trigonometry itself.
+    """
+    rel = obs["target_relative"]
+    rel_x, rel_y = rel["x"], rel["y"]
+    heading_error = math.atan2(rel_y, rel_x) - obs["rover_heading"]
+    # Wrap the error into (-pi, pi]
+    full_turn = 2 * math.pi
+    while heading_error > math.pi:
+        heading_error -= full_turn
+    while heading_error <= -math.pi:
+        heading_error += full_turn
+    # Proportional steering hint, clamped to the valid steering range
+    steer_hint = min(1.0, heading_error * 2.5)
+    steer_hint = max(-1.0, steer_hint)
+    prompt_lines = [
+        f"TASK: {task_id}  STEP: {step_num}/{max_steps}",
+        f"target_distance={obs['target_distance']:.1f}m heading_error={heading_error:.4f}rad",
+        f"battery={obs['battery_level']:.3f} "
+        f"nearest_obstacle={obs['nearest_obstacle_distance']:.1f}m "
+        f"terrain={obs['terrain_type']}",
+        f"suggested_steering={steer_hint:.4f}",
+        "Output your <action> JSON now.",
+    ]
+    return "\n".join(prompt_lines)
+# =============================================================================
+# Dataset generation — resets episodes and collects initial observations
+# =============================================================================
+# Step budget per task; generate_training_dataset passes it to
+# build_compact_prompt, where it appears as the "STEP n/max" counter.
+# NOTE(review): presumably mirrors the server's per-task max_steps — confirm
+# against main.py.
+TASK_MAX_STEPS = {"easy": 200, "medium": 300, "hard": 100}
+def _check_server() -> None:
+    """
+    Fail fast if the environment server is unreachable.
+    Issues ``GET {SERVER_URL}/tasks`` with a 5 s timeout.  On any transport
+    or HTTP-status failure the underlying cause is logged together with the
+    start-up hint and the process exits with status 1.
+    """
+    try:
+        r = requests.get(f"{SERVER_URL}/tasks", timeout=5)
+        r.raise_for_status()
+    except requests.RequestException as e:
+        # Include the cause: "connection refused" and "HTTP 404" call for
+        # different remedies, and the original message hid which one it was.
+        log.error(
+            "Cannot reach environment server at %s (%s) — "
+            "start it with: uvicorn main:app --host 0.0.0.0 --port 7860",
+            SERVER_URL,
+            e,
+        )
+        sys.exit(1)
+    log.info("Environment server is live at %s", SERVER_URL)
+def generate_training_dataset(n_per_task: int = NUM_TRAIN_EPISODES) -> Dataset:
+    """
+    Build the prompt dataset from freshly reset episodes of every task.
+    For each task and each seed in ``range(n_per_task)`` the server's
+    ``/reset`` route is called and its initial observation is turned into
+    a system + user chat prompt.  Resets that fail are logged and skipped.
+    Row layout:
+      prompt   — chat-formatted messages (system + user)
+      task_id  — for environment reward replay
+      seed     — for environment reward replay
+    Rows are shuffled before the Dataset is created.
+    """
+    examples: list[dict[str, Any]] = []
+    reset_url = f"{SERVER_URL}/reset"
+    for task_id in ("easy", "medium", "hard"):
+        step_budget = TASK_MAX_STEPS[task_id]
+        for seed in range(n_per_task):
+            payload = {"task_id": task_id, "seed": seed}
+            try:
+                reply = requests.post(reset_url, json=payload, timeout=10)
+                reply.raise_for_status()
+                body = reply.json()
+            except Exception as e:
+                log.warning("Reset failed (task=%s seed=%d): %s", task_id, seed, e)
+                continue
+            user_msg = build_compact_prompt(
+                task_id, body["obs"], step_num=1, max_steps=step_budget
+            )
+            messages = [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user",   "content": user_msg},
+            ]
+            examples.append({"prompt": messages, "task_id": task_id, "seed": seed})
+    random.shuffle(examples)
+    log.info("Generated %d training prompts (%d per task × 3 tasks)", len(examples), n_per_task)
+    return Dataset.from_list(examples)
+# =============================================================================
+# Reward Function 1 — Format Gatekeeper
+# =============================================================================
+# Regex to extract content between <action> and </action> tags
+_ACTION_RE = re.compile(r"<action>\s*(.*?)\s*</action>", re.DOTALL)
+# Required fields and their (min, max) bounds
+_ACTION_FIELDS = {
+    "thrust":            (0.0,  1.0),
+    "steering":          (-1.0, 1.0),
+    "brake":             (0,    1),
+    "vertical_thruster": (-0.2, 0.2),
+}
+def parse_action_from_completion(text: str) -> dict[str, Any] | None:
+    """
+    Extract and validate an action JSON from <action>…</action> tags.
+    Every field in ``_ACTION_FIELDS`` must be present and convertible to a
+    finite number.  Values up to 0.5 outside their bounds are clamped;
+    anything further out is rejected.  ``brake`` is rounded to an int.
+    Returns the parsed (clamped) action dict if valid, None otherwise.
+    Never raises for malformed model output.
+    """
+    match = _ACTION_RE.search(text)
+    if not match:
+        return None
+    try:
+        parsed = json.loads(match.group(1))
+    except json.JSONDecodeError:
+        return None
+    if not isinstance(parsed, dict):
+        return None
+    action: dict[str, Any] = {}
+    for field, (lo, hi) in _ACTION_FIELDS.items():
+        if field not in parsed:
+            return None
+        try:
+            # OverflowError: an integer literal too large for a float
+            num = float(parsed[field])
+        except (TypeError, ValueError, OverflowError):
+            return None
+        # json.loads accepts the non-standard NaN / Infinity literals.
+        # Infinity used to crash int(round(...)) for "brake", and NaN
+        # slipped through the range check and was clamped to the upper bound.
+        if not math.isfinite(num):
+            return None
+        val = int(round(num)) if field == "brake" else num
+        # Reject wildly out-of-range (mild overshoot is clamped, not rejected)
+        if val < lo - 0.5 or val > hi + 0.5:
+            return None
+        # Clamp to valid bounds (int bounds keep "brake" an int)
+        action[field] = max(lo, min(hi, val))
+    return action
+def format_reward_fn(completions: list[Any], **kwargs) -> list[float]:
+    """
+    Reward Function 1 — The Format Gatekeeper.
+    Returns FORMAT_REWARD_GOOD if the completion contains valid
+    <action>JSON</action> matching the rover action schema, and
+    FORMAT_REWARD_BAD on failure.
+    Applies a soft verbosity penalty: completions exceeding
+    VERBOSITY_THRESHOLD tokens are penalised linearly, reaching 0
+    at VERBOSITY_THRESHOLD + VERBOSITY_PENALTY_K tokens.
+    Each completion may be a plain string (standard dataset format) or a
+    list of chat-message dicts (conversational format, which is what TRL
+    passes when the prompts are chat messages).
+    """
+    rewards: list[float] = []
+    for completion in completions:
+        # Normalise to text: a conversational completion is a list of
+        # {"role": ..., "content": ...} dicts and cannot be regex-searched.
+        if isinstance(completion, str):
+            text = completion
+        else:
+            text = "".join(
+                str(msg.get("content") or "")
+                for msg in completion
+                if isinstance(msg, dict)
+            )
+        action = parse_action_from_completion(text)
+        if action is None:
+            rewards.append(FORMAT_REWARD_BAD)
+            continue
+        # Base reward for valid format
+        base = FORMAT_REWARD_GOOD
+        # Soft verbosity penalty — count whitespace-split "tokens" as proxy
+        # (actual BPE count varies, but this is a stable heuristic)
+        token_estimate = len(text.split())
+        if token_estimate > VERBOSITY_THRESHOLD:
+            excess = token_estimate - VERBOSITY_THRESHOLD
+            base *= max(0.0, 1.0 - excess / VERBOSITY_PENALTY_K)
+        rewards.append(base)
+    return rewards
+# =============================================================================
+# Reward Function 2 — Environment Reward
+# =============================================================================
+def environment_reward_fn(completions: list[Any], **kwargs) -> list[float]:
+    """
+    Reward Function 2 — The Environment.
+    For each completion:
+      1. Parse the action from <action> tags.
+      2. Reset a fresh episode with the same (task_id, seed) as the prompt.
+      3. POST the action to /step.
+      4. Return the scalar step reward from the physics engine.
+    If parsing or HTTP fails, returns 0.0 (neutral — no signal).
+    Each completion may be a plain string or a list of chat-message dicts
+    (the conversational format TRL uses when prompts are chat messages).
+    ``task_id`` / ``seed`` are read from the dataset columns forwarded in
+    ``kwargs``; rows without them fall back to task "easy", seed 0.
+    """
+    task_ids: list[str] = kwargs.get("task_id") or []
+    seeds:    list[int] = kwargs.get("seed") or []
+    rewards: list[float] = []
+    for i, completion in enumerate(completions):
+        # -- Normalise to text ---------------------------------------------
+        # A conversational completion is a list of {"role", "content"} dicts.
+        if isinstance(completion, str):
+            text = completion
+        else:
+            text = "".join(
+                str(msg.get("content") or "")
+                for msg in completion
+                if isinstance(msg, dict)
+            )
+        # -- Parse action --------------------------------------------------
+        action = parse_action_from_completion(text)
+        if action is None:
+            rewards.append(0.0)
+            continue
+        # -- Determine episode parameters ----------------------------------
+        # kwargs columns are lists aligned with completions.
+        # With num_generations=4, each prompt's metadata is repeated 4 times.
+        task_id = task_ids[i] if i < len(task_ids) else "easy"
+        seed    = seeds[i]    if i < len(seeds)    else 0
+        try:
+            # Reset a fresh episode with the same seed → identical starting state
+            reset_resp = requests.post(
+                f"{SERVER_URL}/reset",
+                json={"task_id": task_id, "seed": seed},
+                timeout=10,
+            )
+            reset_resp.raise_for_status()
+            episode_id = reset_resp.json()["episode_id"]
+            # Step with the generated action
+            step_resp = requests.post(
+                f"{SERVER_URL}/step",
+                json=action,
+                params={"episode_id": episode_id},
+                timeout=10,
+            )
+            step_resp.raise_for_status()
+            step_data = step_resp.json()
+            # Return the scalar reward from the physics engine
+            rewards.append(float(step_data.get("reward", 0.0)))
+        except Exception as e:
+            # Deliberate best-effort boundary: one failed rollout must not
+            # abort training, so it is logged and scored as neutral.
+            log.warning("Environment reward failed (task=%s seed=%d): %s", task_id, seed, e)
+            rewards.append(0.0)
+    return rewards
+# =============================================================================
+# Model loading
+# =============================================================================
+def load_model():
+    """
+    Load MODEL_NAME in 4-bit through Unsloth and wrap it with LoRA adapters.
+    Adapters are attached to the attention and MLP projection modules.
+    The tokenizer gets a pad token (EOS is reused when none is defined) and
+    left-side padding.  Returns ``(model, tokenizer)``.
+    """
+    log.info("Loading %s with 4-bit NF4 quantisation via Unsloth…", MODEL_NAME)
+    base_model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name     = MODEL_NAME,
+        max_seq_length = MAX_SEQ_LENGTH,
+        dtype          = None,          # let Unsloth pick the dtype
+        load_in_4bit   = True,          # 4-bit weights to fit 6 GB VRAM
+    )
+    attention_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
+    mlp_modules = ["gate_proj", "up_proj", "down_proj"]
+    log.info("Attaching LoRA (rank=%d, alpha=%d) to attention + MLP…", LORA_RANK, LORA_ALPHA)
+    model = FastLanguageModel.get_peft_model(
+        base_model,
+        r              = LORA_RANK,
+        target_modules = attention_modules + mlp_modules,
+        lora_alpha     = LORA_ALPHA,
+        lora_dropout   = LORA_DROPOUT,
+        bias           = "none",
+        use_gradient_checkpointing = "unsloth",
+        random_state   = SEED,
+    )
+    # Batched generation needs a pad token; reuse EOS when none is defined
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # Decoder-only model: pad on the left
+    tokenizer.padding_side = "left"
+    allocated_gb = torch.cuda.memory_allocated() / 1e9
+    log.info("Model loaded. Current VRAM: %.2f GB", allocated_gb)
+    return model, tokenizer
+# =============================================================================
+# Training configuration
+# =============================================================================
+def build_training_config() -> GRPOConfig:
+    """
+    Assemble the GRPOConfig from the module-level hyperparameters.
+    bf16 is enabled when the GPU reports support for it; otherwise fp16
+    is used.  Unused dataset columns are kept so the reward functions
+    still receive ``task_id`` and ``seed``.
+    """
+    use_bf16 = torch.cuda.is_bf16_supported()
+    settings = dict(
+        output_dir             = OUTPUT_DIR,
+        # ── GRPO-specific ─────────────────────────────────────────────
+        num_generations        = NUM_GENERATIONS,
+        max_prompt_length      = MAX_PROMPT_LENGTH,
+        max_completion_length  = MAX_COMPLETION_LENGTH,
+        beta                   = KL_COEF,             # KL penalty coefficient
+        # ── Optimiser ─────────────────────────────────────────────────
+        learning_rate          = LEARNING_RATE,
+        lr_scheduler_type      = "cosine",
+        warmup_ratio           = 0.05,
+        max_grad_norm          = 1.0,
+        # ── Batch / accumulation ──────────────────────────────────────
+        per_device_train_batch_size = PER_DEVICE_BATCH,
+        gradient_accumulation_steps = GRAD_ACCUM_STEPS,
+        num_train_epochs            = NUM_TRAIN_EPOCHS,
+        # ── Precision / memory ────────────────────────────────────────
+        bf16                   = use_bf16,
+        fp16                   = not use_bf16,
+        # ── Logging / saving ──────────────────────────────────────────
+        logging_steps          = 5,
+        save_steps             = 50,
+        save_total_limit       = 3,
+        report_to              = "none",               # set to "wandb" if desired
+        seed                   = SEED,
+        # ── Misc ──────────────────────────────────────────────────────
+        remove_unused_columns  = False,                # keep task_id/seed cols
+    )
+    return GRPOConfig(**settings)
+# =============================================================================
+# Main entry point
+# =============================================================================
+def main() -> None:
+    """
+    Run the full GRPO fine-tuning pipeline.
+    Steps: verify the environment server, load the 4-bit model with LoRA,
+    build the prompt dataset from real episode resets, train with the
+    format and environment reward functions, save the final adapter and
+    report peak VRAM.  Exits with status 1 if the server is unreachable
+    or no training prompt could be generated.
+    """
+    log.info("=" * 60)
+    log.info("GRPO Training — Planetary Rover Navigation")
+    log.info("Model : %s", MODEL_NAME)
+    log.info("VRAM  : 6 GB target (4-bit NF4, LoRA r=%d, group=%d)",
+             LORA_RANK, NUM_GENERATIONS)
+    log.info("=" * 60)
+    # ── 0. Check server ───────────────────────────────────────────────
+    _check_server()
+    # ── 1. Load model + tokenizer ─────────────────────────────────────
+    model, tokenizer = load_model()
+    # ── 2. Generate training dataset ──────────────────────────────────
+    # Use real episode resets.  The previous placeholder dataset repeated one
+    # identical prompt 100 times, carried no task_id/seed columns (so every
+    # environment reward replayed task "easy", seed 0), and its local
+    # SYSTEM_PROMPT asked for only two fields while the format gatekeeper
+    # requires all four — compliant completions therefore always scored 0.
+    train_dataset = generate_training_dataset()
+    if len(train_dataset) == 0:
+        log.error("No training prompts were generated — every /reset call failed.")
+        sys.exit(1)
+    # ── 3. Build GRPO config ──────────────────────────────────────────
+    config = build_training_config()
+    # ── 4. Initialise trainer ─────────────────────────────────────────
+    log.info("Initialising GRPOTrainer with 2 reward functions…")
+    trainer = GRPOTrainer(
+        model            = model,
+        processing_class = tokenizer,   # GRPOTrainer's documented tokenizer argument
+        reward_funcs     = [format_reward_fn, environment_reward_fn],
+        args             = config,
+        train_dataset    = train_dataset,
+    )
+    # ── 5. Train ──────────────────────────────────────────────────────
+    log.info("Starting GRPO training…")
+    start = time.time()
+    trainer.train()
+    elapsed = time.time() - start
+    log.info("Training complete in %.1f minutes.", elapsed / 60)
+    # ── 6. Save final adapter ─────────────────────────────────────────
+    final_path = os.path.join(OUTPUT_DIR, "final_adapter")
+    model.save_pretrained(final_path)
+    tokenizer.save_pretrained(final_path)
+    log.info("Final LoRA adapter saved to %s", final_path)
+    # ── 7. VRAM summary ──────────────────────────────────────────────
+    peak_vram = torch.cuda.max_memory_allocated() / 1e9
+    log.info("Peak VRAM usage: %.2f GB (limit: 6.00 GB)", peak_vram)
+    if peak_vram > 6.0:
+        log.warning("⚠ Peak VRAM exceeded 6 GB! Reduce NUM_GENERATIONS or LORA_RANK.")
+    else:
+        log.info("✅ VRAM stayed within 6 GB budget.")
+if __name__ == "__main__":
+    main()