Spaces:

Prajwal782007
/

Gridmind

Running

App Files Files Community

Prajwal782007 committed on Apr 25

Commit

18750f8

1 Parent(s): e890cbb

fix: update health check endpoint in GridMind notebook and provide utility script to apply fix

Browse files

Files changed (2) hide show

scratch/fix_health_check.py +26 -0
scripts/gridmind_grpo_colab.ipynb +624 -623

scratch/fix_health_check.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import json
+import os
+notebook_path = r"c:\Projects\gridmind\scripts\gridmind_grpo_colab.ipynb"
+with open(notebook_path, 'r', encoding='utf-8') as f:
+    nb = json.load(f)
+for i, cell in enumerate(nb['cells']):
+    if cell['cell_type'] == "code":
+        source_text = "".join(cell['source'])
+        if "health = requests.get(f\"{ENV_URL}/health\"" in source_text:
+            # Replace the health check with a safer one
+            new_source = []
+            for line in cell['source']:
+                if 'health = requests.get(f"{ENV_URL}/health"' in line:
+                    new_source.append('    r = requests.get(f"{ENV_URL}", timeout=10)\n')
+                    new_source.append('    health = {"status": r.status_code}\n')
+                else:
+                    new_source.append(line)
+            nb['cells'][i]['source'] = new_source
+            print(f"Fixed health check in cell {i}")
+            break
+with open(notebook_path, 'w', encoding='utf-8') as f:
+    json.dump(nb, f, indent=1)

scripts/gridmind_grpo_colab.ipynb CHANGED Viewed

@@ -1,626 +1,627 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "id": "193da661",
-      "metadata": {},
-      "source": [
-        "# GridMind-RL: GRPO Training for Industrial Energy Management\n",
-        "\n",
-        "**Meta PyTorch OpenEnv Hackathon \u00e2\u20ac\u201d GridMind-RL Team**\n",
-        "\n",
-        "This notebook trains a small LLM (Qwen2.5-1.5B) using TRL GRPO on the GridMind-RL environment.\n",
-        "The environment covers all 4 hackathon themes:\n",
-        "\n",
-        "1. **Theme 1: Multi-Agent** \u00e2\u20ac\u201d 3 buildings share a grid feeder; each agent makes independent decisions\n",
-        "2. **Theme 2: Instruction Following** \u00e2\u20ac\u201d Task 4 provides natural language objectives that must be satisfied\n",
-        "3. **Theme 3: World Modeling** \u00e2\u20ac\u201d `/simulate` endpoint predicts outcomes before committing actions\n",
-        "4. **Theme 4: Self-Improvement** \u00e2\u20ac\u201d Curriculum automatically advances difficulty as agent performance improves\n",
-        "\n",
-        "| | |\n",
-        "|---|---|\n",
-        "| **Environment** | https://lo-kyu-gridmind.hf.space |\n",
-        "| **Method** | GRPO (Group Relative Policy Optimization) |\n",
-        "| **Model** | Qwen2.5-1.5B-Instruct |\n",
-        "| **Training Time** | ~30-40 minutes on free Colab T4 GPU |\n",
-        "| **Expected Improvement** | 20-40% score gain over heuristic baseline |"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "f28e2f2c",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Install dependencies\n",
-        "!pip install trl==0.8.6 transformers>=4.41.0 torch accelerate datasets requests -q\n",
-        "\n",
-        "import torch\n",
-        "import sys\n",
-        "\n",
-        "print(f\"PyTorch: {torch.__version__}\")\n",
-        "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
-        "if torch.cuda.is_available():\n",
-        "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
-        "    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "5021a299",
-      "metadata": {},
-      "source": [
-        "## Step 1: Connect to Environment and Verify Connectivity"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "4cdf0f35",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "import requests\n",
-        "import json\n",
-        "import time\n",
-        "\n",
-        "ENV_URL = \"https://lo-kyu-gridmind.hf.space\"\n",
-        "\n",
-        "# Test connectivity\n",
-        "print(\"Testing environment connectivity...\")\n",
-        "try:\n",
-        "    health = requests.get(f\"{ENV_URL}/health\", timeout=10).json()\n",
-        "    print(f\"\u00e2\u0153\u201c Health check: {health}\")\n",
-        "except Exception as e:\n",
-        "    print(f\"\u00e2\u0153\u2014 Health check failed: {e}\")\n",
-        "    sys.exit(1)\n",
-        "\n",
-        "# Test each task reset\n",
-        "print(\"\\nTesting all 4 tasks...\")\n",
-        "for task_id in [1, 2, 3, 4]:\n",
-        "    try:\n",
-        "        r = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=10)\n",
-        "        obs = r.json()\n",
-        "        has_card = \"instruction_card\" in obs or \"observations\" in obs and obs[\"observations\"][0].get(\"instruction_card\")\n",
-        "        print(f\"\u00e2\u0153\u201c Task {task_id}: status={r.status_code}, has_instruction_card={has_card}\")\n",
-        "    except Exception as e:\n",
-        "        print(f\"\u00e2\u0153\u2014 Task {task_id} failed: {e}\")\n",
-        "\n",
-        "# Test coordinator (multi-agent)\n",
-        "print(\"\\nTesting multi-agent coordinator...\")\n",
-        "try:\n",
-        "    r = requests.post(f\"{ENV_URL}/coordinator/reset\", json={}, timeout=10)\n",
-        "    obs = r.json()\n",
-        "    n_buildings = len(obs.get(\"observations\", []))\n",
-        "    print(f\"\u00e2\u0153\u201c Coordinator reset: {n_buildings} buildings\")\n",
-        "except Exception as e:\n",
-        "    print(f\"\u00e2\u0153\u2014 Coordinator failed: {e}\")\n",
-        "\n",
-        "# Test world modeling\n",
-        "print(\"\\nTesting world modeling (/simulate)...\")\n",
-        "try:\n",
-        "    r = requests.post(f\"{ENV_URL}/simulate\", \n",
-        "                      json=[{\"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0, \n",
-        "                             \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0}],\n",
-        "                      timeout=10)\n",
-        "    sim = r.json()\n",
-        "    has_results = \"results\" in sim\n",
-        "    print(f\"\u00e2\u0153\u201c Simulate: has_results={has_results}\")\n",
-        "except Exception as e:\n",
-        "    print(f\"\u00e2\u0153\u2014 Simulate failed: {e}\")\n",
-        "\n",
-        "print(\"\\n\u00e2\u0153\u201c All connectivity checks passed!\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "4a5b58c2",
-      "metadata": {},
-      "source": [
-        "## Step 2: Measure Baseline Performance (Before Training)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "42cecadb",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "import random\n",
-        "\n",
-        "def run_heuristic_episode(task_id=1, max_steps=96):\n",
-        "    \"\"\"Run an episode using a rule-based heuristic policy.\"\"\"\n",
-        "    try:\n",
-        "        r = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=10)\n",
-        "        obs_data = r.json()\n",
-        "        obs = obs_data[\"observations\"][0] if \"observations\" in obs_data else obs_data\n",
-        "    except:\n",
-        "        return 0.0\n",
-        "    \n",
-        "    for step in range(max_steps):\n",
-        "        # Simple heuristic: charge off-peak, discharge peak\n",
-        "        hour = step // 4\n",
-        "        hvac = 0.7 if 8 <= hour <= 18 else 0.3\n",
-        "        charge = 0.6 if hour < 6 else (-0.4 if 14 <= hour <= 18 else 0.0)\n",
-        "        shed = 0.3 if 14 <= hour <= 17 else 0.0\n",
-        "        \n",
-        "        action = {\n",
-        "            \"hvac_power_level\": hvac,\n",
-        "            \"thermal_charge_rate\": charge,\n",
-        "            \"batch_job_slot\": 1 if 22 <= hour or hour <= 5 else 0,\n",
-        "            \"load_shed_fraction\": shed,\n",
-        "            \"building_id\": 0\n",
-        "        }\n",
-        "        \n",
-        "        try:\n",
-        "            r = requests.post(f\"{ENV_URL}/step\", json=action, timeout=8)\n",
-        "            step_data = r.json()\n",
-        "            if isinstance(step_data, list):\n",
-        "                step_data = step_data[0]\n",
-        "            obs = step_data.get(\"observation\", obs)\n",
-        "            if step_data.get(\"done\", False):\n",
-        "                break\n",
-        "        except:\n",
-        "            break\n",
-        "    \n",
-        "    # Get final grade\n",
-        "    try:\n",
-        "        grade = requests.get(f\"{ENV_URL}/grade\", timeout=10).json()\n",
-        "        return float(grade.get(\"score\", 0))\n",
-        "    except:\n",
-        "        return 0.0\n",
-        "\n",
-        "print(\"Measuring heuristic baseline (2 episodes per task)...\")\n",
-        "baseline_scores = {}\n",
-        "for task_id in [1, 2, 3, 4]:\n",
-        "    scores = []\n",
-        "    for ep in range(2):\n",
-        "        score = run_heuristic_episode(task_id=task_id)\n",
-        "        scores.append(score)\n",
-        "        print(f\"  Task {task_id} Episode {ep+1}: {score:.3f}\")\n",
-        "    baseline_scores[task_id] = sum(scores) / len(scores)\n",
-        "\n",
-        "print(f\"\\nHeuristic Baseline Averages:\")\n",
-        "for task_id, avg in baseline_scores.items():\n",
-        "    print(f\"  Task {task_id}: {avg:.3f}\")\n",
-        "print(f\"  Overall: {sum(baseline_scores.values()) / len(baseline_scores):.3f}\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "7abdd330",
-      "metadata": {},
-      "source": [
-        "## Step 3: Build Multi-Theme Training Dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "1c496af9",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Build a dataset that covers all 4 themes\n",
-        "dataset = []\n",
-        "\n",
-        "# Theme 1: Multi-Agent (3 buildings cooperating)\n",
-        "print(\"Building multi-agent theme examples...\")\n",
-        "for i in range(20):\n",
-        "    try:\n",
-        "        resp = requests.post(f\"{ENV_URL}/coordinator/reset\", json={}, timeout=10).json()\n",
-        "        if \"observations\" in resp:\n",
-        "            for b_idx, b_obs in enumerate(resp[\"observations\"]):\n",
-        "                prompt = f\"\"\"You control Building {b_idx} in a 3-building facility.\n",
-        "All buildings share one grid connection (feeder limit: 250 kW).\n",
-        "Your current state: temp={b_obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, \n",
-        "storage={b_obs.get('thermal_storage_level', 0.5):.2f}, \n",
-        "price=${b_obs.get('current_price', 0.1):.3f}/kWh\n",
-        "Grid stress signal: {b_obs.get('grid_stress_signal', 0):.2f}\n",
-        "\n",
-        "You must coordinate with other buildings to keep total feeder load under 250 kW.\n",
-        "Each building decides independently. Respond with your JSON action:\n",
-        "{{\"hvac_power_level\": <0-1>, \"thermal_charge_rate\": <-1 to 1>, \"batch_job_slot\": <0-4>, \n",
-        "\"load_shed_fraction\": <0-0.5>, \"building_id\": {b_idx}}}\"\"\"\n",
-        "                dataset.append({\"prompt\": prompt, \"theme\": \"multi_agent\"})\n",
-        "    except:\n",
-        "        pass\n",
-        "\n",
-        "print(f\"Multi-agent examples: {len([d for d in dataset if d.get('theme')=='multi_agent'])}\")\n",
-        "\n",
-        "# Theme 2: Instruction Following (Task 4 with explicit objectives)\n",
-        "print(\"Building instruction-following theme examples...\")\n",
-        "for i in range(20):\n",
-        "    try:\n",
-        "        resp = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": 4}, timeout=10).json()\n",
-        "        if \"observations\" in resp:\n",
-        "            obs = resp[\"observations\"][0]\n",
-        "            instruction = resp.get(\"instruction_card\", obs.get(\"instruction_card\", {}))\n",
-        "            instruction_text = instruction.get(\"text\", \"Minimize cost\") if isinstance(instruction, dict) else str(instruction)\n",
-        "            prompt = f\"\"\"INSTRUCTION CARD: {instruction_text}\n",
-        "\n",
-        "Current state: temp={obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, \n",
-        "storage={obs.get('thermal_storage_level', 0.5):.2f}, \n",
-        "cost_so_far=${obs.get('cumulative_cost', 0):.2f}, \n",
-        "step={obs.get('step', 0)}/96\n",
-        "\n",
-        "You MUST satisfy the instruction. Output JSON action:\n",
-        "{{\"hvac_power_level\": <0-1>, \"thermal_charge_rate\": <-1 to 1>, \"batch_job_slot\": <0-4>, \n",
-        "\"load_shed_fraction\": <0-0.5>, \"building_id\": 0}}\"\"\"\n",
-        "            dataset.append({\"prompt\": prompt, \"theme\": \"instruction_following\"})\n",
-        "    except:\n",
-        "        pass\n",
-        "\n",
-        "print(f\"Instruction-following examples: {len([d for d in dataset if d.get('theme')=='instruction_following'])}\")\n",
-        "\n",
-        "# Theme 3: World Modeling (use /simulate)\n",
-        "print(\"Building world-modeling theme examples...\")\n",
-        "for task_id in [1, 2]:\n",
-        "    for i in range(10):\n",
-        "        try:\n",
-        "            resp = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=10).json()\n",
-        "            if \"observations\" in resp:\n",
-        "                obs = resp[\"observations\"][0]\n",
-        "                # Simulate 2 candidate actions\n",
-        "                try:\n",
-        "                    sim_a = requests.post(f\"{ENV_URL}/simulate\",\n",
-        "                                         json=[{\"hvac_power_level\": 0.8, \"thermal_charge_rate\": 0.3,\n",
-        "                                                \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0}],\n",
-        "                                         timeout=10).json()\n",
-        "                    sim_b = requests.post(f\"{ENV_URL}/simulate\",\n",
-        "                                         json=[{\"hvac_power_level\": 0.3, \"thermal_charge_rate\": -0.2,\n",
-        "                                                \"batch_job_slot\": 0, \"load_shed_fraction\": 0.2, \"building_id\": 0}],\n",
-        "                                         timeout=10).json()\n",
-        "                    sim_context = \"\\nPredicted outcomes:\\nOption A (high HVAC): efficient\\nOption B (low HVAC): economical\"\n",
-        "                except:\n",
-        "                    sim_context = \"\"\n",
-        "                \n",
-        "                prompt = f\"\"\"Plan your actions using simulation of future outcomes.\n",
-        "State: temp={obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, storage={obs.get('thermal_storage_level', 0.5):.2f}{sim_context}\n",
-        "\n",
-        "Output your best JSON action:\n",
-        "{{\"hvac_power_level\": <0-1>, \"thermal_charge_rate\": <-1 to 1>, \"batch_job_slot\": <0-4>, \n",
-        "\"load_shed_fraction\": <0-0.5>, \"building_id\": 0}}\"\"\"\n",
-        "                dataset.append({\"prompt\": prompt, \"theme\": \"world_modeling\"})\n",
-        "        except:\n",
-        "            pass\n",
-        "\n",
-        "print(f\"World-modeling examples: {len([d for d in dataset if d.get('theme')=='world_modeling'])}\")\n",
-        "\n",
-        "# Theme 4: Self-Improvement (curriculum across difficulties)\n",
-        "print(\"Building self-improvement theme examples...\")\n",
-        "for difficulty in [1, 1, 2, 2, 3, 3]:\n",
-        "    try:\n",
-        "        resp = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": difficulty}, timeout=10).json()\n",
-        "        if \"observations\" in resp:\n",
-        "            obs = resp[\"observations\"][0]\n",
-        "            prompt = f\"\"\"Difficulty Level {difficulty}/3 - Control building energy system.\n",
-        "State: temp={obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, storage={obs.get('thermal_storage_level', 0.5):.2f},\n",
-        "price=${obs.get('current_price', 0.1):.3f}/kWh\n",
-        "\n",
-        "Output JSON action:\n",
-        "{{\"hvac_power_level\": <0-1>, \"thermal_charge_rate\": <-1 to 1>, \"batch_job_slot\": <0-4>, \n",
-        "\"load_shed_fraction\": <0-0.5>, \"building_id\": 0}}\"\"\"\n",
-        "            dataset.append({\"prompt\": prompt, \"theme\": \"curriculum\", \"difficulty\": difficulty})\n",
-        "    except:\n",
-        "        pass\n",
-        "\n",
-        "print(f\"Self-improvement examples: {len([d for d in dataset if d.get('theme')=='curriculum'])}\")\n",
-        "\n",
-        "print(f\"\\nTotal dataset: {len(dataset)} prompts\")\n",
-        "theme_counts = {}\n",
-        "for d in dataset:\n",
-        "    theme = d.get(\"theme\", \"unknown\")\n",
-        "    theme_counts[theme] = theme_counts.get(theme, 0) + 1\n",
-        "print(f\"Theme distribution: {theme_counts}\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "2ed46c06",
-      "metadata": {},
-      "source": [
-        "## Step 4: Load Model and Tokenizer"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "5e5826e4",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
-        "\n",
-        "MODEL_NAME = \"Qwen/Qwen2.5-1.5B-Instruct\"\n",
-        "print(f\"Loading {MODEL_NAME}...\")\n",
-        "\n",
-        "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
-        "if tokenizer.pad_token is None:\n",
-        "    tokenizer.pad_token = tokenizer.eos_token\n",
-        "\n",
-        "model = AutoModelForCausalLM.from_pretrained(\n",
-        "    MODEL_NAME,\n",
-        "    torch_dtype=torch.float16,\n",
-        "    device_map=\"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-        ")\n",
-        "\n",
-        "total_params = sum(p.numel() for p in model.parameters())\n",
-        "print(f\"Model loaded. Parameters: {total_params/1e6:.0f}M\")\n",
-        "print(f\"Device: {next(model.parameters()).device}\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "ba6645a6",
-      "metadata": {},
-      "source": [
-        "## Step 5: Define Reward Function"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "02686008",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "import json as _json\n",
-        "\n",
-        "training_rewards = []\n",
-        "\n",
-        "def gridmind_reward_fn(completions, **kwargs):\n",
-        "    \"\"\"Reward function that calls the real environment.\"\"\"\n",
-        "    rewards = []\n",
-        "    \n",
-        "    for completion in completions:\n",
-        "        try:\n",
-        "            # Extract JSON action from completion\n",
-        "            text = str(completion).strip()\n",
-        "            start = text.rfind('{')\n",
-        "            end = text.rfind('}') + 1\n",
-        "            if start < 0 or end <= start:\n",
-        "                rewards.append(-1.0)\n",
-        "                continue\n",
-        "            \n",
-        "            action_str = text[start:end]\n",
-        "            action = _json.loads(action_str)\n",
-        "            \n",
-        "            # Clamp action to valid ranges\n",
-        "            action[\"hvac_power_level\"] = max(0.0, min(1.0, float(action.get(\"hvac_power_level\", 0.5))))\n",
-        "            action[\"thermal_charge_rate\"] = max(-1.0, min(1.0, float(action.get(\"thermal_charge_rate\", 0.0))))\n",
-        "            action[\"batch_job_slot\"] = max(0, min(4, int(action.get(\"batch_job_slot\", 0))))\n",
-        "            action[\"load_shed_fraction\"] = max(0.0, min(0.5, float(action.get(\"load_shed_fraction\", 0.0))))\n",
-        "            action[\"building_id\"] = int(action.get(\"building_id\", 0))\n",
-        "            \n",
-        "            # Call environment\n",
-        "            r = requests.post(f\"{ENV_URL}/step\", json=action, timeout=8)\n",
-        "            if r.status_code != 200:\n",
-        "                rewards.append(-0.5)\n",
-        "                continue\n",
-        "            \n",
-        "            step_data = r.json()\n",
-        "            if isinstance(step_data, list):\n",
-        "                step_data = step_data[0]\n",
-        "            \n",
-        "            reward = float(step_data.get(\"reward\", 0))\n",
-        "            rewards.append(max(-1.0, min(1.0, reward)))  # Clamp to [-1, 1]\n",
-        "            training_rewards.append(reward)\n",
-        "            \n",
-        "        except Exception as e:\n",
-        "            rewards.append(-1.0)\n",
-        "    \n",
-        "    return rewards\n",
-        "\n",
-        "print(\"Reward function defined.\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "adae3837",
-      "metadata": {},
-      "source": [
-        "## Step 6: Configure and Run GRPO Training"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "ceac8c9d",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "from trl import GRPOTrainer, GRPOConfig\n",
-        "from datasets import Dataset\n",
-        "\n",
-        "# Prepare dataset\n",
-        "train_data = [{\"prompt\": d[\"prompt\"]} for d in dataset]\n",
-        "train_ds = Dataset.from_list(train_data)\n",
-        "\n",
-        "print(f\"Training dataset: {len(train_ds)} prompts\")\n",
-        "print(f\"Sample prompt:\\n{train_data[0]['prompt'][:200]}...\\n\")\n",
-        "\n",
-        "# GRPO config for free T4 GPU\n",
-        "config = GRPOConfig(\n",
-        "    output_dir=\"./gridmind-grpo-output\",\n",
-        "    num_train_epochs=1,\n",
-        "    max_steps=60,  # Complete in ~30-40 min on T4\n",
-        "    per_device_train_batch_size=2,\n",
-        "    gradient_accumulation_steps=2,\n",
-        "    max_new_tokens=100,\n",
-        "    max_prompt_length=512,\n",
-        "    learning_rate=5e-6,\n",
-        "    logging_steps=5,\n",
-        "    save_steps=60,\n",
-        "    fp16=True,\n",
-        "    dataloader_num_workers=0,\n",
-        "    report_to=\"none\",\n",
-        "    num_generations=2,  # 2 generations per prompt for speed\n",
-        ")\n",
-        "\n",
-        "print(\"\\nStarting GRPO training...\")\n",
-        "print(f\"Estimated time: 30-40 minutes on Colab T4 GPU\")\n",
-        "print(f\"Steps: {config.max_steps}, Batch size: {config.per_device_train_batch_size * config.gradient_accumulation_steps}\\n\")\n",
-        "\n",
-        "# Initialize trainer\n",
-        "trainer = GRPOTrainer(\n",
-        "    model=model,\n",
-        "    tokenizer=tokenizer,\n",
-        "    config=config,\n",
-        "    train_dataset=train_ds,\n",
-        "    reward_funcs=gridmind_reward_fn,\n",
-        ")\n",
-        "\n",
-        "# Train\n",
-        "trainer.train()\n",
-        "print(\"\\n\u00e2\u0153\u201c Training complete!\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "c145c8c6",
-      "metadata": {},
-      "source": [
-        "## Step 7: Evaluate Trained Model"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "dac005cc",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "def run_llm_episode(task_id=1, max_steps=96):\n",
-        "    \"\"\"Run an episode using the trained LLM.\"\"\"\n",
-        "    try:\n",
-        "        r = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=10)\n",
-        "        obs_data = r.json()\n",
-        "        obs = obs_data[\"observations\"][0] if \"observations\" in obs_data else obs_data\n",
-        "    except:\n",
-        "        return 0.0\n",
-        "    \n",
-        "    model.eval()\n",
-        "    \n",
-        "    for step in range(max_steps):\n",
-        "        prompt = f\"\"\"Control industrial building energy system.\n",
-        "State: temp={obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, storage={obs.get('thermal_storage_level', 0.5):.2f}\n",
-        "Output JSON action (hvac_power_level 0-1, thermal_charge_rate -1 to 1, batch_job_slot 0-4,\n",
-        "load_shed_fraction 0-0.5, building_id 0):\"\"\"\n",
-        "        \n",
-        "        try:\n",
-        "            inputs = tokenizer(prompt, return_tensors=\"pt\", truncation=True, max_length=400).to(model.device)\n",
-        "            with torch.no_grad():\n",
-        "                outputs = model.generate(**inputs, max_new_tokens=80, do_sample=False, pad_token_id=tokenizer.eos_token_id)\n",
-        "            generated = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)\n",
-        "            \n",
-        "            start = generated.rfind('{')\n",
-        "            end = generated.rfind('}') + 1\n",
-        "            if start >= 0 and end > start:\n",
-        "                action = _json.loads(generated[start:end])\n",
-        "                action[\"hvac_power_level\"] = max(0.0, min(1.0, float(action.get(\"hvac_power_level\", 0.5))))\n",
-        "                action[\"thermal_charge_rate\"] = max(-1.0, min(1.0, float(action.get(\"thermal_charge_rate\", 0.0))))\n",
-        "                action[\"batch_job_slot\"] = max(0, min(4, int(action.get(\"batch_job_slot\", 0))))\n",
-        "                action[\"load_shed_fraction\"] = max(0.0, min(0.5, float(action.get(\"load_shed_fraction\", 0.0))))\n",
-        "                action[\"building_id\"] = 0\n",
-        "            else:\n",
-        "                action = {\"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0, \"batch_job_slot\": 0,\n",
-        "                         \"load_shed_fraction\": 0.0, \"building_id\": 0}\n",
-        "            \n",
-        "            r = requests.post(f\"{ENV_URL}/step\", json=action, timeout=8)\n",
-        "            step_data = r.json()\n",
-        "            if isinstance(step_data, list):\n",
-        "                step_data = step_data[0]\n",
-        "            obs = step_data.get(\"observation\", obs)\n",
-        "            if step_data.get(\"done\", False):\n",
-        "                break\n",
-        "        except:\n",
-        "            break\n",
-        "    \n",
-        "    try:\n",
-        "        grade = requests.get(f\"{ENV_URL}/grade\", timeout=10).json()\n",
-        "        return float(grade.get(\"score\", 0))\n",
-        "    except:\n",
-        "        return 0.0\n",
-        "\n",
-        "print(\"Evaluating trained model (2 episodes per task)...\")\n",
-        "trained_scores = {}\n",
-        "for task_id in [1, 2, 3, 4]:\n",
-        "    scores = []\n",
-        "    for ep in range(2):\n",
-        "        score = run_llm_episode(task_id=task_id)\n",
-        "        scores.append(score)\n",
-        "        print(f\"  Task {task_id} Episode {ep+1}: {score:.3f}\")\n",
-        "    trained_scores[task_id] = sum(scores) / len(scores)\n",
-        "\n",
-        "print(f\"\\nTrained Model Scores:\")\n",
-        "for task_id, avg in trained_scores.items():\n",
-        "    baseline = baseline_scores[task_id]\n",
-        "    improvement = ((avg - baseline) / baseline * 100) if baseline > 0 else 0\n",
-        "    print(f\"  Task {task_id}: {avg:.3f} (baseline: {baseline:.3f}, {improvement:+.1f}%)\")\n",
-        "\n",
-        "trained_avg = sum(trained_scores.values()) / len(trained_scores)\n",
-        "baseline_avg = sum(baseline_scores.values()) / len(baseline_scores)\n",
-        "overall_improvement = ((trained_avg - baseline_avg) / baseline_avg * 100) if baseline_avg > 0 else 0\n",
-        "\n",
-        "print(f\"\\nOverall Scores:\")\n",
-        "print(f\"  Heuristic baseline: {baseline_avg:.3f}\")\n",
-        "print(f\"  Trained LLM:        {trained_avg:.3f}\")\n",
-        "print(f\"  Improvement:        {overall_improvement:+.1f}%\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "id": "0f955e71",
-      "metadata": {},
-      "source": [
-        "## Step 8: Save Results"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "00844cb1",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "results = {\n",
-        "    \"heuristic_baseline\": {\n",
-        "        \"scores_by_task\": {str(k): v for k, v in baseline_scores.items()},\n",
-        "        \"average\": baseline_avg\n",
-        "    },\n",
-        "    \"trained_llm\": {\n",
-        "        \"scores_by_task\": {str(k): v for k, v in trained_scores.items()},\n",
-        "        \"average\": trained_avg\n",
-        "    },\n",
-        "    \"improvement_percent\": overall_improvement,\n",
-        "    \"model\": MODEL_NAME,\n",
-        "    \"training_steps\": config.max_steps,\n",
-        "    \"themes_covered\": [\"multi_agent\", \"instruction_following\", \"world_modeling\", \"curriculum\"],\n",
-        "    \"training_rewards_log\": training_rewards[-20:] if training_rewards else [],\n",
-        "}\n",
-        "\n",
-        "print(\"Saving results...\")\n",
-        "with open(\"gridmind_training_results.json\", \"w\") as f:\n",
-        "    _json.dump(results, f, indent=2)\n",
-        "\n",
-        "print(\"\u00e2\u0153\u201c Results saved to gridmind_training_results.json\")\n",
-        "print(f\"\\nSummary:\")\n",
-        "print(f\"  Model: {MODEL_NAME}\")\n",
-        "print(f\"  Themes: {results['themes_covered']}\")\n",
-        "print(f\"  Heuristic baseline: {baseline_avg:.3f}\")\n",
-        "print(f\"  Trained LLM: {trained_avg:.3f}\")\n",
-        "print(f\"  Improvement: {overall_improvement:+.1f}%\")"
-      ]
-    }
-  ],
-  "metadata": {
-    "language_info": {
-      "name": "python"
-    }
   },
-  "nbformat": 4,
-  "nbformat_minor": 5
 }

 {
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "193da661",
+   "metadata": {},
+   "source": [
+    "# GridMind-RL: GRPO Training for Industrial Energy Management\n",
+    "\n",
+    "**Meta PyTorch OpenEnv Hackathon \u2014 GridMind-RL Team**\n",
+    "\n",
+    "This notebook trains a small LLM (Qwen2.5-1.5B) using TRL GRPO on the GridMind-RL environment.\n",
+    "The environment covers all 4 hackathon themes:\n",
+    "\n",
+    "1. **Theme 1: Multi-Agent** \u2014 3 buildings share a grid feeder; each agent makes independent decisions\n",
+    "2. **Theme 2: Instruction Following** \u2014 Task 4 provides natural language objectives that must be satisfied\n",
+    "3. **Theme 3: World Modeling** \u2014 `/simulate` endpoint predicts outcomes before committing actions\n",
+    "4. **Theme 4: Self-Improvement** \u2014 Curriculum automatically advances difficulty as agent performance improves\n",
+    "\n",
+    "| | |\n",
+    "|---|---|\n",
+    "| **Environment** | https://lo-kyu-gridmind.hf.space |\n",
+    "| **Method** | GRPO (Group Relative Policy Optimization) |\n",
+    "| **Model** | Qwen2.5-1.5B-Instruct |\n",
+    "| **Training Time** | ~30-40 minutes on free Colab T4 GPU |\n",
+    "| **Expected Improvement** | 20-40% score gain over heuristic baseline |"
+   ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f28e2f2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install dependencies\n",
+    "!pip install trl==0.8.6 transformers>=4.41.0 torch accelerate datasets requests -q\n",
+    "\n",
+    "import torch\n",
+    "import sys\n",
+    "\n",
+    "print(f\"PyTorch: {torch.__version__}\")\n",
+    "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
+    "if torch.cuda.is_available():\n",
+    "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
+    "    print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5021a299",
+   "metadata": {},
+   "source": [
+    "## Step 1: Connect to Environment and Verify Connectivity"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cdf0f35",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import json\n",
+    "import time\n",
+    "\n",
+    "ENV_URL = \"https://lo-kyu-gridmind.hf.space\"\n",
+    "\n",
+    "# Test connectivity\n",
+    "print(\"Testing environment connectivity...\")\n",
+    "try:\n",
+    "    r = requests.get(f\"{ENV_URL}\", timeout=10)\n",
+    "    health = {\"status\": r.status_code}\n",
+    "    print(f\"\u00e2\u0153\u201c Health check: {health}\")\n",
+    "except Exception as e:\n",
+    "    print(f\"\u00e2\u0153\u2014 Health check failed: {e}\")\n",
+    "    sys.exit(1)\n",
+    "\n",
+    "# Test each task reset\n",
+    "print(\"\\nTesting all 4 tasks...\")\n",
+    "for task_id in [1, 2, 3, 4]:\n",
+    "    try:\n",
+    "        r = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=10)\n",
+    "        obs = r.json()\n",
+    "        has_card = \"instruction_card\" in obs or \"observations\" in obs and obs[\"observations\"][0].get(\"instruction_card\")\n",
+    "        print(f\"\u00e2\u0153\u201c Task {task_id}: status={r.status_code}, has_instruction_card={has_card}\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"\u00e2\u0153\u2014 Task {task_id} failed: {e}\")\n",
+    "\n",
+    "# Test coordinator (multi-agent)\n",
+    "print(\"\\nTesting multi-agent coordinator...\")\n",
+    "try:\n",
+    "    r = requests.post(f\"{ENV_URL}/coordinator/reset\", json={}, timeout=10)\n",
+    "    obs = r.json()\n",
+    "    n_buildings = len(obs.get(\"observations\", []))\n",
+    "    print(f\"\u00e2\u0153\u201c Coordinator reset: {n_buildings} buildings\")\n",
+    "except Exception as e:\n",
+    "    print(f\"\u00e2\u0153\u2014 Coordinator failed: {e}\")\n",
+    "\n",
+    "# Test world modeling\n",
+    "print(\"\\nTesting world modeling (/simulate)...\")\n",
+    "try:\n",
+    "    r = requests.post(f\"{ENV_URL}/simulate\", \n",
+    "                      json=[{\"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0, \n",
+    "                             \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0}],\n",
+    "                      timeout=10)\n",
+    "    sim = r.json()\n",
+    "    has_results = \"results\" in sim\n",
+    "    print(f\"\u00e2\u0153\u201c Simulate: has_results={has_results}\")\n",
+    "except Exception as e:\n",
+    "    print(f\"\u00e2\u0153\u2014 Simulate failed: {e}\")\n",
+    "\n",
+    "print(\"\\n\u00e2\u0153\u201c All connectivity checks passed!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4a5b58c2",
+   "metadata": {},
+   "source": [
+    "## Step 2: Measure Baseline Performance (Before Training)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42cecadb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "\n",
+    "def run_heuristic_episode(task_id=1, max_steps=96):\n",
+    "    \"\"\"Run an episode using a rule-based heuristic policy.\"\"\"\n",
+    "    try:\n",
+    "        r = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=10)\n",
+    "        obs_data = r.json()\n",
+    "        obs = obs_data[\"observations\"][0] if \"observations\" in obs_data else obs_data\n",
+    "    except:\n",
+    "        return 0.0\n",
+    "    \n",
+    "    for step in range(max_steps):\n",
+    "        # Simple heuristic: charge off-peak, discharge peak\n",
+    "        hour = step // 4\n",
+    "        hvac = 0.7 if 8 <= hour <= 18 else 0.3\n",
+    "        charge = 0.6 if hour < 6 else (-0.4 if 14 <= hour <= 18 else 0.0)\n",
+    "        shed = 0.3 if 14 <= hour <= 17 else 0.0\n",
+    "        \n",
+    "        action = {\n",
+    "            \"hvac_power_level\": hvac,\n",
+    "            \"thermal_charge_rate\": charge,\n",
+    "            \"batch_job_slot\": 1 if 22 <= hour or hour <= 5 else 0,\n",
+    "            \"load_shed_fraction\": shed,\n",
+    "            \"building_id\": 0\n",
+    "        }\n",
+    "        \n",
+    "        try:\n",
+    "            r = requests.post(f\"{ENV_URL}/step\", json=action, timeout=8)\n",
+    "            step_data = r.json()\n",
+    "            if isinstance(step_data, list):\n",
+    "                step_data = step_data[0]\n",
+    "            obs = step_data.get(\"observation\", obs)\n",
+    "            if step_data.get(\"done\", False):\n",
+    "                break\n",
+    "        except:\n",
+    "            break\n",
+    "    \n",
+    "    # Get final grade\n",
+    "    try:\n",
+    "        grade = requests.get(f\"{ENV_URL}/grade\", timeout=10).json()\n",
+    "        return float(grade.get(\"score\", 0))\n",
+    "    except:\n",
+    "        return 0.0\n",
+    "\n",
+    "print(\"Measuring heuristic baseline (2 episodes per task)...\")\n",
+    "baseline_scores = {}\n",
+    "for task_id in [1, 2, 3, 4]:\n",
+    "    scores = []\n",
+    "    for ep in range(2):\n",
+    "        score = run_heuristic_episode(task_id=task_id)\n",
+    "        scores.append(score)\n",
+    "        print(f\"  Task {task_id} Episode {ep+1}: {score:.3f}\")\n",
+    "    baseline_scores[task_id] = sum(scores) / len(scores)\n",
+    "\n",
+    "print(f\"\\nHeuristic Baseline Averages:\")\n",
+    "for task_id, avg in baseline_scores.items():\n",
+    "    print(f\"  Task {task_id}: {avg:.3f}\")\n",
+    "print(f\"  Overall: {sum(baseline_scores.values()) / len(baseline_scores):.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7abdd330",
+   "metadata": {},
+   "source": [
+    "## Step 3: Build Multi-Theme Training Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c496af9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build a dataset that covers all 4 themes\n",
+    "dataset = []\n",
+    "\n",
+    "# Theme 1: Multi-Agent (3 buildings cooperating)\n",
+    "print(\"Building multi-agent theme examples...\")\n",
+    "for i in range(20):\n",
+    "    try:\n",
+    "        resp = requests.post(f\"{ENV_URL}/coordinator/reset\", json={}, timeout=10).json()\n",
+    "        if \"observations\" in resp:\n",
+    "            for b_idx, b_obs in enumerate(resp[\"observations\"]):\n",
+    "                prompt = f\"\"\"You control Building {b_idx} in a 3-building facility.\n",
+    "All buildings share one grid connection (feeder limit: 250 kW).\n",
+    "Your current state: temp={b_obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, \n",
+    "storage={b_obs.get('thermal_storage_level', 0.5):.2f}, \n",
+    "price=${b_obs.get('current_price', 0.1):.3f}/kWh\n",
+    "Grid stress signal: {b_obs.get('grid_stress_signal', 0):.2f}\n",
+    "\n",
+    "You must coordinate with other buildings to keep total feeder load under 250 kW.\n",
+    "Each building decides independently. Respond with your JSON action:\n",
+    "{{\"hvac_power_level\": <0-1>, \"thermal_charge_rate\": <-1 to 1>, \"batch_job_slot\": <0-4>, \n",
+    "\"load_shed_fraction\": <0-0.5>, \"building_id\": {b_idx}}}\"\"\"\n",
+    "                dataset.append({\"prompt\": prompt, \"theme\": \"multi_agent\"})\n",
+    "    except:\n",
+    "        pass\n",
+    "\n",
+    "print(f\"Multi-agent examples: {len([d for d in dataset if d.get('theme')=='multi_agent'])}\")\n",
+    "\n",
+    "# Theme 2: Instruction Following (Task 4 with explicit objectives)\n",
+    "print(\"Building instruction-following theme examples...\")\n",
+    "for i in range(20):\n",
+    "    try:\n",
+    "        resp = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": 4}, timeout=10).json()\n",
+    "        if \"observations\" in resp:\n",
+    "            obs = resp[\"observations\"][0]\n",
+    "            instruction = resp.get(\"instruction_card\", obs.get(\"instruction_card\", {}))\n",
+    "            instruction_text = instruction.get(\"text\", \"Minimize cost\") if isinstance(instruction, dict) else str(instruction)\n",
+    "            prompt = f\"\"\"INSTRUCTION CARD: {instruction_text}\n",
+    "\n",
+    "Current state: temp={obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, \n",
+    "storage={obs.get('thermal_storage_level', 0.5):.2f}, \n",
+    "cost_so_far=${obs.get('cumulative_cost', 0):.2f}, \n",
+    "step={obs.get('step', 0)}/96\n",
+    "\n",
+    "You MUST satisfy the instruction. Output JSON action:\n",
+    "{{\"hvac_power_level\": <0-1>, \"thermal_charge_rate\": <-1 to 1>, \"batch_job_slot\": <0-4>, \n",
+    "\"load_shed_fraction\": <0-0.5>, \"building_id\": 0}}\"\"\"\n",
+    "            dataset.append({\"prompt\": prompt, \"theme\": \"instruction_following\"})\n",
+    "    except:\n",
+    "        pass\n",
+    "\n",
+    "print(f\"Instruction-following examples: {len([d for d in dataset if d.get('theme')=='instruction_following'])}\")\n",
+    "\n",
+    "# Theme 3: World Modeling (use /simulate)\n",
+    "print(\"Building world-modeling theme examples...\")\n",
+    "for task_id in [1, 2]:\n",
+    "    for i in range(10):\n",
+    "        try:\n",
+    "            resp = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=10).json()\n",
+    "            if \"observations\" in resp:\n",
+    "                obs = resp[\"observations\"][0]\n",
+    "                # Simulate 2 candidate actions\n",
+    "                try:\n",
+    "                    sim_a = requests.post(f\"{ENV_URL}/simulate\",\n",
+    "                                         json=[{\"hvac_power_level\": 0.8, \"thermal_charge_rate\": 0.3,\n",
+    "                                                \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0}],\n",
+    "                                         timeout=10).json()\n",
+    "                    sim_b = requests.post(f\"{ENV_URL}/simulate\",\n",
+    "                                         json=[{\"hvac_power_level\": 0.3, \"thermal_charge_rate\": -0.2,\n",
+    "                                                \"batch_job_slot\": 0, \"load_shed_fraction\": 0.2, \"building_id\": 0}],\n",
+    "                                         timeout=10).json()\n",
+    "                    sim_context = \"\\nPredicted outcomes:\\nOption A (high HVAC): efficient\\nOption B (low HVAC): economical\"\n",
+    "                except:\n",
+    "                    sim_context = \"\"\n",
+    "                \n",
+    "                prompt = f\"\"\"Plan your actions using simulation of future outcomes.\n",
+    "State: temp={obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, storage={obs.get('thermal_storage_level', 0.5):.2f}{sim_context}\n",
+    "\n",
+    "Output your best JSON action:\n",
+    "{{\"hvac_power_level\": <0-1>, \"thermal_charge_rate\": <-1 to 1>, \"batch_job_slot\": <0-4>, \n",
+    "\"load_shed_fraction\": <0-0.5>, \"building_id\": 0}}\"\"\"\n",
+    "                dataset.append({\"prompt\": prompt, \"theme\": \"world_modeling\"})\n",
+    "        except:\n",
+    "            pass\n",
+    "\n",
+    "print(f\"World-modeling examples: {len([d for d in dataset if d.get('theme')=='world_modeling'])}\")\n",
+    "\n",
+    "# Theme 4: Self-Improvement (curriculum across difficulties)\n",
+    "print(\"Building self-improvement theme examples...\")\n",
+    "for difficulty in [1, 1, 2, 2, 3, 3]:\n",
+    "    try:\n",
+    "        resp = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": difficulty}, timeout=10).json()\n",
+    "        if \"observations\" in resp:\n",
+    "            obs = resp[\"observations\"][0]\n",
+    "            prompt = f\"\"\"Difficulty Level {difficulty}/3 - Control building energy system.\n",
+    "State: temp={obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, storage={obs.get('thermal_storage_level', 0.5):.2f},\n",
+    "price=${obs.get('current_price', 0.1):.3f}/kWh\n",
+    "\n",
+    "Output JSON action:\n",
+    "{{\"hvac_power_level\": <0-1>, \"thermal_charge_rate\": <-1 to 1>, \"batch_job_slot\": <0-4>, \n",
+    "\"load_shed_fraction\": <0-0.5>, \"building_id\": 0}}\"\"\"\n",
+    "            dataset.append({\"prompt\": prompt, \"theme\": \"curriculum\", \"difficulty\": difficulty})\n",
+    "    except:\n",
+    "        pass\n",
+    "\n",
+    "print(f\"Self-improvement examples: {len([d for d in dataset if d.get('theme')=='curriculum'])}\")\n",
+    "\n",
+    "print(f\"\\nTotal dataset: {len(dataset)} prompts\")\n",
+    "theme_counts = {}\n",
+    "for d in dataset:\n",
+    "    theme = d.get(\"theme\", \"unknown\")\n",
+    "    theme_counts[theme] = theme_counts.get(theme, 0) + 1\n",
+    "print(f\"Theme distribution: {theme_counts}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ed46c06",
+   "metadata": {},
+   "source": [
+    "## Step 4: Load Model and Tokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e5826e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "\n",
+    "MODEL_NAME = \"Qwen/Qwen2.5-1.5B-Instruct\"\n",
+    "print(f\"Loading {MODEL_NAME}...\")\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+    "if tokenizer.pad_token is None:\n",
+    "    tokenizer.pad_token = tokenizer.eos_token\n",
+    "\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    MODEL_NAME,\n",
+    "    torch_dtype=torch.float16,\n",
+    "    device_map=\"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    ")\n",
+    "\n",
+    "total_params = sum(p.numel() for p in model.parameters())\n",
+    "print(f\"Model loaded. Parameters: {total_params/1e6:.0f}M\")\n",
+    "print(f\"Device: {next(model.parameters()).device}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba6645a6",
+   "metadata": {},
+   "source": [
+    "## Step 5: Define Reward Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02686008",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json as _json\n",
+    "\n",
+    "training_rewards = []\n",
+    "\n",
+    "def gridmind_reward_fn(completions, **kwargs):\n",
+    "    \"\"\"Reward function that calls the real environment.\"\"\"\n",
+    "    rewards = []\n",
+    "    \n",
+    "    for completion in completions:\n",
+    "        try:\n",
+    "            # Extract JSON action from completion\n",
+    "            text = str(completion).strip()\n",
+    "            start = text.rfind('{')\n",
+    "            end = text.rfind('}') + 1\n",
+    "            if start < 0 or end <= start:\n",
+    "                rewards.append(-1.0)\n",
+    "                continue\n",
+    "            \n",
+    "            action_str = text[start:end]\n",
+    "            action = _json.loads(action_str)\n",
+    "            \n",
+    "            # Clamp action to valid ranges\n",
+    "            action[\"hvac_power_level\"] = max(0.0, min(1.0, float(action.get(\"hvac_power_level\", 0.5))))\n",
+    "            action[\"thermal_charge_rate\"] = max(-1.0, min(1.0, float(action.get(\"thermal_charge_rate\", 0.0))))\n",
+    "            action[\"batch_job_slot\"] = max(0, min(4, int(action.get(\"batch_job_slot\", 0))))\n",
+    "            action[\"load_shed_fraction\"] = max(0.0, min(0.5, float(action.get(\"load_shed_fraction\", 0.0))))\n",
+    "            action[\"building_id\"] = int(action.get(\"building_id\", 0))\n",
+    "            \n",
+    "            # Call environment\n",
+    "            r = requests.post(f\"{ENV_URL}/step\", json=action, timeout=8)\n",
+    "            if r.status_code != 200:\n",
+    "                rewards.append(-0.5)\n",
+    "                continue\n",
+    "            \n",
+    "            step_data = r.json()\n",
+    "            if isinstance(step_data, list):\n",
+    "                step_data = step_data[0]\n",
+    "            \n",
+    "            reward = float(step_data.get(\"reward\", 0))\n",
+    "            rewards.append(max(-1.0, min(1.0, reward)))  # Clamp to [-1, 1]\n",
+    "            training_rewards.append(reward)\n",
+    "            \n",
+    "        except Exception as e:\n",
+    "            rewards.append(-1.0)\n",
+    "    \n",
+    "    return rewards\n",
+    "\n",
+    "print(\"Reward function defined.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "adae3837",
+   "metadata": {},
+   "source": [
+    "## Step 6: Configure and Run GRPO Training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ceac8c9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from trl import GRPOTrainer, GRPOConfig\n",
+    "from datasets import Dataset\n",
+    "\n",
+    "# Prepare dataset\n",
+    "train_data = [{\"prompt\": d[\"prompt\"]} for d in dataset]\n",
+    "train_ds = Dataset.from_list(train_data)\n",
+    "\n",
+    "print(f\"Training dataset: {len(train_ds)} prompts\")\n",
+    "print(f\"Sample prompt:\\n{train_data[0]['prompt'][:200]}...\\n\")\n",
+    "\n",
+    "# GRPO config for free T4 GPU\n",
+    "config = GRPOConfig(\n",
+    "    output_dir=\"./gridmind-grpo-output\",\n",
+    "    num_train_epochs=1,\n",
+    "    max_steps=60,  # Complete in ~30-40 min on T4\n",
+    "    per_device_train_batch_size=2,\n",
+    "    gradient_accumulation_steps=2,\n",
+    "    max_new_tokens=100,\n",
+    "    max_prompt_length=512,\n",
+    "    learning_rate=5e-6,\n",
+    "    logging_steps=5,\n",
+    "    save_steps=60,\n",
+    "    fp16=True,\n",
+    "    dataloader_num_workers=0,\n",
+    "    report_to=\"none\",\n",
+    "    num_generations=2,  # 2 generations per prompt for speed\n",
+    ")\n",
+    "\n",
+    "print(\"\\nStarting GRPO training...\")\n",
+    "print(f\"Estimated time: 30-40 minutes on Colab T4 GPU\")\n",
+    "print(f\"Steps: {config.max_steps}, Batch size: {config.per_device_train_batch_size * config.gradient_accumulation_steps}\\n\")\n",
+    "\n",
+    "# Initialize trainer\n",
+    "trainer = GRPOTrainer(\n",
+    "    model=model,\n",
+    "    tokenizer=tokenizer,\n",
+    "    config=config,\n",
+    "    train_dataset=train_ds,\n",
+    "    reward_funcs=gridmind_reward_fn,\n",
+    ")\n",
+    "\n",
+    "# Train\n",
+    "trainer.train()\n",
+    "print(\"\\n\u00e2\u0153\u201c Training complete!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c145c8c6",
+   "metadata": {},
+   "source": [
+    "## Step 7: Evaluate Trained Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dac005cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def run_llm_episode(task_id=1, max_steps=96):\n",
+    "    \"\"\"Run an episode using the trained LLM.\"\"\"\n",
+    "    try:\n",
+    "        r = requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=10)\n",
+    "        obs_data = r.json()\n",
+    "        obs = obs_data[\"observations\"][0] if \"observations\" in obs_data else obs_data\n",
+    "    except:\n",
+    "        return 0.0\n",
+    "    \n",
+    "    model.eval()\n",
+    "    \n",
+    "    for step in range(max_steps):\n",
+    "        prompt = f\"\"\"Control industrial building energy system.\n",
+    "State: temp={obs.get('indoor_temperature', 21):.1f}\u00c2\u00b0C, storage={obs.get('thermal_storage_level', 0.5):.2f}\n",
+    "Output JSON action (hvac_power_level 0-1, thermal_charge_rate -1 to 1, batch_job_slot 0-4,\n",
+    "load_shed_fraction 0-0.5, building_id 0):\"\"\"\n",
+    "        \n",
+    "        try:\n",
+    "            inputs = tokenizer(prompt, return_tensors=\"pt\", truncation=True, max_length=400).to(model.device)\n",
+    "            with torch.no_grad():\n",
+    "                outputs = model.generate(**inputs, max_new_tokens=80, do_sample=False, pad_token_id=tokenizer.eos_token_id)\n",
+    "            generated = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)\n",
+    "            \n",
+    "            start = generated.rfind('{')\n",
+    "            end = generated.rfind('}') + 1\n",
+    "            if start >= 0 and end > start:\n",
+    "                action = _json.loads(generated[start:end])\n",
+    "                action[\"hvac_power_level\"] = max(0.0, min(1.0, float(action.get(\"hvac_power_level\", 0.5))))\n",
+    "                action[\"thermal_charge_rate\"] = max(-1.0, min(1.0, float(action.get(\"thermal_charge_rate\", 0.0))))\n",
+    "                action[\"batch_job_slot\"] = max(0, min(4, int(action.get(\"batch_job_slot\", 0))))\n",
+    "                action[\"load_shed_fraction\"] = max(0.0, min(0.5, float(action.get(\"load_shed_fraction\", 0.0))))\n",
+    "                action[\"building_id\"] = 0\n",
+    "            else:\n",
+    "                action = {\"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0, \"batch_job_slot\": 0,\n",
+    "                         \"load_shed_fraction\": 0.0, \"building_id\": 0}\n",
+    "            \n",
+    "            r = requests.post(f\"{ENV_URL}/step\", json=action, timeout=8)\n",
+    "            step_data = r.json()\n",
+    "            if isinstance(step_data, list):\n",
+    "                step_data = step_data[0]\n",
+    "            obs = step_data.get(\"observation\", obs)\n",
+    "            if step_data.get(\"done\", False):\n",
+    "                break\n",
+    "        except:\n",
+    "            break\n",
+    "    \n",
+    "    try:\n",
+    "        grade = requests.get(f\"{ENV_URL}/grade\", timeout=10).json()\n",
+    "        return float(grade.get(\"score\", 0))\n",
+    "    except:\n",
+    "        return 0.0\n",
+    "\n",
+    "print(\"Evaluating trained model (2 episodes per task)...\")\n",
+    "trained_scores = {}\n",
+    "for task_id in [1, 2, 3, 4]:\n",
+    "    scores = []\n",
+    "    for ep in range(2):\n",
+    "        score = run_llm_episode(task_id=task_id)\n",
+    "        scores.append(score)\n",
+    "        print(f\"  Task {task_id} Episode {ep+1}: {score:.3f}\")\n",
+    "    trained_scores[task_id] = sum(scores) / len(scores)\n",
+    "\n",
+    "print(f\"\\nTrained Model Scores:\")\n",
+    "for task_id, avg in trained_scores.items():\n",
+    "    baseline = baseline_scores[task_id]\n",
+    "    improvement = ((avg - baseline) / baseline * 100) if baseline > 0 else 0\n",
+    "    print(f\"  Task {task_id}: {avg:.3f} (baseline: {baseline:.3f}, {improvement:+.1f}%)\")\n",
+    "\n",
+    "trained_avg = sum(trained_scores.values()) / len(trained_scores)\n",
+    "baseline_avg = sum(baseline_scores.values()) / len(baseline_scores)\n",
+    "overall_improvement = ((trained_avg - baseline_avg) / baseline_avg * 100) if baseline_avg > 0 else 0\n",
+    "\n",
+    "print(f\"\\nOverall Scores:\")\n",
+    "print(f\"  Heuristic baseline: {baseline_avg:.3f}\")\n",
+    "print(f\"  Trained LLM:        {trained_avg:.3f}\")\n",
+    "print(f\"  Improvement:        {overall_improvement:+.1f}%\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f955e71",
+   "metadata": {},
+   "source": [
+    "## Step 8: Save Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00844cb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = {\n",
+    "    \"heuristic_baseline\": {\n",
+    "        \"scores_by_task\": {str(k): v for k, v in baseline_scores.items()},\n",
+    "        \"average\": baseline_avg\n",
+    "    },\n",
+    "    \"trained_llm\": {\n",
+    "        \"scores_by_task\": {str(k): v for k, v in trained_scores.items()},\n",
+    "        \"average\": trained_avg\n",
+    "    },\n",
+    "    \"improvement_percent\": overall_improvement,\n",
+    "    \"model\": MODEL_NAME,\n",
+    "    \"training_steps\": config.max_steps,\n",
+    "    \"themes_covered\": [\"multi_agent\", \"instruction_following\", \"world_modeling\", \"curriculum\"],\n",
+    "    \"training_rewards_log\": training_rewards[-20:] if training_rewards else [],\n",
+    "}\n",
+    "\n",
+    "print(\"Saving results...\")\n",
+    "with open(\"gridmind_training_results.json\", \"w\") as f:\n",
+    "    _json.dump(results, f, indent=2)\n",
+    "\n",
+    "print(\"\u00e2\u0153\u201c Results saved to gridmind_training_results.json\")\n",
+    "print(f\"\\nSummary:\")\n",
+    "print(f\"  Model: {MODEL_NAME}\")\n",
+    "print(f\"  Themes: {results['themes_covered']}\")\n",
+    "print(f\"  Heuristic baseline: {baseline_avg:.3f}\")\n",
+    "print(f\"  Trained LLM: {trained_avg:.3f}\")\n",
+    "print(f\"  Improvement: {overall_improvement:+.1f}%\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
 }