Spaces:

Prajwal782007
/

Gridmind

Running

App Files Files Community

ShreeshantXD committed 30 days ago

Commit

fd2ceda

1 Parent(s): d012f99

Add GridMind GRPO training notebook for Colab

Browse files

Files changed (1) hide show

scripts/gridmind_grpo_colab.ipynb +343 -0

scripts/gridmind_grpo_colab.ipynb ADDED Viewed

	@@ -0,0 +1,343 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ⚡ GridMind-RL: Training an LLM Energy Controller with Unsloth + GRPO\n",
+    "> Fine-tuning Qwen2.5-1.5B to manage industrial building energy using \n",
+    "> Reinforcement Learning via the GridMind-RL OpenEnv environment.\n",
+    "> \n",
+    "> **Environment:** https://lo-kyu-gridmind.hf.space  \n",
+    "> **Method:** GRPO (Group Relative Policy Optimization)  \n",
+    "> **Framework:** Unsloth + TRL  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "!pip install unsloth openenv-core\n",
+    "!pip install --no-deps bitsandbytes accelerate xformers peft trl triton\n",
+    "!pip install --no-deps cut_cross_entropy unsloth_zoo\n",
+    "!pip install \"datasets>=3.4.1,<4.0.0\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from unsloth import FastLanguageModel\n",
+    "from trl import GRPOTrainer, GRPOConfig\n",
+    "from datasets import Dataset\n",
+    "from openenv.core import GenericEnvClient\n",
+    "import torch, asyncio, json, re, nest_asyncio\n",
+    "nest_asyncio.apply()  # needed for asyncio in Colab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def verify_env():\n",
+    "    async with GenericEnvClient(\n",
+    "            base_url=\"https://lo-kyu-gridmind.hf.space\") as env:\n",
+    "        r = await env.reset()\n",
+    "        print(\"✅ Environment live!\")\n",
+    "        print(\"Observation keys:\", list(r.observation.keys()))\n",
+    "        r2 = await env.step({\n",
+    "            \"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0,\n",
+    "            \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0\n",
+    "        })\n",
+    "        print(f\"Step reward: {r2.reward:.3f}, done: {r2.done}\")\n",
+    "\n",
+    "asyncio.run(verify_env())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_seq_length = 512\n",
+    "lora_rank = 8\n",
+    "\n",
+    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+    "    model_name=\"unsloth/Qwen2.5-1.5B-Instruct\",\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    load_in_4bit=True,\n",
+    ")\n",
+    "\n",
+    "model = FastLanguageModel.get_peft_model(\n",
+    "    model,\n",
+    "    r=lora_rank,\n",
+    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
+    "                    \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+    "    lora_alpha=lora_rank * 2,\n",
+    "    use_gradient_checkpointing=\"unsloth\",\n",
+    "    random_state=42,\n",
+    ")\n",
+    "print(\"✅ Model loaded with Unsloth 4-bit LoRA\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SYSTEM_PROMPT = \"\"\"\\\n",
+    "You are an expert industrial building energy controller.\n",
+    "Each turn you receive the current building state and must respond with \n",
+    "ONLY a valid JSON action object.\n",
+    "\n",
+    "Action format:\n",
+    "{\"hvac_power_level\": <0.0-1.0>, \"thermal_charge_rate\": <-1.0 to 1.0>, \n",
+    " \"batch_job_slot\": <0-4>, \"load_shed_fraction\": <0.0-0.5>}\n",
+    "\n",
+    "Strategy:\n",
+    "- Charge storage when price < $0.08/kWh (positive thermal_charge_rate)\n",
+    "- Discharge storage when price > $0.15/kWh (negative thermal_charge_rate)  \n",
+    "- Shed load 0.3-0.5 when grid_stress_signal > 0.7\n",
+    "- Reduce HVAC during peak hours (8-12, 17-21)\n",
+    "- Keep temperature between 19-23°C\"\"\"\n",
+    "\n",
+    "def make_prompt(i):\n",
+    "    return [{\n",
+    "        \"role\": \"system\", \"content\": SYSTEM_PROMPT\n",
+    "    }, {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": f\"Episode {i+1}: The building simulation is starting. \"\n",
+    "                   \"You will receive the state each step. \"\n",
+    "                   \"Output your first action as JSON now.\"\n",
+    "    }]\n",
+    "\n",
+    "dataset = Dataset.from_dict({\n",
+    "    \"prompt\": [make_prompt(i) for i in range(300)]\n",
+    "})\n",
+    "print(f\"✅ Dataset ready: {len(dataset)} training prompts\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def reward_valid_json(completions, **kwargs):\n",
+    "    \"\"\"Reward 0.3 for any valid JSON output.\"\"\"\n",
+    "    rewards = []\n",
+    "    for completion in completions:\n",
+    "        text = completion[0][\"content\"] if isinstance(completion, list) \\\n",
+    "               else completion\n",
+    "        try:\n",
+    "            match = re.search(r'\\{.*?\\}', text, re.DOTALL)\n",
+    "            if match:\n",
+    "                json.loads(match.group())\n",
+    "                rewards.append(0.3)\n",
+    "            else:\n",
+    "                rewards.append(0.0)\n",
+    "        except Exception:\n",
+    "            rewards.append(0.0)\n",
+    "    return rewards\n",
+    "\n",
+    "def reward_has_required_keys(completions, **kwargs):\n",
+    "    \"\"\"Reward 0.3 if JSON has all 4 required action keys, 0.1 if it parses but lacks some, else 0.0.\"\"\"\n",
+    "    required = {\"hvac_power_level\", \"thermal_charge_rate\", \n",
+    "                \"batch_job_slot\", \"load_shed_fraction\"}\n",
+    "    rewards = []\n",
+    "    for completion in completions:\n",
+    "        text = completion[0][\"content\"] if isinstance(completion, list) \\\n",
+    "               else completion\n",
+    "        try:\n",
+    "            match = re.search(r'\\{.*?\\}', text, re.DOTALL)\n",
+    "            if match:\n",
+    "                action = json.loads(match.group())\n",
+    "                if required.issubset(action.keys()):\n",
+    "                    rewards.append(0.3)\n",
+    "                else:\n",
+    "                    rewards.append(0.1)\n",
+    "            else:\n",
+    "                rewards.append(0.0)\n",
+    "        except Exception:\n",
+    "            rewards.append(0.0)\n",
+    "    return rewards\n",
+    "\n",
+    "def reward_env_interaction(completions, **kwargs):\n",
+    "    \"\"\"\n",
+    "    Reward 0.0-0.4 based on actual environment reward.\n",
+    "    Runs the action against the live GridMind-RL HF Space.\n",
+    "    \"\"\"\n",
+    "    async def run_step(text):\n",
+    "        try:\n",
+    "            match = re.search(r'\\{.*?\\}', text, re.DOTALL)\n",
+    "            action = json.loads(match.group()) if match else {}\n",
+    "            step_action = {\n",
+    "                \"hvac_power_level\": float(\n",
+    "                    max(0, min(1, action.get(\"hvac_power_level\", 0.5)))),\n",
+    "                \"thermal_charge_rate\": float(\n",
+    "                    max(-1, min(1, action.get(\"thermal_charge_rate\", 0.0)))),\n",
+    "                \"batch_job_slot\": int(\n",
+    "                    max(0, min(4, action.get(\"batch_job_slot\", 0)))),\n",
+    "                \"load_shed_fraction\": float(\n",
+    "                    max(0, min(0.5, action.get(\"load_shed_fraction\", 0.0)))),\n",
+    "                \"building_id\": 0\n",
+    "            }\n",
+    "            async with GenericEnvClient(\n",
+    "                    base_url=\"https://lo-kyu-gridmind.hf.space\") as env:\n",
+    "                await env.reset()\n",
+    "                result = await env.step(step_action)\n",
+    "                # Normalize reward to 0-0.4 range\n",
+    "                return min(0.4, max(0.0, result.reward / 25.0))\n",
+    "        except Exception:\n",
+    "            return 0.0\n",
+    "\n",
+    "    rewards = []\n",
+    "    for completion in completions:\n",
+    "        text = completion[0][\"content\"] if isinstance(completion, list) \\\n",
+    "               else completion\n",
+    "        reward = asyncio.run(run_step(text))\n",
+    "        rewards.append(reward)\n",
+    "    return rewards\n",
+    "\n",
+    "print(\"✅ Reward functions defined\")\n",
+    "print(\"  - reward_valid_json: up to 0.3\")\n",
+    "print(\"  - reward_has_required_keys: up to 0.3\")  \n",
+    "print(\"  - reward_env_interaction: up to 0.4 (from live env)\")\n",
+    "print(\"  Total max reward per step: 1.0\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_args = GRPOConfig(\n",
+    "    output_dir=\"gridmind-grpo-unsloth\",\n",
+    "    num_train_epochs=1,\n",
+    "    per_device_train_batch_size=1,\n",
+    "    gradient_accumulation_steps=4,\n",
+    "    num_generations=4,        # GRPO group size\n",
+    "    max_prompt_length=256,\n",
+    "    max_completion_length=128,\n",
+    "    learning_rate=5e-6,\n",
+    "    lr_scheduler_type=\"cosine\",\n",
+    "    warmup_ratio=0.1,\n",
+    "    logging_steps=5,\n",
+    "    save_steps=100,\n",
+    "    fp16=True,\n",
+    "    report_to=\"none\",\n",
+    "    seed=42,\n",
+    ")\n",
+    "print(\"✅ Training config ready\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer = GRPOTrainer(\n",
+    "    model=model,\n",
+    "    tokenizer=tokenizer,\n",
+    "    args=training_args,\n",
+    "    train_dataset=dataset,\n",
+    "    reward_funcs=[\n",
+    "        reward_valid_json,\n",
+    "        reward_has_required_keys,\n",
+    "        reward_env_interaction,\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "print(\"🚀 Starting GRPO training...\")\n",
+    "print(\"This trains the model to output valid energy control actions\")\n",
+    "print(\"that maximize rewards from the live GridMind-RL environment.\\n\")\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 📊 Training Results\n",
+    "\n",
+    "The reward values logged during training above should show the model learning to:\n",
+    "1. Output valid JSON actions (reward_valid_json increases early)\n",
+    "2. Include all required control fields (reward_has_required_keys)\n",
+    "3. Choose actions that maximize energy savings (reward_env_interaction)\n",
+    "\n",
+    "**Baseline** (random actions): ~0.2 average reward  \n",
+    "**After training**: reward should trend toward 0.6-0.8"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"=== Comparing pre-training vs post-training ===\\n\")\n",
+    "\n",
+    "test_state = (\n",
+    "    \"Building state: temp=24.5C, price=$0.18/kWh, \"\n",
+    "    \"storage=0.7, grid_stress=0.85, hour=18, step=60/95\"\n",
+    ")\n",
+    "\n",
+    "messages = [\n",
+    "    {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
+    "    {\"role\": \"user\", \"content\": test_state}\n",
+    "]\n",
+    "\n",
+    "FastLanguageModel.for_inference(model)\n",
+    "inputs = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=True, add_generation_prompt=True,\n",
+    "    return_tensors=\"pt\"\n",
+    ").to(\"cuda\")\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    outputs = model.generate(\n",
+    "        inputs, max_new_tokens=100, temperature=0.1,\n",
+    "        do_sample=True, pad_token_id=tokenizer.eos_token_id\n",
+    "    )\n",
+    "\n",
+    "response = tokenizer.decode(\n",
+    "    outputs[0][inputs.shape[1]:], skip_special_tokens=True\n",
+    ")\n",
+    "print(\"State:\", test_state)\n",
+    "print(\"\\nModel response:\", response)\n",
+    "print(\"\\n(Should output JSON with load_shed_fraction > 0 due to grid_stress=0.85)\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}