Spaces:

Eshit
/

Wildfire-Containment-Simulator

Sleeping

App Files Files Community

Eshit committed 26 days ago

Commit

423d538

1 Parent(s): df17371

Delete deprecated GRPO notebook and publish GRPO v2 colab.

Browse files

Files changed (2) hide show

training/grpo_colab.ipynb +0 -0
training/grpo_v2_colab.ipynb +131 -37

training/grpo_colab.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

training/grpo_v2_colab.ipynb CHANGED Viewed

@@ -15,7 +15,7 @@
         "4. GRPO loop too slow - consequence of fix 3\n",
         "5. parse_action(text, None) crash - standalone check_json_format() for format reward\n",
         "\n",
-        "**Hardware:** A10G Large 24GB (HuggingFace Space JupyterLab)\n",
         "\n",
         "**Before running:** In a terminal, authenticate:\n",
         "```\n",
@@ -37,8 +37,11 @@
       "metadata": {},
       "source": [
         "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
-        "!pip install trl==0.15.2 datasets==3.4.1 wandb\n",
-        "!pip install torchvision --extra-index-url https://download.pytorch.org/whl/cu121"
       ],
       "execution_count": null,
       "outputs": []
@@ -56,6 +59,72 @@
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "markdown",
       "metadata": {},
@@ -104,7 +173,11 @@
         "import os, random, json, sys\n",
         "import torch\n",
         "\n",
-        "REPO_ROOT = \".\"  # Adjust to repo root in Colab\n",
         "if REPO_ROOT not in sys.path:\n",
         "    sys.path.insert(0, REPO_ROOT)\n",
         "\n",
@@ -219,59 +292,65 @@
       "cell_type": "code",
       "metadata": {},
       "source": [
         "def reward_fn_outcome(completions, prompts, tier=None, seed=None, **kwargs):\n",
         "    \"\"\"\n",
         "    Score each GRPO completion by:\n",
         "      1. Resetting the env to the EXACT (tier, seed) that generated the prompt (Issue 1 fix).\n",
         "      2. Applying the sampled completion as the single first action (MODEL_STEPS=1, Issue 3/4 fix).\n",
         "      3. Running HeuristicAgent until episode completion (Issue 2 fix - captures terminal reward).\n",
-        "\n",
         "    tier and seed are dataset columns forwarded by GRPOTrainer.\n",
         "    \"\"\"\n",
         "    global _reward_call_count\n",
         "    _reward_call_count += 1\n",
-        "    rewards = []\n",
-        "\n",
-        "    for i, completion in enumerate(completions):\n",
-        "        ep_tier = tier[i] if tier is not None else controller.get_tier()\n",
-        "        ep_seed = seed[i] if seed is not None else random.choice(SEED_POOL)\n",
         "\n",
-        "        env = WildfireEnv()\n",
-        "        obs = env.reset(task_id=ep_tier, seed=ep_seed)\n",
-        "        total_reward = 0.0\n",
-        "\n",
-        "        # Apply the sampled completion as step 0\n",
-        "        text = completion if isinstance(completion, str) else completion[0]['content']\n",
-        "        action, _ = parse_action(text, obs)\n",
-        "        result = env.step(action)\n",
-        "        total_reward += result.reward\n",
-        "        obs = result.observation\n",
-        "\n",
-        "        # Heuristic drives everything after (full episode to capture terminal reward)\n",
-        "        heuristic = HeuristicAgent()\n",
-        "        while not env.done:\n",
-        "            action = heuristic.act(obs)\n",
-        "            result = env.step(action)\n",
-        "            total_reward += result.reward\n",
-        "            obs = result.observation\n",
         "\n",
-        "        rewards.append(total_reward)\n",
         "\n",
-        "    # Update curriculum (once per batch, not per completion)\n",
         "    mean_r = sum(rewards) / len(rewards)\n",
         "    promoted = controller.after_episode(mean_r)\n",
         "    if promoted:\n",
         "        print(f'  *** Curriculum promoted to: {promoted} (mean batch reward={mean_r:.2f}) ***')\n",
         "\n",
-        "    # Sample completions to disk for inspection\n",
         "    if _reward_call_count % 10 == 0:\n",
         "        sample_path = f'training/samples/call_{_reward_call_count}.txt'\n",
         "        with open(sample_path, 'w') as f:\n",
         "            f.write(f'call={_reward_call_count}  tier={tier[0] if tier else \"?\"}  reward={rewards[0]:.3f}\\n')\n",
         "            f.write('---\\n')\n",
         "            c = completions[0]\n",
         "            f.write(c if isinstance(c, str) else c[0]['content'])\n",
-        "            f.write('\\n')\n",
         "\n",
         "    return rewards\n",
         "\n",
@@ -295,7 +374,7 @@
         "    return rewards\n",
         "\n",
         "\n",
-        "print('Reward functions defined.')"
       ],
       "execution_count": null,
       "outputs": []
@@ -400,8 +479,8 @@
         "    output_dir='./grpo_checkpoints',\n",
         "    num_generations=8,\n",
         "    learning_rate=3e-6,\n",
-        "    max_steps=400,\n",
-        "    save_steps=20,\n",
         "    per_device_train_batch_size=1,\n",
         "    gradient_accumulation_steps=4,\n",
         "    max_completion_length=192,\n",
@@ -453,7 +532,11 @@
         "stats = [{'step': ep, 'tier': t, 'mean_reward': r} for ep, t, r in history]\n",
         "with open('./training_stats.json', 'w') as f:\n",
         "    json.dump(stats, f, indent=2)\n",
-        "print('Stats saved -> training_stats.json')"
       ],
       "execution_count": null,
       "outputs": []
@@ -526,7 +609,9 @@
       "source": [
         "import numpy as np\n",
         "\n",
-        "with open('scripts/results.json', 'r') as f:\n",
         "    baselines = json.load(f)\n",
         "\n",
         "FastLanguageModel.for_inference(model)\n",
@@ -612,6 +697,15 @@
         "print('Pop saved rate:     ', end='')\n",
         "print('  '.join(f'{t}={results[t][\"pop_saved_pct\"]:.0f}%' for t in TIERS))\n",
         "\n",
         "assert any_tier_close, (\n",
         "    'Trained model did not come within 1.0 of heuristic on any tier. '\n",
         "    'Check training logs and sample completions.'\n",

         "4. GRPO loop too slow - consequence of fix 3\n",
         "5. parse_action(text, None) crash - standalone check_json_format() for format reward\n",
         "\n",
+        "**Hardware:** A100 Large 40GB (HuggingFace Space JupyterLab) — ~75 min wall-clock for 150 steps\n",
         "\n",
         "**Before running:** In a terminal, authenticate:\n",
         "```\n",
       "metadata": {},
       "source": [
         "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
+        "!pip install \"trl==0.20.0\" datasets==3.4.1 wandb\n",
+        "# torchvision: choose the index matching your CUDA version\n",
+        "# HF Space A100/A10G (CUDA 12.8): use cu128\n",
+        "# Standard Colab (CUDA 12.1): replace cu128 with cu121\n",
+        "!pip install torchvision --index-url https://download.pytorch.org/whl/cu128"
       ],
       "execution_count": null,
       "outputs": []
       "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "import sys\n",
+        "from enum import Enum\n",
+        "import importlib.machinery\n",
+        "from unittest.mock import MagicMock\n",
+        "\n",
+        "# torchvision C extension is ABI-incompatible with torch 2.10.0+cu128.\n",
+        "# Stub it out — text-only GRPO never calls vision ops.\n",
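+        "# NOTE: this cell always replaces torchvision with stubs, even when a working install is present (its sys.modules entries are deleted and overwritten below); skip this cell if you need real vision ops.\n",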
+        "for _key in list(sys.modules.keys()):\n",
+        "    if 'torchvision' in _key:\n",
+        "        del sys.modules[_key]\n",
+        "\n",
+        "class _InterpolationMode(Enum):\n",
+        "    NEAREST = \"nearest\"\n",
+        "    NEAREST_EXACT = \"nearest_exact\"\n",
+        "    BOX = \"box\"\n",
+        "    BILINEAR = \"bilinear\"\n",
+        "    BICUBIC = \"bicubic\"\n",
+        "    HAMMING = \"hamming\"\n",
+        "    LANCZOS = \"lanczos\"\n",
+        "\n",
+        "class _StubModule(type(sys)): \n",
+        "    def __getattr__(self, name):\n",
+        "        if name.startswith('__'):\n",
+        "            raise AttributeError(name)\n",
+        "        mock = MagicMock()\n",
+        "        setattr(self, name, mock)\n",
+        "        return mock\n",
+        "\n",
+        "def _make(name):\n",
+        "    m = _StubModule(name)\n",
+        "    m.__spec__    = importlib.machinery.ModuleSpec(name, None)\n",
+        "    m.__path__    = []\n",
+        "    m.__package__ = name\n",
+        "    return m\n",
+        "\n",
+        "_tv      = _make(\"torchvision\")\n",
+        "_tv.__version__ = \"0.20.0\"\n",
+        "_tr      = _make(\"torchvision.transforms\")\n",
+        "_tr.InterpolationMode = _InterpolationMode\n",
+        "_tr_v2   = _make(\"torchvision.transforms.v2\")\n",
+        "_tvF     = _make(\"torchvision.transforms.v2.functional\")\n",
+        "_ops     = _make(\"torchvision.ops\")\n",
+        "_models  = _make(\"torchvision.models\")\n",
+        "_io      = _make(\"torchvision.io\")\n",
+        "_utils   = _make(\"torchvision.utils\")\n",
+        "_datasets= _make(\"torchvision.datasets\")\n",
+        "_tv.transforms = _tr\n",
+        "_tr.v2 = _tr_v2\n",
+        "_tr_v2.functional = _tvF\n",
+        "_tv.ops = _ops; _tv.models = _models; _tv.io = _io\n",
+        "_tv.utils = _utils; _tv.datasets = _datasets\n",
+        "\n",
+        "for _mod in [_tv, _tr, _tr_v2, _tvF, _ops, _models, _io, _utils, _datasets]:\n",
+        "    sys.modules[_mod.__name__] = _mod\n",
+        "\n",
+        "print(\"torchvision stubbed OK (safe for text-only training)\")"
+      ],
+      "execution_count": null,
+      "outputs": [],
+      "id": "c9ae1850"
+    },
     {
       "cell_type": "markdown",
       "metadata": {},
         "import os, random, json, sys\n",
         "import torch\n",
         "\n",
+        "# Clone the simulator repo first (run once in a terminal or notebook cell):\n",
+        "# !git clone https://github.com/Abrodolph/Wildfire-Containment-Simulator /home/user/app/Wildfire-Containment-Simulator\n",
+        "# !pip install -e /home/user/app/Wildfire-Containment-Simulator --quiet\n",
+        "REPO_ROOT = \"/home/user/app/Wildfire-Containment-Simulator\"  # HF JupyterLab path\n",
+        "# On standard Colab: REPO_ROOT = \"/content/Wildfire-Containment-Simulator\"\n",
         "if REPO_ROOT not in sys.path:\n",
         "    sys.path.insert(0, REPO_ROOT)\n",
         "\n",
       "cell_type": "code",
       "metadata": {},
       "source": [
+        "from concurrent.futures import ThreadPoolExecutor\n",
+        "\n",
+        "def _run_episode(args):\n",
+        "    \"\"\"Run one full wildfire episode for a single GRPO completion (parallelizable).\"\"\"\n",
+        "    completion, ep_tier, ep_seed = args\n",
+        "    env = WildfireEnv()\n",
+        "    obs = env.reset(task_id=ep_tier, seed=ep_seed)\n",
+        "    total_reward = 0.0\n",
+        "    text = completion if isinstance(completion, str) else completion[0]['content']\n",
+        "    action, _ = parse_action(text, obs)\n",
+        "    result = env.step(action)\n",
+        "    total_reward += result.reward\n",
+        "    obs = result.observation\n",
+        "    heuristic = HeuristicAgent()\n",
+        "    while not env.done:\n",
+        "        action = heuristic.act(obs)\n",
+        "        result = env.step(action)\n",
+        "        total_reward += result.reward\n",
+        "        obs = result.observation\n",
+        "    return total_reward\n",
+        "\n",
+        "\n",
         "def reward_fn_outcome(completions, prompts, tier=None, seed=None, **kwargs):\n",
         "    \"\"\"\n",
         "    Score each GRPO completion by:\n",
         "      1. Resetting the env to the EXACT (tier, seed) that generated the prompt (Issue 1 fix).\n",
         "      2. Applying the sampled completion as the single first action (MODEL_STEPS=1, Issue 3/4 fix).\n",
         "      3. Running HeuristicAgent until episode completion (Issue 2 fix - captures terminal reward).\n",
+        "    Episodes are run in parallel threads to reduce wall-clock time.\n",
         "    tier and seed are dataset columns forwarded by GRPOTrainer.\n",
         "    \"\"\"\n",
         "    global _reward_call_count\n",
         "    _reward_call_count += 1\n",
         "\n",
+        "    args_list = [\n",
+        "        (\n",
+        "            completions[i],\n",
+        "            tier[i] if tier is not None else controller.get_tier(),\n",
+        "            seed[i] if seed is not None else random.choice(SEED_POOL),\n",
+        "        )\n",
+        "        for i in range(len(completions))\n",
+        "    ]\n",
         "\n",
+        "    with ThreadPoolExecutor(max_workers=len(completions)) as executor:\n",
+        "        rewards = list(executor.map(_run_episode, args_list))\n",
         "\n",
         "    mean_r = sum(rewards) / len(rewards)\n",
         "    promoted = controller.after_episode(mean_r)\n",
         "    if promoted:\n",
         "        print(f'  *** Curriculum promoted to: {promoted} (mean batch reward={mean_r:.2f}) ***')\n",
         "\n",
         "    if _reward_call_count % 10 == 0:\n",
+        "        os.makedirs('training/samples', exist_ok=True)\n",
         "        sample_path = f'training/samples/call_{_reward_call_count}.txt'\n",
         "        with open(sample_path, 'w') as f:\n",
         "            f.write(f'call={_reward_call_count}  tier={tier[0] if tier else \"?\"}  reward={rewards[0]:.3f}\\n')\n",
         "            f.write('---\\n')\n",
         "            c = completions[0]\n",
         "            f.write(c if isinstance(c, str) else c[0]['content'])\n",
         "\n",
         "    return rewards\n",
         "\n",
         "    return rewards\n",
         "\n",
         "\n",
+        "print('Reward functions defined (parallelized).')"
       ],
       "execution_count": null,
       "outputs": []
         "    output_dir='./grpo_checkpoints',\n",
         "    num_generations=8,\n",
         "    learning_rate=3e-6,\n",
+        "    max_steps=150,       # 150 steps ~ 75 min on A100; increase to 400 if time allows\n",
+        "    save_steps=10,\n",
         "    per_device_train_batch_size=1,\n",
         "    gradient_accumulation_steps=4,\n",
         "    max_completion_length=192,\n",
         "stats = [{'step': ep, 'tier': t, 'mean_reward': r} for ep, t, r in history]\n",
         "with open('./training_stats.json', 'w') as f:\n",
         "    json.dump(stats, f, indent=2)\n",
+        "print('Stats saved -> training_stats.json')\n",
+        "\n",
+        "# To resume training for more steps later:\n",
+        "# grpo_config.max_steps = 300  # new total\n",
+        "# trainer.train(resume_from_checkpoint=True)  # True = resume from the latest checkpoint-* inside output_dir ('./grpo_checkpoints')"
       ],
       "execution_count": null,
       "outputs": []
       "source": [
         "import numpy as np\n",
         "\n",
+        "# Adjust path to repo root if needed\n",
+        "BASELINES_PATH = f'{REPO_ROOT}/scripts/results.json'\n",
+        "with open(BASELINES_PATH, 'r') as f:\n",
         "    baselines = json.load(f)\n",
         "\n",
         "FastLanguageModel.for_inference(model)\n",
         "print('Pop saved rate:     ', end='')\n",
         "print('  '.join(f'{t}={results[t][\"pop_saved_pct\"]:.0f}%' for t in TIERS))\n",
         "\n",
+        "with open('./grpo_eval_results.json', 'w') as f:\n",
+        "    json.dump({\n",
+        "        'trained': results,\n",
+        "        'baselines': baselines,\n",
+        "        'eval_seeds': EVAL_SEEDS,\n",
+        "        'model': 'Eshit/wildfire-grpo-7b',\n",
+        "    }, f, indent=2)\n",
+        "print('Eval results saved -> grpo_eval_results.json')\n",
+        "\n",
         "assert any_tier_close, (\n",
         "    'Trained model did not come within 1.0 of heuristic on any tier. '\n",
         "    'Check training logs and sample completions.'\n",