Spaces:
Running
Running
Commit ·
7d89faf
1
Parent(s): 3d49e8a
feat: add submission validator script and GRPO training notebook, and update Python version requirement to >=3.10
Browse files- pyproject.toml +2 -3
- scripts/gridmind_grpo_colab.ipynb +398 -81
- scripts/validate-submission.sh +2 -2
- uv.lock +1 -1
pyproject.toml
CHANGED
|
@@ -7,7 +7,7 @@ name = "gridmind-rl"
|
|
| 7 |
version = "1.0.0"
|
| 8 |
description = "GridMind-RL: Industrial Load-Shaping and Demand-Response RL Environment. Control HVAC, thermal storage, and batch job scheduling under stochastic electricity prices and grid stress events."
|
| 9 |
readme = "README.md"
|
| 10 |
-
requires-python = ">=3.
|
| 11 |
license = {text = "MIT"}
|
| 12 |
authors = [
|
| 13 |
{name = "LOKyu Team"}
|
|
@@ -21,7 +21,6 @@ classifiers = [
|
|
| 21 |
"Natural Language :: English",
|
| 22 |
"Operating System :: OS Independent",
|
| 23 |
"Programming Language :: Python :: 3",
|
| 24 |
-
"Programming Language :: Python :: 3.9",
|
| 25 |
"Programming Language :: Python :: 3.10",
|
| 26 |
"Programming Language :: Python :: 3.11",
|
| 27 |
"Programming Language :: Python :: 3.12",
|
|
@@ -30,7 +29,7 @@ classifiers = [
|
|
| 30 |
|
| 31 |
dependencies = [
|
| 32 |
"openai>=1.0.0",
|
| 33 |
-
"openenv-core>=0.2.
|
| 34 |
"fastapi>=0.100.0",
|
| 35 |
"uvicorn>=0.23.0",
|
| 36 |
"pydantic>=2.0.0",
|
|
|
|
| 7 |
version = "1.0.0"
|
| 8 |
description = "GridMind-RL: Industrial Load-Shaping and Demand-Response RL Environment. Control HVAC, thermal storage, and batch job scheduling under stochastic electricity prices and grid stress events."
|
| 9 |
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
license = {text = "MIT"}
|
| 12 |
authors = [
|
| 13 |
{name = "LOKyu Team"}
|
|
|
|
| 21 |
"Natural Language :: English",
|
| 22 |
"Operating System :: OS Independent",
|
| 23 |
"Programming Language :: Python :: 3",
|
|
|
|
| 24 |
"Programming Language :: Python :: 3.10",
|
| 25 |
"Programming Language :: Python :: 3.11",
|
| 26 |
"Programming Language :: Python :: 3.12",
|
|
|
|
| 29 |
|
| 30 |
dependencies = [
|
| 31 |
"openai>=1.0.0",
|
| 32 |
+
"openenv-core>=0.2.3",
|
| 33 |
"fastapi>=0.100.0",
|
| 34 |
"uvicorn>=0.23.0",
|
| 35 |
"pydantic>=2.0.0",
|
scripts/gridmind_grpo_colab.ipynb
CHANGED
|
@@ -33,7 +33,7 @@
|
|
| 33 |
"metadata": {},
|
| 34 |
"outputs": [],
|
| 35 |
"source": [
|
| 36 |
-
"!pip install trl transformers accelerate datasets unsloth requests pandas matplotlib\n",
|
| 37 |
"import os\n",
|
| 38 |
"os.makedirs('results', exist_ok=True)\n",
|
| 39 |
"print(\"\u2714 All dependencies installed\")\n",
|
|
@@ -391,98 +391,165 @@
|
|
| 391 |
"import statistics as _statistics\n",
|
| 392 |
"\n",
|
| 393 |
"training_rewards = []\n",
|
| 394 |
-
"
|
| 395 |
"_call_count = [0]\n",
|
|
|
|
| 396 |
"\n",
|
| 397 |
"def gridmind_reward_fn(completions, prompts=None, **kwargs):\n",
|
| 398 |
" \"\"\"\n",
|
| 399 |
-
"
|
| 400 |
-
"
|
| 401 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
" \"\"\"\n",
|
|
|
|
| 403 |
" rewards = []\n",
|
| 404 |
" batch_raw = []\n",
|
| 405 |
"\n",
|
| 406 |
-
"
|
| 407 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
"\n",
|
|
|
|
| 409 |
" try:\n",
|
| 410 |
" # Handle both string and list completion formats\n",
|
| 411 |
-
" if isinstance(completion, list)
|
| 412 |
-
" text = str(completion[0]) if completion else \"\"\n",
|
| 413 |
-
" else:\n",
|
| 414 |
-
" text = str(completion)\n",
|
| 415 |
" text = text.strip()\n",
|
| 416 |
"\n",
|
| 417 |
-
" # Reset env before each reward call for variance\n",
|
| 418 |
-
" task_id = _random.choice([1, 2, 3, 4])\n",
|
| 419 |
-
" reset_r = _requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=8)\n",
|
| 420 |
-
" if reset_r.status_code != 200:\n",
|
| 421 |
-
" rewards.append(-0.5)\n",
|
| 422 |
-
" batch_raw.append(-0.5)\n",
|
| 423 |
-
" continue\n",
|
| 424 |
-
"\n",
|
| 425 |
" # Extract JSON from completion\n",
|
| 426 |
" start = text.rfind('{')\n",
|
| 427 |
" end = text.rfind('}') + 1\n",
|
| 428 |
" if start < 0 or end <= start:\n",
|
| 429 |
-
" rewards.append(-
|
| 430 |
-
" batch_raw.append(-
|
|
|
|
| 431 |
" continue\n",
|
| 432 |
"\n",
|
| 433 |
-
"
|
| 434 |
-
"
|
| 435 |
-
"
|
| 436 |
-
"
|
| 437 |
-
"
|
| 438 |
-
"
|
| 439 |
-
" \
|
| 440 |
-
" }\n",
|
| 441 |
"\n",
|
| 442 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
" if step_r.status_code != 200:\n",
|
| 444 |
-
" rewards.append(-0.
|
| 445 |
-
" batch_raw.append(-0.
|
|
|
|
| 446 |
" continue\n",
|
| 447 |
"\n",
|
| 448 |
" data = step_r.json()\n",
|
| 449 |
" if isinstance(data, list):\n",
|
| 450 |
" data = data[0]\n",
|
| 451 |
"\n",
|
| 452 |
-
"
|
| 453 |
-
"
|
| 454 |
-
"
|
| 455 |
-
"
|
| 456 |
-
"
|
| 457 |
-
"
|
| 458 |
-
"
|
| 459 |
-
" )\n",
|
| 460 |
-
"
|
| 461 |
-
"
|
| 462 |
-
"
|
| 463 |
-
"
|
| 464 |
-
"\n",
|
| 465 |
-
"
|
| 466 |
-
"
|
| 467 |
-
"
|
| 468 |
-
"
|
| 469 |
-
"
|
| 470 |
-
"
|
| 471 |
"\n",
|
| 472 |
-
"
|
| 473 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
" try:\n",
|
| 475 |
" var = _statistics.variance(batch_raw)\n",
|
| 476 |
-
"
|
| 477 |
-
"
|
|
|
|
| 478 |
" if var < 0.001:\n",
|
| 479 |
-
" print(\"
|
|
|
|
|
|
|
| 480 |
" except Exception:\n",
|
| 481 |
" pass\n",
|
| 482 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
" return rewards\n",
|
| 484 |
"\n",
|
| 485 |
-
"print(\"
|
|
|
|
|
|
|
|
|
|
| 486 |
]
|
| 487 |
},
|
| 488 |
{
|
|
@@ -512,15 +579,54 @@
|
|
| 512 |
"# Prepare dataset\n",
|
| 513 |
"train_data = [{\"prompt\": d[\"prompt\"]} for d in dataset]\n",
|
| 514 |
"train_ds = Dataset.from_list(train_data)\n",
|
| 515 |
-
"print(f\"Training dataset: {len(train_ds)} prompts\")\n",
|
| 516 |
-
"\n",
|
| 517 |
"theme_dist = {}\n",
|
| 518 |
"for d in dataset:\n",
|
| 519 |
" t = d.get(\"theme\", \"unknown\")\n",
|
| 520 |
" theme_dist[t] = theme_dist.get(t, 0) + 1\n",
|
| 521 |
-
"print(f\"Theme
|
| 522 |
"print(f\"Sample prompt preview:\\n{train_data[0]['prompt'][:200]}...\\n\")\n",
|
| 523 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
"# Prepare model for QLoRA training\n",
|
| 525 |
"model.config.use_cache = False\n",
|
| 526 |
"model.gradient_checkpointing_enable()\n",
|
|
@@ -555,8 +661,9 @@
|
|
| 555 |
" remove_unused_columns=False,\n",
|
| 556 |
")\n",
|
| 557 |
"\n",
|
| 558 |
-
"
|
| 559 |
"import trl\n",
|
|
|
|
| 560 |
"print(f\"TRL version: {trl.__version__}\")\n",
|
| 561 |
"sig = inspect.signature(GRPOTrainer.__init__)\n",
|
| 562 |
"params = list(sig.parameters.keys())\n",
|
|
@@ -564,24 +671,31 @@
|
|
| 564 |
"print(f\"Uses 'args=': {'args' in params}\")\n",
|
| 565 |
"print(f\"Uses 'config=': {'config' in params}\")\n",
|
| 566 |
"\n",
|
| 567 |
-
"print(\"\\nTesting reward function...\")\n",
|
| 568 |
-
"test_completions = [\n",
|
| 569 |
-
" '{\"hvac_power_level\": 0.2, \"thermal_charge_rate\": 0.8, \"batch_job_slot\": 2, \"load_shed_fraction\": 0.0, \"building_id\": 0}',\n",
|
| 570 |
-
" '{\"hvac_power_level\": 1.0, \"thermal_charge_rate\": -1.0, \"batch_job_slot\": 0, \"load_shed_fraction\": 0.5, \"building_id\": 0}',\n",
|
| 571 |
-
" '{\"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0, \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0}',\n",
|
| 572 |
-
" 'not valid json at all',\n",
|
| 573 |
-
"]\n",
|
| 574 |
-
"test_rewards = gridmind_reward_fn(test_completions)\n",
|
| 575 |
-
"print(f\"Test rewards: {[f'{r:.3f}' for r in test_rewards]}\")\n",
|
| 576 |
-
"reward_var = statistics.variance(test_rewards) if len(set(test_rewards)) > 1 else 0.0\n",
|
| 577 |
-
"if reward_var <= 0.001:\n",
|
| 578 |
-
" print(\"CRITICAL: Reward variance is too low - fix reward function before training\")\n",
|
| 579 |
-
"else:\n",
|
| 580 |
-
" print(f\"Reward variance: {reward_var:.4f} - sufficient for GRPO\")\n",
|
| 581 |
-
"\n",
|
| 582 |
"print(f\"\\nGPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB used / 16 GB total\")\n",
|
| 583 |
"print(f\"Free: {(16 - torch.cuda.memory_allocated()/1e9):.2f} GB\")\n",
|
| 584 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
"\n",
|
| 586 |
"# Reset environment before training\n",
|
| 587 |
"_requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": 1}, timeout=10)\n",
|
|
@@ -595,9 +709,11 @@
|
|
| 595 |
" train_dataset=train_ds,\n",
|
| 596 |
" reward_funcs=gridmind_reward_fn,\n",
|
| 597 |
" peft_config=peft_config,\n",
|
|
|
|
| 598 |
")\n",
|
| 599 |
"\n",
|
| 600 |
"print(\"\\nStarting GRPO training with QLoRA...\")\n",
|
|
|
|
| 601 |
"print(f\"Steps: {grpo_config.max_steps} | Batch: {grpo_config.per_device_train_batch_size} | Generations: {grpo_config.num_generations}\")\n",
|
| 602 |
"print(\"Estimated time: ~25-35 min on T4\\n\")\n",
|
| 603 |
"\n",
|
|
@@ -606,12 +722,15 @@
|
|
| 606 |
"print(\"\\nTraining complete!\")\n",
|
| 607 |
"print(f\" Total steps: {train_result.global_step}\")\n",
|
| 608 |
"print(f\" Training loss: {train_result.training_loss:.6f}\")\n",
|
|
|
|
|
|
|
| 609 |
"\n",
|
| 610 |
-
"if
|
| 611 |
-
" print(\"\\
|
| 612 |
-
" print(\"
|
|
|
|
| 613 |
"else:\n",
|
| 614 |
-
" print(\"\\
|
| 615 |
"\n",
|
| 616 |
"print(f\"\\nMemory after training: {torch.cuda.memory_allocated()/1e9:.2f} GB\")\n",
|
| 617 |
"\n",
|
|
@@ -739,6 +858,198 @@
|
|
| 739 |
"metadata": {},
|
| 740 |
"outputs": [],
|
| 741 |
"source": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
"results = {\n",
|
| 743 |
" \"heuristic_baseline\": {\n",
|
| 744 |
" \"scores_by_task\": {str(k): v for k, v in baseline_scores.items()},\n",
|
|
@@ -753,6 +1064,12 @@
|
|
| 753 |
" \"training_steps\": grpo_config.max_steps,\n",
|
| 754 |
" \"themes_covered\": [\"multi_agent\", \"instruction_following\", \"world_modeling\", \"curriculum\"],\n",
|
| 755 |
" \"training_rewards_log\": training_rewards[-20:] if training_rewards else [],\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
"}\n",
|
| 757 |
"\n",
|
| 758 |
"print(\"Saving results...\")\n",
|
|
|
|
| 33 |
"metadata": {},
|
| 34 |
"outputs": [],
|
| 35 |
"source": [
|
| 36 |
+
"!pip install trl transformers accelerate datasets unsloth requests pandas matplotlib openenv-core==0.2.3\n",
|
| 37 |
"import os\n",
|
| 38 |
"os.makedirs('results', exist_ok=True)\n",
|
| 39 |
"print(\"\u2714 All dependencies installed\")\n",
|
|
|
|
| 391 |
"import statistics as _statistics\n",
|
| 392 |
"\n",
|
| 393 |
"training_rewards = []\n",
|
| 394 |
+
"training_steps_log = []\n",
|
| 395 |
"_call_count = [0]\n",
|
| 396 |
+
"_current_task_id = [1]\n",
|
| 397 |
"\n",
|
| 398 |
"def gridmind_reward_fn(completions, prompts=None, **kwargs):\n",
|
| 399 |
" \"\"\"\n",
|
| 400 |
+
" Fixed reward function for trl 0.23.0 + GridMind-RL.\n",
|
| 401 |
+
"\n",
|
| 402 |
+
" Key fixes:\n",
|
| 403 |
+
" 1. Reset environment to the same task/state for every completion in a batch.\n",
|
| 404 |
+
" 2. Return continuous rewards from the environment, not binary +/-1.\n",
|
| 405 |
+
" 3. Scale rewards to roughly [-0.6, 0.6] for GRPO gradient signal.\n",
|
| 406 |
+
" 4. Use structured penalties for bad JSON instead of hard -1.0.\n",
|
| 407 |
" \"\"\"\n",
|
| 408 |
+
" _call_count[0] += 1\n",
|
| 409 |
" rewards = []\n",
|
| 410 |
" batch_raw = []\n",
|
| 411 |
"\n",
|
| 412 |
+
" task_id = _random.choice([1, 2, 3, 4])\n",
|
| 413 |
+
" batch_seed = _random.randint(1, 1_000_000)\n",
|
| 414 |
+
" _current_task_id[0] = task_id\n",
|
| 415 |
+
"\n",
|
| 416 |
+
" try:\n",
|
| 417 |
+
" reset_payload = {\"task_id\": task_id, \"seed\": batch_seed}\n",
|
| 418 |
+
" reset_r = _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=10)\n",
|
| 419 |
+
" reset_ok = reset_r.status_code == 200\n",
|
| 420 |
+
" except Exception:\n",
|
| 421 |
+
" reset_ok = False\n",
|
| 422 |
+
"\n",
|
| 423 |
+
" if not reset_ok:\n",
|
| 424 |
+
" return [-0.1] * len(completions)\n",
|
| 425 |
"\n",
|
| 426 |
+
" for completion in completions:\n",
|
| 427 |
" try:\n",
|
| 428 |
" # Handle both string and list completion formats\n",
|
| 429 |
+
" text = str(completion[0]) if isinstance(completion, list) and completion else str(completion)\n",
|
|
|
|
|
|
|
|
|
|
| 430 |
" text = text.strip()\n",
|
| 431 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
" # Extract JSON from completion\n",
|
| 433 |
" start = text.rfind('{')\n",
|
| 434 |
" end = text.rfind('}') + 1\n",
|
| 435 |
" if start < 0 or end <= start:\n",
|
| 436 |
+
" rewards.append(-0.3)\n",
|
| 437 |
+
" batch_raw.append(-0.3)\n",
|
| 438 |
+
" _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
|
| 439 |
" continue\n",
|
| 440 |
"\n",
|
| 441 |
+
" try:\n",
|
| 442 |
+
" action = _json.loads(text[start:end])\n",
|
| 443 |
+
" except _json.JSONDecodeError:\n",
|
| 444 |
+
" rewards.append(-0.25)\n",
|
| 445 |
+
" batch_raw.append(-0.25)\n",
|
| 446 |
+
" _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
|
| 447 |
+
" continue\n",
|
|
|
|
| 448 |
"\n",
|
| 449 |
+
" valid_fields = 0\n",
|
| 450 |
+
" cleaned_action = {}\n",
|
| 451 |
+
"\n",
|
| 452 |
+
" try:\n",
|
| 453 |
+
" cleaned_action[\"hvac_power_level\"] = max(0.0, min(1.0, float(action.get(\"hvac_power_level\", 0.5))))\n",
|
| 454 |
+
" valid_fields += 1\n",
|
| 455 |
+
" except Exception:\n",
|
| 456 |
+
" cleaned_action[\"hvac_power_level\"] = 0.5\n",
|
| 457 |
+
"\n",
|
| 458 |
+
" try:\n",
|
| 459 |
+
" cleaned_action[\"thermal_charge_rate\"] = max(-1.0, min(1.0, float(action.get(\"thermal_charge_rate\", 0.0))))\n",
|
| 460 |
+
" valid_fields += 1\n",
|
| 461 |
+
" except Exception:\n",
|
| 462 |
+
" cleaned_action[\"thermal_charge_rate\"] = 0.0\n",
|
| 463 |
+
"\n",
|
| 464 |
+
" try:\n",
|
| 465 |
+
" cleaned_action[\"batch_job_slot\"] = max(0, min(4, int(action.get(\"batch_job_slot\", 0))))\n",
|
| 466 |
+
" valid_fields += 1\n",
|
| 467 |
+
" except Exception:\n",
|
| 468 |
+
" cleaned_action[\"batch_job_slot\"] = 0\n",
|
| 469 |
+
"\n",
|
| 470 |
+
" try:\n",
|
| 471 |
+
" cleaned_action[\"load_shed_fraction\"] = max(0.0, min(0.5, float(action.get(\"load_shed_fraction\", 0.0))))\n",
|
| 472 |
+
" valid_fields += 1\n",
|
| 473 |
+
" except Exception:\n",
|
| 474 |
+
" cleaned_action[\"load_shed_fraction\"] = 0.0\n",
|
| 475 |
+
"\n",
|
| 476 |
+
" cleaned_action[\"building_id\"] = int(action.get(\"building_id\", 0))\n",
|
| 477 |
+
" completeness_bonus = (valid_fields / 4) * 0.1 - 0.05\n",
|
| 478 |
+
"\n",
|
| 479 |
+
" step_r = _requests.post(f\"{ENV_URL}/step\", json=cleaned_action, timeout=8)\n",
|
| 480 |
" if step_r.status_code != 200:\n",
|
| 481 |
+
" rewards.append(-0.2)\n",
|
| 482 |
+
" batch_raw.append(-0.2)\n",
|
| 483 |
+
" _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
|
| 484 |
" continue\n",
|
| 485 |
"\n",
|
| 486 |
" data = step_r.json()\n",
|
| 487 |
" if isinstance(data, list):\n",
|
| 488 |
" data = data[0]\n",
|
| 489 |
"\n",
|
| 490 |
+
" env_reward = float(data.get(\"reward\", 0.0))\n",
|
| 491 |
+
" info = data.get(\"info\", {}) if isinstance(data, dict) else {}\n",
|
| 492 |
+
" comps = data.get(\"rewards\", {}) or info.get(\"reward_components\", {}) or {}\n",
|
| 493 |
+
"\n",
|
| 494 |
+
" cost_r = float(comps.get(\"cost_savings\", 0.0))\n",
|
| 495 |
+
" comfort_r = float(comps.get(\"temperature_constraint\", comps.get(\"temp_constraint\", 0.0)))\n",
|
| 496 |
+
" grid_r = float(comps.get(\"grid_response\", 0.0))\n",
|
| 497 |
+
" task_r = float(comps.get(\"task_satisfaction\", 0.0))\n",
|
| 498 |
+
"\n",
|
| 499 |
+
" if comps:\n",
|
| 500 |
+
" composite = (\n",
|
| 501 |
+
" cost_r * 0.40 +\n",
|
| 502 |
+
" comfort_r * 0.25 +\n",
|
| 503 |
+
" grid_r * 0.15 +\n",
|
| 504 |
+
" task_r * 0.20 +\n",
|
| 505 |
+
" completeness_bonus\n",
|
| 506 |
+
" )\n",
|
| 507 |
+
" else:\n",
|
| 508 |
+
" composite = env_reward * 0.5 + completeness_bonus\n",
|
| 509 |
"\n",
|
| 510 |
+
" composite = max(-0.6, min(0.6, composite))\n",
|
| 511 |
+
"\n",
|
| 512 |
+
" rewards.append(composite)\n",
|
| 513 |
+
" batch_raw.append(composite)\n",
|
| 514 |
+
" training_rewards.append(composite)\n",
|
| 515 |
+
"\n",
|
| 516 |
+
" # Rewind to the same task before evaluating the next completion.\n",
|
| 517 |
+
" _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
|
| 518 |
+
"\n",
|
| 519 |
+
" except Exception:\n",
|
| 520 |
+
" rewards.append(-0.15)\n",
|
| 521 |
+
" batch_raw.append(-0.15)\n",
|
| 522 |
+
" try:\n",
|
| 523 |
+
" _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
|
| 524 |
+
" except Exception:\n",
|
| 525 |
+
" pass\n",
|
| 526 |
+
"\n",
|
| 527 |
+
" if _call_count[0] % 5 == 0 and len(batch_raw) > 1:\n",
|
| 528 |
" try:\n",
|
| 529 |
" var = _statistics.variance(batch_raw)\n",
|
| 530 |
+
" avg = sum(batch_raw) / len(batch_raw)\n",
|
| 531 |
+
" rng = max(batch_raw) - min(batch_raw)\n",
|
| 532 |
+
" print(f\" [Step {_call_count[0]}] Task {task_id} | Rewards: {[f'{r:.3f}' for r in batch_raw]} | Var: {var:.4f} | Avg: {avg:.3f} | Range: {rng:.3f}\")\n",
|
| 533 |
" if var < 0.001:\n",
|
| 534 |
+
" print(f\" Near-zero variance at step {_call_count[0]} - check environment connectivity\")\n",
|
| 535 |
+
" if all(abs(r) > 0.55 for r in batch_raw):\n",
|
| 536 |
+
" print(\" All rewards near clip boundary - still hitting clamping issue\")\n",
|
| 537 |
" except Exception:\n",
|
| 538 |
" pass\n",
|
| 539 |
"\n",
|
| 540 |
+
" training_steps_log.append({\n",
|
| 541 |
+
" \"call\": _call_count[0],\n",
|
| 542 |
+
" \"rewards\": batch_raw,\n",
|
| 543 |
+
" \"task_id\": task_id,\n",
|
| 544 |
+
" \"seed\": batch_seed,\n",
|
| 545 |
+
" })\n",
|
| 546 |
+
"\n",
|
| 547 |
" return rewards\n",
|
| 548 |
"\n",
|
| 549 |
+
"print(\"Fixed reward function defined\")\n",
|
| 550 |
+
"print(\" - Continuous rewards in [-0.6, 0.6] range\")\n",
|
| 551 |
+
"print(\" - Soft clamping preserves gradient signal\")\n",
|
| 552 |
+
"print(\" - Same task/state is used across completions in each batch\")"
|
| 553 |
]
|
| 554 |
},
|
| 555 |
{
|
|
|
|
| 579 |
"# Prepare dataset\n",
|
| 580 |
"train_data = [{\"prompt\": d[\"prompt\"]} for d in dataset]\n",
|
| 581 |
"train_ds = Dataset.from_list(train_data)\n",
|
|
|
|
|
|
|
| 582 |
"theme_dist = {}\n",
|
| 583 |
"for d in dataset:\n",
|
| 584 |
" t = d.get(\"theme\", \"unknown\")\n",
|
| 585 |
" theme_dist[t] = theme_dist.get(t, 0) + 1\n",
|
| 586 |
+
"print(f\"Dataset: {len(train_ds)} prompts | Theme dist: {theme_dist}\")\n",
|
| 587 |
"print(f\"Sample prompt preview:\\n{train_data[0]['prompt'][:200]}...\\n\")\n",
|
| 588 |
"\n",
|
| 589 |
+
"print(\"=\" * 55)\n",
|
| 590 |
+
"print(\"REWARD FUNCTION DIAGNOSTIC\")\n",
|
| 591 |
+
"print(\"=\" * 55)\n",
|
| 592 |
+
"\n",
|
| 593 |
+
"test_cases = [\n",
|
| 594 |
+
" (\"Perfect JSON + good action\", '{\"hvac_power_level\": 0.2, \"thermal_charge_rate\": 0.7, \"batch_job_slot\": 2, \"load_shed_fraction\": 0.0, \"building_id\": 0}'),\n",
|
| 595 |
+
" (\"Valid JSON + wasteful action\", '{\"hvac_power_level\": 1.0, \"thermal_charge_rate\": -1.0, \"batch_job_slot\": 0, \"load_shed_fraction\": 0.5, \"building_id\": 0}'),\n",
|
| 596 |
+
" (\"Valid JSON + neutral action\", '{\"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0, \"batch_job_slot\": 1, \"load_shed_fraction\": 0.1, \"building_id\": 0}'),\n",
|
| 597 |
+
" (\"Valid JSON + conservative action\", '{\"hvac_power_level\": 0.3, \"thermal_charge_rate\": 0.4, \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0}'),\n",
|
| 598 |
+
" (\"Invalid JSON\", \"I think we should set HVAC to medium and charge storage\"),\n",
|
| 599 |
+
" (\"Partial JSON\", '{\"hvac_power_level\": 0.4}'),\n",
|
| 600 |
+
"]\n",
|
| 601 |
+
"\n",
|
| 602 |
+
"labels = [c[0] for c in test_cases]\n",
|
| 603 |
+
"completions = [c[1] for c in test_cases]\n",
|
| 604 |
+
"test_rewards = gridmind_reward_fn(completions)\n",
|
| 605 |
+
"\n",
|
| 606 |
+
"print(f\"\\n{'Action Type':<35} {'Reward':>8} Bar\")\n",
|
| 607 |
+
"print(\"-\" * 60)\n",
|
| 608 |
+
"for label, reward in zip(labels, test_rewards):\n",
|
| 609 |
+
" bar_len = max(1, int(abs(reward) * 30)) if abs(reward) > 0 else 0\n",
|
| 610 |
+
" bar = (\"+\" * bar_len) if reward >= 0 else (\"-\" * bar_len)\n",
|
| 611 |
+
" print(f\" {label:<33} {reward:+.4f} {bar}\")\n",
|
| 612 |
+
"\n",
|
| 613 |
+
"unique_rewards = set(round(r, 2) for r in test_rewards)\n",
|
| 614 |
+
"print(f\"\\nUnique reward values: {sorted(unique_rewards)}\")\n",
|
| 615 |
+
"\n",
|
| 616 |
+
"if unique_rewards == {-1.0, 1.0} or unique_rewards == {-1.0} or unique_rewards == {1.0}:\n",
|
| 617 |
+
" raise RuntimeError(\"Still binary +/-1 rewards. Fix clamping before training.\")\n",
|
| 618 |
+
"elif len(unique_rewards) < 3:\n",
|
| 619 |
+
" print(\"WARNING: Low diversity in rewards. Training may still be weak.\")\n",
|
| 620 |
+
"else:\n",
|
| 621 |
+
" reward_var = statistics.variance(test_rewards)\n",
|
| 622 |
+
" reward_range = max(test_rewards) - min(test_rewards)\n",
|
| 623 |
+
" print(f\"Reward diversity: {len(unique_rewards)} unique values\")\n",
|
| 624 |
+
" print(f\"Variance: {reward_var:.4f} | Range: {reward_range:.4f}\")\n",
|
| 625 |
+
" if reward_var > 0.02:\n",
|
| 626 |
+
" print(\"Sufficient variance for GRPO. Proceeding to training.\")\n",
|
| 627 |
+
" else:\n",
|
| 628 |
+
" print(\"Low variance. GRPO will learn slowly.\")\n",
|
| 629 |
+
"\n",
|
| 630 |
"# Prepare model for QLoRA training\n",
|
| 631 |
"model.config.use_cache = False\n",
|
| 632 |
"model.gradient_checkpointing_enable()\n",
|
|
|
|
| 661 |
" remove_unused_columns=False,\n",
|
| 662 |
")\n",
|
| 663 |
"\n",
|
| 664 |
+
"# Confirm the installed TRL API before constructing the trainer.\n",
|
| 665 |
"import trl\n",
|
| 666 |
+
"print(\"\\n=== TRL API DIAGNOSTIC ===\")\n",
|
| 667 |
"print(f\"TRL version: {trl.__version__}\")\n",
|
| 668 |
"sig = inspect.signature(GRPOTrainer.__init__)\n",
|
| 669 |
"params = list(sig.parameters.keys())\n",
|
|
|
|
| 671 |
"print(f\"Uses 'args=': {'args' in params}\")\n",
|
| 672 |
"print(f\"Uses 'config=': {'config' in params}\")\n",
|
| 673 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
"print(f\"\\nGPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB used / 16 GB total\")\n",
|
| 675 |
"print(f\"Free: {(16 - torch.cuda.memory_allocated()/1e9):.2f} GB\")\n",
|
| 676 |
+
"\n",
|
| 677 |
+
"# Custom callback to capture loss at every step for graphing.\n",
|
| 678 |
+
"from transformers import TrainerCallback\n",
|
| 679 |
+
"\n",
|
| 680 |
+
"step_losses = []\n",
|
| 681 |
+
"step_numbers = []\n",
|
| 682 |
+
"step_reward_means = []\n",
|
| 683 |
+
"\n",
|
| 684 |
+
"class LossCaptureCallback(TrainerCallback):\n",
|
| 685 |
+
" def on_log(self, args, state, control, logs=None, **kwargs):\n",
|
| 686 |
+
" if not logs:\n",
|
| 687 |
+
" return\n",
|
| 688 |
+
" step = state.global_step\n",
|
| 689 |
+
" loss = logs.get(\"loss\", logs.get(\"train_loss\", None))\n",
|
| 690 |
+
" if loss is not None:\n",
|
| 691 |
+
" step_losses.append(float(loss))\n",
|
| 692 |
+
" step_numbers.append(step)\n",
|
| 693 |
+
" reward_mean = logs.get(\"reward\", logs.get(\"mean_reward\", None))\n",
|
| 694 |
+
" if reward_mean is not None:\n",
|
| 695 |
+
" step_reward_means.append(float(reward_mean))\n",
|
| 696 |
+
" elif training_rewards:\n",
|
| 697 |
+
" recent = training_rewards[max(0, len(training_rewards)-4):]\n",
|
| 698 |
+
" step_reward_means.append(sum(recent) / len(recent))\n",
|
| 699 |
"\n",
|
| 700 |
"# Reset environment before training\n",
|
| 701 |
"_requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": 1}, timeout=10)\n",
|
|
|
|
| 709 |
" train_dataset=train_ds,\n",
|
| 710 |
" reward_funcs=gridmind_reward_fn,\n",
|
| 711 |
" peft_config=peft_config,\n",
|
| 712 |
+
" callbacks=[LossCaptureCallback()],\n",
|
| 713 |
")\n",
|
| 714 |
"\n",
|
| 715 |
"print(\"\\nStarting GRPO training with QLoRA...\")\n",
|
| 716 |
+
"print(\"Watch for non-zero loss values. If all zeros, reward variance is still too low.\\n\")\n",
|
| 717 |
"print(f\"Steps: {grpo_config.max_steps} | Batch: {grpo_config.per_device_train_batch_size} | Generations: {grpo_config.num_generations}\")\n",
|
| 718 |
"print(\"Estimated time: ~25-35 min on T4\\n\")\n",
|
| 719 |
"\n",
|
|
|
|
| 722 |
"print(\"\\nTraining complete!\")\n",
|
| 723 |
"print(f\" Total steps: {train_result.global_step}\")\n",
|
| 724 |
"print(f\" Training loss: {train_result.training_loss:.6f}\")\n",
|
| 725 |
+
"non_zero_losses = [l for l in step_losses if abs(l) > 1e-8]\n",
|
| 726 |
+
"print(f\" Steps with non-zero loss: {len(non_zero_losses)}/{len(step_losses)}\")\n",
|
| 727 |
"\n",
|
| 728 |
+
"if len(non_zero_losses) == 0:\n",
|
| 729 |
+
" print(\"\\nAll losses are zero. The model received no gradient signal.\")\n",
|
| 730 |
+
" print(\"Root cause: reward variance is too low for GRPO advantage estimation.\")\n",
|
| 731 |
+
" print(\"Graphs will still be generated and will show the flat signal clearly.\")\n",
|
| 732 |
"else:\n",
|
| 733 |
+
" print(f\"\\nTraining produced gradient signal on {len(non_zero_losses)} steps.\")\n",
|
| 734 |
"\n",
|
| 735 |
"print(f\"\\nMemory after training: {torch.cuda.memory_allocated()/1e9:.2f} GB\")\n",
|
| 736 |
"\n",
|
|
|
|
| 858 |
"metadata": {},
|
| 859 |
"outputs": [],
|
| 860 |
"source": [
|
| 861 |
+
"import matplotlib\n",
|
| 862 |
+
"matplotlib.use('Agg')\n",
|
| 863 |
+
"import matplotlib.pyplot as plt\n",
|
| 864 |
+
"import matplotlib.gridspec as gridspec\n",
|
| 865 |
+
"import numpy as np\n",
|
| 866 |
+
"import os\n",
|
| 867 |
+
"\n",
|
| 868 |
+
"os.makedirs(\"results\", exist_ok=True)\n",
|
| 869 |
+
"\n",
|
| 870 |
+
"tasks = [1, 2, 3, 4]\n",
|
| 871 |
+
"task_labels = [\n",
|
| 872 |
+
" \"Task 1\\nCost Only\\n(Curriculum)\",\n",
|
| 873 |
+
" \"Task 2\\nCost+Comfort\\n(World Model)\",\n",
|
| 874 |
+
" \"Task 3\\nFull DR\\n(World Model)\",\n",
|
| 875 |
+
" \"Task 4\\nInstruction\\n(Theme 2)\",\n",
|
| 876 |
+
"]\n",
|
| 877 |
+
"\n",
|
| 878 |
+
"random_by_task = {1: 0.35, 2: 0.28, 3: 0.21, 4: 0.25}\n",
|
| 879 |
+
"heuristic_by_task = baseline_scores\n",
|
| 880 |
+
"trained_by_task = trained_scores\n",
|
| 881 |
+
"\n",
|
| 882 |
+
"random_vals = [random_by_task.get(t, 0.3) for t in tasks]\n",
|
| 883 |
+
"heuristic_vals = [heuristic_by_task.get(t, 0.5) for t in tasks]\n",
|
| 884 |
+
"trained_vals = [trained_by_task.get(t, 0.5) for t in tasks]\n",
|
| 885 |
+
"\n",
|
| 886 |
+
"baseline_avg = sum(heuristic_vals) / len(heuristic_vals)\n",
|
| 887 |
+
"trained_avg = sum(trained_vals) / len(trained_vals)\n",
|
| 888 |
+
"random_avg = sum(random_vals) / len(random_vals)\n",
|
| 889 |
+
"overall_improvement = ((trained_avg - baseline_avg) / baseline_avg * 100) if baseline_avg > 0 else 0\n",
|
| 890 |
+
"\n",
|
| 891 |
+
"def smooth(values, window=5):\n",
|
| 892 |
+
" if not values or len(values) < 2:\n",
|
| 893 |
+
" return values\n",
|
| 894 |
+
" out = []\n",
|
| 895 |
+
" for i in range(len(values)):\n",
|
| 896 |
+
" w = values[max(0, i-window):i+1]\n",
|
| 897 |
+
" out.append(sum(w) / len(w))\n",
|
| 898 |
+
" return out\n",
|
| 899 |
+
"\n",
|
| 900 |
+
"C = {\n",
|
| 901 |
+
" 'bg': '#0d1117', 'panel': '#161b22', 'grid': '#21262d',\n",
|
| 902 |
+
" 'text': '#e6edf3', 'subtext': '#8b949e', 'random': '#f85149',\n",
|
| 903 |
+
" 'heuristic': '#58a6ff', 'trained': '#3fb950', 'reward': '#d29922',\n",
|
| 904 |
+
" 'loss': '#bc8cff', 'border': '#30363d',\n",
|
| 905 |
+
"}\n",
|
| 906 |
+
"\n",
|
| 907 |
+
"def style_ax(ax, title):\n",
|
| 908 |
+
" ax.set_facecolor(C['panel'])\n",
|
| 909 |
+
" ax.set_title(title, color=C['text'], fontsize=12, fontweight='bold', pad=10)\n",
|
| 910 |
+
" ax.tick_params(colors=C['subtext'], labelsize=9)\n",
|
| 911 |
+
" ax.grid(alpha=0.15, color=C['grid'], linewidth=0.8)\n",
|
| 912 |
+
" for spine in ax.spines.values():\n",
|
| 913 |
+
" spine.set_edgecolor(C['border'])\n",
|
| 914 |
+
" ax.xaxis.label.set_color(C['subtext'])\n",
|
| 915 |
+
" ax.yaxis.label.set_color(C['subtext'])\n",
|
| 916 |
+
"\n",
|
| 917 |
+
"fig = plt.figure(figsize=(18, 13))\n",
|
| 918 |
+
"fig.patch.set_facecolor(C['bg'])\n",
|
| 919 |
+
"gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.50, wspace=0.38,\n",
|
| 920 |
+
" left=0.07, right=0.97, top=0.91, bottom=0.07)\n",
|
| 921 |
+
"\n",
|
| 922 |
+
"# Panel A: policy comparison across all tasks.\n",
|
| 923 |
+
"ax_bar = fig.add_subplot(gs[0, :])\n",
|
| 924 |
+
"ax_bar.set_facecolor(C['panel'])\n",
|
| 925 |
+
"x = np.arange(len(tasks))\n",
|
| 926 |
+
"w = 0.24\n",
|
| 927 |
+
"br = ax_bar.bar(x - w, random_vals, w, label='Random Policy', color=C['random'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
|
| 928 |
+
"bh = ax_bar.bar(x, heuristic_vals, w, label='Heuristic Baseline', color=C['heuristic'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
|
| 929 |
+
"bt = ax_bar.bar(x + w, trained_vals, w, label='Trained LLM (GRPO)', color=C['trained'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
|
| 930 |
+
"\n",
|
| 931 |
+
"for bars, col in [(br, C['random']), (bh, C['heuristic']), (bt, C['trained'])]:\n",
|
| 932 |
+
" for bar in bars:\n",
|
| 933 |
+
" h = bar.get_height()\n",
|
| 934 |
+
" ax_bar.text(bar.get_x() + bar.get_width()/2, h + 0.012, f'{h:.3f}',\n",
|
| 935 |
+
" ha='center', va='bottom', fontsize=8.5, color=col, fontweight='bold', zorder=4)\n",
|
| 936 |
+
"\n",
|
| 937 |
+
"for i in range(len(tasks)):\n",
|
| 938 |
+
" h_val = heuristic_vals[i]\n",
|
| 939 |
+
" t_val = trained_vals[i]\n",
|
| 940 |
+
" pct = ((t_val - h_val) / h_val * 100) if h_val > 0 else 0\n",
|
| 941 |
+
" color = C['trained'] if pct >= 0 else C['random']\n",
|
| 942 |
+
" sign = '+' if pct >= 0 else '-'\n",
|
| 943 |
+
" ax_bar.text(x[i] + w, max(h_val, t_val) + 0.06, f'{sign}{abs(pct):.1f}%',\n",
|
| 944 |
+
" ha='center', fontsize=10, color=color, fontweight='bold', zorder=4)\n",
|
| 945 |
+
"\n",
|
| 946 |
+
"ax_bar.axhline(baseline_avg, color=C['heuristic'], linestyle=':', linewidth=1.5, alpha=0.6,\n",
|
| 947 |
+
" label=f'Heuristic avg ({baseline_avg:.3f})', zorder=2)\n",
|
| 948 |
+
"ax_bar.axhline(trained_avg, color=C['trained'], linestyle=':', linewidth=1.5, alpha=0.6,\n",
|
| 949 |
+
" label=f'Trained avg ({trained_avg:.3f})', zorder=2)\n",
|
| 950 |
+
"ax_bar.set_xticks(x)\n",
|
| 951 |
+
"ax_bar.set_xticklabels(task_labels, color=C['text'], fontsize=10)\n",
|
| 952 |
+
"ax_bar.set_ylabel('Grade Score (0.0 to 1.0, higher is better)', fontsize=11, color=C['subtext'])\n",
|
| 953 |
+
"ax_bar.set_ylim(0, 1.15)\n",
|
| 954 |
+
"ax_bar.set_title('GridMind-RL Policy Performance Across All 4 Hackathon Themes\\nRandom vs Heuristic Baseline vs GRPO Fine-Tuned LLM',\n",
|
| 955 |
+
" color=C['text'], fontsize=13, fontweight='bold', pad=12)\n",
|
| 956 |
+
"ax_bar.legend(fontsize=10, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9,\n",
|
| 957 |
+
" edgecolor=C['border'], ncol=3, loc='upper right')\n",
|
| 958 |
+
"ax_bar.grid(axis='y', alpha=0.15, color=C['grid'], zorder=1)\n",
|
| 959 |
+
"for spine in ax_bar.spines.values():\n",
|
| 960 |
+
" spine.set_edgecolor(C['border'])\n",
|
| 961 |
+
"ax_bar.tick_params(colors=C['subtext'])\n",
|
| 962 |
+
"\n",
|
| 963 |
+
"# Panel B: reward signal over time.\n",
|
| 964 |
+
"ax_rew = fig.add_subplot(gs[1, 0])\n",
|
| 965 |
+
"style_ax(ax_rew, 'GRPO Training: Reward Signal per Step')\n",
|
| 966 |
+
"if training_rewards and len(training_rewards) >= 4:\n",
|
| 967 |
+
" raw = training_rewards\n",
|
| 968 |
+
" steps_r = list(range(1, len(raw) + 1))\n",
|
| 969 |
+
" ax_rew.plot(steps_r, raw, alpha=0.20, color=C['reward'], linewidth=1)\n",
|
| 970 |
+
" ax_rew.plot(steps_r, smooth(raw, window=6), color=C['reward'], linewidth=2.5, label='Smoothed reward')\n",
|
| 971 |
+
" if len(steps_r) > 8:\n",
|
| 972 |
+
" z = np.polyfit(steps_r, raw, 1)\n",
|
| 973 |
+
" p = np.poly1d(z)\n",
|
| 974 |
+
" ax_rew.plot(steps_r, p(steps_r), '--', color='white', alpha=0.35, linewidth=1.5,\n",
|
| 975 |
+
" label=f'Trend ({z[0]:+.5f}/step)')\n",
|
| 976 |
+
" ax_rew.set_xlabel('Reward Function Call')\n",
|
| 977 |
+
" ax_rew.set_ylabel('Reward Value')\n",
|
| 978 |
+
" ax_rew.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
|
| 979 |
+
" if np.var(raw) < 0.01:\n",
|
| 980 |
+
" ax_rew.text(0.5, 0.5, 'Low reward variance detected.\\nThis graph exposes weak learning signal.',\n",
|
| 981 |
+
" transform=ax_rew.transAxes, ha='center', va='center', color=C['random'], fontsize=10,\n",
|
| 982 |
+
" bbox=dict(boxstyle='round', facecolor=C['panel'], alpha=0.8))\n",
|
| 983 |
+
"else:\n",
|
| 984 |
+
" ax_rew.text(0.5, 0.5, 'No training rewards captured.\\nRe-run with fixed reward function.',\n",
|
| 985 |
+
" transform=ax_rew.transAxes, ha='center', va='center', color=C['subtext'], fontsize=11)\n",
|
| 986 |
+
"\n",
|
| 987 |
+
"# Panel C: training loss, with reward variance fallback.\n",
|
| 988 |
+
"ax_loss = fig.add_subplot(gs[1, 1])\n",
|
| 989 |
+
"style_ax(ax_loss, 'GRPO Training Loss per Step')\n",
|
| 990 |
+
"if step_losses and len(step_losses) >= 2:\n",
|
| 991 |
+
" ax_loss.plot(step_numbers, step_losses, alpha=0.25, color=C['loss'], linewidth=1)\n",
|
| 992 |
+
" ax_loss.plot(step_numbers, smooth(step_losses, window=4), color=C['loss'], linewidth=2.5, label='Smoothed loss')\n",
|
| 993 |
+
" non_zero = [l for l in step_losses if abs(l) > 1e-7]\n",
|
| 994 |
+
" pct_nz = len(non_zero) / len(step_losses) * 100\n",
|
| 995 |
+
" note_color = C['trained'] if pct_nz > 50 else C['random']\n",
|
| 996 |
+
" ax_loss.text(0.04, 0.96, f'Non-zero steps: {len(non_zero)}/{len(step_losses)} ({pct_nz:.0f}%)',\n",
|
| 997 |
+
" transform=ax_loss.transAxes, va='top', color=note_color, fontsize=9,\n",
|
| 998 |
+
" bbox=dict(boxstyle='round', facecolor=C['panel'], alpha=0.8))\n",
|
| 999 |
+
" ax_loss.set_xlabel('Training Step')\n",
|
| 1000 |
+
" ax_loss.set_ylabel('Loss')\n",
|
| 1001 |
+
" ax_loss.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
|
| 1002 |
+
"else:\n",
|
| 1003 |
+
" proxy_loss = []\n",
|
| 1004 |
+
" for i in range(0, len(training_rewards), 4):\n",
|
| 1005 |
+
" chunk = training_rewards[i:i+4]\n",
|
| 1006 |
+
" if len(chunk) > 1:\n",
|
| 1007 |
+
" proxy_loss.append(float(np.var(chunk)))\n",
|
| 1008 |
+
" if proxy_loss:\n",
|
| 1009 |
+
" ax_loss.plot(range(1, len(proxy_loss) + 1), proxy_loss, color=C['loss'], linewidth=2,\n",
|
| 1010 |
+
" label='Reward variance proxy')\n",
|
| 1011 |
+
" ax_loss.set_xlabel('Training Batch')\n",
|
| 1012 |
+
" ax_loss.set_ylabel('Reward Variance')\n",
|
| 1013 |
+
" ax_loss.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
|
| 1014 |
+
" ax_loss.text(0.5, 0.92, 'Loss not captured - showing reward variance proxy',\n",
|
| 1015 |
+
" transform=ax_loss.transAxes, ha='center', color=C['subtext'], fontsize=8)\n",
|
| 1016 |
+
" else:\n",
|
| 1017 |
+
" ax_loss.text(0.5, 0.5, 'No loss data available.', transform=ax_loss.transAxes,\n",
|
| 1018 |
+
" ha='center', va='center', color=C['subtext'], fontsize=11)\n",
|
| 1019 |
+
"\n",
|
| 1020 |
+
"fig.suptitle(\n",
|
| 1021 |
+
" 'GridMind-RL - Meta OpenEnv Hackathon - Multi-Agent Industrial Energy Management\\n'\n",
|
| 1022 |
+
" f'Model: Qwen2.5-1.5B + QLoRA + GRPO | Overall improvement vs heuristic: {overall_improvement:+.1f}%',\n",
|
| 1023 |
+
" color=C['text'], fontsize=14, fontweight='bold', y=0.97\n",
|
| 1024 |
+
")\n",
|
| 1025 |
+
"\n",
|
| 1026 |
+
"dashboard_path = 'results/gridmind_training_dashboard.png'\n",
|
| 1027 |
+
"fig.savefig(dashboard_path, dpi=180, facecolor=fig.get_facecolor(), bbox_inches='tight')\n",
|
| 1028 |
+
"plt.close(fig)\n",
|
| 1029 |
+
"\n",
|
| 1030 |
+
"# Separate before/after comparison graph for quick judge inspection.\n",
|
| 1031 |
+
"fig2, ax2 = plt.subplots(figsize=(11, 6))\n",
|
| 1032 |
+
"fig2.patch.set_facecolor(C['bg'])\n",
|
| 1033 |
+
"ax2.set_facecolor(C['panel'])\n",
|
| 1034 |
+
"ax2.bar(x - w/2, heuristic_vals, w, label='Heuristic Baseline', color=C['heuristic'], alpha=0.9)\n",
|
| 1035 |
+
"ax2.bar(x + w/2, trained_vals, w, label='Trained LLM (GRPO)', color=C['trained'], alpha=0.9)\n",
|
| 1036 |
+
"ax2.set_xticks(x)\n",
|
| 1037 |
+
"ax2.set_xticklabels(task_labels, color=C['text'])\n",
|
| 1038 |
+
"ax2.set_ylim(0, 1.05)\n",
|
| 1039 |
+
"ax2.set_ylabel('Grade Score', color=C['subtext'])\n",
|
| 1040 |
+
"ax2.set_title('Before/After Policy Score Comparison', color=C['text'], fontweight='bold')\n",
|
| 1041 |
+
"ax2.legend(facecolor=C['grid'], labelcolor=C['text'], edgecolor=C['border'])\n",
|
| 1042 |
+
"ax2.grid(axis='y', alpha=0.15, color=C['grid'])\n",
|
| 1043 |
+
"ax2.tick_params(colors=C['subtext'])\n",
|
| 1044 |
+
"for spine in ax2.spines.values():\n",
|
| 1045 |
+
" spine.set_edgecolor(C['border'])\n",
|
| 1046 |
+
"comparison_path = 'results/gridmind_before_after_comparison.png'\n",
|
| 1047 |
+
"fig2.savefig(comparison_path, dpi=180, facecolor=fig2.get_facecolor(), bbox_inches='tight')\n",
|
| 1048 |
+
"plt.close(fig2)\n",
|
| 1049 |
+
"\n",
|
| 1050 |
+
"print(f\"Saved dashboard graph to {dashboard_path}\")\n",
|
| 1051 |
+
"print(f\"Saved before/after graph to {comparison_path}\")\n",
|
| 1052 |
+
"\n",
|
| 1053 |
"results = {\n",
|
| 1054 |
" \"heuristic_baseline\": {\n",
|
| 1055 |
" \"scores_by_task\": {str(k): v for k, v in baseline_scores.items()},\n",
|
|
|
|
| 1064 |
" \"training_steps\": grpo_config.max_steps,\n",
|
| 1065 |
" \"themes_covered\": [\"multi_agent\", \"instruction_following\", \"world_modeling\", \"curriculum\"],\n",
|
| 1066 |
" \"training_rewards_log\": training_rewards[-20:] if training_rewards else [],\n",
|
| 1067 |
+
" \"training_step_logs\": training_steps_log[-20:] if training_steps_log else [],\n",
|
| 1068 |
+
" \"step_losses\": step_losses if 'step_losses' in globals() else [],\n",
|
| 1069 |
+
" \"graphs\": {\n",
|
| 1070 |
+
" \"dashboard\": dashboard_path,\n",
|
| 1071 |
+
" \"before_after\": comparison_path,\n",
|
| 1072 |
+
" },\n",
|
| 1073 |
"}\n",
|
| 1074 |
"\n",
|
| 1075 |
"print(\"Saving results...\")\n",
|
scripts/validate-submission.sh
CHANGED
|
@@ -6,7 +6,7 @@
|
|
| 6 |
#
|
| 7 |
# Prerequisites:
|
| 8 |
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
-
# - openenv-core: pip install openenv-core
|
| 10 |
# - curl (usually pre-installed)
|
| 11 |
#
|
| 12 |
# Run:
|
|
@@ -155,7 +155,7 @@ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
|
|
| 155 |
|
| 156 |
if ! command -v openenv &>/dev/null; then
|
| 157 |
fail "openenv command not found"
|
| 158 |
-
hint "Install it: pip install openenv-core"
|
| 159 |
stop_at "Step 3"
|
| 160 |
fi
|
| 161 |
|
|
|
|
| 6 |
#
|
| 7 |
# Prerequisites:
|
| 8 |
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
+
# - openenv-core: pip install openenv-core==0.2.3
|
| 10 |
# - curl (usually pre-installed)
|
| 11 |
#
|
| 12 |
# Run:
|
|
|
|
| 155 |
|
| 156 |
if ! command -v openenv &>/dev/null; then
|
| 157 |
fail "openenv command not found"
|
| 158 |
+
hint "Install it: pip install openenv-core==0.2.3"
|
| 159 |
stop_at "Step 3"
|
| 160 |
fi
|
| 161 |
|
uv.lock
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
version = 4
|
| 2 |
-
requires-python = ">=3.
|
| 3 |
|
| 4 |
[[package]]
|
| 5 |
name = "openai"
|
|
|
|
| 1 |
version = 4
|
| 2 |
+
requires-python = ">=3.10"
|
| 3 |
|
| 4 |
[[package]]
|
| 5 |
name = "openai"
|