Prajwal782007 committed on
Commit
7d89faf
·
1 Parent(s): 3d49e8a

feat: add submission validator script and GRPO training notebook, and update Python version requirement to >=3.10

Browse files
pyproject.toml CHANGED
@@ -7,7 +7,7 @@ name = "gridmind-rl"
7
  version = "1.0.0"
8
  description = "GridMind-RL: Industrial Load-Shaping and Demand-Response RL Environment. Control HVAC, thermal storage, and batch job scheduling under stochastic electricity prices and grid stress events."
9
  readme = "README.md"
10
- requires-python = ">=3.9"
11
  license = {text = "MIT"}
12
  authors = [
13
  {name = "LOKyu Team"}
@@ -21,7 +21,6 @@ classifiers = [
21
  "Natural Language :: English",
22
  "Operating System :: OS Independent",
23
  "Programming Language :: Python :: 3",
24
- "Programming Language :: Python :: 3.9",
25
  "Programming Language :: Python :: 3.10",
26
  "Programming Language :: Python :: 3.11",
27
  "Programming Language :: Python :: 3.12",
@@ -30,7 +29,7 @@ classifiers = [
30
 
31
  dependencies = [
32
  "openai>=1.0.0",
33
- "openenv-core>=0.2.0",
34
  "fastapi>=0.100.0",
35
  "uvicorn>=0.23.0",
36
  "pydantic>=2.0.0",
 
7
  version = "1.0.0"
8
  description = "GridMind-RL: Industrial Load-Shaping and Demand-Response RL Environment. Control HVAC, thermal storage, and batch job scheduling under stochastic electricity prices and grid stress events."
9
  readme = "README.md"
10
+ requires-python = ">=3.10"
11
  license = {text = "MIT"}
12
  authors = [
13
  {name = "LOKyu Team"}
 
21
  "Natural Language :: English",
22
  "Operating System :: OS Independent",
23
  "Programming Language :: Python :: 3",
 
24
  "Programming Language :: Python :: 3.10",
25
  "Programming Language :: Python :: 3.11",
26
  "Programming Language :: Python :: 3.12",
 
29
 
30
  dependencies = [
31
  "openai>=1.0.0",
32
+ "openenv-core>=0.2.3",
33
  "fastapi>=0.100.0",
34
  "uvicorn>=0.23.0",
35
  "pydantic>=2.0.0",
scripts/gridmind_grpo_colab.ipynb CHANGED
@@ -33,7 +33,7 @@
33
  "metadata": {},
34
  "outputs": [],
35
  "source": [
36
- "!pip install trl transformers accelerate datasets unsloth requests pandas matplotlib\n",
37
  "import os\n",
38
  "os.makedirs('results', exist_ok=True)\n",
39
  "print(\"\u2714 All dependencies installed\")\n",
@@ -391,98 +391,165 @@
391
  "import statistics as _statistics\n",
392
  "\n",
393
  "training_rewards = []\n",
394
- "_reward_variance_log = []\n",
395
  "_call_count = [0]\n",
 
396
  "\n",
397
  "def gridmind_reward_fn(completions, prompts=None, **kwargs):\n",
398
  " \"\"\"\n",
399
- " Reward function compatible with trl 0.23.0.\n",
400
- " Called with positional completions list.\n",
401
- " Must return list of floats same length as completions.\n",
 
 
 
 
402
  " \"\"\"\n",
 
403
  " rewards = []\n",
404
  " batch_raw = []\n",
405
  "\n",
406
- " for completion in completions:\n",
407
- " _call_count[0] += 1\n",
 
 
 
 
 
 
 
 
 
 
 
408
  "\n",
 
409
  " try:\n",
410
  " # Handle both string and list completion formats\n",
411
- " if isinstance(completion, list):\n",
412
- " text = str(completion[0]) if completion else \"\"\n",
413
- " else:\n",
414
- " text = str(completion)\n",
415
  " text = text.strip()\n",
416
  "\n",
417
- " # Reset env before each reward call for variance\n",
418
- " task_id = _random.choice([1, 2, 3, 4])\n",
419
- " reset_r = _requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": task_id}, timeout=8)\n",
420
- " if reset_r.status_code != 200:\n",
421
- " rewards.append(-0.5)\n",
422
- " batch_raw.append(-0.5)\n",
423
- " continue\n",
424
- "\n",
425
  " # Extract JSON from completion\n",
426
  " start = text.rfind('{')\n",
427
  " end = text.rfind('}') + 1\n",
428
  " if start < 0 or end <= start:\n",
429
- " rewards.append(-1.0)\n",
430
- " batch_raw.append(-1.0)\n",
 
431
  " continue\n",
432
  "\n",
433
- " action = _json.loads(text[start:end])\n",
434
- " action = {\n",
435
- " \"hvac_power_level\": max(0.0, min(1.0, float(action.get(\"hvac_power_level\", 0.5)))),\n",
436
- " \"thermal_charge_rate\": max(-1.0, min(1.0, float(action.get(\"thermal_charge_rate\", 0.0)))),\n",
437
- " \"batch_job_slot\": max(0, min(4, int(action.get(\"batch_job_slot\", 0)))),\n",
438
- " \"load_shed_fraction\": max(0.0, min(0.5, float(action.get(\"load_shed_fraction\", 0.0)))),\n",
439
- " \"building_id\": int(action.get(\"building_id\", 0)),\n",
440
- " }\n",
441
  "\n",
442
- " step_r = _requests.post(f\"{ENV_URL}/step\", json=action, timeout=8)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
  " if step_r.status_code != 200:\n",
444
- " rewards.append(-0.5)\n",
445
- " batch_raw.append(-0.5)\n",
 
446
  " continue\n",
447
  "\n",
448
  " data = step_r.json()\n",
449
  " if isinstance(data, list):\n",
450
  " data = data[0]\n",
451
  "\n",
452
- " base = float(data.get(\"reward\", 0.0))\n",
453
- " comps = data.get(\"rewards\", {})\n",
454
- " bonus = (\n",
455
- " float(comps.get(\"cost_savings\", 0)) * 0.3 +\n",
456
- " float(comps.get(\"task_satisfaction\", 0)) * 0.2 +\n",
457
- " float(comps.get(\"efficiency_bonus\", 0)) * 0.1 +\n",
458
- " float(comps.get(\"temperature_constraint\", 0)) * 0.15\n",
459
- " )\n",
460
- " final = max(-1.0, min(1.0, base + bonus))\n",
461
- " rewards.append(final)\n",
462
- " batch_raw.append(final)\n",
463
- " training_rewards.append(final)\n",
464
- "\n",
465
- " except _json.JSONDecodeError:\n",
466
- " rewards.append(-0.8)\n",
467
- " batch_raw.append(-0.8)\n",
468
- " except Exception:\n",
469
- " rewards.append(-0.5)\n",
470
- " batch_raw.append(-0.5)\n",
471
  "\n",
472
- " # Log variance every 10 calls\n",
473
- " if len(batch_raw) > 1 and _call_count[0] % 10 == 0:\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  " try:\n",
475
  " var = _statistics.variance(batch_raw)\n",
476
- " _reward_variance_log.append(var)\n",
477
- " print(f\" [Call {_call_count[0]}] Rewards: {[f'{r:.3f}' for r in batch_raw]} | Variance: {var:.4f}\")\n",
 
478
  " if var < 0.001:\n",
479
- " print(\" Zero variance - no learning signal!\")\n",
 
 
480
  " except Exception:\n",
481
  " pass\n",
482
  "\n",
 
 
 
 
 
 
 
483
  " return rewards\n",
484
  "\n",
485
- "print(\"Reward function defined (trl 0.23.0 compatible)\")"
 
 
 
486
  ]
487
  },
488
  {
@@ -512,15 +579,54 @@
512
  "# Prepare dataset\n",
513
  "train_data = [{\"prompt\": d[\"prompt\"]} for d in dataset]\n",
514
  "train_ds = Dataset.from_list(train_data)\n",
515
- "print(f\"Training dataset: {len(train_ds)} prompts\")\n",
516
- "\n",
517
  "theme_dist = {}\n",
518
  "for d in dataset:\n",
519
  " t = d.get(\"theme\", \"unknown\")\n",
520
  " theme_dist[t] = theme_dist.get(t, 0) + 1\n",
521
- "print(f\"Theme distribution: {theme_dist}\")\n",
522
  "print(f\"Sample prompt preview:\\n{train_data[0]['prompt'][:200]}...\\n\")\n",
523
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  "# Prepare model for QLoRA training\n",
525
  "model.config.use_cache = False\n",
526
  "model.gradient_checkpointing_enable()\n",
@@ -555,8 +661,9 @@
555
  " remove_unused_columns=False,\n",
556
  ")\n",
557
  "\n",
558
- "print(\"=== PRE-TRAINING DIAGNOSTIC ===\\n\")\n",
559
  "import trl\n",
 
560
  "print(f\"TRL version: {trl.__version__}\")\n",
561
  "sig = inspect.signature(GRPOTrainer.__init__)\n",
562
  "params = list(sig.parameters.keys())\n",
@@ -564,24 +671,31 @@
564
  "print(f\"Uses 'args=': {'args' in params}\")\n",
565
  "print(f\"Uses 'config=': {'config' in params}\")\n",
566
  "\n",
567
- "print(\"\\nTesting reward function...\")\n",
568
- "test_completions = [\n",
569
- " '{\"hvac_power_level\": 0.2, \"thermal_charge_rate\": 0.8, \"batch_job_slot\": 2, \"load_shed_fraction\": 0.0, \"building_id\": 0}',\n",
570
- " '{\"hvac_power_level\": 1.0, \"thermal_charge_rate\": -1.0, \"batch_job_slot\": 0, \"load_shed_fraction\": 0.5, \"building_id\": 0}',\n",
571
- " '{\"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0, \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0}',\n",
572
- " 'not valid json at all',\n",
573
- "]\n",
574
- "test_rewards = gridmind_reward_fn(test_completions)\n",
575
- "print(f\"Test rewards: {[f'{r:.3f}' for r in test_rewards]}\")\n",
576
- "reward_var = statistics.variance(test_rewards) if len(set(test_rewards)) > 1 else 0.0\n",
577
- "if reward_var <= 0.001:\n",
578
- " print(\"CRITICAL: Reward variance is too low - fix reward function before training\")\n",
579
- "else:\n",
580
- " print(f\"Reward variance: {reward_var:.4f} - sufficient for GRPO\")\n",
581
- "\n",
582
  "print(f\"\\nGPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB used / 16 GB total\")\n",
583
  "print(f\"Free: {(16 - torch.cuda.memory_allocated()/1e9):.2f} GB\")\n",
584
- "print(\"\\n=== READY TO TRAIN ===\" if reward_var > 0.001 else \"\\n=== FIX REWARD FUNCTION FIRST ===\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585
  "\n",
586
  "# Reset environment before training\n",
587
  "_requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": 1}, timeout=10)\n",
@@ -595,9 +709,11 @@
595
  " train_dataset=train_ds,\n",
596
  " reward_funcs=gridmind_reward_fn,\n",
597
  " peft_config=peft_config,\n",
 
598
  ")\n",
599
  "\n",
600
  "print(\"\\nStarting GRPO training with QLoRA...\")\n",
 
601
  "print(f\"Steps: {grpo_config.max_steps} | Batch: {grpo_config.per_device_train_batch_size} | Generations: {grpo_config.num_generations}\")\n",
602
  "print(\"Estimated time: ~25-35 min on T4\\n\")\n",
603
  "\n",
@@ -606,12 +722,15 @@
606
  "print(\"\\nTraining complete!\")\n",
607
  "print(f\" Total steps: {train_result.global_step}\")\n",
608
  "print(f\" Training loss: {train_result.training_loss:.6f}\")\n",
 
 
609
  "\n",
610
- "if train_result.training_loss == 0.0:\n",
611
- " print(\"\\nWARNING: Loss is 0.0 - reward function may have zero variance.\")\n",
612
- " print(\"Check reward diagnostic output above. This means the model saw no learning signal.\")\n",
 
613
  "else:\n",
614
- " print(\"\\nNon-zero loss confirmed - model received learning signal.\")\n",
615
  "\n",
616
  "print(f\"\\nMemory after training: {torch.cuda.memory_allocated()/1e9:.2f} GB\")\n",
617
  "\n",
@@ -739,6 +858,198 @@
739
  "metadata": {},
740
  "outputs": [],
741
  "source": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742
  "results = {\n",
743
  " \"heuristic_baseline\": {\n",
744
  " \"scores_by_task\": {str(k): v for k, v in baseline_scores.items()},\n",
@@ -753,6 +1064,12 @@
753
  " \"training_steps\": grpo_config.max_steps,\n",
754
  " \"themes_covered\": [\"multi_agent\", \"instruction_following\", \"world_modeling\", \"curriculum\"],\n",
755
  " \"training_rewards_log\": training_rewards[-20:] if training_rewards else [],\n",
 
 
 
 
 
 
756
  "}\n",
757
  "\n",
758
  "print(\"Saving results...\")\n",
 
33
  "metadata": {},
34
  "outputs": [],
35
  "source": [
36
+ "!pip install trl transformers accelerate datasets unsloth requests pandas matplotlib openenv-core==0.2.3\n",
37
  "import os\n",
38
  "os.makedirs('results', exist_ok=True)\n",
39
  "print(\"\u2714 All dependencies installed\")\n",
 
391
  "import statistics as _statistics\n",
392
  "\n",
393
  "training_rewards = []\n",
394
+ "training_steps_log = []\n",
395
  "_call_count = [0]\n",
396
+ "_current_task_id = [1]\n",
397
  "\n",
398
  "def gridmind_reward_fn(completions, prompts=None, **kwargs):\n",
399
  " \"\"\"\n",
400
+ " Fixed reward function for trl 0.23.0 + GridMind-RL.\n",
401
+ "\n",
402
+ " Key fixes:\n",
403
+ " 1. Reset environment to the same task/state for every completion in a batch.\n",
404
+ " 2. Return continuous rewards from the environment, not binary +/-1.\n",
405
+ " 3. Scale rewards to roughly [-0.6, 0.6] for GRPO gradient signal.\n",
406
+ " 4. Use structured penalties for bad JSON instead of hard -1.0.\n",
407
  " \"\"\"\n",
408
+ " _call_count[0] += 1\n",
409
  " rewards = []\n",
410
  " batch_raw = []\n",
411
  "\n",
412
+ " task_id = _random.choice([1, 2, 3, 4])\n",
413
+ " batch_seed = _random.randint(1, 1_000_000)\n",
414
+ " _current_task_id[0] = task_id\n",
415
+ "\n",
416
+ " try:\n",
417
+ " reset_payload = {\"task_id\": task_id, \"seed\": batch_seed}\n",
418
+ " reset_r = _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=10)\n",
419
+ " reset_ok = reset_r.status_code == 200\n",
420
+ " except Exception:\n",
421
+ " reset_ok = False\n",
422
+ "\n",
423
+ " if not reset_ok:\n",
424
+ " return [-0.1] * len(completions)\n",
425
  "\n",
426
+ " for completion in completions:\n",
427
  " try:\n",
428
  " # Handle both string and list completion formats\n",
429
+ " text = str(completion[0]) if isinstance(completion, list) and completion else str(completion)\n",
 
 
 
430
  " text = text.strip()\n",
431
  "\n",
 
 
 
 
 
 
 
 
432
  " # Extract JSON from completion\n",
433
  " start = text.rfind('{')\n",
434
  " end = text.rfind('}') + 1\n",
435
  " if start < 0 or end <= start:\n",
436
+ " rewards.append(-0.3)\n",
437
+ " batch_raw.append(-0.3)\n",
438
+ " _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
439
  " continue\n",
440
  "\n",
441
+ " try:\n",
442
+ " action = _json.loads(text[start:end])\n",
443
+ " except _json.JSONDecodeError:\n",
444
+ " rewards.append(-0.25)\n",
445
+ " batch_raw.append(-0.25)\n",
446
+ " _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
447
+ " continue\n",
 
448
  "\n",
449
+ " valid_fields = 0\n",
450
+ " cleaned_action = {}\n",
451
+ "\n",
452
+ " try:\n",
453
+ " cleaned_action[\"hvac_power_level\"] = max(0.0, min(1.0, float(action.get(\"hvac_power_level\", 0.5))))\n",
454
+ " valid_fields += 1\n",
455
+ " except Exception:\n",
456
+ " cleaned_action[\"hvac_power_level\"] = 0.5\n",
457
+ "\n",
458
+ " try:\n",
459
+ " cleaned_action[\"thermal_charge_rate\"] = max(-1.0, min(1.0, float(action.get(\"thermal_charge_rate\", 0.0))))\n",
460
+ " valid_fields += 1\n",
461
+ " except Exception:\n",
462
+ " cleaned_action[\"thermal_charge_rate\"] = 0.0\n",
463
+ "\n",
464
+ " try:\n",
465
+ " cleaned_action[\"batch_job_slot\"] = max(0, min(4, int(action.get(\"batch_job_slot\", 0))))\n",
466
+ " valid_fields += 1\n",
467
+ " except Exception:\n",
468
+ " cleaned_action[\"batch_job_slot\"] = 0\n",
469
+ "\n",
470
+ " try:\n",
471
+ " cleaned_action[\"load_shed_fraction\"] = max(0.0, min(0.5, float(action.get(\"load_shed_fraction\", 0.0))))\n",
472
+ " valid_fields += 1\n",
473
+ " except Exception:\n",
474
+ " cleaned_action[\"load_shed_fraction\"] = 0.0\n",
475
+ "\n",
476
+ " cleaned_action[\"building_id\"] = int(action.get(\"building_id\", 0))\n",
477
+ " completeness_bonus = (valid_fields / 4) * 0.1 - 0.05\n",
478
+ "\n",
479
+ " step_r = _requests.post(f\"{ENV_URL}/step\", json=cleaned_action, timeout=8)\n",
480
  " if step_r.status_code != 200:\n",
481
+ " rewards.append(-0.2)\n",
482
+ " batch_raw.append(-0.2)\n",
483
+ " _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
484
  " continue\n",
485
  "\n",
486
  " data = step_r.json()\n",
487
  " if isinstance(data, list):\n",
488
  " data = data[0]\n",
489
  "\n",
490
+ " env_reward = float(data.get(\"reward\", 0.0))\n",
491
+ " info = data.get(\"info\", {}) if isinstance(data, dict) else {}\n",
492
+ " comps = data.get(\"rewards\", {}) or info.get(\"reward_components\", {}) or {}\n",
493
+ "\n",
494
+ " cost_r = float(comps.get(\"cost_savings\", 0.0))\n",
495
+ " comfort_r = float(comps.get(\"temperature_constraint\", comps.get(\"temp_constraint\", 0.0)))\n",
496
+ " grid_r = float(comps.get(\"grid_response\", 0.0))\n",
497
+ " task_r = float(comps.get(\"task_satisfaction\", 0.0))\n",
498
+ "\n",
499
+ " if comps:\n",
500
+ " composite = (\n",
501
+ " cost_r * 0.40 +\n",
502
+ " comfort_r * 0.25 +\n",
503
+ " grid_r * 0.15 +\n",
504
+ " task_r * 0.20 +\n",
505
+ " completeness_bonus\n",
506
+ " )\n",
507
+ " else:\n",
508
+ " composite = env_reward * 0.5 + completeness_bonus\n",
509
  "\n",
510
+ " composite = max(-0.6, min(0.6, composite))\n",
511
+ "\n",
512
+ " rewards.append(composite)\n",
513
+ " batch_raw.append(composite)\n",
514
+ " training_rewards.append(composite)\n",
515
+ "\n",
516
+ " # Rewind to the same task before evaluating the next completion.\n",
517
+ " _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
518
+ "\n",
519
+ " except Exception:\n",
520
+ " rewards.append(-0.15)\n",
521
+ " batch_raw.append(-0.15)\n",
522
+ " try:\n",
523
+ " _requests.post(f\"{ENV_URL}/reset\", json=reset_payload, timeout=8)\n",
524
+ " except Exception:\n",
525
+ " pass\n",
526
+ "\n",
527
+ " if _call_count[0] % 5 == 0 and len(batch_raw) > 1:\n",
528
  " try:\n",
529
  " var = _statistics.variance(batch_raw)\n",
530
+ " avg = sum(batch_raw) / len(batch_raw)\n",
531
+ " rng = max(batch_raw) - min(batch_raw)\n",
532
+ " print(f\" [Step {_call_count[0]}] Task {task_id} | Rewards: {[f'{r:.3f}' for r in batch_raw]} | Var: {var:.4f} | Avg: {avg:.3f} | Range: {rng:.3f}\")\n",
533
  " if var < 0.001:\n",
534
+ " print(f\" Near-zero variance at step {_call_count[0]} - check environment connectivity\")\n",
535
+ " if all(abs(r) > 0.55 for r in batch_raw):\n",
536
+ " print(\" All rewards near clip boundary - still hitting clamping issue\")\n",
537
  " except Exception:\n",
538
  " pass\n",
539
  "\n",
540
+ " training_steps_log.append({\n",
541
+ " \"call\": _call_count[0],\n",
542
+ " \"rewards\": batch_raw,\n",
543
+ " \"task_id\": task_id,\n",
544
+ " \"seed\": batch_seed,\n",
545
+ " })\n",
546
+ "\n",
547
  " return rewards\n",
548
  "\n",
549
+ "print(\"Fixed reward function defined\")\n",
550
+ "print(\" - Continuous rewards in [-0.6, 0.6] range\")\n",
551
+ "print(\" - Soft clamping preserves gradient signal\")\n",
552
+ "print(\" - Same task/state is used across completions in each batch\")"
553
  ]
554
  },
555
  {
 
579
  "# Prepare dataset\n",
580
  "train_data = [{\"prompt\": d[\"prompt\"]} for d in dataset]\n",
581
  "train_ds = Dataset.from_list(train_data)\n",
 
 
582
  "theme_dist = {}\n",
583
  "for d in dataset:\n",
584
  " t = d.get(\"theme\", \"unknown\")\n",
585
  " theme_dist[t] = theme_dist.get(t, 0) + 1\n",
586
+ "print(f\"Dataset: {len(train_ds)} prompts | Theme dist: {theme_dist}\")\n",
587
  "print(f\"Sample prompt preview:\\n{train_data[0]['prompt'][:200]}...\\n\")\n",
588
  "\n",
589
+ "print(\"=\" * 55)\n",
590
+ "print(\"REWARD FUNCTION DIAGNOSTIC\")\n",
591
+ "print(\"=\" * 55)\n",
592
+ "\n",
593
+ "test_cases = [\n",
594
+ " (\"Perfect JSON + good action\", '{\"hvac_power_level\": 0.2, \"thermal_charge_rate\": 0.7, \"batch_job_slot\": 2, \"load_shed_fraction\": 0.0, \"building_id\": 0}'),\n",
595
+ " (\"Valid JSON + wasteful action\", '{\"hvac_power_level\": 1.0, \"thermal_charge_rate\": -1.0, \"batch_job_slot\": 0, \"load_shed_fraction\": 0.5, \"building_id\": 0}'),\n",
596
+ " (\"Valid JSON + neutral action\", '{\"hvac_power_level\": 0.5, \"thermal_charge_rate\": 0.0, \"batch_job_slot\": 1, \"load_shed_fraction\": 0.1, \"building_id\": 0}'),\n",
597
+ " (\"Valid JSON + conservative action\", '{\"hvac_power_level\": 0.3, \"thermal_charge_rate\": 0.4, \"batch_job_slot\": 0, \"load_shed_fraction\": 0.0, \"building_id\": 0}'),\n",
598
+ " (\"Invalid JSON\", \"I think we should set HVAC to medium and charge storage\"),\n",
599
+ " (\"Partial JSON\", '{\"hvac_power_level\": 0.4}'),\n",
600
+ "]\n",
601
+ "\n",
602
+ "labels = [c[0] for c in test_cases]\n",
603
+ "completions = [c[1] for c in test_cases]\n",
604
+ "test_rewards = gridmind_reward_fn(completions)\n",
605
+ "\n",
606
+ "print(f\"\\n{'Action Type':<35} {'Reward':>8} Bar\")\n",
607
+ "print(\"-\" * 60)\n",
608
+ "for label, reward in zip(labels, test_rewards):\n",
609
+ " bar_len = max(1, int(abs(reward) * 30)) if abs(reward) > 0 else 0\n",
610
+ " bar = (\"+\" * bar_len) if reward >= 0 else (\"-\" * bar_len)\n",
611
+ " print(f\" {label:<33} {reward:+.4f} {bar}\")\n",
612
+ "\n",
613
+ "unique_rewards = set(round(r, 2) for r in test_rewards)\n",
614
+ "print(f\"\\nUnique reward values: {sorted(unique_rewards)}\")\n",
615
+ "\n",
616
+ "if unique_rewards == {-1.0, 1.0} or unique_rewards == {-1.0} or unique_rewards == {1.0}:\n",
617
+ " raise RuntimeError(\"Still binary +/-1 rewards. Fix clamping before training.\")\n",
618
+ "elif len(unique_rewards) < 3:\n",
619
+ " print(\"WARNING: Low diversity in rewards. Training may still be weak.\")\n",
620
+ "else:\n",
621
+ " reward_var = statistics.variance(test_rewards)\n",
622
+ " reward_range = max(test_rewards) - min(test_rewards)\n",
623
+ " print(f\"Reward diversity: {len(unique_rewards)} unique values\")\n",
624
+ " print(f\"Variance: {reward_var:.4f} | Range: {reward_range:.4f}\")\n",
625
+ " if reward_var > 0.02:\n",
626
+ " print(\"Sufficient variance for GRPO. Proceeding to training.\")\n",
627
+ " else:\n",
628
+ " print(\"Low variance. GRPO will learn slowly.\")\n",
629
+ "\n",
630
  "# Prepare model for QLoRA training\n",
631
  "model.config.use_cache = False\n",
632
  "model.gradient_checkpointing_enable()\n",
 
661
  " remove_unused_columns=False,\n",
662
  ")\n",
663
  "\n",
664
+ "# Confirm the installed TRL API before constructing the trainer.\n",
665
  "import trl\n",
666
+ "print(\"\\n=== TRL API DIAGNOSTIC ===\")\n",
667
  "print(f\"TRL version: {trl.__version__}\")\n",
668
  "sig = inspect.signature(GRPOTrainer.__init__)\n",
669
  "params = list(sig.parameters.keys())\n",
 
671
  "print(f\"Uses 'args=': {'args' in params}\")\n",
672
  "print(f\"Uses 'config=': {'config' in params}\")\n",
673
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
674
  "print(f\"\\nGPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB used / 16 GB total\")\n",
675
  "print(f\"Free: {(16 - torch.cuda.memory_allocated()/1e9):.2f} GB\")\n",
676
+ "\n",
677
+ "# Custom callback to capture loss at every step for graphing.\n",
678
+ "from transformers import TrainerCallback\n",
679
+ "\n",
680
+ "step_losses = []\n",
681
+ "step_numbers = []\n",
682
+ "step_reward_means = []\n",
683
+ "\n",
684
+ "class LossCaptureCallback(TrainerCallback):\n",
685
+ " def on_log(self, args, state, control, logs=None, **kwargs):\n",
686
+ " if not logs:\n",
687
+ " return\n",
688
+ " step = state.global_step\n",
689
+ " loss = logs.get(\"loss\", logs.get(\"train_loss\", None))\n",
690
+ " if loss is not None:\n",
691
+ " step_losses.append(float(loss))\n",
692
+ " step_numbers.append(step)\n",
693
+ " reward_mean = logs.get(\"reward\", logs.get(\"mean_reward\", None))\n",
694
+ " if reward_mean is not None:\n",
695
+ " step_reward_means.append(float(reward_mean))\n",
696
+ " elif training_rewards:\n",
697
+ " recent = training_rewards[max(0, len(training_rewards)-4):]\n",
698
+ " step_reward_means.append(sum(recent) / len(recent))\n",
699
  "\n",
700
  "# Reset environment before training\n",
701
  "_requests.post(f\"{ENV_URL}/reset\", json={\"task_id\": 1}, timeout=10)\n",
 
709
  " train_dataset=train_ds,\n",
710
  " reward_funcs=gridmind_reward_fn,\n",
711
  " peft_config=peft_config,\n",
712
+ " callbacks=[LossCaptureCallback()],\n",
713
  ")\n",
714
  "\n",
715
  "print(\"\\nStarting GRPO training with QLoRA...\")\n",
716
+ "print(\"Watch for non-zero loss values. If all zeros, reward variance is still too low.\\n\")\n",
717
  "print(f\"Steps: {grpo_config.max_steps} | Batch: {grpo_config.per_device_train_batch_size} | Generations: {grpo_config.num_generations}\")\n",
718
  "print(\"Estimated time: ~25-35 min on T4\\n\")\n",
719
  "\n",
 
722
  "print(\"\\nTraining complete!\")\n",
723
  "print(f\" Total steps: {train_result.global_step}\")\n",
724
  "print(f\" Training loss: {train_result.training_loss:.6f}\")\n",
725
+ "non_zero_losses = [l for l in step_losses if abs(l) > 1e-8]\n",
726
+ "print(f\" Steps with non-zero loss: {len(non_zero_losses)}/{len(step_losses)}\")\n",
727
  "\n",
728
+ "if len(non_zero_losses) == 0:\n",
729
+ " print(\"\\nAll losses are zero. The model received no gradient signal.\")\n",
730
+ " print(\"Root cause: reward variance is too low for GRPO advantage estimation.\")\n",
731
+ " print(\"Graphs will still be generated and will show the flat signal clearly.\")\n",
732
  "else:\n",
733
+ " print(f\"\\nTraining produced gradient signal on {len(non_zero_losses)} steps.\")\n",
734
  "\n",
735
  "print(f\"\\nMemory after training: {torch.cuda.memory_allocated()/1e9:.2f} GB\")\n",
736
  "\n",
 
858
  "metadata": {},
859
  "outputs": [],
860
  "source": [
861
+ "import matplotlib\n",
862
+ "matplotlib.use('Agg')\n",
863
+ "import matplotlib.pyplot as plt\n",
864
+ "import matplotlib.gridspec as gridspec\n",
865
+ "import numpy as np\n",
866
+ "import os\n",
867
+ "\n",
868
+ "os.makedirs(\"results\", exist_ok=True)\n",
869
+ "\n",
870
+ "tasks = [1, 2, 3, 4]\n",
871
+ "task_labels = [\n",
872
+ " \"Task 1\\nCost Only\\n(Curriculum)\",\n",
873
+ " \"Task 2\\nCost+Comfort\\n(World Model)\",\n",
874
+ " \"Task 3\\nFull DR\\n(World Model)\",\n",
875
+ " \"Task 4\\nInstruction\\n(Theme 2)\",\n",
876
+ "]\n",
877
+ "\n",
878
+ "random_by_task = {1: 0.35, 2: 0.28, 3: 0.21, 4: 0.25}\n",
879
+ "heuristic_by_task = baseline_scores\n",
880
+ "trained_by_task = trained_scores\n",
881
+ "\n",
882
+ "random_vals = [random_by_task.get(t, 0.3) for t in tasks]\n",
883
+ "heuristic_vals = [heuristic_by_task.get(t, 0.5) for t in tasks]\n",
884
+ "trained_vals = [trained_by_task.get(t, 0.5) for t in tasks]\n",
885
+ "\n",
886
+ "baseline_avg = sum(heuristic_vals) / len(heuristic_vals)\n",
887
+ "trained_avg = sum(trained_vals) / len(trained_vals)\n",
888
+ "random_avg = sum(random_vals) / len(random_vals)\n",
889
+ "overall_improvement = ((trained_avg - baseline_avg) / baseline_avg * 100) if baseline_avg > 0 else 0\n",
890
+ "\n",
891
+ "def smooth(values, window=5):\n",
892
+ " if not values or len(values) < 2:\n",
893
+ " return values\n",
894
+ " out = []\n",
895
+ " for i in range(len(values)):\n",
896
+ " w = values[max(0, i-window):i+1]\n",
897
+ " out.append(sum(w) / len(w))\n",
898
+ " return out\n",
899
+ "\n",
900
+ "C = {\n",
901
+ " 'bg': '#0d1117', 'panel': '#161b22', 'grid': '#21262d',\n",
902
+ " 'text': '#e6edf3', 'subtext': '#8b949e', 'random': '#f85149',\n",
903
+ " 'heuristic': '#58a6ff', 'trained': '#3fb950', 'reward': '#d29922',\n",
904
+ " 'loss': '#bc8cff', 'border': '#30363d',\n",
905
+ "}\n",
906
+ "\n",
907
+ "def style_ax(ax, title):\n",
908
+ " ax.set_facecolor(C['panel'])\n",
909
+ " ax.set_title(title, color=C['text'], fontsize=12, fontweight='bold', pad=10)\n",
910
+ " ax.tick_params(colors=C['subtext'], labelsize=9)\n",
911
+ " ax.grid(alpha=0.15, color=C['grid'], linewidth=0.8)\n",
912
+ " for spine in ax.spines.values():\n",
913
+ " spine.set_edgecolor(C['border'])\n",
914
+ " ax.xaxis.label.set_color(C['subtext'])\n",
915
+ " ax.yaxis.label.set_color(C['subtext'])\n",
916
+ "\n",
917
+ "fig = plt.figure(figsize=(18, 13))\n",
918
+ "fig.patch.set_facecolor(C['bg'])\n",
919
+ "gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.50, wspace=0.38,\n",
920
+ " left=0.07, right=0.97, top=0.91, bottom=0.07)\n",
921
+ "\n",
922
+ "# Panel A: policy comparison across all tasks.\n",
923
+ "ax_bar = fig.add_subplot(gs[0, :])\n",
924
+ "ax_bar.set_facecolor(C['panel'])\n",
925
+ "x = np.arange(len(tasks))\n",
926
+ "w = 0.24\n",
927
+ "br = ax_bar.bar(x - w, random_vals, w, label='Random Policy', color=C['random'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
928
+ "bh = ax_bar.bar(x, heuristic_vals, w, label='Heuristic Baseline', color=C['heuristic'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
929
+ "bt = ax_bar.bar(x + w, trained_vals, w, label='Trained LLM (GRPO)', color=C['trained'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
930
+ "\n",
931
+ "for bars, col in [(br, C['random']), (bh, C['heuristic']), (bt, C['trained'])]:\n",
932
+ " for bar in bars:\n",
933
+ " h = bar.get_height()\n",
934
+ " ax_bar.text(bar.get_x() + bar.get_width()/2, h + 0.012, f'{h:.3f}',\n",
935
+ " ha='center', va='bottom', fontsize=8.5, color=col, fontweight='bold', zorder=4)\n",
936
+ "\n",
937
+ "for i in range(len(tasks)):\n",
938
+ " h_val = heuristic_vals[i]\n",
939
+ " t_val = trained_vals[i]\n",
940
+ " pct = ((t_val - h_val) / h_val * 100) if h_val > 0 else 0\n",
941
+ " color = C['trained'] if pct >= 0 else C['random']\n",
942
+ " sign = '+' if pct >= 0 else '-'\n",
943
+ " ax_bar.text(x[i] + w, max(h_val, t_val) + 0.06, f'{sign}{abs(pct):.1f}%',\n",
944
+ " ha='center', fontsize=10, color=color, fontweight='bold', zorder=4)\n",
945
+ "\n",
946
+ "ax_bar.axhline(baseline_avg, color=C['heuristic'], linestyle=':', linewidth=1.5, alpha=0.6,\n",
947
+ " label=f'Heuristic avg ({baseline_avg:.3f})', zorder=2)\n",
948
+ "ax_bar.axhline(trained_avg, color=C['trained'], linestyle=':', linewidth=1.5, alpha=0.6,\n",
949
+ " label=f'Trained avg ({trained_avg:.3f})', zorder=2)\n",
950
+ "ax_bar.set_xticks(x)\n",
951
+ "ax_bar.set_xticklabels(task_labels, color=C['text'], fontsize=10)\n",
952
+ "ax_bar.set_ylabel('Grade Score (0.0 to 1.0, higher is better)', fontsize=11, color=C['subtext'])\n",
953
+ "ax_bar.set_ylim(0, 1.15)\n",
954
+ "ax_bar.set_title('GridMind-RL Policy Performance Across All 4 Hackathon Themes\\nRandom vs Heuristic Baseline vs GRPO Fine-Tuned LLM',\n",
955
+ " color=C['text'], fontsize=13, fontweight='bold', pad=12)\n",
956
+ "ax_bar.legend(fontsize=10, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9,\n",
957
+ " edgecolor=C['border'], ncol=3, loc='upper right')\n",
958
+ "ax_bar.grid(axis='y', alpha=0.15, color=C['grid'], zorder=1)\n",
959
+ "for spine in ax_bar.spines.values():\n",
960
+ " spine.set_edgecolor(C['border'])\n",
961
+ "ax_bar.tick_params(colors=C['subtext'])\n",
962
+ "\n",
963
+ "# Panel B: reward signal over time.\n",
964
+ "ax_rew = fig.add_subplot(gs[1, 0])\n",
965
+ "style_ax(ax_rew, 'GRPO Training: Reward Signal per Step')\n",
966
+ "if training_rewards and len(training_rewards) >= 4:\n",
967
+ " raw = training_rewards\n",
968
+ " steps_r = list(range(1, len(raw) + 1))\n",
969
+ " ax_rew.plot(steps_r, raw, alpha=0.20, color=C['reward'], linewidth=1)\n",
970
+ " ax_rew.plot(steps_r, smooth(raw, window=6), color=C['reward'], linewidth=2.5, label='Smoothed reward')\n",
971
+ " if len(steps_r) > 8:\n",
972
+ " z = np.polyfit(steps_r, raw, 1)\n",
973
+ " p = np.poly1d(z)\n",
974
+ " ax_rew.plot(steps_r, p(steps_r), '--', color='white', alpha=0.35, linewidth=1.5,\n",
975
+ " label=f'Trend ({z[0]:+.5f}/step)')\n",
976
+ " ax_rew.set_xlabel('Reward Function Call')\n",
977
+ " ax_rew.set_ylabel('Reward Value')\n",
978
+ " ax_rew.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
979
+ " if np.var(raw) < 0.01:\n",
980
+ " ax_rew.text(0.5, 0.5, 'Low reward variance detected.\\nThis graph exposes weak learning signal.',\n",
981
+ " transform=ax_rew.transAxes, ha='center', va='center', color=C['random'], fontsize=10,\n",
982
+ " bbox=dict(boxstyle='round', facecolor=C['panel'], alpha=0.8))\n",
983
+ "else:\n",
984
+ " ax_rew.text(0.5, 0.5, 'No training rewards captured.\\nRe-run with fixed reward function.',\n",
985
+ " transform=ax_rew.transAxes, ha='center', va='center', color=C['subtext'], fontsize=11)\n",
986
+ "\n",
987
+ "# Panel C: training loss, with reward variance fallback.\n",
988
+ "ax_loss = fig.add_subplot(gs[1, 1])\n",
989
+ "style_ax(ax_loss, 'GRPO Training Loss per Step')\n",
990
+ "if step_losses and len(step_losses) >= 2:\n",
991
+ " ax_loss.plot(step_numbers, step_losses, alpha=0.25, color=C['loss'], linewidth=1)\n",
992
+ " ax_loss.plot(step_numbers, smooth(step_losses, window=4), color=C['loss'], linewidth=2.5, label='Smoothed loss')\n",
993
+ " non_zero = [l for l in step_losses if abs(l) > 1e-7]\n",
994
+ " pct_nz = len(non_zero) / len(step_losses) * 100\n",
995
+ " note_color = C['trained'] if pct_nz > 50 else C['random']\n",
996
+ " ax_loss.text(0.04, 0.96, f'Non-zero steps: {len(non_zero)}/{len(step_losses)} ({pct_nz:.0f}%)',\n",
997
+ " transform=ax_loss.transAxes, va='top', color=note_color, fontsize=9,\n",
998
+ " bbox=dict(boxstyle='round', facecolor=C['panel'], alpha=0.8))\n",
999
+ " ax_loss.set_xlabel('Training Step')\n",
1000
+ " ax_loss.set_ylabel('Loss')\n",
1001
+ " ax_loss.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
1002
+ "else:\n",
1003
+ " proxy_loss = []\n",
1004
+ " for i in range(0, len(training_rewards), 4):\n",
1005
+ " chunk = training_rewards[i:i+4]\n",
1006
+ " if len(chunk) > 1:\n",
1007
+ " proxy_loss.append(float(np.var(chunk)))\n",
1008
+ " if proxy_loss:\n",
1009
+ " ax_loss.plot(range(1, len(proxy_loss) + 1), proxy_loss, color=C['loss'], linewidth=2,\n",
1010
+ " label='Reward variance proxy')\n",
1011
+ " ax_loss.set_xlabel('Training Batch')\n",
1012
+ " ax_loss.set_ylabel('Reward Variance')\n",
1013
+ " ax_loss.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
1014
+ " ax_loss.text(0.5, 0.92, 'Loss not captured - showing reward variance proxy',\n",
1015
+ " transform=ax_loss.transAxes, ha='center', color=C['subtext'], fontsize=8)\n",
1016
+ " else:\n",
1017
+ " ax_loss.text(0.5, 0.5, 'No loss data available.', transform=ax_loss.transAxes,\n",
1018
+ " ha='center', va='center', color=C['subtext'], fontsize=11)\n",
1019
+ "\n",
1020
+ "fig.suptitle(\n",
1021
+ " 'GridMind-RL - Meta OpenEnv Hackathon - Multi-Agent Industrial Energy Management\\n'\n",
1022
+ " f'Model: Qwen2.5-1.5B + QLoRA + GRPO | Overall improvement vs heuristic: {overall_improvement:+.1f}%',\n",
1023
+ " color=C['text'], fontsize=14, fontweight='bold', y=0.97\n",
1024
+ ")\n",
1025
+ "\n",
1026
+ "dashboard_path = 'results/gridmind_training_dashboard.png'\n",
1027
+ "fig.savefig(dashboard_path, dpi=180, facecolor=fig.get_facecolor(), bbox_inches='tight')\n",
1028
+ "plt.close(fig)\n",
1029
+ "\n",
1030
+ "# Separate before/after comparison graph for quick judge inspection.\n",
1031
+ "fig2, ax2 = plt.subplots(figsize=(11, 6))\n",
1032
+ "fig2.patch.set_facecolor(C['bg'])\n",
1033
+ "ax2.set_facecolor(C['panel'])\n",
1034
+ "ax2.bar(x - w/2, heuristic_vals, w, label='Heuristic Baseline', color=C['heuristic'], alpha=0.9)\n",
1035
+ "ax2.bar(x + w/2, trained_vals, w, label='Trained LLM (GRPO)', color=C['trained'], alpha=0.9)\n",
1036
+ "ax2.set_xticks(x)\n",
1037
+ "ax2.set_xticklabels(task_labels, color=C['text'])\n",
1038
+ "ax2.set_ylim(0, 1.05)\n",
1039
+ "ax2.set_ylabel('Grade Score', color=C['subtext'])\n",
1040
+ "ax2.set_title('Before/After Policy Score Comparison', color=C['text'], fontweight='bold')\n",
1041
+ "ax2.legend(facecolor=C['grid'], labelcolor=C['text'], edgecolor=C['border'])\n",
1042
+ "ax2.grid(axis='y', alpha=0.15, color=C['grid'])\n",
1043
+ "ax2.tick_params(colors=C['subtext'])\n",
1044
+ "for spine in ax2.spines.values():\n",
1045
+ " spine.set_edgecolor(C['border'])\n",
1046
+ "comparison_path = 'results/gridmind_before_after_comparison.png'\n",
1047
+ "fig2.savefig(comparison_path, dpi=180, facecolor=fig2.get_facecolor(), bbox_inches='tight')\n",
1048
+ "plt.close(fig2)\n",
1049
+ "\n",
1050
+ "print(f\"Saved dashboard graph to {dashboard_path}\")\n",
1051
+ "print(f\"Saved before/after graph to {comparison_path}\")\n",
1052
+ "\n",
1053
  "results = {\n",
1054
  " \"heuristic_baseline\": {\n",
1055
  " \"scores_by_task\": {str(k): v for k, v in baseline_scores.items()},\n",
 
1064
  " \"training_steps\": grpo_config.max_steps,\n",
1065
  " \"themes_covered\": [\"multi_agent\", \"instruction_following\", \"world_modeling\", \"curriculum\"],\n",
1066
  " \"training_rewards_log\": training_rewards[-20:] if training_rewards else [],\n",
1067
+ " \"training_step_logs\": training_steps_log[-20:] if training_steps_log else [],\n",
1068
+ " \"step_losses\": step_losses if 'step_losses' in globals() else [],\n",
1069
+ " \"graphs\": {\n",
1070
+ " \"dashboard\": dashboard_path,\n",
1071
+ " \"before_after\": comparison_path,\n",
1072
+ " },\n",
1073
  "}\n",
1074
  "\n",
1075
  "print(\"Saving results...\")\n",
scripts/validate-submission.sh CHANGED
@@ -6,7 +6,7 @@
6
  #
7
  # Prerequisites:
8
  # - Docker: https://docs.docker.com/get-docker/
9
- # - openenv-core: pip install openenv-core
10
  # - curl (usually pre-installed)
11
  #
12
  # Run:
@@ -155,7 +155,7 @@ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
155
 
156
  if ! command -v openenv &>/dev/null; then
157
  fail "openenv command not found"
158
- hint "Install it: pip install openenv-core"
159
  stop_at "Step 3"
160
  fi
161
 
 
6
  #
7
  # Prerequisites:
8
  # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core==0.2.3
10
  # - curl (usually pre-installed)
11
  #
12
  # Run:
 
155
 
156
  if ! command -v openenv &>/dev/null; then
157
  fail "openenv command not found"
158
+ hint "Install it: pip install openenv-core==0.2.3"
159
  stop_at "Step 3"
160
  fi
161
 
uv.lock CHANGED
@@ -1,5 +1,5 @@
1
  version = 4
2
- requires-python = ">=3.9"
3
 
4
  [[package]]
5
  name = "openai"
 
1
  version = 4
2
+ requires-python = ">=3.10"
3
 
4
  [[package]]
5
  name = "openai"