Eshit committed on
Commit
3e8e5dd
·
1 Parent(s): 5d6ff6f

Add GRPO training results: 150 steps, promoted easy→medium→hard

Browse files
training/grpo_eval_results.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trained": {
3
+ "easy": {
4
+ "mean": 5.126686666666666,
5
+ "std": 3.900705914893981,
6
+ "pop_saved_pct": 86.66666666666667,
7
+ "json_success_rate": 98.52941176470588
8
+ },
9
+ "medium": {
10
+ "mean": 5.7353000000000005,
11
+ "std": 3.070706015886249,
12
+ "pop_saved_pct": 97.14285714285715,
13
+ "json_success_rate": 99.77900552486187
14
+ },
15
+ "hard": {
16
+ "mean": 2.144273333333333,
17
+ "std": 2.867108504973066,
18
+ "pop_saved_pct": 92.48062015503875,
19
+ "json_success_rate": 99.17695473251028
20
+ }
21
+ },
22
+ "baselines": {
23
+ "random": {
24
+ "easy": {
25
+ "scores": [7.7749, 7.7751, 7.775, 7.775, 0.04],
26
+ "mean": 6.228,
27
+ "std": 3.094,
28
+ "mean_containment_pct": 1.0,
29
+ "mean_pop_saved_pct": 0.92,
30
+ "mean_steps": 25.8,
31
+ "crew_casualty_rate": 0.0,
32
+ "mean_time_s": 0.067
33
+ },
34
+ "medium": {
35
+ "scores": [-1.7044, -1.0029, 1.0762, 0.7527, 7.4403],
36
+ "mean": 1.3124,
37
+ "std": 3.2367,
38
+ "mean_containment_pct": 1.0,
39
+ "mean_pop_saved_pct": 0.7365,
40
+ "mean_steps": 72.0,
41
+ "crew_casualty_rate": 0.0,
42
+ "mean_time_s": 0.676
43
+ },
44
+ "hard": {
45
+ "scores": [7.8668, 1.3602, -0.7466, 1.0443, 1.2813],
46
+ "mean": 2.1612,
47
+ "std": 2.9554,
48
+ "mean_containment_pct": 1.0,
49
+ "mean_pop_saved_pct": 0.9023,
50
+ "mean_steps": 84.6,
51
+ "crew_casualty_rate": 0.0,
52
+ "mean_time_s": 1.301
53
+ }
54
+ },
55
+ "heuristic": {
56
+ "easy": {
57
+ "scores": [7.6749, 7.575, 7.475, 7.475, 7.4749],
58
+ "mean": 7.535,
59
+ "std": 0.08,
60
+ "mean_containment_pct": 1.0,
61
+ "mean_pop_saved_pct": 1.0,
62
+ "mean_steps": 26.6,
63
+ "crew_casualty_rate": 0.0,
64
+ "mean_time_s": 0.118
65
+ },
66
+ "medium": {
67
+ "scores": [7.6001, 7.7001, 7.8, 7.7, 0.7683],
68
+ "mean": 6.3137,
69
+ "std": 2.7734,
70
+ "mean_containment_pct": 1.0,
71
+ "mean_pop_saved_pct": 0.9746,
72
+ "mean_steps": 46.2,
73
+ "crew_casualty_rate": 0.0,
74
+ "mean_time_s": 0.48
75
+ },
76
+ "hard": {
77
+ "scores": [7.8668, 7.867, 0.9443, 7.6667, -0.6696],
78
+ "mean": 4.735,
79
+ "std": 3.7892,
80
+ "mean_containment_pct": 1.0,
81
+ "mean_pop_saved_pct": 0.9279,
82
+ "mean_steps": 83.2,
83
+ "crew_casualty_rate": 0.0,
84
+ "mean_time_s": 1.487
85
+ }
86
+ }
87
+ },
88
+ "eval_seeds": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56],
89
+ "model": "Eshit/wildfire-grpo-7b"
90
+ }
training/grpo_v2_colab.ipynb CHANGED
@@ -15,7 +15,14 @@
15
  "4. GRPO loop too slow - consequence of fix 3\n",
16
  "5. parse_action(text, None) crash - standalone check_json_format() for format reward\n",
17
  "\n",
18
- "**Hardware:** A100 40GB on Colab"
 
 
 
 
 
 
 
19
  ]
20
  },
21
  {
@@ -27,26 +34,27 @@
27
  },
28
  {
29
  "cell_type": "code",
30
- "execution_count": null,
31
  "metadata": {},
32
- "outputs": [],
33
  "source": [
34
  "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
35
- "!pip install trl==0.15.2 datasets==3.4.1 wandb"
36
- ]
 
 
 
37
  },
38
  {
39
  "cell_type": "code",
40
- "execution_count": null,
41
  "metadata": {},
42
- "outputs": [],
43
  "source": [
44
  "import torch\n",
45
  "assert torch.cuda.is_available(), \"GPU not available - switch to a GPU runtime\"\n",
46
  "gpu_name = torch.cuda.get_device_name(0)\n",
47
- "gpu_mem = torch.cuda.get_device_properties(0).total_mem / 1e9\n",
48
  "print(f\"GPU: {gpu_name} | VRAM: {gpu_mem:.1f} GB\")"
49
- ]
 
 
50
  },
51
  {
52
  "cell_type": "markdown",
@@ -57,9 +65,7 @@
57
  },
58
  {
59
  "cell_type": "code",
60
- "execution_count": null,
61
  "metadata": {},
62
- "outputs": [],
63
  "source": [
64
  "from unsloth import FastLanguageModel\n",
65
  "\n",
@@ -80,7 +86,9 @@
80
  "\n",
81
  "print(f\"Loaded SFT checkpoint: {SFT_MODEL}\")\n",
82
  "model.print_trainable_parameters()"
83
- ]
 
 
84
  },
85
  {
86
  "cell_type": "markdown",
@@ -91,9 +99,7 @@
91
  },
92
  {
93
  "cell_type": "code",
94
- "execution_count": null,
95
  "metadata": {},
96
- "outputs": [],
97
  "source": [
98
  "import os, random, json, sys\n",
99
  "import torch\n",
@@ -132,7 +138,9 @@
132
  "print(f\"Start tier: {controller.get_tier()}\")\n",
133
  "print(f\"Seed pool: {len(SEED_POOL)} seeds\")\n",
134
  "print(\"Env imports OK\")"
135
- ]
 
 
136
  },
137
  {
138
  "cell_type": "markdown",
@@ -145,9 +153,7 @@
145
  },
146
  {
147
  "cell_type": "code",
148
- "execution_count": null,
149
  "metadata": {},
150
- "outputs": [],
151
  "source": [
152
  "import json as _json\n",
153
  "import re as _re\n",
@@ -194,7 +200,9 @@
194
  "assert check_json_format('{\"action_type\": \"bogus\"}') == 'regex_fallback'\n",
195
  "assert check_json_format('no json here') == 'safe_idle'\n",
196
  "print('check_json_format OK')"
197
- ]
 
 
198
  },
199
  {
200
  "cell_type": "markdown",
@@ -209,9 +217,7 @@
209
  },
210
  {
211
  "cell_type": "code",
212
- "execution_count": null,
213
  "metadata": {},
214
- "outputs": [],
215
  "source": [
216
  "def reward_fn_outcome(completions, prompts, tier=None, seed=None, **kwargs):\n",
217
  " \"\"\"\n",
@@ -290,7 +296,9 @@
290
  "\n",
291
  "\n",
292
  "print('Reward functions defined.')"
293
- ]
 
 
294
  },
295
  {
296
  "cell_type": "markdown",
@@ -304,9 +312,7 @@
304
  },
305
  {
306
  "cell_type": "code",
307
- "execution_count": null,
308
  "metadata": {},
309
- "outputs": [],
310
  "source": [
311
  "def build_prompt_dataset(n=200):\n",
312
  " \"\"\"\n",
@@ -338,7 +344,9 @@
338
  "print(f\"Tier: {_test_ds[0]['tier']}, Seed: {_test_ds[0]['seed']}\")\n",
339
  "print(f\"Prompt roles: {[m['role'] for m in _test_ds[0]['prompt']]}\")\n",
340
  "del _test_ds"
341
- ]
 
 
342
  },
343
  {
344
  "cell_type": "markdown",
@@ -351,9 +359,7 @@
351
  },
352
  {
353
  "cell_type": "code",
354
- "execution_count": null,
355
  "metadata": {},
356
- "outputs": [],
357
  "source": [
358
  "from transformers import TrainerCallback\n",
359
  "\n",
@@ -373,7 +379,9 @@
373
  "\n",
374
  "\n",
375
  "print('CurriculumDatasetCallback defined.')"
376
- ]
 
 
377
  },
378
  {
379
  "cell_type": "markdown",
@@ -384,9 +392,7 @@
384
  },
385
  {
386
  "cell_type": "code",
387
- "execution_count": null,
388
  "metadata": {},
389
- "outputs": [],
390
  "source": [
391
  "from trl import GRPOTrainer, GRPOConfig\n",
392
  "\n",
@@ -418,7 +424,9 @@
418
  "trainer.add_callback(CurriculumDatasetCallback(trainer))\n",
419
  "\n",
420
  "print('GRPOTrainer ready.')"
421
- ]
 
 
422
  },
423
  {
424
  "cell_type": "markdown",
@@ -429,9 +437,7 @@
429
  },
430
  {
431
  "cell_type": "code",
432
- "execution_count": null,
433
  "metadata": {},
434
- "outputs": [],
435
  "source": [
436
  "import wandb\n",
437
  "wandb.init(project='wildfire-grpo', name='qwen7b-v2')\n",
@@ -448,7 +454,9 @@
448
  "with open('./training_stats.json', 'w') as f:\n",
449
  " json.dump(stats, f, indent=2)\n",
450
  "print('Stats saved -> training_stats.json')"
451
- ]
 
 
452
  },
453
  {
454
  "cell_type": "markdown",
@@ -461,9 +469,7 @@
461
  },
462
  {
463
  "cell_type": "code",
464
- "execution_count": null,
465
  "metadata": {},
466
- "outputs": [],
467
  "source": [
468
  "class LLMAgent:\n",
469
  " \"\"\"Wraps the trained model for evaluation. Must be re-instantiated per episode.\"\"\"\n",
@@ -510,13 +516,13 @@
510
  "\n",
511
  "\n",
512
  "print('LLMAgent class defined.')"
513
- ]
 
 
514
  },
515
  {
516
  "cell_type": "code",
517
- "execution_count": null,
518
  "metadata": {},
519
- "outputs": [],
520
  "source": [
521
  "import numpy as np\n",
522
  "\n",
@@ -613,7 +619,9 @@
613
  "print('\\nPASS: At least one tier within 1.0 of heuristic baseline.')\n",
614
  "\n",
615
  "FastLanguageModel.for_training(model)"
616
- ]
 
 
617
  },
618
  {
619
  "cell_type": "markdown",
@@ -624,38 +632,39 @@
624
  },
625
  {
626
  "cell_type": "code",
627
- "execution_count": null,
628
  "metadata": {},
629
- "outputs": [],
630
  "source": [
631
  "model.save_pretrained('./grpo_final')\n",
632
  "tokenizer.save_pretrained('./grpo_final')\n",
633
  "print('Saved to ./grpo_final')"
634
- ]
 
 
635
  },
636
  {
637
  "cell_type": "code",
638
- "execution_count": null,
639
  "metadata": {},
640
- "outputs": [],
641
  "source": [
642
  "HF_USERNAME = 'Eshit' # <-- CHANGE THIS\n",
643
  "model.push_to_hub(f'{HF_USERNAME}/wildfire-grpo-7b')\n",
644
  "tokenizer.push_to_hub(f'{HF_USERNAME}/wildfire-grpo-7b')\n",
645
  "print(f'Pushed to hub: {HF_USERNAME}/wildfire-grpo-7b')"
646
- ]
 
 
647
  },
648
  {
649
  "cell_type": "code",
650
- "execution_count": null,
651
  "metadata": {},
652
- "outputs": [],
653
  "source": [
654
  "!zip -r grpo_final.zip ./grpo_final\n",
655
- "from google.colab import files\n",
656
- "files.download('grpo_final.zip')\n",
657
- "print('Download started.')"
658
- ]
 
 
 
659
  }
660
  ],
661
  "metadata": {
@@ -675,4 +684,4 @@
675
  },
676
  "nbformat": 4,
677
  "nbformat_minor": 5
678
- }
 
15
  "4. GRPO loop too slow - consequence of fix 3\n",
16
  "5. parse_action(text, None) crash - standalone check_json_format() for format reward\n",
17
  "\n",
18
+ "**Hardware:** A10G Large 24GB (HuggingFace Space JupyterLab)\n",
19
+ "\n",
20
+ "**Before running:** In a terminal, authenticate:\n",
21
+ "```\n",
22
+ "huggingface-cli login # HF token with write access (to load SFT model + push result)\n",
23
+ "wandb login # wandb API key (Section 9 logs to wandb)\n",
24
+ "```\n",
25
+ "Also ensure the repo is cloned and this notebook is opened from inside the repo root (so `REPO_ROOT = \".\"` resolves correctly)."
26
  ]
27
  },
28
  {
 
34
  },
35
  {
36
  "cell_type": "code",
 
37
  "metadata": {},
 
38
  "source": [
39
  "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"\n",
40
+ "!pip install trl==0.15.2 datasets==3.4.1 wandb\n",
41
+ "!pip install torchvision --extra-index-url https://download.pytorch.org/whl/cu121"
42
+ ],
43
+ "execution_count": null,
44
+ "outputs": []
45
  },
46
  {
47
  "cell_type": "code",
 
48
  "metadata": {},
 
49
  "source": [
50
  "import torch\n",
51
  "assert torch.cuda.is_available(), \"GPU not available - switch to a GPU runtime\"\n",
52
  "gpu_name = torch.cuda.get_device_name(0)\n",
53
+ "gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9\n",
54
  "print(f\"GPU: {gpu_name} | VRAM: {gpu_mem:.1f} GB\")"
55
+ ],
56
+ "execution_count": null,
57
+ "outputs": []
58
  },
59
  {
60
  "cell_type": "markdown",
 
65
  },
66
  {
67
  "cell_type": "code",
 
68
  "metadata": {},
 
69
  "source": [
70
  "from unsloth import FastLanguageModel\n",
71
  "\n",
 
86
  "\n",
87
  "print(f\"Loaded SFT checkpoint: {SFT_MODEL}\")\n",
88
  "model.print_trainable_parameters()"
89
+ ],
90
+ "execution_count": null,
91
+ "outputs": []
92
  },
93
  {
94
  "cell_type": "markdown",
 
99
  },
100
  {
101
  "cell_type": "code",
 
102
  "metadata": {},
 
103
  "source": [
104
  "import os, random, json, sys\n",
105
  "import torch\n",
 
138
  "print(f\"Start tier: {controller.get_tier()}\")\n",
139
  "print(f\"Seed pool: {len(SEED_POOL)} seeds\")\n",
140
  "print(\"Env imports OK\")"
141
+ ],
142
+ "execution_count": null,
143
+ "outputs": []
144
  },
145
  {
146
  "cell_type": "markdown",
 
153
  },
154
  {
155
  "cell_type": "code",
 
156
  "metadata": {},
 
157
  "source": [
158
  "import json as _json\n",
159
  "import re as _re\n",
 
200
  "assert check_json_format('{\"action_type\": \"bogus\"}') == 'regex_fallback'\n",
201
  "assert check_json_format('no json here') == 'safe_idle'\n",
202
  "print('check_json_format OK')"
203
+ ],
204
+ "execution_count": null,
205
+ "outputs": []
206
  },
207
  {
208
  "cell_type": "markdown",
 
217
  },
218
  {
219
  "cell_type": "code",
 
220
  "metadata": {},
 
221
  "source": [
222
  "def reward_fn_outcome(completions, prompts, tier=None, seed=None, **kwargs):\n",
223
  " \"\"\"\n",
 
296
  "\n",
297
  "\n",
298
  "print('Reward functions defined.')"
299
+ ],
300
+ "execution_count": null,
301
+ "outputs": []
302
  },
303
  {
304
  "cell_type": "markdown",
 
312
  },
313
  {
314
  "cell_type": "code",
 
315
  "metadata": {},
 
316
  "source": [
317
  "def build_prompt_dataset(n=200):\n",
318
  " \"\"\"\n",
 
344
  "print(f\"Tier: {_test_ds[0]['tier']}, Seed: {_test_ds[0]['seed']}\")\n",
345
  "print(f\"Prompt roles: {[m['role'] for m in _test_ds[0]['prompt']]}\")\n",
346
  "del _test_ds"
347
+ ],
348
+ "execution_count": null,
349
+ "outputs": []
350
  },
351
  {
352
  "cell_type": "markdown",
 
359
  },
360
  {
361
  "cell_type": "code",
 
362
  "metadata": {},
 
363
  "source": [
364
  "from transformers import TrainerCallback\n",
365
  "\n",
 
379
  "\n",
380
  "\n",
381
  "print('CurriculumDatasetCallback defined.')"
382
+ ],
383
+ "execution_count": null,
384
+ "outputs": []
385
  },
386
  {
387
  "cell_type": "markdown",
 
392
  },
393
  {
394
  "cell_type": "code",
 
395
  "metadata": {},
 
396
  "source": [
397
  "from trl import GRPOTrainer, GRPOConfig\n",
398
  "\n",
 
424
  "trainer.add_callback(CurriculumDatasetCallback(trainer))\n",
425
  "\n",
426
  "print('GRPOTrainer ready.')"
427
+ ],
428
+ "execution_count": null,
429
+ "outputs": []
430
  },
431
  {
432
  "cell_type": "markdown",
 
437
  },
438
  {
439
  "cell_type": "code",
 
440
  "metadata": {},
 
441
  "source": [
442
  "import wandb\n",
443
  "wandb.init(project='wildfire-grpo', name='qwen7b-v2')\n",
 
454
  "with open('./training_stats.json', 'w') as f:\n",
455
  " json.dump(stats, f, indent=2)\n",
456
  "print('Stats saved -> training_stats.json')"
457
+ ],
458
+ "execution_count": null,
459
+ "outputs": []
460
  },
461
  {
462
  "cell_type": "markdown",
 
469
  },
470
  {
471
  "cell_type": "code",
 
472
  "metadata": {},
 
473
  "source": [
474
  "class LLMAgent:\n",
475
  " \"\"\"Wraps the trained model for evaluation. Must be re-instantiated per episode.\"\"\"\n",
 
516
  "\n",
517
  "\n",
518
  "print('LLMAgent class defined.')"
519
+ ],
520
+ "execution_count": null,
521
+ "outputs": []
522
  },
523
  {
524
  "cell_type": "code",
 
525
  "metadata": {},
 
526
  "source": [
527
  "import numpy as np\n",
528
  "\n",
 
619
  "print('\\nPASS: At least one tier within 1.0 of heuristic baseline.')\n",
620
  "\n",
621
  "FastLanguageModel.for_training(model)"
622
+ ],
623
+ "execution_count": null,
624
+ "outputs": []
625
  },
626
  {
627
  "cell_type": "markdown",
 
632
  },
633
  {
634
  "cell_type": "code",
 
635
  "metadata": {},
 
636
  "source": [
637
  "model.save_pretrained('./grpo_final')\n",
638
  "tokenizer.save_pretrained('./grpo_final')\n",
639
  "print('Saved to ./grpo_final')"
640
+ ],
641
+ "execution_count": null,
642
+ "outputs": []
643
  },
644
  {
645
  "cell_type": "code",
 
646
  "metadata": {},
 
647
  "source": [
648
  "HF_USERNAME = 'Eshit' # <-- CHANGE THIS\n",
649
  "model.push_to_hub(f'{HF_USERNAME}/wildfire-grpo-7b')\n",
650
  "tokenizer.push_to_hub(f'{HF_USERNAME}/wildfire-grpo-7b')\n",
651
  "print(f'Pushed to hub: {HF_USERNAME}/wildfire-grpo-7b')"
652
+ ],
653
+ "execution_count": null,
654
+ "outputs": []
655
  },
656
  {
657
  "cell_type": "code",
 
658
  "metadata": {},
 
659
  "source": [
660
  "!zip -r grpo_final.zip ./grpo_final\n",
661
+ "print('Zipped to grpo_final.zip')\n",
662
+ "# On HF JupyterLab: right-click grpo_final.zip in the file browser and choose Download.\n",
663
+ "# On Google Colab only (not needed here):\n",
664
+ "# from google.colab import files; files.download('grpo_final.zip')"
665
+ ],
666
+ "execution_count": null,
667
+ "outputs": []
668
  }
669
  ],
670
  "metadata": {
 
684
  },
685
  "nbformat": 4,
686
  "nbformat_minor": 5
687
+ }
training/training_stats.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {"step": 0, "tier": "easy", "mean_reward": 4.22450625},
3
+ {"step": 1, "tier": "easy", "mean_reward": 7.235850000000001},
4
+ {"step": 2, "tier": "easy", "mean_reward": 5.956550000000004},
5
+ {"step": 3, "tier": "easy", "mean_reward": 3.8100750000000003},
6
+ {"step": 4, "tier": "easy", "mean_reward": 5.760793749999998},
7
+ {"step": 5, "tier": "easy", "mean_reward": 7.463293749999999},
8
+ {"step": 6, "tier": "easy", "mean_reward": 7.546843750000001},
9
+ {"step": 7, "tier": "easy", "mean_reward": 5.279537499999998},
10
+ {"step": 8, "tier": "easy", "mean_reward": 5.774396875000001},
11
+ {"step": 9, "tier": "easy", "mean_reward": 5.672221875000001},
12
+ {"step": 10, "tier": "easy", "mean_reward": 7.486034375000001},
13
+ {"step": 11, "tier": "easy", "mean_reward": 3.8812187500000004},
14
+ {"step": 12, "tier": "easy", "mean_reward": 6.099375},
15
+ {"step": 13, "tier": "easy", "mean_reward": 4.054215625},
16
+ {"step": 14, "tier": "easy", "mean_reward": 2.3378656249999996},
17
+ {"step": 15, "tier": "easy", "mean_reward": 7.213131249999999},
18
+ {"step": 16, "tier": "easy", "mean_reward": 7.514025000000003},
19
+ {"step": 17, "tier": "easy", "mean_reward": 5.553949999999997},
20
+ {"step": 18, "tier": "easy", "mean_reward": 5.864062499999999},
21
+ {"step": 19, "tier": "easy", "mean_reward": 7.496884374999997},
22
+ {"step": 20, "tier": "easy", "mean_reward": 5.562199999999999},
23
+ {"step": 21, "tier": "easy", "mean_reward": 5.7229656250000005},
24
+ {"step": 22, "tier": "easy", "mean_reward": 7.3982468750000026},
25
+ {"step": 23, "tier": "easy", "mean_reward": 4.385203124999996},
26
+ {"step": 24, "tier": "easy", "mean_reward": 7.101512500000001},
27
+ {"step": 25, "tier": "easy", "mean_reward": 7.246253125},
28
+ {"step": 26, "tier": "easy", "mean_reward": 5.618318749999997},
29
+ {"step": 27, "tier": "easy", "mean_reward": 3.7970281249999998},
30
+ {"step": 28, "tier": "easy", "mean_reward": 5.964250000000002},
31
+ {"step": 29, "tier": "easy", "mean_reward": 7.492940624999996},
32
+ {"step": 30, "tier": "easy", "mean_reward": 6.027812499999999},
33
+ {"step": 31, "tier": "easy", "mean_reward": 5.941168749999999},
34
+ {"step": 32, "tier": "easy", "mean_reward": 6.864665624999995},
35
+ {"step": 33, "tier": "easy", "mean_reward": 5.611512500000002},
36
+ {"step": 34, "tier": "easy", "mean_reward": 5.644321875},
37
+ {"step": 35, "tier": "easy", "mean_reward": 6.196540625},
38
+ {"step": 36, "tier": "easy", "mean_reward": 7.3195125},
39
+ {"step": 37, "tier": "easy", "mean_reward": 6.589524999999998},
40
+ {"step": 38, "tier": "easy", "mean_reward": 6.493584374999999},
41
+ {"step": 39, "tier": "easy", "mean_reward": 4.5787531249999995},
42
+ {"step": 40, "tier": "easy", "mean_reward": 7.1647374999999975},
43
+ {"step": 41, "tier": "easy", "mean_reward": 6.307021875},
44
+ {"step": 42, "tier": "easy", "mean_reward": 5.6441625},
45
+ {"step": 43, "tier": "easy", "mean_reward": 6.051987499999996},
46
+ {"step": 44, "tier": "easy", "mean_reward": 6.970406250000004},
47
+ {"step": 45, "tier": "easy", "mean_reward": 7.375721874999999},
48
+ {"step": 46, "tier": "easy", "mean_reward": 6.082374999999997},
49
+ {"step": 47, "tier": "easy", "mean_reward": 6.735612500000002},
50
+ {"step": 48, "tier": "easy", "mean_reward": 6.820753125000001},
51
+ {"step": 49, "tier": "easy", "mean_reward": 5.743384375000001},
52
+ {"step": 50, "tier": "easy", "mean_reward": 6.935793750000004},
53
+ {"step": 51, "tier": "easy", "mean_reward": 6.389853125},
54
+ {"step": 52, "tier": "easy", "mean_reward": 6.366893750000002},
55
+ {"step": 53, "tier": "medium", "mean_reward": 6.685290624999997},
56
+ {"step": 54, "tier": "medium", "mean_reward": 5.949612500000001},
57
+ {"step": 55, "tier": "medium", "mean_reward": 2.770065624999999},
58
+ {"step": 56, "tier": "medium", "mean_reward": 7.203259374999998},
59
+ {"step": 57, "tier": "medium", "mean_reward": 4.506112500000001},
60
+ {"step": 58, "tier": "medium", "mean_reward": 7.0263187500000015},
61
+ {"step": 59, "tier": "medium", "mean_reward": 5.168934375000002},
62
+ {"step": 60, "tier": "medium", "mean_reward": 7.033081250000002},
63
+ {"step": 61, "tier": "medium", "mean_reward": 6.253359374999997},
64
+ {"step": 62, "tier": "medium", "mean_reward": 6.959756249999999},
65
+ {"step": 63, "tier": "hard", "mean_reward": 6.969309374999998},
66
+ {"step": 64, "tier": "hard", "mean_reward": 5.3616906250000005},
67
+ {"step": 65, "tier": "hard", "mean_reward": 6.252678124999999},
68
+ {"step": 66, "tier": "hard", "mean_reward": 2.5560937500000005},
69
+ {"step": 67, "tier": "hard", "mean_reward": 5.578853125},
70
+ {"step": 68, "tier": "hard", "mean_reward": 7.466365625000002},
71
+ {"step": 69, "tier": "hard", "mean_reward": 7.713275000000002},
72
+ {"step": 70, "tier": "hard", "mean_reward": 7.621018749999998},
73
+ {"step": 71, "tier": "hard", "mean_reward": 6.264199999999996},
74
+ {"step": 72, "tier": "hard", "mean_reward": 4.712021874999998},
75
+ {"step": 73, "tier": "hard", "mean_reward": 3.8931437500000015},
76
+ {"step": 74, "tier": "hard", "mean_reward": 7.114093750000004},
77
+ {"step": 75, "tier": "hard", "mean_reward": 6.6951906249999995},
78
+ {"step": 76, "tier": "hard", "mean_reward": 2.933387499999999},
79
+ {"step": 77, "tier": "hard", "mean_reward": 6.704121874999999},
80
+ {"step": 78, "tier": "hard", "mean_reward": 5.275803125},
81
+ {"step": 79, "tier": "hard", "mean_reward": 5.645184375000001},
82
+ {"step": 80, "tier": "hard", "mean_reward": 7.5555062500000005},
83
+ {"step": 81, "tier": "hard", "mean_reward": 5.178903125000001},
84
+ {"step": 82, "tier": "hard", "mean_reward": 5.782215625},
85
+ {"step": 83, "tier": "hard", "mean_reward": 7.4922562500000005},
86
+ {"step": 84, "tier": "hard", "mean_reward": 5.397803125000002},
87
+ {"step": 85, "tier": "hard", "mean_reward": 5.785240625},
88
+ {"step": 86, "tier": "hard", "mean_reward": 6.006559375000001},
89
+ {"step": 87, "tier": "hard", "mean_reward": 5.064365625000001},
90
+ {"step": 88, "tier": "hard", "mean_reward": 6.120146874999998},
91
+ {"step": 89, "tier": "hard", "mean_reward": 7.3549874999999965},
92
+ {"step": 90, "tier": "hard", "mean_reward": 5.017793749999999},
93
+ {"step": 91, "tier": "hard", "mean_reward": 7.611765625000001},
94
+ {"step": 92, "tier": "hard", "mean_reward": 7.58835},
95
+ {"step": 93, "tier": "hard", "mean_reward": 4.282640625000003},
96
+ {"step": 94, "tier": "hard", "mean_reward": 7.624143749999999},
97
+ {"step": 95, "tier": "hard", "mean_reward": 7.467125},
98
+ {"step": 96, "tier": "hard", "mean_reward": 7.492253125000001},
99
+ {"step": 97, "tier": "hard", "mean_reward": 3.8446718750000026},
100
+ {"step": 98, "tier": "hard", "mean_reward": 6.381118750000002},
101
+ {"step": 99, "tier": "hard", "mean_reward": 5.9315812500000025},
102
+ {"step": 100, "tier": "hard", "mean_reward": 5.303253125000001},
103
+ {"step": 101, "tier": "hard", "mean_reward": 5.379359374999997},
104
+ {"step": 102, "tier": "hard", "mean_reward": 6.105550000000001},
105
+ {"step": 103, "tier": "hard", "mean_reward": 4.132209375000002},
106
+ {"step": 104, "tier": "hard", "mean_reward": 5.99065},
107
+ {"step": 105, "tier": "hard", "mean_reward": 6.396168749999998},
108
+ {"step": 106, "tier": "hard", "mean_reward": 6.190524999999998},
109
+ {"step": 107, "tier": "hard", "mean_reward": 7.378921874999999},
110
+ {"step": 108, "tier": "hard", "mean_reward": 5.527831249999997},
111
+ {"step": 109, "tier": "hard", "mean_reward": 5.664981250000001},
112
+ {"step": 110, "tier": "hard", "mean_reward": 6.596590625000001},
113
+ {"step": 111, "tier": "hard", "mean_reward": 5.718784375000003},
114
+ {"step": 112, "tier": "hard", "mean_reward": 5.454768749999999},
115
+ {"step": 113, "tier": "hard", "mean_reward": 5.661271875},
116
+ {"step": 114, "tier": "hard", "mean_reward": 4.344675},
117
+ {"step": 115, "tier": "hard", "mean_reward": 4.810181250000001},
118
+ {"step": 116, "tier": "hard", "mean_reward": 5.746131249999998},
119
+ {"step": 117, "tier": "hard", "mean_reward": 5.718934375},
120
+ {"step": 118, "tier": "hard", "mean_reward": 7.343309375},
121
+ {"step": 119, "tier": "hard", "mean_reward": 5.728325},
122
+ {"step": 120, "tier": "hard", "mean_reward": 4.915784375},
123
+ {"step": 121, "tier": "hard", "mean_reward": 5.746521875},
124
+ {"step": 122, "tier": "hard", "mean_reward": 6.815368750000003},
125
+ {"step": 123, "tier": "hard", "mean_reward": 6.415571874999999},
126
+ {"step": 124, "tier": "hard", "mean_reward": 6.616740625000003},
127
+ {"step": 125, "tier": "hard", "mean_reward": 7.136087499999999},
128
+ {"step": 126, "tier": "hard", "mean_reward": 6.3915187499999995},
129
+ {"step": 127, "tier": "hard", "mean_reward": 6.998762500000002},
130
+ {"step": 128, "tier": "hard", "mean_reward": 6.718474999999998},
131
+ {"step": 129, "tier": "hard", "mean_reward": 6.675468750000001},
132
+ {"step": 130, "tier": "hard", "mean_reward": 6.832443750000001},
133
+ {"step": 131, "tier": "hard", "mean_reward": 7.4953281249999995},
134
+ {"step": 132, "tier": "hard", "mean_reward": 6.984856249999997},
135
+ {"step": 133, "tier": "hard", "mean_reward": 4.969693749999999},
136
+ {"step": 134, "tier": "hard", "mean_reward": 6.62208125},
137
+ {"step": 135, "tier": "hard", "mean_reward": 5.769275000000002},
138
+ {"step": 136, "tier": "hard", "mean_reward": 5.799609374999999},
139
+ {"step": 137, "tier": "hard", "mean_reward": 5.565890624999998},
140
+ {"step": 138, "tier": "hard", "mean_reward": 3.290540625},
141
+ {"step": 139, "tier": "hard", "mean_reward": 7.368412500000004},
142
+ {"step": 140, "tier": "hard", "mean_reward": 7.106300000000002},
143
+ {"step": 141, "tier": "hard", "mean_reward": 5.6757718750000015},
144
+ {"step": 142, "tier": "hard", "mean_reward": 5.496281250000001},
145
+ {"step": 143, "tier": "hard", "mean_reward": 5.8853125},
146
+ {"step": 144, "tier": "hard", "mean_reward": 7.661725},
147
+ {"step": 145, "tier": "hard", "mean_reward": 5.6637625},
148
+ {"step": 146, "tier": "hard", "mean_reward": 6.095750000000003},
149
+ {"step": 147, "tier": "hard", "mean_reward": 7.636731250000005},
150
+ {"step": 148, "tier": "hard", "mean_reward": 6.188656249999999},
151
+ {"step": 149, "tier": "hard", "mean_reward": 6.59115}
152
+ ]