LLM / ppo-lora /trainer_state.json
wangrongsheng
commit from root
42de9a6
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": null,
"global_step": 0,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 9.999359182892298e-06,
"loss": 0.0058,
"reward": -0.5258,
"step": 9
},
{
"epoch": 0.0,
"learning_rate": 9.997144223525905e-06,
"loss": 0.0053,
"reward": -0.5235,
"step": 19
},
{
"epoch": 0.0,
"learning_rate": 9.993347911338379e-06,
"loss": 0.0063,
"reward": -0.5422,
"step": 29
},
{
"epoch": 0.01,
"learning_rate": 9.987971447677674e-06,
"loss": 0.0052,
"reward": -0.5766,
"step": 39
},
{
"epoch": 0.01,
"learning_rate": 9.981016533932741e-06,
"loss": 0.0042,
"reward": -0.3694,
"step": 49
},
{
"epoch": 0.01,
"learning_rate": 9.972485370995129e-06,
"loss": 0.004,
"reward": -0.346,
"step": 59
},
{
"epoch": 0.01,
"learning_rate": 9.962380658562499e-06,
"loss": 0.0047,
"reward": -0.4197,
"step": 69
},
{
"epoch": 0.01,
"learning_rate": 9.950705594284304e-06,
"loss": 0.0043,
"reward": -0.1848,
"step": 79
},
{
"epoch": 0.01,
"learning_rate": 9.937463872749902e-06,
"loss": 0.0046,
"reward": -0.4406,
"step": 89
},
{
"epoch": 0.01,
"learning_rate": 9.922659684319374e-06,
"loss": 0.0044,
"reward": -0.4997,
"step": 99
},
{
"epoch": 0.02,
"learning_rate": 9.906297713797501e-06,
"loss": 0.0048,
"reward": -0.4274,
"step": 109
},
{
"epoch": 0.02,
"learning_rate": 9.888383138951242e-06,
"loss": 0.0046,
"reward": -0.3451,
"step": 119
},
{
"epoch": 0.02,
"learning_rate": 9.868921628871223e-06,
"loss": 0.0051,
"reward": -0.4131,
"step": 129
},
{
"epoch": 0.02,
"learning_rate": 9.847919342177744e-06,
"loss": 0.004,
"reward": -0.2998,
"step": 139
},
{
"epoch": 0.02,
"learning_rate": 9.825382925071883e-06,
"loss": 0.0045,
"reward": -0.4068,
"step": 149
},
{
"epoch": 0.02,
"learning_rate": 9.801319509232287e-06,
"loss": 0.0051,
"reward": -0.4744,
"step": 159
},
{
"epoch": 0.02,
"learning_rate": 9.775736709558352e-06,
"loss": 0.0054,
"reward": -0.5511,
"step": 169
},
{
"epoch": 0.03,
"learning_rate": 9.748642621760474e-06,
"loss": 0.0055,
"reward": -0.2605,
"step": 179
},
{
"epoch": 0.03,
"learning_rate": 9.720045819798151e-06,
"loss": 0.0043,
"reward": -0.4817,
"step": 189
},
{
"epoch": 0.03,
"learning_rate": 9.68995535316674e-06,
"loss": 0.0061,
"reward": -0.3717,
"step": 199
},
{
"epoch": 0.03,
"learning_rate": 9.658380744033737e-06,
"loss": 0.0041,
"reward": -0.3632,
"step": 209
},
{
"epoch": 0.03,
"learning_rate": 9.625331984225475e-06,
"loss": 0.0039,
"reward": -0.4044,
"step": 219
},
{
"epoch": 0.03,
"learning_rate": 9.590819532065188e-06,
"loss": 0.0049,
"reward": -0.3763,
"step": 229
},
{
"epoch": 0.03,
"learning_rate": 9.554854309063474e-06,
"loss": 0.0053,
"reward": -0.3172,
"step": 239
},
{
"epoch": 0.04,
"learning_rate": 9.517447696462163e-06,
"loss": 0.0047,
"reward": -0.5815,
"step": 249
},
{
"epoch": 0.04,
"learning_rate": 9.478611531632702e-06,
"loss": 0.0048,
"reward": -0.35,
"step": 259
},
{
"epoch": 0.04,
"learning_rate": 9.438358104330215e-06,
"loss": 0.0054,
"reward": -0.4544,
"step": 269
},
{
"epoch": 0.04,
"learning_rate": 9.396700152804386e-06,
"loss": 0.0039,
"reward": -0.3515,
"step": 279
},
{
"epoch": 0.04,
"learning_rate": 9.353650859768423e-06,
"loss": 0.0052,
"reward": -0.3799,
"step": 289
},
{
"epoch": 0.04,
"learning_rate": 9.309223848227377e-06,
"loss": 0.0048,
"reward": -0.5378,
"step": 299
},
{
"epoch": 0.04,
"learning_rate": 9.26343317716712e-06,
"loss": 0.0057,
"reward": -0.4758,
"step": 309
},
{
"epoch": 0.05,
"learning_rate": 9.216293337105358e-06,
"loss": 0.0044,
"reward": -0.5203,
"step": 319
},
{
"epoch": 0.05,
"learning_rate": 9.167819245506095e-06,
"loss": 0.0054,
"reward": -0.5237,
"step": 329
},
{
"epoch": 0.05,
"learning_rate": 9.118026242058976e-06,
"loss": 0.0058,
"reward": -0.2535,
"step": 339
},
{
"epoch": 0.05,
"learning_rate": 9.066930083825025e-06,
"loss": 0.0067,
"reward": -0.5059,
"step": 349
},
{
"epoch": 0.05,
"learning_rate": 9.014546940250301e-06,
"loss": 0.0059,
"reward": -0.5255,
"step": 359
},
{
"epoch": 0.05,
"learning_rate": 8.960893388049056e-06,
"loss": 0.0047,
"reward": -0.3411,
"step": 369
},
{
"epoch": 0.05,
"learning_rate": 8.905986405958015e-06,
"loss": 0.0048,
"reward": -0.3859,
"step": 379
},
{
"epoch": 0.06,
"learning_rate": 8.849843369363425e-06,
"loss": 0.0058,
"reward": -0.3785,
"step": 389
},
{
"epoch": 0.06,
"learning_rate": 8.792482044802602e-06,
"loss": 0.0047,
"reward": -0.3432,
"step": 399
},
{
"epoch": 0.06,
"learning_rate": 8.733920584341673e-06,
"loss": 0.0057,
"reward": -0.3958,
"step": 409
},
{
"epoch": 0.06,
"learning_rate": 8.674177519831351e-06,
"loss": 0.0049,
"reward": -0.4713,
"step": 419
},
{
"epoch": 0.06,
"learning_rate": 8.613271757042483e-06,
"loss": 0.0048,
"reward": -0.4485,
"step": 429
},
{
"epoch": 0.06,
"learning_rate": 8.551222569683315e-06,
"loss": 0.0054,
"reward": -0.5861,
"step": 439
},
{
"epoch": 0.06,
"learning_rate": 8.488049593300287e-06,
"loss": 0.0052,
"reward": -0.292,
"step": 449
},
{
"epoch": 0.06,
"learning_rate": 8.423772819064349e-06,
"loss": 0.0049,
"reward": -0.4702,
"step": 459
},
{
"epoch": 0.07,
"learning_rate": 8.358412587444719e-06,
"loss": 0.0053,
"reward": -0.5542,
"step": 469
},
{
"epoch": 0.07,
"learning_rate": 8.291989581772126e-06,
"loss": 0.0048,
"reward": -0.4116,
"step": 479
},
{
"epoch": 0.07,
"learning_rate": 8.224524821693534e-06,
"loss": 0.005,
"reward": -0.3746,
"step": 489
},
{
"epoch": 0.07,
"learning_rate": 8.156039656520448e-06,
"loss": 0.0046,
"reward": -0.4481,
"step": 499
},
{
"epoch": 0.07,
"learning_rate": 8.086555758472905e-06,
"loss": 0.0043,
"reward": -0.4127,
"step": 509
},
{
"epoch": 0.07,
"learning_rate": 8.016095115821258e-06,
"loss": 0.0069,
"reward": -0.5597,
"step": 519
},
{
"epoch": 0.07,
"learning_rate": 7.944680025927976e-06,
"loss": 0.0061,
"reward": -0.4318,
"step": 529
},
{
"epoch": 0.08,
"learning_rate": 7.872333088191598e-06,
"loss": 0.0045,
"reward": -0.3063,
"step": 539
},
{
"epoch": 0.08,
"learning_rate": 7.799077196895131e-06,
"loss": 0.0048,
"reward": -0.3184,
"step": 549
},
{
"epoch": 0.08,
"learning_rate": 7.724935533961129e-06,
"loss": 0.0058,
"reward": -0.476,
"step": 559
},
{
"epoch": 0.08,
"learning_rate": 7.649931561615733e-06,
"loss": 0.0051,
"reward": -0.3959,
"step": 569
},
{
"epoch": 0.08,
"learning_rate": 7.574089014964029e-06,
"loss": 0.0051,
"reward": -0.4472,
"step": 579
},
{
"epoch": 0.08,
"learning_rate": 7.497431894479034e-06,
"loss": 0.0058,
"reward": -0.3736,
"step": 589
},
{
"epoch": 0.08,
"learning_rate": 7.419984458406722e-06,
"loss": 0.0043,
"reward": -0.4911,
"step": 599
},
{
"epoch": 0.09,
"learning_rate": 7.341771215089468e-06,
"loss": 0.005,
"reward": -0.3037,
"step": 609
},
{
"epoch": 0.09,
"learning_rate": 7.26281691521035e-06,
"loss": 0.0062,
"reward": -0.4219,
"step": 619
},
{
"epoch": 0.09,
"learning_rate": 7.183146543960761e-06,
"loss": 0.0051,
"reward": -0.3435,
"step": 629
},
{
"epoch": 0.09,
"learning_rate": 7.102785313133821e-06,
"loss": 0.0038,
"reward": -0.3745,
"step": 639
},
{
"epoch": 0.09,
"learning_rate": 7.021758653146058e-06,
"loss": 0.0053,
"reward": -0.3743,
"step": 649
},
{
"epoch": 0.09,
"learning_rate": 6.940092204989934e-06,
"loss": 0.006,
"reward": -0.4696,
"step": 659
},
{
"epoch": 0.09,
"learning_rate": 6.8578118121197144e-06,
"loss": 0.0058,
"reward": -0.4343,
"step": 669
},
{
"epoch": 0.1,
"learning_rate": 6.774943512273267e-06,
"loss": 0.0044,
"reward": -0.2842,
"step": 679
},
{
"epoch": 0.1,
"learning_rate": 6.691513529232391e-06,
"loss": 0.0046,
"reward": -0.345,
"step": 689
},
{
"epoch": 0.1,
"learning_rate": 6.607548264524268e-06,
"loss": 0.0047,
"reward": -0.3973,
"step": 699
},
{
"epoch": 0.1,
"learning_rate": 6.523074289066666e-06,
"loss": 0.0039,
"reward": -0.3412,
"step": 709
},
{
"epoch": 0.1,
"learning_rate": 6.43811833475953e-06,
"loss": 0.0054,
"reward": -0.3593,
"step": 719
},
{
"epoch": 0.1,
"learning_rate": 6.352707286025644e-06,
"loss": 0.0049,
"reward": -0.3673,
"step": 729
},
{
"epoch": 0.1,
"learning_rate": 6.266868171303017e-06,
"loss": 0.0046,
"reward": -0.4224,
"step": 739
},
{
"epoch": 0.11,
"learning_rate": 6.1806281544917e-06,
"loss": 0.0045,
"reward": -0.1521,
"step": 749
},
{
"epoch": 0.11,
"learning_rate": 6.094014526357734e-06,
"loss": 0.0056,
"reward": -0.3997,
"step": 759
},
{
"epoch": 0.11,
"learning_rate": 6.007054695896955e-06,
"loss": 0.0046,
"reward": -0.289,
"step": 769
},
{
"epoch": 0.11,
"learning_rate": 5.9197761816613705e-06,
"loss": 0.0042,
"reward": -0.3431,
"step": 779
},
{
"epoch": 0.11,
"learning_rate": 5.8322066030508924e-06,
"loss": 0.0049,
"reward": -0.3222,
"step": 789
},
{
"epoch": 0.11,
"learning_rate": 5.74437367157313e-06,
"loss": 0.0055,
"reward": -0.4949,
"step": 799
},
{
"epoch": 0.11,
"learning_rate": 5.656305182074053e-06,
"loss": 0.0065,
"reward": -0.4194,
"step": 809
},
{
"epoch": 0.12,
"learning_rate": 5.568029003942271e-06,
"loss": 0.005,
"reward": -0.5284,
"step": 819
},
{
"epoch": 0.12,
"learning_rate": 5.4795730722897435e-06,
"loss": 0.0058,
"reward": -0.5156,
"step": 829
},
{
"epoch": 0.12,
"learning_rate": 5.390965379111655e-06,
"loss": 0.0043,
"reward": -0.3164,
"step": 839
},
{
"epoch": 0.12,
"learning_rate": 5.302233964428329e-06,
"loss": 0.004,
"reward": -0.1936,
"step": 849
},
{
"epoch": 0.12,
"learning_rate": 5.213406907411921e-06,
"loss": 0.0039,
"reward": -0.2792,
"step": 859
},
{
"epoch": 0.12,
"learning_rate": 5.124512317500723e-06,
"loss": 0.005,
"reward": -0.2348,
"step": 869
},
{
"epoch": 0.12,
"learning_rate": 5.035578325503908e-06,
"loss": 0.0042,
"reward": -0.3528,
"step": 879
},
{
"epoch": 0.13,
"learning_rate": 4.946633074699485e-06,
"loss": 0.005,
"reward": -0.3753,
"step": 889
},
{
"epoch": 0.13,
"learning_rate": 4.857704711928325e-06,
"loss": 0.0052,
"reward": -0.4383,
"step": 899
},
{
"epoch": 0.13,
"learning_rate": 4.768821378687066e-06,
"loss": 0.0045,
"reward": -0.3664,
"step": 909
},
{
"epoch": 0.13,
"learning_rate": 4.6800112022226865e-06,
"loss": 0.0043,
"reward": -0.3584,
"step": 919
},
{
"epoch": 0.13,
"learning_rate": 4.591302286631612e-06,
"loss": 0.0048,
"reward": -0.455,
"step": 929
},
{
"epoch": 0.13,
"learning_rate": 4.502722703966126e-06,
"loss": 0.004,
"reward": -0.3906,
"step": 939
},
{
"epoch": 0.13,
"learning_rate": 4.414300485350945e-06,
"loss": 0.0046,
"reward": -0.4897,
"step": 949
},
{
"epoch": 0.14,
"learning_rate": 4.326063612112721e-06,
"loss": 0.0042,
"reward": -0.384,
"step": 959
},
{
"epoch": 0.14,
"learning_rate": 4.2380400069253255e-06,
"loss": 0.0039,
"reward": -0.4159,
"step": 969
},
{
"epoch": 0.14,
"learning_rate": 4.15025752497367e-06,
"loss": 0.0048,
"reward": -0.4036,
"step": 979
},
{
"epoch": 0.14,
"learning_rate": 4.062743945138906e-06,
"loss": 0.0042,
"reward": -0.4189,
"step": 989
},
{
"epoch": 0.14,
"learning_rate": 3.975526961207759e-06,
"loss": 0.005,
"reward": -0.2446,
"step": 999
},
{
"epoch": 0.14,
"learning_rate": 3.88863417310879e-06,
"loss": 0.0047,
"reward": -0.3705,
"step": 1009
},
{
"epoch": 0.14,
"learning_rate": 3.8020930781783704e-06,
"loss": 0.0048,
"reward": -0.3862,
"step": 1019
},
{
"epoch": 0.15,
"learning_rate": 3.7159310624591055e-06,
"loss": 0.0041,
"reward": -0.4077,
"step": 1029
},
{
"epoch": 0.15,
"learning_rate": 3.630175392033488e-06,
"loss": 0.0072,
"reward": -0.2784,
"step": 1039
},
{
"epoch": 0.15,
"learning_rate": 3.544853204395513e-06,
"loss": 0.0058,
"reward": -0.436,
"step": 1049
},
{
"epoch": 0.15,
"learning_rate": 3.45999149986298e-06,
"loss": 0.0045,
"reward": -0.3979,
"step": 1059
},
{
"epoch": 0.15,
"learning_rate": 3.375617133033209e-06,
"loss": 0.0055,
"reward": -0.4709,
"step": 1069
},
{
"epoch": 0.15,
"learning_rate": 3.2917568042848648e-06,
"loss": 0.0036,
"reward": -0.3748,
"step": 1079
},
{
"epoch": 0.15,
"learning_rate": 3.2167439478854534e-06,
"loss": 0.0057,
"reward": -0.586,
"step": 1089
},
{
"epoch": 0.16,
"learning_rate": 3.1339332616713215e-06,
"loss": 0.0047,
"reward": -0.2301,
"step": 1099
},
{
"epoch": 0.16,
"learning_rate": 3.051713094715606e-06,
"loss": 0.0052,
"reward": -0.4129,
"step": 1109
},
{
"epoch": 0.16,
"learning_rate": 2.9701094656976084e-06,
"loss": 0.0044,
"reward": -0.2389,
"step": 1119
},
{
"epoch": 0.16,
"learning_rate": 2.8891481981923897e-06,
"loss": 0.0039,
"reward": -0.3641,
"step": 1129
},
{
"epoch": 0.16,
"learning_rate": 2.8088549124988785e-06,
"loss": 0.0052,
"reward": -0.3099,
"step": 1139
},
{
"epoch": 0.16,
"learning_rate": 2.729255017532277e-06,
"loss": 0.0037,
"reward": -0.3203,
"step": 1149
},
{
"epoch": 0.16,
"learning_rate": 2.6503737027833926e-06,
"loss": 0.0052,
"reward": -0.4073,
"step": 1159
},
{
"epoch": 0.17,
"learning_rate": 2.5722359303473778e-06,
"loss": 0.0052,
"reward": -0.3275,
"step": 1169
},
{
"epoch": 0.17,
"learning_rate": 2.494866427024452e-06,
"loss": 0.004,
"reward": -0.3811,
"step": 1179
},
{
"epoch": 0.17,
"learning_rate": 2.4182896764950726e-06,
"loss": 0.0052,
"reward": -0.3413,
"step": 1189
},
{
"epoch": 0.17,
"learning_rate": 2.3425299115720295e-06,
"loss": 0.0052,
"reward": -0.5202,
"step": 1199
},
{
"epoch": 0.17,
"learning_rate": 2.267611106531961e-06,
"loss": 0.0048,
"reward": -0.3018,
"step": 1209
},
{
"epoch": 0.17,
"learning_rate": 2.193556969528642e-06,
"loss": 0.0056,
"reward": -0.4154,
"step": 1219
},
{
"epoch": 0.17,
"learning_rate": 2.1203909350905215e-06,
"loss": 0.0046,
"reward": -0.3844,
"step": 1229
},
{
"epoch": 0.18,
"learning_rate": 2.048136156704833e-06,
"loss": 0.0052,
"reward": -0.3442,
"step": 1239
},
{
"epoch": 0.18,
"learning_rate": 1.9768154994906352e-06,
"loss": 0.0053,
"reward": -0.357,
"step": 1249
},
{
"epoch": 0.18,
"learning_rate": 1.9064515329631267e-06,
"loss": 0.0043,
"reward": -0.2778,
"step": 1259
},
{
"epoch": 0.18,
"learning_rate": 1.837066523891483e-06,
"loss": 0.0066,
"reward": -0.3448,
"step": 1269
},
{
"epoch": 0.18,
"learning_rate": 1.768682429252519e-06,
"loss": 0.0051,
"reward": -0.3787,
"step": 1279
},
{
"epoch": 0.18,
"learning_rate": 1.7013208892823618e-06,
"loss": 0.004,
"reward": -0.4296,
"step": 1289
},
{
"epoch": 0.18,
"learning_rate": 1.6350032206283833e-06,
"loss": 0.0047,
"reward": -0.5144,
"step": 1299
},
{
"epoch": 0.19,
"learning_rate": 1.5697504096035116e-06,
"loss": 0.0052,
"reward": -0.331,
"step": 1309
},
{
"epoch": 0.19,
"learning_rate": 1.5055831055450825e-06,
"loss": 0.0039,
"reward": -0.3365,
"step": 1319
},
{
"epoch": 0.19,
"learning_rate": 1.4425216142803395e-06,
"loss": 0.0047,
"reward": -0.3728,
"step": 1329
},
{
"epoch": 0.19,
"learning_rate": 1.3805858917006148e-06,
"loss": 0.0048,
"reward": -0.3547,
"step": 1339
},
{
"epoch": 0.19,
"learning_rate": 1.3197955374462767e-06,
"loss": 0.0043,
"reward": -0.3398,
"step": 1349
},
{
"epoch": 0.19,
"learning_rate": 1.260169788704383e-06,
"loss": 0.0056,
"reward": -0.2891,
"step": 1359
},
{
"epoch": 0.19,
"learning_rate": 1.2017275141210604e-06,
"loss": 0.0042,
"reward": -0.3165,
"step": 1369
},
{
"epoch": 0.2,
"learning_rate": 1.1444872078304865e-06,
"loss": 0.0039,
"reward": -0.366,
"step": 1379
},
{
"epoch": 0.2,
"learning_rate": 1.0884669836024058e-06,
"loss": 0.004,
"reward": -0.2873,
"step": 1389
},
{
"epoch": 0.2,
"learning_rate": 1.033684569109994e-06,
"loss": 0.0046,
"reward": -0.2922,
"step": 1399
},
{
"epoch": 0.2,
"learning_rate": 9.80157300319926e-07,
"loss": 0.0037,
"reward": -0.3076,
"step": 1409
},
{
"epoch": 0.2,
"learning_rate": 9.279021160063712e-07,
"loss": 0.0066,
"reward": -0.5213,
"step": 1419
},
{
"epoch": 0.2,
"learning_rate": 8.769355523907086e-07,
"loss": 0.0044,
"reward": -0.5063,
"step": 1429
},
{
"epoch": 0.2,
"learning_rate": 8.272737379086088e-07,
"loss": 0.0041,
"reward": -0.5909,
"step": 1439
},
{
"epoch": 0.21,
"learning_rate": 7.789323881061777e-07,
"loss": 0.0044,
"reward": -0.2486,
"step": 1449
},
{
"epoch": 0.21,
"learning_rate": 7.31926800666744e-07,
"loss": 0.0044,
"reward": -0.4971,
"step": 1459
},
{
"epoch": 0.21,
"learning_rate": 6.862718505698945e-07,
"loss": 0.0043,
"reward": -0.5182,
"step": 1469
},
{
"epoch": 0.21,
"learning_rate": 6.419819853842591e-07,
"loss": 0.0046,
"reward": -0.2389,
"step": 1479
},
{
"epoch": 0.21,
"learning_rate": 5.990712206955674e-07,
"loss": 0.0039,
"reward": -0.3725,
"step": 1489
},
{
"epoch": 0.21,
"learning_rate": 5.575531356713904e-07,
"loss": 0.0046,
"reward": -0.4709,
"step": 1499
},
{
"epoch": 0.21,
"learning_rate": 5.17440868764002e-07,
"loss": 0.005,
"reward": -0.3545,
"step": 1509
},
{
"epoch": 0.22,
"learning_rate": 4.787471135526945e-07,
"loss": 0.0041,
"reward": -0.2898,
"step": 1519
},
{
"epoch": 0.22,
"learning_rate": 4.414841147268811e-07,
"loss": 0.0045,
"reward": -0.4093,
"step": 1529
},
{
"epoch": 0.22,
"learning_rate": 4.056636642112427e-07,
"loss": 0.004,
"reward": -0.3965,
"step": 1539
},
{
"epoch": 0.22,
"learning_rate": 3.7129709743416733e-07,
"loss": 0.0045,
"reward": -0.3877,
"step": 1549
},
{
"epoch": 0.22,
"learning_rate": 3.3839528974062843e-07,
"loss": 0.0046,
"reward": -0.3655,
"step": 1559
},
{
"epoch": 0.22,
"learning_rate": 3.069686529506777e-07,
"loss": 0.0045,
"reward": -0.5236,
"step": 1569
},
{
"epoch": 0.22,
"learning_rate": 2.770271320646023e-07,
"loss": 0.0053,
"reward": -0.441,
"step": 1579
},
{
"epoch": 0.22,
"learning_rate": 2.4858020211582346e-07,
"loss": 0.004,
"reward": -0.2618,
"step": 1589
},
{
"epoch": 0.23,
"learning_rate": 2.2163686517250427e-07,
"loss": 0.004,
"reward": -0.4623,
"step": 1599
},
{
"epoch": 0.23,
"learning_rate": 1.962056474888391e-07,
"loss": 0.0038,
"reward": -0.2542,
"step": 1609
},
{
"epoch": 0.23,
"learning_rate": 1.7229459680690176e-07,
"loss": 0.0051,
"reward": -0.377,
"step": 1619
},
{
"epoch": 0.23,
"learning_rate": 1.4991127980993503e-07,
"loss": 0.0046,
"reward": -0.3763,
"step": 1629
},
{
"epoch": 0.23,
"learning_rate": 1.290627797278593e-07,
"loss": 0.0046,
"reward": -0.4469,
"step": 1639
},
{
"epoch": 0.23,
"learning_rate": 1.0975569409577668e-07,
"loss": 0.0054,
"reward": -0.4301,
"step": 1649
},
{
"epoch": 0.23,
"learning_rate": 9.199613266616891e-08,
"loss": 0.0047,
"reward": -0.481,
"step": 1659
},
{
"epoch": 0.24,
"learning_rate": 7.578971547546554e-08,
"loss": 0.0047,
"reward": -0.549,
"step": 1669
},
{
"epoch": 0.24,
"learning_rate": 6.114157106556773e-08,
"loss": 0.0058,
"reward": -0.3571,
"step": 1679
},
{
"epoch": 0.24,
"learning_rate": 4.8056334860921426e-08,
"loss": 0.005,
"reward": -0.3539,
"step": 1689
},
{
"epoch": 0.24,
"learning_rate": 3.653814770162578e-08,
"loss": 0.0037,
"reward": -0.3161,
"step": 1699
},
{
"epoch": 0.24,
"learning_rate": 2.6590654533062975e-08,
"loss": 0.0053,
"reward": -0.4499,
"step": 1709
},
{
"epoch": 0.24,
"learning_rate": 1.821700325244813e-08,
"loss": 0.0047,
"reward": -0.3267,
"step": 1719
},
{
"epoch": 0.24,
"learning_rate": 1.141984371267335e-08,
"loss": 0.0043,
"reward": -0.4299,
"step": 1729
},
{
"epoch": 0.25,
"learning_rate": 6.2013268837601834e-09,
"loss": 0.0051,
"reward": -0.4119,
"step": 1739
},
{
"epoch": 0.25,
"learning_rate": 2.5631041721801976e-09,
"loss": 0.0048,
"reward": -0.492,
"step": 1749
},
{
"epoch": 0.25,
"learning_rate": 5.063268982646863e-10,
"loss": 0.0053,
"reward": -0.5111,
"step": 1759
},
{
"epoch": 0.25,
"learning_rate": 3.164593187165999e-11,
"loss": 0.0057,
"reward": -0.5665,
"step": 1769
},
{
"epoch": 0.25,
"learning_rate": 1.1392114864122727e-09,
"loss": 0.0039,
"reward": -0.3717,
"step": 1779
},
{
"epoch": 0.25,
"learning_rate": 3.828673071319955e-09,
"loss": 0.0049,
"reward": -0.4831,
"step": 1789
},
{
"epoch": 0.25,
"learning_rate": 8.099179602960583e-09,
"loss": 0.0052,
"reward": -0.4277,
"step": 1799
},
{
"epoch": 0.26,
"learning_rate": 1.3949379673960372e-08,
"loss": 0.0047,
"reward": -0.3236,
"step": 1809
},
{
"epoch": 0.26,
"learning_rate": 2.1377421980859903e-08,
"loss": 0.0044,
"reward": -0.4584,
"step": 1819
},
{
"epoch": 0.26,
"learning_rate": 3.0380955909962706e-08,
"loss": 0.004,
"reward": -0.427,
"step": 1829
},
{
"epoch": 0.26,
"learning_rate": 4.095713228118803e-08,
"loss": 0.0046,
"reward": -0.4793,
"step": 1839
},
{
"epoch": 0.26,
"learning_rate": 5.310260424970071e-08,
"loss": 0.005,
"reward": -0.3642,
"step": 1849
},
{
"epoch": 0.26,
"learning_rate": 6.681352836502009e-08,
"loss": 0.0036,
"reward": -0.3235,
"step": 1859
},
{
"epoch": 0.26,
"learning_rate": 8.208556578728933e-08,
"loss": 0.0055,
"reward": -0.4695,
"step": 1869
},
{
"epoch": 0.27,
"learning_rate": 9.891388366030486e-08,
"loss": 0.005,
"reward": -0.3998,
"step": 1879
},
{
"epoch": 0.27,
"learning_rate": 1.172931566408797e-07,
"loss": 0.0046,
"reward": -0.3878,
"step": 1889
},
{
"epoch": 0.27,
"learning_rate": 1.3721756858405545e-07,
"loss": 0.0055,
"reward": -0.5067,
"step": 1899
},
{
"epoch": 0.27,
"learning_rate": 1.5868081438363213e-07,
"loss": 0.0038,
"reward": -0.3747,
"step": 1909
},
{
"epoch": 0.27,
"learning_rate": 1.8167610196742125e-07,
"loss": 0.0052,
"reward": -0.2461,
"step": 1919
},
{
"epoch": 0.27,
"learning_rate": 2.0619615444661012e-07,
"loss": 0.0044,
"reward": -0.2565,
"step": 1929
},
{
"epoch": 0.27,
"learning_rate": 2.3223321241854335e-07,
"loss": 0.0045,
"reward": -0.4397,
"step": 1939
},
{
"epoch": 0.28,
"learning_rate": 2.5977903642219305e-07,
"loss": 0.0047,
"reward": -0.2828,
"step": 1949
},
{
"epoch": 0.28,
"learning_rate": 2.8882490954554976e-07,
"loss": 0.0049,
"reward": -0.361,
"step": 1959
},
{
"epoch": 0.28,
"learning_rate": 3.193616401840971e-07,
"loss": 0.0054,
"reward": -0.2941,
"step": 1969
},
{
"epoch": 0.28,
"learning_rate": 3.513795649495133e-07,
"loss": 0.0046,
"reward": -0.3027,
"step": 1979
},
{
"epoch": 0.28,
"learning_rate": 3.848685517276546e-07,
"loss": 0.0037,
"reward": -0.391,
"step": 1989
},
{
"epoch": 0.28,
"learning_rate": 4.1981800288488416e-07,
"loss": 0.0048,
"reward": -0.3608,
"step": 1999
},
{
"epoch": 0.28,
"learning_rate": 4.562168586217008e-07,
"loss": 0.0044,
"reward": -0.3958,
"step": 2009
},
{
"epoch": 0.29,
"learning_rate": 4.940536004726337e-07,
"loss": 0.0072,
"reward": -0.5123,
"step": 2019
},
{
"epoch": 0.29,
"learning_rate": 5.33316254951266e-07,
"loss": 0.0044,
"reward": -0.2958,
"step": 2029
},
{
"epoch": 0.29,
"learning_rate": 5.739923973392685e-07,
"loss": 0.0047,
"reward": -0.4311,
"step": 2039
},
{
"epoch": 0.29,
"learning_rate": 6.160691556182191e-07,
"loss": 0.0039,
"reward": -0.4388,
"step": 2049
},
{
"epoch": 0.29,
"learning_rate": 6.595332145429611e-07,
"loss": 0.0066,
"reward": -0.4963,
"step": 2059
},
{
"epoch": 0.29,
"learning_rate": 7.043708198552385e-07,
"loss": 0.0051,
"reward": -0.3088,
"step": 2069
},
{
"epoch": 0.29,
"learning_rate": 7.505677826362434e-07,
"loss": 0.0054,
"reward": -0.3397,
"step": 2079
},
{
"epoch": 0.3,
"learning_rate": 7.981094837967229e-07,
"loss": 0.0044,
"reward": -0.4052,
"step": 2089
},
{
"epoch": 0.3,
"learning_rate": 8.46980878703203e-07,
"loss": 0.0049,
"reward": -0.3994,
"step": 2099
},
{
"epoch": 0.3,
"learning_rate": 8.971665019388887e-07,
"loss": 0.0051,
"reward": -0.4254,
"step": 2109
},
{
"epoch": 0.3,
"learning_rate": 9.486504721977097e-07,
"loss": 0.0043,
"reward": -0.3374,
"step": 2119
},
{
"epoch": 0.3,
"learning_rate": 1.0014164973099866e-06,
"loss": 0.0058,
"reward": -0.3854,
"step": 2129
},
{
"epoch": 0.3,
"learning_rate": 1.0554478793981015e-06,
"loss": 0.0043,
"reward": -0.3606,
"step": 2139
},
{
"epoch": 0.3,
"learning_rate": 1.1107275201605728e-06,
"loss": 0.0041,
"reward": -0.3453,
"step": 2149
},
{
"epoch": 0.31,
"learning_rate": 1.1672379262828238e-06,
"loss": 0.008,
"reward": -0.3853,
"step": 2159
},
{
"epoch": 0.31,
"learning_rate": 1.2249612149729779e-06,
"loss": 0.0052,
"reward": -0.3406,
"step": 2169
},
{
"epoch": 0.31,
"learning_rate": 1.2838791196208771e-06,
"loss": 0.0043,
"reward": -0.3696,
"step": 2179
},
{
"epoch": 0.31,
"learning_rate": 1.3439729955785901e-06,
"loss": 0.0048,
"reward": -0.326,
"step": 2189
},
{
"epoch": 0.31,
"learning_rate": 1.40522382606052e-06,
"loss": 0.0048,
"reward": -0.3875,
"step": 2199
},
{
"epoch": 0.31,
"learning_rate": 1.4676122281613041e-06,
"loss": 0.0044,
"reward": -0.3224,
"step": 2209
},
{
"epoch": 0.31,
"learning_rate": 1.5311184589895495e-06,
"loss": 0.0054,
"reward": -0.4125,
"step": 2219
},
{
"epoch": 0.32,
"learning_rate": 1.595722421915507e-06,
"loss": 0.0051,
"reward": -0.5115,
"step": 2229
},
{
"epoch": 0.32,
"learning_rate": 1.6614036729306638e-06,
"loss": 0.0042,
"reward": -0.5107,
"step": 2239
},
{
"epoch": 0.32,
"learning_rate": 1.7281414271172925e-06,
"loss": 0.0048,
"reward": -0.4813,
"step": 2249
},
{
"epoch": 0.32,
"learning_rate": 1.7959145652258408e-06,
"loss": 0.0036,
"reward": -0.2592,
"step": 2259
},
{
"epoch": 0.32,
"learning_rate": 1.8647016403581763e-06,
"loss": 0.0045,
"reward": -0.431,
"step": 2269
},
{
"epoch": 0.32,
"learning_rate": 1.9344808847544632e-06,
"loss": 0.0045,
"reward": -0.3041,
"step": 2279
},
{
"epoch": 0.32,
"learning_rate": 2.0052302166816017e-06,
"loss": 0.0054,
"reward": -0.2746,
"step": 2289
},
{
"epoch": 0.33,
"learning_rate": 2.0769272474210374e-06,
"loss": 0.0059,
"reward": -0.5107,
"step": 2299
},
{
"epoch": 0.33,
"learning_rate": 2.149549288353676e-06,
"loss": 0.0044,
"reward": -0.3638,
"step": 2309
},
{
"epoch": 0.33,
"learning_rate": 2.2230733581397427e-06,
"loss": 0.0055,
"reward": -0.3426,
"step": 2319
},
{
"epoch": 0.33,
"learning_rate": 2.297476189991249e-06,
"loss": 0.0048,
"reward": -0.3561,
"step": 2329
},
{
"epoch": 0.33,
"learning_rate": 2.3727342390348134e-06,
"loss": 0.0058,
"reward": -0.334,
"step": 2339
},
{
"epoch": 0.33,
"learning_rate": 2.4488236897624457e-06,
"loss": 0.0043,
"reward": -0.188,
"step": 2349
},
{
"epoch": 0.33,
"learning_rate": 2.525720463568009e-06,
"loss": 0.005,
"reward": -0.3464,
"step": 2359
},
{
"epoch": 0.34,
"learning_rate": 2.603400226366919e-06,
"loss": 0.0041,
"reward": -0.4215,
"step": 2369
},
{
"epoch": 0.34,
"learning_rate": 2.6818383962967134e-06,
"loss": 0.0059,
"reward": -0.483,
"step": 2379
},
{
"epoch": 0.34,
"learning_rate": 2.7610101514959903e-06,
"loss": 0.0042,
"reward": -0.3055,
"step": 2389
},
{
"epoch": 0.34,
"learning_rate": 2.840890437959334e-06,
"loss": 0.0047,
"reward": -0.2254,
"step": 2399
},
{
"epoch": 0.34,
"learning_rate": 2.92145397746568e-06,
"loss": 0.0049,
"reward": -0.4363,
"step": 2409
},
{
"epoch": 0.34,
"learning_rate": 3.002675275577659e-06,
"loss": 0.0053,
"reward": -0.4156,
"step": 2419
},
{
"epoch": 0.34,
"learning_rate": 3.0845286297093167e-06,
"loss": 0.0032,
"reward": -0.3561,
"step": 2429
},
{
"epoch": 0.35,
"learning_rate": 3.166988137259751e-06,
"loss": 0.0039,
"reward": -0.4369,
"step": 2439
},
{
"epoch": 0.35,
"learning_rate": 3.2500277038100126e-06,
"loss": 0.0049,
"reward": -0.431,
"step": 2449
},
{
"epoch": 0.35,
"learning_rate": 3.3336210513807366e-06,
"loss": 0.0047,
"reward": -0.4215,
"step": 2459
},
{
"epoch": 0.35,
"learning_rate": 3.4177417267478236e-06,
"loss": 0.0044,
"reward": -0.2004,
"step": 2469
},
{
"epoch": 0.35,
"learning_rate": 3.502363109813589e-06,
"loss": 0.0039,
"reward": -0.3595,
"step": 2479
},
{
"epoch": 0.35,
"learning_rate": 3.5874584220307686e-06,
"loss": 0.0052,
"reward": -0.3384,
"step": 2489
},
{
"epoch": 0.35,
"learning_rate": 3.673000734876582e-06,
"loss": 0.0042,
"reward": -0.1973,
"step": 2499
},
{
"epoch": 0.36,
"learning_rate": 3.7589629783743188e-06,
"loss": 0.0044,
"reward": -0.4077,
"step": 2509
},
{
"epoch": 0.36,
"learning_rate": 3.845317949659662e-06,
"loss": 0.0046,
"reward": -0.5238,
"step": 2519
},
{
"epoch": 0.36,
"learning_rate": 3.932038321589086e-06,
"loss": 0.0039,
"reward": -0.3569,
"step": 2529
},
{
"epoch": 0.36,
"learning_rate": 4.01909665138754e-06,
"loss": 0.005,
"reward": -0.4477,
"step": 2539
},
{
"epoch": 0.36,
"learning_rate": 4.106465389332768e-06,
"loss": 0.0035,
"reward": -0.303,
"step": 2549
},
{
"epoch": 0.36,
"learning_rate": 4.194116887473445e-06,
"loss": 0.0042,
"reward": -0.3804,
"step": 2559
},
{
"epoch": 0.36,
"learning_rate": 4.282023408378429e-06,
"loss": 0.0047,
"reward": -0.4259,
"step": 2569
},
{
"epoch": 0.37,
"learning_rate": 4.370157133914274e-06,
"loss": 0.0049,
"reward": -0.3371,
"step": 2579
},
{
"epoch": 0.37,
"learning_rate": 4.458490174048339e-06,
"loss": 0.0047,
"reward": -0.4291,
"step": 2589
},
{
"epoch": 0.37,
"learning_rate": 4.5469945756746145e-06,
"loss": 0.0044,
"reward": -0.4106,
"step": 2599
},
{
"epoch": 0.37,
"learning_rate": 4.635642331459522e-06,
"loss": 0.0036,
"reward": -0.3657,
"step": 2609
},
{
"epoch": 0.37,
"learning_rate": 4.724405388704883e-06,
"loss": 0.0052,
"reward": -0.3857,
"step": 2619
},
{
"epoch": 0.37,
"learning_rate": 4.804367507873278e-06,
"loss": 0.0047,
"reward": -0.3853,
"step": 2629
},
{
"epoch": 0.37,
"learning_rate": 4.893272229193558e-06,
"loss": 0.0054,
"reward": -0.3366,
"step": 2639
},
{
"epoch": 0.38,
"learning_rate": 4.982210724656406e-06,
"loss": 0.005,
"reward": -0.4375,
"step": 2649
},
{
"epoch": 0.38,
"learning_rate": 5.071154849558693e-06,
"loss": 0.0043,
"reward": -0.3342,
"step": 2659
},
{
"epoch": 0.38,
"learning_rate": 5.160076457415833e-06,
"loss": 0.0047,
"reward": -0.4507,
"step": 2669
},
{
"epoch": 0.38,
"learning_rate": 5.248947408868792e-06,
"loss": 0.005,
"reward": -0.4391,
"step": 2679
},
{
"epoch": 0.38,
"learning_rate": 5.33773958058882e-06,
"loss": 0.0036,
"reward": -0.2586,
"step": 2689
},
{
"epoch": 0.38,
"learning_rate": 5.426424874177126e-06,
"loss": 0.0044,
"reward": -0.4199,
"step": 2699
},
{
"epoch": 0.38,
"learning_rate": 5.514975225056634e-06,
"loss": 0.0044,
"reward": -0.4119,
"step": 2709
},
{
"epoch": 0.38,
"learning_rate": 5.603362611353067e-06,
"loss": 0.0043,
"reward": -0.3747,
"step": 2719
},
{
"epoch": 0.39,
"learning_rate": 5.691559062762498e-06,
"loss": 0.0051,
"reward": -0.2529,
"step": 2729
},
{
"epoch": 0.39,
"learning_rate": 5.779536669402608e-06,
"loss": 0.0038,
"reward": -0.3863,
"step": 2739
},
{
"epoch": 0.39,
"learning_rate": 5.867267590644781e-06,
"loss": 0.0043,
"reward": -0.4039,
"step": 2749
},
{
"epoch": 0.39,
"learning_rate": 5.954724063924317e-06,
"loss": 0.0041,
"reward": -0.3328,
"step": 2759
},
{
"epoch": 0.39,
"learning_rate": 6.041878413525932e-06,
"loss": 0.0044,
"reward": -0.2204,
"step": 2769
},
{
"epoch": 0.39,
"learning_rate": 6.128703059341788e-06,
"loss": 0.0056,
"reward": -0.4904,
"step": 2779
},
{
"epoch": 0.39,
"learning_rate": 6.215170525599225e-06,
"loss": 0.0047,
"reward": -0.3944,
"step": 2789
},
{
"epoch": 0.4,
"learning_rate": 6.3012534495555275e-06,
"loss": 0.0053,
"reward": -0.5206,
"step": 2799
},
{
"epoch": 0.4,
"learning_rate": 6.386924590156891e-06,
"loss": 0.0033,
"reward": -0.1779,
"step": 2809
},
{
"epoch": 0.4,
"learning_rate": 6.4721568366589e-06,
"loss": 0.0045,
"reward": -0.4464,
"step": 2819
},
{
"epoch": 0.4,
"learning_rate": 6.556923217205744e-06,
"loss": 0.0036,
"reward": -0.3361,
"step": 2829
},
{
"epoch": 0.4,
"learning_rate": 6.641196907365499e-06,
"loss": 0.0049,
"reward": -0.5177,
"step": 2839
},
{
"epoch": 0.4,
"learning_rate": 6.724951238618763e-06,
"loss": 0.0048,
"reward": -0.4074,
"step": 2849
},
{
"epoch": 0.4,
"learning_rate": 6.808159706797949e-06,
"loss": 0.0047,
"reward": -0.2954,
"step": 2859
},
{
"epoch": 0.41,
"learning_rate": 6.8907959804745294e-06,
"loss": 0.0046,
"reward": -0.312,
"step": 2869
},
{
"epoch": 0.41,
"learning_rate": 6.972833909291694e-06,
"loss": 0.0053,
"reward": -0.4417,
"step": 2879
},
{
"epoch": 0.41,
"learning_rate": 7.054247532239634e-06,
"loss": 0.0036,
"reward": -0.2909,
"step": 2889
},
{
"epoch": 0.41,
"learning_rate": 7.135011085870964e-06,
"loss": 0.0044,
"reward": -0.2276,
"step": 2899
},
{
"epoch": 0.41,
"learning_rate": 7.21509901245357e-06,
"loss": 0.0049,
"reward": -0.3637,
"step": 2909
},
{
"epoch": 0.41,
"learning_rate": 7.294485968058404e-06,
"loss": 0.0067,
"reward": -0.3676,
"step": 2919
},
{
"epoch": 0.41,
"learning_rate": 7.373146830579596e-06,
"loss": 0.0034,
"reward": -0.1601,
"step": 2929
},
{
"epoch": 0.42,
"learning_rate": 7.451056707684396e-06,
"loss": 0.0041,
"reward": -0.1314,
"step": 2939
},
{
"epoch": 0.42,
"learning_rate": 7.528190944690356e-06,
"loss": 0.0035,
"reward": -0.2391,
"step": 2949
},
{
"epoch": 0.42,
"learning_rate": 7.6045251323673555e-06,
"loss": 0.0052,
"reward": -0.4042,
"step": 2959
},
{
"epoch": 0.42,
"learning_rate": 7.680035114661929e-06,
"loss": 0.0044,
"reward": -0.3035,
"step": 2969
},
{
"epoch": 0.42,
"learning_rate": 7.754696996341486e-06,
"loss": 0.0049,
"reward": -0.3552,
"step": 2979
},
{
"epoch": 0.42,
"learning_rate": 7.828487150555979e-06,
"loss": 0.0049,
"reward": -0.2459,
"step": 2989
},
{
"epoch": 0.42,
"learning_rate": 7.901382226314662e-06,
"loss": 0.0036,
"reward": -0.2989,
"step": 2999
},
{
"epoch": 0.43,
"learning_rate": 7.973359155875521e-06,
"loss": 0.0057,
"reward": -0.5494,
"step": 3009
},
{
"epoch": 0.43,
"learning_rate": 8.044395162045135e-06,
"loss": 0.0045,
"reward": -0.3344,
"step": 3019
},
{
"epoch": 0.43,
"learning_rate": 8.11446776538649e-06,
"loss": 0.0045,
"reward": -0.3949,
"step": 3029
},
{
"epoch": 0.43,
"learning_rate": 8.183554791332675e-06,
"loss": 0.004,
"reward": -0.2997,
"step": 3039
},
{
"epoch": 0.43,
"learning_rate": 8.251634377204023e-06,
"loss": 0.0034,
"reward": -0.2654,
"step": 3049
},
{
"epoch": 0.43,
"learning_rate": 8.318684979126607e-06,
"loss": 0.0034,
"reward": -0.4013,
"step": 3059
},
{
"epoch": 0.43,
"learning_rate": 8.384685378849804e-06,
"loss": 0.0035,
"reward": -0.4777,
"step": 3069
},
{
"epoch": 0.44,
"learning_rate": 8.44961469046086e-06,
"loss": 0.0031,
"reward": -0.3525,
"step": 3079
},
{
"epoch": 0.44,
"learning_rate": 8.513452366994242e-06,
"loss": 0.0042,
"reward": -0.1987,
"step": 3089
},
{
"epoch": 0.44,
"learning_rate": 8.576178206933773e-06,
"loss": 0.0043,
"reward": -0.439,
"step": 3099
},
{
"epoch": 0.44,
"learning_rate": 8.637772360605421e-06,
"loss": 0.0048,
"reward": -0.2539,
"step": 3109
},
{
"epoch": 0.44,
"learning_rate": 8.698215336458743e-06,
"loss": 0.0047,
"reward": -0.4092,
"step": 3119
},
{
"epoch": 0.44,
"learning_rate": 8.757488007235031e-06,
"loss": 0.0042,
"reward": -0.288,
"step": 3129
},
{
"epoch": 0.44,
"learning_rate": 8.815571616020147e-06,
"loss": 0.0033,
"reward": -0.2841,
"step": 3139
},
{
"epoch": 0.45,
"learning_rate": 8.872447782180168e-06,
"loss": 0.0044,
"reward": -0.3696,
"step": 3149
},
{
"epoch": 0.45,
"learning_rate": 8.92809850717797e-06,
"loss": 0.0034,
"reward": -0.1236,
"step": 3159
},
{
"epoch": 0.45,
"learning_rate": 8.982506180268895e-06,
"loss": 0.0041,
"reward": -0.269,
"step": 3169
},
{
"epoch": 0.45,
"learning_rate": 9.035653584073673e-06,
"loss": 0.0029,
"reward": -0.4326,
"step": 3179
},
{
"epoch": 0.45,
"learning_rate": 9.087523900026907e-06,
"loss": 0.0038,
"reward": -0.2885,
"step": 3189
},
{
"epoch": 0.45,
"learning_rate": 9.138100713699312e-06,
"loss": 0.0038,
"reward": -0.3119,
"step": 3199
},
{
"epoch": 0.45,
"learning_rate": 9.187368019992092e-06,
"loss": 0.004,
"reward": -0.196,
"step": 3209
},
{
"epoch": 0.46,
"learning_rate": 9.235310228201782e-06,
"loss": 0.0036,
"reward": -0.3683,
"step": 3219
},
{
"epoch": 0.46,
"learning_rate": 9.281912166953929e-06,
"loss": 0.0035,
"reward": -0.1997,
"step": 3229
},
{
"epoch": 0.46,
"learning_rate": 9.327159089004098e-06,
"loss": 0.0042,
"reward": -0.403,
"step": 3239
},
{
"epoch": 0.46,
"learning_rate": 9.371036675904667e-06,
"loss": 0.0034,
"reward": -0.3637,
"step": 3249
},
{
"epoch": 0.46,
"learning_rate": 9.413531042535915e-06,
"loss": 0.0049,
"reward": -0.3766,
"step": 3259
},
{
"epoch": 0.46,
"learning_rate": 9.454628741499976e-06,
"loss": 0.0044,
"reward": -0.3386,
"step": 3269
},
{
"epoch": 0.46,
"learning_rate": 9.494316767376295e-06,
"loss": 0.0039,
"reward": -0.3232,
"step": 3279
},
{
"epoch": 0.47,
"learning_rate": 9.532582560837204e-06,
"loss": 0.0032,
"reward": -0.3225,
"step": 3289
},
{
"epoch": 0.47,
"learning_rate": 9.569414012622356e-06,
"loss": 0.0037,
"reward": -0.3366,
"step": 3299
},
{
"epoch": 0.47,
"learning_rate": 9.604799467370689e-06,
"loss": 0.0038,
"reward": -0.3283,
"step": 3309
},
{
"epoch": 0.47,
"learning_rate": 9.63872772730879e-06,
"loss": 0.0033,
"reward": -0.2504,
"step": 3319
},
{
"epoch": 0.47,
"learning_rate": 9.671188055794462e-06,
"loss": 0.0037,
"reward": -0.3496,
"step": 3329
},
{
"epoch": 0.47,
"learning_rate": 9.702170180714328e-06,
"loss": 0.0035,
"reward": -0.3135,
"step": 3339
},
{
"epoch": 0.47,
"learning_rate": 9.731664297734458e-06,
"loss": 0.0036,
"reward": -0.3951,
"step": 3349
},
{
"epoch": 0.48,
"learning_rate": 9.75966107340297e-06,
"loss": 0.0032,
"reward": -0.4631,
"step": 3359
},
{
"epoch": 0.48,
"learning_rate": 9.786151648103613e-06,
"loss": 0.0036,
"reward": -0.3141,
"step": 3369
},
{
"epoch": 0.48,
"learning_rate": 9.811127638859398e-06,
"loss": 0.003,
"reward": -0.1992,
"step": 3379
},
{
"epoch": 0.48,
"learning_rate": 9.834581141985404e-06,
"loss": 0.0038,
"reward": -0.3201,
"step": 3389
},
{
"epoch": 0.48,
"learning_rate": 9.85650473558991e-06,
"loss": 0.0031,
"reward": -0.1577,
"step": 3399
},
{
"epoch": 0.48,
"learning_rate": 9.876891481923067e-06,
"loss": 0.0035,
"reward": -0.2635,
"step": 3409
},
{
"epoch": 0.48,
"learning_rate": 9.89573492957235e-06,
"loss": 0.0044,
"reward": -0.3539,
"step": 3419
},
{
"epoch": 0.49,
"learning_rate": 9.91302911550412e-06,
"loss": 0.0031,
"reward": -0.3358,
"step": 3429
},
{
"epoch": 0.49,
"learning_rate": 9.928768566950632e-06,
"loss": 0.0041,
"reward": -0.2885,
"step": 3439
},
{
"epoch": 0.49,
"learning_rate": 9.94294830314191e-06,
"loss": 0.0037,
"reward": -0.2699,
"step": 3449
},
{
"epoch": 0.49,
"learning_rate": 9.955563836881898e-06,
"loss": 0.0053,
"reward": -0.317,
"step": 3459
},
{
"epoch": 0.49,
"learning_rate": 9.966611175968454e-06,
"loss": 0.0035,
"reward": -0.2824,
"step": 3469
},
{
"epoch": 0.49,
"learning_rate": 9.976086824456686e-06,
"loss": 0.004,
"reward": -0.5046,
"step": 3479
},
{
"epoch": 0.49,
"learning_rate": 9.983987783765243e-06,
"loss": 0.0036,
"reward": -0.4293,
"step": 3489
},
{
"epoch": 0.5,
"learning_rate": 9.990311553625227e-06,
"loss": 0.0034,
"reward": -0.1845,
"step": 3499
},
{
"epoch": 0.5,
"learning_rate": 9.995056132871399e-06,
"loss": 0.0035,
"reward": -0.1239,
"step": 3509
},
{
"epoch": 0.5,
"learning_rate": 9.998220020075455e-06,
"loss": 0.0031,
"reward": -0.271,
"step": 3519
},
{
"epoch": 0.5,
"learning_rate": 9.999802214021156e-06,
"loss": 0.0039,
"reward": -0.4751,
"step": 3529
},
{
"epoch": 0.5,
"learning_rate": 9.999802214021156e-06,
"loss": 0.0038,
"reward": -0.454,
"step": 3539
},
{
"epoch": 0.5,
"learning_rate": 9.998220020075455e-06,
"loss": 0.0041,
"reward": -0.1722,
"step": 3549
},
{
"epoch": 0.5,
"learning_rate": 9.9950561328714e-06,
"loss": 0.0041,
"reward": -0.2842,
"step": 3559
},
{
"epoch": 0.51,
"learning_rate": 9.990311553625229e-06,
"loss": 0.0042,
"reward": -0.3914,
"step": 3569
},
{
"epoch": 0.51,
"learning_rate": 9.983987783765245e-06,
"loss": 0.0038,
"reward": -0.1993,
"step": 3579
},
{
"epoch": 0.51,
"learning_rate": 9.976086824456686e-06,
"loss": 0.0029,
"reward": -0.3465,
"step": 3589
},
{
"epoch": 0.51,
"learning_rate": 9.966611175968454e-06,
"loss": 0.0034,
"reward": -0.209,
"step": 3599
},
{
"epoch": 0.51,
"learning_rate": 9.955563836881898e-06,
"loss": 0.0032,
"reward": -0.2638,
"step": 3609
},
{
"epoch": 0.51,
"learning_rate": 9.94294830314191e-06,
"loss": 0.0032,
"reward": -0.2811,
"step": 3619
},
{
"epoch": 0.51,
"learning_rate": 9.928768566950635e-06,
"loss": 0.0038,
"reward": -0.3794,
"step": 3629
},
{
"epoch": 0.52,
"learning_rate": 9.91302911550412e-06,
"loss": 0.0038,
"reward": -0.3419,
"step": 3639
},
{
"epoch": 0.52,
"learning_rate": 9.89573492957235e-06,
"loss": 0.0032,
"reward": -0.31,
"step": 3649
},
{
"epoch": 0.52,
"learning_rate": 9.876891481923067e-06,
"loss": 0.0034,
"reward": -0.1604,
"step": 3659
},
{
"epoch": 0.52,
"learning_rate": 9.85650473558991e-06,
"loss": 0.0035,
"reward": -0.1379,
"step": 3669
},
{
"epoch": 0.52,
"learning_rate": 9.834581141985404e-06,
"loss": 0.0038,
"reward": -0.3531,
"step": 3679
},
{
"epoch": 0.52,
"learning_rate": 9.811127638859398e-06,
"loss": 0.0028,
"reward": -0.1161,
"step": 3689
},
{
"epoch": 0.52,
"learning_rate": 9.786151648103615e-06,
"loss": 0.0038,
"reward": -0.3628,
"step": 3699
},
{
"epoch": 0.53,
"learning_rate": 9.759661073402971e-06,
"loss": 0.0033,
"reward": -0.3243,
"step": 3709
},
{
"epoch": 0.53,
"learning_rate": 9.731664297734458e-06,
"loss": 0.0028,
"reward": -0.2481,
"step": 3719
},
{
"epoch": 0.53,
"learning_rate": 9.702170180714325e-06,
"loss": 0.0038,
"reward": -0.3179,
"step": 3729
},
{
"epoch": 0.53,
"learning_rate": 9.671188055794462e-06,
"loss": 0.0028,
"reward": -0.1725,
"step": 3739
},
{
"epoch": 0.53,
"learning_rate": 9.63872772730879e-06,
"loss": 0.003,
"reward": -0.1697,
"step": 3749
},
{
"epoch": 0.53,
"learning_rate": 9.604799467370689e-06,
"loss": 0.0035,
"reward": -0.2575,
"step": 3759
},
{
"epoch": 0.53,
"learning_rate": 9.569414012622358e-06,
"loss": 0.0043,
"reward": -0.3233,
"step": 3769
},
{
"epoch": 0.54,
"learning_rate": 9.532582560837208e-06,
"loss": 0.0031,
"reward": -0.0708,
"step": 3779
},
{
"epoch": 0.54,
"learning_rate": 9.494316767376295e-06,
"loss": 0.003,
"reward": -0.2344,
"step": 3789
},
{
"epoch": 0.54,
"learning_rate": 9.454628741499978e-06,
"loss": 0.0036,
"reward": -0.0844,
"step": 3799
},
{
"epoch": 0.54,
"learning_rate": 9.413531042535916e-06,
"loss": 0.0034,
"reward": -0.2369,
"step": 3809
},
{
"epoch": 0.54,
"learning_rate": 9.371036675904671e-06,
"loss": 0.0036,
"reward": -0.3654,
"step": 3819
},
{
"epoch": 0.54,
"learning_rate": 9.327159089004098e-06,
"loss": 0.0036,
"reward": -0.2757,
"step": 3829
},
{
"epoch": 0.54,
"learning_rate": 9.281912166953932e-06,
"loss": 0.0035,
"reward": -0.3181,
"step": 3839
},
{
"epoch": 0.54,
"learning_rate": 9.235310228201784e-06,
"loss": 0.0048,
"reward": -0.3122,
"step": 3849
},
{
"epoch": 0.55,
"learning_rate": 9.187368019992095e-06,
"loss": 0.0033,
"reward": -0.2904,
"step": 3859
},
{
"epoch": 0.55,
"learning_rate": 9.138100713699312e-06,
"loss": 0.0037,
"reward": -0.1489,
"step": 3869
},
{
"epoch": 0.55,
"learning_rate": 9.087523900026905e-06,
"loss": 0.0029,
"reward": -0.2763,
"step": 3879
},
{
"epoch": 0.55,
"learning_rate": 9.035653584073675e-06,
"loss": 0.003,
"reward": -0.1622,
"step": 3889
},
{
"epoch": 0.55,
"learning_rate": 8.982506180268893e-06,
"loss": 0.0034,
"reward": -0.3107,
"step": 3899
},
{
"epoch": 0.55,
"learning_rate": 8.928098507177972e-06,
"loss": 0.0033,
"reward": -0.1224,
"step": 3909
},
{
"epoch": 0.55,
"learning_rate": 8.872447782180166e-06,
"loss": 0.0038,
"reward": -0.1105,
"step": 3919
},
{
"epoch": 0.56,
"learning_rate": 8.815571616020149e-06,
"loss": 0.0031,
"reward": -0.348,
"step": 3929
},
{
"epoch": 0.56,
"learning_rate": 8.757488007235031e-06,
"loss": 0.0035,
"reward": -0.2409,
"step": 3939
},
{
"epoch": 0.56,
"learning_rate": 8.698215336458744e-06,
"loss": 0.0033,
"reward": -0.3141,
"step": 3949
},
{
"epoch": 0.56,
"learning_rate": 8.637772360605418e-06,
"loss": 0.0034,
"reward": -0.313,
"step": 3959
},
{
"epoch": 0.56,
"learning_rate": 8.576178206933775e-06,
"loss": 0.0046,
"reward": -0.4326,
"step": 3969
},
{
"epoch": 0.56,
"learning_rate": 8.513452366994239e-06,
"loss": 0.003,
"reward": -0.1198,
"step": 3979
},
{
"epoch": 0.56,
"learning_rate": 8.44961469046086e-06,
"loss": 0.0032,
"reward": -0.1191,
"step": 3989
},
{
"epoch": 0.57,
"learning_rate": 8.384685378849806e-06,
"loss": 0.0041,
"reward": -0.3557,
"step": 3999
},
{
"epoch": 0.57,
"learning_rate": 8.318684979126612e-06,
"loss": 0.0035,
"reward": -0.2939,
"step": 4009
},
{
"epoch": 0.57,
"learning_rate": 8.251634377204026e-06,
"loss": 0.0049,
"reward": -0.1952,
"step": 4019
},
{
"epoch": 0.57,
"learning_rate": 8.183554791332677e-06,
"loss": 0.0029,
"reward": -0.1551,
"step": 4029
},
{
"epoch": 0.57,
"learning_rate": 8.114467765386494e-06,
"loss": 0.0043,
"reward": -0.2539,
"step": 4039
},
{
"epoch": 0.57,
"learning_rate": 8.04439516204514e-06,
"loss": 0.0037,
"reward": -0.3849,
"step": 4049
},
{
"epoch": 0.57,
"learning_rate": 7.973359155875525e-06,
"loss": 0.0028,
"reward": -0.2593,
"step": 4059
},
{
"epoch": 0.58,
"learning_rate": 7.901382226314662e-06,
"loss": 0.0032,
"reward": -0.1569,
"step": 4069
},
{
"epoch": 0.58,
"learning_rate": 7.82848715055598e-06,
"loss": 0.0035,
"reward": -0.4049,
"step": 4079
},
{
"epoch": 0.58,
"learning_rate": 7.75469699634149e-06,
"loss": 0.0031,
"reward": -0.3131,
"step": 4089
},
{
"epoch": 0.58,
"learning_rate": 7.68003511466193e-06,
"loss": 0.0033,
"reward": -0.2798,
"step": 4099
},
{
"epoch": 0.58,
"learning_rate": 7.604525132367354e-06,
"loss": 0.0034,
"reward": -0.2349,
"step": 4109
},
{
"epoch": 0.58,
"learning_rate": 7.528190944690358e-06,
"loss": 0.0034,
"reward": -0.1358,
"step": 4119
},
{
"epoch": 0.58,
"learning_rate": 7.4510567076843945e-06,
"loss": 0.0034,
"reward": -0.1241,
"step": 4129
},
{
"epoch": 0.59,
"learning_rate": 7.373146830579598e-06,
"loss": 0.0031,
"reward": -0.0964,
"step": 4139
},
{
"epoch": 0.59,
"learning_rate": 7.294485968058401e-06,
"loss": 0.003,
"reward": -0.064,
"step": 4149
},
{
"epoch": 0.59,
"learning_rate": 7.2150990124535726e-06,
"loss": 0.0031,
"reward": -0.2023,
"step": 4159
},
{
"epoch": 0.59,
"learning_rate": 7.135011085870962e-06,
"loss": 0.0035,
"reward": -0.3227,
"step": 4169
},
{
"epoch": 0.59,
"learning_rate": 7.054247532239637e-06,
"loss": 0.003,
"reward": -0.1747,
"step": 4179
},
{
"epoch": 0.59,
"learning_rate": 6.9728339092916915e-06,
"loss": 0.003,
"reward": -0.1391,
"step": 4189
},
{
"epoch": 0.59,
"learning_rate": 6.890795980474532e-06,
"loss": 0.0029,
"reward": -0.1152,
"step": 4199
},
{
"epoch": 0.6,
"learning_rate": 6.808159706797946e-06,
"loss": 0.0027,
"reward": -0.225,
"step": 4209
},
{
"epoch": 0.6,
"learning_rate": 6.72495123861877e-06,
"loss": 0.0035,
"reward": -0.107,
"step": 4219
},
{
"epoch": 0.6,
"learning_rate": 6.6411969073655014e-06,
"loss": 0.0037,
"reward": -0.213,
"step": 4229
},
{
"epoch": 0.6,
"learning_rate": 6.55692321720575e-06,
"loss": 0.0029,
"reward": -0.2639,
"step": 4239
},
{
"epoch": 0.6,
"learning_rate": 6.472156836658903e-06,
"loss": 0.0034,
"reward": -0.1981,
"step": 4249
},
{
"epoch": 0.6,
"learning_rate": 6.386924590156898e-06,
"loss": 0.0035,
"reward": -0.1908,
"step": 4259
},
{
"epoch": 0.6,
"learning_rate": 6.301253449555531e-06,
"loss": 0.0028,
"reward": -0.2391,
"step": 4269
},
{
"epoch": 0.61,
"learning_rate": 6.215170525599231e-06,
"loss": 0.0032,
"reward": -0.2446,
"step": 4279
},
{
"epoch": 0.61,
"learning_rate": 6.128703059341789e-06,
"loss": 0.0032,
"reward": -0.2266,
"step": 4289
},
{
"epoch": 0.61,
"learning_rate": 6.041878413525939e-06,
"loss": 0.0032,
"reward": -0.1362,
"step": 4299
},
{
"epoch": 0.61,
"learning_rate": 5.9547240639243184e-06,
"loss": 0.0035,
"reward": -0.2071,
"step": 4309
},
{
"epoch": 0.61,
"learning_rate": 5.867267590644787e-06,
"loss": 0.0035,
"reward": -0.2041,
"step": 4319
},
{
"epoch": 0.61,
"learning_rate": 5.779536669402611e-06,
"loss": 0.0029,
"reward": -0.2125,
"step": 4329
},
{
"epoch": 0.61,
"learning_rate": 5.6915590627625005e-06,
"loss": 0.0033,
"reward": -0.3047,
"step": 4339
},
{
"epoch": 0.62,
"learning_rate": 5.60336261135307e-06,
"loss": 0.0028,
"reward": -0.2644,
"step": 4349
},
{
"epoch": 0.62,
"learning_rate": 5.514975225056633e-06,
"loss": 0.0032,
"reward": -0.2142,
"step": 4359
},
{
"epoch": 0.62,
"learning_rate": 5.4264248741771295e-06,
"loss": 0.0036,
"reward": -0.2406,
"step": 4369
},
{
"epoch": 0.62,
"learning_rate": 5.337739580588822e-06,
"loss": 0.0035,
"reward": -0.4047,
"step": 4379
},
{
"epoch": 0.62,
"learning_rate": 5.248947408868794e-06,
"loss": 0.0034,
"reward": -0.1619,
"step": 4389
},
{
"epoch": 0.62,
"learning_rate": 5.16007645741583e-06,
"loss": 0.0028,
"reward": -0.1479,
"step": 4399
},
{
"epoch": 0.62,
"learning_rate": 5.071154849558695e-06,
"loss": 0.0034,
"reward": -0.1027,
"step": 4409
},
{
"epoch": 0.63,
"learning_rate": 4.982210724656409e-06,
"loss": 0.0034,
"reward": -0.2996,
"step": 4419
},
{
"epoch": 0.63,
"learning_rate": 4.893272229193561e-06,
"loss": 0.0028,
"reward": -0.0919,
"step": 4429
},
{
"epoch": 0.63,
"learning_rate": 4.804367507873277e-06,
"loss": 0.003,
"reward": -0.1089,
"step": 4439
},
{
"epoch": 0.63,
"learning_rate": 4.715524694710839e-06,
"loss": 0.0031,
"reward": -0.1168,
"step": 4449
},
{
"epoch": 0.63,
"learning_rate": 4.626771904130584e-06,
"loss": 0.0034,
"reward": -0.3128,
"step": 4459
},
{
"epoch": 0.63,
"learning_rate": 4.538137222069105e-06,
"loss": 0.0041,
"reward": -0.1848,
"step": 4469
},
{
"epoch": 0.63,
"learning_rate": 4.449648697087378e-06,
"loss": 0.0032,
"reward": -0.1865,
"step": 4479
},
{
"epoch": 0.64,
"learning_rate": 4.361334331494812e-06,
"loss": 0.0027,
"reward": -0.1697,
"step": 4489
},
{
"epoch": 0.64,
"learning_rate": 4.2732220724878194e-06,
"loss": 0.0035,
"reward": -0.1667,
"step": 4499
},
{
"epoch": 0.64,
"learning_rate": 4.185339803305934e-06,
"loss": 0.0038,
"reward": -0.3111,
"step": 4509
},
{
"epoch": 0.64,
"learning_rate": 4.097715334408112e-06,
"loss": 0.003,
"reward": -0.2045,
"step": 4519
},
{
"epoch": 0.64,
"learning_rate": 4.010376394672062e-06,
"loss": 0.0031,
"reward": -0.152,
"step": 4529
},
{
"epoch": 0.64,
"learning_rate": 3.9233506226194126e-06,
"loss": 0.003,
"reward": -0.2064,
"step": 4539
},
{
"epoch": 0.64,
"learning_rate": 3.836665557669496e-06,
"loss": 0.0034,
"reward": -0.2407,
"step": 4549
},
{
"epoch": 0.65,
"learning_rate": 3.750348631424402e-06,
"loss": 0.0029,
"reward": -0.1174,
"step": 4559
},
{
"epoch": 0.65,
"learning_rate": 3.66442715898827e-06,
"loss": 0.003,
"reward": -0.2606,
"step": 4569
},
{
"epoch": 0.65,
"learning_rate": 3.578928330323367e-06,
"loss": 0.003,
"reward": -0.1689,
"step": 4579
},
{
"epoch": 0.65,
"learning_rate": 3.493879201645759e-06,
"loss": 0.0028,
"reward": -0.2151,
"step": 4589
},
{
"epoch": 0.65,
"learning_rate": 3.409306686863399e-06,
"loss": 0.0029,
"reward": -0.1513,
"step": 4599
},
{
"epoch": 0.65,
"learning_rate": 3.3252375490591217e-06,
"loss": 0.0026,
"reward": -0.0821,
"step": 4609
},
{
"epoch": 0.65,
"learning_rate": 3.24169839202147e-06,
"loss": 0.0034,
"reward": -0.2608,
"step": 4619
},
{
"epoch": 0.66,
"learning_rate": 3.158715651825871e-06,
"loss": 0.0028,
"reward": -0.132,
"step": 4629
},
{
"epoch": 0.66,
"learning_rate": 3.076315588468941e-06,
"loss": 0.0028,
"reward": -0.0714,
"step": 4639
},
{
"epoch": 0.66,
"learning_rate": 2.9945242775584143e-06,
"loss": 0.0032,
"reward": -0.2108,
"step": 4649
},
{
"epoch": 0.66,
"learning_rate": 2.913367602061552e-06,
"loss": 0.0027,
"reward": -0.0474,
"step": 4659
},
{
"epoch": 0.66,
"learning_rate": 2.832871244114375e-06,
"loss": 0.0033,
"reward": -0.287,
"step": 4669
},
{
"epoch": 0.66,
"learning_rate": 2.753060676894588e-06,
"loss": 0.0029,
"reward": -0.2081,
"step": 4679
},
{
"epoch": 0.66,
"learning_rate": 2.6739611565604947e-06,
"loss": 0.0036,
"reward": -0.1504,
"step": 4689
},
{
"epoch": 0.67,
"learning_rate": 2.5955977142586946e-06,
"loss": 0.0033,
"reward": -0.2813,
"step": 4699
},
{
"epoch": 0.67,
"learning_rate": 2.5179951482029225e-06,
"loss": 0.0029,
"reward": -0.3317,
"step": 4709
},
{
"epoch": 0.67,
"learning_rate": 2.4411780158266533e-06,
"loss": 0.0029,
"reward": -0.1715,
"step": 4719
},
{
"epoch": 0.67,
"learning_rate": 2.3651706260118184e-06,
"loss": 0.0029,
"reward": -0.1342,
"step": 4729
},
{
"epoch": 0.67,
"learning_rate": 2.289997031396286e-06,
"loss": 0.0032,
"reward": -0.0324,
"step": 4739
},
{
"epoch": 0.67,
"learning_rate": 2.215681020762313e-06,
"loss": 0.0032,
"reward": -0.1613,
"step": 4749
},
{
"epoch": 0.67,
"learning_rate": 2.1422461115086167e-06,
"loss": 0.0027,
"reward": -0.2861,
"step": 4759
},
{
"epoch": 0.68,
"learning_rate": 2.069715542208207e-06,
"loss": 0.0026,
"reward": -0.181,
"step": 4769
},
{
"epoch": 0.68,
"learning_rate": 1.998112265254541e-06,
"loss": 0.0028,
"reward": -0.1974,
"step": 4779
},
{
"epoch": 0.68,
"learning_rate": 1.9274589395981937e-06,
"loss": 0.0036,
"reward": -0.2612,
"step": 4789
},
{
"epoch": 0.68,
"learning_rate": 1.8577779235764249e-06,
"loss": 0.0031,
"reward": -0.1949,
"step": 4799
},
{
"epoch": 0.68,
"learning_rate": 1.7890912678378392e-06,
"loss": 0.0033,
"reward": -0.1398,
"step": 4809
},
{
"epoch": 0.68,
"learning_rate": 1.7214207083644098e-06,
"loss": 0.0032,
"reward": -0.1068,
"step": 4819
},
{
"epoch": 0.68,
"learning_rate": 1.6547876595931444e-06,
"loss": 0.0028,
"reward": -0.2427,
"step": 4829
},
{
"epoch": 0.69,
"learning_rate": 1.5892132076394151e-06,
"loss": 0.0031,
"reward": -0.0524,
"step": 4839
},
{
"epoch": 0.69,
"learning_rate": 1.524718103624252e-06,
"loss": 0.0029,
"reward": -0.1146,
"step": 4849
},
{
"epoch": 0.69,
"learning_rate": 1.4613227571076138e-06,
"loss": 0.0029,
"reward": -0.1041,
"step": 4859
},
{
"epoch": 0.69,
"learning_rate": 1.3990472296297808e-06,
"loss": 0.0025,
"reward": -0.2616,
"step": 4869
},
{
"epoch": 0.69,
"learning_rate": 1.3379112283628081e-06,
"loss": 0.0029,
"reward": -0.187,
"step": 4879
},
{
"epoch": 0.69,
"learning_rate": 1.2779340998742185e-06,
"loss": 0.0032,
"reward": -0.1955,
"step": 4889
},
{
"epoch": 0.69,
"learning_rate": 1.219134824004704e-06,
"loss": 0.0028,
"reward": -0.106,
"step": 4899
},
{
"epoch": 0.7,
"learning_rate": 1.16153200786198e-06,
"loss": 0.0032,
"reward": -0.2001,
"step": 4909
},
{
"epoch": 0.7,
"learning_rate": 1.1051438799324999e-06,
"loss": 0.0029,
"reward": -0.1302,
"step": 4919
},
{
"epoch": 0.7,
"learning_rate": 1.0499882843130487e-06,
"loss": 0.0032,
"reward": -0.2308,
"step": 4929
},
{
"epoch": 0.7,
"learning_rate": 9.96082675063948e-07,
"loss": 0.0028,
"reward": -0.2366,
"step": 4939
},
{
"epoch": 0.7,
"learning_rate": 9.43444110685714e-07,
"loss": 0.003,
"reward": -0.1516,
"step": 4949
},
{
"epoch": 0.7,
"learning_rate": 8.920892487208343e-07,
"loss": 0.0034,
"reward": 0.0169,
"step": 4959
},
{
"epoch": 0.7,
"learning_rate": 8.420343404825132e-07,
"loss": 0.0026,
"reward": -0.0979,
"step": 4969
},
{
"epoch": 0.7,
"learning_rate": 7.932952259118776e-07,
"loss": 0.003,
"reward": -0.1379,
"step": 4979
},
{
"epoch": 0.71,
"learning_rate": 7.458873285654489e-07,
"loss": 0.0035,
"reward": -0.0396,
"step": 4989
},
{
"epoch": 0.71,
"learning_rate": 6.998256507343016e-07,
"loss": 0.0027,
"reward": 0.0041,
"step": 4999
},
{
"epoch": 0.71,
"learning_rate": 6.551247686965872e-07,
"loss": 0.0034,
"reward": -0.1662,
"step": 5009
},
{
"epoch": 0.71,
"learning_rate": 6.117988281048626e-07,
"loss": 0.0032,
"reward": -0.1413,
"step": 5019
},
{
"epoch": 0.71,
"learning_rate": 5.698615395096485e-07,
"loss": 0.0029,
"reward": -0.2091,
"step": 5029
},
{
"epoch": 0.71,
"learning_rate": 5.293261740207456e-07,
"loss": 0.0028,
"reward": -0.2433,
"step": 5039
},
{
"epoch": 0.71,
"learning_rate": 4.902055591075355e-07,
"loss": 0.0031,
"reward": -0.2965,
"step": 5049
},
{
"epoch": 0.72,
"learning_rate": 4.525120745397493e-07,
"loss": 0.0029,
"reward": -0.1276,
"step": 5059
},
{
"epoch": 0.72,
"learning_rate": 4.1625764846984276e-07,
"loss": 0.0031,
"reward": -0.1481,
"step": 5069
},
{
"epoch": 0.72,
"learning_rate": 3.814537536583318e-07,
"loss": 0.0028,
"reward": -0.1388,
"step": 5079
},
{
"epoch": 0.72,
"learning_rate": 3.481114038432176e-07,
"loss": 0.003,
"reward": -0.2624,
"step": 5089
},
{
"epoch": 0.72,
"learning_rate": 3.1624115025468695e-07,
"loss": 0.0029,
"reward": -0.0321,
"step": 5099
},
{
"epoch": 0.72,
"learning_rate": 2.8585307827613764e-07,
"loss": 0.0029,
"reward": -0.2161,
"step": 5109
},
{
"epoch": 0.72,
"learning_rate": 2.569568042526721e-07,
"loss": 0.0035,
"reward": -0.2493,
"step": 5119
},
{
"epoch": 0.73,
"learning_rate": 2.2956147244796946e-07,
"loss": 0.0031,
"reward": -0.1123,
"step": 5129
},
{
"epoch": 0.73,
"learning_rate": 2.0367575215059222e-07,
"loss": 0.0031,
"reward": -0.1892,
"step": 5139
},
{
"epoch": 0.73,
"learning_rate": 1.7930783493055936e-07,
"loss": 0.0031,
"reward": -0.3653,
"step": 5149
},
{
"epoch": 0.73,
"learning_rate": 1.5646543204712595e-07,
"loss": 0.003,
"reward": -0.1532,
"step": 5159
},
{
"epoch": 0.73,
"learning_rate": 1.3515577200853946e-07,
"loss": 0.0031,
"reward": -0.2271,
"step": 5169
},
{
"epoch": 0.73,
"learning_rate": 1.1538559828457586e-07,
"loss": 0.0033,
"reward": -0.1938,
"step": 5179
},
{
"epoch": 0.73,
"learning_rate": 9.716116717254698e-08,
"loss": 0.003,
"reward": -0.2875,
"step": 5189
},
{
"epoch": 0.74,
"learning_rate": 8.048824581750325e-08,
"loss": 0.0032,
"reward": -0.1349,
"step": 5199
},
{
"epoch": 0.74,
"learning_rate": 6.537211038719571e-08,
"loss": 0.0027,
"reward": -0.1755,
"step": 5209
},
{
"epoch": 0.74,
"learning_rate": 5.1817544402442686e-08,
"loss": 0.0033,
"reward": -0.1694,
"step": 5219
},
{
"epoch": 0.74,
"learning_rate": 3.9828837223365166e-08,
"loss": 0.0027,
"reward": -0.1913,
"step": 5229
},
{
"epoch": 0.74,
"learning_rate": 2.9409782692019218e-08,
"loss": 0.0032,
"reward": -0.2301,
"step": 5239
},
{
"epoch": 0.74,
"learning_rate": 2.056367793183134e-08,
"loss": 0.0032,
"reward": -0.3049,
"step": 5249
},
{
"epoch": 0.74,
"learning_rate": 1.3293322304213652e-08,
"loss": 0.0034,
"reward": -0.2132,
"step": 5259
},
{
"epoch": 0.75,
"learning_rate": 7.601016522708616e-09,
"loss": 0.0031,
"reward": -0.2705,
"step": 5269
},
{
"epoch": 0.75,
"learning_rate": 3.4885619249203086e-09,
"loss": 0.0031,
"reward": -0.1178,
"step": 5279
},
{
"epoch": 0.75,
"learning_rate": 9.572599024820773e-10,
"loss": 0.0031,
"reward": -0.1919,
"step": 5289
},
{
"epoch": 0.75,
"learning_rate": 7.911489227074853e-12,
"loss": 0.0029,
"reward": -0.0784,
"step": 5299
},
{
"epoch": 0.75,
"learning_rate": 6.408171077015856e-10,
"loss": 0.0031,
"reward": -0.1285,
"step": 5309
},
{
"epoch": 0.75,
"learning_rate": 2.8557764740955172e-09,
"loss": 0.0031,
"reward": -0.1184,
"step": 5319
},
{
"epoch": 0.75,
"learning_rate": 6.652088661621703e-09,
"loss": 0.003,
"reward": -0.2125,
"step": 5329
},
{
"epoch": 0.76,
"learning_rate": 1.2028552322327358e-08,
"loss": 0.004,
"reward": -0.202,
"step": 5339
},
{
"epoch": 0.76,
"learning_rate": 1.898346606725887e-08,
"loss": 0.0032,
"reward": -0.206,
"step": 5349
},
{
"epoch": 0.76,
"learning_rate": 2.7514629004871673e-08,
"loss": 0.0033,
"reward": -0.2027,
"step": 5359
},
{
"epoch": 0.76,
"learning_rate": 3.761934143750256e-08,
"loss": 0.0044,
"reward": -0.2893,
"step": 5369
},
{
"epoch": 0.76,
"learning_rate": 4.9294405715696324e-08,
"loss": 0.0032,
"reward": -0.1786,
"step": 5379
},
{
"epoch": 0.76,
"learning_rate": 6.253612725009962e-08,
"loss": 0.0033,
"reward": -0.26,
"step": 5389
},
{
"epoch": 0.76,
"learning_rate": 7.734031568062683e-08,
"loss": 0.0024,
"reward": -0.2568,
"step": 5399
},
{
"epoch": 0.77,
"learning_rate": 9.370228620249778e-08,
"loss": 0.0037,
"reward": -0.1439,
"step": 5409
},
{
"epoch": 0.77,
"learning_rate": 1.116168610487578e-07,
"loss": 0.0028,
"reward": -0.2807,
"step": 5419
},
{
"epoch": 0.77,
"learning_rate": 1.3107837112877664e-07,
"loss": 0.0032,
"reward": -0.2143,
"step": 5429
},
{
"epoch": 0.77,
"learning_rate": 1.5208065782225667e-07,
"loss": 0.0031,
"reward": -0.2846,
"step": 5439
},
{
"epoch": 0.77,
"learning_rate": 1.7461707492811786e-07,
"loss": 0.0032,
"reward": -0.2704,
"step": 5449
},
{
"epoch": 0.77,
"learning_rate": 1.9868049076771478e-07,
"loss": 0.0035,
"reward": -0.2247,
"step": 5459
},
{
"epoch": 0.77,
"learning_rate": 2.2426329044164808e-07,
"loss": 0.0032,
"reward": -0.1316,
"step": 5469
},
{
"epoch": 0.78,
"learning_rate": 2.5135737823952457e-07,
"loss": 0.0033,
"reward": -0.1427,
"step": 5479
},
{
"epoch": 0.78,
"learning_rate": 2.7995418020185016e-07,
"loss": 0.0029,
"reward": -0.2073,
"step": 5489
},
{
"epoch": 0.78,
"learning_rate": 3.100446468332596e-07,
"loss": 0.0026,
"reward": -0.1949,
"step": 5499
},
{
"epoch": 0.78,
"learning_rate": 3.41619255966264e-07,
"loss": 0.0028,
"reward": -0.2821,
"step": 5509
},
{
"epoch": 0.78,
"learning_rate": 3.746680157745258e-07,
"loss": 0.0029,
"reward": -0.1413,
"step": 5519
},
{
"epoch": 0.78,
"learning_rate": 4.091804679348144e-07,
"loss": 0.0032,
"reward": -0.2282,
"step": 5529
},
{
"epoch": 0.78,
"learning_rate": 4.451456909365265e-07,
"loss": 0.003,
"reward": -0.1473,
"step": 5539
},
{
"epoch": 0.79,
"learning_rate": 4.825523035378365e-07,
"loss": 0.0032,
"reward": -0.1949,
"step": 5549
},
{
"epoch": 0.79,
"learning_rate": 5.213884683672954e-07,
"loss": 0.0041,
"reward": -0.0118,
"step": 5559
},
{
"epoch": 0.79,
"learning_rate": 5.61641895669785e-07,
"loss": 0.0031,
"reward": -0.1958,
"step": 5569
},
{
"epoch": 0.79,
"learning_rate": 6.03299847195613e-07,
"loss": 0.003,
"reward": -0.0907,
"step": 5579
},
{
"epoch": 0.79,
"learning_rate": 6.46349140231578e-07,
"loss": 0.0032,
"reward": -0.1633,
"step": 5589
},
{
"epoch": 0.79,
"learning_rate": 6.907761517726225e-07,
"loss": 0.0037,
"reward": -0.3335,
"step": 5599
},
{
"epoch": 0.79,
"learning_rate": 7.365668228328832e-07,
"loss": 0.0032,
"reward": -0.2729,
"step": 5609
},
{
"epoch": 0.8,
"learning_rate": 7.837066628946427e-07,
"loss": 0.004,
"reward": -0.2201,
"step": 5619
},
{
"epoch": 0.8,
"learning_rate": 8.321807544939037e-07,
"loss": 0.0035,
"reward": -0.1243,
"step": 5629
},
{
"epoch": 0.8,
"learning_rate": 8.819737579410198e-07,
"loss": 0.003,
"reward": -0.1845,
"step": 5639
},
{
"epoch": 0.8,
"learning_rate": 9.33069916174974e-07,
"loss": 0.0032,
"reward": -0.2037,
"step": 5649
},
{
"epoch": 0.8,
"learning_rate": 9.854530597496953e-07,
"loss": 0.0031,
"reward": -0.0922,
"step": 5659
},
{
"epoch": 0.8,
"learning_rate": 1.039106611950943e-06,
"loss": 0.0025,
"reward": -0.2618,
"step": 5669
},
{
"epoch": 0.8,
"learning_rate": 1.0940135940419828e-06,
"loss": 0.003,
"reward": -0.1229,
"step": 5679
},
{
"epoch": 0.81,
"learning_rate": 1.1501566306365758e-06,
"loss": 0.0029,
"reward": -0.0647,
"step": 5689
},
{
"epoch": 0.81,
"learning_rate": 1.2075179551973986e-06,
"loss": 0.003,
"reward": -0.1913,
"step": 5699
},
{
"epoch": 0.81,
"learning_rate": 1.2660794156583233e-06,
"loss": 0.0028,
"reward": -0.1541,
"step": 5709
},
{
"epoch": 0.81,
"learning_rate": 1.3258224801686503e-06,
"loss": 0.0033,
"reward": -0.1554,
"step": 5719
},
{
"epoch": 0.81,
"learning_rate": 1.3867282429575152e-06,
"loss": 0.0041,
"reward": -0.3137,
"step": 5729
},
{
"epoch": 0.81,
"learning_rate": 1.4487774303166875e-06,
"loss": 0.0033,
"reward": -0.2465,
"step": 5739
},
{
"epoch": 0.81,
"learning_rate": 1.511950406699712e-06,
"loss": 0.0038,
"reward": -0.2697,
"step": 5749
},
{
"epoch": 0.82,
"learning_rate": 1.576227180935655e-06,
"loss": 0.0032,
"reward": -0.155,
"step": 5759
},
{
"epoch": 0.82,
"learning_rate": 1.641587412555281e-06,
"loss": 0.0026,
"reward": -0.128,
"step": 5769
},
{
"epoch": 0.82,
"learning_rate": 1.7080104182278716e-06,
"loss": 0.0036,
"reward": -0.1322,
"step": 5779
},
{
"epoch": 0.82,
"learning_rate": 1.7754751783064622e-06,
"loss": 0.0036,
"reward": -0.2405,
"step": 5789
},
{
"epoch": 0.82,
"learning_rate": 1.8439603434795516e-06,
"loss": 0.0032,
"reward": -0.1742,
"step": 5799
},
{
"epoch": 0.82,
"learning_rate": 1.9134442415270926e-06,
"loss": 0.003,
"reward": -0.1415,
"step": 5809
},
{
"epoch": 0.82,
"learning_rate": 1.983904884178742e-06,
"loss": 0.0027,
"reward": -0.3485,
"step": 5819
},
{
"epoch": 0.83,
"learning_rate": 2.0553199740720214e-06,
"loss": 0.0033,
"reward": -0.2301,
"step": 5829
},
{
"epoch": 0.83,
"learning_rate": 2.1276669118084043e-06,
"loss": 0.0038,
"reward": -0.419,
"step": 5839
},
{
"epoch": 0.83,
"learning_rate": 2.200922803104868e-06,
"loss": 0.0028,
"reward": -0.1832,
"step": 5849
},
{
"epoch": 0.83,
"learning_rate": 2.2750644660388675e-06,
"loss": 0.0028,
"reward": -0.1299,
"step": 5859
},
{
"epoch": 0.83,
"learning_rate": 2.3500684383842605e-06,
"loss": 0.0027,
"reward": -0.0986,
"step": 5869
},
{
"epoch": 0.83,
"learning_rate": 2.4259109850359695e-06,
"loss": 0.0027,
"reward": -0.1309,
"step": 5879
},
{
"epoch": 0.83,
"learning_rate": 2.502568105520961e-06,
"loss": 0.0034,
"reward": -0.1672,
"step": 5889
},
{
"epoch": 0.84,
"learning_rate": 2.580015541593277e-06,
"loss": 0.0031,
"reward": -0.1751,
"step": 5899
},
{
"epoch": 0.84,
"learning_rate": 2.6582287849105274e-06,
"loss": 0.0031,
"reward": -0.2151,
"step": 5909
},
{
"epoch": 0.84,
"learning_rate": 2.737183084789651e-06,
"loss": 0.0029,
"reward": -0.1455,
"step": 5919
},
{
"epoch": 0.84,
"learning_rate": 2.816853456039244e-06,
"loss": 0.0026,
"reward": -0.1059,
"step": 5929
},
{
"epoch": 0.84,
"learning_rate": 2.8972146868661828e-06,
"loss": 0.0028,
"reward": -0.1925,
"step": 5939
},
{
"epoch": 0.84,
"learning_rate": 2.9782413468539417e-06,
"loss": 0.0028,
"reward": -0.3239,
"step": 5949
},
{
"epoch": 0.84,
"learning_rate": 3.0599077950100608e-06,
"loss": 0.0039,
"reward": -0.2807,
"step": 5959
},
{
"epoch": 0.85,
"learning_rate": 3.1421881878802864e-06,
"loss": 0.0028,
"reward": -0.2573,
"step": 5969
},
{
"epoch": 0.85,
"learning_rate": 3.225056487726732e-06,
"loss": 0.003,
"reward": -0.2115,
"step": 5979
},
{
"epoch": 0.85,
"learning_rate": 3.308486470767613e-06,
"loss": 0.0028,
"reward": -0.1847,
"step": 5989
},
{
"epoch": 0.85,
"learning_rate": 3.3924517354757315e-06,
"loss": 0.0027,
"reward": -0.1512,
"step": 5999
},
{
"epoch": 0.85,
"learning_rate": 3.476925710933339e-06,
"loss": 0.0031,
"reward": -0.0941,
"step": 6009
},
{
"epoch": 0.85,
"learning_rate": 3.5618816652404714e-06,
"loss": 0.0033,
"reward": -0.2708,
"step": 6019
},
{
"epoch": 0.85,
"learning_rate": 3.6472927139743546e-06,
"loss": 0.0024,
"reward": -0.1351,
"step": 6029
},
{
"epoch": 0.86,
"learning_rate": 3.7331318286969787e-06,
"loss": 0.003,
"reward": -0.2288,
"step": 6039
},
{
"epoch": 0.86,
"learning_rate": 3.819371845508301e-06,
"loss": 0.0025,
"reward": -0.1268,
"step": 6049
},
{
"epoch": 0.86,
"learning_rate": 3.9059854736422616e-06,
"loss": 0.0035,
"reward": -0.1775,
"step": 6059
},
{
"epoch": 0.86,
"learning_rate": 3.992945304103046e-06,
"loss": 0.0029,
"reward": -0.1164,
"step": 6069
},
{
"epoch": 0.86,
"learning_rate": 4.080223818338627e-06,
"loss": 0.003,
"reward": -0.3306,
"step": 6079
},
{
"epoch": 0.86,
"learning_rate": 4.16779339694911e-06,
"loss": 0.0029,
"reward": -0.124,
"step": 6089
},
{
"epoch": 0.86,
"learning_rate": 4.25562632842687e-06,
"loss": 0.0029,
"reward": -0.1823,
"step": 6099
},
{
"epoch": 0.86,
"learning_rate": 4.343694817925945e-06,
"loss": 0.0026,
"reward": -0.2144,
"step": 6109
},
{
"epoch": 0.87,
"learning_rate": 4.4319709960577205e-06,
"loss": 0.0028,
"reward": -0.0664,
"step": 6119
},
{
"epoch": 0.87,
"learning_rate": 4.520426927710255e-06,
"loss": 0.0027,
"reward": -0.1177,
"step": 6129
},
{
"epoch": 0.87,
"learning_rate": 4.609034620888349e-06,
"loss": 0.0034,
"reward": -0.135,
"step": 6139
},
{
"epoch": 0.87,
"learning_rate": 4.697766035571671e-06,
"loss": 0.0043,
"reward": -0.3131,
"step": 6149
},
{
"epoch": 0.87,
"learning_rate": 4.786593092588086e-06,
"loss": 0.0024,
"reward": -0.234,
"step": 6159
},
{
"epoch": 0.87,
"learning_rate": 4.875487682499278e-06,
"loss": 0.0027,
"reward": -0.0573,
"step": 6169
},
{
"epoch": 0.87,
"learning_rate": 4.96442167449609e-06,
"loss": 0.0034,
"reward": -0.1771,
"step": 6179
},
{
"epoch": 0.88,
"learning_rate": 5.053366925300511e-06,
"loss": 0.0028,
"reward": -0.1481,
"step": 6189
},
{
"epoch": 0.88,
"learning_rate": 5.142295288071675e-06,
"loss": 0.0041,
"reward": -0.2635,
"step": 6199
},
{
"epoch": 0.88,
"learning_rate": 5.2311786213129315e-06,
"loss": 0.003,
"reward": -0.1914,
"step": 6209
},
{
"epoch": 0.88,
"learning_rate": 5.319988797777316e-06,
"loss": 0.0027,
"reward": -0.181,
"step": 6219
},
{
"epoch": 0.88,
"learning_rate": 5.408697713368388e-06,
"loss": 0.0028,
"reward": -0.1659,
"step": 6229
},
{
"epoch": 0.88,
"learning_rate": 5.4972772960338784e-06,
"loss": 0.0024,
"reward": -0.0837,
"step": 6239
},
{
"epoch": 0.88,
"learning_rate": 5.585699514649057e-06,
"loss": 0.003,
"reward": -0.1156,
"step": 6249
},
{
"epoch": 0.89,
"learning_rate": 5.6739363878872754e-06,
"loss": 0.003,
"reward": -0.1918,
"step": 6259
},
{
"epoch": 0.89,
"learning_rate": 5.761959993074669e-06,
"loss": 0.0027,
"reward": -0.2119,
"step": 6269
},
{
"epoch": 0.89,
"learning_rate": 5.84974247502633e-06,
"loss": 0.0031,
"reward": -0.1654,
"step": 6279
},
{
"epoch": 0.89,
"learning_rate": 5.93725605486109e-06,
"loss": 0.0029,
"reward": -0.1048,
"step": 6289
},
{
"epoch": 0.89,
"learning_rate": 6.024473038792242e-06,
"loss": 0.0029,
"reward": -0.1743,
"step": 6299
},
{
"epoch": 0.89,
"learning_rate": 6.1113658268912065e-06,
"loss": 0.0028,
"reward": -0.1883,
"step": 6309
},
{
"epoch": 0.89,
"learning_rate": 6.197906921821632e-06,
"loss": 0.0027,
"reward": -0.1351,
"step": 6319
},
{
"epoch": 0.9,
"learning_rate": 6.284068937540894e-06,
"loss": 0.0027,
"reward": -0.1687,
"step": 6329
},
{
"epoch": 0.9,
"learning_rate": 6.369824607966508e-06,
"loss": 0.0028,
"reward": -0.1027,
"step": 6339
},
{
"epoch": 0.9,
"learning_rate": 6.455146795604479e-06,
"loss": 0.0028,
"reward": -0.0773,
"step": 6349
},
{
"epoch": 0.9,
"learning_rate": 6.5400085001370186e-06,
"loss": 0.0025,
"reward": -0.2412,
"step": 6359
},
{
"epoch": 0.9,
"learning_rate": 6.624382866966794e-06,
"loss": 0.003,
"reward": -0.1633,
"step": 6369
},
{
"epoch": 0.9,
"learning_rate": 6.708243195715136e-06,
"loss": 0.0026,
"reward": -0.2322,
"step": 6379
},
{
"epoch": 0.9,
"learning_rate": 6.791562948671414e-06,
"loss": 0.0032,
"reward": -0.1261,
"step": 6389
},
{
"epoch": 0.91,
"learning_rate": 6.874315759190886e-06,
"loss": 0.0025,
"reward": -0.1441,
"step": 6399
},
{
"epoch": 0.91,
"learning_rate": 6.956475440038507e-06,
"loss": 0.0031,
"reward": -0.0489,
"step": 6409
},
{
"epoch": 0.91,
"learning_rate": 7.038015991675885e-06,
"loss": 0.0028,
"reward": -0.1033,
"step": 6419
},
{
"epoch": 0.91,
"learning_rate": 7.118911610488885e-06,
"loss": 0.0027,
"reward": -0.2106,
"step": 6429
},
{
"epoch": 0.91,
"learning_rate": 7.199136696953178e-06,
"loss": 0.0036,
"reward": -0.0478,
"step": 6439
},
{
"epoch": 0.91,
"learning_rate": 7.2786658637353125e-06,
"loss": 0.003,
"reward": -0.218,
"step": 6449
},
{
"epoch": 0.91,
"learning_rate": 7.357473943726493e-06,
"loss": 0.0026,
"reward": -0.0883,
"step": 6459
},
{
"epoch": 0.92,
"learning_rate": 7.435535998006814e-06,
"loss": 0.0034,
"reward": -0.2207,
"step": 6469
},
{
"epoch": 0.92,
"learning_rate": 7.512827323737152e-06,
"loss": 0.0033,
"reward": -0.192,
"step": 6479
},
{
"epoch": 0.92,
"learning_rate": 7.589323461976461e-06,
"loss": 0.0028,
"reward": -0.1123,
"step": 6489
},
{
"epoch": 0.92,
"learning_rate": 7.665000205421812e-06,
"loss": 0.0029,
"reward": -0.2671,
"step": 6499
},
{
"epoch": 0.92,
"learning_rate": 7.739833606068832e-06,
"loss": 0.0031,
"reward": -0.1165,
"step": 6509
},
{
"epoch": 0.92,
"learning_rate": 7.81379998279006e-06,
"loss": 0.0029,
"reward": -0.1378,
"step": 6519
},
{
"epoch": 0.92,
"learning_rate": 7.88687592882892e-06,
"loss": 0.0033,
"reward": -0.1337,
"step": 6529
},
{
"epoch": 0.93,
"learning_rate": 7.959038319206758e-06,
"loss": 0.0027,
"reward": -0.2007,
"step": 6539
},
{
"epoch": 0.93,
"learning_rate": 8.03026431804083e-06,
"loss": 0.0026,
"reward": -0.1243,
"step": 6549
},
{
"epoch": 0.93,
"learning_rate": 8.100531385770696e-06,
"loss": 0.0029,
"reward": -0.0918,
"step": 6559
},
{
"epoch": 0.93,
"learning_rate": 8.169817286290935e-06,
"loss": 0.0029,
"reward": -0.0355,
"step": 6569
},
{
"epoch": 0.93,
"learning_rate": 8.238100093987765e-06,
"loss": 0.0027,
"reward": -0.2479,
"step": 6579
},
{
"epoch": 0.93,
"learning_rate": 8.305358200677449e-06,
"loss": 0.0026,
"reward": -0.074,
"step": 6589
},
{
"epoch": 0.93,
"learning_rate": 8.371570322444209e-06,
"loss": 0.0029,
"reward": -0.0884,
"step": 6599
},
{
"epoch": 0.94,
"learning_rate": 8.436715506375557e-06,
"loss": 0.0028,
"reward": -0.1674,
"step": 6609
},
{
"epoch": 0.94,
"learning_rate": 8.500773137192906e-06,
"loss": 0.0035,
"reward": -0.2794,
"step": 6619
},
{
"epoch": 0.94,
"learning_rate": 8.56372294377524e-06,
"loss": 0.0027,
"reward": -0.1072,
"step": 6629
},
{
"epoch": 0.94,
"learning_rate": 8.625545005574002e-06,
"loss": 0.0023,
"reward": -0.0992,
"step": 6639
},
{
"epoch": 0.94,
"learning_rate": 8.686219758916918e-06,
"loss": 0.0029,
"reward": -0.1256,
"step": 6649
},
{
"epoch": 0.94,
"learning_rate": 8.745728003198991e-06,
"loss": 0.0039,
"reward": 0.0413,
"step": 6659
},
{
"epoch": 0.94,
"learning_rate": 8.804050906958537e-06,
"loss": 0.0026,
"reward": -0.1593,
"step": 6669
},
{
"epoch": 0.95,
"learning_rate": 8.861170013836436e-06,
"loss": 0.0031,
"reward": -0.0352,
"step": 6679
},
{
"epoch": 0.95,
"learning_rate": 8.917067248416647e-06,
"loss": 0.003,
"reward": -0.0722,
"step": 6689
},
{
"epoch": 0.95,
"learning_rate": 8.971724921946235e-06,
"loss": 0.0027,
"reward": -0.1898,
"step": 6699
},
{
"epoch": 0.95,
"learning_rate": 9.025125737932962e-06,
"loss": 0.003,
"reward": -0.1166,
"step": 6709
},
{
"epoch": 0.95,
"learning_rate": 9.077252797618818e-06,
"loss": 0.0024,
"reward": 0.0003,
"step": 6719
},
{
"epoch": 0.95,
"learning_rate": 9.128089605327627e-06,
"loss": 0.0028,
"reward": -0.0084,
"step": 6729
},
{
"epoch": 0.95,
"learning_rate": 9.177620073685139e-06,
"loss": 0.0025,
"reward": 0.0006,
"step": 6739
},
{
"epoch": 0.96,
"learning_rate": 9.225828528709911e-06,
"loss": 0.0025,
"reward": -0.0565,
"step": 6749
},
{
"epoch": 0.96,
"learning_rate": 9.27269971477334e-06,
"loss": 0.0027,
"reward": -0.2262,
"step": 6759
},
{
"epoch": 0.96,
"learning_rate": 9.318218799427321e-06,
"loss": 0.0024,
"reward": -0.2918,
"step": 6769
},
{
"epoch": 0.96,
"learning_rate": 9.362371378098033e-06,
"loss": 0.0027,
"reward": -0.1157,
"step": 6779
},
{
"epoch": 0.96,
"learning_rate": 9.405143478644232e-06,
"loss": 0.0031,
"reward": -0.1187,
"step": 6789
},
{
"epoch": 0.96,
"learning_rate": 9.446521565778804e-06,
"loss": 0.0028,
"reward": -0.0971,
"step": 6799
},
{
"epoch": 0.96,
"learning_rate": 9.486492545351985e-06,
"loss": 0.0029,
"reward": -0.0164,
"step": 6809
},
{
"epoch": 0.97,
"learning_rate": 9.525043768495047e-06,
"loss": 0.0029,
"reward": -0.2,
"step": 6819
},
{
"epoch": 0.97,
"learning_rate": 9.562163035623038e-06,
"loss": 0.0022,
"reward": 0.0569,
"step": 6829
},
{
"epoch": 0.97,
"learning_rate": 9.597838600295355e-06,
"loss": 0.0023,
"reward": -0.163,
"step": 6839
},
{
"epoch": 0.97,
"learning_rate": 9.632059172932935e-06,
"loss": 0.0027,
"reward": -0.1407,
"step": 6849
},
{
"epoch": 0.97,
"learning_rate": 9.664813924390828e-06,
"loss": 0.0026,
"reward": -0.1893,
"step": 6859
},
{
"epoch": 0.97,
"learning_rate": 9.696092489385132e-06,
"loss": 0.0027,
"reward": -0.0518,
"step": 6869
},
{
"epoch": 0.97,
"learning_rate": 9.72588496977306e-06,
"loss": 0.0026,
"reward": -0.1236,
"step": 6879
},
{
"epoch": 0.98,
"learning_rate": 9.754181937685261e-06,
"loss": 0.0026,
"reward": -0.2622,
"step": 6889
},
{
"epoch": 0.98,
"learning_rate": 9.780974438509254e-06,
"loss": 0.0025,
"reward": -0.1576,
"step": 6899
},
{
"epoch": 0.98,
"learning_rate": 9.80625399372313e-06,
"loss": 0.0036,
"reward": -0.0548,
"step": 6909
},
{
"epoch": 0.98,
"learning_rate": 9.8300126035786e-06,
"loss": 0.0037,
"reward": -0.219,
"step": 6919
},
{
"epoch": 0.98,
"learning_rate": 9.852242749632524e-06,
"loss": 0.0028,
"reward": -0.0597,
"step": 6929
},
{
"epoch": 0.98,
"learning_rate": 9.872937397126121e-06,
"loss": 0.0024,
"reward": -0.133,
"step": 6939
},
{
"epoch": 0.98,
"learning_rate": 9.89208999721115e-06,
"loss": 0.0027,
"reward": -0.0708,
"step": 6949
},
{
"epoch": 0.99,
"learning_rate": 9.909694489022273e-06,
"loss": 0.0026,
"reward": -0.0812,
"step": 6959
},
{
"epoch": 0.99,
"learning_rate": 9.92574530159505e-06,
"loss": 0.0031,
"reward": 0.0002,
"step": 6969
},
{
"epoch": 0.99,
"learning_rate": 9.940237355628861e-06,
"loss": 0.003,
"reward": 0.0289,
"step": 6979
},
{
"epoch": 0.99,
"learning_rate": 9.95316606509427e-06,
"loss": 0.0029,
"reward": -0.0353,
"step": 6989
},
{
"epoch": 0.99,
"learning_rate": 9.964527338684262e-06,
"loss": 0.0023,
"reward": -0.0618,
"step": 6999
},
{
"epoch": 0.99,
"learning_rate": 9.974317581108963e-06,
"loss": 0.003,
"reward": -0.1512,
"step": 7009
},
{
"epoch": 0.99,
"learning_rate": 9.98253369423336e-06,
"loss": 0.0032,
"reward": 0.0394,
"step": 7019
},
{
"epoch": 1.0,
"learning_rate": 9.989173078057715e-06,
"loss": 0.0027,
"reward": -0.081,
"step": 7029
},
{
"epoch": 1.0,
"learning_rate": 9.99423363154034e-06,
"loss": 0.0028,
"reward": -0.0309,
"step": 7039
},
{
"epoch": 1.0,
"learning_rate": 9.99771375326247e-06,
"loss": 0.0025,
"reward": -0.1167,
"step": 7049
},
{
"epoch": 1.0,
"learning_rate": 9.999612341935039e-06,
"loss": 0.0024,
"reward": -0.0893,
"step": 7059
}
],
"max_steps": 7063,
"num_train_epochs": 1.0,
"total_flos": 0,
"trial_name": null,
"trial_params": null
}