sudoku-wll-neg-pos-2500 / trainer_state.json
RaresDolga's picture
Upload folder using huggingface_hub
9b76c63 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0025012506253126563,
"eval_steps": 1000,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 218.8541717529297,
"epoch": 1.0005002501250626e-06,
"grad_norm": 7.439283847808838,
"learning_rate": 3e-09,
"loss": 7.2972,
"reward": 0.0396825447678566,
"reward_std": 0.06933804973959923,
"rewards/sudoku_reward_func": 0.0396825410425663,
"step": 1,
"zero_std_ratio": 0.125
},
{
"epoch": 2.001000500250125e-06,
"grad_norm": 7.519765853881836,
"learning_rate": 6e-09,
"loss": 7.7068,
"step": 2
},
{
"epoch": 3.0015007503751877e-06,
"grad_norm": 12.11757755279541,
"learning_rate": 9.000000000000001e-09,
"loss": 8.009,
"step": 3
},
{
"epoch": 4.00200100050025e-06,
"grad_norm": 7.8453369140625,
"learning_rate": 1.2e-08,
"loss": 6.9207,
"step": 4
},
{
"epoch": 5.002501250625313e-06,
"grad_norm": 7.893465995788574,
"learning_rate": 1.5000000000000002e-08,
"loss": 7.3495,
"step": 5
},
{
"epoch": 6.003001500750375e-06,
"grad_norm": 7.371875286102295,
"learning_rate": 1.8000000000000002e-08,
"loss": 7.7256,
"step": 6
},
{
"epoch": 7.003501750875438e-06,
"grad_norm": 12.777957916259766,
"learning_rate": 2.1e-08,
"loss": 8.0202,
"step": 7
},
{
"epoch": 8.0040020010005e-06,
"grad_norm": 7.41762638092041,
"learning_rate": 2.4e-08,
"loss": 6.9169,
"step": 8
},
{
"completion_length": 230.6041717529297,
"epoch": 9.004502251125562e-06,
"grad_norm": 7.3082780838012695,
"learning_rate": 2.7e-08,
"loss": 2.7803,
"reward": 0.02740951138548553,
"reward_std": 0.0447020698338747,
"rewards/sudoku_reward_func": 0.02740951138548553,
"step": 9,
"zero_std_ratio": 0.375
},
{
"epoch": 1.0005002501250626e-05,
"grad_norm": 6.063836097717285,
"learning_rate": 3.0000000000000004e-08,
"loss": 2.5601,
"step": 10
},
{
"epoch": 1.1005502751375687e-05,
"grad_norm": 7.145730495452881,
"learning_rate": 3.3e-08,
"loss": 2.988,
"step": 11
},
{
"epoch": 1.200600300150075e-05,
"grad_norm": 6.994553565979004,
"learning_rate": 3.6000000000000005e-08,
"loss": 3.2755,
"step": 12
},
{
"epoch": 1.3006503251625812e-05,
"grad_norm": 8.28451919555664,
"learning_rate": 3.9e-08,
"loss": 2.7599,
"step": 13
},
{
"epoch": 1.4007003501750876e-05,
"grad_norm": 6.728439807891846,
"learning_rate": 4.2e-08,
"loss": 2.518,
"step": 14
},
{
"epoch": 1.5007503751875938e-05,
"grad_norm": 7.120697975158691,
"learning_rate": 4.5e-08,
"loss": 2.9896,
"step": 15
},
{
"epoch": 1.6008004002001e-05,
"grad_norm": 7.128693103790283,
"learning_rate": 4.8e-08,
"loss": 3.3136,
"step": 16
},
{
"completion_length": 235.87500762939453,
"epoch": 1.7008504252126064e-05,
"grad_norm": 11.133870124816895,
"learning_rate": 5.100000000000001e-08,
"loss": 1.6499,
"reward": 0.06601355969905853,
"reward_std": 0.08363537862896919,
"rewards/sudoku_reward_func": 0.06601355969905853,
"step": 17,
"zero_std_ratio": 0.25
},
{
"epoch": 1.8009004502251124e-05,
"grad_norm": 13.384941101074219,
"learning_rate": 5.4e-08,
"loss": 1.5589,
"step": 18
},
{
"epoch": 1.9009504752376188e-05,
"grad_norm": 9.533072471618652,
"learning_rate": 5.7e-08,
"loss": 1.4709,
"step": 19
},
{
"epoch": 2.001000500250125e-05,
"grad_norm": 39.541507720947266,
"learning_rate": 6.000000000000001e-08,
"loss": 3.2441,
"step": 20
},
{
"epoch": 2.1010505252626315e-05,
"grad_norm": 11.768808364868164,
"learning_rate": 6.300000000000001e-08,
"loss": 1.7169,
"step": 21
},
{
"epoch": 2.2011005502751375e-05,
"grad_norm": 14.177431106567383,
"learning_rate": 6.6e-08,
"loss": 1.5112,
"step": 22
},
{
"epoch": 2.3011505752876438e-05,
"grad_norm": 9.358668327331543,
"learning_rate": 6.9e-08,
"loss": 1.4272,
"step": 23
},
{
"epoch": 2.40120060030015e-05,
"grad_norm": 24.521499633789062,
"learning_rate": 7.200000000000001e-08,
"loss": 3.247,
"step": 24
},
{
"completion_length": 233.37500762939453,
"epoch": 2.5012506253126565e-05,
"grad_norm": 5.031166076660156,
"learning_rate": 7.500000000000001e-08,
"loss": 2.2568,
"reward": 0.022362764924764633,
"reward_std": 0.0353931887075305,
"rewards/sudoku_reward_func": 0.022362764924764633,
"step": 25,
"zero_std_ratio": 0.5
},
{
"epoch": 2.6013006503251625e-05,
"grad_norm": 5.3725972175598145,
"learning_rate": 7.8e-08,
"loss": 1.8075,
"step": 26
},
{
"epoch": 2.7013506753376688e-05,
"grad_norm": 5.61990213394165,
"learning_rate": 8.1e-08,
"loss": 1.7722,
"step": 27
},
{
"epoch": 2.8014007003501752e-05,
"grad_norm": 4.879170894622803,
"learning_rate": 8.4e-08,
"loss": 1.4841,
"step": 28
},
{
"epoch": 2.9014507253626812e-05,
"grad_norm": 5.542116165161133,
"learning_rate": 8.700000000000001e-08,
"loss": 2.2553,
"step": 29
},
{
"epoch": 3.0015007503751875e-05,
"grad_norm": 6.498071193695068,
"learning_rate": 9e-08,
"loss": 1.7906,
"step": 30
},
{
"epoch": 3.1015507753876935e-05,
"grad_norm": 5.03284215927124,
"learning_rate": 9.3e-08,
"loss": 1.7829,
"step": 31
},
{
"epoch": 3.2016008004002e-05,
"grad_norm": 5.980422019958496,
"learning_rate": 9.6e-08,
"loss": 1.4834,
"step": 32
},
{
"completion_length": 236.77084350585938,
"epoch": 3.301650825412706e-05,
"grad_norm": 10.07999324798584,
"learning_rate": 9.900000000000001e-08,
"loss": 4.6713,
"reward": 0.045345570892095566,
"reward_std": 0.060756947845220566,
"rewards/sudoku_reward_func": 0.04534556902945042,
"step": 33,
"zero_std_ratio": 0.375
},
{
"epoch": 3.401700850425213e-05,
"grad_norm": 10.255942344665527,
"learning_rate": 1.0200000000000001e-07,
"loss": 2.6158,
"step": 34
},
{
"epoch": 3.501750875437719e-05,
"grad_norm": 8.722171783447266,
"learning_rate": 1.0500000000000001e-07,
"loss": 3.5174,
"step": 35
},
{
"epoch": 3.601800900450225e-05,
"grad_norm": 15.56158447265625,
"learning_rate": 1.08e-07,
"loss": 4.9071,
"step": 36
},
{
"epoch": 3.7018509254627316e-05,
"grad_norm": 9.570954322814941,
"learning_rate": 1.11e-07,
"loss": 4.6103,
"step": 37
},
{
"epoch": 3.8019009504752376e-05,
"grad_norm": 9.923803329467773,
"learning_rate": 1.14e-07,
"loss": 2.5701,
"step": 38
},
{
"epoch": 3.9019509754877436e-05,
"grad_norm": 8.636165618896484,
"learning_rate": 1.17e-07,
"loss": 3.5118,
"step": 39
},
{
"epoch": 4.00200100050025e-05,
"grad_norm": 13.333678245544434,
"learning_rate": 1.2000000000000002e-07,
"loss": 4.9252,
"step": 40
},
{
"completion_length": 238.06250762939453,
"epoch": 4.102051025512756e-05,
"grad_norm": 7.653266429901123,
"learning_rate": 1.23e-07,
"loss": -1.3987,
"reward": 0.03835979010909796,
"reward_std": 0.06741865165531635,
"rewards/sudoku_reward_func": 0.03835978824645281,
"step": 41,
"zero_std_ratio": 0.125
},
{
"epoch": 4.202101050525263e-05,
"grad_norm": 9.044347763061523,
"learning_rate": 1.2600000000000002e-07,
"loss": -1.8583,
"step": 42
},
{
"epoch": 4.302151075537769e-05,
"grad_norm": 9.422472953796387,
"learning_rate": 1.29e-07,
"loss": -1.1338,
"step": 43
},
{
"epoch": 4.402201100550275e-05,
"grad_norm": 9.382232666015625,
"learning_rate": 1.32e-07,
"loss": -1.1015,
"step": 44
},
{
"epoch": 4.5022511255627816e-05,
"grad_norm": 7.433849334716797,
"learning_rate": 1.35e-07,
"loss": -1.3966,
"step": 45
},
{
"epoch": 4.6023011505752876e-05,
"grad_norm": 8.31912612915039,
"learning_rate": 1.38e-07,
"loss": -1.8804,
"step": 46
},
{
"epoch": 4.7023511755877936e-05,
"grad_norm": 9.417556762695312,
"learning_rate": 1.41e-07,
"loss": -1.1407,
"step": 47
},
{
"epoch": 4.8024012006003e-05,
"grad_norm": 9.155328750610352,
"learning_rate": 1.4400000000000002e-07,
"loss": -1.1026,
"step": 48
},
{
"completion_length": 238.95834350585938,
"epoch": 4.902451225612806e-05,
"grad_norm": 9.605969429016113,
"learning_rate": 1.47e-07,
"loss": 8.5284,
"reward": 0.038690481800585985,
"reward_std": 0.06603045156225562,
"rewards/sudoku_reward_func": 0.03869047784246504,
"step": 49,
"zero_std_ratio": 0.375
},
{
"epoch": 5.002501250625313e-05,
"grad_norm": 8.600493431091309,
"learning_rate": 1.5000000000000002e-07,
"loss": 8.2116,
"step": 50
},
{
"epoch": 5.102551275637819e-05,
"grad_norm": 9.14062213897705,
"learning_rate": 1.53e-07,
"loss": 8.6505,
"step": 51
},
{
"epoch": 5.202601300650325e-05,
"grad_norm": 10.15638256072998,
"learning_rate": 1.56e-07,
"loss": 8.738,
"step": 52
},
{
"epoch": 5.3026513256628317e-05,
"grad_norm": 9.743338584899902,
"learning_rate": 1.59e-07,
"loss": 8.5234,
"step": 53
},
{
"epoch": 5.4027013506753377e-05,
"grad_norm": 8.36423110961914,
"learning_rate": 1.62e-07,
"loss": 8.2441,
"step": 54
},
{
"epoch": 5.502751375687844e-05,
"grad_norm": 9.903505325317383,
"learning_rate": 1.65e-07,
"loss": 8.6868,
"step": 55
},
{
"epoch": 5.6028014007003503e-05,
"grad_norm": 10.892803192138672,
"learning_rate": 1.68e-07,
"loss": 8.8243,
"step": 56
},
{
"completion_length": 231.58333587646484,
"epoch": 5.7028514257128563e-05,
"grad_norm": 8.197606086730957,
"learning_rate": 1.71e-07,
"loss": 2.8601,
"reward": 0.03100198693573475,
"reward_std": 0.05434095114469528,
"rewards/sudoku_reward_func": 0.031001986004412174,
"step": 57,
"zero_std_ratio": 0.375
},
{
"epoch": 5.8029014507253623e-05,
"grad_norm": 11.048020362854004,
"learning_rate": 1.7400000000000002e-07,
"loss": 3.2602,
"step": 58
},
{
"epoch": 5.902951475737869e-05,
"grad_norm": 6.528013706207275,
"learning_rate": 1.7699999999999998e-07,
"loss": 2.1119,
"step": 59
},
{
"epoch": 6.003001500750375e-05,
"grad_norm": 11.171043395996094,
"learning_rate": 1.8e-07,
"loss": 3.1792,
"step": 60
},
{
"epoch": 6.103051525762882e-05,
"grad_norm": 8.444694519042969,
"learning_rate": 1.83e-07,
"loss": 2.8315,
"step": 61
},
{
"epoch": 6.203101550775387e-05,
"grad_norm": 9.485188484191895,
"learning_rate": 1.86e-07,
"loss": 3.2579,
"step": 62
},
{
"epoch": 6.303151575787894e-05,
"grad_norm": 6.87290096282959,
"learning_rate": 1.89e-07,
"loss": 2.1385,
"step": 63
},
{
"epoch": 6.4032016008004e-05,
"grad_norm": 12.49797248840332,
"learning_rate": 1.92e-07,
"loss": 3.1374,
"step": 64
},
{
"completion_length": 243.7916717529297,
"epoch": 6.503251625812907e-05,
"grad_norm": 16.387470245361328,
"learning_rate": 1.95e-07,
"loss": 5.5435,
"reward": 0.039468348026275635,
"reward_std": 0.07617796957492828,
"rewards/sudoku_reward_func": 0.039468344300985336,
"step": 65,
"zero_std_ratio": 0.25
},
{
"epoch": 6.603301650825412e-05,
"grad_norm": 16.877925872802734,
"learning_rate": 1.9800000000000003e-07,
"loss": 4.5106,
"step": 66
},
{
"epoch": 6.703351675837919e-05,
"grad_norm": 16.774993896484375,
"learning_rate": 2.01e-07,
"loss": 6.3509,
"step": 67
},
{
"epoch": 6.803401700850426e-05,
"grad_norm": 9.752405166625977,
"learning_rate": 2.0400000000000003e-07,
"loss": 4.2743,
"step": 68
},
{
"epoch": 6.903451725862931e-05,
"grad_norm": 13.019120216369629,
"learning_rate": 2.0700000000000001e-07,
"loss": 5.4941,
"step": 69
},
{
"epoch": 7.003501750875438e-05,
"grad_norm": 15.15886402130127,
"learning_rate": 2.1000000000000003e-07,
"loss": 4.561,
"step": 70
},
{
"epoch": 7.103551775887944e-05,
"grad_norm": 17.407318115234375,
"learning_rate": 2.13e-07,
"loss": 6.3596,
"step": 71
},
{
"epoch": 7.20360180090045e-05,
"grad_norm": 9.901360511779785,
"learning_rate": 2.16e-07,
"loss": 4.3023,
"step": 72
},
{
"completion_length": 242.58333587646484,
"epoch": 7.303651825912956e-05,
"grad_norm": 7.898802280426025,
"learning_rate": 2.19e-07,
"loss": 3.3781,
"reward": 0.03922784514725208,
"reward_std": 0.06382527574896812,
"rewards/sudoku_reward_func": 0.039227843284606934,
"step": 73,
"zero_std_ratio": 0.25
},
{
"epoch": 7.403701850925463e-05,
"grad_norm": 10.132791519165039,
"learning_rate": 2.22e-07,
"loss": 3.9789,
"step": 74
},
{
"epoch": 7.503751875937968e-05,
"grad_norm": 9.61319351196289,
"learning_rate": 2.25e-07,
"loss": 4.1563,
"step": 75
},
{
"epoch": 7.603801900950475e-05,
"grad_norm": 10.665925979614258,
"learning_rate": 2.28e-07,
"loss": 3.454,
"step": 76
},
{
"epoch": 7.703851925962982e-05,
"grad_norm": 8.118515014648438,
"learning_rate": 2.31e-07,
"loss": 3.4117,
"step": 77
},
{
"epoch": 7.803901950975487e-05,
"grad_norm": 7.520627975463867,
"learning_rate": 2.34e-07,
"loss": 3.9269,
"step": 78
},
{
"epoch": 7.903951975987994e-05,
"grad_norm": 10.15380573272705,
"learning_rate": 2.3700000000000002e-07,
"loss": 4.1432,
"step": 79
},
{
"epoch": 8.0040020010005e-05,
"grad_norm": 10.807955741882324,
"learning_rate": 2.4000000000000003e-07,
"loss": 3.4715,
"step": 80
},
{
"completion_length": 233.93750762939453,
"epoch": 8.104052026013007e-05,
"grad_norm": 8.597204208374023,
"learning_rate": 2.43e-07,
"loss": -0.6342,
"reward": 0.04493221268057823,
"reward_std": 0.08249906450510025,
"rewards/sudoku_reward_func": 0.04493220895528793,
"step": 81,
"zero_std_ratio": 0.125
},
{
"epoch": 8.204102051025512e-05,
"grad_norm": 9.011427879333496,
"learning_rate": 2.46e-07,
"loss": -1.2335,
"step": 82
},
{
"epoch": 8.304152076038019e-05,
"grad_norm": 13.026226997375488,
"learning_rate": 2.49e-07,
"loss": 0.0816,
"step": 83
},
{
"epoch": 8.404202101050526e-05,
"grad_norm": 9.908291816711426,
"learning_rate": 2.5200000000000003e-07,
"loss": -0.9982,
"step": 84
},
{
"epoch": 8.504252126063031e-05,
"grad_norm": 9.28254222869873,
"learning_rate": 2.5500000000000005e-07,
"loss": -0.5884,
"step": 85
},
{
"epoch": 8.604302151075538e-05,
"grad_norm": 8.442070960998535,
"learning_rate": 2.58e-07,
"loss": -1.2796,
"step": 86
},
{
"epoch": 8.704352176088045e-05,
"grad_norm": 12.563162803649902,
"learning_rate": 2.6099999999999997e-07,
"loss": 0.0909,
"step": 87
},
{
"epoch": 8.80440220110055e-05,
"grad_norm": 8.839503288269043,
"learning_rate": 2.64e-07,
"loss": -0.9965,
"step": 88
},
{
"completion_length": 244.83334350585938,
"epoch": 8.904452226113057e-05,
"grad_norm": 8.823372840881348,
"learning_rate": 2.67e-07,
"loss": 0.4445,
"reward": 0.03761574160307646,
"reward_std": 0.06432248279452324,
"rewards/sudoku_reward_func": 0.03761574160307646,
"step": 89,
"zero_std_ratio": 0.375
},
{
"epoch": 9.004502251125563e-05,
"grad_norm": 8.954886436462402,
"learning_rate": 2.7e-07,
"loss": 0.5046,
"step": 90
},
{
"epoch": 9.104552276138069e-05,
"grad_norm": 10.275053024291992,
"learning_rate": 2.73e-07,
"loss": 1.1653,
"step": 91
},
{
"epoch": 9.204602301150575e-05,
"grad_norm": 19.105579376220703,
"learning_rate": 2.76e-07,
"loss": 2.4353,
"step": 92
},
{
"epoch": 9.304652326163082e-05,
"grad_norm": 8.878145217895508,
"learning_rate": 2.79e-07,
"loss": 0.4359,
"step": 93
},
{
"epoch": 9.404702351175587e-05,
"grad_norm": 9.172167778015137,
"learning_rate": 2.82e-07,
"loss": 0.5311,
"step": 94
},
{
"epoch": 9.504752376188094e-05,
"grad_norm": 10.269847869873047,
"learning_rate": 2.85e-07,
"loss": 1.1261,
"step": 95
},
{
"epoch": 9.6048024012006e-05,
"grad_norm": 20.466459274291992,
"learning_rate": 2.8800000000000004e-07,
"loss": 2.4395,
"step": 96
},
{
"completion_length": 239.14583587646484,
"epoch": 9.704852426213106e-05,
"grad_norm": 15.091897964477539,
"learning_rate": 2.91e-07,
"loss": 7.7526,
"reward": 0.05989583395421505,
"reward_std": 0.10059662535786629,
"rewards/sudoku_reward_func": 0.05989583395421505,
"step": 97,
"zero_std_ratio": 0.0
},
{
"epoch": 9.804902451225613e-05,
"grad_norm": 14.478265762329102,
"learning_rate": 2.94e-07,
"loss": 8.9155,
"step": 98
},
{
"epoch": 9.904952476238119e-05,
"grad_norm": 11.164327621459961,
"learning_rate": 2.97e-07,
"loss": 6.5003,
"step": 99
},
{
"epoch": 0.00010005002501250626,
"grad_norm": 10.038124084472656,
"learning_rate": 3.0000000000000004e-07,
"loss": 7.4044,
"step": 100
},
{
"epoch": 0.00010105052526263131,
"grad_norm": 14.379571914672852,
"learning_rate": 3.0300000000000005e-07,
"loss": 7.8001,
"step": 101
},
{
"epoch": 0.00010205102551275638,
"grad_norm": 14.172938346862793,
"learning_rate": 3.06e-07,
"loss": 8.9515,
"step": 102
},
{
"epoch": 0.00010305152576288145,
"grad_norm": 11.172471046447754,
"learning_rate": 3.09e-07,
"loss": 6.5193,
"step": 103
},
{
"epoch": 0.0001040520260130065,
"grad_norm": 11.093944549560547,
"learning_rate": 3.12e-07,
"loss": 7.3289,
"step": 104
},
{
"completion_length": 229.89584350585938,
"epoch": 0.00010505252626313157,
"grad_norm": 9.635990142822266,
"learning_rate": 3.15e-07,
"loss": -1.1911,
"reward": 0.04885912872850895,
"reward_std": 0.0644612517207861,
"rewards/sudoku_reward_func": 0.048859127797186375,
"step": 105,
"zero_std_ratio": 0.25
},
{
"epoch": 0.00010605302651325663,
"grad_norm": 9.855486869812012,
"learning_rate": 3.18e-07,
"loss": -0.616,
"step": 106
},
{
"epoch": 0.00010705352676338169,
"grad_norm": 10.122713088989258,
"learning_rate": 3.21e-07,
"loss": -0.7726,
"step": 107
},
{
"epoch": 0.00010805402701350675,
"grad_norm": 9.637285232543945,
"learning_rate": 3.24e-07,
"loss": -1.0351,
"step": 108
},
{
"epoch": 0.00010905452726363182,
"grad_norm": 10.141641616821289,
"learning_rate": 3.27e-07,
"loss": -1.1738,
"step": 109
},
{
"epoch": 0.00011005502751375687,
"grad_norm": 10.051895141601562,
"learning_rate": 3.3e-07,
"loss": -0.581,
"step": 110
},
{
"epoch": 0.00011105552776388194,
"grad_norm": 11.444948196411133,
"learning_rate": 3.3300000000000003e-07,
"loss": -0.7803,
"step": 111
},
{
"epoch": 0.00011205602801400701,
"grad_norm": 9.775928497314453,
"learning_rate": 3.36e-07,
"loss": -0.9849,
"step": 112
},
{
"completion_length": 217.5625,
"epoch": 0.00011305652826413206,
"grad_norm": 15.8468656539917,
"learning_rate": 3.39e-07,
"loss": 12.9496,
"reward": 0.042493388056755066,
"reward_std": 0.07390820980072021,
"rewards/sudoku_reward_func": 0.042493388056755066,
"step": 113,
"zero_std_ratio": 0.25
},
{
"epoch": 0.00011405702851425713,
"grad_norm": 11.421664237976074,
"learning_rate": 3.42e-07,
"loss": 10.6176,
"step": 114
},
{
"epoch": 0.0001150575287643822,
"grad_norm": 11.024772644042969,
"learning_rate": 3.4500000000000003e-07,
"loss": 12.5574,
"step": 115
},
{
"epoch": 0.00011605802901450725,
"grad_norm": 10.388362884521484,
"learning_rate": 3.4800000000000005e-07,
"loss": 12.2676,
"step": 116
},
{
"epoch": 0.00011705852926463231,
"grad_norm": 17.08087921142578,
"learning_rate": 3.51e-07,
"loss": 13.0406,
"step": 117
},
{
"epoch": 0.00011805902951475738,
"grad_norm": 10.449882507324219,
"learning_rate": 3.5399999999999997e-07,
"loss": 10.5809,
"step": 118
},
{
"epoch": 0.00011905952976488245,
"grad_norm": 11.035545349121094,
"learning_rate": 3.57e-07,
"loss": 12.5377,
"step": 119
},
{
"epoch": 0.0001200600300150075,
"grad_norm": 10.570043563842773,
"learning_rate": 3.6e-07,
"loss": 12.2341,
"step": 120
},
{
"completion_length": 242.06250762939453,
"epoch": 0.00012106053026513257,
"grad_norm": 11.524609565734863,
"learning_rate": 3.63e-07,
"loss": -0.1825,
"reward": 0.05667162872850895,
"reward_std": 0.0804666131734848,
"rewards/sudoku_reward_func": 0.056671624071896076,
"step": 121,
"zero_std_ratio": 0.125
},
{
"epoch": 0.00012206103051525763,
"grad_norm": 12.43232536315918,
"learning_rate": 3.66e-07,
"loss": -0.3778,
"step": 122
},
{
"epoch": 0.0001230615307653827,
"grad_norm": 23.712289810180664,
"learning_rate": 3.69e-07,
"loss": -2.2729,
"step": 123
},
{
"epoch": 0.00012406203101550774,
"grad_norm": 14.794097900390625,
"learning_rate": 3.72e-07,
"loss": 0.5418,
"step": 124
},
{
"epoch": 0.0001250625312656328,
"grad_norm": 10.758546829223633,
"learning_rate": 3.75e-07,
"loss": -0.2122,
"step": 125
},
{
"epoch": 0.00012606303151575787,
"grad_norm": 11.061405181884766,
"learning_rate": 3.78e-07,
"loss": -0.43,
"step": 126
},
{
"epoch": 0.00012706353176588294,
"grad_norm": 15.273740768432617,
"learning_rate": 3.8100000000000004e-07,
"loss": -2.2138,
"step": 127
},
{
"epoch": 0.000128064032016008,
"grad_norm": 14.798585891723633,
"learning_rate": 3.84e-07,
"loss": 0.4373,
"step": 128
},
{
"completion_length": 232.83333587646484,
"epoch": 0.00012906453226613307,
"grad_norm": 5.877110004425049,
"learning_rate": 3.87e-07,
"loss": 1.6496,
"reward": 0.02467758022248745,
"reward_std": 0.05041925609111786,
"rewards/sudoku_reward_func": 0.02467758022248745,
"step": 129,
"zero_std_ratio": 0.375
},
{
"epoch": 0.00013006503251625814,
"grad_norm": 6.033883094787598,
"learning_rate": 3.9e-07,
"loss": 0.7818,
"step": 130
},
{
"epoch": 0.00013106553276638318,
"grad_norm": 6.772444725036621,
"learning_rate": 3.9300000000000004e-07,
"loss": 0.9868,
"step": 131
},
{
"epoch": 0.00013206603301650825,
"grad_norm": 6.536540508270264,
"learning_rate": 3.9600000000000005e-07,
"loss": 0.3655,
"step": 132
},
{
"epoch": 0.00013306653326663331,
"grad_norm": 5.516957759857178,
"learning_rate": 3.99e-07,
"loss": 1.6722,
"step": 133
},
{
"epoch": 0.00013406703351675838,
"grad_norm": 6.0041046142578125,
"learning_rate": 4.02e-07,
"loss": 0.8035,
"step": 134
},
{
"epoch": 0.00013506753376688345,
"grad_norm": 7.185412883758545,
"learning_rate": 4.0500000000000004e-07,
"loss": 0.9725,
"step": 135
},
{
"epoch": 0.00013606803401700852,
"grad_norm": 6.690100193023682,
"learning_rate": 4.0800000000000005e-07,
"loss": 0.3469,
"step": 136
},
{
"completion_length": 238.33334350585938,
"epoch": 0.00013706853426713355,
"grad_norm": 6.418752193450928,
"learning_rate": 4.1100000000000007e-07,
"loss": 1.8285,
"reward": 0.023892195895314217,
"reward_std": 0.047397417947649956,
"rewards/sudoku_reward_func": 0.023892195895314217,
"step": 137,
"zero_std_ratio": 0.375
},
{
"epoch": 0.00013806903451725862,
"grad_norm": 6.306288719177246,
"learning_rate": 4.1400000000000003e-07,
"loss": 2.1302,
"step": 138
},
{
"epoch": 0.0001390695347673837,
"grad_norm": 8.030327796936035,
"learning_rate": 4.1700000000000004e-07,
"loss": 1.4886,
"step": 139
},
{
"epoch": 0.00014007003501750876,
"grad_norm": 7.097723484039307,
"learning_rate": 4.2000000000000006e-07,
"loss": 2.2219,
"step": 140
},
{
"epoch": 0.00014107053526763382,
"grad_norm": 5.7919511795043945,
"learning_rate": 4.2299999999999996e-07,
"loss": 1.8461,
"step": 141
},
{
"epoch": 0.0001420710355177589,
"grad_norm": 6.026665210723877,
"learning_rate": 4.26e-07,
"loss": 2.0779,
"step": 142
},
{
"epoch": 0.00014307153576788393,
"grad_norm": 8.162043571472168,
"learning_rate": 4.29e-07,
"loss": 1.5113,
"step": 143
},
{
"epoch": 0.000144072036018009,
"grad_norm": 6.930771350860596,
"learning_rate": 4.32e-07,
"loss": 2.2161,
"step": 144
},
{
"completion_length": 227.9166717529297,
"epoch": 0.00014507253626813406,
"grad_norm": 8.324076652526855,
"learning_rate": 4.3499999999999996e-07,
"loss": -2.2184,
"reward": 0.034808654338121414,
"reward_std": 0.06299007683992386,
"rewards/sudoku_reward_func": 0.034808652475476265,
"step": 145,
"zero_std_ratio": 0.375
},
{
"epoch": 0.00014607303651825913,
"grad_norm": 8.019487380981445,
"learning_rate": 4.38e-07,
"loss": -1.9534,
"step": 146
},
{
"epoch": 0.0001470735367683842,
"grad_norm": 8.434709548950195,
"learning_rate": 4.41e-07,
"loss": -1.8272,
"step": 147
},
{
"epoch": 0.00014807403701850926,
"grad_norm": 6.320549488067627,
"learning_rate": 4.44e-07,
"loss": -2.1752,
"step": 148
},
{
"epoch": 0.00014907453726863433,
"grad_norm": 8.163012504577637,
"learning_rate": 4.4699999999999997e-07,
"loss": -2.1973,
"step": 149
},
{
"epoch": 0.00015007503751875937,
"grad_norm": 8.064225196838379,
"learning_rate": 4.5e-07,
"loss": -1.9865,
"step": 150
},
{
"epoch": 0.00015107553776888444,
"grad_norm": 8.826040267944336,
"learning_rate": 4.53e-07,
"loss": -1.7932,
"step": 151
},
{
"epoch": 0.0001520760380190095,
"grad_norm": 7.065883636474609,
"learning_rate": 4.56e-07,
"loss": -2.189,
"step": 152
},
{
"completion_length": 219.7291717529297,
"epoch": 0.00015307653826913457,
"grad_norm": 16.795761108398438,
"learning_rate": 4.59e-07,
"loss": 7.2122,
"reward": 0.03798776492476463,
"reward_std": 0.0777844786643982,
"rewards/sudoku_reward_func": 0.03798776492476463,
"step": 153,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00015407703851925964,
"grad_norm": 8.526628494262695,
"learning_rate": 4.62e-07,
"loss": 6.6345,
"step": 154
},
{
"epoch": 0.0001550775387693847,
"grad_norm": 8.999878883361816,
"learning_rate": 4.65e-07,
"loss": 6.9569,
"step": 155
},
{
"epoch": 0.00015607803901950974,
"grad_norm": 9.370837211608887,
"learning_rate": 4.68e-07,
"loss": 7.0462,
"step": 156
},
{
"epoch": 0.0001570785392696348,
"grad_norm": 19.611669540405273,
"learning_rate": 4.71e-07,
"loss": 7.1868,
"step": 157
},
{
"epoch": 0.00015807903951975988,
"grad_norm": 8.714893341064453,
"learning_rate": 4.7400000000000004e-07,
"loss": 6.6027,
"step": 158
},
{
"epoch": 0.00015907953976988494,
"grad_norm": 8.8904447555542,
"learning_rate": 4.77e-07,
"loss": 6.9645,
"step": 159
},
{
"epoch": 0.00016008004002001,
"grad_norm": 9.179741859436035,
"learning_rate": 4.800000000000001e-07,
"loss": 7.0361,
"step": 160
},
{
"completion_length": 236.89584350585938,
"epoch": 0.00016108054027013508,
"grad_norm": 7.844521522521973,
"learning_rate": 4.830000000000001e-07,
"loss": 3.4178,
"reward": 0.03401951119303703,
"reward_std": 0.053184038028120995,
"rewards/sudoku_reward_func": 0.034019509330391884,
"step": 161,
"zero_std_ratio": 0.25
},
{
"epoch": 0.00016208104052026014,
"grad_norm": 6.946485996246338,
"learning_rate": 4.86e-07,
"loss": 3.1326,
"step": 162
},
{
"epoch": 0.00016308154077038518,
"grad_norm": 8.468039512634277,
"learning_rate": 4.89e-07,
"loss": 3.4034,
"step": 163
},
{
"epoch": 0.00016408204102051025,
"grad_norm": 7.4369330406188965,
"learning_rate": 4.92e-07,
"loss": 3.0264,
"step": 164
},
{
"epoch": 0.00016508254127063532,
"grad_norm": 8.125073432922363,
"learning_rate": 4.95e-07,
"loss": 3.4745,
"step": 165
},
{
"epoch": 0.00016608304152076038,
"grad_norm": 6.560446739196777,
"learning_rate": 4.98e-07,
"loss": 3.0914,
"step": 166
},
{
"epoch": 0.00016708354177088545,
"grad_norm": 9.359310150146484,
"learning_rate": 5.01e-07,
"loss": 3.4344,
"step": 167
},
{
"epoch": 0.00016808404202101052,
"grad_norm": 7.4871368408203125,
"learning_rate": 5.040000000000001e-07,
"loss": 3.0149,
"step": 168
},
{
"completion_length": 229.06250762939453,
"epoch": 0.00016908454227113556,
"grad_norm": 10.238948822021484,
"learning_rate": 5.070000000000001e-07,
"loss": 1.0765,
"reward": 0.053778110072016716,
"reward_std": 0.09089003875851631,
"rewards/sudoku_reward_func": 0.05377810634672642,
"step": 169,
"zero_std_ratio": 0.125
},
{
"epoch": 0.00017008504252126062,
"grad_norm": 11.512720108032227,
"learning_rate": 5.100000000000001e-07,
"loss": 2.2342,
"step": 170
},
{
"epoch": 0.0001710855427713857,
"grad_norm": 11.702943801879883,
"learning_rate": 5.13e-07,
"loss": 0.7152,
"step": 171
},
{
"epoch": 0.00017208604302151076,
"grad_norm": 13.017841339111328,
"learning_rate": 5.16e-07,
"loss": 1.9784,
"step": 172
},
{
"epoch": 0.00017308654327163582,
"grad_norm": 10.525778770446777,
"learning_rate": 5.189999999999999e-07,
"loss": 1.0766,
"step": 173
},
{
"epoch": 0.0001740870435217609,
"grad_norm": 11.924694061279297,
"learning_rate": 5.219999999999999e-07,
"loss": 2.2755,
"step": 174
},
{
"epoch": 0.00017508754377188593,
"grad_norm": 10.760505676269531,
"learning_rate": 5.25e-07,
"loss": 0.7167,
"step": 175
},
{
"epoch": 0.000176088044022011,
"grad_norm": 11.227055549621582,
"learning_rate": 5.28e-07,
"loss": 2.0339,
"step": 176
},
{
"completion_length": 230.8541717529297,
"epoch": 0.00017708854427213606,
"grad_norm": 16.376419067382812,
"learning_rate": 5.31e-07,
"loss": 9.7452,
"reward": 0.03583829663693905,
"reward_std": 0.06714264582842588,
"rewards/sudoku_reward_func": 0.03583829663693905,
"step": 177,
"zero_std_ratio": 0.375
},
{
"epoch": 0.00017808904452226113,
"grad_norm": 11.184220314025879,
"learning_rate": 5.34e-07,
"loss": 9.9333,
"step": 178
},
{
"epoch": 0.0001790895447723862,
"grad_norm": 11.791999816894531,
"learning_rate": 5.37e-07,
"loss": 9.8864,
"step": 179
},
{
"epoch": 0.00018009004502251126,
"grad_norm": 10.82250690460205,
"learning_rate": 5.4e-07,
"loss": 10.3131,
"step": 180
},
{
"epoch": 0.00018109054527263633,
"grad_norm": 16.504940032958984,
"learning_rate": 5.43e-07,
"loss": 9.7328,
"step": 181
},
{
"epoch": 0.00018209104552276137,
"grad_norm": 11.314173698425293,
"learning_rate": 5.46e-07,
"loss": 9.9869,
"step": 182
},
{
"epoch": 0.00018309154577288644,
"grad_norm": 11.449384689331055,
"learning_rate": 5.49e-07,
"loss": 9.9188,
"step": 183
},
{
"epoch": 0.0001840920460230115,
"grad_norm": 11.834486961364746,
"learning_rate": 5.52e-07,
"loss": 10.3294,
"step": 184
},
{
"completion_length": 223.68750762939453,
"epoch": 0.00018509254627313657,
"grad_norm": 7.6580023765563965,
"learning_rate": 5.55e-07,
"loss": 1.6827,
"reward": 0.023478839080780745,
"reward_std": 0.05215226113796234,
"rewards/sudoku_reward_func": 0.023478837218135595,
"step": 185,
"zero_std_ratio": 0.25
},
{
"epoch": 0.00018609304652326164,
"grad_norm": 7.352197647094727,
"learning_rate": 5.58e-07,
"loss": 1.1462,
"step": 186
},
{
"epoch": 0.0001870935467733867,
"grad_norm": 8.335673332214355,
"learning_rate": 5.61e-07,
"loss": 1.429,
"step": 187
},
{
"epoch": 0.00018809404702351174,
"grad_norm": 9.88043212890625,
"learning_rate": 5.64e-07,
"loss": 1.4269,
"step": 188
},
{
"epoch": 0.0001890945472736368,
"grad_norm": 11.749013900756836,
"learning_rate": 5.67e-07,
"loss": 1.7256,
"step": 189
},
{
"epoch": 0.00019009504752376188,
"grad_norm": 7.314451694488525,
"learning_rate": 5.7e-07,
"loss": 1.139,
"step": 190
},
{
"epoch": 0.00019109554777388694,
"grad_norm": 8.525728225708008,
"learning_rate": 5.730000000000001e-07,
"loss": 1.3557,
"step": 191
},
{
"epoch": 0.000192096048024012,
"grad_norm": 10.000328063964844,
"learning_rate": 5.760000000000001e-07,
"loss": 1.3769,
"step": 192
},
{
"completion_length": 230.45833587646484,
"epoch": 0.00019309654827413708,
"grad_norm": 9.96373462677002,
"learning_rate": 5.79e-07,
"loss": 2.7711,
"reward": 0.04001322854310274,
"reward_std": 0.07591889426112175,
"rewards/sudoku_reward_func": 0.04001322854310274,
"step": 193,
"zero_std_ratio": 0.125
},
{
"epoch": 0.00019409704852426212,
"grad_norm": 8.532519340515137,
"learning_rate": 5.82e-07,
"loss": 4.3165,
"step": 194
},
{
"epoch": 0.00019509754877438718,
"grad_norm": 8.89317798614502,
"learning_rate": 5.85e-07,
"loss": 3.8256,
"step": 195
},
{
"epoch": 0.00019609804902451225,
"grad_norm": 9.849699020385742,
"learning_rate": 5.88e-07,
"loss": 3.3788,
"step": 196
},
{
"epoch": 0.00019709854927463732,
"grad_norm": 9.646321296691895,
"learning_rate": 5.91e-07,
"loss": 2.692,
"step": 197
},
{
"epoch": 0.00019809904952476239,
"grad_norm": 11.351709365844727,
"learning_rate": 5.94e-07,
"loss": 4.3232,
"step": 198
},
{
"epoch": 0.00019909954977488745,
"grad_norm": 8.894522666931152,
"learning_rate": 5.970000000000001e-07,
"loss": 3.8403,
"step": 199
},
{
"epoch": 0.00020010005002501252,
"grad_norm": 9.612089157104492,
"learning_rate": 6.000000000000001e-07,
"loss": 3.3513,
"step": 200
},
{
"completion_length": 238.43750762939453,
"epoch": 0.00020110055027513756,
"grad_norm": 7.658564567565918,
"learning_rate": 6.030000000000001e-07,
"loss": 1.9131,
"reward": 0.02852182649075985,
"reward_std": 0.040328510105609894,
"rewards/sudoku_reward_func": 0.028521825559437275,
"step": 201,
"zero_std_ratio": 0.375
},
{
"epoch": 0.00020210105052526263,
"grad_norm": 5.2299089431762695,
"learning_rate": 6.060000000000001e-07,
"loss": 1.2648,
"step": 202
},
{
"epoch": 0.0002031015507753877,
"grad_norm": 10.499113082885742,
"learning_rate": 6.09e-07,
"loss": 2.2544,
"step": 203
},
{
"epoch": 0.00020410205102551276,
"grad_norm": 6.83461332321167,
"learning_rate": 6.12e-07,
"loss": 1.888,
"step": 204
},
{
"epoch": 0.00020510255127563783,
"grad_norm": 9.024131774902344,
"learning_rate": 6.149999999999999e-07,
"loss": 1.9116,
"step": 205
},
{
"epoch": 0.0002061030515257629,
"grad_norm": 5.002796649932861,
"learning_rate": 6.18e-07,
"loss": 1.2636,
"step": 206
},
{
"epoch": 0.00020710355177588793,
"grad_norm": 8.772700309753418,
"learning_rate": 6.21e-07,
"loss": 2.2221,
"step": 207
},
{
"epoch": 0.000208104052026013,
"grad_norm": 5.337297439575195,
"learning_rate": 6.24e-07,
"loss": 1.9176,
"step": 208
},
{
"completion_length": 234.2916717529297,
"epoch": 0.00020910455227613807,
"grad_norm": 9.960871696472168,
"learning_rate": 6.27e-07,
"loss": 4.9261,
"reward": 0.0486111119389534,
"reward_std": 0.07930124551057816,
"rewards/sudoku_reward_func": 0.0486111119389534,
"step": 209,
"zero_std_ratio": 0.125
},
{
"epoch": 0.00021010505252626313,
"grad_norm": 9.485045433044434,
"learning_rate": 6.3e-07,
"loss": 4.2217,
"step": 210
},
{
"epoch": 0.0002111055527763882,
"grad_norm": 11.977108001708984,
"learning_rate": 6.33e-07,
"loss": 4.4983,
"step": 211
},
{
"epoch": 0.00021210605302651327,
"grad_norm": 12.709733009338379,
"learning_rate": 6.36e-07,
"loss": 4.5912,
"step": 212
},
{
"epoch": 0.0002131065532766383,
"grad_norm": 9.682394027709961,
"learning_rate": 6.39e-07,
"loss": 4.9071,
"step": 213
},
{
"epoch": 0.00021410705352676337,
"grad_norm": 12.194422721862793,
"learning_rate": 6.42e-07,
"loss": 4.1101,
"step": 214
},
{
"epoch": 0.00021510755377688844,
"grad_norm": 11.770171165466309,
"learning_rate": 6.45e-07,
"loss": 4.435,
"step": 215
},
{
"epoch": 0.0002161080540270135,
"grad_norm": 14.138443946838379,
"learning_rate": 6.48e-07,
"loss": 4.6484,
"step": 216
},
{
"completion_length": 243.8541717529297,
"epoch": 0.00021710855427713857,
"grad_norm": 10.657322883605957,
"learning_rate": 6.51e-07,
"loss": 0.3958,
"reward": 0.06342066638171673,
"reward_std": 0.09584061056375504,
"rewards/sudoku_reward_func": 0.06342066638171673,
"step": 217,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00021810905452726364,
"grad_norm": 11.380950927734375,
"learning_rate": 6.54e-07,
"loss": 2.1351,
"step": 218
},
{
"epoch": 0.0002191095547773887,
"grad_norm": 10.573854446411133,
"learning_rate": 6.57e-07,
"loss": 0.7276,
"step": 219
},
{
"epoch": 0.00022011005502751375,
"grad_norm": 13.61559772491455,
"learning_rate": 6.6e-07,
"loss": 1.6265,
"step": 220
},
{
"epoch": 0.0002211105552776388,
"grad_norm": 9.072285652160645,
"learning_rate": 6.63e-07,
"loss": 0.4138,
"step": 221
},
{
"epoch": 0.00022211105552776388,
"grad_norm": 10.759437561035156,
"learning_rate": 6.660000000000001e-07,
"loss": 2.135,
"step": 222
},
{
"epoch": 0.00022311155577788895,
"grad_norm": 10.8936128616333,
"learning_rate": 6.690000000000001e-07,
"loss": 0.6748,
"step": 223
},
{
"epoch": 0.00022411205602801401,
"grad_norm": 16.358993530273438,
"learning_rate": 6.72e-07,
"loss": 1.6985,
"step": 224
},
{
"completion_length": 220.50000762939453,
"epoch": 0.00022511255627813908,
"grad_norm": 7.5341715812683105,
"learning_rate": 6.75e-07,
"loss": -1.8022,
"reward": 0.03451554290950298,
"reward_std": 0.07399624213576317,
"rewards/sudoku_reward_func": 0.03451554290950298,
"step": 225,
"zero_std_ratio": 0.25
},
{
"epoch": 0.00022611305652826412,
"grad_norm": 17.420621871948242,
"learning_rate": 6.78e-07,
"loss": -0.1952,
"step": 226
},
{
"epoch": 0.0002271135567783892,
"grad_norm": 8.836989402770996,
"learning_rate": 6.81e-07,
"loss": -0.4381,
"step": 227
},
{
"epoch": 0.00022811405702851425,
"grad_norm": 7.346076488494873,
"learning_rate": 6.84e-07,
"loss": -1.746,
"step": 228
},
{
"epoch": 0.00022911455727863932,
"grad_norm": 7.384093761444092,
"learning_rate": 6.87e-07,
"loss": -1.845,
"step": 229
},
{
"epoch": 0.0002301150575287644,
"grad_norm": 17.348112106323242,
"learning_rate": 6.900000000000001e-07,
"loss": -0.2476,
"step": 230
},
{
"epoch": 0.00023111555777888945,
"grad_norm": 8.641575813293457,
"learning_rate": 6.930000000000001e-07,
"loss": -0.4975,
"step": 231
},
{
"epoch": 0.0002321160580290145,
"grad_norm": 8.628252983093262,
"learning_rate": 6.960000000000001e-07,
"loss": -1.8189,
"step": 232
},
{
"completion_length": 235.43750762939453,
"epoch": 0.00023311655827913956,
"grad_norm": 4.170182228088379,
"learning_rate": 6.990000000000001e-07,
"loss": 1.5497,
"reward": 0.01355820195749402,
"reward_std": 0.02904263837262988,
"rewards/sudoku_reward_func": 0.013558201724663377,
"step": 233,
"zero_std_ratio": 0.5
},
{
"epoch": 0.00023411705852926463,
"grad_norm": 3.8872923851013184,
"learning_rate": 7.02e-07,
"loss": 1.4407,
"step": 234
},
{
"epoch": 0.0002351175587793897,
"grad_norm": 3.796323537826538,
"learning_rate": 7.05e-07,
"loss": 1.1411,
"step": 235
},
{
"epoch": 0.00023611805902951476,
"grad_norm": 5.579533100128174,
"learning_rate": 7.079999999999999e-07,
"loss": 1.2369,
"step": 236
},
{
"epoch": 0.00023711855927963983,
"grad_norm": 3.9385132789611816,
"learning_rate": 7.11e-07,
"loss": 1.5498,
"step": 237
},
{
"epoch": 0.0002381190595297649,
"grad_norm": 4.1062164306640625,
"learning_rate": 7.14e-07,
"loss": 1.4336,
"step": 238
},
{
"epoch": 0.00023911955977988993,
"grad_norm": 4.245860576629639,
"learning_rate": 7.17e-07,
"loss": 1.1133,
"step": 239
},
{
"epoch": 0.000240120060030015,
"grad_norm": 6.223697662353516,
"learning_rate": 7.2e-07,
"loss": 1.2535,
"step": 240
},
{
"completion_length": 232.6666717529297,
"epoch": 0.00024112056028014007,
"grad_norm": 28.815874099731445,
"learning_rate": 7.23e-07,
"loss": 14.1627,
"reward": 0.11326058581471443,
"reward_std": 0.11245110630989075,
"rewards/sudoku_reward_func": 0.11326058581471443,
"step": 241,
"zero_std_ratio": 0.125
},
{
"epoch": 0.00024212106053026513,
"grad_norm": 14.755169868469238,
"learning_rate": 7.26e-07,
"loss": 12.0119,
"step": 242
},
{
"epoch": 0.0002431215607803902,
"grad_norm": 19.791038513183594,
"learning_rate": 7.29e-07,
"loss": 13.7708,
"step": 243
},
{
"epoch": 0.00024412206103051527,
"grad_norm": 13.325358390808105,
"learning_rate": 7.32e-07,
"loss": 12.5308,
"step": 244
},
{
"epoch": 0.0002451225612806403,
"grad_norm": 35.826072692871094,
"learning_rate": 7.350000000000001e-07,
"loss": 14.0292,
"step": 245
},
{
"epoch": 0.0002461230615307654,
"grad_norm": 12.46027946472168,
"learning_rate": 7.38e-07,
"loss": 11.9516,
"step": 246
},
{
"epoch": 0.00024712356178089044,
"grad_norm": 15.507015228271484,
"learning_rate": 7.41e-07,
"loss": 13.7996,
"step": 247
},
{
"epoch": 0.0002481240620310155,
"grad_norm": 13.646905899047852,
"learning_rate": 7.44e-07,
"loss": 12.5209,
"step": 248
},
{
"completion_length": 227.7916717529297,
"epoch": 0.0002491245622811406,
"grad_norm": 10.473774909973145,
"learning_rate": 7.47e-07,
"loss": 6.6821,
"reward": 0.06304113194346428,
"reward_std": 0.10152465477585793,
"rewards/sudoku_reward_func": 0.06304113194346428,
"step": 249,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0002501250625312656,
"grad_norm": 11.4276704788208,
"learning_rate": 7.5e-07,
"loss": 6.1763,
"step": 250
},
{
"epoch": 0.0002511255627813907,
"grad_norm": 11.7334566116333,
"learning_rate": 7.53e-07,
"loss": 5.9234,
"step": 251
},
{
"epoch": 0.00025212606303151575,
"grad_norm": 11.439474105834961,
"learning_rate": 7.56e-07,
"loss": 7.2595,
"step": 252
},
{
"epoch": 0.00025312656328164084,
"grad_norm": 11.427284240722656,
"learning_rate": 7.590000000000001e-07,
"loss": 6.6806,
"step": 253
},
{
"epoch": 0.0002541270635317659,
"grad_norm": 11.73374080657959,
"learning_rate": 7.620000000000001e-07,
"loss": 6.118,
"step": 254
},
{
"epoch": 0.0002551275637818909,
"grad_norm": 11.451107025146484,
"learning_rate": 7.65e-07,
"loss": 5.8955,
"step": 255
},
{
"epoch": 0.000256128064032016,
"grad_norm": 11.921669960021973,
"learning_rate": 7.68e-07,
"loss": 7.1842,
"step": 256
},
{
"completion_length": 237.06250762939453,
"epoch": 0.00025712856428214106,
"grad_norm": 6.466955184936523,
"learning_rate": 7.71e-07,
"loss": 2.0766,
"reward": 0.027695106342434883,
"reward_std": 0.06247997470200062,
"rewards/sudoku_reward_func": 0.02769510541111231,
"step": 257,
"zero_std_ratio": 0.25
},
{
"epoch": 0.00025812906453226615,
"grad_norm": 7.86901330947876,
"learning_rate": 7.74e-07,
"loss": 2.768,
"step": 258
},
{
"epoch": 0.0002591295647823912,
"grad_norm": 7.55520486831665,
"learning_rate": 7.77e-07,
"loss": 2.1036,
"step": 259
},
{
"epoch": 0.0002601300650325163,
"grad_norm": 7.022609233856201,
"learning_rate": 7.8e-07,
"loss": 1.9049,
"step": 260
},
{
"epoch": 0.0002611305652826413,
"grad_norm": 7.077910423278809,
"learning_rate": 7.830000000000001e-07,
"loss": 2.0455,
"step": 261
},
{
"epoch": 0.00026213106553276636,
"grad_norm": 7.5524492263793945,
"learning_rate": 7.860000000000001e-07,
"loss": 2.7672,
"step": 262
},
{
"epoch": 0.00026313156578289146,
"grad_norm": 7.9139251708984375,
"learning_rate": 7.890000000000001e-07,
"loss": 2.1218,
"step": 263
},
{
"epoch": 0.0002641320660330165,
"grad_norm": 7.026261329650879,
"learning_rate": 7.920000000000001e-07,
"loss": 1.9044,
"step": 264
},
{
"completion_length": 235.9166717529297,
"epoch": 0.0002651325662831416,
"grad_norm": 10.94743824005127,
"learning_rate": 7.95e-07,
"loss": 3.926,
"reward": 0.04860359709709883,
"reward_std": 0.07168097421526909,
"rewards/sudoku_reward_func": 0.04860359709709883,
"step": 265,
"zero_std_ratio": 0.25
},
{
"epoch": 0.00026613306653326663,
"grad_norm": 12.212309837341309,
"learning_rate": 7.98e-07,
"loss": 3.6554,
"step": 266
},
{
"epoch": 0.00026713356678339167,
"grad_norm": 11.789071083068848,
"learning_rate": 8.01e-07,
"loss": 5.5649,
"step": 267
},
{
"epoch": 0.00026813406703351676,
"grad_norm": 10.68334674835205,
"learning_rate": 8.04e-07,
"loss": 4.3805,
"step": 268
},
{
"epoch": 0.0002691345672836418,
"grad_norm": 11.618034362792969,
"learning_rate": 8.070000000000001e-07,
"loss": 3.9353,
"step": 269
},
{
"epoch": 0.0002701350675337669,
"grad_norm": 12.148497581481934,
"learning_rate": 8.100000000000001e-07,
"loss": 3.5802,
"step": 270
},
{
"epoch": 0.00027113556778389194,
"grad_norm": 11.314140319824219,
"learning_rate": 8.130000000000001e-07,
"loss": 5.4743,
"step": 271
},
{
"epoch": 0.00027213606803401703,
"grad_norm": 11.643878936767578,
"learning_rate": 8.160000000000001e-07,
"loss": 4.3817,
"step": 272
},
{
"completion_length": 235.70833587646484,
"epoch": 0.00027313656828414207,
"grad_norm": 8.467658042907715,
"learning_rate": 8.190000000000001e-07,
"loss": 10.9059,
"reward": 0.042369380593299866,
"reward_std": 0.08460133895277977,
"rewards/sudoku_reward_func": 0.042369380593299866,
"step": 273,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0002741370685342671,
"grad_norm": 11.485238075256348,
"learning_rate": 8.220000000000001e-07,
"loss": 11.3646,
"step": 274
},
{
"epoch": 0.0002751375687843922,
"grad_norm": 8.469305992126465,
"learning_rate": 8.25e-07,
"loss": 10.9724,
"step": 275
},
{
"epoch": 0.00027613806903451724,
"grad_norm": 9.359997749328613,
"learning_rate": 8.280000000000001e-07,
"loss": 10.7772,
"step": 276
},
{
"epoch": 0.00027713856928464234,
"grad_norm": 8.635589599609375,
"learning_rate": 8.310000000000001e-07,
"loss": 10.9291,
"step": 277
},
{
"epoch": 0.0002781390695347674,
"grad_norm": 11.921236991882324,
"learning_rate": 8.340000000000001e-07,
"loss": 11.3009,
"step": 278
},
{
"epoch": 0.00027913956978489247,
"grad_norm": 8.240606307983398,
"learning_rate": 8.370000000000001e-07,
"loss": 10.9624,
"step": 279
},
{
"epoch": 0.0002801400700350175,
"grad_norm": 9.36181354522705,
"learning_rate": 8.400000000000001e-07,
"loss": 10.7465,
"step": 280
},
{
"completion_length": 235.4791717529297,
"epoch": 0.00028114057028514255,
"grad_norm": 9.021184921264648,
"learning_rate": 8.430000000000001e-07,
"loss": 4.9804,
"reward": 0.03388047218322754,
"reward_std": 0.07418958842754364,
"rewards/sudoku_reward_func": 0.03388047032058239,
"step": 281,
"zero_std_ratio": 0.25
},
{
"epoch": 0.00028214107053526764,
"grad_norm": 9.126776695251465,
"learning_rate": 8.459999999999999e-07,
"loss": 4.9054,
"step": 282
},
{
"epoch": 0.0002831415707853927,
"grad_norm": 9.088274955749512,
"learning_rate": 8.489999999999999e-07,
"loss": 5.4592,
"step": 283
},
{
"epoch": 0.0002841420710355178,
"grad_norm": 12.914833068847656,
"learning_rate": 8.52e-07,
"loss": 4.9845,
"step": 284
},
{
"epoch": 0.0002851425712856428,
"grad_norm": 9.606204986572266,
"learning_rate": 8.55e-07,
"loss": 4.9761,
"step": 285
},
{
"epoch": 0.00028614307153576786,
"grad_norm": 9.16411018371582,
"learning_rate": 8.58e-07,
"loss": 4.846,
"step": 286
},
{
"epoch": 0.00028714357178589295,
"grad_norm": 9.097843170166016,
"learning_rate": 8.61e-07,
"loss": 5.3997,
"step": 287
},
{
"epoch": 0.000288144072036018,
"grad_norm": 9.768306732177734,
"learning_rate": 8.64e-07,
"loss": 4.9431,
"step": 288
},
{
"completion_length": 234.77084350585938,
"epoch": 0.0002891445722861431,
"grad_norm": 11.16508960723877,
"learning_rate": 8.669999999999999e-07,
"loss": -4.0727,
"reward": 0.049933863803744316,
"reward_std": 0.09329613298177719,
"rewards/sudoku_reward_func": 0.049933863803744316,
"step": 289,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0002901450725362681,
"grad_norm": 12.644773483276367,
"learning_rate": 8.699999999999999e-07,
"loss": -3.3604,
"step": 290
},
{
"epoch": 0.0002911455727863932,
"grad_norm": 13.333436965942383,
"learning_rate": 8.729999999999999e-07,
"loss": -3.7767,
"step": 291
},
{
"epoch": 0.00029214607303651826,
"grad_norm": 12.671016693115234,
"learning_rate": 8.76e-07,
"loss": -2.786,
"step": 292
},
{
"epoch": 0.0002931465732866433,
"grad_norm": 12.357577323913574,
"learning_rate": 8.79e-07,
"loss": -4.1053,
"step": 293
},
{
"epoch": 0.0002941470735367684,
"grad_norm": 14.383247375488281,
"learning_rate": 8.82e-07,
"loss": -3.3782,
"step": 294
},
{
"epoch": 0.00029514757378689343,
"grad_norm": 13.393773078918457,
"learning_rate": 8.85e-07,
"loss": -3.8462,
"step": 295
},
{
"epoch": 0.0002961480740370185,
"grad_norm": 12.599241256713867,
"learning_rate": 8.88e-07,
"loss": -2.8205,
"step": 296
},
{
"completion_length": 232.89584350585938,
"epoch": 0.00029714857428714356,
"grad_norm": 15.046712875366211,
"learning_rate": 8.91e-07,
"loss": 1.2729,
"reward": 0.047825731337070465,
"reward_std": 0.08659476786851883,
"rewards/sudoku_reward_func": 0.04782572761178017,
"step": 297,
"zero_std_ratio": 0.125
},
{
"epoch": 0.00029814907453726866,
"grad_norm": 10.604710578918457,
"learning_rate": 8.939999999999999e-07,
"loss": -0.2316,
"step": 298
},
{
"epoch": 0.0002991495747873937,
"grad_norm": 12.712968826293945,
"learning_rate": 8.969999999999999e-07,
"loss": -0.0159,
"step": 299
},
{
"epoch": 0.00030015007503751874,
"grad_norm": 13.237211227416992,
"learning_rate": 9e-07,
"loss": 1.4319,
"step": 300
},
{
"epoch": 0.00030115057528764383,
"grad_norm": 15.451652526855469,
"learning_rate": 9.03e-07,
"loss": 1.1957,
"step": 301
},
{
"epoch": 0.00030215107553776887,
"grad_norm": 10.60921573638916,
"learning_rate": 9.06e-07,
"loss": -0.2241,
"step": 302
},
{
"epoch": 0.00030315157578789397,
"grad_norm": 12.147521018981934,
"learning_rate": 9.09e-07,
"loss": -0.0156,
"step": 303
},
{
"epoch": 0.000304152076038019,
"grad_norm": 13.540789604187012,
"learning_rate": 9.12e-07,
"loss": 1.3268,
"step": 304
},
{
"completion_length": 240.83333587646484,
"epoch": 0.0003051525762881441,
"grad_norm": 22.218473434448242,
"learning_rate": 9.15e-07,
"loss": 5.2482,
"reward": 0.0706845298409462,
"reward_std": 0.11625630408525467,
"rewards/sudoku_reward_func": 0.07068452797830105,
"step": 305,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00030615307653826914,
"grad_norm": 11.328912734985352,
"learning_rate": 9.18e-07,
"loss": 3.6993,
"step": 306
},
{
"epoch": 0.0003071535767883942,
"grad_norm": 12.521968841552734,
"learning_rate": 9.210000000000001e-07,
"loss": 2.4513,
"step": 307
},
{
"epoch": 0.00030815407703851927,
"grad_norm": 13.72804069519043,
"learning_rate": 9.24e-07,
"loss": 3.8968,
"step": 308
},
{
"epoch": 0.0003091545772886443,
"grad_norm": 18.00409698486328,
"learning_rate": 9.27e-07,
"loss": 5.2393,
"step": 309
},
{
"epoch": 0.0003101550775387694,
"grad_norm": 11.605079650878906,
"learning_rate": 9.3e-07,
"loss": 3.6261,
"step": 310
},
{
"epoch": 0.00031115557778889445,
"grad_norm": 11.738133430480957,
"learning_rate": 9.33e-07,
"loss": 2.4419,
"step": 311
},
{
"epoch": 0.0003121560780390195,
"grad_norm": 14.07581901550293,
"learning_rate": 9.36e-07,
"loss": 3.8252,
"step": 312
},
{
"completion_length": 243.81250762939453,
"epoch": 0.0003131565782891446,
"grad_norm": 8.576881408691406,
"learning_rate": 9.39e-07,
"loss": 1.2863,
"reward": 0.0376909002661705,
"reward_std": 0.06233246065676212,
"rewards/sudoku_reward_func": 0.03769089933484793,
"step": 313,
"zero_std_ratio": 0.25
},
{
"epoch": 0.0003141570785392696,
"grad_norm": 9.49803352355957,
"learning_rate": 9.42e-07,
"loss": 1.8182,
"step": 314
},
{
"epoch": 0.0003151575787893947,
"grad_norm": 8.959271430969238,
"learning_rate": 9.450000000000001e-07,
"loss": 1.2478,
"step": 315
},
{
"epoch": 0.00031615807903951975,
"grad_norm": 10.243912696838379,
"learning_rate": 9.480000000000001e-07,
"loss": 1.412,
"step": 316
},
{
"epoch": 0.00031715857928964485,
"grad_norm": 12.459125518798828,
"learning_rate": 9.510000000000001e-07,
"loss": 1.2536,
"step": 317
},
{
"epoch": 0.0003181590795397699,
"grad_norm": 9.807161331176758,
"learning_rate": 9.54e-07,
"loss": 1.7685,
"step": 318
},
{
"epoch": 0.0003191595797898949,
"grad_norm": 9.090300559997559,
"learning_rate": 9.570000000000001e-07,
"loss": 1.1907,
"step": 319
},
{
"epoch": 0.00032016008004002,
"grad_norm": 11.044015884399414,
"learning_rate": 9.600000000000001e-07,
"loss": 1.3517,
"step": 320
},
{
"completion_length": 236.4166717529297,
"epoch": 0.00032116058029014506,
"grad_norm": 14.772443771362305,
"learning_rate": 9.630000000000001e-07,
"loss": 4.4002,
"reward": 0.08391203731298447,
"reward_std": 0.12630317360162735,
"rewards/sudoku_reward_func": 0.08391203731298447,
"step": 321,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00032216108054027015,
"grad_norm": 12.570160865783691,
"learning_rate": 9.660000000000002e-07,
"loss": 3.2618,
"step": 322
},
{
"epoch": 0.0003231615807903952,
"grad_norm": 18.8726749420166,
"learning_rate": 9.69e-07,
"loss": 1.9231,
"step": 323
},
{
"epoch": 0.0003241620810405203,
"grad_norm": 15.474459648132324,
"learning_rate": 9.72e-07,
"loss": 3.6165,
"step": 324
},
{
"epoch": 0.0003251625812906453,
"grad_norm": 13.179699897766113,
"learning_rate": 9.75e-07,
"loss": 4.3286,
"step": 325
},
{
"epoch": 0.00032616308154077037,
"grad_norm": 12.312911033630371,
"learning_rate": 9.78e-07,
"loss": 3.1953,
"step": 326
},
{
"epoch": 0.00032716358179089546,
"grad_norm": 23.187116622924805,
"learning_rate": 9.81e-07,
"loss": 1.7782,
"step": 327
},
{
"epoch": 0.0003281640820410205,
"grad_norm": 14.457850456237793,
"learning_rate": 9.84e-07,
"loss": 3.5443,
"step": 328
},
{
"completion_length": 239.1041717529297,
"epoch": 0.0003291645822911456,
"grad_norm": 13.674739837646484,
"learning_rate": 9.87e-07,
"loss": 1.863,
"reward": 0.099082350730896,
"reward_std": 0.12339252233505249,
"rewards/sudoku_reward_func": 0.0990823395550251,
"step": 329,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00033016508254127063,
"grad_norm": 12.24724006652832,
"learning_rate": 9.9e-07,
"loss": 2.4264,
"step": 330
},
{
"epoch": 0.0003311655827913957,
"grad_norm": 17.608428955078125,
"learning_rate": 9.93e-07,
"loss": 3.1095,
"step": 331
},
{
"epoch": 0.00033216608304152077,
"grad_norm": 15.516730308532715,
"learning_rate": 9.96e-07,
"loss": 3.0734,
"step": 332
},
{
"epoch": 0.0003331665832916458,
"grad_norm": 12.981306076049805,
"learning_rate": 9.99e-07,
"loss": 1.7985,
"step": 333
},
{
"epoch": 0.0003341670835417709,
"grad_norm": 12.763707160949707,
"learning_rate": 1.002e-06,
"loss": 2.3478,
"step": 334
},
{
"epoch": 0.00033516758379189594,
"grad_norm": 16.93824577331543,
"learning_rate": 1.0050000000000001e-06,
"loss": 3.0187,
"step": 335
},
{
"epoch": 0.00033616808404202103,
"grad_norm": 16.1212215423584,
"learning_rate": 1.0080000000000001e-06,
"loss": 2.9999,
"step": 336
},
{
"completion_length": 239.37500762939453,
"epoch": 0.0003371685842921461,
"grad_norm": 8.778818130493164,
"learning_rate": 1.0110000000000001e-06,
"loss": 4.0337,
"reward": 0.05348875932395458,
"reward_std": 0.07841756939888,
"rewards/sudoku_reward_func": 0.05348875932395458,
"step": 337,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0003381690845422711,
"grad_norm": 8.754796028137207,
"learning_rate": 1.0140000000000002e-06,
"loss": 3.7786,
"step": 338
},
{
"epoch": 0.0003391695847923962,
"grad_norm": 9.616628646850586,
"learning_rate": 1.0170000000000002e-06,
"loss": 3.6712,
"step": 339
},
{
"epoch": 0.00034017008504252125,
"grad_norm": 8.855981826782227,
"learning_rate": 1.0200000000000002e-06,
"loss": 4.6132,
"step": 340
},
{
"epoch": 0.00034117058529264634,
"grad_norm": 8.571745872497559,
"learning_rate": 1.0230000000000002e-06,
"loss": 3.9668,
"step": 341
},
{
"epoch": 0.0003421710855427714,
"grad_norm": 9.43879222869873,
"learning_rate": 1.026e-06,
"loss": 3.7596,
"step": 342
},
{
"epoch": 0.0003431715857928965,
"grad_norm": 9.735145568847656,
"learning_rate": 1.029e-06,
"loss": 3.6587,
"step": 343
},
{
"epoch": 0.0003441720860430215,
"grad_norm": 9.055402755737305,
"learning_rate": 1.032e-06,
"loss": 4.514,
"step": 344
},
{
"completion_length": 240.4166717529297,
"epoch": 0.00034517258629314655,
"grad_norm": 15.025571823120117,
"learning_rate": 1.035e-06,
"loss": -2.9335,
"reward": 0.06999308802187443,
"reward_std": 0.10092847421765327,
"rewards/sudoku_reward_func": 0.06999308802187443,
"step": 345,
"zero_std_ratio": 0.125
},
{
"epoch": 0.00034617308654327165,
"grad_norm": 14.629435539245605,
"learning_rate": 1.0379999999999998e-06,
"loss": -2.6866,
"step": 346
},
{
"epoch": 0.0003471735867933967,
"grad_norm": 15.487167358398438,
"learning_rate": 1.0409999999999999e-06,
"loss": -3.771,
"step": 347
},
{
"epoch": 0.0003481740870435218,
"grad_norm": 15.948243141174316,
"learning_rate": 1.0439999999999999e-06,
"loss": -2.2402,
"step": 348
},
{
"epoch": 0.0003491745872936468,
"grad_norm": 15.299474716186523,
"learning_rate": 1.0469999999999999e-06,
"loss": -3.0113,
"step": 349
},
{
"epoch": 0.00035017508754377186,
"grad_norm": 14.349522590637207,
"learning_rate": 1.05e-06,
"loss": -2.88,
"step": 350
},
{
"epoch": 0.00035117558779389695,
"grad_norm": 15.201149940490723,
"learning_rate": 1.053e-06,
"loss": -3.9468,
"step": 351
},
{
"epoch": 0.000352176088044022,
"grad_norm": 16.916872024536133,
"learning_rate": 1.056e-06,
"loss": -2.4322,
"step": 352
},
{
"completion_length": 231.12500762939453,
"epoch": 0.0003531765882941471,
"grad_norm": 18.65467071533203,
"learning_rate": 1.059e-06,
"loss": 1.9933,
"reward": 0.0762648843228817,
"reward_std": 0.1246400736272335,
"rewards/sudoku_reward_func": 0.0762648805975914,
"step": 353,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00035417708854427213,
"grad_norm": 20.7829532623291,
"learning_rate": 1.062e-06,
"loss": 0.0083,
"step": 354
},
{
"epoch": 0.0003551775887943972,
"grad_norm": 17.963436126708984,
"learning_rate": 1.065e-06,
"loss": 1.9007,
"step": 355
},
{
"epoch": 0.00035617808904452226,
"grad_norm": 19.813440322875977,
"learning_rate": 1.068e-06,
"loss": 2.9544,
"step": 356
},
{
"epoch": 0.0003571785892946473,
"grad_norm": 18.973485946655273,
"learning_rate": 1.071e-06,
"loss": 1.6812,
"step": 357
},
{
"epoch": 0.0003581790895447724,
"grad_norm": 21.765804290771484,
"learning_rate": 1.074e-06,
"loss": -0.2427,
"step": 358
},
{
"epoch": 0.00035917958979489743,
"grad_norm": 18.01424789428711,
"learning_rate": 1.077e-06,
"loss": 1.7125,
"step": 359
},
{
"epoch": 0.00036018009004502253,
"grad_norm": 20.650293350219727,
"learning_rate": 1.08e-06,
"loss": 2.7495,
"step": 360
},
{
"completion_length": 232.1666717529297,
"epoch": 0.00036118059029514757,
"grad_norm": 17.942243576049805,
"learning_rate": 1.083e-06,
"loss": 5.3031,
"reward": 0.10309194773435593,
"reward_std": 0.14464747160673141,
"rewards/sudoku_reward_func": 0.10309194400906563,
"step": 361,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00036218109054527266,
"grad_norm": 18.253305435180664,
"learning_rate": 1.086e-06,
"loss": 5.155,
"step": 362
},
{
"epoch": 0.0003631815907953977,
"grad_norm": 20.967313766479492,
"learning_rate": 1.089e-06,
"loss": 5.5673,
"step": 363
},
{
"epoch": 0.00036418209104552274,
"grad_norm": 21.44413185119629,
"learning_rate": 1.092e-06,
"loss": 4.7692,
"step": 364
},
{
"epoch": 0.00036518259129564784,
"grad_norm": 19.41208267211914,
"learning_rate": 1.095e-06,
"loss": 5.0225,
"step": 365
},
{
"epoch": 0.0003661830915457729,
"grad_norm": 18.210580825805664,
"learning_rate": 1.098e-06,
"loss": 4.8703,
"step": 366
},
{
"epoch": 0.00036718359179589797,
"grad_norm": 20.35464859008789,
"learning_rate": 1.101e-06,
"loss": 5.2985,
"step": 367
},
{
"epoch": 0.000368184092046023,
"grad_norm": 24.97935676574707,
"learning_rate": 1.104e-06,
"loss": 4.5277,
"step": 368
},
{
"completion_length": 240.83333587646484,
"epoch": 0.00036918459229614805,
"grad_norm": 11.803683280944824,
"learning_rate": 1.107e-06,
"loss": -7.3946,
"reward": 0.0628720261156559,
"reward_std": 0.07477889209985733,
"rewards/sudoku_reward_func": 0.0628720223903656,
"step": 369,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00037018509254627314,
"grad_norm": 10.526047706604004,
"learning_rate": 1.11e-06,
"loss": -6.4134,
"step": 370
},
{
"epoch": 0.0003711855927963982,
"grad_norm": 10.823116302490234,
"learning_rate": 1.113e-06,
"loss": -6.2425,
"step": 371
},
{
"epoch": 0.0003721860930465233,
"grad_norm": 11.820639610290527,
"learning_rate": 1.116e-06,
"loss": -6.4356,
"step": 372
},
{
"epoch": 0.0003731865932966483,
"grad_norm": 12.025969505310059,
"learning_rate": 1.119e-06,
"loss": -7.4929,
"step": 373
},
{
"epoch": 0.0003741870935467734,
"grad_norm": 10.814313888549805,
"learning_rate": 1.122e-06,
"loss": -6.5665,
"step": 374
},
{
"epoch": 0.00037518759379689845,
"grad_norm": 10.791754722595215,
"learning_rate": 1.125e-06,
"loss": -6.3787,
"step": 375
},
{
"epoch": 0.0003761880940470235,
"grad_norm": 12.164316177368164,
"learning_rate": 1.128e-06,
"loss": -6.5785,
"step": 376
},
{
"completion_length": 235.81250762939453,
"epoch": 0.0003771885942971486,
"grad_norm": 20.148202896118164,
"learning_rate": 1.131e-06,
"loss": 7.9792,
"reward": 0.1282242238521576,
"reward_std": 0.1380881443619728,
"rewards/sudoku_reward_func": 0.1282242089509964,
"step": 377,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0003781890945472736,
"grad_norm": 20.059968948364258,
"learning_rate": 1.134e-06,
"loss": 7.7242,
"step": 378
},
{
"epoch": 0.0003791895947973987,
"grad_norm": 19.917285919189453,
"learning_rate": 1.137e-06,
"loss": 6.6093,
"step": 379
},
{
"epoch": 0.00038019009504752376,
"grad_norm": 18.866975784301758,
"learning_rate": 1.14e-06,
"loss": 5.9883,
"step": 380
},
{
"epoch": 0.00038119059529764885,
"grad_norm": 20.893918991088867,
"learning_rate": 1.1430000000000001e-06,
"loss": 7.736,
"step": 381
},
{
"epoch": 0.0003821910955477739,
"grad_norm": 20.95090103149414,
"learning_rate": 1.1460000000000001e-06,
"loss": 7.446,
"step": 382
},
{
"epoch": 0.00038319159579789893,
"grad_norm": 18.311017990112305,
"learning_rate": 1.1490000000000001e-06,
"loss": 6.3593,
"step": 383
},
{
"epoch": 0.000384192096048024,
"grad_norm": 18.9256591796875,
"learning_rate": 1.1520000000000002e-06,
"loss": 5.6758,
"step": 384
},
{
"completion_length": 242.7291717529297,
"epoch": 0.00038519259629814906,
"grad_norm": 15.097347259521484,
"learning_rate": 1.155e-06,
"loss": -5.6666,
"reward": 0.10884891077876091,
"reward_std": 0.12249365448951721,
"rewards/sudoku_reward_func": 0.10884890332818031,
"step": 385,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00038619309654827416,
"grad_norm": 14.708651542663574,
"learning_rate": 1.158e-06,
"loss": -4.3701,
"step": 386
},
{
"epoch": 0.0003871935967983992,
"grad_norm": 15.039358139038086,
"learning_rate": 1.161e-06,
"loss": -3.7308,
"step": 387
},
{
"epoch": 0.00038819409704852424,
"grad_norm": 15.061366081237793,
"learning_rate": 1.164e-06,
"loss": -4.6885,
"step": 388
},
{
"epoch": 0.00038919459729864933,
"grad_norm": 16.103355407714844,
"learning_rate": 1.167e-06,
"loss": -5.7928,
"step": 389
},
{
"epoch": 0.00039019509754877437,
"grad_norm": 14.791950225830078,
"learning_rate": 1.17e-06,
"loss": -4.5289,
"step": 390
},
{
"epoch": 0.00039119559779889946,
"grad_norm": 14.480550765991211,
"learning_rate": 1.173e-06,
"loss": -3.8723,
"step": 391
},
{
"epoch": 0.0003921960980490245,
"grad_norm": 15.546393394470215,
"learning_rate": 1.176e-06,
"loss": -4.8869,
"step": 392
},
{
"completion_length": 242.83333587646484,
"epoch": 0.0003931965982991496,
"grad_norm": 14.729490280151367,
"learning_rate": 1.179e-06,
"loss": -5.427,
"reward": 0.09122851490974426,
"reward_std": 0.12679633498191833,
"rewards/sudoku_reward_func": 0.09122850745916367,
"step": 393,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00039419709854927464,
"grad_norm": 13.33181095123291,
"learning_rate": 1.182e-06,
"loss": -5.2525,
"step": 394
},
{
"epoch": 0.0003951975987993997,
"grad_norm": 14.865705490112305,
"learning_rate": 1.185e-06,
"loss": -5.7037,
"step": 395
},
{
"epoch": 0.00039619809904952477,
"grad_norm": 12.856032371520996,
"learning_rate": 1.188e-06,
"loss": -6.5057,
"step": 396
},
{
"epoch": 0.0003971985992996498,
"grad_norm": 16.378400802612305,
"learning_rate": 1.1910000000000001e-06,
"loss": -5.5324,
"step": 397
},
{
"epoch": 0.0003981990995497749,
"grad_norm": 14.952813148498535,
"learning_rate": 1.1940000000000001e-06,
"loss": -5.4037,
"step": 398
},
{
"epoch": 0.00039919959979989994,
"grad_norm": 13.535036087036133,
"learning_rate": 1.1970000000000001e-06,
"loss": -5.8358,
"step": 399
},
{
"epoch": 0.00040020010005002504,
"grad_norm": 12.8466157913208,
"learning_rate": 1.2000000000000002e-06,
"loss": -6.6436,
"step": 400
},
{
"completion_length": 246.3541717529297,
"epoch": 0.0004012006003001501,
"grad_norm": 12.46061897277832,
"learning_rate": 1.2030000000000002e-06,
"loss": 0.0263,
"reward": 0.06971500627696514,
"reward_std": 0.10182120278477669,
"rewards/sudoku_reward_func": 0.06971500627696514,
"step": 401,
"zero_std_ratio": 0.125
},
{
"epoch": 0.0004022011005502751,
"grad_norm": 13.909893989562988,
"learning_rate": 1.2060000000000002e-06,
"loss": -0.1189,
"step": 402
},
{
"epoch": 0.0004032016008004002,
"grad_norm": 14.540249824523926,
"learning_rate": 1.2090000000000002e-06,
"loss": -0.3745,
"step": 403
},
{
"epoch": 0.00040420210105052525,
"grad_norm": 15.110182762145996,
"learning_rate": 1.2120000000000002e-06,
"loss": -0.7959,
"step": 404
},
{
"epoch": 0.00040520260130065034,
"grad_norm": 12.969428062438965,
"learning_rate": 1.215e-06,
"loss": -0.0583,
"step": 405
},
{
"epoch": 0.0004062031015507754,
"grad_norm": 15.375487327575684,
"learning_rate": 1.218e-06,
"loss": -0.2408,
"step": 406
},
{
"epoch": 0.0004072036018009004,
"grad_norm": 14.822982788085938,
"learning_rate": 1.221e-06,
"loss": -0.4963,
"step": 407
},
{
"epoch": 0.0004082041020510255,
"grad_norm": 16.54659080505371,
"learning_rate": 1.224e-06,
"loss": -0.8767,
"step": 408
},
{
"completion_length": 248.64583587646484,
"epoch": 0.00040920460230115056,
"grad_norm": 20.481712341308594,
"learning_rate": 1.2269999999999999e-06,
"loss": 4.346,
"reward": 0.11594367399811745,
"reward_std": 0.1314607784152031,
"rewards/sudoku_reward_func": 0.11594367027282715,
"step": 409,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00041020510255127565,
"grad_norm": 22.77071762084961,
"learning_rate": 1.2299999999999999e-06,
"loss": 4.7742,
"step": 410
},
{
"epoch": 0.0004112056028014007,
"grad_norm": 17.92951774597168,
"learning_rate": 1.2329999999999999e-06,
"loss": 1.959,
"step": 411
},
{
"epoch": 0.0004122061030515258,
"grad_norm": 18.547788619995117,
"learning_rate": 1.236e-06,
"loss": 1.3136,
"step": 412
},
{
"epoch": 0.0004132066033016508,
"grad_norm": 18.12384605407715,
"learning_rate": 1.239e-06,
"loss": 4.2181,
"step": 413
},
{
"epoch": 0.00041420710355177586,
"grad_norm": 21.05364418029785,
"learning_rate": 1.242e-06,
"loss": 4.8127,
"step": 414
},
{
"epoch": 0.00041520760380190096,
"grad_norm": 19.076353073120117,
"learning_rate": 1.245e-06,
"loss": 2.0259,
"step": 415
},
{
"epoch": 0.000416208104052026,
"grad_norm": 19.371305465698242,
"learning_rate": 1.248e-06,
"loss": 1.1727,
"step": 416
},
{
"completion_length": 238.6666717529297,
"epoch": 0.0004172086043021511,
"grad_norm": 24.886581420898438,
"learning_rate": 1.251e-06,
"loss": 2.8817,
"reward": 0.1366717889904976,
"reward_std": 0.17538512498140335,
"rewards/sudoku_reward_func": 0.136671781539917,
"step": 417,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00041820910455227613,
"grad_norm": 24.532011032104492,
"learning_rate": 1.254e-06,
"loss": 0.4373,
"step": 418
},
{
"epoch": 0.0004192096048024012,
"grad_norm": 21.299440383911133,
"learning_rate": 1.257e-06,
"loss": 1.7123,
"step": 419
},
{
"epoch": 0.00042021010505252627,
"grad_norm": 22.578105926513672,
"learning_rate": 1.26e-06,
"loss": -0.2846,
"step": 420
},
{
"epoch": 0.0004212106053026513,
"grad_norm": 25.076955795288086,
"learning_rate": 1.263e-06,
"loss": 2.7225,
"step": 421
},
{
"epoch": 0.0004222111055527764,
"grad_norm": 24.209144592285156,
"learning_rate": 1.266e-06,
"loss": 0.1744,
"step": 422
},
{
"epoch": 0.00042321160580290144,
"grad_norm": 20.4233455657959,
"learning_rate": 1.269e-06,
"loss": 1.5553,
"step": 423
},
{
"epoch": 0.00042421210605302653,
"grad_norm": 23.477943420410156,
"learning_rate": 1.272e-06,
"loss": -0.4778,
"step": 424
},
{
"completion_length": 239.25,
"epoch": 0.00042521260630315157,
"grad_norm": 22.97435188293457,
"learning_rate": 1.275e-06,
"loss": -10.5125,
"reward": 0.12210648879408836,
"reward_std": 0.14136488735675812,
"rewards/sudoku_reward_func": 0.12210648134350777,
"step": 425,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0004262131065532766,
"grad_norm": 23.24193572998047,
"learning_rate": 1.278e-06,
"loss": -10.8064,
"step": 426
},
{
"epoch": 0.0004272136068034017,
"grad_norm": 23.464582443237305,
"learning_rate": 1.281e-06,
"loss": -10.3222,
"step": 427
},
{
"epoch": 0.00042821410705352675,
"grad_norm": 26.24444007873535,
"learning_rate": 1.284e-06,
"loss": -10.2436,
"step": 428
},
{
"epoch": 0.00042921460730365184,
"grad_norm": 23.67002296447754,
"learning_rate": 1.287e-06,
"loss": -10.6952,
"step": 429
},
{
"epoch": 0.0004302151075537769,
"grad_norm": 23.789400100708008,
"learning_rate": 1.29e-06,
"loss": -11.0674,
"step": 430
},
{
"epoch": 0.000431215607803902,
"grad_norm": 23.746618270874023,
"learning_rate": 1.293e-06,
"loss": -10.7141,
"step": 431
},
{
"epoch": 0.000432216108054027,
"grad_norm": 26.598947525024414,
"learning_rate": 1.296e-06,
"loss": -10.6483,
"step": 432
},
{
"completion_length": 243.02083587646484,
"epoch": 0.00043321660830415205,
"grad_norm": 21.561887741088867,
"learning_rate": 1.299e-06,
"loss": -4.2898,
"reward": 0.16087963432073593,
"reward_std": 0.14164353907108307,
"rewards/sudoku_reward_func": 0.16087962687015533,
"step": 433,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00043421710855427715,
"grad_norm": 19.212657928466797,
"learning_rate": 1.302e-06,
"loss": -3.3441,
"step": 434
},
{
"epoch": 0.0004352176088044022,
"grad_norm": 22.98586654663086,
"learning_rate": 1.305e-06,
"loss": -3.923,
"step": 435
},
{
"epoch": 0.0004362181090545273,
"grad_norm": 23.236225128173828,
"learning_rate": 1.308e-06,
"loss": -5.2046,
"step": 436
},
{
"epoch": 0.0004372186093046523,
"grad_norm": 21.43988800048828,
"learning_rate": 1.311e-06,
"loss": -4.4946,
"step": 437
},
{
"epoch": 0.0004382191095547774,
"grad_norm": 19.148176193237305,
"learning_rate": 1.314e-06,
"loss": -3.5995,
"step": 438
},
{
"epoch": 0.00043921960980490245,
"grad_norm": 24.136995315551758,
"learning_rate": 1.317e-06,
"loss": -4.2286,
"step": 439
},
{
"epoch": 0.0004402201100550275,
"grad_norm": 23.57278823852539,
"learning_rate": 1.32e-06,
"loss": -5.5188,
"step": 440
},
{
"completion_length": 233.2916717529297,
"epoch": 0.0004412206103051526,
"grad_norm": 14.671692848205566,
"learning_rate": 1.323e-06,
"loss": -4.0371,
"reward": 0.09998046606779099,
"reward_std": 0.10778994113206863,
"rewards/sudoku_reward_func": 0.09998045861721039,
"step": 441,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0004422211105552776,
"grad_norm": 14.384123802185059,
"learning_rate": 1.326e-06,
"loss": -3.9219,
"step": 442
},
{
"epoch": 0.0004432216108054027,
"grad_norm": 17.219676971435547,
"learning_rate": 1.3290000000000001e-06,
"loss": -3.6826,
"step": 443
},
{
"epoch": 0.00044422211105552776,
"grad_norm": 11.303711891174316,
"learning_rate": 1.3320000000000001e-06,
"loss": -3.5771,
"step": 444
},
{
"epoch": 0.0004452226113056528,
"grad_norm": 11.632588386535645,
"learning_rate": 1.3350000000000001e-06,
"loss": -4.084,
"step": 445
},
{
"epoch": 0.0004462231115557779,
"grad_norm": 13.563810348510742,
"learning_rate": 1.3380000000000001e-06,
"loss": -4.0316,
"step": 446
},
{
"epoch": 0.00044722361180590293,
"grad_norm": 16.55260467529297,
"learning_rate": 1.3410000000000002e-06,
"loss": -3.7837,
"step": 447
},
{
"epoch": 0.00044822411205602803,
"grad_norm": 11.298389434814453,
"learning_rate": 1.344e-06,
"loss": -3.5847,
"step": 448
},
{
"completion_length": 247.14583587646484,
"epoch": 0.00044922461230615307,
"grad_norm": 25.21985626220703,
"learning_rate": 1.347e-06,
"loss": 6.0425,
"reward": 0.1645434945821762,
"reward_std": 0.15197737514972687,
"rewards/sudoku_reward_func": 0.1645434945821762,
"step": 449,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00045022511255627816,
"grad_norm": 23.20073699951172,
"learning_rate": 1.35e-06,
"loss": 5.8398,
"step": 450
},
{
"epoch": 0.0004512256128064032,
"grad_norm": 24.370487213134766,
"learning_rate": 1.353e-06,
"loss": 6.6032,
"step": 451
},
{
"epoch": 0.00045222611305652824,
"grad_norm": 23.520915985107422,
"learning_rate": 1.356e-06,
"loss": 5.9308,
"step": 452
},
{
"epoch": 0.00045322661330665333,
"grad_norm": 25.149738311767578,
"learning_rate": 1.359e-06,
"loss": 6.0518,
"step": 453
},
{
"epoch": 0.0004542271135567784,
"grad_norm": 22.509672164916992,
"learning_rate": 1.362e-06,
"loss": 5.8463,
"step": 454
},
{
"epoch": 0.00045522761380690347,
"grad_norm": 23.885211944580078,
"learning_rate": 1.365e-06,
"loss": 6.5497,
"step": 455
},
{
"epoch": 0.0004562281140570285,
"grad_norm": 23.235668182373047,
"learning_rate": 1.368e-06,
"loss": 5.8002,
"step": 456
},
{
"completion_length": 233.52083587646484,
"epoch": 0.0004572286143071536,
"grad_norm": 13.065637588500977,
"learning_rate": 1.371e-06,
"loss": -4.969,
"reward": 0.11541005969047546,
"reward_std": 0.12353448569774628,
"rewards/sudoku_reward_func": 0.11541005223989487,
"step": 457,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00045822911455727864,
"grad_norm": 15.100817680358887,
"learning_rate": 1.374e-06,
"loss": -4.0103,
"step": 458
},
{
"epoch": 0.0004592296148074037,
"grad_norm": 13.022795677185059,
"learning_rate": 1.3770000000000001e-06,
"loss": -4.4582,
"step": 459
},
{
"epoch": 0.0004602301150575288,
"grad_norm": 15.367870330810547,
"learning_rate": 1.3800000000000001e-06,
"loss": -4.6818,
"step": 460
},
{
"epoch": 0.0004612306153076538,
"grad_norm": 13.363811492919922,
"learning_rate": 1.3830000000000001e-06,
"loss": -5.0405,
"step": 461
},
{
"epoch": 0.0004622311155577789,
"grad_norm": 22.934417724609375,
"learning_rate": 1.3860000000000002e-06,
"loss": -4.0986,
"step": 462
},
{
"epoch": 0.00046323161580790395,
"grad_norm": 14.163771629333496,
"learning_rate": 1.3890000000000002e-06,
"loss": -4.5348,
"step": 463
},
{
"epoch": 0.000464232116058029,
"grad_norm": 14.093031883239746,
"learning_rate": 1.3920000000000002e-06,
"loss": -4.6754,
"step": 464
},
{
"completion_length": 238.33333587646484,
"epoch": 0.0004652326163081541,
"grad_norm": 18.271038055419922,
"learning_rate": 1.3950000000000002e-06,
"loss": 3.2038,
"reward": 0.1049107164144516,
"reward_std": 0.11827318742871284,
"rewards/sudoku_reward_func": 0.1049107164144516,
"step": 465,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0004662331165582791,
"grad_norm": 20.824806213378906,
"learning_rate": 1.3980000000000002e-06,
"loss": 2.997,
"step": 466
},
{
"epoch": 0.0004672336168084042,
"grad_norm": 16.93488883972168,
"learning_rate": 1.401e-06,
"loss": 2.7855,
"step": 467
},
{
"epoch": 0.00046823411705852925,
"grad_norm": 17.172334671020508,
"learning_rate": 1.404e-06,
"loss": 2.7988,
"step": 468
},
{
"epoch": 0.00046923461730865435,
"grad_norm": 18.538854598999023,
"learning_rate": 1.407e-06,
"loss": 3.0902,
"step": 469
},
{
"epoch": 0.0004702351175587794,
"grad_norm": 20.743358612060547,
"learning_rate": 1.41e-06,
"loss": 2.799,
"step": 470
},
{
"epoch": 0.00047123561780890443,
"grad_norm": 17.690128326416016,
"learning_rate": 1.4129999999999999e-06,
"loss": 2.6563,
"step": 471
},
{
"epoch": 0.0004722361180590295,
"grad_norm": 17.46112632751465,
"learning_rate": 1.4159999999999999e-06,
"loss": 2.5592,
"step": 472
},
{
"completion_length": 234.6875,
"epoch": 0.00047323661830915456,
"grad_norm": 14.19119930267334,
"learning_rate": 1.4189999999999999e-06,
"loss": 5.9519,
"reward": 0.1398809626698494,
"reward_std": 0.12144653871655464,
"rewards/sudoku_reward_func": 0.1398809552192688,
"step": 473,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00047423711855927966,
"grad_norm": 15.279950141906738,
"learning_rate": 1.422e-06,
"loss": 6.337,
"step": 474
},
{
"epoch": 0.0004752376188094047,
"grad_norm": 15.904949188232422,
"learning_rate": 1.425e-06,
"loss": 6.6797,
"step": 475
},
{
"epoch": 0.0004762381190595298,
"grad_norm": 15.839500427246094,
"learning_rate": 1.428e-06,
"loss": 4.1356,
"step": 476
},
{
"epoch": 0.00047723861930965483,
"grad_norm": 14.34893798828125,
"learning_rate": 1.431e-06,
"loss": 5.7966,
"step": 477
},
{
"epoch": 0.00047823911955977987,
"grad_norm": 15.247032165527344,
"learning_rate": 1.434e-06,
"loss": 6.1297,
"step": 478
},
{
"epoch": 0.00047923961980990496,
"grad_norm": 17.156147003173828,
"learning_rate": 1.437e-06,
"loss": 6.4489,
"step": 479
},
{
"epoch": 0.00048024012006003,
"grad_norm": 14.10362720489502,
"learning_rate": 1.44e-06,
"loss": 4.0195,
"step": 480
},
{
"completion_length": 244.4791717529297,
"epoch": 0.0004812406203101551,
"grad_norm": 12.527475357055664,
"learning_rate": 1.443e-06,
"loss": 3.0738,
"reward": 0.11999834701418877,
"reward_std": 0.13045284524559975,
"rewards/sudoku_reward_func": 0.11999834701418877,
"step": 481,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00048224112056028014,
"grad_norm": 13.442937850952148,
"learning_rate": 1.446e-06,
"loss": 2.4034,
"step": 482
},
{
"epoch": 0.00048324162081040523,
"grad_norm": 16.119911193847656,
"learning_rate": 1.449e-06,
"loss": 4.0128,
"step": 483
},
{
"epoch": 0.00048424212106053027,
"grad_norm": 17.460521697998047,
"learning_rate": 1.452e-06,
"loss": 3.8593,
"step": 484
},
{
"epoch": 0.0004852426213106553,
"grad_norm": 13.456750869750977,
"learning_rate": 1.455e-06,
"loss": 2.987,
"step": 485
},
{
"epoch": 0.0004862431215607804,
"grad_norm": 13.413694381713867,
"learning_rate": 1.458e-06,
"loss": 2.3376,
"step": 486
},
{
"epoch": 0.00048724362181090544,
"grad_norm": 17.877710342407227,
"learning_rate": 1.461e-06,
"loss": 3.8962,
"step": 487
},
{
"epoch": 0.00048824412206103054,
"grad_norm": 18.55176544189453,
"learning_rate": 1.464e-06,
"loss": 3.7211,
"step": 488
},
{
"completion_length": 226.89583587646484,
"epoch": 0.0004892446223111556,
"grad_norm": 15.29721736907959,
"learning_rate": 1.467e-06,
"loss": 1.0551,
"reward": 0.11888227611780167,
"reward_std": 0.14588766545057297,
"rewards/sudoku_reward_func": 0.11888227611780167,
"step": 489,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0004902451225612806,
"grad_norm": 14.319069862365723,
"learning_rate": 1.4700000000000001e-06,
"loss": 0.2161,
"step": 490
},
{
"epoch": 0.0004912456228114057,
"grad_norm": 16.261802673339844,
"learning_rate": 1.473e-06,
"loss": -1.4613,
"step": 491
},
{
"epoch": 0.0004922461230615308,
"grad_norm": 19.536277770996094,
"learning_rate": 1.476e-06,
"loss": -1.9982,
"step": 492
},
{
"epoch": 0.0004932466233116558,
"grad_norm": 14.717618942260742,
"learning_rate": 1.479e-06,
"loss": 0.9957,
"step": 493
},
{
"epoch": 0.0004942471235617809,
"grad_norm": 14.713449478149414,
"learning_rate": 1.482e-06,
"loss": 0.1798,
"step": 494
},
{
"epoch": 0.000495247623811906,
"grad_norm": 15.294724464416504,
"learning_rate": 1.485e-06,
"loss": -1.6165,
"step": 495
},
{
"epoch": 0.000496248124062031,
"grad_norm": 18.315418243408203,
"learning_rate": 1.488e-06,
"loss": -2.0018,
"step": 496
},
{
"completion_length": 246.4166717529297,
"epoch": 0.0004972486243121561,
"grad_norm": 19.199636459350586,
"learning_rate": 1.491e-06,
"loss": 0.9573,
"reward": 0.17453178763389587,
"reward_std": 0.13891105726361275,
"rewards/sudoku_reward_func": 0.17453177273273468,
"step": 497,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0004982491245622812,
"grad_norm": 33.779850006103516,
"learning_rate": 1.494e-06,
"loss": 2.7422,
"step": 498
},
{
"epoch": 0.0004992496248124062,
"grad_norm": 19.790515899658203,
"learning_rate": 1.497e-06,
"loss": 0.8765,
"step": 499
},
{
"epoch": 0.0005002501250625312,
"grad_norm": 36.270423889160156,
"learning_rate": 1.5e-06,
"loss": 1.9353,
"step": 500
},
{
"epoch": 0.0005012506253126563,
"grad_norm": 18.44716453552246,
"learning_rate": 1.503e-06,
"loss": 0.8864,
"step": 501
},
{
"epoch": 0.0005022511255627814,
"grad_norm": 32.266937255859375,
"learning_rate": 1.506e-06,
"loss": 2.6481,
"step": 502
},
{
"epoch": 0.0005032516258129064,
"grad_norm": 20.195646286010742,
"learning_rate": 1.509e-06,
"loss": 0.6608,
"step": 503
},
{
"epoch": 0.0005042521260630315,
"grad_norm": 24.758575439453125,
"learning_rate": 1.512e-06,
"loss": 1.8413,
"step": 504
},
{
"completion_length": 235.83333587646484,
"epoch": 0.0005052526263131566,
"grad_norm": 17.032028198242188,
"learning_rate": 1.5150000000000001e-06,
"loss": -4.68,
"reward": 0.15310095250606537,
"reward_std": 0.15669506788253784,
"rewards/sudoku_reward_func": 0.15310094505548477,
"step": 505,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005062531265632817,
"grad_norm": 17.817955017089844,
"learning_rate": 1.5180000000000001e-06,
"loss": -5.7518,
"step": 506
},
{
"epoch": 0.0005072536268134067,
"grad_norm": 20.450265884399414,
"learning_rate": 1.5210000000000001e-06,
"loss": -4.9771,
"step": 507
},
{
"epoch": 0.0005082541270635318,
"grad_norm": 14.851226806640625,
"learning_rate": 1.5240000000000001e-06,
"loss": -4.6316,
"step": 508
},
{
"epoch": 0.0005092546273136569,
"grad_norm": 17.262596130371094,
"learning_rate": 1.5270000000000002e-06,
"loss": -4.953,
"step": 509
},
{
"epoch": 0.0005102551275637818,
"grad_norm": 17.57811737060547,
"learning_rate": 1.53e-06,
"loss": -5.7844,
"step": 510
},
{
"epoch": 0.0005112556278139069,
"grad_norm": 19.80550765991211,
"learning_rate": 1.533e-06,
"loss": -5.0486,
"step": 511
},
{
"epoch": 0.000512256128064032,
"grad_norm": 16.75063133239746,
"learning_rate": 1.536e-06,
"loss": -4.7158,
"step": 512
},
{
"completion_length": 235.68750762939453,
"epoch": 0.0005132566283141571,
"grad_norm": 15.94558048248291,
"learning_rate": 1.539e-06,
"loss": 0.6142,
"reward": 0.11516203731298447,
"reward_std": 0.11813021078705788,
"rewards/sudoku_reward_func": 0.11516203358769417,
"step": 513,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005142571285642821,
"grad_norm": 16.774120330810547,
"learning_rate": 1.542e-06,
"loss": 0.9026,
"step": 514
},
{
"epoch": 0.0005152576288144072,
"grad_norm": 17.249300003051758,
"learning_rate": 1.545e-06,
"loss": 0.3232,
"step": 515
},
{
"epoch": 0.0005162581290645323,
"grad_norm": 15.842235565185547,
"learning_rate": 1.548e-06,
"loss": 0.4665,
"step": 516
},
{
"epoch": 0.0005172586293146573,
"grad_norm": 15.682981491088867,
"learning_rate": 1.551e-06,
"loss": 0.4711,
"step": 517
},
{
"epoch": 0.0005182591295647824,
"grad_norm": 17.212305068969727,
"learning_rate": 1.554e-06,
"loss": 0.7602,
"step": 518
},
{
"epoch": 0.0005192596298149075,
"grad_norm": 16.74599838256836,
"learning_rate": 1.557e-06,
"loss": 0.1957,
"step": 519
},
{
"epoch": 0.0005202601300650326,
"grad_norm": 15.095890998840332,
"learning_rate": 1.56e-06,
"loss": 0.2926,
"step": 520
},
{
"completion_length": 236.02083587646484,
"epoch": 0.0005212606303151576,
"grad_norm": 18.716089248657227,
"learning_rate": 1.5630000000000001e-06,
"loss": 2.8831,
"reward": 0.13464632630348206,
"reward_std": 0.13019248098134995,
"rewards/sudoku_reward_func": 0.13464632630348206,
"step": 521,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005222611305652826,
"grad_norm": 18.289785385131836,
"learning_rate": 1.5660000000000001e-06,
"loss": 3.3606,
"step": 522
},
{
"epoch": 0.0005232616308154077,
"grad_norm": 17.296327590942383,
"learning_rate": 1.5690000000000001e-06,
"loss": 2.1803,
"step": 523
},
{
"epoch": 0.0005242621310655327,
"grad_norm": 16.455181121826172,
"learning_rate": 1.5720000000000002e-06,
"loss": 3.8618,
"step": 524
},
{
"epoch": 0.0005252626313156578,
"grad_norm": 17.295499801635742,
"learning_rate": 1.5750000000000002e-06,
"loss": 2.8034,
"step": 525
},
{
"epoch": 0.0005262631315657829,
"grad_norm": 17.529541015625,
"learning_rate": 1.5780000000000002e-06,
"loss": 3.2605,
"step": 526
},
{
"epoch": 0.000527263631815908,
"grad_norm": 16.862716674804688,
"learning_rate": 1.5810000000000002e-06,
"loss": 2.0709,
"step": 527
},
{
"epoch": 0.000528264132066033,
"grad_norm": 16.12912368774414,
"learning_rate": 1.5840000000000002e-06,
"loss": 3.6643,
"step": 528
},
{
"completion_length": 236.33334350585938,
"epoch": 0.0005292646323161581,
"grad_norm": 16.581037521362305,
"learning_rate": 1.5870000000000002e-06,
"loss": 6.703,
"reward": 0.12632275372743607,
"reward_std": 0.14028233289718628,
"rewards/sudoku_reward_func": 0.12632274255156517,
"step": 529,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005302651325662832,
"grad_norm": 19.885190963745117,
"learning_rate": 1.59e-06,
"loss": 8.2868,
"step": 530
},
{
"epoch": 0.0005312656328164082,
"grad_norm": 17.113710403442383,
"learning_rate": 1.593e-06,
"loss": 6.2016,
"step": 531
},
{
"epoch": 0.0005322661330665333,
"grad_norm": 16.769590377807617,
"learning_rate": 1.596e-06,
"loss": 6.119,
"step": 532
},
{
"epoch": 0.0005332666333166584,
"grad_norm": 15.997132301330566,
"learning_rate": 1.599e-06,
"loss": 6.4742,
"step": 533
},
{
"epoch": 0.0005342671335667833,
"grad_norm": 21.274337768554688,
"learning_rate": 1.602e-06,
"loss": 8.0824,
"step": 534
},
{
"epoch": 0.0005352676338169084,
"grad_norm": 16.52000617980957,
"learning_rate": 1.605e-06,
"loss": 5.991,
"step": 535
},
{
"epoch": 0.0005362681340670335,
"grad_norm": 16.728483200073242,
"learning_rate": 1.608e-06,
"loss": 5.9626,
"step": 536
},
{
"completion_length": 246.02083587646484,
"epoch": 0.0005372686343171586,
"grad_norm": 19.841209411621094,
"learning_rate": 1.6110000000000001e-06,
"loss": 4.3933,
"reward": 0.17916516959667206,
"reward_std": 0.16655328124761581,
"rewards/sudoku_reward_func": 0.17916516214609146,
"step": 537,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005382691345672836,
"grad_norm": 21.26980972290039,
"learning_rate": 1.6140000000000001e-06,
"loss": 3.4108,
"step": 538
},
{
"epoch": 0.0005392696348174087,
"grad_norm": 18.89370346069336,
"learning_rate": 1.6170000000000001e-06,
"loss": 1.9086,
"step": 539
},
{
"epoch": 0.0005402701350675338,
"grad_norm": 19.744796752929688,
"learning_rate": 1.6200000000000002e-06,
"loss": 1.4578,
"step": 540
},
{
"epoch": 0.0005412706353176588,
"grad_norm": 21.075912475585938,
"learning_rate": 1.6230000000000002e-06,
"loss": 4.0819,
"step": 541
},
{
"epoch": 0.0005422711355677839,
"grad_norm": 19.98610496520996,
"learning_rate": 1.6260000000000002e-06,
"loss": 3.0692,
"step": 542
},
{
"epoch": 0.000543271635817909,
"grad_norm": 19.12093162536621,
"learning_rate": 1.6290000000000002e-06,
"loss": 1.5771,
"step": 543
},
{
"epoch": 0.0005442721360680341,
"grad_norm": 20.679964065551758,
"learning_rate": 1.6320000000000002e-06,
"loss": 1.1791,
"step": 544
},
{
"completion_length": 228.52083587646484,
"epoch": 0.000545272636318159,
"grad_norm": 15.315961837768555,
"learning_rate": 1.6350000000000002e-06,
"loss": -6.7685,
"reward": 0.16216104477643967,
"reward_std": 0.12737327441573143,
"rewards/sudoku_reward_func": 0.16216104477643967,
"step": 545,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005462731365682841,
"grad_norm": 16.78080940246582,
"learning_rate": 1.6380000000000002e-06,
"loss": -6.3753,
"step": 546
},
{
"epoch": 0.0005472736368184092,
"grad_norm": 14.62249755859375,
"learning_rate": 1.6410000000000003e-06,
"loss": -5.4868,
"step": 547
},
{
"epoch": 0.0005482741370685342,
"grad_norm": 15.880613327026367,
"learning_rate": 1.6440000000000003e-06,
"loss": -5.5422,
"step": 548
},
{
"epoch": 0.0005492746373186593,
"grad_norm": 15.091645240783691,
"learning_rate": 1.647e-06,
"loss": -6.8809,
"step": 549
},
{
"epoch": 0.0005502751375687844,
"grad_norm": 16.886802673339844,
"learning_rate": 1.65e-06,
"loss": -6.6144,
"step": 550
},
{
"epoch": 0.0005512756378189095,
"grad_norm": 15.56815242767334,
"learning_rate": 1.653e-06,
"loss": -5.6943,
"step": 551
},
{
"epoch": 0.0005522761380690345,
"grad_norm": 15.692801475524902,
"learning_rate": 1.6560000000000001e-06,
"loss": -5.7414,
"step": 552
},
{
"completion_length": 225.77083587646484,
"epoch": 0.0005532766383191596,
"grad_norm": 14.722847938537598,
"learning_rate": 1.6590000000000001e-06,
"loss": -2.6607,
"reward": 0.16656897217035294,
"reward_std": 0.11782306060194969,
"rewards/sudoku_reward_func": 0.16656896471977234,
"step": 553,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005542771385692847,
"grad_norm": 18.4454402923584,
"learning_rate": 1.6620000000000001e-06,
"loss": -1.9324,
"step": 554
},
{
"epoch": 0.0005552776388194097,
"grad_norm": 13.558207511901855,
"learning_rate": 1.6650000000000002e-06,
"loss": -2.5447,
"step": 555
},
{
"epoch": 0.0005562781390695348,
"grad_norm": 17.83416175842285,
"learning_rate": 1.6680000000000002e-06,
"loss": -0.7552,
"step": 556
},
{
"epoch": 0.0005572786393196598,
"grad_norm": 14.537623405456543,
"learning_rate": 1.6710000000000002e-06,
"loss": -2.8753,
"step": 557
},
{
"epoch": 0.0005582791395697849,
"grad_norm": 18.245431900024414,
"learning_rate": 1.6740000000000002e-06,
"loss": -2.1203,
"step": 558
},
{
"epoch": 0.0005592796398199099,
"grad_norm": 13.693886756896973,
"learning_rate": 1.6770000000000002e-06,
"loss": -2.7668,
"step": 559
},
{
"epoch": 0.000560280140070035,
"grad_norm": 16.825647354125977,
"learning_rate": 1.6800000000000002e-06,
"loss": -0.9397,
"step": 560
},
{
"completion_length": 230.62500762939453,
"epoch": 0.0005612806403201601,
"grad_norm": 24.731273651123047,
"learning_rate": 1.6830000000000002e-06,
"loss": 0.4022,
"reward": 0.1626984179019928,
"reward_std": 0.14903082698583603,
"rewards/sudoku_reward_func": 0.1626984104514122,
"step": 561,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005622811405702851,
"grad_norm": 23.209733963012695,
"learning_rate": 1.6860000000000002e-06,
"loss": 0.5578,
"step": 562
},
{
"epoch": 0.0005632816408204102,
"grad_norm": 21.375064849853516,
"learning_rate": 1.6889999999999998e-06,
"loss": 1.3835,
"step": 563
},
{
"epoch": 0.0005642821410705353,
"grad_norm": 22.1075496673584,
"learning_rate": 1.6919999999999999e-06,
"loss": 1.5969,
"step": 564
},
{
"epoch": 0.0005652826413206604,
"grad_norm": 25.393898010253906,
"learning_rate": 1.6949999999999999e-06,
"loss": 0.2459,
"step": 565
},
{
"epoch": 0.0005662831415707854,
"grad_norm": 21.525785446166992,
"learning_rate": 1.6979999999999999e-06,
"loss": 0.4449,
"step": 566
},
{
"epoch": 0.0005672836418209105,
"grad_norm": 20.55721664428711,
"learning_rate": 1.7009999999999999e-06,
"loss": 1.1952,
"step": 567
},
{
"epoch": 0.0005682841420710356,
"grad_norm": 22.30531120300293,
"learning_rate": 1.704e-06,
"loss": 1.2676,
"step": 568
},
{
"completion_length": 235.375,
"epoch": 0.0005692846423211605,
"grad_norm": 22.03434181213379,
"learning_rate": 1.707e-06,
"loss": 4.3422,
"reward": 0.20998678356409073,
"reward_std": 0.18079549074172974,
"rewards/sudoku_reward_func": 0.20998677611351013,
"step": 569,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005702851425712856,
"grad_norm": 32.50233459472656,
"learning_rate": 1.71e-06,
"loss": 2.9145,
"step": 570
},
{
"epoch": 0.0005712856428214107,
"grad_norm": 46.461246490478516,
"learning_rate": 1.713e-06,
"loss": 6.4925,
"step": 571
},
{
"epoch": 0.0005722861430715357,
"grad_norm": 24.894067764282227,
"learning_rate": 1.716e-06,
"loss": 3.9048,
"step": 572
},
{
"epoch": 0.0005732866433216608,
"grad_norm": 22.30925941467285,
"learning_rate": 1.719e-06,
"loss": 4.0346,
"step": 573
},
{
"epoch": 0.0005742871435717859,
"grad_norm": 26.490528106689453,
"learning_rate": 1.722e-06,
"loss": 2.5862,
"step": 574
},
{
"epoch": 0.000575287643821911,
"grad_norm": 50.72327423095703,
"learning_rate": 1.725e-06,
"loss": 6.3471,
"step": 575
},
{
"epoch": 0.000576288144072036,
"grad_norm": 23.435848236083984,
"learning_rate": 1.728e-06,
"loss": 3.6186,
"step": 576
},
{
"completion_length": 230.9791717529297,
"epoch": 0.0005772886443221611,
"grad_norm": 60.76210021972656,
"learning_rate": 1.7309999999999998e-06,
"loss": 9.3164,
"reward": 0.16644121706485748,
"reward_std": 0.1695634052157402,
"rewards/sudoku_reward_func": 0.16644120961427689,
"step": 577,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005782891445722862,
"grad_norm": 23.458660125732422,
"learning_rate": 1.7339999999999998e-06,
"loss": 10.5865,
"step": 578
},
{
"epoch": 0.0005792896448224112,
"grad_norm": 22.109237670898438,
"learning_rate": 1.7369999999999998e-06,
"loss": 9.5307,
"step": 579
},
{
"epoch": 0.0005802901450725362,
"grad_norm": 20.859468460083008,
"learning_rate": 1.7399999999999999e-06,
"loss": 11.4446,
"step": 580
},
{
"epoch": 0.0005812906453226613,
"grad_norm": 19.533044815063477,
"learning_rate": 1.7429999999999999e-06,
"loss": 9.1447,
"step": 581
},
{
"epoch": 0.0005822911455727864,
"grad_norm": 23.0717716217041,
"learning_rate": 1.7459999999999999e-06,
"loss": 10.3917,
"step": 582
},
{
"epoch": 0.0005832916458229114,
"grad_norm": 23.079936981201172,
"learning_rate": 1.749e-06,
"loss": 9.4,
"step": 583
},
{
"epoch": 0.0005842921460730365,
"grad_norm": 21.109201431274414,
"learning_rate": 1.752e-06,
"loss": 11.2249,
"step": 584
},
{
"completion_length": 236.4791717529297,
"epoch": 0.0005852926463231616,
"grad_norm": 15.436999320983887,
"learning_rate": 1.755e-06,
"loss": -1.8151,
"reward": 0.1277281753718853,
"reward_std": 0.11017344892024994,
"rewards/sudoku_reward_func": 0.1277281753718853,
"step": 585,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005862931465732866,
"grad_norm": 14.275968551635742,
"learning_rate": 1.758e-06,
"loss": -1.466,
"step": 586
},
{
"epoch": 0.0005872936468234117,
"grad_norm": 15.81253719329834,
"learning_rate": 1.761e-06,
"loss": -2.2609,
"step": 587
},
{
"epoch": 0.0005882941470735368,
"grad_norm": 14.601688385009766,
"learning_rate": 1.764e-06,
"loss": -1.7179,
"step": 588
},
{
"epoch": 0.0005892946473236619,
"grad_norm": 14.18327522277832,
"learning_rate": 1.767e-06,
"loss": -1.9194,
"step": 589
},
{
"epoch": 0.0005902951475737869,
"grad_norm": 14.13171672821045,
"learning_rate": 1.77e-06,
"loss": -1.6122,
"step": 590
},
{
"epoch": 0.000591295647823912,
"grad_norm": 17.244855880737305,
"learning_rate": 1.773e-06,
"loss": -2.3572,
"step": 591
},
{
"epoch": 0.000592296148074037,
"grad_norm": 15.741546630859375,
"learning_rate": 1.776e-06,
"loss": -1.8189,
"step": 592
},
{
"completion_length": 235.0416717529297,
"epoch": 0.000593296648324162,
"grad_norm": 25.220417022705078,
"learning_rate": 1.779e-06,
"loss": 5.7432,
"reward": 0.2039930671453476,
"reward_std": 0.1585550457239151,
"rewards/sudoku_reward_func": 0.203993059694767,
"step": 593,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0005942971485742871,
"grad_norm": 24.24148941040039,
"learning_rate": 1.782e-06,
"loss": 7.6423,
"step": 594
},
{
"epoch": 0.0005952976488244122,
"grad_norm": 33.06986999511719,
"learning_rate": 1.785e-06,
"loss": 9.136,
"step": 595
},
{
"epoch": 0.0005962981490745373,
"grad_norm": 26.336284637451172,
"learning_rate": 1.7879999999999999e-06,
"loss": 6.7126,
"step": 596
},
{
"epoch": 0.0005972986493246623,
"grad_norm": 25.30630111694336,
"learning_rate": 1.7909999999999999e-06,
"loss": 5.4191,
"step": 597
},
{
"epoch": 0.0005982991495747874,
"grad_norm": 24.444787979125977,
"learning_rate": 1.7939999999999999e-06,
"loss": 7.3168,
"step": 598
},
{
"epoch": 0.0005992996498249125,
"grad_norm": 31.10487174987793,
"learning_rate": 1.797e-06,
"loss": 8.7812,
"step": 599
},
{
"epoch": 0.0006003001500750375,
"grad_norm": 24.562986373901367,
"learning_rate": 1.8e-06,
"loss": 6.3572,
"step": 600
},
{
"completion_length": 239.6041717529297,
"epoch": 0.0006013006503251626,
"grad_norm": 17.44729232788086,
"learning_rate": 1.803e-06,
"loss": -1.0856,
"reward": 0.15695270895957947,
"reward_std": 0.1305704563856125,
"rewards/sudoku_reward_func": 0.15695270895957947,
"step": 601,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006023011505752877,
"grad_norm": 17.973352432250977,
"learning_rate": 1.806e-06,
"loss": -1.5651,
"step": 602
},
{
"epoch": 0.0006033016508254128,
"grad_norm": 17.87987518310547,
"learning_rate": 1.809e-06,
"loss": -1.5749,
"step": 603
},
{
"epoch": 0.0006043021510755377,
"grad_norm": 17.67951202392578,
"learning_rate": 1.812e-06,
"loss": -1.5796,
"step": 604
},
{
"epoch": 0.0006053026513256628,
"grad_norm": 18.45180892944336,
"learning_rate": 1.815e-06,
"loss": -1.2079,
"step": 605
},
{
"epoch": 0.0006063031515757879,
"grad_norm": 20.35084342956543,
"learning_rate": 1.818e-06,
"loss": -1.6413,
"step": 606
},
{
"epoch": 0.0006073036518259129,
"grad_norm": 17.7126407623291,
"learning_rate": 1.821e-06,
"loss": -1.7646,
"step": 607
},
{
"epoch": 0.000608304152076038,
"grad_norm": 17.698877334594727,
"learning_rate": 1.824e-06,
"loss": -1.7541,
"step": 608
},
{
"completion_length": 224.4791717529297,
"epoch": 0.0006093046523261631,
"grad_norm": 31.530330657958984,
"learning_rate": 1.827e-06,
"loss": 12.5315,
"reward": 0.21771661192178726,
"reward_std": 0.2013006955385208,
"rewards/sudoku_reward_func": 0.21771660447120667,
"step": 609,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006103051525762882,
"grad_norm": 29.887935638427734,
"learning_rate": 1.83e-06,
"loss": 14.9137,
"step": 610
},
{
"epoch": 0.0006113056528264132,
"grad_norm": 33.39522171020508,
"learning_rate": 1.833e-06,
"loss": 15.9311,
"step": 611
},
{
"epoch": 0.0006123061530765383,
"grad_norm": 32.336585998535156,
"learning_rate": 1.836e-06,
"loss": 12.8304,
"step": 612
},
{
"epoch": 0.0006133066533266634,
"grad_norm": 33.42409133911133,
"learning_rate": 1.839e-06,
"loss": 12.3461,
"step": 613
},
{
"epoch": 0.0006143071535767884,
"grad_norm": 31.57464027404785,
"learning_rate": 1.8420000000000001e-06,
"loss": 14.4472,
"step": 614
},
{
"epoch": 0.0006153076538269135,
"grad_norm": 30.877559661865234,
"learning_rate": 1.8450000000000001e-06,
"loss": 15.3655,
"step": 615
},
{
"epoch": 0.0006163081540770385,
"grad_norm": 34.25657272338867,
"learning_rate": 1.848e-06,
"loss": 12.2993,
"step": 616
},
{
"completion_length": 242.52084350585938,
"epoch": 0.0006173086543271635,
"grad_norm": 19.929956436157227,
"learning_rate": 1.851e-06,
"loss": 5.3496,
"reward": 0.17189379036426544,
"reward_std": 0.1432012841105461,
"rewards/sudoku_reward_func": 0.17189379036426544,
"step": 617,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006183091545772886,
"grad_norm": 21.747644424438477,
"learning_rate": 1.854e-06,
"loss": 3.4253,
"step": 618
},
{
"epoch": 0.0006193096548274137,
"grad_norm": 19.652868270874023,
"learning_rate": 1.857e-06,
"loss": 4.5252,
"step": 619
},
{
"epoch": 0.0006203101550775388,
"grad_norm": 18.432842254638672,
"learning_rate": 1.86e-06,
"loss": 4.9895,
"step": 620
},
{
"epoch": 0.0006213106553276638,
"grad_norm": 19.793832778930664,
"learning_rate": 1.863e-06,
"loss": 5.1259,
"step": 621
},
{
"epoch": 0.0006223111555777889,
"grad_norm": 24.654544830322266,
"learning_rate": 1.866e-06,
"loss": 3.2193,
"step": 622
},
{
"epoch": 0.000623311655827914,
"grad_norm": 21.008930206298828,
"learning_rate": 1.869e-06,
"loss": 4.2154,
"step": 623
},
{
"epoch": 0.000624312156078039,
"grad_norm": 19.32270622253418,
"learning_rate": 1.872e-06,
"loss": 4.7233,
"step": 624
},
{
"completion_length": 252.43750762939453,
"epoch": 0.0006253126563281641,
"grad_norm": 19.41009521484375,
"learning_rate": 1.875e-06,
"loss": -6.2214,
"reward": 0.14897486940026283,
"reward_std": 0.13010858744382858,
"rewards/sudoku_reward_func": 0.14897486940026283,
"step": 625,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006263131565782892,
"grad_norm": 18.7078914642334,
"learning_rate": 1.878e-06,
"loss": -5.9653,
"step": 626
},
{
"epoch": 0.0006273136568284143,
"grad_norm": 16.962142944335938,
"learning_rate": 1.881e-06,
"loss": -4.7354,
"step": 627
},
{
"epoch": 0.0006283141570785392,
"grad_norm": 18.850299835205078,
"learning_rate": 1.884e-06,
"loss": -6.3041,
"step": 628
},
{
"epoch": 0.0006293146573286643,
"grad_norm": 23.01152229309082,
"learning_rate": 1.887e-06,
"loss": -6.3494,
"step": 629
},
{
"epoch": 0.0006303151575787894,
"grad_norm": 18.785537719726562,
"learning_rate": 1.8900000000000001e-06,
"loss": -6.116,
"step": 630
},
{
"epoch": 0.0006313156578289144,
"grad_norm": 19.264942169189453,
"learning_rate": 1.8930000000000001e-06,
"loss": -4.9139,
"step": 631
},
{
"epoch": 0.0006323161580790395,
"grad_norm": 17.685102462768555,
"learning_rate": 1.8960000000000001e-06,
"loss": -6.5377,
"step": 632
},
{
"completion_length": 254.0625,
"epoch": 0.0006333166583291646,
"grad_norm": 16.194541931152344,
"learning_rate": 1.8990000000000002e-06,
"loss": -0.5502,
"reward": 0.1833701804280281,
"reward_std": 0.12051501497626305,
"rewards/sudoku_reward_func": 0.1833701729774475,
"step": 633,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006343171585792897,
"grad_norm": 17.165401458740234,
"learning_rate": 1.9020000000000002e-06,
"loss": 0.6505,
"step": 634
},
{
"epoch": 0.0006353176588294147,
"grad_norm": 18.674211502075195,
"learning_rate": 1.905e-06,
"loss": -0.3794,
"step": 635
},
{
"epoch": 0.0006363181590795398,
"grad_norm": 17.070940017700195,
"learning_rate": 1.908e-06,
"loss": 0.2266,
"step": 636
},
{
"epoch": 0.0006373186593296649,
"grad_norm": 15.575688362121582,
"learning_rate": 1.911e-06,
"loss": -0.7375,
"step": 637
},
{
"epoch": 0.0006383191595797899,
"grad_norm": 23.437768936157227,
"learning_rate": 1.9140000000000002e-06,
"loss": 0.5262,
"step": 638
},
{
"epoch": 0.0006393196598299149,
"grad_norm": 18.74703598022461,
"learning_rate": 1.917e-06,
"loss": -0.6114,
"step": 639
},
{
"epoch": 0.00064032016008004,
"grad_norm": 16.295331954956055,
"learning_rate": 1.9200000000000003e-06,
"loss": 0.0056,
"step": 640
},
{
"completion_length": 251.45833587646484,
"epoch": 0.0006413206603301651,
"grad_norm": 21.831514358520508,
"learning_rate": 1.923e-06,
"loss": 0.2665,
"reward": 0.17208169400691986,
"reward_std": 0.14966098219156265,
"rewards/sudoku_reward_func": 0.17208168655633926,
"step": 641,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006423211605802901,
"grad_norm": 24.15346336364746,
"learning_rate": 1.9260000000000003e-06,
"loss": -0.7073,
"step": 642
},
{
"epoch": 0.0006433216608304152,
"grad_norm": 21.65900993347168,
"learning_rate": 1.929e-06,
"loss": -1.115,
"step": 643
},
{
"epoch": 0.0006443221610805403,
"grad_norm": 23.846370697021484,
"learning_rate": 1.9320000000000003e-06,
"loss": 0.2187,
"step": 644
},
{
"epoch": 0.0006453226613306653,
"grad_norm": 21.314346313476562,
"learning_rate": 1.935e-06,
"loss": -0.0856,
"step": 645
},
{
"epoch": 0.0006463231615807904,
"grad_norm": 25.002071380615234,
"learning_rate": 1.938e-06,
"loss": -1.0247,
"step": 646
},
{
"epoch": 0.0006473236618309155,
"grad_norm": 21.471878051757812,
"learning_rate": 1.941e-06,
"loss": -1.4162,
"step": 647
},
{
"epoch": 0.0006483241620810406,
"grad_norm": 23.55864715576172,
"learning_rate": 1.944e-06,
"loss": -0.163,
"step": 648
},
{
"completion_length": 250.5,
"epoch": 0.0006493246623311656,
"grad_norm": 19.517107009887695,
"learning_rate": 1.947e-06,
"loss": -1.1045,
"reward": 0.18660564720630646,
"reward_std": 0.1325959712266922,
"rewards/sudoku_reward_func": 0.18660564720630646,
"step": 649,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006503251625812907,
"grad_norm": 18.44375991821289,
"learning_rate": 1.95e-06,
"loss": -0.3916,
"step": 650
},
{
"epoch": 0.0006513256628314157,
"grad_norm": 22.07648468017578,
"learning_rate": 1.953e-06,
"loss": -0.2807,
"step": 651
},
{
"epoch": 0.0006523261630815407,
"grad_norm": 23.970638275146484,
"learning_rate": 1.956e-06,
"loss": -2.3437,
"step": 652
},
{
"epoch": 0.0006533266633316658,
"grad_norm": 21.739980697631836,
"learning_rate": 1.9590000000000002e-06,
"loss": -1.5197,
"step": 653
},
{
"epoch": 0.0006543271635817909,
"grad_norm": 21.033462524414062,
"learning_rate": 1.962e-06,
"loss": -0.7898,
"step": 654
},
{
"epoch": 0.0006553276638319159,
"grad_norm": 23.045095443725586,
"learning_rate": 1.9650000000000002e-06,
"loss": -0.7434,
"step": 655
},
{
"epoch": 0.000656328164082041,
"grad_norm": 24.260587692260742,
"learning_rate": 1.968e-06,
"loss": -2.9049,
"step": 656
},
{
"completion_length": 248.58334350585938,
"epoch": 0.0006573286643321661,
"grad_norm": 25.21026039123535,
"learning_rate": 1.9710000000000003e-06,
"loss": -5.561,
"reward": 0.20556382834911346,
"reward_std": 0.17001141607761383,
"rewards/sudoku_reward_func": 0.20556382089853287,
"step": 657,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006583291645822912,
"grad_norm": 25.44139862060547,
"learning_rate": 1.974e-06,
"loss": -4.1767,
"step": 658
},
{
"epoch": 0.0006593296648324162,
"grad_norm": 27.466651916503906,
"learning_rate": 1.9770000000000003e-06,
"loss": -6.3143,
"step": 659
},
{
"epoch": 0.0006603301650825413,
"grad_norm": 27.116817474365234,
"learning_rate": 1.98e-06,
"loss": -5.6863,
"step": 660
},
{
"epoch": 0.0006613306653326664,
"grad_norm": 26.143383026123047,
"learning_rate": 1.9830000000000003e-06,
"loss": -5.8748,
"step": 661
},
{
"epoch": 0.0006623311655827913,
"grad_norm": 30.05205726623535,
"learning_rate": 1.986e-06,
"loss": -4.5714,
"step": 662
},
{
"epoch": 0.0006633316658329164,
"grad_norm": 28.53112030029297,
"learning_rate": 1.9890000000000004e-06,
"loss": -6.6444,
"step": 663
},
{
"epoch": 0.0006643321660830415,
"grad_norm": 27.286012649536133,
"learning_rate": 1.992e-06,
"loss": -6.099,
"step": 664
},
{
"completion_length": 242.6041717529297,
"epoch": 0.0006653326663331666,
"grad_norm": 25.060420989990234,
"learning_rate": 1.995e-06,
"loss": -2.4716,
"reward": 0.2606647089123726,
"reward_std": 0.1507035493850708,
"rewards/sudoku_reward_func": 0.2606647089123726,
"step": 665,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006663331665832916,
"grad_norm": 24.94710922241211,
"learning_rate": 1.998e-06,
"loss": -2.414,
"step": 666
},
{
"epoch": 0.0006673336668334167,
"grad_norm": 24.827306747436523,
"learning_rate": 2.001e-06,
"loss": -3.3365,
"step": 667
},
{
"epoch": 0.0006683341670835418,
"grad_norm": 24.628686904907227,
"learning_rate": 2.004e-06,
"loss": -3.3505,
"step": 668
},
{
"epoch": 0.0006693346673336668,
"grad_norm": 24.80254364013672,
"learning_rate": 2.007e-06,
"loss": -2.7403,
"step": 669
},
{
"epoch": 0.0006703351675837919,
"grad_norm": 24.99594497680664,
"learning_rate": 2.0100000000000002e-06,
"loss": -2.6696,
"step": 670
},
{
"epoch": 0.000671335667833917,
"grad_norm": 27.10658073425293,
"learning_rate": 2.013e-06,
"loss": -3.8388,
"step": 671
},
{
"epoch": 0.0006723361680840421,
"grad_norm": 29.956634521484375,
"learning_rate": 2.0160000000000003e-06,
"loss": -3.6414,
"step": 672
},
{
"completion_length": 247.64584350585938,
"epoch": 0.000673336668334167,
"grad_norm": 26.138870239257812,
"learning_rate": 2.019e-06,
"loss": 5.4727,
"reward": 0.18890543282032013,
"reward_std": 0.1443747878074646,
"rewards/sudoku_reward_func": 0.18890542536973953,
"step": 673,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006743371685842921,
"grad_norm": 27.671852111816406,
"learning_rate": 2.0220000000000003e-06,
"loss": 6.6836,
"step": 674
},
{
"epoch": 0.0006753376688344172,
"grad_norm": 30.31881332397461,
"learning_rate": 2.025e-06,
"loss": 5.5997,
"step": 675
},
{
"epoch": 0.0006763381690845422,
"grad_norm": 23.821044921875,
"learning_rate": 2.0280000000000003e-06,
"loss": 4.0779,
"step": 676
},
{
"epoch": 0.0006773386693346673,
"grad_norm": 24.792219161987305,
"learning_rate": 2.031e-06,
"loss": 5.4354,
"step": 677
},
{
"epoch": 0.0006783391695847924,
"grad_norm": 28.62071990966797,
"learning_rate": 2.0340000000000003e-06,
"loss": 6.375,
"step": 678
},
{
"epoch": 0.0006793396698349175,
"grad_norm": 24.46523666381836,
"learning_rate": 2.037e-06,
"loss": 5.4811,
"step": 679
},
{
"epoch": 0.0006803401700850425,
"grad_norm": 23.84539031982422,
"learning_rate": 2.0400000000000004e-06,
"loss": 3.906,
"step": 680
},
{
"completion_length": 255.14584350585938,
"epoch": 0.0006813406703351676,
"grad_norm": 29.295547485351562,
"learning_rate": 2.043e-06,
"loss": -4.581,
"reward": 0.22625061869621277,
"reward_std": 0.17611468583345413,
"rewards/sudoku_reward_func": 0.22625060379505157,
"step": 681,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006823411705852927,
"grad_norm": 30.2603759765625,
"learning_rate": 2.0460000000000004e-06,
"loss": -4.5551,
"step": 682
},
{
"epoch": 0.0006833416708354177,
"grad_norm": 35.28757095336914,
"learning_rate": 2.049e-06,
"loss": -5.3189,
"step": 683
},
{
"epoch": 0.0006843421710855428,
"grad_norm": 34.68866729736328,
"learning_rate": 2.052e-06,
"loss": -5.9553,
"step": 684
},
{
"epoch": 0.0006853426713356679,
"grad_norm": 29.826597213745117,
"learning_rate": 2.0550000000000002e-06,
"loss": -4.8571,
"step": 685
},
{
"epoch": 0.000686343171585793,
"grad_norm": 35.04398727416992,
"learning_rate": 2.058e-06,
"loss": -4.8504,
"step": 686
},
{
"epoch": 0.0006873436718359179,
"grad_norm": 31.422697067260742,
"learning_rate": 2.0610000000000003e-06,
"loss": -5.5763,
"step": 687
},
{
"epoch": 0.000688344172086043,
"grad_norm": 32.0524787902832,
"learning_rate": 2.064e-06,
"loss": -6.1022,
"step": 688
},
{
"completion_length": 244.89583587646484,
"epoch": 0.0006893446723361681,
"grad_norm": 29.540935516357422,
"learning_rate": 2.067e-06,
"loss": -1.289,
"reward": 0.1987847313284874,
"reward_std": 0.13025881350040436,
"rewards/sudoku_reward_func": 0.1987847313284874,
"step": 689,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006903451725862931,
"grad_norm": 28.919418334960938,
"learning_rate": 2.07e-06,
"loss": 0.1461,
"step": 690
},
{
"epoch": 0.0006913456728364182,
"grad_norm": 23.043699264526367,
"learning_rate": 2.073e-06,
"loss": -0.6692,
"step": 691
},
{
"epoch": 0.0006923461730865433,
"grad_norm": 23.756433486938477,
"learning_rate": 2.0759999999999997e-06,
"loss": 0.7958,
"step": 692
},
{
"epoch": 0.0006933466733366683,
"grad_norm": 29.989591598510742,
"learning_rate": 2.079e-06,
"loss": -1.4441,
"step": 693
},
{
"epoch": 0.0006943471735867934,
"grad_norm": 31.252445220947266,
"learning_rate": 2.0819999999999997e-06,
"loss": -0.1056,
"step": 694
},
{
"epoch": 0.0006953476738369185,
"grad_norm": 23.959022521972656,
"learning_rate": 2.085e-06,
"loss": -0.9871,
"step": 695
},
{
"epoch": 0.0006963481740870436,
"grad_norm": 22.977869033813477,
"learning_rate": 2.0879999999999997e-06,
"loss": 0.6185,
"step": 696
},
{
"completion_length": 252.7916717529297,
"epoch": 0.0006973486743371685,
"grad_norm": 24.07022476196289,
"learning_rate": 2.091e-06,
"loss": -3.2906,
"reward": 0.2117680013179779,
"reward_std": 0.11290831118822098,
"rewards/sudoku_reward_func": 0.2117679864168167,
"step": 697,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0006983491745872936,
"grad_norm": 17.388952255249023,
"learning_rate": 2.0939999999999998e-06,
"loss": -3.8824,
"step": 698
},
{
"epoch": 0.0006993496748374187,
"grad_norm": 25.840599060058594,
"learning_rate": 2.097e-06,
"loss": -0.2673,
"step": 699
},
{
"epoch": 0.0007003501750875437,
"grad_norm": 25.69615364074707,
"learning_rate": 2.1e-06,
"loss": -1.1923,
"step": 700
},
{
"epoch": 0.0007013506753376688,
"grad_norm": 23.536897659301758,
"learning_rate": 2.103e-06,
"loss": -3.5789,
"step": 701
},
{
"epoch": 0.0007023511755877939,
"grad_norm": 17.143381118774414,
"learning_rate": 2.106e-06,
"loss": -4.0471,
"step": 702
},
{
"epoch": 0.000703351675837919,
"grad_norm": 27.47609519958496,
"learning_rate": 2.109e-06,
"loss": -0.4917,
"step": 703
},
{
"epoch": 0.000704352176088044,
"grad_norm": 37.165225982666016,
"learning_rate": 2.112e-06,
"loss": -1.5096,
"step": 704
},
{
"completion_length": 245.81250762939453,
"epoch": 0.0007053526763381691,
"grad_norm": 24.30489158630371,
"learning_rate": 2.115e-06,
"loss": -4.6945,
"reward": 0.167658731341362,
"reward_std": 0.11763971298933029,
"rewards/sudoku_reward_func": 0.167658731341362,
"step": 705,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007063531765882942,
"grad_norm": 23.941953659057617,
"learning_rate": 2.118e-06,
"loss": -4.4556,
"step": 706
},
{
"epoch": 0.0007073536768384192,
"grad_norm": 24.57003402709961,
"learning_rate": 2.121e-06,
"loss": -4.9353,
"step": 707
},
{
"epoch": 0.0007083541770885443,
"grad_norm": 26.972028732299805,
"learning_rate": 2.124e-06,
"loss": -6.7715,
"step": 708
},
{
"epoch": 0.0007093546773386693,
"grad_norm": 27.066307067871094,
"learning_rate": 2.127e-06,
"loss": -4.7776,
"step": 709
},
{
"epoch": 0.0007103551775887944,
"grad_norm": 21.551305770874023,
"learning_rate": 2.13e-06,
"loss": -4.4619,
"step": 710
},
{
"epoch": 0.0007113556778389194,
"grad_norm": 25.452531814575195,
"learning_rate": 2.133e-06,
"loss": -5.1383,
"step": 711
},
{
"epoch": 0.0007123561780890445,
"grad_norm": 27.460329055786133,
"learning_rate": 2.136e-06,
"loss": -7.0182,
"step": 712
},
{
"completion_length": 248.52083587646484,
"epoch": 0.0007133566783391696,
"grad_norm": 28.8990478515625,
"learning_rate": 2.1389999999999998e-06,
"loss": -3.1244,
"reward": 0.21841933578252792,
"reward_std": 0.14541570469737053,
"rewards/sudoku_reward_func": 0.21841932088136673,
"step": 713,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007143571785892946,
"grad_norm": 26.447952270507812,
"learning_rate": 2.142e-06,
"loss": -1.9208,
"step": 714
},
{
"epoch": 0.0007153576788394197,
"grad_norm": 31.950580596923828,
"learning_rate": 2.145e-06,
"loss": -2.0995,
"step": 715
},
{
"epoch": 0.0007163581790895448,
"grad_norm": 24.73855209350586,
"learning_rate": 2.148e-06,
"loss": -4.2209,
"step": 716
},
{
"epoch": 0.0007173586793396699,
"grad_norm": 27.81092071533203,
"learning_rate": 2.151e-06,
"loss": -3.2356,
"step": 717
},
{
"epoch": 0.0007183591795897949,
"grad_norm": 28.132692337036133,
"learning_rate": 2.154e-06,
"loss": -2.1771,
"step": 718
},
{
"epoch": 0.00071935967983992,
"grad_norm": 29.067340850830078,
"learning_rate": 2.157e-06,
"loss": -2.3805,
"step": 719
},
{
"epoch": 0.0007203601800900451,
"grad_norm": 25.294504165649414,
"learning_rate": 2.16e-06,
"loss": -4.5135,
"step": 720
},
{
"completion_length": 248.2916717529297,
"epoch": 0.00072136068034017,
"grad_norm": 26.602191925048828,
"learning_rate": 2.163e-06,
"loss": -16.483,
"reward": 0.2065032720565796,
"reward_std": 0.13350703567266464,
"rewards/sudoku_reward_func": 0.2065032720565796,
"step": 721,
"zero_std_ratio": 0.125
},
{
"epoch": 0.0007223611805902951,
"grad_norm": 29.65574836730957,
"learning_rate": 2.166e-06,
"loss": -14.9039,
"step": 722
},
{
"epoch": 0.0007233616808404202,
"grad_norm": 31.21291732788086,
"learning_rate": 2.169e-06,
"loss": -15.1445,
"step": 723
},
{
"epoch": 0.0007243621810905453,
"grad_norm": 31.365436553955078,
"learning_rate": 2.172e-06,
"loss": -14.1896,
"step": 724
},
{
"epoch": 0.0007253626813406703,
"grad_norm": 26.096881866455078,
"learning_rate": 2.175e-06,
"loss": -16.666,
"step": 725
},
{
"epoch": 0.0007263631815907954,
"grad_norm": 31.20346450805664,
"learning_rate": 2.178e-06,
"loss": -15.4398,
"step": 726
},
{
"epoch": 0.0007273636818409205,
"grad_norm": 30.130516052246094,
"learning_rate": 2.181e-06,
"loss": -15.3874,
"step": 727
},
{
"epoch": 0.0007283641820910455,
"grad_norm": 31.089357376098633,
"learning_rate": 2.184e-06,
"loss": -14.4237,
"step": 728
},
{
"completion_length": 243.2916717529297,
"epoch": 0.0007293646823411706,
"grad_norm": 28.103757858276367,
"learning_rate": 2.187e-06,
"loss": 3.8517,
"reward": 0.23788856714963913,
"reward_std": 0.15394366532564163,
"rewards/sudoku_reward_func": 0.23788856714963913,
"step": 729,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007303651825912957,
"grad_norm": 32.89771270751953,
"learning_rate": 2.19e-06,
"loss": 5.6428,
"step": 730
},
{
"epoch": 0.0007313656828414207,
"grad_norm": 26.92860221862793,
"learning_rate": 2.193e-06,
"loss": 3.6873,
"step": 731
},
{
"epoch": 0.0007323661830915458,
"grad_norm": 35.90951919555664,
"learning_rate": 2.196e-06,
"loss": 6.3758,
"step": 732
},
{
"epoch": 0.0007333666833416708,
"grad_norm": 30.617719650268555,
"learning_rate": 2.199e-06,
"loss": 3.6392,
"step": 733
},
{
"epoch": 0.0007343671835917959,
"grad_norm": 32.82959747314453,
"learning_rate": 2.202e-06,
"loss": 5.2549,
"step": 734
},
{
"epoch": 0.0007353676838419209,
"grad_norm": 33.0678596496582,
"learning_rate": 2.205e-06,
"loss": 3.1626,
"step": 735
},
{
"epoch": 0.000736368184092046,
"grad_norm": 34.600852966308594,
"learning_rate": 2.208e-06,
"loss": 5.8968,
"step": 736
},
{
"completion_length": 250.52084350585938,
"epoch": 0.0007373686843421711,
"grad_norm": 42.83121109008789,
"learning_rate": 2.211e-06,
"loss": 1.0832,
"reward": 0.16931217163801193,
"reward_std": 0.15145771950483322,
"rewards/sudoku_reward_func": 0.16931216418743134,
"step": 737,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007383691845922961,
"grad_norm": 32.792449951171875,
"learning_rate": 2.214e-06,
"loss": 1.4861,
"step": 738
},
{
"epoch": 0.0007393696848424212,
"grad_norm": 31.13545799255371,
"learning_rate": 2.217e-06,
"loss": 2.7522,
"step": 739
},
{
"epoch": 0.0007403701850925463,
"grad_norm": 25.863052368164062,
"learning_rate": 2.22e-06,
"loss": 1.8101,
"step": 740
},
{
"epoch": 0.0007413706853426714,
"grad_norm": 35.75248718261719,
"learning_rate": 2.223e-06,
"loss": 1.0109,
"step": 741
},
{
"epoch": 0.0007423711855927964,
"grad_norm": 32.12532424926758,
"learning_rate": 2.226e-06,
"loss": 1.1726,
"step": 742
},
{
"epoch": 0.0007433716858429215,
"grad_norm": 29.697542190551758,
"learning_rate": 2.229e-06,
"loss": 2.4716,
"step": 743
},
{
"epoch": 0.0007443721860930466,
"grad_norm": 27.35248374938965,
"learning_rate": 2.232e-06,
"loss": 1.5878,
"step": 744
},
{
"completion_length": 249.3541717529297,
"epoch": 0.0007453726863431715,
"grad_norm": 23.706253051757812,
"learning_rate": 2.235e-06,
"loss": 6.3962,
"reward": 0.1799355298280716,
"reward_std": 0.12801172584295273,
"rewards/sudoku_reward_func": 0.179935522377491,
"step": 745,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007463731865932966,
"grad_norm": 23.791215896606445,
"learning_rate": 2.238e-06,
"loss": 5.8298,
"step": 746
},
{
"epoch": 0.0007473736868434217,
"grad_norm": 23.94332504272461,
"learning_rate": 2.2410000000000002e-06,
"loss": 7.1122,
"step": 747
},
{
"epoch": 0.0007483741870935468,
"grad_norm": 24.198583602905273,
"learning_rate": 2.244e-06,
"loss": 5.6156,
"step": 748
},
{
"epoch": 0.0007493746873436718,
"grad_norm": 25.003089904785156,
"learning_rate": 2.2470000000000003e-06,
"loss": 6.3101,
"step": 749
},
{
"epoch": 0.0007503751875937969,
"grad_norm": 23.13814926147461,
"learning_rate": 2.25e-06,
"loss": 5.5756,
"step": 750
},
{
"epoch": 0.000751375687843922,
"grad_norm": 23.709775924682617,
"learning_rate": 2.253e-06,
"loss": 6.8625,
"step": 751
},
{
"epoch": 0.000752376188094047,
"grad_norm": 33.373619079589844,
"learning_rate": 2.256e-06,
"loss": 5.3697,
"step": 752
},
{
"completion_length": 249.33334350585938,
"epoch": 0.0007533766883441721,
"grad_norm": 36.459205627441406,
"learning_rate": 2.259e-06,
"loss": 1.3885,
"reward": 0.181175597012043,
"reward_std": 0.15113066881895065,
"rewards/sudoku_reward_func": 0.1811755895614624,
"step": 753,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007543771885942972,
"grad_norm": 33.78456115722656,
"learning_rate": 2.262e-06,
"loss": 3.3189,
"step": 754
},
{
"epoch": 0.0007553776888444223,
"grad_norm": 33.32498550415039,
"learning_rate": 2.265e-06,
"loss": 0.7809,
"step": 755
},
{
"epoch": 0.0007563781890945472,
"grad_norm": 33.475223541259766,
"learning_rate": 2.268e-06,
"loss": 0.8343,
"step": 756
},
{
"epoch": 0.0007573786893446723,
"grad_norm": 32.76923370361328,
"learning_rate": 2.271e-06,
"loss": 1.1428,
"step": 757
},
{
"epoch": 0.0007583791895947974,
"grad_norm": 28.283016204833984,
"learning_rate": 2.274e-06,
"loss": 3.1434,
"step": 758
},
{
"epoch": 0.0007593796898449224,
"grad_norm": 35.577857971191406,
"learning_rate": 2.277e-06,
"loss": 0.4024,
"step": 759
},
{
"epoch": 0.0007603801900950475,
"grad_norm": 32.67893600463867,
"learning_rate": 2.28e-06,
"loss": 0.3714,
"step": 760
},
{
"completion_length": 247.0625,
"epoch": 0.0007613806903451726,
"grad_norm": 26.973373413085938,
"learning_rate": 2.283e-06,
"loss": -4.7676,
"reward": 0.20238097012043,
"reward_std": 0.12780283018946648,
"rewards/sudoku_reward_func": 0.2023809626698494,
"step": 761,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007623811905952977,
"grad_norm": 26.879213333129883,
"learning_rate": 2.2860000000000002e-06,
"loss": -4.5721,
"step": 762
},
{
"epoch": 0.0007633816908454227,
"grad_norm": 28.65926170349121,
"learning_rate": 2.289e-06,
"loss": -4.6671,
"step": 763
},
{
"epoch": 0.0007643821910955478,
"grad_norm": 24.28839874267578,
"learning_rate": 2.2920000000000002e-06,
"loss": -5.1815,
"step": 764
},
{
"epoch": 0.0007653826913456729,
"grad_norm": 26.704910278320312,
"learning_rate": 2.295e-06,
"loss": -5.2439,
"step": 765
},
{
"epoch": 0.0007663831915957979,
"grad_norm": 25.38524055480957,
"learning_rate": 2.2980000000000003e-06,
"loss": -4.8948,
"step": 766
},
{
"epoch": 0.000767383691845923,
"grad_norm": 29.275630950927734,
"learning_rate": 2.301e-06,
"loss": -5.139,
"step": 767
},
{
"epoch": 0.000768384192096048,
"grad_norm": 22.713115692138672,
"learning_rate": 2.3040000000000003e-06,
"loss": -5.7018,
"step": 768
},
{
"completion_length": 249.4375,
"epoch": 0.0007693846923461731,
"grad_norm": 43.691341400146484,
"learning_rate": 2.307e-06,
"loss": 3.4159,
"reward": 0.20924274623394012,
"reward_std": 0.1474134773015976,
"rewards/sudoku_reward_func": 0.20924272388219833,
"step": 769,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007703851925962981,
"grad_norm": 27.533424377441406,
"learning_rate": 2.31e-06,
"loss": 2.5735,
"step": 770
},
{
"epoch": 0.0007713856928464232,
"grad_norm": 25.509496688842773,
"learning_rate": 2.313e-06,
"loss": 3.6815,
"step": 771
},
{
"epoch": 0.0007723861930965483,
"grad_norm": 24.885425567626953,
"learning_rate": 2.316e-06,
"loss": 3.8332,
"step": 772
},
{
"epoch": 0.0007733866933466733,
"grad_norm": 35.75920867919922,
"learning_rate": 2.319e-06,
"loss": 3.1062,
"step": 773
},
{
"epoch": 0.0007743871935967984,
"grad_norm": 29.339269638061523,
"learning_rate": 2.322e-06,
"loss": 2.5655,
"step": 774
},
{
"epoch": 0.0007753876938469235,
"grad_norm": 23.58751678466797,
"learning_rate": 2.325e-06,
"loss": 3.6055,
"step": 775
},
{
"epoch": 0.0007763881940970485,
"grad_norm": 23.617435455322266,
"learning_rate": 2.328e-06,
"loss": 3.6078,
"step": 776
},
{
"completion_length": 254.875,
"epoch": 0.0007773886943471736,
"grad_norm": 35.409183502197266,
"learning_rate": 2.3310000000000002e-06,
"loss": -14.0541,
"reward": 0.18220899999141693,
"reward_std": 0.1431182250380516,
"rewards/sudoku_reward_func": 0.18220899999141693,
"step": 777,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007783891945972987,
"grad_norm": 41.375572204589844,
"learning_rate": 2.334e-06,
"loss": -13.7587,
"step": 778
},
{
"epoch": 0.0007793896948474238,
"grad_norm": 32.68427276611328,
"learning_rate": 2.3370000000000002e-06,
"loss": -14.2352,
"step": 779
},
{
"epoch": 0.0007803901950975487,
"grad_norm": 39.738059997558594,
"learning_rate": 2.34e-06,
"loss": -14.0971,
"step": 780
},
{
"epoch": 0.0007813906953476738,
"grad_norm": 38.458805084228516,
"learning_rate": 2.3430000000000003e-06,
"loss": -14.3691,
"step": 781
},
{
"epoch": 0.0007823911955977989,
"grad_norm": 34.345001220703125,
"learning_rate": 2.346e-06,
"loss": -14.0191,
"step": 782
},
{
"epoch": 0.0007833916958479239,
"grad_norm": 31.24772071838379,
"learning_rate": 2.3490000000000003e-06,
"loss": -14.7715,
"step": 783
},
{
"epoch": 0.000784392196098049,
"grad_norm": 37.2220344543457,
"learning_rate": 2.352e-06,
"loss": -14.7264,
"step": 784
},
{
"completion_length": 252.14583587646484,
"epoch": 0.0007853926963481741,
"grad_norm": 34.27210998535156,
"learning_rate": 2.3550000000000003e-06,
"loss": -3.8389,
"reward": 0.23557373881340027,
"reward_std": 0.1432676911354065,
"rewards/sudoku_reward_func": 0.23557373881340027,
"step": 785,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007863931965982992,
"grad_norm": 44.486698150634766,
"learning_rate": 2.358e-06,
"loss": -3.4254,
"step": 786
},
{
"epoch": 0.0007873936968484242,
"grad_norm": 32.860557556152344,
"learning_rate": 2.3610000000000003e-06,
"loss": -3.6627,
"step": 787
},
{
"epoch": 0.0007883941970985493,
"grad_norm": 29.94903564453125,
"learning_rate": 2.364e-06,
"loss": -4.305,
"step": 788
},
{
"epoch": 0.0007893946973486744,
"grad_norm": 29.018800735473633,
"learning_rate": 2.367e-06,
"loss": -4.0097,
"step": 789
},
{
"epoch": 0.0007903951975987994,
"grad_norm": 42.00046920776367,
"learning_rate": 2.37e-06,
"loss": -3.9625,
"step": 790
},
{
"epoch": 0.0007913956978489244,
"grad_norm": 32.21451950073242,
"learning_rate": 2.373e-06,
"loss": -4.1312,
"step": 791
},
{
"epoch": 0.0007923961980990495,
"grad_norm": 31.023622512817383,
"learning_rate": 2.376e-06,
"loss": -4.6582,
"step": 792
},
{
"completion_length": 246.5416717529297,
"epoch": 0.0007933966983491746,
"grad_norm": 57.225624084472656,
"learning_rate": 2.379e-06,
"loss": -2.409,
"reward": 0.171502985060215,
"reward_std": 0.15528041124343872,
"rewards/sudoku_reward_func": 0.1715029776096344,
"step": 793,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0007943971985992996,
"grad_norm": 54.69853973388672,
"learning_rate": 2.3820000000000002e-06,
"loss": 0.6507,
"step": 794
},
{
"epoch": 0.0007953976988494247,
"grad_norm": 31.700592041015625,
"learning_rate": 2.385e-06,
"loss": -0.2206,
"step": 795
},
{
"epoch": 0.0007963981990995498,
"grad_norm": 34.947086334228516,
"learning_rate": 2.3880000000000003e-06,
"loss": -0.3802,
"step": 796
},
{
"epoch": 0.0007973986993496748,
"grad_norm": 35.65658950805664,
"learning_rate": 2.391e-06,
"loss": -2.6027,
"step": 797
},
{
"epoch": 0.0007983991995997999,
"grad_norm": 41.322940826416016,
"learning_rate": 2.3940000000000003e-06,
"loss": 0.2528,
"step": 798
},
{
"epoch": 0.000799399699849925,
"grad_norm": 31.205036163330078,
"learning_rate": 2.397e-06,
"loss": -0.4945,
"step": 799
},
{
"epoch": 0.0008004002001000501,
"grad_norm": 30.227001190185547,
"learning_rate": 2.4000000000000003e-06,
"loss": -0.8871,
"step": 800
},
{
"completion_length": 245.5,
"epoch": 0.0008014007003501751,
"grad_norm": 33.39512252807617,
"learning_rate": 2.403e-06,
"loss": -8.9157,
"reward": 0.1831597313284874,
"reward_std": 0.16883236914873123,
"rewards/sudoku_reward_func": 0.1831597313284874,
"step": 801,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008024012006003002,
"grad_norm": 34.010154724121094,
"learning_rate": 2.4060000000000003e-06,
"loss": -10.499,
"step": 802
},
{
"epoch": 0.0008034017008504252,
"grad_norm": 32.17269515991211,
"learning_rate": 2.409e-06,
"loss": -8.7243,
"step": 803
},
{
"epoch": 0.0008044022011005502,
"grad_norm": 32.730064392089844,
"learning_rate": 2.4120000000000004e-06,
"loss": -11.5972,
"step": 804
},
{
"epoch": 0.0008054027013506753,
"grad_norm": 32.50396728515625,
"learning_rate": 2.415e-06,
"loss": -9.1913,
"step": 805
},
{
"epoch": 0.0008064032016008004,
"grad_norm": 35.29380798339844,
"learning_rate": 2.4180000000000004e-06,
"loss": -11.002,
"step": 806
},
{
"epoch": 0.0008074037018509255,
"grad_norm": 32.23307418823242,
"learning_rate": 2.421e-06,
"loss": -9.3948,
"step": 807
},
{
"epoch": 0.0008084042021010505,
"grad_norm": 36.28053665161133,
"learning_rate": 2.4240000000000004e-06,
"loss": -12.0591,
"step": 808
},
{
"completion_length": 249.56250762939453,
"epoch": 0.0008094047023511756,
"grad_norm": 42.41727066040039,
"learning_rate": 2.4270000000000002e-06,
"loss": -10.5996,
"reward": 0.2089533880352974,
"reward_std": 0.11930705606937408,
"rewards/sudoku_reward_func": 0.2089533805847168,
"step": 809,
"zero_std_ratio": 0.125
},
{
"epoch": 0.0008104052026013007,
"grad_norm": 34.009498596191406,
"learning_rate": 2.43e-06,
"loss": -9.9286,
"step": 810
},
{
"epoch": 0.0008114057028514257,
"grad_norm": 30.14313316345215,
"learning_rate": 2.4330000000000003e-06,
"loss": -12.0992,
"step": 811
},
{
"epoch": 0.0008124062031015508,
"grad_norm": 37.952056884765625,
"learning_rate": 2.436e-06,
"loss": -10.552,
"step": 812
},
{
"epoch": 0.0008134067033516759,
"grad_norm": 44.20967483520508,
"learning_rate": 2.439e-06,
"loss": -11.1506,
"step": 813
},
{
"epoch": 0.0008144072036018008,
"grad_norm": 36.3494873046875,
"learning_rate": 2.442e-06,
"loss": -10.4949,
"step": 814
},
{
"epoch": 0.0008154077038519259,
"grad_norm": 33.20271682739258,
"learning_rate": 2.445e-06,
"loss": -12.5342,
"step": 815
},
{
"epoch": 0.000816408204102051,
"grad_norm": 39.655853271484375,
"learning_rate": 2.448e-06,
"loss": -11.3508,
"step": 816
},
{
"completion_length": 248.2916717529297,
"epoch": 0.0008174087043521761,
"grad_norm": 40.428096771240234,
"learning_rate": 2.451e-06,
"loss": -8.2092,
"reward": 0.2371031790971756,
"reward_std": 0.15679361671209335,
"rewards/sudoku_reward_func": 0.237103171646595,
"step": 817,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008184092046023011,
"grad_norm": 54.35465621948242,
"learning_rate": 2.4539999999999997e-06,
"loss": -7.2284,
"step": 818
},
{
"epoch": 0.0008194097048524262,
"grad_norm": 37.36104965209961,
"learning_rate": 2.457e-06,
"loss": -7.3311,
"step": 819
},
{
"epoch": 0.0008204102051025513,
"grad_norm": 42.316307067871094,
"learning_rate": 2.4599999999999997e-06,
"loss": -9.6601,
"step": 820
},
{
"epoch": 0.0008214107053526763,
"grad_norm": 41.3543701171875,
"learning_rate": 2.463e-06,
"loss": -8.9068,
"step": 821
},
{
"epoch": 0.0008224112056028014,
"grad_norm": 57.49537658691406,
"learning_rate": 2.4659999999999998e-06,
"loss": -7.369,
"step": 822
},
{
"epoch": 0.0008234117058529265,
"grad_norm": 38.88594436645508,
"learning_rate": 2.469e-06,
"loss": -8.1573,
"step": 823
},
{
"epoch": 0.0008244122061030516,
"grad_norm": 48.43411636352539,
"learning_rate": 2.472e-06,
"loss": -9.5986,
"step": 824
},
{
"completion_length": 233.1875,
"epoch": 0.0008254127063531766,
"grad_norm": 58.4598274230957,
"learning_rate": 2.475e-06,
"loss": -4.3023,
"reward": 0.2301587462425232,
"reward_std": 0.14847226440906525,
"rewards/sudoku_reward_func": 0.230158731341362,
"step": 825,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008264132066033016,
"grad_norm": 48.626136779785156,
"learning_rate": 2.478e-06,
"loss": -2.8084,
"step": 826
},
{
"epoch": 0.0008274137068534267,
"grad_norm": 56.721961975097656,
"learning_rate": 2.481e-06,
"loss": -5.9076,
"step": 827
},
{
"epoch": 0.0008284142071035517,
"grad_norm": 61.319942474365234,
"learning_rate": 2.484e-06,
"loss": -4.8713,
"step": 828
},
{
"epoch": 0.0008294147073536768,
"grad_norm": 70.11338806152344,
"learning_rate": 2.487e-06,
"loss": -5.3066,
"step": 829
},
{
"epoch": 0.0008304152076038019,
"grad_norm": 48.47807693481445,
"learning_rate": 2.49e-06,
"loss": -3.7843,
"step": 830
},
{
"epoch": 0.000831415707853927,
"grad_norm": 49.2767333984375,
"learning_rate": 2.493e-06,
"loss": -7.192,
"step": 831
},
{
"epoch": 0.000832416208104052,
"grad_norm": 65.00503540039062,
"learning_rate": 2.496e-06,
"loss": -6.7884,
"step": 832
},
{
"completion_length": 227.06250762939453,
"epoch": 0.0008334167083541771,
"grad_norm": 60.06196212768555,
"learning_rate": 2.499e-06,
"loss": 13.0532,
"reward": 0.19473380595445633,
"reward_std": 0.15059228241443634,
"rewards/sudoku_reward_func": 0.19473379850387573,
"step": 833,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008344172086043022,
"grad_norm": 88.75370025634766,
"learning_rate": 2.502e-06,
"loss": 14.0123,
"step": 834
},
{
"epoch": 0.0008354177088544272,
"grad_norm": 64.58737182617188,
"learning_rate": 2.505e-06,
"loss": 13.2359,
"step": 835
},
{
"epoch": 0.0008364182091045523,
"grad_norm": 74.0726089477539,
"learning_rate": 2.508e-06,
"loss": 13.5593,
"step": 836
},
{
"epoch": 0.0008374187093546774,
"grad_norm": 65.56517791748047,
"learning_rate": 2.5109999999999998e-06,
"loss": 13.6362,
"step": 837
},
{
"epoch": 0.0008384192096048025,
"grad_norm": 86.18720245361328,
"learning_rate": 2.514e-06,
"loss": 14.6564,
"step": 838
},
{
"epoch": 0.0008394197098549274,
"grad_norm": 64.94049835205078,
"learning_rate": 2.5169999999999998e-06,
"loss": 13.2589,
"step": 839
},
{
"epoch": 0.0008404202101050525,
"grad_norm": 70.9004898071289,
"learning_rate": 2.52e-06,
"loss": 13.0416,
"step": 840
},
{
"completion_length": 239.0,
"epoch": 0.0008414207103551776,
"grad_norm": 54.30680465698242,
"learning_rate": 2.523e-06,
"loss": 6.8374,
"reward": 0.2005208358168602,
"reward_std": 0.1658303141593933,
"rewards/sudoku_reward_func": 0.2005208358168602,
"step": 841,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008424212106053026,
"grad_norm": 52.91537857055664,
"learning_rate": 2.526e-06,
"loss": 6.5411,
"step": 842
},
{
"epoch": 0.0008434217108554277,
"grad_norm": 68.12520599365234,
"learning_rate": 2.529e-06,
"loss": 3.6742,
"step": 843
},
{
"epoch": 0.0008444222111055528,
"grad_norm": 64.33242797851562,
"learning_rate": 2.532e-06,
"loss": 6.8825,
"step": 844
},
{
"epoch": 0.0008454227113556779,
"grad_norm": 54.45749282836914,
"learning_rate": 2.535e-06,
"loss": 6.8283,
"step": 845
},
{
"epoch": 0.0008464232116058029,
"grad_norm": 48.27708435058594,
"learning_rate": 2.538e-06,
"loss": 5.9642,
"step": 846
},
{
"epoch": 0.000847423711855928,
"grad_norm": 61.76557540893555,
"learning_rate": 2.541e-06,
"loss": 3.4451,
"step": 847
},
{
"epoch": 0.0008484242121060531,
"grad_norm": 68.36261749267578,
"learning_rate": 2.544e-06,
"loss": 6.5138,
"step": 848
},
{
"completion_length": 243.37500762939453,
"epoch": 0.000849424712356178,
"grad_norm": 54.307151794433594,
"learning_rate": 2.547e-06,
"loss": -14.0519,
"reward": 0.21928737312555313,
"reward_std": 0.1534140184521675,
"rewards/sudoku_reward_func": 0.21928736567497253,
"step": 849,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008504252126063031,
"grad_norm": 53.22282028198242,
"learning_rate": 2.55e-06,
"loss": -14.7871,
"step": 850
},
{
"epoch": 0.0008514257128564282,
"grad_norm": 54.85883712768555,
"learning_rate": 2.553e-06,
"loss": -14.8958,
"step": 851
},
{
"epoch": 0.0008524262131065532,
"grad_norm": 44.00740432739258,
"learning_rate": 2.556e-06,
"loss": -15.3165,
"step": 852
},
{
"epoch": 0.0008534267133566783,
"grad_norm": 47.5384521484375,
"learning_rate": 2.559e-06,
"loss": -14.4872,
"step": 853
},
{
"epoch": 0.0008544272136068034,
"grad_norm": 56.67496109008789,
"learning_rate": 2.562e-06,
"loss": -15.1607,
"step": 854
},
{
"epoch": 0.0008554277138569285,
"grad_norm": 54.683143615722656,
"learning_rate": 2.565e-06,
"loss": -15.3318,
"step": 855
},
{
"epoch": 0.0008564282141070535,
"grad_norm": 48.874916076660156,
"learning_rate": 2.568e-06,
"loss": -15.6385,
"step": 856
},
{
"completion_length": 237.75000762939453,
"epoch": 0.0008574287143571786,
"grad_norm": 55.119384765625,
"learning_rate": 2.571e-06,
"loss": 7.5436,
"reward": 0.2546972781419754,
"reward_std": 0.16300001740455627,
"rewards/sudoku_reward_func": 0.2546972706913948,
"step": 857,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008584292146073037,
"grad_norm": 51.099952697753906,
"learning_rate": 2.574e-06,
"loss": 7.3429,
"step": 858
},
{
"epoch": 0.0008594297148574287,
"grad_norm": 59.46302032470703,
"learning_rate": 2.577e-06,
"loss": 8.0542,
"step": 859
},
{
"epoch": 0.0008604302151075538,
"grad_norm": 47.472633361816406,
"learning_rate": 2.58e-06,
"loss": 6.304,
"step": 860
},
{
"epoch": 0.0008614307153576789,
"grad_norm": 49.4232063293457,
"learning_rate": 2.583e-06,
"loss": 7.2967,
"step": 861
},
{
"epoch": 0.000862431215607804,
"grad_norm": 49.1822624206543,
"learning_rate": 2.586e-06,
"loss": 7.064,
"step": 862
},
{
"epoch": 0.0008634317158579289,
"grad_norm": 56.62918472290039,
"learning_rate": 2.589e-06,
"loss": 7.7846,
"step": 863
},
{
"epoch": 0.000864432216108054,
"grad_norm": 56.20520782470703,
"learning_rate": 2.592e-06,
"loss": 5.455,
"step": 864
},
{
"completion_length": 246.1041717529297,
"epoch": 0.0008654327163581791,
"grad_norm": 48.346553802490234,
"learning_rate": 2.595e-06,
"loss": -11.4518,
"reward": 0.20085152238607407,
"reward_std": 0.13320738822221756,
"rewards/sudoku_reward_func": 0.20085152238607407,
"step": 865,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008664332166083041,
"grad_norm": 52.97309112548828,
"learning_rate": 2.598e-06,
"loss": -10.5137,
"step": 866
},
{
"epoch": 0.0008674337168584292,
"grad_norm": 41.53067398071289,
"learning_rate": 2.601e-06,
"loss": -13.1581,
"step": 867
},
{
"epoch": 0.0008684342171085543,
"grad_norm": 50.84276580810547,
"learning_rate": 2.604e-06,
"loss": -10.5355,
"step": 868
},
{
"epoch": 0.0008694347173586794,
"grad_norm": 49.1697998046875,
"learning_rate": 2.607e-06,
"loss": -12.0282,
"step": 869
},
{
"epoch": 0.0008704352176088044,
"grad_norm": 45.84278106689453,
"learning_rate": 2.61e-06,
"loss": -11.2891,
"step": 870
},
{
"epoch": 0.0008714357178589295,
"grad_norm": 36.355979919433594,
"learning_rate": 2.6130000000000002e-06,
"loss": -13.4586,
"step": 871
},
{
"epoch": 0.0008724362181090546,
"grad_norm": 46.167972564697266,
"learning_rate": 2.616e-06,
"loss": -11.315,
"step": 872
},
{
"completion_length": 235.6041717529297,
"epoch": 0.0008734367183591795,
"grad_norm": 43.850440979003906,
"learning_rate": 2.6190000000000003e-06,
"loss": -18.2923,
"reward": 0.229662723839283,
"reward_std": 0.16215338557958603,
"rewards/sudoku_reward_func": 0.2296627163887024,
"step": 873,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008744372186093046,
"grad_norm": 51.63322830200195,
"learning_rate": 2.622e-06,
"loss": -21.9418,
"step": 874
},
{
"epoch": 0.0008754377188594297,
"grad_norm": 44.274967193603516,
"learning_rate": 2.6250000000000003e-06,
"loss": -19.8107,
"step": 875
},
{
"epoch": 0.0008764382191095548,
"grad_norm": 35.89291763305664,
"learning_rate": 2.628e-06,
"loss": -19.523,
"step": 876
},
{
"epoch": 0.0008774387193596798,
"grad_norm": 42.89470291137695,
"learning_rate": 2.631e-06,
"loss": -18.7292,
"step": 877
},
{
"epoch": 0.0008784392196098049,
"grad_norm": 50.21378707885742,
"learning_rate": 2.634e-06,
"loss": -22.1391,
"step": 878
},
{
"epoch": 0.00087943971985993,
"grad_norm": 41.582706451416016,
"learning_rate": 2.637e-06,
"loss": -20.1133,
"step": 879
},
{
"epoch": 0.000880440220110055,
"grad_norm": 35.33237838745117,
"learning_rate": 2.64e-06,
"loss": -19.9634,
"step": 880
},
{
"completion_length": 244.6666717529297,
"epoch": 0.0008814407203601801,
"grad_norm": 66.10181427001953,
"learning_rate": 2.643e-06,
"loss": -11.6534,
"reward": 0.2147516831755638,
"reward_std": 0.1552412137389183,
"rewards/sudoku_reward_func": 0.2147516831755638,
"step": 881,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008824412206103052,
"grad_norm": 55.456111907958984,
"learning_rate": 2.646e-06,
"loss": -13.2263,
"step": 882
},
{
"epoch": 0.0008834417208604303,
"grad_norm": 52.62863540649414,
"learning_rate": 2.649e-06,
"loss": -12.9538,
"step": 883
},
{
"epoch": 0.0008844422211105553,
"grad_norm": 35.29569625854492,
"learning_rate": 2.652e-06,
"loss": -10.0486,
"step": 884
},
{
"epoch": 0.0008854427213606803,
"grad_norm": 57.16860580444336,
"learning_rate": 2.655e-06,
"loss": -11.931,
"step": 885
},
{
"epoch": 0.0008864432216108054,
"grad_norm": 42.70516586303711,
"learning_rate": 2.6580000000000002e-06,
"loss": -13.5012,
"step": 886
},
{
"epoch": 0.0008874437218609304,
"grad_norm": 48.49374008178711,
"learning_rate": 2.661e-06,
"loss": -13.4838,
"step": 887
},
{
"epoch": 0.0008884442221110555,
"grad_norm": 37.89622116088867,
"learning_rate": 2.6640000000000002e-06,
"loss": -10.5131,
"step": 888
},
{
"completion_length": 252.64583587646484,
"epoch": 0.0008894447223611806,
"grad_norm": 61.70961380004883,
"learning_rate": 2.667e-06,
"loss": -9.5585,
"reward": 0.2173859253525734,
"reward_std": 0.15939748287200928,
"rewards/sudoku_reward_func": 0.2173859179019928,
"step": 889,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008904452226113056,
"grad_norm": 54.10370635986328,
"learning_rate": 2.6700000000000003e-06,
"loss": -14.3615,
"step": 890
},
{
"epoch": 0.0008914457228614307,
"grad_norm": 45.82687759399414,
"learning_rate": 2.673e-06,
"loss": -10.4068,
"step": 891
},
{
"epoch": 0.0008924462231115558,
"grad_norm": 48.732147216796875,
"learning_rate": 2.6760000000000003e-06,
"loss": -11.3894,
"step": 892
},
{
"epoch": 0.0008934467233616809,
"grad_norm": 89.220458984375,
"learning_rate": 2.679e-06,
"loss": -10.2901,
"step": 893
},
{
"epoch": 0.0008944472236118059,
"grad_norm": 65.57836151123047,
"learning_rate": 2.6820000000000003e-06,
"loss": -15.0504,
"step": 894
},
{
"epoch": 0.000895447723861931,
"grad_norm": 52.363243103027344,
"learning_rate": 2.685e-06,
"loss": -10.9473,
"step": 895
},
{
"epoch": 0.0008964482241120561,
"grad_norm": 58.80600357055664,
"learning_rate": 2.688e-06,
"loss": -11.7646,
"step": 896
},
{
"completion_length": 243.89584350585938,
"epoch": 0.000897448724362181,
"grad_norm": 59.68390655517578,
"learning_rate": 2.691e-06,
"loss": -18.4827,
"reward": 0.243675597012043,
"reward_std": 0.1689695119857788,
"rewards/sudoku_reward_func": 0.2436755895614624,
"step": 897,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0008984492246123061,
"grad_norm": 57.939754486083984,
"learning_rate": 2.694e-06,
"loss": -18.4617,
"step": 898
},
{
"epoch": 0.0008994497248624312,
"grad_norm": 59.48532485961914,
"learning_rate": 2.697e-06,
"loss": -18.3377,
"step": 899
},
{
"epoch": 0.0009004502251125563,
"grad_norm": 68.90072631835938,
"learning_rate": 2.7e-06,
"loss": -16.435,
"step": 900
},
{
"epoch": 0.0009014507253626813,
"grad_norm": 62.35285186767578,
"learning_rate": 2.703e-06,
"loss": -19.3488,
"step": 901
},
{
"epoch": 0.0009024512256128064,
"grad_norm": 65.77997589111328,
"learning_rate": 2.706e-06,
"loss": -19.3106,
"step": 902
},
{
"epoch": 0.0009034517258629315,
"grad_norm": 60.621700286865234,
"learning_rate": 2.7090000000000002e-06,
"loss": -19.6105,
"step": 903
},
{
"epoch": 0.0009044522261130565,
"grad_norm": 61.568870544433594,
"learning_rate": 2.712e-06,
"loss": -17.7438,
"step": 904
},
{
"completion_length": 247.9791717529297,
"epoch": 0.0009054527263631816,
"grad_norm": 37.43316650390625,
"learning_rate": 2.7150000000000003e-06,
"loss": -5.8514,
"reward": 0.2286706417798996,
"reward_std": 0.1333366557955742,
"rewards/sudoku_reward_func": 0.2286706417798996,
"step": 905,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009064532266133067,
"grad_norm": 41.17620849609375,
"learning_rate": 2.718e-06,
"loss": -5.1501,
"step": 906
},
{
"epoch": 0.0009074537268634318,
"grad_norm": 46.76512908935547,
"learning_rate": 2.7210000000000003e-06,
"loss": -7.0313,
"step": 907
},
{
"epoch": 0.0009084542271135567,
"grad_norm": 44.03168869018555,
"learning_rate": 2.724e-06,
"loss": -5.5208,
"step": 908
},
{
"epoch": 0.0009094547273636818,
"grad_norm": 40.06308364868164,
"learning_rate": 2.7270000000000003e-06,
"loss": -6.4912,
"step": 909
},
{
"epoch": 0.0009104552276138069,
"grad_norm": 43.71113204956055,
"learning_rate": 2.73e-06,
"loss": -5.5134,
"step": 910
},
{
"epoch": 0.0009114557278639319,
"grad_norm": 43.34052276611328,
"learning_rate": 2.7330000000000003e-06,
"loss": -7.919,
"step": 911
},
{
"epoch": 0.000912456228114057,
"grad_norm": 41.50727844238281,
"learning_rate": 2.736e-06,
"loss": -6.0544,
"step": 912
},
{
"completion_length": 243.4791717529297,
"epoch": 0.0009134567283641821,
"grad_norm": 92.41754913330078,
"learning_rate": 2.7390000000000004e-06,
"loss": -20.6525,
"reward": 0.2719945013523102,
"reward_std": 0.1805286407470703,
"rewards/sudoku_reward_func": 0.2719945013523102,
"step": 913,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009144572286143072,
"grad_norm": 83.57636260986328,
"learning_rate": 2.742e-06,
"loss": -21.188,
"step": 914
},
{
"epoch": 0.0009154577288644322,
"grad_norm": 75.21762084960938,
"learning_rate": 2.745e-06,
"loss": -22.7858,
"step": 915
},
{
"epoch": 0.0009164582291145573,
"grad_norm": 78.82347106933594,
"learning_rate": 2.748e-06,
"loss": -24.0821,
"step": 916
},
{
"epoch": 0.0009174587293646824,
"grad_norm": 87.21110534667969,
"learning_rate": 2.751e-06,
"loss": -22.2703,
"step": 917
},
{
"epoch": 0.0009184592296148074,
"grad_norm": 84.7657470703125,
"learning_rate": 2.7540000000000002e-06,
"loss": -22.8605,
"step": 918
},
{
"epoch": 0.0009194597298649325,
"grad_norm": 74.08875274658203,
"learning_rate": 2.757e-06,
"loss": -24.4158,
"step": 919
},
{
"epoch": 0.0009204602301150575,
"grad_norm": 91.21599578857422,
"learning_rate": 2.7600000000000003e-06,
"loss": -26.2348,
"step": 920
},
{
"completion_length": 248.9791717529297,
"epoch": 0.0009214607303651826,
"grad_norm": 78.67549133300781,
"learning_rate": 2.763e-06,
"loss": 0.443,
"reward": 0.23371364176273346,
"reward_std": 0.174501433968544,
"rewards/sudoku_reward_func": 0.23371363431215286,
"step": 921,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009224612306153076,
"grad_norm": 94.62200927734375,
"learning_rate": 2.7660000000000003e-06,
"loss": 2.2315,
"step": 922
},
{
"epoch": 0.0009234617308654327,
"grad_norm": 98.74855041503906,
"learning_rate": 2.769e-06,
"loss": 3.319,
"step": 923
},
{
"epoch": 0.0009244622311155578,
"grad_norm": 105.0526351928711,
"learning_rate": 2.7720000000000003e-06,
"loss": 0.0095,
"step": 924
},
{
"epoch": 0.0009254627313656828,
"grad_norm": 104.87490844726562,
"learning_rate": 2.775e-06,
"loss": 0.2426,
"step": 925
},
{
"epoch": 0.0009264632316158079,
"grad_norm": 115.48833465576172,
"learning_rate": 2.7780000000000003e-06,
"loss": 2.4548,
"step": 926
},
{
"epoch": 0.000927463731865933,
"grad_norm": 100.13760375976562,
"learning_rate": 2.781e-06,
"loss": 3.4853,
"step": 927
},
{
"epoch": 0.000928464232116058,
"grad_norm": 111.59932708740234,
"learning_rate": 2.7840000000000004e-06,
"loss": -0.2007,
"step": 928
},
{
"completion_length": 240.68750762939453,
"epoch": 0.0009294647323661831,
"grad_norm": 90.83345031738281,
"learning_rate": 2.787e-06,
"loss": -0.0686,
"reward": 0.27306924760341644,
"reward_std": 0.14821404218673706,
"rewards/sudoku_reward_func": 0.27306922525167465,
"step": 929,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009304652326163082,
"grad_norm": 97.31766510009766,
"learning_rate": 2.7900000000000004e-06,
"loss": 2.2642,
"step": 930
},
{
"epoch": 0.0009314657328664333,
"grad_norm": 74.02940368652344,
"learning_rate": 2.793e-06,
"loss": -2.8585,
"step": 931
},
{
"epoch": 0.0009324662331165582,
"grad_norm": 110.25257873535156,
"learning_rate": 2.7960000000000004e-06,
"loss": -4.8888,
"step": 932
},
{
"epoch": 0.0009334667333666833,
"grad_norm": 94.64034271240234,
"learning_rate": 2.7990000000000002e-06,
"loss": -0.4688,
"step": 933
},
{
"epoch": 0.0009344672336168084,
"grad_norm": 80.84830474853516,
"learning_rate": 2.802e-06,
"loss": 1.6228,
"step": 934
},
{
"epoch": 0.0009354677338669334,
"grad_norm": 65.42864227294922,
"learning_rate": 2.8050000000000002e-06,
"loss": -2.8248,
"step": 935
},
{
"epoch": 0.0009364682341170585,
"grad_norm": 126.89608764648438,
"learning_rate": 2.808e-06,
"loss": -6.1213,
"step": 936
},
{
"completion_length": 246.75000762939453,
"epoch": 0.0009374687343671836,
"grad_norm": 88.34729766845703,
"learning_rate": 2.8110000000000003e-06,
"loss": -12.6919,
"reward": 0.21135085821151733,
"reward_std": 0.13452807068824768,
"rewards/sudoku_reward_func": 0.21135085821151733,
"step": 937,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009384692346173087,
"grad_norm": 72.44952392578125,
"learning_rate": 2.814e-06,
"loss": -10.9051,
"step": 938
},
{
"epoch": 0.0009394697348674337,
"grad_norm": 125.90797424316406,
"learning_rate": 2.817e-06,
"loss": -14.7098,
"step": 939
},
{
"epoch": 0.0009404702351175588,
"grad_norm": 94.34359741210938,
"learning_rate": 2.82e-06,
"loss": -12.7713,
"step": 940
},
{
"epoch": 0.0009414707353676839,
"grad_norm": 87.32075500488281,
"learning_rate": 2.823e-06,
"loss": -13.6036,
"step": 941
},
{
"epoch": 0.0009424712356178089,
"grad_norm": 73.44007873535156,
"learning_rate": 2.8259999999999997e-06,
"loss": -11.6647,
"step": 942
},
{
"epoch": 0.000943471735867934,
"grad_norm": 153.84591674804688,
"learning_rate": 2.829e-06,
"loss": -17.473,
"step": 943
},
{
"epoch": 0.000944472236118059,
"grad_norm": 112.31997680664062,
"learning_rate": 2.8319999999999997e-06,
"loss": -14.8225,
"step": 944
},
{
"completion_length": 239.00000762939453,
"epoch": 0.0009454727363681841,
"grad_norm": 105.95699310302734,
"learning_rate": 2.835e-06,
"loss": -1.1112,
"reward": 0.2568768113851547,
"reward_std": 0.16827785968780518,
"rewards/sudoku_reward_func": 0.25687679648399353,
"step": 945,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009464732366183091,
"grad_norm": 122.6570816040039,
"learning_rate": 2.8379999999999998e-06,
"loss": 0.4598,
"step": 946
},
{
"epoch": 0.0009474737368684342,
"grad_norm": 125.27985382080078,
"learning_rate": 2.841e-06,
"loss": -1.2065,
"step": 947
},
{
"epoch": 0.0009484742371185593,
"grad_norm": 125.6364974975586,
"learning_rate": 2.844e-06,
"loss": 7.3372,
"step": 948
},
{
"epoch": 0.0009494747373686843,
"grad_norm": 107.11581420898438,
"learning_rate": 2.847e-06,
"loss": -1.2886,
"step": 949
},
{
"epoch": 0.0009504752376188094,
"grad_norm": 129.3809051513672,
"learning_rate": 2.85e-06,
"loss": 0.2694,
"step": 950
},
{
"epoch": 0.0009514757378689345,
"grad_norm": 117.40351867675781,
"learning_rate": 2.853e-06,
"loss": -1.7224,
"step": 951
},
{
"epoch": 0.0009524762381190596,
"grad_norm": 135.25901794433594,
"learning_rate": 2.856e-06,
"loss": 6.8023,
"step": 952
},
{
"completion_length": 247.83334350585938,
"epoch": 0.0009534767383691846,
"grad_norm": 102.14061737060547,
"learning_rate": 2.859e-06,
"loss": 8.0822,
"reward": 0.227430559694767,
"reward_std": 0.14220082387328148,
"rewards/sudoku_reward_func": 0.2274305522441864,
"step": 953,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009544772386193097,
"grad_norm": 129.8813934326172,
"learning_rate": 2.862e-06,
"loss": 15.1066,
"step": 954
},
{
"epoch": 0.0009554777388694348,
"grad_norm": 106.110107421875,
"learning_rate": 2.865e-06,
"loss": 7.7215,
"step": 955
},
{
"epoch": 0.0009564782391195597,
"grad_norm": 69.91547393798828,
"learning_rate": 2.868e-06,
"loss": 7.6638,
"step": 956
},
{
"epoch": 0.0009574787393696848,
"grad_norm": 109.26384735107422,
"learning_rate": 2.871e-06,
"loss": 7.1161,
"step": 957
},
{
"epoch": 0.0009584792396198099,
"grad_norm": 130.6197052001953,
"learning_rate": 2.874e-06,
"loss": 12.9955,
"step": 958
},
{
"epoch": 0.000959479739869935,
"grad_norm": 69.60470581054688,
"learning_rate": 2.877e-06,
"loss": 6.9533,
"step": 959
},
{
"epoch": 0.00096048024012006,
"grad_norm": 67.70174407958984,
"learning_rate": 2.88e-06,
"loss": 6.9371,
"step": 960
},
{
"completion_length": 243.9791717529297,
"epoch": 0.0009614807403701851,
"grad_norm": 102.05725860595703,
"learning_rate": 2.883e-06,
"loss": 9.4034,
"reward": 0.23429233580827713,
"reward_std": 0.1784105822443962,
"rewards/sudoku_reward_func": 0.23429233580827713,
"step": 961,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009624812406203102,
"grad_norm": 93.39031982421875,
"learning_rate": 2.886e-06,
"loss": 11.6216,
"step": 962
},
{
"epoch": 0.0009634817408704352,
"grad_norm": 86.00289916992188,
"learning_rate": 2.8889999999999998e-06,
"loss": 8.0131,
"step": 963
},
{
"epoch": 0.0009644822411205603,
"grad_norm": 88.48316192626953,
"learning_rate": 2.892e-06,
"loss": 11.9298,
"step": 964
},
{
"epoch": 0.0009654827413706854,
"grad_norm": 91.39781188964844,
"learning_rate": 2.895e-06,
"loss": 8.7401,
"step": 965
},
{
"epoch": 0.0009664832416208105,
"grad_norm": 85.42029571533203,
"learning_rate": 2.898e-06,
"loss": 9.4002,
"step": 966
},
{
"epoch": 0.0009674837418709354,
"grad_norm": 67.53469848632812,
"learning_rate": 2.901e-06,
"loss": 7.4259,
"step": 967
},
{
"epoch": 0.0009684842421210605,
"grad_norm": 83.41094207763672,
"learning_rate": 2.904e-06,
"loss": 11.5478,
"step": 968
},
{
"completion_length": 245.27083587646484,
"epoch": 0.0009694847423711856,
"grad_norm": 246.9258270263672,
"learning_rate": 2.907e-06,
"loss": 5.9896,
"reward": 0.21226026117801666,
"reward_std": 0.1643363982439041,
"rewards/sudoku_reward_func": 0.21226025372743607,
"step": 969,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009704852426213106,
"grad_norm": 98.34902954101562,
"learning_rate": 2.91e-06,
"loss": 6.1014,
"step": 970
},
{
"epoch": 0.0009714857428714357,
"grad_norm": 109.06088256835938,
"learning_rate": 2.913e-06,
"loss": 7.1094,
"step": 971
},
{
"epoch": 0.0009724862431215608,
"grad_norm": 79.995849609375,
"learning_rate": 2.916e-06,
"loss": 3.4938,
"step": 972
},
{
"epoch": 0.0009734867433716858,
"grad_norm": 111.15068054199219,
"learning_rate": 2.919e-06,
"loss": 3.5674,
"step": 973
},
{
"epoch": 0.0009744872436218109,
"grad_norm": 68.43648529052734,
"learning_rate": 2.922e-06,
"loss": 4.8717,
"step": 974
},
{
"epoch": 0.000975487743871936,
"grad_norm": 91.81321716308594,
"learning_rate": 2.925e-06,
"loss": 5.182,
"step": 975
},
{
"epoch": 0.0009764882441220611,
"grad_norm": 70.03031921386719,
"learning_rate": 2.928e-06,
"loss": 2.105,
"step": 976
},
{
"completion_length": 248.9166717529297,
"epoch": 0.0009774887443721862,
"grad_norm": 49.68278503417969,
"learning_rate": 2.931e-06,
"loss": 2.851,
"reward": 0.20291833579540253,
"reward_std": 0.1318095251917839,
"rewards/sudoku_reward_func": 0.20291832834482193,
"step": 977,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009784892446223113,
"grad_norm": 86.14371490478516,
"learning_rate": 2.934e-06,
"loss": 3.8071,
"step": 978
},
{
"epoch": 0.0009794897448724361,
"grad_norm": 68.95272064208984,
"learning_rate": 2.937e-06,
"loss": 4.1672,
"step": 979
},
{
"epoch": 0.0009804902451225612,
"grad_norm": 45.35859298706055,
"learning_rate": 2.9400000000000002e-06,
"loss": 1.9626,
"step": 980
},
{
"epoch": 0.0009814907453726863,
"grad_norm": 44.2899169921875,
"learning_rate": 2.943e-06,
"loss": 2.4469,
"step": 981
},
{
"epoch": 0.0009824912456228114,
"grad_norm": 62.7624397277832,
"learning_rate": 2.946e-06,
"loss": 2.4931,
"step": 982
},
{
"epoch": 0.0009834917458729365,
"grad_norm": 63.1762809753418,
"learning_rate": 2.949e-06,
"loss": 3.0954,
"step": 983
},
{
"epoch": 0.0009844922461230616,
"grad_norm": 41.33958053588867,
"learning_rate": 2.952e-06,
"loss": 1.4097,
"step": 984
},
{
"completion_length": 241.83333587646484,
"epoch": 0.0009854927463731867,
"grad_norm": 83.71294403076172,
"learning_rate": 2.955e-06,
"loss": 15.8541,
"reward": 0.219866082072258,
"reward_std": 0.13635045289993286,
"rewards/sudoku_reward_func": 0.219866082072258,
"step": 985,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009864932466233116,
"grad_norm": 88.49727630615234,
"learning_rate": 2.958e-06,
"loss": 13.5028,
"step": 986
},
{
"epoch": 0.0009874937468734367,
"grad_norm": 62.60023880004883,
"learning_rate": 2.961e-06,
"loss": 13.6597,
"step": 987
},
{
"epoch": 0.0009884942471235618,
"grad_norm": 67.28657531738281,
"learning_rate": 2.964e-06,
"loss": 15.6197,
"step": 988
},
{
"epoch": 0.0009894947473736869,
"grad_norm": 91.06583404541016,
"learning_rate": 2.967e-06,
"loss": 14.1141,
"step": 989
},
{
"epoch": 0.000990495247623812,
"grad_norm": 70.22718811035156,
"learning_rate": 2.97e-06,
"loss": 12.7172,
"step": 990
},
{
"epoch": 0.000991495747873937,
"grad_norm": 56.140846252441406,
"learning_rate": 2.973e-06,
"loss": 12.1588,
"step": 991
},
{
"epoch": 0.000992496248124062,
"grad_norm": 61.37893295288086,
"learning_rate": 2.976e-06,
"loss": 14.1818,
"step": 992
},
{
"completion_length": 255.77084350585938,
"epoch": 0.000993496748374187,
"grad_norm": 62.03451919555664,
"learning_rate": 2.979e-06,
"loss": 3.4047,
"reward": 0.1975446492433548,
"reward_std": 0.1418343260884285,
"rewards/sudoku_reward_func": 0.1975446492433548,
"step": 993,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0009944972486243121,
"grad_norm": 52.507904052734375,
"learning_rate": 2.982e-06,
"loss": 2.0955,
"step": 994
},
{
"epoch": 0.0009954977488744372,
"grad_norm": 50.54158401489258,
"learning_rate": 2.9850000000000002e-06,
"loss": 4.8062,
"step": 995
},
{
"epoch": 0.0009964982491245623,
"grad_norm": 43.728912353515625,
"learning_rate": 2.988e-06,
"loss": 3.4528,
"step": 996
},
{
"epoch": 0.0009974987493746874,
"grad_norm": 52.202728271484375,
"learning_rate": 2.9910000000000002e-06,
"loss": 2.6404,
"step": 997
},
{
"epoch": 0.0009984992496248125,
"grad_norm": 52.02908706665039,
"learning_rate": 2.994e-06,
"loss": 0.8025,
"step": 998
},
{
"epoch": 0.0009994997498749374,
"grad_norm": 54.37822341918945,
"learning_rate": 2.9970000000000003e-06,
"loss": 3.864,
"step": 999
},
{
"epoch": 0.0010005002501250625,
"grad_norm": 44.46600341796875,
"learning_rate": 3e-06,
"loss": 2.609,
"step": 1000
},
{
"completion_length": 254.37500762939453,
"epoch": 0.0010015007503751876,
"grad_norm": 39.40637969970703,
"learning_rate": 3e-06,
"loss": -9.2458,
"reward": 0.22263558954000473,
"reward_std": 0.14237912744283676,
"rewards/sudoku_reward_func": 0.22263558954000473,
"step": 1001,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010025012506253126,
"grad_norm": 48.54051208496094,
"learning_rate": 3e-06,
"loss": -8.9324,
"step": 1002
},
{
"epoch": 0.0010035017508754377,
"grad_norm": 43.79606628417969,
"learning_rate": 3e-06,
"loss": -6.4303,
"step": 1003
},
{
"epoch": 0.0010045022511255628,
"grad_norm": 36.05558776855469,
"learning_rate": 3e-06,
"loss": -9.1181,
"step": 1004
},
{
"epoch": 0.001005502751375688,
"grad_norm": 39.247318267822266,
"learning_rate": 3e-06,
"loss": -9.8013,
"step": 1005
},
{
"epoch": 0.0010065032516258128,
"grad_norm": 45.25431442260742,
"learning_rate": 3e-06,
"loss": -9.7636,
"step": 1006
},
{
"epoch": 0.001007503751875938,
"grad_norm": 37.57381820678711,
"learning_rate": 3e-06,
"loss": -7.187,
"step": 1007
},
{
"epoch": 0.001008504252126063,
"grad_norm": 33.81916046142578,
"learning_rate": 3e-06,
"loss": -9.3165,
"step": 1008
},
{
"completion_length": 251.7916717529297,
"epoch": 0.001009504752376188,
"grad_norm": 47.115623474121094,
"learning_rate": 3e-06,
"loss": -3.2863,
"reward": 0.15844081342220306,
"reward_std": 0.1177215576171875,
"rewards/sudoku_reward_func": 0.15844080597162247,
"step": 1009,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010105052526263132,
"grad_norm": 41.37596893310547,
"learning_rate": 3e-06,
"loss": -3.0274,
"step": 1010
},
{
"epoch": 0.0010115057528764383,
"grad_norm": 42.2933464050293,
"learning_rate": 3e-06,
"loss": -3.8172,
"step": 1011
},
{
"epoch": 0.0010125062531265634,
"grad_norm": 35.31851577758789,
"learning_rate": 3e-06,
"loss": -3.7782,
"step": 1012
},
{
"epoch": 0.0010135067533766882,
"grad_norm": 43.999046325683594,
"learning_rate": 3e-06,
"loss": -3.1858,
"step": 1013
},
{
"epoch": 0.0010145072536268133,
"grad_norm": 42.06627655029297,
"learning_rate": 3e-06,
"loss": -3.3128,
"step": 1014
},
{
"epoch": 0.0010155077538769384,
"grad_norm": 43.68073272705078,
"learning_rate": 3e-06,
"loss": -4.1191,
"step": 1015
},
{
"epoch": 0.0010165082541270635,
"grad_norm": 40.35986328125,
"learning_rate": 3e-06,
"loss": -4.3652,
"step": 1016
},
{
"completion_length": 251.5,
"epoch": 0.0010175087543771886,
"grad_norm": 43.43013000488281,
"learning_rate": 3e-06,
"loss": -5.5952,
"reward": 0.21097884327173233,
"reward_std": 0.15194199979305267,
"rewards/sudoku_reward_func": 0.21097883582115173,
"step": 1017,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010185092546273137,
"grad_norm": 43.78632736206055,
"learning_rate": 3e-06,
"loss": -5.3231,
"step": 1018
},
{
"epoch": 0.0010195097548774388,
"grad_norm": 38.955196380615234,
"learning_rate": 3e-06,
"loss": -6.054,
"step": 1019
},
{
"epoch": 0.0010205102551275637,
"grad_norm": 54.36684799194336,
"learning_rate": 3e-06,
"loss": -3.6614,
"step": 1020
},
{
"epoch": 0.0010215107553776888,
"grad_norm": 52.5922966003418,
"learning_rate": 3e-06,
"loss": -5.9705,
"step": 1021
},
{
"epoch": 0.0010225112556278139,
"grad_norm": 43.55205535888672,
"learning_rate": 3e-06,
"loss": -5.892,
"step": 1022
},
{
"epoch": 0.001023511755877939,
"grad_norm": 48.446205139160156,
"learning_rate": 3e-06,
"loss": -6.5397,
"step": 1023
},
{
"epoch": 0.001024512256128064,
"grad_norm": 56.804771423339844,
"learning_rate": 3e-06,
"loss": -4.1653,
"step": 1024
},
{
"completion_length": 256.0,
"epoch": 0.0010255127563781892,
"grad_norm": 48.26102828979492,
"learning_rate": 3e-06,
"loss": -10.1381,
"reward": 0.2154017984867096,
"reward_std": 0.13763760775327682,
"rewards/sudoku_reward_func": 0.2154017835855484,
"step": 1025,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010265132566283143,
"grad_norm": 54.459964752197266,
"learning_rate": 3e-06,
"loss": -10.2523,
"step": 1026
},
{
"epoch": 0.0010275137568784391,
"grad_norm": 66.99800109863281,
"learning_rate": 3e-06,
"loss": -8.145,
"step": 1027
},
{
"epoch": 0.0010285142571285642,
"grad_norm": 40.35697937011719,
"learning_rate": 3e-06,
"loss": -9.0834,
"step": 1028
},
{
"epoch": 0.0010295147573786893,
"grad_norm": 48.033973693847656,
"learning_rate": 3e-06,
"loss": -10.6708,
"step": 1029
},
{
"epoch": 0.0010305152576288144,
"grad_norm": 44.81524658203125,
"learning_rate": 3e-06,
"loss": -10.8063,
"step": 1030
},
{
"epoch": 0.0010315157578789395,
"grad_norm": 58.0858154296875,
"learning_rate": 3e-06,
"loss": -9.0872,
"step": 1031
},
{
"epoch": 0.0010325162581290646,
"grad_norm": 41.79676818847656,
"learning_rate": 3e-06,
"loss": -9.7248,
"step": 1032
},
{
"completion_length": 254.9791717529297,
"epoch": 0.0010335167583791897,
"grad_norm": 61.14440155029297,
"learning_rate": 3e-06,
"loss": 2.4693,
"reward": 0.17128128558397293,
"reward_std": 0.11857020482420921,
"rewards/sudoku_reward_func": 0.17128127813339233,
"step": 1033,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010345172586293146,
"grad_norm": 34.947303771972656,
"learning_rate": 3e-06,
"loss": 1.9397,
"step": 1034
},
{
"epoch": 0.0010355177588794397,
"grad_norm": 42.94151306152344,
"learning_rate": 3e-06,
"loss": 1.5863,
"step": 1035
},
{
"epoch": 0.0010365182591295648,
"grad_norm": 34.938148498535156,
"learning_rate": 3e-06,
"loss": 2.1598,
"step": 1036
},
{
"epoch": 0.0010375187593796898,
"grad_norm": 43.56894302368164,
"learning_rate": 3e-06,
"loss": 1.5508,
"step": 1037
},
{
"epoch": 0.001038519259629815,
"grad_norm": 27.966718673706055,
"learning_rate": 3e-06,
"loss": 1.1268,
"step": 1038
},
{
"epoch": 0.00103951975987994,
"grad_norm": 34.90840530395508,
"learning_rate": 3e-06,
"loss": 0.9044,
"step": 1039
},
{
"epoch": 0.0010405202601300651,
"grad_norm": 32.87578201293945,
"learning_rate": 3e-06,
"loss": 1.2856,
"step": 1040
},
{
"completion_length": 251.62500762939453,
"epoch": 0.00104152076038019,
"grad_norm": 56.22600555419922,
"learning_rate": 3e-06,
"loss": 0.9322,
"reward": 0.18572255223989487,
"reward_std": 0.12399409711360931,
"rewards/sudoku_reward_func": 0.18572255223989487,
"step": 1041,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001042521260630315,
"grad_norm": 33.610408782958984,
"learning_rate": 3e-06,
"loss": 2.2944,
"step": 1042
},
{
"epoch": 0.0010435217608804402,
"grad_norm": 33.504329681396484,
"learning_rate": 3e-06,
"loss": 1.6829,
"step": 1043
},
{
"epoch": 0.0010445222611305653,
"grad_norm": 36.757469177246094,
"learning_rate": 3e-06,
"loss": 2.9836,
"step": 1044
},
{
"epoch": 0.0010455227613806904,
"grad_norm": 30.555280685424805,
"learning_rate": 3e-06,
"loss": 0.8575,
"step": 1045
},
{
"epoch": 0.0010465232616308155,
"grad_norm": 31.86289405822754,
"learning_rate": 3e-06,
"loss": 1.8835,
"step": 1046
},
{
"epoch": 0.0010475237618809406,
"grad_norm": 34.755680084228516,
"learning_rate": 3e-06,
"loss": 1.1176,
"step": 1047
},
{
"epoch": 0.0010485242621310654,
"grad_norm": 40.08235549926758,
"learning_rate": 3e-06,
"loss": 2.0918,
"step": 1048
},
{
"completion_length": 253.08333587646484,
"epoch": 0.0010495247623811905,
"grad_norm": 21.937795639038086,
"learning_rate": 3e-06,
"loss": -2.3447,
"reward": 0.165798619389534,
"reward_std": 0.08688194304704666,
"rewards/sudoku_reward_func": 0.1657986119389534,
"step": 1049,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010505252626313156,
"grad_norm": 25.62210464477539,
"learning_rate": 3e-06,
"loss": -3.3928,
"step": 1050
},
{
"epoch": 0.0010515257628814407,
"grad_norm": 25.852378845214844,
"learning_rate": 3e-06,
"loss": -2.4151,
"step": 1051
},
{
"epoch": 0.0010525262631315658,
"grad_norm": 19.493253707885742,
"learning_rate": 3e-06,
"loss": -3.1267,
"step": 1052
},
{
"epoch": 0.001053526763381691,
"grad_norm": 24.54071807861328,
"learning_rate": 3e-06,
"loss": -2.3195,
"step": 1053
},
{
"epoch": 0.001054527263631816,
"grad_norm": 26.212997436523438,
"learning_rate": 3e-06,
"loss": -3.525,
"step": 1054
},
{
"epoch": 0.0010555277638819409,
"grad_norm": 34.78599166870117,
"learning_rate": 3e-06,
"loss": -2.55,
"step": 1055
},
{
"epoch": 0.001056528264132066,
"grad_norm": 20.467262268066406,
"learning_rate": 3e-06,
"loss": -3.3289,
"step": 1056
},
{
"completion_length": 254.58333587646484,
"epoch": 0.001057528764382191,
"grad_norm": 49.550392150878906,
"learning_rate": 3e-06,
"loss": -12.7646,
"reward": 0.1769593432545662,
"reward_std": 0.15440654009580612,
"rewards/sudoku_reward_func": 0.1769593358039856,
"step": 1057,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010585292646323162,
"grad_norm": 61.013832092285156,
"learning_rate": 3e-06,
"loss": -11.774,
"step": 1058
},
{
"epoch": 0.0010595297648824413,
"grad_norm": 39.312164306640625,
"learning_rate": 3e-06,
"loss": -12.574,
"step": 1059
},
{
"epoch": 0.0010605302651325664,
"grad_norm": 49.83845520019531,
"learning_rate": 3e-06,
"loss": -14.3973,
"step": 1060
},
{
"epoch": 0.0010615307653826915,
"grad_norm": 41.74903106689453,
"learning_rate": 3e-06,
"loss": -13.5319,
"step": 1061
},
{
"epoch": 0.0010625312656328163,
"grad_norm": 63.667415618896484,
"learning_rate": 3e-06,
"loss": -12.8829,
"step": 1062
},
{
"epoch": 0.0010635317658829414,
"grad_norm": 39.259647369384766,
"learning_rate": 3e-06,
"loss": -13.3493,
"step": 1063
},
{
"epoch": 0.0010645322661330665,
"grad_norm": 48.72328186035156,
"learning_rate": 3e-06,
"loss": -15.4117,
"step": 1064
},
{
"completion_length": 254.68750762939453,
"epoch": 0.0010655327663831916,
"grad_norm": 37.72417449951172,
"learning_rate": 3e-06,
"loss": -1.2786,
"reward": 0.2013888955116272,
"reward_std": 0.15765716135501862,
"rewards/sudoku_reward_func": 0.2013888955116272,
"step": 1065,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010665332666333167,
"grad_norm": 50.46000671386719,
"learning_rate": 3e-06,
"loss": -2.2704,
"step": 1066
},
{
"epoch": 0.0010675337668834418,
"grad_norm": 58.26871109008789,
"learning_rate": 3e-06,
"loss": -2.746,
"step": 1067
},
{
"epoch": 0.0010685342671335667,
"grad_norm": 72.73383331298828,
"learning_rate": 3e-06,
"loss": -1.7437,
"step": 1068
},
{
"epoch": 0.0010695347673836918,
"grad_norm": 37.29910659790039,
"learning_rate": 3e-06,
"loss": -1.6776,
"step": 1069
},
{
"epoch": 0.0010705352676338169,
"grad_norm": 44.55699157714844,
"learning_rate": 3e-06,
"loss": -2.8655,
"step": 1070
},
{
"epoch": 0.001071535767883942,
"grad_norm": 46.04912567138672,
"learning_rate": 3e-06,
"loss": -3.3467,
"step": 1071
},
{
"epoch": 0.001072536268134067,
"grad_norm": 47.104732513427734,
"learning_rate": 3e-06,
"loss": -2.3808,
"step": 1072
},
{
"completion_length": 255.7916717529297,
"epoch": 0.0010735367683841921,
"grad_norm": 47.69097137451172,
"learning_rate": 3e-06,
"loss": -12.6992,
"reward": 0.26855987310409546,
"reward_std": 0.16888362169265747,
"rewards/sudoku_reward_func": 0.26855985820293427,
"step": 1073,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010745372686343172,
"grad_norm": 57.80671691894531,
"learning_rate": 3e-06,
"loss": -14.2458,
"step": 1074
},
{
"epoch": 0.0010755377688844421,
"grad_norm": 47.09156799316406,
"learning_rate": 3e-06,
"loss": -15.3588,
"step": 1075
},
{
"epoch": 0.0010765382691345672,
"grad_norm": 59.22261428833008,
"learning_rate": 3e-06,
"loss": -13.2847,
"step": 1076
},
{
"epoch": 0.0010775387693846923,
"grad_norm": 52.122535705566406,
"learning_rate": 3e-06,
"loss": -12.8921,
"step": 1077
},
{
"epoch": 0.0010785392696348174,
"grad_norm": 69.0072250366211,
"learning_rate": 3e-06,
"loss": -14.7293,
"step": 1078
},
{
"epoch": 0.0010795397698849425,
"grad_norm": 44.61542510986328,
"learning_rate": 3e-06,
"loss": -15.8092,
"step": 1079
},
{
"epoch": 0.0010805402701350676,
"grad_norm": 52.3683967590332,
"learning_rate": 3e-06,
"loss": -14.0392,
"step": 1080
},
{
"completion_length": 251.6041717529297,
"epoch": 0.0010815407703851927,
"grad_norm": 47.1711311340332,
"learning_rate": 3e-06,
"loss": -11.0198,
"reward": 0.19797680526971817,
"reward_std": 0.16620083153247833,
"rewards/sudoku_reward_func": 0.19797679781913757,
"step": 1081,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010825412706353176,
"grad_norm": 48.20561981201172,
"learning_rate": 3e-06,
"loss": -12.7956,
"step": 1082
},
{
"epoch": 0.0010835417708854427,
"grad_norm": 48.29547882080078,
"learning_rate": 3e-06,
"loss": -12.0332,
"step": 1083
},
{
"epoch": 0.0010845422711355677,
"grad_norm": 50.09273147583008,
"learning_rate": 3e-06,
"loss": -11.6346,
"step": 1084
},
{
"epoch": 0.0010855427713856928,
"grad_norm": 48.44402313232422,
"learning_rate": 3e-06,
"loss": -11.6618,
"step": 1085
},
{
"epoch": 0.001086543271635818,
"grad_norm": 49.130584716796875,
"learning_rate": 3e-06,
"loss": -13.341,
"step": 1086
},
{
"epoch": 0.001087543771885943,
"grad_norm": 51.19216537475586,
"learning_rate": 3e-06,
"loss": -12.571,
"step": 1087
},
{
"epoch": 0.0010885442721360681,
"grad_norm": 47.3951301574707,
"learning_rate": 3e-06,
"loss": -12.3729,
"step": 1088
},
{
"completion_length": 256.0,
"epoch": 0.001089544772386193,
"grad_norm": 45.89775466918945,
"learning_rate": 3e-06,
"loss": -9.4464,
"reward": 0.2154017984867096,
"reward_std": 0.141516774892807,
"rewards/sudoku_reward_func": 0.2154017835855484,
"step": 1089,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001090545272636318,
"grad_norm": 47.289772033691406,
"learning_rate": 3e-06,
"loss": -10.7988,
"step": 1090
},
{
"epoch": 0.0010915457728864432,
"grad_norm": 59.00400161743164,
"learning_rate": 3e-06,
"loss": -12.4682,
"step": 1091
},
{
"epoch": 0.0010925462731365683,
"grad_norm": 58.620784759521484,
"learning_rate": 3e-06,
"loss": -12.318,
"step": 1092
},
{
"epoch": 0.0010935467733866934,
"grad_norm": 55.14871597290039,
"learning_rate": 3e-06,
"loss": -9.9248,
"step": 1093
},
{
"epoch": 0.0010945472736368185,
"grad_norm": 41.1626091003418,
"learning_rate": 3e-06,
"loss": -11.404,
"step": 1094
},
{
"epoch": 0.0010955477738869436,
"grad_norm": 50.45378875732422,
"learning_rate": 3e-06,
"loss": -13.1491,
"step": 1095
},
{
"epoch": 0.0010965482741370684,
"grad_norm": 37.13240432739258,
"learning_rate": 3e-06,
"loss": -12.8651,
"step": 1096
},
{
"completion_length": 256.0,
"epoch": 0.0010975487743871935,
"grad_norm": 58.38191223144531,
"learning_rate": 3e-06,
"loss": -3.4455,
"reward": 0.22999339550733566,
"reward_std": 0.18920356035232544,
"rewards/sudoku_reward_func": 0.22999338060617447,
"step": 1097,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0010985492746373186,
"grad_norm": 58.06026077270508,
"learning_rate": 3e-06,
"loss": 0.3434,
"step": 1098
},
{
"epoch": 0.0010995497748874437,
"grad_norm": 52.834251403808594,
"learning_rate": 3e-06,
"loss": -4.7924,
"step": 1099
},
{
"epoch": 0.0011005502751375688,
"grad_norm": 62.384254455566406,
"learning_rate": 3e-06,
"loss": -2.7561,
"step": 1100
},
{
"epoch": 0.001101550775387694,
"grad_norm": 54.30180358886719,
"learning_rate": 3e-06,
"loss": -3.8362,
"step": 1101
},
{
"epoch": 0.001102551275637819,
"grad_norm": 64.72964477539062,
"learning_rate": 3e-06,
"loss": -0.2608,
"step": 1102
},
{
"epoch": 0.0011035517758879439,
"grad_norm": 59.001922607421875,
"learning_rate": 3e-06,
"loss": -5.3006,
"step": 1103
},
{
"epoch": 0.001104552276138069,
"grad_norm": 60.54927444458008,
"learning_rate": 3e-06,
"loss": -3.0542,
"step": 1104
},
{
"completion_length": 253.7916717529297,
"epoch": 0.001105552776388194,
"grad_norm": 34.46814727783203,
"learning_rate": 3e-06,
"loss": 1.3126,
"reward": 0.20547740161418915,
"reward_std": 0.14204465597867966,
"rewards/sudoku_reward_func": 0.20547740161418915,
"step": 1105,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011065532766383192,
"grad_norm": 38.980445861816406,
"learning_rate": 3e-06,
"loss": 0.8968,
"step": 1106
},
{
"epoch": 0.0011075537768884443,
"grad_norm": 41.922882080078125,
"learning_rate": 3e-06,
"loss": 0.7973,
"step": 1107
},
{
"epoch": 0.0011085542771385693,
"grad_norm": 35.87057113647461,
"learning_rate": 3e-06,
"loss": 1.8764,
"step": 1108
},
{
"epoch": 0.0011095547773886944,
"grad_norm": 37.61541748046875,
"learning_rate": 3e-06,
"loss": 1.0825,
"step": 1109
},
{
"epoch": 0.0011105552776388193,
"grad_norm": 38.23784255981445,
"learning_rate": 3e-06,
"loss": 0.5459,
"step": 1110
},
{
"epoch": 0.0011115557778889444,
"grad_norm": 35.42008590698242,
"learning_rate": 3e-06,
"loss": 0.5068,
"step": 1111
},
{
"epoch": 0.0011125562781390695,
"grad_norm": 34.2984504699707,
"learning_rate": 3e-06,
"loss": 1.6137,
"step": 1112
},
{
"completion_length": 253.7291717529297,
"epoch": 0.0011135567783891946,
"grad_norm": 64.51300048828125,
"learning_rate": 3e-06,
"loss": -27.9773,
"reward": 0.21698381751775742,
"reward_std": 0.19872380048036575,
"rewards/sudoku_reward_func": 0.21698381751775742,
"step": 1113,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011145572786393197,
"grad_norm": 51.81525802612305,
"learning_rate": 3e-06,
"loss": -26.6845,
"step": 1114
},
{
"epoch": 0.0011155577788894448,
"grad_norm": 74.68866729736328,
"learning_rate": 3e-06,
"loss": -26.7574,
"step": 1115
},
{
"epoch": 0.0011165582791395699,
"grad_norm": 91.27005004882812,
"learning_rate": 3e-06,
"loss": -29.4966,
"step": 1116
},
{
"epoch": 0.0011175587793896948,
"grad_norm": 70.424072265625,
"learning_rate": 3e-06,
"loss": -28.8359,
"step": 1117
},
{
"epoch": 0.0011185592796398199,
"grad_norm": 61.16431427001953,
"learning_rate": 3e-06,
"loss": -27.8278,
"step": 1118
},
{
"epoch": 0.001119559779889945,
"grad_norm": 71.94561004638672,
"learning_rate": 3e-06,
"loss": -28.0802,
"step": 1119
},
{
"epoch": 0.00112056028014007,
"grad_norm": 82.03819274902344,
"learning_rate": 3e-06,
"loss": -31.3154,
"step": 1120
},
{
"completion_length": 255.52083587646484,
"epoch": 0.0011215607803901951,
"grad_norm": 62.45363235473633,
"learning_rate": 3e-06,
"loss": -9.9586,
"reward": 0.2145337387919426,
"reward_std": 0.16341029852628708,
"rewards/sudoku_reward_func": 0.2145337387919426,
"step": 1121,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011225612806403202,
"grad_norm": 53.67804718017578,
"learning_rate": 3e-06,
"loss": -9.8162,
"step": 1122
},
{
"epoch": 0.0011235617808904453,
"grad_norm": 57.61661148071289,
"learning_rate": 3e-06,
"loss": -10.5356,
"step": 1123
},
{
"epoch": 0.0011245622811405702,
"grad_norm": 73.10306549072266,
"learning_rate": 3e-06,
"loss": -11.704,
"step": 1124
},
{
"epoch": 0.0011255627813906953,
"grad_norm": 71.7237548828125,
"learning_rate": 3e-06,
"loss": -10.8365,
"step": 1125
},
{
"epoch": 0.0011265632816408204,
"grad_norm": 65.70649719238281,
"learning_rate": 3e-06,
"loss": -10.4143,
"step": 1126
},
{
"epoch": 0.0011275637818909455,
"grad_norm": 61.187156677246094,
"learning_rate": 3e-06,
"loss": -11.2278,
"step": 1127
},
{
"epoch": 0.0011285642821410706,
"grad_norm": 73.90250396728516,
"learning_rate": 3e-06,
"loss": -12.7642,
"step": 1128
},
{
"completion_length": 256.0,
"epoch": 0.0011295647823911957,
"grad_norm": 57.4097785949707,
"learning_rate": 3e-06,
"loss": -4.7173,
"reward": 0.25728265941143036,
"reward_std": 0.1203451007604599,
"rewards/sudoku_reward_func": 0.25728263705968857,
"step": 1129,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011305652826413208,
"grad_norm": 66.7146987915039,
"learning_rate": 3e-06,
"loss": -4.9575,
"step": 1130
},
{
"epoch": 0.0011315657828914456,
"grad_norm": 64.86263275146484,
"learning_rate": 3e-06,
"loss": -4.5152,
"step": 1131
},
{
"epoch": 0.0011325662831415707,
"grad_norm": 62.34523391723633,
"learning_rate": 3e-06,
"loss": -6.3271,
"step": 1132
},
{
"epoch": 0.0011335667833916958,
"grad_norm": 51.701087951660156,
"learning_rate": 3e-06,
"loss": -5.0724,
"step": 1133
},
{
"epoch": 0.001134567283641821,
"grad_norm": 57.253665924072266,
"learning_rate": 3e-06,
"loss": -5.2917,
"step": 1134
},
{
"epoch": 0.001135567783891946,
"grad_norm": 86.78887176513672,
"learning_rate": 3e-06,
"loss": -5.2676,
"step": 1135
},
{
"epoch": 0.0011365682841420711,
"grad_norm": 57.795284271240234,
"learning_rate": 3e-06,
"loss": -6.9075,
"step": 1136
},
{
"completion_length": 255.5625,
"epoch": 0.0011375687843921962,
"grad_norm": 64.22807312011719,
"learning_rate": 3e-06,
"loss": -7.4062,
"reward": 0.1966765895485878,
"reward_std": 0.14253421127796173,
"rewards/sudoku_reward_func": 0.1966765895485878,
"step": 1137,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001138569284642321,
"grad_norm": 76.80013275146484,
"learning_rate": 3e-06,
"loss": -6.0915,
"step": 1138
},
{
"epoch": 0.0011395697848924462,
"grad_norm": 69.39733123779297,
"learning_rate": 3e-06,
"loss": -5.2055,
"step": 1139
},
{
"epoch": 0.0011405702851425713,
"grad_norm": 58.3795166015625,
"learning_rate": 3e-06,
"loss": -7.0196,
"step": 1140
},
{
"epoch": 0.0011415707853926964,
"grad_norm": 48.28700637817383,
"learning_rate": 3e-06,
"loss": -7.8775,
"step": 1141
},
{
"epoch": 0.0011425712856428215,
"grad_norm": 66.8180160522461,
"learning_rate": 3e-06,
"loss": -7.0476,
"step": 1142
},
{
"epoch": 0.0011435717858929466,
"grad_norm": 60.36480712890625,
"learning_rate": 3e-06,
"loss": -6.1894,
"step": 1143
},
{
"epoch": 0.0011445722861430714,
"grad_norm": 46.661373138427734,
"learning_rate": 3e-06,
"loss": -7.787,
"step": 1144
},
{
"completion_length": 256.0,
"epoch": 0.0011455727863931965,
"grad_norm": 63.26940155029297,
"learning_rate": 3e-06,
"loss": -12.6312,
"reward": 0.21070076525211334,
"reward_std": 0.16143980622291565,
"rewards/sudoku_reward_func": 0.21070076525211334,
"step": 1145,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011465732866433216,
"grad_norm": 59.096675872802734,
"learning_rate": 3e-06,
"loss": -10.4162,
"step": 1146
},
{
"epoch": 0.0011475737868934467,
"grad_norm": 68.54641723632812,
"learning_rate": 3e-06,
"loss": -8.9985,
"step": 1147
},
{
"epoch": 0.0011485742871435718,
"grad_norm": 77.7593002319336,
"learning_rate": 3e-06,
"loss": -10.295,
"step": 1148
},
{
"epoch": 0.001149574787393697,
"grad_norm": 52.21497344970703,
"learning_rate": 3e-06,
"loss": -13.3169,
"step": 1149
},
{
"epoch": 0.001150575287643822,
"grad_norm": 44.6879997253418,
"learning_rate": 3e-06,
"loss": -11.347,
"step": 1150
},
{
"epoch": 0.0011515757878939469,
"grad_norm": 49.52695083618164,
"learning_rate": 3e-06,
"loss": -9.9074,
"step": 1151
},
{
"epoch": 0.001152576288144072,
"grad_norm": 52.84867477416992,
"learning_rate": 3e-06,
"loss": -10.8295,
"step": 1152
},
{
"completion_length": 256.0,
"epoch": 0.001153576788394197,
"grad_norm": 47.332157135009766,
"learning_rate": 3e-06,
"loss": -10.6754,
"reward": 0.23607730120420456,
"reward_std": 0.15124260634183884,
"rewards/sudoku_reward_func": 0.23607730120420456,
"step": 1153,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011545772886443221,
"grad_norm": 59.60795211791992,
"learning_rate": 3e-06,
"loss": -9.6053,
"step": 1154
},
{
"epoch": 0.0011555777888944472,
"grad_norm": 58.676998138427734,
"learning_rate": 3e-06,
"loss": -6.6886,
"step": 1155
},
{
"epoch": 0.0011565782891445723,
"grad_norm": 44.790306091308594,
"learning_rate": 3e-06,
"loss": -7.5866,
"step": 1156
},
{
"epoch": 0.0011575787893946974,
"grad_norm": 47.08787536621094,
"learning_rate": 3e-06,
"loss": -11.3928,
"step": 1157
},
{
"epoch": 0.0011585792896448223,
"grad_norm": 62.33521270751953,
"learning_rate": 3e-06,
"loss": -10.1897,
"step": 1158
},
{
"epoch": 0.0011595797898949474,
"grad_norm": 62.34925842285156,
"learning_rate": 3e-06,
"loss": -6.9214,
"step": 1159
},
{
"epoch": 0.0011605802901450725,
"grad_norm": 51.26759719848633,
"learning_rate": 3e-06,
"loss": -8.3383,
"step": 1160
},
{
"completion_length": 256.0,
"epoch": 0.0011615807903951976,
"grad_norm": 71.22803497314453,
"learning_rate": 3e-06,
"loss": -3.411,
"reward": 0.1722470298409462,
"reward_std": 0.15417955815792084,
"rewards/sudoku_reward_func": 0.1722470298409462,
"step": 1161,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011625812906453227,
"grad_norm": 71.8301010131836,
"learning_rate": 3e-06,
"loss": -6.7294,
"step": 1162
},
{
"epoch": 0.0011635817908954478,
"grad_norm": 98.72702026367188,
"learning_rate": 3e-06,
"loss": -8.4408,
"step": 1163
},
{
"epoch": 0.0011645822911455729,
"grad_norm": 48.08713912963867,
"learning_rate": 3e-06,
"loss": -3.6254,
"step": 1164
},
{
"epoch": 0.0011655827913956977,
"grad_norm": 68.0713882446289,
"learning_rate": 3e-06,
"loss": -4.2022,
"step": 1165
},
{
"epoch": 0.0011665832916458228,
"grad_norm": 58.155616760253906,
"learning_rate": 3e-06,
"loss": -7.46,
"step": 1166
},
{
"epoch": 0.001167583791895948,
"grad_norm": 85.09232330322266,
"learning_rate": 3e-06,
"loss": -9.5867,
"step": 1167
},
{
"epoch": 0.001168584292146073,
"grad_norm": 43.816776275634766,
"learning_rate": 3e-06,
"loss": -3.9229,
"step": 1168
},
{
"completion_length": 254.2916717529297,
"epoch": 0.0011695847923961981,
"grad_norm": 51.38243103027344,
"learning_rate": 3e-06,
"loss": -11.8953,
"reward": 0.20840098708868027,
"reward_std": 0.1413535289466381,
"rewards/sudoku_reward_func": 0.20840098708868027,
"step": 1169,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011705852926463232,
"grad_norm": 45.753936767578125,
"learning_rate": 3e-06,
"loss": -11.8943,
"step": 1170
},
{
"epoch": 0.0011715857928964483,
"grad_norm": 49.84098434448242,
"learning_rate": 3e-06,
"loss": -13.6678,
"step": 1171
},
{
"epoch": 0.0011725862931465732,
"grad_norm": 54.85823440551758,
"learning_rate": 3e-06,
"loss": -13.16,
"step": 1172
},
{
"epoch": 0.0011735867933966983,
"grad_norm": 48.858642578125,
"learning_rate": 3e-06,
"loss": -12.1275,
"step": 1173
},
{
"epoch": 0.0011745872936468234,
"grad_norm": 73.31692504882812,
"learning_rate": 3e-06,
"loss": -12.7694,
"step": 1174
},
{
"epoch": 0.0011755877938969485,
"grad_norm": 62.74945831298828,
"learning_rate": 3e-06,
"loss": -14.5101,
"step": 1175
},
{
"epoch": 0.0011765882941470736,
"grad_norm": 46.59790802001953,
"learning_rate": 3e-06,
"loss": -14.0565,
"step": 1176
},
{
"completion_length": 255.6875,
"epoch": 0.0011775887943971987,
"grad_norm": 75.01581573486328,
"learning_rate": 3e-06,
"loss": -11.7232,
"reward": 0.20382773131132126,
"reward_std": 0.12743021547794342,
"rewards/sudoku_reward_func": 0.20382773131132126,
"step": 1177,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011785892946473238,
"grad_norm": 50.770267486572266,
"learning_rate": 3e-06,
"loss": -11.7161,
"step": 1178
},
{
"epoch": 0.0011795897948974486,
"grad_norm": 46.51319122314453,
"learning_rate": 3e-06,
"loss": -12.1454,
"step": 1179
},
{
"epoch": 0.0011805902951475737,
"grad_norm": 66.0778579711914,
"learning_rate": 3e-06,
"loss": -12.9209,
"step": 1180
},
{
"epoch": 0.0011815907953976988,
"grad_norm": 79.279296875,
"learning_rate": 3e-06,
"loss": -11.9834,
"step": 1181
},
{
"epoch": 0.001182591295647824,
"grad_norm": 90.76998138427734,
"learning_rate": 3e-06,
"loss": -11.7098,
"step": 1182
},
{
"epoch": 0.001183591795897949,
"grad_norm": 45.429054260253906,
"learning_rate": 3e-06,
"loss": -12.816,
"step": 1183
},
{
"epoch": 0.001184592296148074,
"grad_norm": 72.65291595458984,
"learning_rate": 3e-06,
"loss": -13.4472,
"step": 1184
},
{
"completion_length": 255.0416717529297,
"epoch": 0.0011855927963981992,
"grad_norm": 51.79472732543945,
"learning_rate": 3e-06,
"loss": -8.3447,
"reward": 0.21494334936141968,
"reward_std": 0.18318727612495422,
"rewards/sudoku_reward_func": 0.21494334936141968,
"step": 1185,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001186593296648324,
"grad_norm": 81.84302520751953,
"learning_rate": 3e-06,
"loss": -6.9186,
"step": 1186
},
{
"epoch": 0.0011875937968984492,
"grad_norm": 65.76778411865234,
"learning_rate": 3e-06,
"loss": -9.9974,
"step": 1187
},
{
"epoch": 0.0011885942971485743,
"grad_norm": 49.771148681640625,
"learning_rate": 3e-06,
"loss": -6.0897,
"step": 1188
},
{
"epoch": 0.0011895947973986994,
"grad_norm": 57.41340255737305,
"learning_rate": 3e-06,
"loss": -9.1539,
"step": 1189
},
{
"epoch": 0.0011905952976488244,
"grad_norm": 84.84779357910156,
"learning_rate": 3e-06,
"loss": -7.8363,
"step": 1190
},
{
"epoch": 0.0011915957978989495,
"grad_norm": 62.59910202026367,
"learning_rate": 3e-06,
"loss": -10.681,
"step": 1191
},
{
"epoch": 0.0011925962981490746,
"grad_norm": 55.14677047729492,
"learning_rate": 3e-06,
"loss": -6.7142,
"step": 1192
},
{
"completion_length": 256.0,
"epoch": 0.0011935967983991995,
"grad_norm": 55.2512321472168,
"learning_rate": 3e-06,
"loss": -6.7691,
"reward": 0.19457221776247025,
"reward_std": 0.12072273343801498,
"rewards/sudoku_reward_func": 0.19457221776247025,
"step": 1193,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0011945972986493246,
"grad_norm": 58.17645263671875,
"learning_rate": 3e-06,
"loss": -7.746,
"step": 1194
},
{
"epoch": 0.0011955977988994497,
"grad_norm": 54.57912826538086,
"learning_rate": 3e-06,
"loss": -6.2499,
"step": 1195
},
{
"epoch": 0.0011965982991495748,
"grad_norm": 50.100582122802734,
"learning_rate": 3e-06,
"loss": -8.7012,
"step": 1196
},
{
"epoch": 0.0011975987993996999,
"grad_norm": 58.364891052246094,
"learning_rate": 3e-06,
"loss": -6.9944,
"step": 1197
},
{
"epoch": 0.001198599299649825,
"grad_norm": 63.98237991333008,
"learning_rate": 3e-06,
"loss": -8.0062,
"step": 1198
},
{
"epoch": 0.00119959979989995,
"grad_norm": 55.945518493652344,
"learning_rate": 3e-06,
"loss": -7.0796,
"step": 1199
},
{
"epoch": 0.001200600300150075,
"grad_norm": 43.91253662109375,
"learning_rate": 3e-06,
"loss": -9.2514,
"step": 1200
},
{
"completion_length": 255.8541717529297,
"epoch": 0.0012016008004002,
"grad_norm": 54.789920806884766,
"learning_rate": 3e-06,
"loss": -17.2824,
"reward": 0.22809194028377533,
"reward_std": 0.15452970564365387,
"rewards/sudoku_reward_func": 0.22809194028377533,
"step": 1201,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012026013006503251,
"grad_norm": 75.46862030029297,
"learning_rate": 3e-06,
"loss": -17.5235,
"step": 1202
},
{
"epoch": 0.0012036018009004502,
"grad_norm": 51.897701263427734,
"learning_rate": 3e-06,
"loss": -17.4954,
"step": 1203
},
{
"epoch": 0.0012046023011505753,
"grad_norm": 56.08158874511719,
"learning_rate": 3e-06,
"loss": -18.0722,
"step": 1204
},
{
"epoch": 0.0012056028014007004,
"grad_norm": 44.373287200927734,
"learning_rate": 3e-06,
"loss": -17.5788,
"step": 1205
},
{
"epoch": 0.0012066033016508255,
"grad_norm": 57.38613510131836,
"learning_rate": 3e-06,
"loss": -18.2698,
"step": 1206
},
{
"epoch": 0.0012076038019009504,
"grad_norm": 63.77848434448242,
"learning_rate": 3e-06,
"loss": -17.855,
"step": 1207
},
{
"epoch": 0.0012086043021510755,
"grad_norm": 54.5859375,
"learning_rate": 3e-06,
"loss": -18.4923,
"step": 1208
},
{
"completion_length": 252.4791717529297,
"epoch": 0.0012096048024012006,
"grad_norm": 54.04340744018555,
"learning_rate": 3e-06,
"loss": -9.2032,
"reward": 0.23073744028806686,
"reward_std": 0.151360884308815,
"rewards/sudoku_reward_func": 0.23073744028806686,
"step": 1209,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012106053026513257,
"grad_norm": 68.70407104492188,
"learning_rate": 3e-06,
"loss": -10.1651,
"step": 1210
},
{
"epoch": 0.0012116058029014508,
"grad_norm": 60.362545013427734,
"learning_rate": 3e-06,
"loss": -9.5503,
"step": 1211
},
{
"epoch": 0.0012126063031515759,
"grad_norm": 53.39244842529297,
"learning_rate": 3e-06,
"loss": -10.7794,
"step": 1212
},
{
"epoch": 0.001213606803401701,
"grad_norm": 55.661537170410156,
"learning_rate": 3e-06,
"loss": -9.8296,
"step": 1213
},
{
"epoch": 0.0012146073036518258,
"grad_norm": 68.5494613647461,
"learning_rate": 3e-06,
"loss": -11.2466,
"step": 1214
},
{
"epoch": 0.001215607803901951,
"grad_norm": 80.74198913574219,
"learning_rate": 3e-06,
"loss": -10.4096,
"step": 1215
},
{
"epoch": 0.001216608304152076,
"grad_norm": 63.03583908081055,
"learning_rate": 3e-06,
"loss": -11.251,
"step": 1216
},
{
"completion_length": 256.0,
"epoch": 0.0012176088044022011,
"grad_norm": 54.193382263183594,
"learning_rate": 3e-06,
"loss": -6.744,
"reward": 0.22689320147037506,
"reward_std": 0.15158719569444656,
"rewards/sudoku_reward_func": 0.22689320147037506,
"step": 1217,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012186093046523262,
"grad_norm": 61.95039749145508,
"learning_rate": 3e-06,
"loss": -8.6779,
"step": 1218
},
{
"epoch": 0.0012196098049024513,
"grad_norm": 60.906612396240234,
"learning_rate": 3e-06,
"loss": -11.6589,
"step": 1219
},
{
"epoch": 0.0012206103051525764,
"grad_norm": 64.95401000976562,
"learning_rate": 3e-06,
"loss": -6.9082,
"step": 1220
},
{
"epoch": 0.0012216108054027013,
"grad_norm": 51.398624420166016,
"learning_rate": 3e-06,
"loss": -6.9897,
"step": 1221
},
{
"epoch": 0.0012226113056528264,
"grad_norm": 63.6590690612793,
"learning_rate": 3e-06,
"loss": -9.0396,
"step": 1222
},
{
"epoch": 0.0012236118059029515,
"grad_norm": 56.660369873046875,
"learning_rate": 3e-06,
"loss": -12.1785,
"step": 1223
},
{
"epoch": 0.0012246123061530766,
"grad_norm": 71.9052734375,
"learning_rate": 3e-06,
"loss": -7.0529,
"step": 1224
},
{
"completion_length": 256.0,
"epoch": 0.0012256128064032016,
"grad_norm": 68.56539916992188,
"learning_rate": 3e-06,
"loss": -8.8322,
"reward": 0.1755952499806881,
"reward_std": 0.13179953396320343,
"rewards/sudoku_reward_func": 0.1755952425301075,
"step": 1225,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012266133066533267,
"grad_norm": 76.60578155517578,
"learning_rate": 3e-06,
"loss": -10.5883,
"step": 1226
},
{
"epoch": 0.0012276138069034516,
"grad_norm": 71.67562103271484,
"learning_rate": 3e-06,
"loss": -7.2645,
"step": 1227
},
{
"epoch": 0.0012286143071535767,
"grad_norm": 55.47669982910156,
"learning_rate": 3e-06,
"loss": -8.8148,
"step": 1228
},
{
"epoch": 0.0012296148074037018,
"grad_norm": 89.62641906738281,
"learning_rate": 3e-06,
"loss": -8.9664,
"step": 1229
},
{
"epoch": 0.001230615307653827,
"grad_norm": 67.79340362548828,
"learning_rate": 3e-06,
"loss": -11.0511,
"step": 1230
},
{
"epoch": 0.001231615807903952,
"grad_norm": 68.45556640625,
"learning_rate": 3e-06,
"loss": -8.3416,
"step": 1231
},
{
"epoch": 0.001232616308154077,
"grad_norm": 59.216705322265625,
"learning_rate": 3e-06,
"loss": -8.9795,
"step": 1232
},
{
"completion_length": 252.3541717529297,
"epoch": 0.0012336168084042022,
"grad_norm": 55.95851516723633,
"learning_rate": 3e-06,
"loss": -7.4608,
"reward": 0.18629751354455948,
"reward_std": 0.1258675828576088,
"rewards/sudoku_reward_func": 0.18629750609397888,
"step": 1233,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001234617308654327,
"grad_norm": 47.71657943725586,
"learning_rate": 3e-06,
"loss": -4.2586,
"step": 1234
},
{
"epoch": 0.0012356178089044522,
"grad_norm": 63.48736572265625,
"learning_rate": 3e-06,
"loss": -7.4636,
"step": 1235
},
{
"epoch": 0.0012366183091545772,
"grad_norm": 46.31603240966797,
"learning_rate": 3e-06,
"loss": -8.7069,
"step": 1236
},
{
"epoch": 0.0012376188094047023,
"grad_norm": 57.96319580078125,
"learning_rate": 3e-06,
"loss": -7.4238,
"step": 1237
},
{
"epoch": 0.0012386193096548274,
"grad_norm": 45.64490509033203,
"learning_rate": 3e-06,
"loss": -4.4522,
"step": 1238
},
{
"epoch": 0.0012396198099049525,
"grad_norm": 68.51618957519531,
"learning_rate": 3e-06,
"loss": -7.908,
"step": 1239
},
{
"epoch": 0.0012406203101550776,
"grad_norm": 47.038169860839844,
"learning_rate": 3e-06,
"loss": -9.3185,
"step": 1240
},
{
"completion_length": 254.02083587646484,
"epoch": 0.0012416208104052025,
"grad_norm": 40.134315490722656,
"learning_rate": 3e-06,
"loss": 11.8948,
"reward": 0.19241898506879807,
"reward_std": 0.11422308534383774,
"rewards/sudoku_reward_func": 0.19241898506879807,
"step": 1241,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012426213106553276,
"grad_norm": 44.94253921508789,
"learning_rate": 3e-06,
"loss": 11.3922,
"step": 1242
},
{
"epoch": 0.0012436218109054527,
"grad_norm": 42.705142974853516,
"learning_rate": 3e-06,
"loss": 10.8539,
"step": 1243
},
{
"epoch": 0.0012446223111555778,
"grad_norm": 54.742881774902344,
"learning_rate": 3e-06,
"loss": 12.6673,
"step": 1244
},
{
"epoch": 0.0012456228114057029,
"grad_norm": 38.904541015625,
"learning_rate": 3e-06,
"loss": 11.573,
"step": 1245
},
{
"epoch": 0.001246623311655828,
"grad_norm": 44.749977111816406,
"learning_rate": 3e-06,
"loss": 11.1428,
"step": 1246
},
{
"epoch": 0.001247623811905953,
"grad_norm": 42.752193450927734,
"learning_rate": 3e-06,
"loss": 10.1868,
"step": 1247
},
{
"epoch": 0.001248624312156078,
"grad_norm": 52.36620330810547,
"learning_rate": 3e-06,
"loss": 12.2819,
"step": 1248
},
{
"completion_length": 253.8541717529297,
"epoch": 0.001249624812406203,
"grad_norm": 51.714195251464844,
"learning_rate": 3e-06,
"loss": -10.4317,
"reward": 0.22540509700775146,
"reward_std": 0.1407158188521862,
"rewards/sudoku_reward_func": 0.22540509700775146,
"step": 1249,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012506253126563281,
"grad_norm": 58.5916633605957,
"learning_rate": 3e-06,
"loss": -10.271,
"step": 1250
},
{
"epoch": 0.0012516258129064532,
"grad_norm": 50.90818786621094,
"learning_rate": 3e-06,
"loss": -15.043,
"step": 1251
},
{
"epoch": 0.0012526263131565783,
"grad_norm": 61.283321380615234,
"learning_rate": 3e-06,
"loss": -15.4831,
"step": 1252
},
{
"epoch": 0.0012536268134067034,
"grad_norm": 49.761512756347656,
"learning_rate": 3e-06,
"loss": -10.8181,
"step": 1253
},
{
"epoch": 0.0012546273136568285,
"grad_norm": 65.47783660888672,
"learning_rate": 3e-06,
"loss": -10.4775,
"step": 1254
},
{
"epoch": 0.0012556278139069534,
"grad_norm": 49.240692138671875,
"learning_rate": 3e-06,
"loss": -15.3536,
"step": 1255
},
{
"epoch": 0.0012566283141570785,
"grad_norm": 73.38108825683594,
"learning_rate": 3e-06,
"loss": -16.2277,
"step": 1256
},
{
"completion_length": 251.3541717529297,
"epoch": 0.0012576288144072036,
"grad_norm": 103.87187194824219,
"learning_rate": 3e-06,
"loss": -9.3538,
"reward": 0.21862224489450455,
"reward_std": 0.14190439134836197,
"rewards/sudoku_reward_func": 0.21862224489450455,
"step": 1257,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012586293146573287,
"grad_norm": 79.26856994628906,
"learning_rate": 3e-06,
"loss": -10.7503,
"step": 1258
},
{
"epoch": 0.0012596298149074538,
"grad_norm": 94.93610382080078,
"learning_rate": 3e-06,
"loss": -9.2469,
"step": 1259
},
{
"epoch": 0.0012606303151575789,
"grad_norm": 76.20452117919922,
"learning_rate": 3e-06,
"loss": -9.0145,
"step": 1260
},
{
"epoch": 0.001261630815407704,
"grad_norm": 115.39997100830078,
"learning_rate": 3e-06,
"loss": -10.6488,
"step": 1261
},
{
"epoch": 0.0012626313156578288,
"grad_norm": 77.61318969726562,
"learning_rate": 3e-06,
"loss": -11.5868,
"step": 1262
},
{
"epoch": 0.001263631815907954,
"grad_norm": 101.30606079101562,
"learning_rate": 3e-06,
"loss": -9.5905,
"step": 1263
},
{
"epoch": 0.001264632316158079,
"grad_norm": 79.39830780029297,
"learning_rate": 3e-06,
"loss": -10.3842,
"step": 1264
},
{
"completion_length": 253.4791717529297,
"epoch": 0.001265632816408204,
"grad_norm": 70.97663879394531,
"learning_rate": 3e-06,
"loss": -3.1305,
"reward": 0.23012492060661316,
"reward_std": 0.1151810809969902,
"rewards/sudoku_reward_func": 0.23012491315603256,
"step": 1265,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012666333166583292,
"grad_norm": 62.975276947021484,
"learning_rate": 3e-06,
"loss": -5.8182,
"step": 1266
},
{
"epoch": 0.0012676338169084543,
"grad_norm": 58.47067642211914,
"learning_rate": 3e-06,
"loss": -3.038,
"step": 1267
},
{
"epoch": 0.0012686343171585794,
"grad_norm": 59.895057678222656,
"learning_rate": 3e-06,
"loss": -2.2264,
"step": 1268
},
{
"epoch": 0.0012696348174087043,
"grad_norm": 69.18795013427734,
"learning_rate": 3e-06,
"loss": -3.2379,
"step": 1269
},
{
"epoch": 0.0012706353176588294,
"grad_norm": 60.994815826416016,
"learning_rate": 3e-06,
"loss": -6.9695,
"step": 1270
},
{
"epoch": 0.0012716358179089544,
"grad_norm": 52.13833999633789,
"learning_rate": 3e-06,
"loss": -3.9799,
"step": 1271
},
{
"epoch": 0.0012726363181590795,
"grad_norm": 63.659725189208984,
"learning_rate": 3e-06,
"loss": -2.0911,
"step": 1272
},
{
"completion_length": 254.375,
"epoch": 0.0012736368184092046,
"grad_norm": 115.85366821289062,
"learning_rate": 3e-06,
"loss": -0.5977,
"reward": 0.21176423132419586,
"reward_std": 0.14812293648719788,
"rewards/sudoku_reward_func": 0.21176422387361526,
"step": 1273,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012746373186593297,
"grad_norm": 82.96026611328125,
"learning_rate": 3e-06,
"loss": -5.8223,
"step": 1274
},
{
"epoch": 0.0012756378189094548,
"grad_norm": 101.27959442138672,
"learning_rate": 3e-06,
"loss": -2.9291,
"step": 1275
},
{
"epoch": 0.0012766383191595797,
"grad_norm": 84.06151580810547,
"learning_rate": 3e-06,
"loss": -3.468,
"step": 1276
},
{
"epoch": 0.0012776388194097048,
"grad_norm": 119.58338165283203,
"learning_rate": 3e-06,
"loss": -0.5509,
"step": 1277
},
{
"epoch": 0.0012786393196598299,
"grad_norm": 89.5655746459961,
"learning_rate": 3e-06,
"loss": -6.6584,
"step": 1278
},
{
"epoch": 0.001279639819909955,
"grad_norm": 100.6349105834961,
"learning_rate": 3e-06,
"loss": -3.7463,
"step": 1279
},
{
"epoch": 0.00128064032016008,
"grad_norm": 84.62515258789062,
"learning_rate": 3e-06,
"loss": -4.8434,
"step": 1280
},
{
"completion_length": 252.39583587646484,
"epoch": 0.0012816408204102052,
"grad_norm": 94.54007720947266,
"learning_rate": 3e-06,
"loss": 5.2631,
"reward": 0.26583169400691986,
"reward_std": 0.12800082564353943,
"rewards/sudoku_reward_func": 0.26583168655633926,
"step": 1281,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012826413206603303,
"grad_norm": 98.34558868408203,
"learning_rate": 3e-06,
"loss": 5.4646,
"step": 1282
},
{
"epoch": 0.0012836418209104551,
"grad_norm": 93.9869384765625,
"learning_rate": 3e-06,
"loss": 5.4419,
"step": 1283
},
{
"epoch": 0.0012846423211605802,
"grad_norm": 87.3626708984375,
"learning_rate": 3e-06,
"loss": 3.0002,
"step": 1284
},
{
"epoch": 0.0012856428214107053,
"grad_norm": 90.73570251464844,
"learning_rate": 3e-06,
"loss": 4.9438,
"step": 1285
},
{
"epoch": 0.0012866433216608304,
"grad_norm": 109.58126831054688,
"learning_rate": 3e-06,
"loss": 5.7411,
"step": 1286
},
{
"epoch": 0.0012876438219109555,
"grad_norm": 100.88875579833984,
"learning_rate": 3e-06,
"loss": 5.1317,
"step": 1287
},
{
"epoch": 0.0012886443221610806,
"grad_norm": 87.1065673828125,
"learning_rate": 3e-06,
"loss": 2.495,
"step": 1288
},
{
"completion_length": 252.4791717529297,
"epoch": 0.0012896448224112057,
"grad_norm": 98.72467803955078,
"learning_rate": 3e-06,
"loss": -12.6574,
"reward": 0.22350365668535233,
"reward_std": 0.17447489500045776,
"rewards/sudoku_reward_func": 0.22350364923477173,
"step": 1289,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012906453226613306,
"grad_norm": 128.9749755859375,
"learning_rate": 3e-06,
"loss": -5.734,
"step": 1290
},
{
"epoch": 0.0012916458229114557,
"grad_norm": 102.89720916748047,
"learning_rate": 3e-06,
"loss": -4.3407,
"step": 1291
},
{
"epoch": 0.0012926463231615808,
"grad_norm": 91.6898422241211,
"learning_rate": 3e-06,
"loss": -9.1816,
"step": 1292
},
{
"epoch": 0.0012936468234117059,
"grad_norm": 107.5361557006836,
"learning_rate": 3e-06,
"loss": -13.5125,
"step": 1293
},
{
"epoch": 0.001294647323661831,
"grad_norm": 112.34242248535156,
"learning_rate": 3e-06,
"loss": -7.4742,
"step": 1294
},
{
"epoch": 0.001295647823911956,
"grad_norm": 90.103515625,
"learning_rate": 3e-06,
"loss": -5.8833,
"step": 1295
},
{
"epoch": 0.0012966483241620811,
"grad_norm": 87.0075454711914,
"learning_rate": 3e-06,
"loss": -10.2467,
"step": 1296
},
{
"completion_length": 255.27083587646484,
"epoch": 0.001297648824412206,
"grad_norm": 102.01131439208984,
"learning_rate": 3e-06,
"loss": -11.5775,
"reward": 0.20604482293128967,
"reward_std": 0.15800370275974274,
"rewards/sudoku_reward_func": 0.20604482293128967,
"step": 1297,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0012986493246623311,
"grad_norm": 112.05081939697266,
"learning_rate": 3e-06,
"loss": -8.5013,
"step": 1298
},
{
"epoch": 0.0012996498249124562,
"grad_norm": 76.74340057373047,
"learning_rate": 3e-06,
"loss": -10.8034,
"step": 1299
},
{
"epoch": 0.0013006503251625813,
"grad_norm": 92.91266632080078,
"learning_rate": 3e-06,
"loss": -10.9193,
"step": 1300
},
{
"epoch": 0.0013016508254127064,
"grad_norm": 78.14289093017578,
"learning_rate": 3e-06,
"loss": -12.0713,
"step": 1301
},
{
"epoch": 0.0013026513256628315,
"grad_norm": 85.2936019897461,
"learning_rate": 3e-06,
"loss": -9.2757,
"step": 1302
},
{
"epoch": 0.0013036518259129564,
"grad_norm": 71.67143249511719,
"learning_rate": 3e-06,
"loss": -11.6347,
"step": 1303
},
{
"epoch": 0.0013046523261630815,
"grad_norm": 89.5588607788086,
"learning_rate": 3e-06,
"loss": -10.9821,
"step": 1304
},
{
"completion_length": 248.5416717529297,
"epoch": 0.0013056528264132066,
"grad_norm": 61.13591384887695,
"learning_rate": 3e-06,
"loss": -1.4544,
"reward": 0.24454365670681,
"reward_std": 0.13666882365942,
"rewards/sudoku_reward_func": 0.24454365670681,
"step": 1305,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013066533266633317,
"grad_norm": 65.24658966064453,
"learning_rate": 3e-06,
"loss": -1.3029,
"step": 1306
},
{
"epoch": 0.0013076538269134567,
"grad_norm": 67.64944458007812,
"learning_rate": 3e-06,
"loss": -4.5651,
"step": 1307
},
{
"epoch": 0.0013086543271635818,
"grad_norm": 92.36503601074219,
"learning_rate": 3e-06,
"loss": -2.1237,
"step": 1308
},
{
"epoch": 0.001309654827413707,
"grad_norm": 63.10952377319336,
"learning_rate": 3e-06,
"loss": -1.846,
"step": 1309
},
{
"epoch": 0.0013106553276638318,
"grad_norm": 63.06090545654297,
"learning_rate": 3e-06,
"loss": -1.8813,
"step": 1310
},
{
"epoch": 0.001311655827913957,
"grad_norm": 68.20687866210938,
"learning_rate": 3e-06,
"loss": -5.8478,
"step": 1311
},
{
"epoch": 0.001312656328164082,
"grad_norm": 95.44178009033203,
"learning_rate": 3e-06,
"loss": -3.2949,
"step": 1312
},
{
"completion_length": 246.5625,
"epoch": 0.001313656828414207,
"grad_norm": 82.63322448730469,
"learning_rate": 3e-06,
"loss": -8.5331,
"reward": 0.21932870894670486,
"reward_std": 0.13176480680704117,
"rewards/sudoku_reward_func": 0.21932870149612427,
"step": 1313,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013146573286643322,
"grad_norm": 76.15507507324219,
"learning_rate": 3e-06,
"loss": -10.7565,
"step": 1314
},
{
"epoch": 0.0013156578289144573,
"grad_norm": 66.5208969116211,
"learning_rate": 3e-06,
"loss": -10.1243,
"step": 1315
},
{
"epoch": 0.0013166583291645824,
"grad_norm": 63.920650482177734,
"learning_rate": 3e-06,
"loss": -9.6918,
"step": 1316
},
{
"epoch": 0.0013176588294147073,
"grad_norm": 112.46867370605469,
"learning_rate": 3e-06,
"loss": -8.9053,
"step": 1317
},
{
"epoch": 0.0013186593296648323,
"grad_norm": 96.82019805908203,
"learning_rate": 3e-06,
"loss": -10.5068,
"step": 1318
},
{
"epoch": 0.0013196598299149574,
"grad_norm": 78.8730697631836,
"learning_rate": 3e-06,
"loss": -10.6982,
"step": 1319
},
{
"epoch": 0.0013206603301650825,
"grad_norm": 70.09661865234375,
"learning_rate": 3e-06,
"loss": -10.2522,
"step": 1320
},
{
"completion_length": 250.83333587646484,
"epoch": 0.0013216608304152076,
"grad_norm": 86.71615600585938,
"learning_rate": 3e-06,
"loss": 2.4734,
"reward": 0.19102858752012253,
"reward_std": 0.14162860810756683,
"rewards/sudoku_reward_func": 0.19102858752012253,
"step": 1321,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013226613306653327,
"grad_norm": 106.76415252685547,
"learning_rate": 3e-06,
"loss": -0.4374,
"step": 1322
},
{
"epoch": 0.0013236618309154578,
"grad_norm": 185.66705322265625,
"learning_rate": 3e-06,
"loss": 3.0146,
"step": 1323
},
{
"epoch": 0.0013246623311655827,
"grad_norm": 108.97735595703125,
"learning_rate": 3e-06,
"loss": 2.749,
"step": 1324
},
{
"epoch": 0.0013256628314157078,
"grad_norm": 90.60322570800781,
"learning_rate": 3e-06,
"loss": 2.3065,
"step": 1325
},
{
"epoch": 0.0013266633316658329,
"grad_norm": 106.5020751953125,
"learning_rate": 3e-06,
"loss": -0.4537,
"step": 1326
},
{
"epoch": 0.001327663831915958,
"grad_norm": 173.3131866455078,
"learning_rate": 3e-06,
"loss": 2.367,
"step": 1327
},
{
"epoch": 0.001328664332166083,
"grad_norm": 98.478515625,
"learning_rate": 3e-06,
"loss": 2.0123,
"step": 1328
},
{
"completion_length": 249.8541717529297,
"epoch": 0.0013296648324162082,
"grad_norm": 127.2477798461914,
"learning_rate": 3e-06,
"loss": -3.8098,
"reward": 0.2223086580634117,
"reward_std": 0.14327675849199295,
"rewards/sudoku_reward_func": 0.2223086580634117,
"step": 1329,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013306653326663333,
"grad_norm": 75.55018615722656,
"learning_rate": 3e-06,
"loss": 0.4287,
"step": 1330
},
{
"epoch": 0.0013316658329164581,
"grad_norm": 87.05211639404297,
"learning_rate": 3e-06,
"loss": -0.1133,
"step": 1331
},
{
"epoch": 0.0013326663331665832,
"grad_norm": 74.08929443359375,
"learning_rate": 3e-06,
"loss": -3.0703,
"step": 1332
},
{
"epoch": 0.0013336668334167083,
"grad_norm": 145.09320068359375,
"learning_rate": 3e-06,
"loss": -4.448,
"step": 1333
},
{
"epoch": 0.0013346673336668334,
"grad_norm": 79.23677062988281,
"learning_rate": 3e-06,
"loss": -0.3325,
"step": 1334
},
{
"epoch": 0.0013356678339169585,
"grad_norm": 86.81128692626953,
"learning_rate": 3e-06,
"loss": -0.9774,
"step": 1335
},
{
"epoch": 0.0013366683341670836,
"grad_norm": 82.84042358398438,
"learning_rate": 3e-06,
"loss": -3.4287,
"step": 1336
},
{
"completion_length": 233.20833587646484,
"epoch": 0.0013376688344172087,
"grad_norm": 95.14927673339844,
"learning_rate": 3e-06,
"loss": -7.883,
"reward": 0.2618408799171448,
"reward_std": 0.12445945292711258,
"rewards/sudoku_reward_func": 0.2618408799171448,
"step": 1337,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013386693346673336,
"grad_norm": 68.84015655517578,
"learning_rate": 3e-06,
"loss": -8.7071,
"step": 1338
},
{
"epoch": 0.0013396698349174587,
"grad_norm": 127.49705505371094,
"learning_rate": 3e-06,
"loss": -4.8652,
"step": 1339
},
{
"epoch": 0.0013406703351675838,
"grad_norm": 87.26109313964844,
"learning_rate": 3e-06,
"loss": -4.9352,
"step": 1340
},
{
"epoch": 0.0013416708354177089,
"grad_norm": 91.16841888427734,
"learning_rate": 3e-06,
"loss": -8.759,
"step": 1341
},
{
"epoch": 0.001342671335667834,
"grad_norm": 76.6376953125,
"learning_rate": 3e-06,
"loss": -9.6058,
"step": 1342
},
{
"epoch": 0.001343671835917959,
"grad_norm": 90.81112670898438,
"learning_rate": 3e-06,
"loss": -5.3411,
"step": 1343
},
{
"epoch": 0.0013446723361680841,
"grad_norm": 112.43119049072266,
"learning_rate": 3e-06,
"loss": -4.9454,
"step": 1344
},
{
"completion_length": 249.6041717529297,
"epoch": 0.001345672836418209,
"grad_norm": 122.31782531738281,
"learning_rate": 3e-06,
"loss": 1.6905,
"reward": 0.28765709698200226,
"reward_std": 0.15683971345424652,
"rewards/sudoku_reward_func": 0.28765709698200226,
"step": 1345,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001346673336668334,
"grad_norm": 156.43411254882812,
"learning_rate": 3e-06,
"loss": -5.0148,
"step": 1346
},
{
"epoch": 0.0013476738369184592,
"grad_norm": 168.4134521484375,
"learning_rate": 3e-06,
"loss": 2.5596,
"step": 1347
},
{
"epoch": 0.0013486743371685843,
"grad_norm": 177.4627227783203,
"learning_rate": 3e-06,
"loss": 9.4151,
"step": 1348
},
{
"epoch": 0.0013496748374187094,
"grad_norm": 166.94285583496094,
"learning_rate": 3e-06,
"loss": 0.9821,
"step": 1349
},
{
"epoch": 0.0013506753376688345,
"grad_norm": 170.2594757080078,
"learning_rate": 3e-06,
"loss": -6.3015,
"step": 1350
},
{
"epoch": 0.0013516758379189596,
"grad_norm": 163.38580322265625,
"learning_rate": 3e-06,
"loss": 1.8278,
"step": 1351
},
{
"epoch": 0.0013526763381690845,
"grad_norm": 175.76690673828125,
"learning_rate": 3e-06,
"loss": 9.595,
"step": 1352
},
{
"completion_length": 244.9791717529297,
"epoch": 0.0013536768384192095,
"grad_norm": 97.57686614990234,
"learning_rate": 3e-06,
"loss": 3.503,
"reward": 0.27925462275743484,
"reward_std": 0.13963400572538376,
"rewards/sudoku_reward_func": 0.27925462275743484,
"step": 1353,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013546773386693346,
"grad_norm": 103.25208282470703,
"learning_rate": 3e-06,
"loss": 1.375,
"step": 1354
},
{
"epoch": 0.0013556778389194597,
"grad_norm": 120.28804779052734,
"learning_rate": 3e-06,
"loss": -0.7448,
"step": 1355
},
{
"epoch": 0.0013566783391695848,
"grad_norm": 77.3034439086914,
"learning_rate": 3e-06,
"loss": 5.2681,
"step": 1356
},
{
"epoch": 0.00135767883941971,
"grad_norm": 114.31840515136719,
"learning_rate": 3e-06,
"loss": 2.6711,
"step": 1357
},
{
"epoch": 0.001358679339669835,
"grad_norm": 121.68708038330078,
"learning_rate": 3e-06,
"loss": 1.068,
"step": 1358
},
{
"epoch": 0.00135967983991996,
"grad_norm": 183.0277862548828,
"learning_rate": 3e-06,
"loss": -1.5462,
"step": 1359
},
{
"epoch": 0.001360680340170085,
"grad_norm": 79.93998718261719,
"learning_rate": 3e-06,
"loss": 4.9505,
"step": 1360
},
{
"completion_length": 248.06250762939453,
"epoch": 0.00136168084042021,
"grad_norm": 133.2718963623047,
"learning_rate": 3e-06,
"loss": -6.8724,
"reward": 0.2313988208770752,
"reward_std": 0.15942735970020294,
"rewards/sudoku_reward_func": 0.2313988208770752,
"step": 1361,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013626813406703352,
"grad_norm": 143.41510009765625,
"learning_rate": 3e-06,
"loss": -11.1885,
"step": 1362
},
{
"epoch": 0.0013636818409204603,
"grad_norm": 156.24859619140625,
"learning_rate": 3e-06,
"loss": -4.2768,
"step": 1363
},
{
"epoch": 0.0013646823411705854,
"grad_norm": 148.48779296875,
"learning_rate": 3e-06,
"loss": -6.0221,
"step": 1364
},
{
"epoch": 0.0013656828414207105,
"grad_norm": 126.07758331298828,
"learning_rate": 3e-06,
"loss": -7.2337,
"step": 1365
},
{
"epoch": 0.0013666833416708353,
"grad_norm": 146.1149444580078,
"learning_rate": 3e-06,
"loss": -12.7754,
"step": 1366
},
{
"epoch": 0.0013676838419209604,
"grad_norm": 166.75257873535156,
"learning_rate": 3e-06,
"loss": -5.6501,
"step": 1367
},
{
"epoch": 0.0013686843421710855,
"grad_norm": 117.26903533935547,
"learning_rate": 3e-06,
"loss": -7.462,
"step": 1368
},
{
"completion_length": 244.37500762939453,
"epoch": 0.0013696848424212106,
"grad_norm": 155.39373779296875,
"learning_rate": 3e-06,
"loss": -3.6877,
"reward": 0.25125136226415634,
"reward_std": 0.15128708630800247,
"rewards/sudoku_reward_func": 0.25125134736299515,
"step": 1369,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013706853426713357,
"grad_norm": 113.63846588134766,
"learning_rate": 3e-06,
"loss": -3.9167,
"step": 1370
},
{
"epoch": 0.0013716858429214608,
"grad_norm": 72.45719909667969,
"learning_rate": 3e-06,
"loss": -1.3301,
"step": 1371
},
{
"epoch": 0.001372686343171586,
"grad_norm": 78.77584838867188,
"learning_rate": 3e-06,
"loss": -6.9234,
"step": 1372
},
{
"epoch": 0.0013736868434217108,
"grad_norm": 151.5078125,
"learning_rate": 3e-06,
"loss": -3.098,
"step": 1373
},
{
"epoch": 0.0013746873436718359,
"grad_norm": 65.08445739746094,
"learning_rate": 3e-06,
"loss": -4.1381,
"step": 1374
},
{
"epoch": 0.001375687843921961,
"grad_norm": 74.84840393066406,
"learning_rate": 3e-06,
"loss": -1.6518,
"step": 1375
},
{
"epoch": 0.001376688344172086,
"grad_norm": 75.1297836303711,
"learning_rate": 3e-06,
"loss": -7.4169,
"step": 1376
},
{
"completion_length": 248.4791717529297,
"epoch": 0.0013776888444222112,
"grad_norm": 91.91381072998047,
"learning_rate": 3e-06,
"loss": -6.5628,
"reward": 0.2777778059244156,
"reward_std": 0.15616532415151596,
"rewards/sudoku_reward_func": 0.2777777910232544,
"step": 1377,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013786893446723362,
"grad_norm": 86.98223114013672,
"learning_rate": 3e-06,
"loss": -6.736,
"step": 1378
},
{
"epoch": 0.0013796898449224613,
"grad_norm": 119.746337890625,
"learning_rate": 3e-06,
"loss": -6.1626,
"step": 1379
},
{
"epoch": 0.0013806903451725862,
"grad_norm": 102.7596435546875,
"learning_rate": 3e-06,
"loss": -6.9399,
"step": 1380
},
{
"epoch": 0.0013816908454227113,
"grad_norm": 88.32569885253906,
"learning_rate": 3e-06,
"loss": -6.8012,
"step": 1381
},
{
"epoch": 0.0013826913456728364,
"grad_norm": 63.3649787902832,
"learning_rate": 3e-06,
"loss": -7.2393,
"step": 1382
},
{
"epoch": 0.0013836918459229615,
"grad_norm": 82.19108581542969,
"learning_rate": 3e-06,
"loss": -6.9574,
"step": 1383
},
{
"epoch": 0.0013846923461730866,
"grad_norm": 74.44951629638672,
"learning_rate": 3e-06,
"loss": -7.6355,
"step": 1384
},
{
"completion_length": 251.6666717529297,
"epoch": 0.0013856928464232117,
"grad_norm": 79.18040466308594,
"learning_rate": 3e-06,
"loss": -4.2122,
"reward": 0.24041006714105606,
"reward_std": 0.13425163179636002,
"rewards/sudoku_reward_func": 0.24041006714105606,
"step": 1385,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0013866933466733366,
"grad_norm": 71.49141693115234,
"learning_rate": 3e-06,
"loss": -5.4048,
"step": 1386
},
{
"epoch": 0.0013876938469234617,
"grad_norm": 63.46376419067383,
"learning_rate": 3e-06,
"loss": -6.3293,
"step": 1387
},
{
"epoch": 0.0013886943471735867,
"grad_norm": 71.38902282714844,
"learning_rate": 3e-06,
"loss": -7.0593,
"step": 1388
},
{
"epoch": 0.0013896948474237118,
"grad_norm": 66.57677459716797,
"learning_rate": 3e-06,
"loss": -5.1928,
"step": 1389
},
{
"epoch": 0.001390695347673837,
"grad_norm": 71.48369598388672,
"learning_rate": 3e-06,
"loss": -6.1357,
"step": 1390
},
{
"epoch": 0.001391695847923962,
"grad_norm": 61.57743453979492,
"learning_rate": 3e-06,
"loss": -6.5625,
"step": 1391
},
{
"epoch": 0.0013926963481740871,
"grad_norm": 58.65646743774414,
"learning_rate": 3e-06,
"loss": -7.4159,
"step": 1392
},
{
"completion_length": 247.06250762939453,
"epoch": 0.001393696848424212,
"grad_norm": 47.037384033203125,
"learning_rate": 3e-06,
"loss": -4.6575,
"reward": 0.24611443281173706,
"reward_std": 0.1536630243062973,
"rewards/sudoku_reward_func": 0.24611442536115646,
"step": 1393,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001394697348674337,
"grad_norm": 85.59077453613281,
"learning_rate": 3e-06,
"loss": -3.5807,
"step": 1394
},
{
"epoch": 0.0013956978489244622,
"grad_norm": 56.778385162353516,
"learning_rate": 3e-06,
"loss": -3.5657,
"step": 1395
},
{
"epoch": 0.0013966983491745873,
"grad_norm": 56.52222442626953,
"learning_rate": 3e-06,
"loss": -3.2542,
"step": 1396
},
{
"epoch": 0.0013976988494247124,
"grad_norm": 55.23912811279297,
"learning_rate": 3e-06,
"loss": -5.5549,
"step": 1397
},
{
"epoch": 0.0013986993496748375,
"grad_norm": 76.44470977783203,
"learning_rate": 3e-06,
"loss": -4.2466,
"step": 1398
},
{
"epoch": 0.0013996998499249626,
"grad_norm": 60.509979248046875,
"learning_rate": 3e-06,
"loss": -4.577,
"step": 1399
},
{
"epoch": 0.0014007003501750874,
"grad_norm": 48.10431671142578,
"learning_rate": 3e-06,
"loss": -3.913,
"step": 1400
},
{
"completion_length": 255.45833587646484,
"epoch": 0.0014017008504252125,
"grad_norm": 72.41065216064453,
"learning_rate": 3e-06,
"loss": -11.1462,
"reward": 0.234995037317276,
"reward_std": 0.17775796353816986,
"rewards/sudoku_reward_func": 0.234995037317276,
"step": 1401,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014027013506753376,
"grad_norm": 84.73635864257812,
"learning_rate": 3e-06,
"loss": -5.2094,
"step": 1402
},
{
"epoch": 0.0014037018509254627,
"grad_norm": 76.14923095703125,
"learning_rate": 3e-06,
"loss": -5.9781,
"step": 1403
},
{
"epoch": 0.0014047023511755878,
"grad_norm": 75.8636245727539,
"learning_rate": 3e-06,
"loss": -11.0312,
"step": 1404
},
{
"epoch": 0.001405702851425713,
"grad_norm": 82.16057586669922,
"learning_rate": 3e-06,
"loss": -11.7936,
"step": 1405
},
{
"epoch": 0.001406703351675838,
"grad_norm": 72.84368896484375,
"learning_rate": 3e-06,
"loss": -5.8866,
"step": 1406
},
{
"epoch": 0.0014077038519259629,
"grad_norm": 76.86295318603516,
"learning_rate": 3e-06,
"loss": -6.5346,
"step": 1407
},
{
"epoch": 0.001408704352176088,
"grad_norm": 75.8145523071289,
"learning_rate": 3e-06,
"loss": -11.6837,
"step": 1408
},
{
"completion_length": 255.0416717529297,
"epoch": 0.001409704852426213,
"grad_norm": 76.15098571777344,
"learning_rate": 3e-06,
"loss": -3.7352,
"reward": 0.2776537910103798,
"reward_std": 0.16747048497200012,
"rewards/sudoku_reward_func": 0.2776537910103798,
"step": 1409,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014107053526763382,
"grad_norm": 106.66970825195312,
"learning_rate": 3e-06,
"loss": -4.3643,
"step": 1410
},
{
"epoch": 0.0014117058529264633,
"grad_norm": 64.76913452148438,
"learning_rate": 3e-06,
"loss": -6.6609,
"step": 1411
},
{
"epoch": 0.0014127063531765884,
"grad_norm": 113.87123107910156,
"learning_rate": 3e-06,
"loss": -4.4335,
"step": 1412
},
{
"epoch": 0.0014137068534267134,
"grad_norm": 130.8094482421875,
"learning_rate": 3e-06,
"loss": -4.8342,
"step": 1413
},
{
"epoch": 0.0014147073536768383,
"grad_norm": 85.35004425048828,
"learning_rate": 3e-06,
"loss": -5.7942,
"step": 1414
},
{
"epoch": 0.0014157078539269634,
"grad_norm": 69.62791442871094,
"learning_rate": 3e-06,
"loss": -8.0415,
"step": 1415
},
{
"epoch": 0.0014167083541770885,
"grad_norm": 116.4716567993164,
"learning_rate": 3e-06,
"loss": -6.9187,
"step": 1416
},
{
"completion_length": 246.52083587646484,
"epoch": 0.0014177088544272136,
"grad_norm": 68.39940643310547,
"learning_rate": 3e-06,
"loss": 12.7806,
"reward": 0.2607887014746666,
"reward_std": 0.15520118921995163,
"rewards/sudoku_reward_func": 0.2607886865735054,
"step": 1417,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014187093546773387,
"grad_norm": 65.6619644165039,
"learning_rate": 3e-06,
"loss": 15.2863,
"step": 1418
},
{
"epoch": 0.0014197098549274638,
"grad_norm": 67.66735076904297,
"learning_rate": 3e-06,
"loss": 12.5973,
"step": 1419
},
{
"epoch": 0.0014207103551775889,
"grad_norm": 61.57198715209961,
"learning_rate": 3e-06,
"loss": 12.9858,
"step": 1420
},
{
"epoch": 0.0014217108554277138,
"grad_norm": 95.22171020507812,
"learning_rate": 3e-06,
"loss": 11.3072,
"step": 1421
},
{
"epoch": 0.0014227113556778389,
"grad_norm": 136.36041259765625,
"learning_rate": 3e-06,
"loss": 15.4914,
"step": 1422
},
{
"epoch": 0.001423711855927964,
"grad_norm": 87.2356185913086,
"learning_rate": 3e-06,
"loss": 12.1819,
"step": 1423
},
{
"epoch": 0.001424712356178089,
"grad_norm": 85.48587036132812,
"learning_rate": 3e-06,
"loss": 12.1587,
"step": 1424
},
{
"completion_length": 252.625,
"epoch": 0.0014257128564282141,
"grad_norm": 124.8813705444336,
"learning_rate": 3e-06,
"loss": 0.3882,
"reward": 0.22999338805675507,
"reward_std": 0.13858719915151596,
"rewards/sudoku_reward_func": 0.22999338805675507,
"step": 1425,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014267133566783392,
"grad_norm": 110.09771728515625,
"learning_rate": 3e-06,
"loss": 1.7846,
"step": 1426
},
{
"epoch": 0.0014277138569284643,
"grad_norm": 120.795654296875,
"learning_rate": 3e-06,
"loss": 0.6717,
"step": 1427
},
{
"epoch": 0.0014287143571785892,
"grad_norm": 98.05519104003906,
"learning_rate": 3e-06,
"loss": 0.1646,
"step": 1428
},
{
"epoch": 0.0014297148574287143,
"grad_norm": 145.90524291992188,
"learning_rate": 3e-06,
"loss": -0.9419,
"step": 1429
},
{
"epoch": 0.0014307153576788394,
"grad_norm": 84.8663558959961,
"learning_rate": 3e-06,
"loss": 0.2669,
"step": 1430
},
{
"epoch": 0.0014317158579289645,
"grad_norm": 90.54247283935547,
"learning_rate": 3e-06,
"loss": -1.3302,
"step": 1431
},
{
"epoch": 0.0014327163581790896,
"grad_norm": 93.9137954711914,
"learning_rate": 3e-06,
"loss": -1.5561,
"step": 1432
},
{
"completion_length": 252.20833587646484,
"epoch": 0.0014337168584292147,
"grad_norm": 91.17662811279297,
"learning_rate": 3e-06,
"loss": 1.4802,
"reward": 0.22301137447357178,
"reward_std": 0.13641764968633652,
"rewards/sudoku_reward_func": 0.22301135957241058,
"step": 1433,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014347173586793398,
"grad_norm": 83.0351791381836,
"learning_rate": 3e-06,
"loss": 3.8915,
"step": 1434
},
{
"epoch": 0.0014357178589294646,
"grad_norm": 101.50720977783203,
"learning_rate": 3e-06,
"loss": 7.0151,
"step": 1435
},
{
"epoch": 0.0014367183591795897,
"grad_norm": 85.99288177490234,
"learning_rate": 3e-06,
"loss": 0.5616,
"step": 1436
},
{
"epoch": 0.0014377188594297148,
"grad_norm": 76.99950408935547,
"learning_rate": 3e-06,
"loss": 1.1366,
"step": 1437
},
{
"epoch": 0.00143871935967984,
"grad_norm": 80.017333984375,
"learning_rate": 3e-06,
"loss": 3.0612,
"step": 1438
},
{
"epoch": 0.001439719859929965,
"grad_norm": 110.35974884033203,
"learning_rate": 3e-06,
"loss": 5.9277,
"step": 1439
},
{
"epoch": 0.0014407203601800901,
"grad_norm": 99.30435943603516,
"learning_rate": 3e-06,
"loss": -1.2894,
"step": 1440
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0014417208604302152,
"grad_norm": 106.5459976196289,
"learning_rate": 3e-06,
"loss": -14.271,
"reward": 0.247519850730896,
"reward_std": 0.1321713551878929,
"rewards/sudoku_reward_func": 0.247519850730896,
"step": 1441,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00144272136068034,
"grad_norm": 91.2779769897461,
"learning_rate": 3e-06,
"loss": -8.3958,
"step": 1442
},
{
"epoch": 0.0014437218609304652,
"grad_norm": 116.3575210571289,
"learning_rate": 3e-06,
"loss": -13.9669,
"step": 1443
},
{
"epoch": 0.0014447223611805903,
"grad_norm": 132.5576171875,
"learning_rate": 3e-06,
"loss": -8.9636,
"step": 1444
},
{
"epoch": 0.0014457228614307154,
"grad_norm": 143.8059844970703,
"learning_rate": 3e-06,
"loss": -13.8777,
"step": 1445
},
{
"epoch": 0.0014467233616808405,
"grad_norm": 85.30062866210938,
"learning_rate": 3e-06,
"loss": -8.6056,
"step": 1446
},
{
"epoch": 0.0014477238619309656,
"grad_norm": 97.61617279052734,
"learning_rate": 3e-06,
"loss": -14.6353,
"step": 1447
},
{
"epoch": 0.0014487243621810906,
"grad_norm": 147.2740020751953,
"learning_rate": 3e-06,
"loss": -9.864,
"step": 1448
},
{
"completion_length": 250.4791717529297,
"epoch": 0.0014497248624312155,
"grad_norm": 137.6214599609375,
"learning_rate": 3e-06,
"loss": -6.1127,
"reward": 0.25371648371219635,
"reward_std": 0.15944789350032806,
"rewards/sudoku_reward_func": 0.25371648371219635,
"step": 1449,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014507253626813406,
"grad_norm": 106.05636596679688,
"learning_rate": 3e-06,
"loss": -5.666,
"step": 1450
},
{
"epoch": 0.0014517258629314657,
"grad_norm": 110.15316772460938,
"learning_rate": 3e-06,
"loss": -7.7442,
"step": 1451
},
{
"epoch": 0.0014527263631815908,
"grad_norm": 91.43193054199219,
"learning_rate": 3e-06,
"loss": -4.0562,
"step": 1452
},
{
"epoch": 0.001453726863431716,
"grad_norm": 129.03883361816406,
"learning_rate": 3e-06,
"loss": -6.7951,
"step": 1453
},
{
"epoch": 0.001454727363681841,
"grad_norm": 131.1297607421875,
"learning_rate": 3e-06,
"loss": -6.6216,
"step": 1454
},
{
"epoch": 0.001455727863931966,
"grad_norm": 117.30534362792969,
"learning_rate": 3e-06,
"loss": -8.6388,
"step": 1455
},
{
"epoch": 0.001456728364182091,
"grad_norm": 103.80106353759766,
"learning_rate": 3e-06,
"loss": -4.4661,
"step": 1456
},
{
"completion_length": 246.12500762939453,
"epoch": 0.001457728864432216,
"grad_norm": 87.62139129638672,
"learning_rate": 3e-06,
"loss": -6.7951,
"reward": 0.233506940305233,
"reward_std": 0.13778656721115112,
"rewards/sudoku_reward_func": 0.233506940305233,
"step": 1457,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014587293646823412,
"grad_norm": 84.07535552978516,
"learning_rate": 3e-06,
"loss": -7.339,
"step": 1458
},
{
"epoch": 0.0014597298649324662,
"grad_norm": 57.86692810058594,
"learning_rate": 3e-06,
"loss": -6.35,
"step": 1459
},
{
"epoch": 0.0014607303651825913,
"grad_norm": 99.71435546875,
"learning_rate": 3e-06,
"loss": -6.6156,
"step": 1460
},
{
"epoch": 0.0014617308654327164,
"grad_norm": 104.31732940673828,
"learning_rate": 3e-06,
"loss": -8.1659,
"step": 1461
},
{
"epoch": 0.0014627313656828413,
"grad_norm": 91.96192932128906,
"learning_rate": 3e-06,
"loss": -7.4639,
"step": 1462
},
{
"epoch": 0.0014637318659329664,
"grad_norm": 60.986629486083984,
"learning_rate": 3e-06,
"loss": -7.0631,
"step": 1463
},
{
"epoch": 0.0014647323661830915,
"grad_norm": 97.76730346679688,
"learning_rate": 3e-06,
"loss": -7.033,
"step": 1464
},
{
"completion_length": 245.12500762939453,
"epoch": 0.0014657328664332166,
"grad_norm": 114.5215835571289,
"learning_rate": 3e-06,
"loss": 3.3239,
"reward": 0.22682179510593414,
"reward_std": 0.1532593071460724,
"rewards/sudoku_reward_func": 0.22682178765535355,
"step": 1465,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014667333666833417,
"grad_norm": 107.46202087402344,
"learning_rate": 3e-06,
"loss": 6.8997,
"step": 1466
},
{
"epoch": 0.0014677338669334668,
"grad_norm": 110.28289031982422,
"learning_rate": 3e-06,
"loss": 6.234,
"step": 1467
},
{
"epoch": 0.0014687343671835919,
"grad_norm": 98.04695129394531,
"learning_rate": 3e-06,
"loss": 1.3534,
"step": 1468
},
{
"epoch": 0.0014697348674337168,
"grad_norm": 151.7884979248047,
"learning_rate": 3e-06,
"loss": 2.3593,
"step": 1469
},
{
"epoch": 0.0014707353676838418,
"grad_norm": 92.70248413085938,
"learning_rate": 3e-06,
"loss": 6.602,
"step": 1470
},
{
"epoch": 0.001471735867933967,
"grad_norm": 102.759765625,
"learning_rate": 3e-06,
"loss": 5.3602,
"step": 1471
},
{
"epoch": 0.001472736368184092,
"grad_norm": 94.43704986572266,
"learning_rate": 3e-06,
"loss": 0.9451,
"step": 1472
},
{
"completion_length": 252.6666717529297,
"epoch": 0.0014737368684342171,
"grad_norm": 188.50820922851562,
"learning_rate": 3e-06,
"loss": 0.5014,
"reward": 0.26963459700345993,
"reward_std": 0.14512023329734802,
"rewards/sudoku_reward_func": 0.26963459700345993,
"step": 1473,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014747373686843422,
"grad_norm": 128.79505920410156,
"learning_rate": 3e-06,
"loss": -3.2434,
"step": 1474
},
{
"epoch": 0.0014757378689344673,
"grad_norm": 123.74315643310547,
"learning_rate": 3e-06,
"loss": -6.2439,
"step": 1475
},
{
"epoch": 0.0014767383691845922,
"grad_norm": 90.2215805053711,
"learning_rate": 3e-06,
"loss": -2.9395,
"step": 1476
},
{
"epoch": 0.0014777388694347173,
"grad_norm": 215.22544860839844,
"learning_rate": 3e-06,
"loss": -1.6467,
"step": 1477
},
{
"epoch": 0.0014787393696848424,
"grad_norm": 88.53108215332031,
"learning_rate": 3e-06,
"loss": -4.445,
"step": 1478
},
{
"epoch": 0.0014797398699349675,
"grad_norm": 111.55303955078125,
"learning_rate": 3e-06,
"loss": -6.7549,
"step": 1479
},
{
"epoch": 0.0014807403701850926,
"grad_norm": 111.30841827392578,
"learning_rate": 3e-06,
"loss": -2.8563,
"step": 1480
},
{
"completion_length": 253.06250762939453,
"epoch": 0.0014817408704352177,
"grad_norm": 83.2706069946289,
"learning_rate": 3e-06,
"loss": 5.2902,
"reward": 0.24458499252796173,
"reward_std": 0.1385301575064659,
"rewards/sudoku_reward_func": 0.24458499252796173,
"step": 1481,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014827413706853428,
"grad_norm": 74.04076385498047,
"learning_rate": 3e-06,
"loss": 7.6081,
"step": 1482
},
{
"epoch": 0.0014837418709354676,
"grad_norm": 88.90835571289062,
"learning_rate": 3e-06,
"loss": 7.3049,
"step": 1483
},
{
"epoch": 0.0014847423711855927,
"grad_norm": 92.47587585449219,
"learning_rate": 3e-06,
"loss": 9.4078,
"step": 1484
},
{
"epoch": 0.0014857428714357178,
"grad_norm": 76.9234848022461,
"learning_rate": 3e-06,
"loss": 4.3459,
"step": 1485
},
{
"epoch": 0.001486743371685843,
"grad_norm": 75.98670959472656,
"learning_rate": 3e-06,
"loss": 6.8549,
"step": 1486
},
{
"epoch": 0.001487743871935968,
"grad_norm": 129.2482452392578,
"learning_rate": 3e-06,
"loss": 5.5137,
"step": 1487
},
{
"epoch": 0.001488744372186093,
"grad_norm": 79.61399841308594,
"learning_rate": 3e-06,
"loss": 8.9599,
"step": 1488
},
{
"completion_length": 251.125,
"epoch": 0.0014897448724362182,
"grad_norm": 98.78363037109375,
"learning_rate": 3e-06,
"loss": 2.7972,
"reward": 0.25284092128276825,
"reward_std": 0.1391737014055252,
"rewards/sudoku_reward_func": 0.25284091383218765,
"step": 1489,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001490745372686343,
"grad_norm": 85.65895080566406,
"learning_rate": 3e-06,
"loss": 1.2383,
"step": 1490
},
{
"epoch": 0.0014917458729364682,
"grad_norm": 112.22046661376953,
"learning_rate": 3e-06,
"loss": 5.8015,
"step": 1491
},
{
"epoch": 0.0014927463731865933,
"grad_norm": 67.51846313476562,
"learning_rate": 3e-06,
"loss": 0.434,
"step": 1492
},
{
"epoch": 0.0014937468734367184,
"grad_norm": 83.65746307373047,
"learning_rate": 3e-06,
"loss": 2.4033,
"step": 1493
},
{
"epoch": 0.0014947473736868435,
"grad_norm": 94.6855697631836,
"learning_rate": 3e-06,
"loss": -0.0726,
"step": 1494
},
{
"epoch": 0.0014957478739369685,
"grad_norm": 76.3798599243164,
"learning_rate": 3e-06,
"loss": 5.4418,
"step": 1495
},
{
"epoch": 0.0014967483741870936,
"grad_norm": 65.88536071777344,
"learning_rate": 3e-06,
"loss": -0.3561,
"step": 1496
},
{
"completion_length": 247.64583587646484,
"epoch": 0.0014977488744372185,
"grad_norm": 81.58489227294922,
"learning_rate": 3e-06,
"loss": -0.9713,
"reward": 0.19385448843240738,
"reward_std": 0.13647598028182983,
"rewards/sudoku_reward_func": 0.19385448098182678,
"step": 1497,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0014987493746873436,
"grad_norm": 89.6312026977539,
"learning_rate": 3e-06,
"loss": -6.1808,
"step": 1498
},
{
"epoch": 0.0014997498749374687,
"grad_norm": 100.32272338867188,
"learning_rate": 3e-06,
"loss": -2.441,
"step": 1499
},
{
"epoch": 0.0015007503751875938,
"grad_norm": 68.68706512451172,
"learning_rate": 3e-06,
"loss": -3.726,
"step": 1500
},
{
"epoch": 0.0015017508754377189,
"grad_norm": 76.70758819580078,
"learning_rate": 3e-06,
"loss": -1.8507,
"step": 1501
},
{
"epoch": 0.001502751375687844,
"grad_norm": 64.95220184326172,
"learning_rate": 3e-06,
"loss": -7.1908,
"step": 1502
},
{
"epoch": 0.001503751875937969,
"grad_norm": 113.56958770751953,
"learning_rate": 3e-06,
"loss": -4.5099,
"step": 1503
},
{
"epoch": 0.001504752376188094,
"grad_norm": 61.816226959228516,
"learning_rate": 3e-06,
"loss": -4.6618,
"step": 1504
},
{
"completion_length": 249.0625,
"epoch": 0.001505752876438219,
"grad_norm": 84.65864562988281,
"learning_rate": 3e-06,
"loss": -3.4845,
"reward": 0.21722058206796646,
"reward_std": 0.14737464487552643,
"rewards/sudoku_reward_func": 0.21722057461738586,
"step": 1505,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015067533766883441,
"grad_norm": 114.67820739746094,
"learning_rate": 3e-06,
"loss": 1.9507,
"step": 1506
},
{
"epoch": 0.0015077538769384692,
"grad_norm": 97.06100463867188,
"learning_rate": 3e-06,
"loss": 0.9134,
"step": 1507
},
{
"epoch": 0.0015087543771885943,
"grad_norm": 85.14713287353516,
"learning_rate": 3e-06,
"loss": -0.671,
"step": 1508
},
{
"epoch": 0.0015097548774387194,
"grad_norm": 75.44242095947266,
"learning_rate": 3e-06,
"loss": -4.8392,
"step": 1509
},
{
"epoch": 0.0015107553776888445,
"grad_norm": 98.38030242919922,
"learning_rate": 3e-06,
"loss": 1.2949,
"step": 1510
},
{
"epoch": 0.0015117558779389694,
"grad_norm": 67.19142150878906,
"learning_rate": 3e-06,
"loss": 0.4119,
"step": 1511
},
{
"epoch": 0.0015127563781890945,
"grad_norm": 112.94633483886719,
"learning_rate": 3e-06,
"loss": -1.4086,
"step": 1512
},
{
"completion_length": 250.89583587646484,
"epoch": 0.0015137568784392196,
"grad_norm": 57.52448272705078,
"learning_rate": 3e-06,
"loss": -5.1907,
"reward": 0.23578043282032013,
"reward_std": 0.14374738186597824,
"rewards/sudoku_reward_func": 0.23578043282032013,
"step": 1513,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015147573786893447,
"grad_norm": 65.3283920288086,
"learning_rate": 3e-06,
"loss": -5.2681,
"step": 1514
},
{
"epoch": 0.0015157578789394698,
"grad_norm": 125.39384460449219,
"learning_rate": 3e-06,
"loss": -6.6646,
"step": 1515
},
{
"epoch": 0.0015167583791895949,
"grad_norm": 120.92166900634766,
"learning_rate": 3e-06,
"loss": -6.0842,
"step": 1516
},
{
"epoch": 0.00151775887943972,
"grad_norm": 64.44194793701172,
"learning_rate": 3e-06,
"loss": -5.4585,
"step": 1517
},
{
"epoch": 0.0015187593796898448,
"grad_norm": 68.04533386230469,
"learning_rate": 3e-06,
"loss": -5.6854,
"step": 1518
},
{
"epoch": 0.00151975987993997,
"grad_norm": 104.33889770507812,
"learning_rate": 3e-06,
"loss": -6.8329,
"step": 1519
},
{
"epoch": 0.001520760380190095,
"grad_norm": 100.65160369873047,
"learning_rate": 3e-06,
"loss": -6.3702,
"step": 1520
},
{
"completion_length": 249.50000762939453,
"epoch": 0.0015217608804402201,
"grad_norm": 225.0299530029297,
"learning_rate": 3e-06,
"loss": 2.0878,
"reward": 0.293744757771492,
"reward_std": 0.19449011981487274,
"rewards/sudoku_reward_func": 0.2937447428703308,
"step": 1521,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015227613806903452,
"grad_norm": 176.6975555419922,
"learning_rate": 3e-06,
"loss": -2.4676,
"step": 1522
},
{
"epoch": 0.0015237618809404703,
"grad_norm": 84.19757080078125,
"learning_rate": 3e-06,
"loss": -1.3043,
"step": 1523
},
{
"epoch": 0.0015247623811905954,
"grad_norm": 86.49832153320312,
"learning_rate": 3e-06,
"loss": -3.4045,
"step": 1524
},
{
"epoch": 0.0015257628814407203,
"grad_norm": 169.01022338867188,
"learning_rate": 3e-06,
"loss": -0.4526,
"step": 1525
},
{
"epoch": 0.0015267633816908454,
"grad_norm": 152.21177673339844,
"learning_rate": 3e-06,
"loss": -4.8907,
"step": 1526
},
{
"epoch": 0.0015277638819409705,
"grad_norm": 86.3626708984375,
"learning_rate": 3e-06,
"loss": -2.5317,
"step": 1527
},
{
"epoch": 0.0015287643821910956,
"grad_norm": 68.27998352050781,
"learning_rate": 3e-06,
"loss": -4.5565,
"step": 1528
},
{
"completion_length": 249.5416717529297,
"epoch": 0.0015297648824412207,
"grad_norm": 123.3398208618164,
"learning_rate": 3e-06,
"loss": -6.8746,
"reward": 0.22935831546783447,
"reward_std": 0.1520671397447586,
"rewards/sudoku_reward_func": 0.22935831546783447,
"step": 1529,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015307653826913457,
"grad_norm": 69.14616394042969,
"learning_rate": 3e-06,
"loss": -4.4294,
"step": 1530
},
{
"epoch": 0.0015317658829414708,
"grad_norm": 90.8684310913086,
"learning_rate": 3e-06,
"loss": -2.9881,
"step": 1531
},
{
"epoch": 0.0015327663831915957,
"grad_norm": 72.3403091430664,
"learning_rate": 3e-06,
"loss": -4.3715,
"step": 1532
},
{
"epoch": 0.0015337668834417208,
"grad_norm": 135.02798461914062,
"learning_rate": 3e-06,
"loss": -5.9151,
"step": 1533
},
{
"epoch": 0.001534767383691846,
"grad_norm": 85.45219421386719,
"learning_rate": 3e-06,
"loss": -4.3543,
"step": 1534
},
{
"epoch": 0.001535767883941971,
"grad_norm": 58.03328323364258,
"learning_rate": 3e-06,
"loss": -3.612,
"step": 1535
},
{
"epoch": 0.001536768384192096,
"grad_norm": 56.990333557128906,
"learning_rate": 3e-06,
"loss": -5.0622,
"step": 1536
},
{
"completion_length": 254.0,
"epoch": 0.0015377688844422212,
"grad_norm": 81.3235092163086,
"learning_rate": 3e-06,
"loss": -10.1589,
"reward": 0.29837438464164734,
"reward_std": 0.15097320824861526,
"rewards/sudoku_reward_func": 0.29837436974048615,
"step": 1537,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015387693846923463,
"grad_norm": 106.20044708251953,
"learning_rate": 3e-06,
"loss": -13.3899,
"step": 1538
},
{
"epoch": 0.0015397698849424712,
"grad_norm": 74.74054718017578,
"learning_rate": 3e-06,
"loss": -9.4645,
"step": 1539
},
{
"epoch": 0.0015407703851925963,
"grad_norm": 141.25428771972656,
"learning_rate": 3e-06,
"loss": -12.3875,
"step": 1540
},
{
"epoch": 0.0015417708854427213,
"grad_norm": 89.22872924804688,
"learning_rate": 3e-06,
"loss": -10.428,
"step": 1541
},
{
"epoch": 0.0015427713856928464,
"grad_norm": 90.24919128417969,
"learning_rate": 3e-06,
"loss": -14.6637,
"step": 1542
},
{
"epoch": 0.0015437718859429715,
"grad_norm": 89.6397933959961,
"learning_rate": 3e-06,
"loss": -10.8819,
"step": 1543
},
{
"epoch": 0.0015447723861930966,
"grad_norm": 140.13125610351562,
"learning_rate": 3e-06,
"loss": -14.8474,
"step": 1544
},
{
"completion_length": 254.2291717529297,
"epoch": 0.0015457728864432215,
"grad_norm": 83.32096862792969,
"learning_rate": 3e-06,
"loss": 2.6586,
"reward": 0.19969412684440613,
"reward_std": 0.1379682719707489,
"rewards/sudoku_reward_func": 0.19969411194324493,
"step": 1545,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015467733866933466,
"grad_norm": 83.81671905517578,
"learning_rate": 3e-06,
"loss": 3.664,
"step": 1546
},
{
"epoch": 0.0015477738869434717,
"grad_norm": 58.57638931274414,
"learning_rate": 3e-06,
"loss": 4.2643,
"step": 1547
},
{
"epoch": 0.0015487743871935968,
"grad_norm": 92.9087905883789,
"learning_rate": 3e-06,
"loss": 1.3365,
"step": 1548
},
{
"epoch": 0.0015497748874437219,
"grad_norm": 88.11719512939453,
"learning_rate": 3e-06,
"loss": -0.389,
"step": 1549
},
{
"epoch": 0.001550775387693847,
"grad_norm": 113.19729614257812,
"learning_rate": 3e-06,
"loss": 1.6035,
"step": 1550
},
{
"epoch": 0.001551775887943972,
"grad_norm": 81.20172882080078,
"learning_rate": 3e-06,
"loss": 4.2581,
"step": 1551
},
{
"epoch": 0.001552776388194097,
"grad_norm": 96.5271987915039,
"learning_rate": 3e-06,
"loss": -0.6346,
"step": 1552
},
{
"completion_length": 256.0,
"epoch": 0.001553776888444222,
"grad_norm": 142.41549682617188,
"learning_rate": 3e-06,
"loss": -6.5003,
"reward": 0.23867394775152206,
"reward_std": 0.1645166575908661,
"rewards/sudoku_reward_func": 0.23867394775152206,
"step": 1553,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015547773886943471,
"grad_norm": 192.58761596679688,
"learning_rate": 3e-06,
"loss": -4.5212,
"step": 1554
},
{
"epoch": 0.0015557778889444722,
"grad_norm": 108.9868392944336,
"learning_rate": 3e-06,
"loss": -9.6799,
"step": 1555
},
{
"epoch": 0.0015567783891945973,
"grad_norm": 92.08625030517578,
"learning_rate": 3e-06,
"loss": -3.8365,
"step": 1556
},
{
"epoch": 0.0015577788894447224,
"grad_norm": 143.28524780273438,
"learning_rate": 3e-06,
"loss": -6.7169,
"step": 1557
},
{
"epoch": 0.0015587793896948475,
"grad_norm": 96.56403350830078,
"learning_rate": 3e-06,
"loss": -5.0673,
"step": 1558
},
{
"epoch": 0.0015597798899449724,
"grad_norm": 109.43830871582031,
"learning_rate": 3e-06,
"loss": -10.2858,
"step": 1559
},
{
"epoch": 0.0015607803901950975,
"grad_norm": 88.09711456298828,
"learning_rate": 3e-06,
"loss": -4.9645,
"step": 1560
},
{
"completion_length": 253.18750762939453,
"epoch": 0.0015617808904452226,
"grad_norm": 82.52039337158203,
"learning_rate": 3e-06,
"loss": -4.6242,
"reward": 0.2604166865348816,
"reward_std": 0.13141649216413498,
"rewards/sudoku_reward_func": 0.2604166716337204,
"step": 1561,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015627813906953477,
"grad_norm": 62.24523162841797,
"learning_rate": 3e-06,
"loss": -6.4732,
"step": 1562
},
{
"epoch": 0.0015637818909454728,
"grad_norm": 97.1363525390625,
"learning_rate": 3e-06,
"loss": -4.7735,
"step": 1563
},
{
"epoch": 0.0015647823911955979,
"grad_norm": 92.79659271240234,
"learning_rate": 3e-06,
"loss": -5.7465,
"step": 1564
},
{
"epoch": 0.001565782891445723,
"grad_norm": 75.58289337158203,
"learning_rate": 3e-06,
"loss": -5.6129,
"step": 1565
},
{
"epoch": 0.0015667833916958478,
"grad_norm": 79.69831085205078,
"learning_rate": 3e-06,
"loss": -6.9765,
"step": 1566
},
{
"epoch": 0.001567783891945973,
"grad_norm": 99.76036834716797,
"learning_rate": 3e-06,
"loss": -5.4218,
"step": 1567
},
{
"epoch": 0.001568784392196098,
"grad_norm": 73.5775146484375,
"learning_rate": 3e-06,
"loss": -6.1781,
"step": 1568
},
{
"completion_length": 255.9791717529297,
"epoch": 0.001569784892446223,
"grad_norm": 69.83203125,
"learning_rate": 3e-06,
"loss": -3.86,
"reward": 0.2596275433897972,
"reward_std": 0.1274852231144905,
"rewards/sudoku_reward_func": 0.259627528488636,
"step": 1569,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015707853926963482,
"grad_norm": 84.85794067382812,
"learning_rate": 3e-06,
"loss": -1.9833,
"step": 1570
},
{
"epoch": 0.0015717858929464733,
"grad_norm": 74.86754608154297,
"learning_rate": 3e-06,
"loss": -7.1057,
"step": 1571
},
{
"epoch": 0.0015727863931965984,
"grad_norm": 64.1947250366211,
"learning_rate": 3e-06,
"loss": -2.2024,
"step": 1572
},
{
"epoch": 0.0015737868934467233,
"grad_norm": 63.00054168701172,
"learning_rate": 3e-06,
"loss": -4.747,
"step": 1573
},
{
"epoch": 0.0015747873936968484,
"grad_norm": 89.75048065185547,
"learning_rate": 3e-06,
"loss": -2.3625,
"step": 1574
},
{
"epoch": 0.0015757878939469735,
"grad_norm": 82.22975158691406,
"learning_rate": 3e-06,
"loss": -8.4312,
"step": 1575
},
{
"epoch": 0.0015767883941970985,
"grad_norm": 65.96247863769531,
"learning_rate": 3e-06,
"loss": -2.6335,
"step": 1576
},
{
"completion_length": 252.08333587646484,
"epoch": 0.0015777888944472236,
"grad_norm": 116.85649108886719,
"learning_rate": 3e-06,
"loss": -3.3996,
"reward": 0.19888994842767715,
"reward_std": 0.13466180860996246,
"rewards/sudoku_reward_func": 0.19888994097709656,
"step": 1577,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015787893946973487,
"grad_norm": 73.25578308105469,
"learning_rate": 3e-06,
"loss": -3.9517,
"step": 1578
},
{
"epoch": 0.0015797898949474738,
"grad_norm": 82.38320922851562,
"learning_rate": 3e-06,
"loss": 0.7611,
"step": 1579
},
{
"epoch": 0.0015807903951975987,
"grad_norm": 77.25627899169922,
"learning_rate": 3e-06,
"loss": -2.21,
"step": 1580
},
{
"epoch": 0.0015817908954477238,
"grad_norm": 82.29784393310547,
"learning_rate": 3e-06,
"loss": -4.6331,
"step": 1581
},
{
"epoch": 0.001582791395697849,
"grad_norm": 75.16769409179688,
"learning_rate": 3e-06,
"loss": -4.2407,
"step": 1582
},
{
"epoch": 0.001583791895947974,
"grad_norm": 71.37103271484375,
"learning_rate": 3e-06,
"loss": 0.0222,
"step": 1583
},
{
"epoch": 0.001584792396198099,
"grad_norm": 87.73910522460938,
"learning_rate": 3e-06,
"loss": -2.4734,
"step": 1584
},
{
"completion_length": 251.27084350585938,
"epoch": 0.0015857928964482242,
"grad_norm": 102.77171325683594,
"learning_rate": 3e-06,
"loss": -0.3869,
"reward": 0.24416036158800125,
"reward_std": 0.12802283093333244,
"rewards/sudoku_reward_func": 0.24416035413742065,
"step": 1585,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015867933966983493,
"grad_norm": 111.1654052734375,
"learning_rate": 3e-06,
"loss": -1.6993,
"step": 1586
},
{
"epoch": 0.0015877938969484741,
"grad_norm": 98.75373077392578,
"learning_rate": 3e-06,
"loss": -1.3497,
"step": 1587
},
{
"epoch": 0.0015887943971985992,
"grad_norm": 123.21533203125,
"learning_rate": 3e-06,
"loss": -0.2909,
"step": 1588
},
{
"epoch": 0.0015897948974487243,
"grad_norm": 115.61258697509766,
"learning_rate": 3e-06,
"loss": -1.0612,
"step": 1589
},
{
"epoch": 0.0015907953976988494,
"grad_norm": 103.52163696289062,
"learning_rate": 3e-06,
"loss": -2.3525,
"step": 1590
},
{
"epoch": 0.0015917958979489745,
"grad_norm": 119.00145721435547,
"learning_rate": 3e-06,
"loss": -1.7235,
"step": 1591
},
{
"epoch": 0.0015927963981990996,
"grad_norm": 105.48393249511719,
"learning_rate": 3e-06,
"loss": -1.6225,
"step": 1592
},
{
"completion_length": 253.7916717529297,
"epoch": 0.0015937968984492247,
"grad_norm": 125.09081268310547,
"learning_rate": 3e-06,
"loss": 5.2883,
"reward": 0.250248022377491,
"reward_std": 0.12681767344474792,
"rewards/sudoku_reward_func": 0.250248022377491,
"step": 1593,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0015947973986993496,
"grad_norm": 125.6259536743164,
"learning_rate": 3e-06,
"loss": -1.9275,
"step": 1594
},
{
"epoch": 0.0015957978989494747,
"grad_norm": 103.04039764404297,
"learning_rate": 3e-06,
"loss": -5.8767,
"step": 1595
},
{
"epoch": 0.0015967983991995998,
"grad_norm": 123.74522399902344,
"learning_rate": 3e-06,
"loss": -1.9645,
"step": 1596
},
{
"epoch": 0.0015977988994497249,
"grad_norm": 121.4541015625,
"learning_rate": 3e-06,
"loss": 5.1423,
"step": 1597
},
{
"epoch": 0.00159879939969985,
"grad_norm": 133.47821044921875,
"learning_rate": 3e-06,
"loss": -1.9671,
"step": 1598
},
{
"epoch": 0.001599799899949975,
"grad_norm": 105.77659606933594,
"learning_rate": 3e-06,
"loss": -6.2592,
"step": 1599
},
{
"epoch": 0.0016008004002001002,
"grad_norm": 93.46845245361328,
"learning_rate": 3e-06,
"loss": -2.6713,
"step": 1600
},
{
"completion_length": 254.5416717529297,
"epoch": 0.001601800900450225,
"grad_norm": 136.07968139648438,
"learning_rate": 3e-06,
"loss": -7.7159,
"reward": 0.2539908140897751,
"reward_std": 0.1405995786190033,
"rewards/sudoku_reward_func": 0.2539908140897751,
"step": 1601,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016028014007003501,
"grad_norm": 160.49053955078125,
"learning_rate": 3e-06,
"loss": -6.9086,
"step": 1602
},
{
"epoch": 0.0016038019009504752,
"grad_norm": 117.0043716430664,
"learning_rate": 3e-06,
"loss": -2.7393,
"step": 1603
},
{
"epoch": 0.0016048024012006003,
"grad_norm": 105.5345687866211,
"learning_rate": 3e-06,
"loss": -6.4129,
"step": 1604
},
{
"epoch": 0.0016058029014507254,
"grad_norm": 108.58619689941406,
"learning_rate": 3e-06,
"loss": -8.0772,
"step": 1605
},
{
"epoch": 0.0016068034017008505,
"grad_norm": 168.36338806152344,
"learning_rate": 3e-06,
"loss": -9.2435,
"step": 1606
},
{
"epoch": 0.0016078039019509756,
"grad_norm": 124.85694122314453,
"learning_rate": 3e-06,
"loss": -5.0364,
"step": 1607
},
{
"epoch": 0.0016088044022011005,
"grad_norm": 91.1465835571289,
"learning_rate": 3e-06,
"loss": -7.5496,
"step": 1608
},
{
"completion_length": 250.6041717529297,
"epoch": 0.0016098049024512256,
"grad_norm": 158.19471740722656,
"learning_rate": 3e-06,
"loss": -10.0427,
"reward": 0.22151951491832733,
"reward_std": 0.14843863993883133,
"rewards/sudoku_reward_func": 0.22151951491832733,
"step": 1609,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016108054027013507,
"grad_norm": 140.9024658203125,
"learning_rate": 3e-06,
"loss": -9.1608,
"step": 1610
},
{
"epoch": 0.0016118059029514758,
"grad_norm": 106.24528503417969,
"learning_rate": 3e-06,
"loss": -15.0477,
"step": 1611
},
{
"epoch": 0.0016128064032016008,
"grad_norm": 131.50599670410156,
"learning_rate": 3e-06,
"loss": -13.0933,
"step": 1612
},
{
"epoch": 0.001613806903451726,
"grad_norm": 134.1167755126953,
"learning_rate": 3e-06,
"loss": -10.9769,
"step": 1613
},
{
"epoch": 0.001614807403701851,
"grad_norm": 134.45484924316406,
"learning_rate": 3e-06,
"loss": -10.1419,
"step": 1614
},
{
"epoch": 0.001615807903951976,
"grad_norm": 128.87808227539062,
"learning_rate": 3e-06,
"loss": -14.6942,
"step": 1615
},
{
"epoch": 0.001616808404202101,
"grad_norm": 103.39990997314453,
"learning_rate": 3e-06,
"loss": -14.5943,
"step": 1616
},
{
"completion_length": 253.6875,
"epoch": 0.001617808904452226,
"grad_norm": 117.34486389160156,
"learning_rate": 3e-06,
"loss": 2.1868,
"reward": 0.23395414650440216,
"reward_std": 0.1375764161348343,
"rewards/sudoku_reward_func": 0.23395412415266037,
"step": 1617,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016188094047023512,
"grad_norm": 114.16409301757812,
"learning_rate": 3e-06,
"loss": -3.7694,
"step": 1618
},
{
"epoch": 0.0016198099049524763,
"grad_norm": 145.38967895507812,
"learning_rate": 3e-06,
"loss": -2.0291,
"step": 1619
},
{
"epoch": 0.0016208104052026014,
"grad_norm": 120.34522247314453,
"learning_rate": 3e-06,
"loss": -4.8787,
"step": 1620
},
{
"epoch": 0.0016218109054527263,
"grad_norm": 120.42835235595703,
"learning_rate": 3e-06,
"loss": 2.1333,
"step": 1621
},
{
"epoch": 0.0016228114057028514,
"grad_norm": 104.11591339111328,
"learning_rate": 3e-06,
"loss": -4.6544,
"step": 1622
},
{
"epoch": 0.0016238119059529764,
"grad_norm": 143.9630889892578,
"learning_rate": 3e-06,
"loss": -2.2529,
"step": 1623
},
{
"epoch": 0.0016248124062031015,
"grad_norm": 139.08253479003906,
"learning_rate": 3e-06,
"loss": -5.4911,
"step": 1624
},
{
"completion_length": 248.95833587646484,
"epoch": 0.0016258129064532266,
"grad_norm": 106.52497863769531,
"learning_rate": 3e-06,
"loss": -4.0487,
"reward": 0.2699652835726738,
"reward_std": 0.13865990936756134,
"rewards/sudoku_reward_func": 0.2699652835726738,
"step": 1625,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016268134067033517,
"grad_norm": 205.29147338867188,
"learning_rate": 3e-06,
"loss": -6.9653,
"step": 1626
},
{
"epoch": 0.0016278139069534768,
"grad_norm": 154.57858276367188,
"learning_rate": 3e-06,
"loss": 1.8228,
"step": 1627
},
{
"epoch": 0.0016288144072036017,
"grad_norm": 107.60708618164062,
"learning_rate": 3e-06,
"loss": 0.0389,
"step": 1628
},
{
"epoch": 0.0016298149074537268,
"grad_norm": 109.25032043457031,
"learning_rate": 3e-06,
"loss": -4.4817,
"step": 1629
},
{
"epoch": 0.0016308154077038519,
"grad_norm": 142.53073120117188,
"learning_rate": 3e-06,
"loss": -7.5816,
"step": 1630
},
{
"epoch": 0.001631815907953977,
"grad_norm": 119.20352935791016,
"learning_rate": 3e-06,
"loss": 1.6034,
"step": 1631
},
{
"epoch": 0.001632816408204102,
"grad_norm": 153.0146484375,
"learning_rate": 3e-06,
"loss": -1.5781,
"step": 1632
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0016338169084542272,
"grad_norm": 196.0118408203125,
"learning_rate": 3e-06,
"loss": 5.2555,
"reward": 0.21412037312984467,
"reward_std": 0.16516636312007904,
"rewards/sudoku_reward_func": 0.21412037312984467,
"step": 1633,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016348174087043523,
"grad_norm": 120.1426773071289,
"learning_rate": 3e-06,
"loss": 5.5743,
"step": 1634
},
{
"epoch": 0.0016358179089544771,
"grad_norm": 113.17626190185547,
"learning_rate": 3e-06,
"loss": 2.7044,
"step": 1635
},
{
"epoch": 0.0016368184092046022,
"grad_norm": 157.77957153320312,
"learning_rate": 3e-06,
"loss": 3.0029,
"step": 1636
},
{
"epoch": 0.0016378189094547273,
"grad_norm": 142.81764221191406,
"learning_rate": 3e-06,
"loss": 3.6418,
"step": 1637
},
{
"epoch": 0.0016388194097048524,
"grad_norm": 151.52130126953125,
"learning_rate": 3e-06,
"loss": 3.9227,
"step": 1638
},
{
"epoch": 0.0016398199099549775,
"grad_norm": 134.97328186035156,
"learning_rate": 3e-06,
"loss": 1.3015,
"step": 1639
},
{
"epoch": 0.0016408204102051026,
"grad_norm": 128.61497497558594,
"learning_rate": 3e-06,
"loss": 2.455,
"step": 1640
},
{
"completion_length": 251.1041717529297,
"epoch": 0.0016418209104552277,
"grad_norm": 131.1600799560547,
"learning_rate": 3e-06,
"loss": 1.0932,
"reward": 0.2405754178762436,
"reward_std": 0.13715411722660065,
"rewards/sudoku_reward_func": 0.2405753955245018,
"step": 1641,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016428214107053526,
"grad_norm": 143.12283325195312,
"learning_rate": 3e-06,
"loss": 4.3751,
"step": 1642
},
{
"epoch": 0.0016438219109554777,
"grad_norm": 156.13925170898438,
"learning_rate": 3e-06,
"loss": 1.8976,
"step": 1643
},
{
"epoch": 0.0016448224112056028,
"grad_norm": 104.6756591796875,
"learning_rate": 3e-06,
"loss": 6.0342,
"step": 1644
},
{
"epoch": 0.0016458229114557279,
"grad_norm": 125.25997924804688,
"learning_rate": 3e-06,
"loss": -0.2949,
"step": 1645
},
{
"epoch": 0.001646823411705853,
"grad_norm": 139.3275146484375,
"learning_rate": 3e-06,
"loss": 2.4151,
"step": 1646
},
{
"epoch": 0.001647823911955978,
"grad_norm": 87.91356658935547,
"learning_rate": 3e-06,
"loss": 0.6755,
"step": 1647
},
{
"epoch": 0.0016488244122061031,
"grad_norm": 156.1615447998047,
"learning_rate": 3e-06,
"loss": 3.1565,
"step": 1648
},
{
"completion_length": 253.87500762939453,
"epoch": 0.001649824912456228,
"grad_norm": 133.72607421875,
"learning_rate": 3e-06,
"loss": -0.155,
"reward": 0.1815100461244583,
"reward_std": 0.16103161871433258,
"rewards/sudoku_reward_func": 0.1815100461244583,
"step": 1649,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016508254127063531,
"grad_norm": 283.2836608886719,
"learning_rate": 3e-06,
"loss": -1.3618,
"step": 1650
},
{
"epoch": 0.0016518259129564782,
"grad_norm": 167.18289184570312,
"learning_rate": 3e-06,
"loss": -7.732,
"step": 1651
},
{
"epoch": 0.0016528264132066033,
"grad_norm": 124.50619506835938,
"learning_rate": 3e-06,
"loss": -7.0091,
"step": 1652
},
{
"epoch": 0.0016538269134567284,
"grad_norm": 177.51052856445312,
"learning_rate": 3e-06,
"loss": -0.1884,
"step": 1653
},
{
"epoch": 0.0016548274137068535,
"grad_norm": 92.6873779296875,
"learning_rate": 3e-06,
"loss": -3.7635,
"step": 1654
},
{
"epoch": 0.0016558279139569786,
"grad_norm": 99.19071197509766,
"learning_rate": 3e-06,
"loss": -7.8231,
"step": 1655
},
{
"epoch": 0.0016568284142071035,
"grad_norm": 145.24159240722656,
"learning_rate": 3e-06,
"loss": -6.9523,
"step": 1656
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0016578289144572286,
"grad_norm": 149.080322265625,
"learning_rate": 3e-06,
"loss": -6.2626,
"reward": 0.24648644030094147,
"reward_std": 0.16061823815107346,
"rewards/sudoku_reward_func": 0.24648643285036087,
"step": 1657,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016588294147073536,
"grad_norm": 109.03119659423828,
"learning_rate": 3e-06,
"loss": -1.8432,
"step": 1658
},
{
"epoch": 0.0016598299149574787,
"grad_norm": 133.4971923828125,
"learning_rate": 3e-06,
"loss": -3.2182,
"step": 1659
},
{
"epoch": 0.0016608304152076038,
"grad_norm": 101.81431579589844,
"learning_rate": 3e-06,
"loss": -0.4809,
"step": 1660
},
{
"epoch": 0.001661830915457729,
"grad_norm": 148.29449462890625,
"learning_rate": 3e-06,
"loss": -7.8211,
"step": 1661
},
{
"epoch": 0.001662831415707854,
"grad_norm": 125.23590850830078,
"learning_rate": 3e-06,
"loss": -3.2888,
"step": 1662
},
{
"epoch": 0.001663831915957979,
"grad_norm": 142.0076904296875,
"learning_rate": 3e-06,
"loss": -3.8196,
"step": 1663
},
{
"epoch": 0.001664832416208104,
"grad_norm": 107.60643005371094,
"learning_rate": 3e-06,
"loss": -1.6867,
"step": 1664
},
{
"completion_length": 254.7291717529297,
"epoch": 0.001665832916458229,
"grad_norm": 227.44874572753906,
"learning_rate": 3e-06,
"loss": -9.3651,
"reward": 0.2640542536973953,
"reward_std": 0.17234958708286285,
"rewards/sudoku_reward_func": 0.26405423879623413,
"step": 1665,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016668334167083542,
"grad_norm": 145.90663146972656,
"learning_rate": 3e-06,
"loss": -6.723,
"step": 1666
},
{
"epoch": 0.0016678339169584793,
"grad_norm": 205.7703399658203,
"learning_rate": 3e-06,
"loss": -11.4297,
"step": 1667
},
{
"epoch": 0.0016688344172086044,
"grad_norm": 128.5513153076172,
"learning_rate": 3e-06,
"loss": -12.0292,
"step": 1668
},
{
"epoch": 0.0016698349174587295,
"grad_norm": 217.45556640625,
"learning_rate": 3e-06,
"loss": -8.7485,
"step": 1669
},
{
"epoch": 0.0016708354177088543,
"grad_norm": 123.4818115234375,
"learning_rate": 3e-06,
"loss": -8.3128,
"step": 1670
},
{
"epoch": 0.0016718359179589794,
"grad_norm": 174.58563232421875,
"learning_rate": 3e-06,
"loss": -11.4492,
"step": 1671
},
{
"epoch": 0.0016728364182091045,
"grad_norm": 139.7147674560547,
"learning_rate": 3e-06,
"loss": -12.6301,
"step": 1672
},
{
"completion_length": 251.83334350585938,
"epoch": 0.0016738369184592296,
"grad_norm": 119.95178985595703,
"learning_rate": 3e-06,
"loss": 1.2102,
"reward": 0.2717013955116272,
"reward_std": 0.16230863332748413,
"rewards/sudoku_reward_func": 0.2717013955116272,
"step": 1673,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016748374187093547,
"grad_norm": 133.8927459716797,
"learning_rate": 3e-06,
"loss": -2.6016,
"step": 1674
},
{
"epoch": 0.0016758379189594798,
"grad_norm": 154.2207794189453,
"learning_rate": 3e-06,
"loss": 1.9318,
"step": 1675
},
{
"epoch": 0.001676838419209605,
"grad_norm": 114.0203857421875,
"learning_rate": 3e-06,
"loss": -1.2058,
"step": 1676
},
{
"epoch": 0.0016778389194597298,
"grad_norm": 122.6033706665039,
"learning_rate": 3e-06,
"loss": 0.6539,
"step": 1677
},
{
"epoch": 0.0016788394197098549,
"grad_norm": 127.75208282470703,
"learning_rate": 3e-06,
"loss": -3.674,
"step": 1678
},
{
"epoch": 0.00167983991995998,
"grad_norm": 174.46522521972656,
"learning_rate": 3e-06,
"loss": 0.9551,
"step": 1679
},
{
"epoch": 0.001680840420210105,
"grad_norm": 138.7799835205078,
"learning_rate": 3e-06,
"loss": -1.7151,
"step": 1680
},
{
"completion_length": 254.6666717529297,
"epoch": 0.0016818409204602302,
"grad_norm": 151.94314575195312,
"learning_rate": 3e-06,
"loss": 6.1425,
"reward": 0.22201555222272873,
"reward_std": 0.13886961340904236,
"rewards/sudoku_reward_func": 0.22201555222272873,
"step": 1681,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016828414207103552,
"grad_norm": 143.5550994873047,
"learning_rate": 3e-06,
"loss": 4.1679,
"step": 1682
},
{
"epoch": 0.0016838419209604803,
"grad_norm": 140.15504455566406,
"learning_rate": 3e-06,
"loss": 4.78,
"step": 1683
},
{
"epoch": 0.0016848424212106052,
"grad_norm": 191.37742614746094,
"learning_rate": 3e-06,
"loss": 9.2405,
"step": 1684
},
{
"epoch": 0.0016858429214607303,
"grad_norm": 178.3914337158203,
"learning_rate": 3e-06,
"loss": 2.8219,
"step": 1685
},
{
"epoch": 0.0016868434217108554,
"grad_norm": 74.8600082397461,
"learning_rate": 3e-06,
"loss": 2.2428,
"step": 1686
},
{
"epoch": 0.0016878439219609805,
"grad_norm": 122.69676971435547,
"learning_rate": 3e-06,
"loss": 2.1578,
"step": 1687
},
{
"epoch": 0.0016888444222111056,
"grad_norm": 189.11329650878906,
"learning_rate": 3e-06,
"loss": 5.1728,
"step": 1688
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0016898449224612307,
"grad_norm": 102.0486068725586,
"learning_rate": 3e-06,
"loss": 1.2396,
"reward": 0.24384094774723053,
"reward_std": 0.13583382219076157,
"rewards/sudoku_reward_func": 0.24384094774723053,
"step": 1689,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0016908454227113558,
"grad_norm": 65.69036102294922,
"learning_rate": 3e-06,
"loss": 1.7486,
"step": 1690
},
{
"epoch": 0.0016918459229614807,
"grad_norm": 91.4066162109375,
"learning_rate": 3e-06,
"loss": 1.4402,
"step": 1691
},
{
"epoch": 0.0016928464232116058,
"grad_norm": 89.79181671142578,
"learning_rate": 3e-06,
"loss": 0.494,
"step": 1692
},
{
"epoch": 0.0016938469234617308,
"grad_norm": 75.11154174804688,
"learning_rate": 3e-06,
"loss": 0.0127,
"step": 1693
},
{
"epoch": 0.001694847423711856,
"grad_norm": 60.495384216308594,
"learning_rate": 3e-06,
"loss": 1.491,
"step": 1694
},
{
"epoch": 0.001695847923961981,
"grad_norm": 79.2197036743164,
"learning_rate": 3e-06,
"loss": 1.074,
"step": 1695
},
{
"epoch": 0.0016968484242121061,
"grad_norm": 105.94502258300781,
"learning_rate": 3e-06,
"loss": -0.5889,
"step": 1696
},
{
"completion_length": 255.8125,
"epoch": 0.001697848924462231,
"grad_norm": 118.99041748046875,
"learning_rate": 3e-06,
"loss": -8.0986,
"reward": 0.25305887311697006,
"reward_std": 0.1613667756319046,
"rewards/sudoku_reward_func": 0.25305885821580887,
"step": 1697,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001698849424712356,
"grad_norm": 81.08747863769531,
"learning_rate": 3e-06,
"loss": -9.7944,
"step": 1698
},
{
"epoch": 0.0016998499249624812,
"grad_norm": 76.02010345458984,
"learning_rate": 3e-06,
"loss": -8.2251,
"step": 1699
},
{
"epoch": 0.0017008504252126063,
"grad_norm": 89.18621063232422,
"learning_rate": 3e-06,
"loss": -9.8214,
"step": 1700
},
{
"epoch": 0.0017018509254627314,
"grad_norm": 112.59184265136719,
"learning_rate": 3e-06,
"loss": -9.0111,
"step": 1701
},
{
"epoch": 0.0017028514257128565,
"grad_norm": 78.08659362792969,
"learning_rate": 3e-06,
"loss": -10.6981,
"step": 1702
},
{
"epoch": 0.0017038519259629816,
"grad_norm": 88.7759780883789,
"learning_rate": 3e-06,
"loss": -9.1064,
"step": 1703
},
{
"epoch": 0.0017048524262131064,
"grad_norm": 80.99433898925781,
"learning_rate": 3e-06,
"loss": -11.0677,
"step": 1704
},
{
"completion_length": 253.25,
"epoch": 0.0017058529264632315,
"grad_norm": 118.51707458496094,
"learning_rate": 3e-06,
"loss": -0.7235,
"reward": 0.2441716343164444,
"reward_std": 0.15575607120990753,
"rewards/sudoku_reward_func": 0.2441716343164444,
"step": 1705,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017068534267133566,
"grad_norm": 87.36502838134766,
"learning_rate": 3e-06,
"loss": 3.1127,
"step": 1706
},
{
"epoch": 0.0017078539269634817,
"grad_norm": 82.34584045410156,
"learning_rate": 3e-06,
"loss": -0.7819,
"step": 1707
},
{
"epoch": 0.0017088544272136068,
"grad_norm": 74.79312896728516,
"learning_rate": 3e-06,
"loss": 2.0932,
"step": 1708
},
{
"epoch": 0.001709854927463732,
"grad_norm": 107.77957153320312,
"learning_rate": 3e-06,
"loss": -1.5862,
"step": 1709
},
{
"epoch": 0.001710855427713857,
"grad_norm": 78.9430160522461,
"learning_rate": 3e-06,
"loss": 2.499,
"step": 1710
},
{
"epoch": 0.0017118559279639819,
"grad_norm": 88.64921569824219,
"learning_rate": 3e-06,
"loss": -1.5229,
"step": 1711
},
{
"epoch": 0.001712856428214107,
"grad_norm": 80.30286407470703,
"learning_rate": 3e-06,
"loss": 1.0471,
"step": 1712
},
{
"completion_length": 255.9791717529297,
"epoch": 0.001713856928464232,
"grad_norm": 102.88541412353516,
"learning_rate": 3e-06,
"loss": 5.9438,
"reward": 0.23086145520210266,
"reward_std": 0.1465577483177185,
"rewards/sudoku_reward_func": 0.23086144030094147,
"step": 1713,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017148574287143572,
"grad_norm": 96.9382095336914,
"learning_rate": 3e-06,
"loss": -1.8535,
"step": 1714
},
{
"epoch": 0.0017158579289644823,
"grad_norm": 93.00968170166016,
"learning_rate": 3e-06,
"loss": 2.9559,
"step": 1715
},
{
"epoch": 0.0017168584292146074,
"grad_norm": 115.1343002319336,
"learning_rate": 3e-06,
"loss": 5.5838,
"step": 1716
},
{
"epoch": 0.0017178589294647325,
"grad_norm": 117.99451446533203,
"learning_rate": 3e-06,
"loss": 5.6028,
"step": 1717
},
{
"epoch": 0.0017188594297148573,
"grad_norm": 105.47667694091797,
"learning_rate": 3e-06,
"loss": -2.0703,
"step": 1718
},
{
"epoch": 0.0017198599299649824,
"grad_norm": 87.09989166259766,
"learning_rate": 3e-06,
"loss": 1.6817,
"step": 1719
},
{
"epoch": 0.0017208604302151075,
"grad_norm": 111.20448303222656,
"learning_rate": 3e-06,
"loss": 5.465,
"step": 1720
},
{
"completion_length": 252.1666717529297,
"epoch": 0.0017218609304652326,
"grad_norm": 138.27731323242188,
"learning_rate": 3e-06,
"loss": -15.3219,
"reward": 0.23909859359264374,
"reward_std": 0.14476260542869568,
"rewards/sudoku_reward_func": 0.23909857124090195,
"step": 1721,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017228614307153577,
"grad_norm": 109.71159362792969,
"learning_rate": 3e-06,
"loss": -13.3281,
"step": 1722
},
{
"epoch": 0.0017238619309654828,
"grad_norm": 103.57756042480469,
"learning_rate": 3e-06,
"loss": -12.6453,
"step": 1723
},
{
"epoch": 0.001724862431215608,
"grad_norm": 126.73169708251953,
"learning_rate": 3e-06,
"loss": -9.6316,
"step": 1724
},
{
"epoch": 0.0017258629314657328,
"grad_norm": 124.72553253173828,
"learning_rate": 3e-06,
"loss": -15.9503,
"step": 1725
},
{
"epoch": 0.0017268634317158579,
"grad_norm": 105.17279052734375,
"learning_rate": 3e-06,
"loss": -14.3943,
"step": 1726
},
{
"epoch": 0.001727863931965983,
"grad_norm": 102.4739990234375,
"learning_rate": 3e-06,
"loss": -13.3378,
"step": 1727
},
{
"epoch": 0.001728864432216108,
"grad_norm": 138.15892028808594,
"learning_rate": 3e-06,
"loss": -9.9653,
"step": 1728
},
{
"completion_length": 254.39584350585938,
"epoch": 0.0017298649324662331,
"grad_norm": 96.33226776123047,
"learning_rate": 3e-06,
"loss": -11.8497,
"reward": 0.2301737666130066,
"reward_std": 0.14909013360738754,
"rewards/sudoku_reward_func": 0.2301737666130066,
"step": 1729,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017308654327163582,
"grad_norm": 86.83834838867188,
"learning_rate": 3e-06,
"loss": -16.6763,
"step": 1730
},
{
"epoch": 0.0017318659329664833,
"grad_norm": 101.6478271484375,
"learning_rate": 3e-06,
"loss": -10.558,
"step": 1731
},
{
"epoch": 0.0017328664332166082,
"grad_norm": 93.8929214477539,
"learning_rate": 3e-06,
"loss": -11.1721,
"step": 1732
},
{
"epoch": 0.0017338669334667333,
"grad_norm": 89.75656127929688,
"learning_rate": 3e-06,
"loss": -12.3965,
"step": 1733
},
{
"epoch": 0.0017348674337168584,
"grad_norm": 97.66020202636719,
"learning_rate": 3e-06,
"loss": -17.2704,
"step": 1734
},
{
"epoch": 0.0017358679339669835,
"grad_norm": 88.83231353759766,
"learning_rate": 3e-06,
"loss": -11.6396,
"step": 1735
},
{
"epoch": 0.0017368684342171086,
"grad_norm": 119.40660858154297,
"learning_rate": 3e-06,
"loss": -11.6067,
"step": 1736
},
{
"completion_length": 252.1666717529297,
"epoch": 0.0017378689344672337,
"grad_norm": 119.9146957397461,
"learning_rate": 3e-06,
"loss": -9.2346,
"reward": 0.28505294024944305,
"reward_std": 0.1518668606877327,
"rewards/sudoku_reward_func": 0.28505291789770126,
"step": 1737,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017388694347173588,
"grad_norm": 151.11672973632812,
"learning_rate": 3e-06,
"loss": -5.4549,
"step": 1738
},
{
"epoch": 0.0017398699349674837,
"grad_norm": 101.99298095703125,
"learning_rate": 3e-06,
"loss": -6.1653,
"step": 1739
},
{
"epoch": 0.0017408704352176087,
"grad_norm": 143.7563018798828,
"learning_rate": 3e-06,
"loss": -5.4064,
"step": 1740
},
{
"epoch": 0.0017418709354677338,
"grad_norm": 130.0487823486328,
"learning_rate": 3e-06,
"loss": -10.9143,
"step": 1741
},
{
"epoch": 0.001742871435717859,
"grad_norm": 164.75473022460938,
"learning_rate": 3e-06,
"loss": -7.4136,
"step": 1742
},
{
"epoch": 0.001743871935967984,
"grad_norm": 102.13026428222656,
"learning_rate": 3e-06,
"loss": -7.0587,
"step": 1743
},
{
"epoch": 0.0017448724362181091,
"grad_norm": 105.52335357666016,
"learning_rate": 3e-06,
"loss": -6.7822,
"step": 1744
},
{
"completion_length": 253.58334350585938,
"epoch": 0.0017458729364682342,
"grad_norm": 89.50044250488281,
"learning_rate": 3e-06,
"loss": -1.5483,
"reward": 0.22809945046901703,
"reward_std": 0.12719708681106567,
"rewards/sudoku_reward_func": 0.22809944301843643,
"step": 1745,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001746873436718359,
"grad_norm": 101.14523315429688,
"learning_rate": 3e-06,
"loss": -2.5046,
"step": 1746
},
{
"epoch": 0.0017478739369684842,
"grad_norm": 62.526153564453125,
"learning_rate": 3e-06,
"loss": -0.9477,
"step": 1747
},
{
"epoch": 0.0017488744372186093,
"grad_norm": 67.7402572631836,
"learning_rate": 3e-06,
"loss": -7.4768,
"step": 1748
},
{
"epoch": 0.0017498749374687344,
"grad_norm": 115.06365203857422,
"learning_rate": 3e-06,
"loss": -2.7306,
"step": 1749
},
{
"epoch": 0.0017508754377188595,
"grad_norm": 87.61795043945312,
"learning_rate": 3e-06,
"loss": -2.4428,
"step": 1750
},
{
"epoch": 0.0017518759379689846,
"grad_norm": 64.8736801147461,
"learning_rate": 3e-06,
"loss": -1.5924,
"step": 1751
},
{
"epoch": 0.0017528764382191097,
"grad_norm": 86.32395935058594,
"learning_rate": 3e-06,
"loss": -7.8121,
"step": 1752
},
{
"completion_length": 254.3125,
"epoch": 0.0017538769384692345,
"grad_norm": 124.07573699951172,
"learning_rate": 3e-06,
"loss": 0.7756,
"reward": 0.24261214584112167,
"reward_std": 0.15232361108064651,
"rewards/sudoku_reward_func": 0.24261213093996048,
"step": 1753,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017548774387193596,
"grad_norm": 96.29853820800781,
"learning_rate": 3e-06,
"loss": -2.1126,
"step": 1754
},
{
"epoch": 0.0017558779389694847,
"grad_norm": 90.17716979980469,
"learning_rate": 3e-06,
"loss": 3.0019,
"step": 1755
},
{
"epoch": 0.0017568784392196098,
"grad_norm": 100.68418884277344,
"learning_rate": 3e-06,
"loss": -1.2563,
"step": 1756
},
{
"epoch": 0.001757878939469735,
"grad_norm": 115.46549987792969,
"learning_rate": 3e-06,
"loss": -0.2155,
"step": 1757
},
{
"epoch": 0.00175887943971986,
"grad_norm": 105.72683715820312,
"learning_rate": 3e-06,
"loss": -2.9011,
"step": 1758
},
{
"epoch": 0.001759879939969985,
"grad_norm": 84.93206787109375,
"learning_rate": 3e-06,
"loss": 2.8186,
"step": 1759
},
{
"epoch": 0.00176088044022011,
"grad_norm": 107.25251007080078,
"learning_rate": 3e-06,
"loss": -1.8318,
"step": 1760
},
{
"completion_length": 252.8125,
"epoch": 0.001761880940470235,
"grad_norm": 103.16302490234375,
"learning_rate": 3e-06,
"loss": -3.5031,
"reward": 0.20936673879623413,
"reward_std": 0.13528703153133392,
"rewards/sudoku_reward_func": 0.20936672389507294,
"step": 1761,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017628814407203602,
"grad_norm": 148.88758850097656,
"learning_rate": 3e-06,
"loss": -4.4631,
"step": 1762
},
{
"epoch": 0.0017638819409704853,
"grad_norm": 76.8835678100586,
"learning_rate": 3e-06,
"loss": -4.734,
"step": 1763
},
{
"epoch": 0.0017648824412206103,
"grad_norm": 178.9423828125,
"learning_rate": 3e-06,
"loss": -7.8421,
"step": 1764
},
{
"epoch": 0.0017658829414707354,
"grad_norm": 113.23098754882812,
"learning_rate": 3e-06,
"loss": -4.039,
"step": 1765
},
{
"epoch": 0.0017668834417208605,
"grad_norm": 125.62858581542969,
"learning_rate": 3e-06,
"loss": -5.1876,
"step": 1766
},
{
"epoch": 0.0017678839419709854,
"grad_norm": 81.04549407958984,
"learning_rate": 3e-06,
"loss": -5.3517,
"step": 1767
},
{
"epoch": 0.0017688844422211105,
"grad_norm": 155.84767150878906,
"learning_rate": 3e-06,
"loss": -7.7305,
"step": 1768
},
{
"completion_length": 253.52084350585938,
"epoch": 0.0017698849424712356,
"grad_norm": 183.12640380859375,
"learning_rate": 3e-06,
"loss": 5.874,
"reward": 0.23503637313842773,
"reward_std": 0.15780103206634521,
"rewards/sudoku_reward_func": 0.23503635823726654,
"step": 1769,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017708854427213607,
"grad_norm": 134.1517333984375,
"learning_rate": 3e-06,
"loss": 5.0481,
"step": 1770
},
{
"epoch": 0.0017718859429714858,
"grad_norm": 200.28854370117188,
"learning_rate": 3e-06,
"loss": 9.3885,
"step": 1771
},
{
"epoch": 0.0017728864432216109,
"grad_norm": 118.0494155883789,
"learning_rate": 3e-06,
"loss": 11.8479,
"step": 1772
},
{
"epoch": 0.001773886943471736,
"grad_norm": 153.0768585205078,
"learning_rate": 3e-06,
"loss": 4.5118,
"step": 1773
},
{
"epoch": 0.0017748874437218609,
"grad_norm": 134.91383361816406,
"learning_rate": 3e-06,
"loss": 4.0104,
"step": 1774
},
{
"epoch": 0.001775887943971986,
"grad_norm": 102.95285034179688,
"learning_rate": 3e-06,
"loss": 7.8211,
"step": 1775
},
{
"epoch": 0.001776888444222111,
"grad_norm": 161.75311279296875,
"learning_rate": 3e-06,
"loss": 9.3988,
"step": 1776
},
{
"completion_length": 256.0,
"epoch": 0.0017778889444722361,
"grad_norm": 76.7910385131836,
"learning_rate": 3e-06,
"loss": 2.0049,
"reward": 0.24829020351171494,
"reward_std": 0.1371561922132969,
"rewards/sudoku_reward_func": 0.24829020351171494,
"step": 1777,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017788894447223612,
"grad_norm": 115.39936828613281,
"learning_rate": 3e-06,
"loss": 0.9716,
"step": 1778
},
{
"epoch": 0.0017798899449724863,
"grad_norm": 96.10639953613281,
"learning_rate": 3e-06,
"loss": -0.8323,
"step": 1779
},
{
"epoch": 0.0017808904452226112,
"grad_norm": 98.98098754882812,
"learning_rate": 3e-06,
"loss": -3.2226,
"step": 1780
},
{
"epoch": 0.0017818909454727363,
"grad_norm": 63.91598129272461,
"learning_rate": 3e-06,
"loss": 1.5459,
"step": 1781
},
{
"epoch": 0.0017828914457228614,
"grad_norm": 85.2409896850586,
"learning_rate": 3e-06,
"loss": 0.9568,
"step": 1782
},
{
"epoch": 0.0017838919459729865,
"grad_norm": 92.9672622680664,
"learning_rate": 3e-06,
"loss": -1.4394,
"step": 1783
},
{
"epoch": 0.0017848924462231116,
"grad_norm": 105.71380615234375,
"learning_rate": 3e-06,
"loss": -3.4215,
"step": 1784
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0017858929464732367,
"grad_norm": 136.5750274658203,
"learning_rate": 3e-06,
"loss": -7.522,
"reward": 0.23626518994569778,
"reward_std": 0.1622900366783142,
"rewards/sudoku_reward_func": 0.23626518994569778,
"step": 1785,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0017868934467233618,
"grad_norm": 206.81781005859375,
"learning_rate": 3e-06,
"loss": -8.6697,
"step": 1786
},
{
"epoch": 0.0017878939469734866,
"grad_norm": 103.22502136230469,
"learning_rate": 3e-06,
"loss": -13.4631,
"step": 1787
},
{
"epoch": 0.0017888944472236117,
"grad_norm": 160.32032775878906,
"learning_rate": 3e-06,
"loss": -6.0248,
"step": 1788
},
{
"epoch": 0.0017898949474737368,
"grad_norm": 138.78746032714844,
"learning_rate": 3e-06,
"loss": -7.9239,
"step": 1789
},
{
"epoch": 0.001790895447723862,
"grad_norm": 199.31930541992188,
"learning_rate": 3e-06,
"loss": -10.0847,
"step": 1790
},
{
"epoch": 0.001791895947973987,
"grad_norm": 106.71943664550781,
"learning_rate": 3e-06,
"loss": -13.6951,
"step": 1791
},
{
"epoch": 0.0017928964482241121,
"grad_norm": 159.4205322265625,
"learning_rate": 3e-06,
"loss": -6.4479,
"step": 1792
},
{
"completion_length": 254.1875,
"epoch": 0.0017938969484742372,
"grad_norm": 96.4322509765625,
"learning_rate": 3e-06,
"loss": -9.344,
"reward": 0.2364831268787384,
"reward_std": 0.15473373234272003,
"rewards/sudoku_reward_func": 0.2364831268787384,
"step": 1793,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001794897448724362,
"grad_norm": 153.9132537841797,
"learning_rate": 3e-06,
"loss": -14.0934,
"step": 1794
},
{
"epoch": 0.0017958979489744872,
"grad_norm": 76.87872314453125,
"learning_rate": 3e-06,
"loss": -13.9281,
"step": 1795
},
{
"epoch": 0.0017968984492246123,
"grad_norm": 55.318077087402344,
"learning_rate": 3e-06,
"loss": -11.7386,
"step": 1796
},
{
"epoch": 0.0017978989494747374,
"grad_norm": 121.17082214355469,
"learning_rate": 3e-06,
"loss": -9.2901,
"step": 1797
},
{
"epoch": 0.0017988994497248625,
"grad_norm": 131.7681427001953,
"learning_rate": 3e-06,
"loss": -14.4251,
"step": 1798
},
{
"epoch": 0.0017998999499749875,
"grad_norm": 76.07759857177734,
"learning_rate": 3e-06,
"loss": -14.1956,
"step": 1799
},
{
"epoch": 0.0018009004502251126,
"grad_norm": 62.16264724731445,
"learning_rate": 3e-06,
"loss": -12.19,
"step": 1800
},
{
"completion_length": 252.43750762939453,
"epoch": 0.0018019009504752375,
"grad_norm": 108.04124450683594,
"learning_rate": 3e-06,
"loss": 5.9732,
"reward": 0.25009771436452866,
"reward_std": 0.1837964430451393,
"rewards/sudoku_reward_func": 0.25009771436452866,
"step": 1801,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018029014507253626,
"grad_norm": 148.05174255371094,
"learning_rate": 3e-06,
"loss": 9.4788,
"step": 1802
},
{
"epoch": 0.0018039019509754877,
"grad_norm": 158.5712890625,
"learning_rate": 3e-06,
"loss": 7.4384,
"step": 1803
},
{
"epoch": 0.0018049024512256128,
"grad_norm": 84.07239532470703,
"learning_rate": 3e-06,
"loss": 9.1441,
"step": 1804
},
{
"epoch": 0.001805902951475738,
"grad_norm": 108.87689971923828,
"learning_rate": 3e-06,
"loss": 4.8344,
"step": 1805
},
{
"epoch": 0.001806903451725863,
"grad_norm": 143.6099853515625,
"learning_rate": 3e-06,
"loss": 8.816,
"step": 1806
},
{
"epoch": 0.001807903951975988,
"grad_norm": 150.4852752685547,
"learning_rate": 3e-06,
"loss": 7.8367,
"step": 1807
},
{
"epoch": 0.001808904452226113,
"grad_norm": 149.5937042236328,
"learning_rate": 3e-06,
"loss": 7.4894,
"step": 1808
},
{
"completion_length": 254.0416717529297,
"epoch": 0.001809904952476238,
"grad_norm": 63.49519348144531,
"learning_rate": 3e-06,
"loss": -1.8099,
"reward": 0.23842592537403107,
"reward_std": 0.12684232741594315,
"rewards/sudoku_reward_func": 0.23842591792345047,
"step": 1809,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018109054527263631,
"grad_norm": 75.89506530761719,
"learning_rate": 3e-06,
"loss": -5.6479,
"step": 1810
},
{
"epoch": 0.0018119059529764882,
"grad_norm": 63.72089385986328,
"learning_rate": 3e-06,
"loss": -3.8267,
"step": 1811
},
{
"epoch": 0.0018129064532266133,
"grad_norm": 119.11578369140625,
"learning_rate": 3e-06,
"loss": -3.6712,
"step": 1812
},
{
"epoch": 0.0018139069534767384,
"grad_norm": 56.39674377441406,
"learning_rate": 3e-06,
"loss": -2.3428,
"step": 1813
},
{
"epoch": 0.0018149074537268635,
"grad_norm": 58.924068450927734,
"learning_rate": 3e-06,
"loss": -5.7407,
"step": 1814
},
{
"epoch": 0.0018159079539769884,
"grad_norm": 52.398563385009766,
"learning_rate": 3e-06,
"loss": -4.5608,
"step": 1815
},
{
"epoch": 0.0018169084542271135,
"grad_norm": 114.20697021484375,
"learning_rate": 3e-06,
"loss": -4.3273,
"step": 1816
},
{
"completion_length": 254.875,
"epoch": 0.0018179089544772386,
"grad_norm": 63.99118423461914,
"learning_rate": 3e-06,
"loss": -10.1683,
"reward": 0.24574241042137146,
"reward_std": 0.15981419384479523,
"rewards/sudoku_reward_func": 0.24574239552021027,
"step": 1817,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018189094547273637,
"grad_norm": 54.27163314819336,
"learning_rate": 3e-06,
"loss": -7.9241,
"step": 1818
},
{
"epoch": 0.0018199099549774888,
"grad_norm": 98.15486907958984,
"learning_rate": 3e-06,
"loss": -5.2589,
"step": 1819
},
{
"epoch": 0.0018209104552276139,
"grad_norm": 80.08932495117188,
"learning_rate": 3e-06,
"loss": -10.737,
"step": 1820
},
{
"epoch": 0.001821910955477739,
"grad_norm": 68.28329467773438,
"learning_rate": 3e-06,
"loss": -10.8951,
"step": 1821
},
{
"epoch": 0.0018229114557278638,
"grad_norm": 53.481849670410156,
"learning_rate": 3e-06,
"loss": -8.5383,
"step": 1822
},
{
"epoch": 0.001823911955977989,
"grad_norm": 97.5908203125,
"learning_rate": 3e-06,
"loss": -5.6582,
"step": 1823
},
{
"epoch": 0.001824912456228114,
"grad_norm": 92.74129486083984,
"learning_rate": 3e-06,
"loss": -11.6666,
"step": 1824
},
{
"completion_length": 255.45833587646484,
"epoch": 0.0018259129564782391,
"grad_norm": 68.78041076660156,
"learning_rate": 3e-06,
"loss": -2.671,
"reward": 0.21817130595445633,
"reward_std": 0.15033115446567535,
"rewards/sudoku_reward_func": 0.21817129850387573,
"step": 1825,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018269134567283642,
"grad_norm": 55.90199661254883,
"learning_rate": 3e-06,
"loss": -0.1168,
"step": 1826
},
{
"epoch": 0.0018279139569784893,
"grad_norm": 116.6128921508789,
"learning_rate": 3e-06,
"loss": -2.0107,
"step": 1827
},
{
"epoch": 0.0018289144572286144,
"grad_norm": 110.54891967773438,
"learning_rate": 3e-06,
"loss": -0.969,
"step": 1828
},
{
"epoch": 0.0018299149574787393,
"grad_norm": 94.7934341430664,
"learning_rate": 3e-06,
"loss": -3.8128,
"step": 1829
},
{
"epoch": 0.0018309154577288644,
"grad_norm": 64.49011993408203,
"learning_rate": 3e-06,
"loss": -0.8491,
"step": 1830
},
{
"epoch": 0.0018319159579789895,
"grad_norm": 158.86155700683594,
"learning_rate": 3e-06,
"loss": -4.1421,
"step": 1831
},
{
"epoch": 0.0018329164582291146,
"grad_norm": 105.3134536743164,
"learning_rate": 3e-06,
"loss": -3.6942,
"step": 1832
},
{
"completion_length": 256.0,
"epoch": 0.0018339169584792397,
"grad_norm": 83.44578552246094,
"learning_rate": 3e-06,
"loss": -27.8578,
"reward": 0.2110615223646164,
"reward_std": 0.14463221281766891,
"rewards/sudoku_reward_func": 0.2110615149140358,
"step": 1833,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018349174587293648,
"grad_norm": 111.02509307861328,
"learning_rate": 3e-06,
"loss": -22.4974,
"step": 1834
},
{
"epoch": 0.0018359179589794898,
"grad_norm": 88.58699798583984,
"learning_rate": 3e-06,
"loss": -21.2492,
"step": 1835
},
{
"epoch": 0.0018369184592296147,
"grad_norm": 79.12022399902344,
"learning_rate": 3e-06,
"loss": -25.1015,
"step": 1836
},
{
"epoch": 0.0018379189594797398,
"grad_norm": 101.9961166381836,
"learning_rate": 3e-06,
"loss": -28.2455,
"step": 1837
},
{
"epoch": 0.001838919459729865,
"grad_norm": 148.4155731201172,
"learning_rate": 3e-06,
"loss": -22.2837,
"step": 1838
},
{
"epoch": 0.00183991995997999,
"grad_norm": 168.54803466796875,
"learning_rate": 3e-06,
"loss": -21.5606,
"step": 1839
},
{
"epoch": 0.001840920460230115,
"grad_norm": 78.82660675048828,
"learning_rate": 3e-06,
"loss": -26.2552,
"step": 1840
},
{
"completion_length": 256.0,
"epoch": 0.0018419209604802402,
"grad_norm": 288.81829833984375,
"learning_rate": 3e-06,
"loss": -8.0671,
"reward": 0.25539247691631317,
"reward_std": 0.1576203741133213,
"rewards/sudoku_reward_func": 0.2553924694657326,
"step": 1841,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018429214607303653,
"grad_norm": 133.83079528808594,
"learning_rate": 3e-06,
"loss": -5.4173,
"step": 1842
},
{
"epoch": 0.0018439219609804902,
"grad_norm": 207.3365478515625,
"learning_rate": 3e-06,
"loss": -12.2163,
"step": 1843
},
{
"epoch": 0.0018449224612306153,
"grad_norm": 163.4165496826172,
"learning_rate": 3e-06,
"loss": -4.0535,
"step": 1844
},
{
"epoch": 0.0018459229614807404,
"grad_norm": 273.75286865234375,
"learning_rate": 3e-06,
"loss": -11.5609,
"step": 1845
},
{
"epoch": 0.0018469234617308654,
"grad_norm": 142.3900146484375,
"learning_rate": 3e-06,
"loss": -7.2858,
"step": 1846
},
{
"epoch": 0.0018479239619809905,
"grad_norm": 126.44757843017578,
"learning_rate": 3e-06,
"loss": -15.2951,
"step": 1847
},
{
"epoch": 0.0018489244622311156,
"grad_norm": 206.6569061279297,
"learning_rate": 3e-06,
"loss": -9.2882,
"step": 1848
},
{
"completion_length": 254.875,
"epoch": 0.0018499249624812407,
"grad_norm": 56.17991256713867,
"learning_rate": 3e-06,
"loss": -5.9357,
"reward": 0.2160218358039856,
"reward_std": 0.15568187460303307,
"rewards/sudoku_reward_func": 0.2160218134522438,
"step": 1849,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018509254627313656,
"grad_norm": 121.19395446777344,
"learning_rate": 3e-06,
"loss": -6.2401,
"step": 1850
},
{
"epoch": 0.0018519259629814907,
"grad_norm": 84.88428497314453,
"learning_rate": 3e-06,
"loss": -3.9807,
"step": 1851
},
{
"epoch": 0.0018529264632316158,
"grad_norm": 151.30287170410156,
"learning_rate": 3e-06,
"loss": -1.2683,
"step": 1852
},
{
"epoch": 0.0018539269634817409,
"grad_norm": 103.15193939208984,
"learning_rate": 3e-06,
"loss": -6.3061,
"step": 1853
},
{
"epoch": 0.001854927463731866,
"grad_norm": 95.67821502685547,
"learning_rate": 3e-06,
"loss": -8.745,
"step": 1854
},
{
"epoch": 0.001855927963981991,
"grad_norm": 158.7672576904297,
"learning_rate": 3e-06,
"loss": -3.6772,
"step": 1855
},
{
"epoch": 0.001856928464232116,
"grad_norm": 132.406982421875,
"learning_rate": 3e-06,
"loss": -1.2915,
"step": 1856
},
{
"completion_length": 255.9791717529297,
"epoch": 0.001857928964482241,
"grad_norm": 215.91055297851562,
"learning_rate": 3e-06,
"loss": -1.8523,
"reward": 0.23044809699058533,
"reward_std": 0.14949989318847656,
"rewards/sudoku_reward_func": 0.23044808208942413,
"step": 1857,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018589294647323661,
"grad_norm": 133.122314453125,
"learning_rate": 3e-06,
"loss": -3.0281,
"step": 1858
},
{
"epoch": 0.0018599299649824912,
"grad_norm": 162.92601013183594,
"learning_rate": 3e-06,
"loss": -1.8028,
"step": 1859
},
{
"epoch": 0.0018609304652326163,
"grad_norm": 132.5195770263672,
"learning_rate": 3e-06,
"loss": 1.1273,
"step": 1860
},
{
"epoch": 0.0018619309654827414,
"grad_norm": 143.70330810546875,
"learning_rate": 3e-06,
"loss": -4.3285,
"step": 1861
},
{
"epoch": 0.0018629314657328665,
"grad_norm": 126.70255279541016,
"learning_rate": 3e-06,
"loss": -4.0841,
"step": 1862
},
{
"epoch": 0.0018639319659829914,
"grad_norm": 194.76414489746094,
"learning_rate": 3e-06,
"loss": -2.7091,
"step": 1863
},
{
"epoch": 0.0018649324662331165,
"grad_norm": 186.4334716796875,
"learning_rate": 3e-06,
"loss": -0.7513,
"step": 1864
},
{
"completion_length": 249.2916717529297,
"epoch": 0.0018659329664832416,
"grad_norm": 172.89756774902344,
"learning_rate": 3e-06,
"loss": -1.016,
"reward": 0.24346517026424408,
"reward_std": 0.1537521705031395,
"rewards/sudoku_reward_func": 0.24346517026424408,
"step": 1865,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018669334667333667,
"grad_norm": 140.2328338623047,
"learning_rate": 3e-06,
"loss": -0.0186,
"step": 1866
},
{
"epoch": 0.0018679339669834918,
"grad_norm": 167.56224060058594,
"learning_rate": 3e-06,
"loss": -1.1759,
"step": 1867
},
{
"epoch": 0.0018689344672336169,
"grad_norm": 129.8845977783203,
"learning_rate": 3e-06,
"loss": 4.8728,
"step": 1868
},
{
"epoch": 0.001869934967483742,
"grad_norm": 170.2915802001953,
"learning_rate": 3e-06,
"loss": -0.5258,
"step": 1869
},
{
"epoch": 0.0018709354677338668,
"grad_norm": 114.28946685791016,
"learning_rate": 3e-06,
"loss": -0.7656,
"step": 1870
},
{
"epoch": 0.001871935967983992,
"grad_norm": 186.74124145507812,
"learning_rate": 3e-06,
"loss": -2.6374,
"step": 1871
},
{
"epoch": 0.001872936468234117,
"grad_norm": 153.4672393798828,
"learning_rate": 3e-06,
"loss": 4.74,
"step": 1872
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0018739369684842421,
"grad_norm": 164.3131103515625,
"learning_rate": 3e-06,
"loss": -5.3978,
"reward": 0.2078373059630394,
"reward_std": 0.15452614426612854,
"rewards/sudoku_reward_func": 0.2078373059630394,
"step": 1873,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018749374687343672,
"grad_norm": 190.09767150878906,
"learning_rate": 3e-06,
"loss": -3.6008,
"step": 1874
},
{
"epoch": 0.0018759379689844923,
"grad_norm": 309.4437255859375,
"learning_rate": 3e-06,
"loss": -1.9107,
"step": 1875
},
{
"epoch": 0.0018769384692346174,
"grad_norm": 193.36318969726562,
"learning_rate": 3e-06,
"loss": 0.3439,
"step": 1876
},
{
"epoch": 0.0018779389694847423,
"grad_norm": 130.9011993408203,
"learning_rate": 3e-06,
"loss": -6.1956,
"step": 1877
},
{
"epoch": 0.0018789394697348674,
"grad_norm": 190.30758666992188,
"learning_rate": 3e-06,
"loss": -4.9197,
"step": 1878
},
{
"epoch": 0.0018799399699849925,
"grad_norm": 171.53958129882812,
"learning_rate": 3e-06,
"loss": -3.2976,
"step": 1879
},
{
"epoch": 0.0018809404702351176,
"grad_norm": 245.52931213378906,
"learning_rate": 3e-06,
"loss": -3.0038,
"step": 1880
},
{
"completion_length": 256.0,
"epoch": 0.0018819409704852426,
"grad_norm": 128.33712768554688,
"learning_rate": 3e-06,
"loss": -12.8456,
"reward": 0.24286392331123352,
"reward_std": 0.15595544129610062,
"rewards/sudoku_reward_func": 0.24286391586065292,
"step": 1881,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018829414707353677,
"grad_norm": 120.86560821533203,
"learning_rate": 3e-06,
"loss": -10.8183,
"step": 1882
},
{
"epoch": 0.0018839419709854928,
"grad_norm": 223.6404266357422,
"learning_rate": 3e-06,
"loss": -17.097,
"step": 1883
},
{
"epoch": 0.0018849424712356177,
"grad_norm": 105.8646469116211,
"learning_rate": 3e-06,
"loss": -11.4175,
"step": 1884
},
{
"epoch": 0.0018859429714857428,
"grad_norm": 121.3120346069336,
"learning_rate": 3e-06,
"loss": -12.857,
"step": 1885
},
{
"epoch": 0.001886943471735868,
"grad_norm": 138.72531127929688,
"learning_rate": 3e-06,
"loss": -11.7301,
"step": 1886
},
{
"epoch": 0.001887943971985993,
"grad_norm": 215.96949768066406,
"learning_rate": 3e-06,
"loss": -17.1948,
"step": 1887
},
{
"epoch": 0.001888944472236118,
"grad_norm": 111.08206939697266,
"learning_rate": 3e-06,
"loss": -12.625,
"step": 1888
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0018899449724862432,
"grad_norm": 156.65145874023438,
"learning_rate": 3e-06,
"loss": 0.5665,
"reward": 0.25066138803958893,
"reward_std": 0.12561482936143875,
"rewards/sudoku_reward_func": 0.25066138803958893,
"step": 1889,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018909454727363683,
"grad_norm": 107.05677032470703,
"learning_rate": 3e-06,
"loss": -1.7127,
"step": 1890
},
{
"epoch": 0.0018919459729864932,
"grad_norm": 131.33168029785156,
"learning_rate": 3e-06,
"loss": 2.0091,
"step": 1891
},
{
"epoch": 0.0018929464732366182,
"grad_norm": 90.71002197265625,
"learning_rate": 3e-06,
"loss": 4.5182,
"step": 1892
},
{
"epoch": 0.0018939469734867433,
"grad_norm": 138.46646118164062,
"learning_rate": 3e-06,
"loss": 0.3023,
"step": 1893
},
{
"epoch": 0.0018949474737368684,
"grad_norm": 117.40104675292969,
"learning_rate": 3e-06,
"loss": -1.9082,
"step": 1894
},
{
"epoch": 0.0018959479739869935,
"grad_norm": 141.14013671875,
"learning_rate": 3e-06,
"loss": 0.8231,
"step": 1895
},
{
"epoch": 0.0018969484742371186,
"grad_norm": 164.23402404785156,
"learning_rate": 3e-06,
"loss": 4.1673,
"step": 1896
},
{
"completion_length": 254.20833587646484,
"epoch": 0.0018979489744872437,
"grad_norm": 177.48680114746094,
"learning_rate": 3e-06,
"loss": -10.6794,
"reward": 0.24317581951618195,
"reward_std": 0.14005287736654282,
"rewards/sudoku_reward_func": 0.24317579716444016,
"step": 1897,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0018989494747373686,
"grad_norm": 230.07167053222656,
"learning_rate": 3e-06,
"loss": -11.7156,
"step": 1898
},
{
"epoch": 0.0018999499749874937,
"grad_norm": 184.52452087402344,
"learning_rate": 3e-06,
"loss": -2.0613,
"step": 1899
},
{
"epoch": 0.0019009504752376188,
"grad_norm": 199.63613891601562,
"learning_rate": 3e-06,
"loss": -9.7111,
"step": 1900
},
{
"epoch": 0.0019019509754877439,
"grad_norm": 261.729736328125,
"learning_rate": 3e-06,
"loss": -12.5346,
"step": 1901
},
{
"epoch": 0.001902951475737869,
"grad_norm": 174.4585418701172,
"learning_rate": 3e-06,
"loss": -14.4528,
"step": 1902
},
{
"epoch": 0.001903951975987994,
"grad_norm": 189.9020233154297,
"learning_rate": 3e-06,
"loss": -3.4384,
"step": 1903
},
{
"epoch": 0.0019049524762381192,
"grad_norm": 160.3231964111328,
"learning_rate": 3e-06,
"loss": -11.0285,
"step": 1904
},
{
"completion_length": 256.0,
"epoch": 0.001905952976488244,
"grad_norm": 193.59310913085938,
"learning_rate": 3e-06,
"loss": -6.2462,
"reward": 0.25355491042137146,
"reward_std": 0.16307833790779114,
"rewards/sudoku_reward_func": 0.25355489552021027,
"step": 1905,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019069534767383691,
"grad_norm": 207.5780792236328,
"learning_rate": 3e-06,
"loss": -6.7655,
"step": 1906
},
{
"epoch": 0.0019079539769884942,
"grad_norm": 170.86962890625,
"learning_rate": 3e-06,
"loss": -10.8921,
"step": 1907
},
{
"epoch": 0.0019089544772386193,
"grad_norm": 140.1287841796875,
"learning_rate": 3e-06,
"loss": -18.6721,
"step": 1908
},
{
"epoch": 0.0019099549774887444,
"grad_norm": 270.27630615234375,
"learning_rate": 3e-06,
"loss": -6.2828,
"step": 1909
},
{
"epoch": 0.0019109554777388695,
"grad_norm": 183.57421875,
"learning_rate": 3e-06,
"loss": -8.3827,
"step": 1910
},
{
"epoch": 0.0019119559779889946,
"grad_norm": 210.7086944580078,
"learning_rate": 3e-06,
"loss": -12.2565,
"step": 1911
},
{
"epoch": 0.0019129564782391195,
"grad_norm": 148.4086456298828,
"learning_rate": 3e-06,
"loss": -19.4607,
"step": 1912
},
{
"completion_length": 253.125,
"epoch": 0.0019139569784892446,
"grad_norm": 140.03799438476562,
"learning_rate": 3e-06,
"loss": 6.7997,
"reward": 0.2821180745959282,
"reward_std": 0.16406304389238358,
"rewards/sudoku_reward_func": 0.282118059694767,
"step": 1913,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019149574787393697,
"grad_norm": 161.60719299316406,
"learning_rate": 3e-06,
"loss": -3.7872,
"step": 1914
},
{
"epoch": 0.0019159579789894948,
"grad_norm": 165.4402313232422,
"learning_rate": 3e-06,
"loss": -0.9041,
"step": 1915
},
{
"epoch": 0.0019169584792396198,
"grad_norm": 174.52320861816406,
"learning_rate": 3e-06,
"loss": 5.3668,
"step": 1916
},
{
"epoch": 0.001917958979489745,
"grad_norm": 172.8831329345703,
"learning_rate": 3e-06,
"loss": 5.6679,
"step": 1917
},
{
"epoch": 0.00191895947973987,
"grad_norm": 207.86276245117188,
"learning_rate": 3e-06,
"loss": -4.6441,
"step": 1918
},
{
"epoch": 0.001919959979989995,
"grad_norm": 141.2104034423828,
"learning_rate": 3e-06,
"loss": -2.1364,
"step": 1919
},
{
"epoch": 0.00192096048024012,
"grad_norm": 168.36842346191406,
"learning_rate": 3e-06,
"loss": 4.9658,
"step": 1920
},
{
"completion_length": 253.95834350585938,
"epoch": 0.001921960980490245,
"grad_norm": 89.40079498291016,
"learning_rate": 3e-06,
"loss": -10.3347,
"reward": 0.28163330256938934,
"reward_std": 0.11973906680941582,
"rewards/sudoku_reward_func": 0.28163329511880875,
"step": 1921,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019229614807403702,
"grad_norm": 155.6529083251953,
"learning_rate": 3e-06,
"loss": -11.0518,
"step": 1922
},
{
"epoch": 0.0019239619809904953,
"grad_norm": 137.2110595703125,
"learning_rate": 3e-06,
"loss": -12.0976,
"step": 1923
},
{
"epoch": 0.0019249624812406204,
"grad_norm": 133.95614624023438,
"learning_rate": 3e-06,
"loss": -9.0308,
"step": 1924
},
{
"epoch": 0.0019259629814907455,
"grad_norm": 108.86006927490234,
"learning_rate": 3e-06,
"loss": -10.1471,
"step": 1925
},
{
"epoch": 0.0019269634817408704,
"grad_norm": 147.3195037841797,
"learning_rate": 3e-06,
"loss": -12.1494,
"step": 1926
},
{
"epoch": 0.0019279639819909954,
"grad_norm": 127.62635803222656,
"learning_rate": 3e-06,
"loss": -13.4963,
"step": 1927
},
{
"epoch": 0.0019289644822411205,
"grad_norm": 123.51240539550781,
"learning_rate": 3e-06,
"loss": -10.2573,
"step": 1928
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0019299649824912456,
"grad_norm": 151.781982421875,
"learning_rate": 3e-06,
"loss": -22.1567,
"reward": 0.26459161937236786,
"reward_std": 0.17789562046527863,
"rewards/sudoku_reward_func": 0.26459160447120667,
"step": 1929,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019309654827413707,
"grad_norm": 157.08926391601562,
"learning_rate": 3e-06,
"loss": -19.2895,
"step": 1930
},
{
"epoch": 0.0019319659829914958,
"grad_norm": 170.2781219482422,
"learning_rate": 3e-06,
"loss": -12.0409,
"step": 1931
},
{
"epoch": 0.001932966483241621,
"grad_norm": 162.08419799804688,
"learning_rate": 3e-06,
"loss": -15.2176,
"step": 1932
},
{
"epoch": 0.0019339669834917458,
"grad_norm": 162.1764373779297,
"learning_rate": 3e-06,
"loss": -22.3389,
"step": 1933
},
{
"epoch": 0.0019349674837418709,
"grad_norm": 153.0626983642578,
"learning_rate": 3e-06,
"loss": -20.8159,
"step": 1934
},
{
"epoch": 0.001935967983991996,
"grad_norm": 126.0255126953125,
"learning_rate": 3e-06,
"loss": -13.5504,
"step": 1935
},
{
"epoch": 0.001936968484242121,
"grad_norm": 145.0688934326172,
"learning_rate": 3e-06,
"loss": -16.9399,
"step": 1936
},
{
"completion_length": 254.14583587646484,
"epoch": 0.0019379689844922462,
"grad_norm": 110.93611907958984,
"learning_rate": 3e-06,
"loss": -9.5221,
"reward": 0.24819251149892807,
"reward_std": 0.13325944542884827,
"rewards/sudoku_reward_func": 0.24819249659776688,
"step": 1937,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019389694847423713,
"grad_norm": 197.55638122558594,
"learning_rate": 3e-06,
"loss": -11.8475,
"step": 1938
},
{
"epoch": 0.0019399699849924961,
"grad_norm": 132.9469757080078,
"learning_rate": 3e-06,
"loss": -13.7213,
"step": 1939
},
{
"epoch": 0.0019409704852426212,
"grad_norm": 145.57211303710938,
"learning_rate": 3e-06,
"loss": -10.4007,
"step": 1940
},
{
"epoch": 0.0019419709854927463,
"grad_norm": 139.77806091308594,
"learning_rate": 3e-06,
"loss": -11.1832,
"step": 1941
},
{
"epoch": 0.0019429714857428714,
"grad_norm": 175.5628662109375,
"learning_rate": 3e-06,
"loss": -12.1468,
"step": 1942
},
{
"epoch": 0.0019439719859929965,
"grad_norm": 179.16539001464844,
"learning_rate": 3e-06,
"loss": -15.5776,
"step": 1943
},
{
"epoch": 0.0019449724862431216,
"grad_norm": 140.24085998535156,
"learning_rate": 3e-06,
"loss": -11.1777,
"step": 1944
},
{
"completion_length": 255.95833587646484,
"epoch": 0.0019459729864932467,
"grad_norm": 171.1797332763672,
"learning_rate": 3e-06,
"loss": -2.8557,
"reward": 0.24582508206367493,
"reward_std": 0.15176919847726822,
"rewards/sudoku_reward_func": 0.24582506716251373,
"step": 1945,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019469734867433716,
"grad_norm": 192.55026245117188,
"learning_rate": 3e-06,
"loss": -0.0457,
"step": 1946
},
{
"epoch": 0.0019479739869934967,
"grad_norm": 227.40118408203125,
"learning_rate": 3e-06,
"loss": -1.2861,
"step": 1947
},
{
"epoch": 0.0019489744872436218,
"grad_norm": 251.78111267089844,
"learning_rate": 3e-06,
"loss": 0.6302,
"step": 1948
},
{
"epoch": 0.0019499749874937469,
"grad_norm": 199.07493591308594,
"learning_rate": 3e-06,
"loss": -3.8908,
"step": 1949
},
{
"epoch": 0.001950975487743872,
"grad_norm": 179.95046997070312,
"learning_rate": 3e-06,
"loss": -1.2324,
"step": 1950
},
{
"epoch": 0.001951975987993997,
"grad_norm": 189.5157470703125,
"learning_rate": 3e-06,
"loss": -3.2241,
"step": 1951
},
{
"epoch": 0.0019529764882441221,
"grad_norm": 246.80743408203125,
"learning_rate": 3e-06,
"loss": -0.6323,
"step": 1952
},
{
"completion_length": 255.9166717529297,
"epoch": 0.001953976988494247,
"grad_norm": 133.2589569091797,
"learning_rate": 3e-06,
"loss": 2.4421,
"reward": 0.2585640847682953,
"reward_std": 0.1397685706615448,
"rewards/sudoku_reward_func": 0.2585640847682953,
"step": 1953,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019549774887443723,
"grad_norm": 144.25059509277344,
"learning_rate": 3e-06,
"loss": -2.5594,
"step": 1954
},
{
"epoch": 0.001955977988994497,
"grad_norm": 193.25198364257812,
"learning_rate": 3e-06,
"loss": 1.5943,
"step": 1955
},
{
"epoch": 0.0019569784892446225,
"grad_norm": 146.21253967285156,
"learning_rate": 3e-06,
"loss": -0.3917,
"step": 1956
},
{
"epoch": 0.0019579789894947474,
"grad_norm": 157.32179260253906,
"learning_rate": 3e-06,
"loss": 1.1938,
"step": 1957
},
{
"epoch": 0.0019589794897448723,
"grad_norm": 131.586669921875,
"learning_rate": 3e-06,
"loss": -3.8024,
"step": 1958
},
{
"epoch": 0.0019599799899949976,
"grad_norm": 246.2342071533203,
"learning_rate": 3e-06,
"loss": 0.7853,
"step": 1959
},
{
"epoch": 0.0019609804902451225,
"grad_norm": 115.52106475830078,
"learning_rate": 3e-06,
"loss": -2.1448,
"step": 1960
},
{
"completion_length": 255.83333587646484,
"epoch": 0.0019619809904952478,
"grad_norm": 189.30178833007812,
"learning_rate": 3e-06,
"loss": 1.9125,
"reward": 0.2099228948354721,
"reward_std": 0.13680537045001984,
"rewards/sudoku_reward_func": 0.2099228873848915,
"step": 1961,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019629814907453727,
"grad_norm": 292.6360168457031,
"learning_rate": 3e-06,
"loss": 8.2895,
"step": 1962
},
{
"epoch": 0.001963981990995498,
"grad_norm": 222.63095092773438,
"learning_rate": 3e-06,
"loss": 2.543,
"step": 1963
},
{
"epoch": 0.001964982491245623,
"grad_norm": 154.17628479003906,
"learning_rate": 3e-06,
"loss": 3.8953,
"step": 1964
},
{
"epoch": 0.0019659829914957477,
"grad_norm": 155.28807067871094,
"learning_rate": 3e-06,
"loss": 1.0186,
"step": 1965
},
{
"epoch": 0.001966983491745873,
"grad_norm": 280.9315185546875,
"learning_rate": 3e-06,
"loss": 6.3139,
"step": 1966
},
{
"epoch": 0.001967983991995998,
"grad_norm": 150.9969482421875,
"learning_rate": 3e-06,
"loss": 1.5908,
"step": 1967
},
{
"epoch": 0.001968984492246123,
"grad_norm": 167.98971557617188,
"learning_rate": 3e-06,
"loss": 2.7659,
"step": 1968
},
{
"completion_length": 253.33334350585938,
"epoch": 0.001969984992496248,
"grad_norm": 195.38546752929688,
"learning_rate": 3e-06,
"loss": -7.7222,
"reward": 0.21969321370124817,
"reward_std": 0.14372562617063522,
"rewards/sudoku_reward_func": 0.21969321370124817,
"step": 1969,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019709854927463734,
"grad_norm": 204.2120361328125,
"learning_rate": 3e-06,
"loss": -23.4097,
"step": 1970
},
{
"epoch": 0.0019719859929964983,
"grad_norm": 177.75064086914062,
"learning_rate": 3e-06,
"loss": -13.5707,
"step": 1971
},
{
"epoch": 0.001972986493246623,
"grad_norm": 199.55172729492188,
"learning_rate": 3e-06,
"loss": -13.0697,
"step": 1972
},
{
"epoch": 0.0019739869934967485,
"grad_norm": 187.18035888671875,
"learning_rate": 3e-06,
"loss": -7.5141,
"step": 1973
},
{
"epoch": 0.0019749874937468733,
"grad_norm": 167.5812225341797,
"learning_rate": 3e-06,
"loss": -23.3993,
"step": 1974
},
{
"epoch": 0.0019759879939969987,
"grad_norm": 222.2601776123047,
"learning_rate": 3e-06,
"loss": -14.8587,
"step": 1975
},
{
"epoch": 0.0019769884942471235,
"grad_norm": 158.96644592285156,
"learning_rate": 3e-06,
"loss": -11.5531,
"step": 1976
},
{
"completion_length": 255.9166717529297,
"epoch": 0.0019779889944972484,
"grad_norm": 211.06991577148438,
"learning_rate": 3e-06,
"loss": -8.5541,
"reward": 0.2523975223302841,
"reward_std": 0.17239974439144135,
"rewards/sudoku_reward_func": 0.25239749997854233,
"step": 1977,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0019789894947473737,
"grad_norm": 296.3025817871094,
"learning_rate": 3e-06,
"loss": -12.2237,
"step": 1978
},
{
"epoch": 0.0019799899949974986,
"grad_norm": 373.58056640625,
"learning_rate": 3e-06,
"loss": -15.4929,
"step": 1979
},
{
"epoch": 0.001980990495247624,
"grad_norm": 185.84808349609375,
"learning_rate": 3e-06,
"loss": -11.0735,
"step": 1980
},
{
"epoch": 0.001981990995497749,
"grad_norm": 243.6155548095703,
"learning_rate": 3e-06,
"loss": -8.8876,
"step": 1981
},
{
"epoch": 0.001982991495747874,
"grad_norm": 293.4427185058594,
"learning_rate": 3e-06,
"loss": -13.6924,
"step": 1982
},
{
"epoch": 0.001983991995997999,
"grad_norm": 197.66085815429688,
"learning_rate": 3e-06,
"loss": -18.5818,
"step": 1983
},
{
"epoch": 0.001984992496248124,
"grad_norm": 185.08505249023438,
"learning_rate": 3e-06,
"loss": -13.7895,
"step": 1984
},
{
"completion_length": 255.9166717529297,
"epoch": 0.001985992996498249,
"grad_norm": 195.1097412109375,
"learning_rate": 3e-06,
"loss": -13.2296,
"reward": 0.24486306309700012,
"reward_std": 0.1268276385962963,
"rewards/sudoku_reward_func": 0.24486306309700012,
"step": 1985,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001986993496748374,
"grad_norm": 169.93081665039062,
"learning_rate": 3e-06,
"loss": -8.3141,
"step": 1986
},
{
"epoch": 0.0019879939969984993,
"grad_norm": 173.13099670410156,
"learning_rate": 3e-06,
"loss": -10.358,
"step": 1987
},
{
"epoch": 0.0019889944972486242,
"grad_norm": 197.01596069335938,
"learning_rate": 3e-06,
"loss": -4.8024,
"step": 1988
},
{
"epoch": 0.0019899949974987495,
"grad_norm": 211.9609832763672,
"learning_rate": 3e-06,
"loss": -13.3649,
"step": 1989
},
{
"epoch": 0.0019909954977488744,
"grad_norm": 186.90805053710938,
"learning_rate": 3e-06,
"loss": -9.3327,
"step": 1990
},
{
"epoch": 0.0019919959979989993,
"grad_norm": 78.69964599609375,
"learning_rate": 3e-06,
"loss": -11.159,
"step": 1991
},
{
"epoch": 0.0019929964982491246,
"grad_norm": 201.1640167236328,
"learning_rate": 3e-06,
"loss": -5.9724,
"step": 1992
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0019939969984992495,
"grad_norm": 125.78178405761719,
"learning_rate": 3e-06,
"loss": -6.6924,
"reward": 0.2843502163887024,
"reward_std": 0.13726307824254036,
"rewards/sudoku_reward_func": 0.2843502014875412,
"step": 1993,
"zero_std_ratio": 0.0
},
{
"epoch": 0.001994997498749375,
"grad_norm": 177.04168701171875,
"learning_rate": 3e-06,
"loss": -1.6726,
"step": 1994
},
{
"epoch": 0.0019959979989994997,
"grad_norm": 305.504150390625,
"learning_rate": 3e-06,
"loss": -7.7491,
"step": 1995
},
{
"epoch": 0.001996998499249625,
"grad_norm": 113.48680877685547,
"learning_rate": 3e-06,
"loss": -0.761,
"step": 1996
},
{
"epoch": 0.00199799899949975,
"grad_norm": 108.60614013671875,
"learning_rate": 3e-06,
"loss": -7.7709,
"step": 1997
},
{
"epoch": 0.0019989994997498747,
"grad_norm": 105.45082092285156,
"learning_rate": 3e-06,
"loss": -4.1487,
"step": 1998
},
{
"epoch": 0.002,
"grad_norm": 213.99227905273438,
"learning_rate": 3e-06,
"loss": -9.8322,
"step": 1999
},
{
"epoch": 0.002001000500250125,
"grad_norm": 134.13356018066406,
"learning_rate": 3e-06,
"loss": -2.2642,
"step": 2000
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0020020010005002502,
"grad_norm": 242.3455352783203,
"learning_rate": 3e-06,
"loss": -2.1762,
"reward": 0.2152777761220932,
"reward_std": 0.161978080868721,
"rewards/sudoku_reward_func": 0.2152777686715126,
"step": 2001,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002003001500750375,
"grad_norm": 409.4090576171875,
"learning_rate": 3e-06,
"loss": -4.2201,
"step": 2002
},
{
"epoch": 0.0020040020010005004,
"grad_norm": 174.99839782714844,
"learning_rate": 3e-06,
"loss": 1.8727,
"step": 2003
},
{
"epoch": 0.0020050025012506253,
"grad_norm": 214.46160888671875,
"learning_rate": 3e-06,
"loss": 8.1319,
"step": 2004
},
{
"epoch": 0.00200600300150075,
"grad_norm": 235.8225860595703,
"learning_rate": 3e-06,
"loss": -2.0186,
"step": 2005
},
{
"epoch": 0.0020070035017508755,
"grad_norm": 435.18310546875,
"learning_rate": 3e-06,
"loss": -3.7778,
"step": 2006
},
{
"epoch": 0.0020080040020010004,
"grad_norm": 188.9642791748047,
"learning_rate": 3e-06,
"loss": 1.6308,
"step": 2007
},
{
"epoch": 0.0020090045022511257,
"grad_norm": 254.4283447265625,
"learning_rate": 3e-06,
"loss": 7.1249,
"step": 2008
},
{
"completion_length": 254.81250762939453,
"epoch": 0.0020100050025012505,
"grad_norm": 307.87603759765625,
"learning_rate": 3e-06,
"loss": 6.6596,
"reward": 0.2307787761092186,
"reward_std": 0.15682782232761383,
"rewards/sudoku_reward_func": 0.2307787761092186,
"step": 2009,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002011005502751376,
"grad_norm": 192.25518798828125,
"learning_rate": 3e-06,
"loss": -0.1586,
"step": 2010
},
{
"epoch": 0.0020120060030015007,
"grad_norm": 180.306640625,
"learning_rate": 3e-06,
"loss": 0.0719,
"step": 2011
},
{
"epoch": 0.0020130065032516256,
"grad_norm": 170.99240112304688,
"learning_rate": 3e-06,
"loss": 5.0591,
"step": 2012
},
{
"epoch": 0.002014007003501751,
"grad_norm": 346.23883056640625,
"learning_rate": 3e-06,
"loss": 4.0824,
"step": 2013
},
{
"epoch": 0.002015007503751876,
"grad_norm": 311.57666015625,
"learning_rate": 3e-06,
"loss": -3.0988,
"step": 2014
},
{
"epoch": 0.002016008004002001,
"grad_norm": 158.6090545654297,
"learning_rate": 3e-06,
"loss": -4.0589,
"step": 2015
},
{
"epoch": 0.002017008504252126,
"grad_norm": 157.3676300048828,
"learning_rate": 3e-06,
"loss": 2.9554,
"step": 2016
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0020180090045022513,
"grad_norm": 161.91995239257812,
"learning_rate": 3e-06,
"loss": -4.3946,
"reward": 0.30390965938568115,
"reward_std": 0.15090640634298325,
"rewards/sudoku_reward_func": 0.30390962958335876,
"step": 2017,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002019009504752376,
"grad_norm": 165.71839904785156,
"learning_rate": 3e-06,
"loss": -1.4257,
"step": 2018
},
{
"epoch": 0.002020010005002501,
"grad_norm": 328.2276916503906,
"learning_rate": 3e-06,
"loss": 5.6413,
"step": 2019
},
{
"epoch": 0.0020210105052526264,
"grad_norm": 206.0242919921875,
"learning_rate": 3e-06,
"loss": 6.9081,
"step": 2020
},
{
"epoch": 0.0020220110055027512,
"grad_norm": 207.88768005371094,
"learning_rate": 3e-06,
"loss": -1.5048,
"step": 2021
},
{
"epoch": 0.0020230115057528766,
"grad_norm": 157.71121215820312,
"learning_rate": 3e-06,
"loss": -2.585,
"step": 2022
},
{
"epoch": 0.0020240120060030014,
"grad_norm": 158.0244140625,
"learning_rate": 3e-06,
"loss": 2.7574,
"step": 2023
},
{
"epoch": 0.0020250125062531267,
"grad_norm": 270.9903564453125,
"learning_rate": 3e-06,
"loss": 3.6836,
"step": 2024
},
{
"completion_length": 253.5625,
"epoch": 0.0020260130065032516,
"grad_norm": 123.82396697998047,
"learning_rate": 3e-06,
"loss": 8.2977,
"reward": 0.20379765331745148,
"reward_std": 0.13752370700240135,
"rewards/sudoku_reward_func": 0.20379765331745148,
"step": 2025,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0020270135067533765,
"grad_norm": 147.5820770263672,
"learning_rate": 3e-06,
"loss": 2.0875,
"step": 2026
},
{
"epoch": 0.002028014007003502,
"grad_norm": 164.91355895996094,
"learning_rate": 3e-06,
"loss": 4.2679,
"step": 2027
},
{
"epoch": 0.0020290145072536267,
"grad_norm": 204.01260375976562,
"learning_rate": 3e-06,
"loss": 2.1948,
"step": 2028
},
{
"epoch": 0.002030015007503752,
"grad_norm": 155.6641387939453,
"learning_rate": 3e-06,
"loss": 8.1561,
"step": 2029
},
{
"epoch": 0.002031015507753877,
"grad_norm": 145.83570861816406,
"learning_rate": 3e-06,
"loss": 1.8471,
"step": 2030
},
{
"epoch": 0.002032016008004002,
"grad_norm": 129.6785888671875,
"learning_rate": 3e-06,
"loss": 3.1864,
"step": 2031
},
{
"epoch": 0.002033016508254127,
"grad_norm": 136.57089233398438,
"learning_rate": 3e-06,
"loss": 0.1599,
"step": 2032
},
{
"completion_length": 255.39584350585938,
"epoch": 0.002034017008504252,
"grad_norm": 256.8916015625,
"learning_rate": 3e-06,
"loss": 1.5327,
"reward": 0.2647569626569748,
"reward_std": 0.15292219817638397,
"rewards/sudoku_reward_func": 0.2647569477558136,
"step": 2033,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0020350175087543772,
"grad_norm": 221.5432586669922,
"learning_rate": 3e-06,
"loss": -0.6326,
"step": 2034
},
{
"epoch": 0.002036018009004502,
"grad_norm": 115.88921356201172,
"learning_rate": 3e-06,
"loss": -8.1754,
"step": 2035
},
{
"epoch": 0.0020370185092546274,
"grad_norm": 155.23406982421875,
"learning_rate": 3e-06,
"loss": -1.9255,
"step": 2036
},
{
"epoch": 0.0020380190095047523,
"grad_norm": 174.22718811035156,
"learning_rate": 3e-06,
"loss": 0.6805,
"step": 2037
},
{
"epoch": 0.0020390195097548776,
"grad_norm": 271.7087097167969,
"learning_rate": 3e-06,
"loss": -0.6905,
"step": 2038
},
{
"epoch": 0.0020400200100050025,
"grad_norm": 117.21466827392578,
"learning_rate": 3e-06,
"loss": -9.2192,
"step": 2039
},
{
"epoch": 0.0020410205102551274,
"grad_norm": 161.93284606933594,
"learning_rate": 3e-06,
"loss": -3.25,
"step": 2040
},
{
"completion_length": 253.25,
"epoch": 0.0020420210105052527,
"grad_norm": 508.3278503417969,
"learning_rate": 3e-06,
"loss": -4.7108,
"reward": 0.26471560448408127,
"reward_std": 0.17860908806324005,
"rewards/sudoku_reward_func": 0.26471560448408127,
"step": 2041,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0020430215107553776,
"grad_norm": 267.98602294921875,
"learning_rate": 3e-06,
"loss": -8.4861,
"step": 2042
},
{
"epoch": 0.002044022011005503,
"grad_norm": 243.9029083251953,
"learning_rate": 3e-06,
"loss": -3.0436,
"step": 2043
},
{
"epoch": 0.0020450225112556277,
"grad_norm": 202.64193725585938,
"learning_rate": 3e-06,
"loss": -8.469,
"step": 2044
},
{
"epoch": 0.002046023011505753,
"grad_norm": 304.8101501464844,
"learning_rate": 3e-06,
"loss": -7.1241,
"step": 2045
},
{
"epoch": 0.002047023511755878,
"grad_norm": 294.1930847167969,
"learning_rate": 3e-06,
"loss": -7.5035,
"step": 2046
},
{
"epoch": 0.002048024012006003,
"grad_norm": 237.90809631347656,
"learning_rate": 3e-06,
"loss": -5.4658,
"step": 2047
},
{
"epoch": 0.002049024512256128,
"grad_norm": 154.24185180664062,
"learning_rate": 3e-06,
"loss": -8.7881,
"step": 2048
},
{
"completion_length": 253.4375,
"epoch": 0.002050025012506253,
"grad_norm": 111.57600402832031,
"learning_rate": 3e-06,
"loss": -13.8521,
"reward": 0.22884725779294968,
"reward_std": 0.1646890565752983,
"rewards/sudoku_reward_func": 0.22884725779294968,
"step": 2049,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0020510255127563783,
"grad_norm": 177.33627319335938,
"learning_rate": 3e-06,
"loss": -8.5202,
"step": 2050
},
{
"epoch": 0.002052026013006503,
"grad_norm": 189.1322479248047,
"learning_rate": 3e-06,
"loss": -11.823,
"step": 2051
},
{
"epoch": 0.0020530265132566285,
"grad_norm": 319.8341064453125,
"learning_rate": 3e-06,
"loss": -13.3699,
"step": 2052
},
{
"epoch": 0.0020540270135067534,
"grad_norm": 97.67252349853516,
"learning_rate": 3e-06,
"loss": -14.6061,
"step": 2053
},
{
"epoch": 0.0020550275137568783,
"grad_norm": 141.40914916992188,
"learning_rate": 3e-06,
"loss": -9.0696,
"step": 2054
},
{
"epoch": 0.0020560280140070036,
"grad_norm": 165.0148468017578,
"learning_rate": 3e-06,
"loss": -13.5339,
"step": 2055
},
{
"epoch": 0.0020570285142571284,
"grad_norm": 279.2270202636719,
"learning_rate": 3e-06,
"loss": -15.6225,
"step": 2056
},
{
"completion_length": 254.58333587646484,
"epoch": 0.0020580290145072538,
"grad_norm": 149.38491821289062,
"learning_rate": 3e-06,
"loss": 6.731,
"reward": 0.2059771865606308,
"reward_std": 0.13232478126883507,
"rewards/sudoku_reward_func": 0.2059771791100502,
"step": 2057,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0020590295147573786,
"grad_norm": 115.99948120117188,
"learning_rate": 3e-06,
"loss": 4.0264,
"step": 2058
},
{
"epoch": 0.002060030015007504,
"grad_norm": 121.04714965820312,
"learning_rate": 3e-06,
"loss": 2.4383,
"step": 2059
},
{
"epoch": 0.002061030515257629,
"grad_norm": 261.35858154296875,
"learning_rate": 3e-06,
"loss": 4.7794,
"step": 2060
},
{
"epoch": 0.0020620310155077537,
"grad_norm": 161.84022521972656,
"learning_rate": 3e-06,
"loss": 6.2386,
"step": 2061
},
{
"epoch": 0.002063031515757879,
"grad_norm": 105.91159057617188,
"learning_rate": 3e-06,
"loss": 3.5483,
"step": 2062
},
{
"epoch": 0.002064032016008004,
"grad_norm": 104.31018829345703,
"learning_rate": 3e-06,
"loss": 1.5695,
"step": 2063
},
{
"epoch": 0.002065032516258129,
"grad_norm": 207.45989990234375,
"learning_rate": 3e-06,
"loss": 3.7625,
"step": 2064
},
{
"completion_length": 254.83333587646484,
"epoch": 0.002066033016508254,
"grad_norm": 109.15100860595703,
"learning_rate": 3e-06,
"loss": -15.7269,
"reward": 0.26253609359264374,
"reward_std": 0.13556809723377228,
"rewards/sudoku_reward_func": 0.26253607869148254,
"step": 2065,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0020670335167583794,
"grad_norm": 360.2762145996094,
"learning_rate": 3e-06,
"loss": -14.5243,
"step": 2066
},
{
"epoch": 0.0020680340170085043,
"grad_norm": 135.55064392089844,
"learning_rate": 3e-06,
"loss": -9.1608,
"step": 2067
},
{
"epoch": 0.002069034517258629,
"grad_norm": 163.361328125,
"learning_rate": 3e-06,
"loss": -8.2301,
"step": 2068
},
{
"epoch": 0.0020700350175087544,
"grad_norm": 125.7318115234375,
"learning_rate": 3e-06,
"loss": -16.3416,
"step": 2069
},
{
"epoch": 0.0020710355177588793,
"grad_norm": 258.9736633300781,
"learning_rate": 3e-06,
"loss": -16.9302,
"step": 2070
},
{
"epoch": 0.0020720360180090046,
"grad_norm": 140.67019653320312,
"learning_rate": 3e-06,
"loss": -10.9255,
"step": 2071
},
{
"epoch": 0.0020730365182591295,
"grad_norm": 165.7702178955078,
"learning_rate": 3e-06,
"loss": -10.728,
"step": 2072
},
{
"completion_length": 255.9375,
"epoch": 0.002074037018509255,
"grad_norm": 620.3129272460938,
"learning_rate": 3e-06,
"loss": 1.4765,
"reward": 0.25063884258270264,
"reward_std": 0.16608520597219467,
"rewards/sudoku_reward_func": 0.25063882768154144,
"step": 2073,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0020750375187593797,
"grad_norm": 172.11087036132812,
"learning_rate": 3e-06,
"loss": -14.5391,
"step": 2074
},
{
"epoch": 0.0020760380190095046,
"grad_norm": 191.67532348632812,
"learning_rate": 3e-06,
"loss": -5.3397,
"step": 2075
},
{
"epoch": 0.00207703851925963,
"grad_norm": 294.1646423339844,
"learning_rate": 3e-06,
"loss": -8.6712,
"step": 2076
},
{
"epoch": 0.0020780390195097548,
"grad_norm": 597.5181274414062,
"learning_rate": 3e-06,
"loss": 0.3632,
"step": 2077
},
{
"epoch": 0.00207903951975988,
"grad_norm": 167.82606506347656,
"learning_rate": 3e-06,
"loss": -16.7106,
"step": 2078
},
{
"epoch": 0.002080040020010005,
"grad_norm": 219.1627960205078,
"learning_rate": 3e-06,
"loss": -6.4416,
"step": 2079
},
{
"epoch": 0.0020810405202601303,
"grad_norm": 258.1119689941406,
"learning_rate": 3e-06,
"loss": -12.8399,
"step": 2080
},
{
"completion_length": 254.9375,
"epoch": 0.002082041020510255,
"grad_norm": 178.03309631347656,
"learning_rate": 3e-06,
"loss": -6.8223,
"reward": 0.23602844774723053,
"reward_std": 0.15177836269140244,
"rewards/sudoku_reward_func": 0.23602844029664993,
"step": 2081,
"zero_std_ratio": 0.0
},
{
"epoch": 0.00208304152076038,
"grad_norm": 131.65509033203125,
"learning_rate": 3e-06,
"loss": -4.6811,
"step": 2082
},
{
"epoch": 0.0020840420210105053,
"grad_norm": 318.57958984375,
"learning_rate": 3e-06,
"loss": 1.6984,
"step": 2083
},
{
"epoch": 0.00208504252126063,
"grad_norm": 277.5106506347656,
"learning_rate": 3e-06,
"loss": 2.3091,
"step": 2084
},
{
"epoch": 0.0020860430215107555,
"grad_norm": 152.53269958496094,
"learning_rate": 3e-06,
"loss": -7.6793,
"step": 2085
},
{
"epoch": 0.0020870435217608804,
"grad_norm": 236.90296936035156,
"learning_rate": 3e-06,
"loss": -5.8063,
"step": 2086
},
{
"epoch": 0.0020880440220110057,
"grad_norm": 309.64208984375,
"learning_rate": 3e-06,
"loss": -0.1638,
"step": 2087
},
{
"epoch": 0.0020890445222611306,
"grad_norm": 289.9593505859375,
"learning_rate": 3e-06,
"loss": -0.9137,
"step": 2088
},
{
"completion_length": 256.0,
"epoch": 0.0020900450225112555,
"grad_norm": 195.6490478515625,
"learning_rate": 3e-06,
"loss": 14.6669,
"reward": 0.2501089870929718,
"reward_std": 0.15444093942642212,
"rewards/sudoku_reward_func": 0.2501089796423912,
"step": 2089,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0020910455227613808,
"grad_norm": 189.63902282714844,
"learning_rate": 3e-06,
"loss": 20.3839,
"step": 2090
},
{
"epoch": 0.0020920460230115056,
"grad_norm": 183.08782958984375,
"learning_rate": 3e-06,
"loss": 22.8672,
"step": 2091
},
{
"epoch": 0.002093046523261631,
"grad_norm": 167.2587890625,
"learning_rate": 3e-06,
"loss": 16.6521,
"step": 2092
},
{
"epoch": 0.002094047023511756,
"grad_norm": 142.52476501464844,
"learning_rate": 3e-06,
"loss": 13.5591,
"step": 2093
},
{
"epoch": 0.002095047523761881,
"grad_norm": 143.21453857421875,
"learning_rate": 3e-06,
"loss": 17.2013,
"step": 2094
},
{
"epoch": 0.002096048024012006,
"grad_norm": 148.69102478027344,
"learning_rate": 3e-06,
"loss": 19.4678,
"step": 2095
},
{
"epoch": 0.002097048524262131,
"grad_norm": 120.7786865234375,
"learning_rate": 3e-06,
"loss": 14.8181,
"step": 2096
},
{
"completion_length": 253.52084350585938,
"epoch": 0.002098049024512256,
"grad_norm": 128.90467834472656,
"learning_rate": 3e-06,
"loss": -18.5774,
"reward": 0.24012070894241333,
"reward_std": 0.1463899165391922,
"rewards/sudoku_reward_func": 0.24012070894241333,
"step": 2097,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002099049524762381,
"grad_norm": 162.4427490234375,
"learning_rate": 3e-06,
"loss": -23.592,
"step": 2098
},
{
"epoch": 0.0021000500250125064,
"grad_norm": 142.89620971679688,
"learning_rate": 3e-06,
"loss": -22.294,
"step": 2099
},
{
"epoch": 0.0021010505252626313,
"grad_norm": 160.10720825195312,
"learning_rate": 3e-06,
"loss": -22.4738,
"step": 2100
},
{
"epoch": 0.0021020510255127566,
"grad_norm": 125.08181762695312,
"learning_rate": 3e-06,
"loss": -18.162,
"step": 2101
},
{
"epoch": 0.0021030515257628815,
"grad_norm": 141.7852020263672,
"learning_rate": 3e-06,
"loss": -23.9315,
"step": 2102
},
{
"epoch": 0.0021040520260130063,
"grad_norm": 120.26238250732422,
"learning_rate": 3e-06,
"loss": -23.2946,
"step": 2103
},
{
"epoch": 0.0021050525262631316,
"grad_norm": 155.07212829589844,
"learning_rate": 3e-06,
"loss": -24.0824,
"step": 2104
},
{
"completion_length": 256.0,
"epoch": 0.0021060530265132565,
"grad_norm": 159.4878387451172,
"learning_rate": 3e-06,
"loss": -12.6954,
"reward": 0.25752314925193787,
"reward_std": 0.1472446396946907,
"rewards/sudoku_reward_func": 0.25752314925193787,
"step": 2105,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002107053526763382,
"grad_norm": 103.97441101074219,
"learning_rate": 3e-06,
"loss": -11.32,
"step": 2106
},
{
"epoch": 0.0021080540270135067,
"grad_norm": 144.20181274414062,
"learning_rate": 3e-06,
"loss": -12.5571,
"step": 2107
},
{
"epoch": 0.002109054527263632,
"grad_norm": 117.40840148925781,
"learning_rate": 3e-06,
"loss": -11.9496,
"step": 2108
},
{
"epoch": 0.002110055027513757,
"grad_norm": 138.6615447998047,
"learning_rate": 3e-06,
"loss": -14.2046,
"step": 2109
},
{
"epoch": 0.0021110555277638818,
"grad_norm": 77.72933197021484,
"learning_rate": 3e-06,
"loss": -12.6948,
"step": 2110
},
{
"epoch": 0.002112056028014007,
"grad_norm": 397.400146484375,
"learning_rate": 3e-06,
"loss": -14.8182,
"step": 2111
},
{
"epoch": 0.002113056528264132,
"grad_norm": 225.19723510742188,
"learning_rate": 3e-06,
"loss": -15.2176,
"step": 2112
},
{
"completion_length": 253.6666717529297,
"epoch": 0.0021140570285142573,
"grad_norm": 125.04232788085938,
"learning_rate": 3e-06,
"loss": -4.1518,
"reward": 0.23790735751390457,
"reward_std": 0.13891858607530594,
"rewards/sudoku_reward_func": 0.23790735751390457,
"step": 2113,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002115057528764382,
"grad_norm": 134.87939453125,
"learning_rate": 3e-06,
"loss": -13.3869,
"step": 2114
},
{
"epoch": 0.0021160580290145075,
"grad_norm": 135.5575714111328,
"learning_rate": 3e-06,
"loss": -7.0319,
"step": 2115
},
{
"epoch": 0.0021170585292646323,
"grad_norm": 163.67832946777344,
"learning_rate": 3e-06,
"loss": -15.5896,
"step": 2116
},
{
"epoch": 0.0021180590295147572,
"grad_norm": 157.14891052246094,
"learning_rate": 3e-06,
"loss": -5.0652,
"step": 2117
},
{
"epoch": 0.0021190595297648825,
"grad_norm": 382.6764831542969,
"learning_rate": 3e-06,
"loss": -11.7335,
"step": 2118
},
{
"epoch": 0.0021200600300150074,
"grad_norm": 279.73651123046875,
"learning_rate": 3e-06,
"loss": -6.3247,
"step": 2119
},
{
"epoch": 0.0021210605302651327,
"grad_norm": 188.39959716796875,
"learning_rate": 3e-06,
"loss": -15.4458,
"step": 2120
},
{
"completion_length": 255.95834350585938,
"epoch": 0.0021220610305152576,
"grad_norm": 232.8545379638672,
"learning_rate": 3e-06,
"loss": -7.1581,
"reward": 0.27688343822956085,
"reward_std": 0.1465592235326767,
"rewards/sudoku_reward_func": 0.27688342332839966,
"step": 2121,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002123061530765383,
"grad_norm": 172.3257598876953,
"learning_rate": 3e-06,
"loss": -11.4882,
"step": 2122
},
{
"epoch": 0.0021240620310155078,
"grad_norm": 188.06607055664062,
"learning_rate": 3e-06,
"loss": -13.6338,
"step": 2123
},
{
"epoch": 0.0021250625312656327,
"grad_norm": 204.9393768310547,
"learning_rate": 3e-06,
"loss": -20.2453,
"step": 2124
},
{
"epoch": 0.002126063031515758,
"grad_norm": 244.41342163085938,
"learning_rate": 3e-06,
"loss": -9.4582,
"step": 2125
},
{
"epoch": 0.002127063531765883,
"grad_norm": 189.36769104003906,
"learning_rate": 3e-06,
"loss": -13.526,
"step": 2126
},
{
"epoch": 0.002128064032016008,
"grad_norm": 194.4271697998047,
"learning_rate": 3e-06,
"loss": -15.8828,
"step": 2127
},
{
"epoch": 0.002129064532266133,
"grad_norm": 203.0721435546875,
"learning_rate": 3e-06,
"loss": -22.4253,
"step": 2128
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0021300650325162583,
"grad_norm": 198.96873474121094,
"learning_rate": 3e-06,
"loss": 2.4782,
"reward": 0.3081071227788925,
"reward_std": 0.1305009424686432,
"rewards/sudoku_reward_func": 0.3081071227788925,
"step": 2129,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0021310655327663832,
"grad_norm": 225.70071411132812,
"learning_rate": 3e-06,
"loss": -3.3406,
"step": 2130
},
{
"epoch": 0.002132066033016508,
"grad_norm": 182.7887725830078,
"learning_rate": 3e-06,
"loss": -6.5365,
"step": 2131
},
{
"epoch": 0.0021330665332666334,
"grad_norm": 256.8357238769531,
"learning_rate": 3e-06,
"loss": -6.6293,
"step": 2132
},
{
"epoch": 0.0021340670335167583,
"grad_norm": 175.3928680419922,
"learning_rate": 3e-06,
"loss": 2.0084,
"step": 2133
},
{
"epoch": 0.0021350675337668836,
"grad_norm": 236.942138671875,
"learning_rate": 3e-06,
"loss": -4.0,
"step": 2134
},
{
"epoch": 0.0021360680340170085,
"grad_norm": 124.70699310302734,
"learning_rate": 3e-06,
"loss": -8.0161,
"step": 2135
},
{
"epoch": 0.0021370685342671334,
"grad_norm": 208.55020141601562,
"learning_rate": 3e-06,
"loss": -7.0063,
"step": 2136
},
{
"completion_length": 256.0,
"epoch": 0.0021380690345172587,
"grad_norm": 451.7789001464844,
"learning_rate": 3e-06,
"loss": 23.7947,
"reward": 0.22189154475927353,
"reward_std": 0.1443362608551979,
"rewards/sudoku_reward_func": 0.22189154475927353,
"step": 2137,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0021390695347673835,
"grad_norm": 239.3140106201172,
"learning_rate": 3e-06,
"loss": 24.3879,
"step": 2138
},
{
"epoch": 0.002140070035017509,
"grad_norm": 414.748779296875,
"learning_rate": 3e-06,
"loss": 29.8622,
"step": 2139
},
{
"epoch": 0.0021410705352676337,
"grad_norm": 366.05584716796875,
"learning_rate": 3e-06,
"loss": 24.7021,
"step": 2140
},
{
"epoch": 0.002142071035517759,
"grad_norm": 233.06585693359375,
"learning_rate": 3e-06,
"loss": 24.4789,
"step": 2141
},
{
"epoch": 0.002143071535767884,
"grad_norm": 322.3249816894531,
"learning_rate": 3e-06,
"loss": 22.6371,
"step": 2142
},
{
"epoch": 0.002144072036018009,
"grad_norm": 310.0884094238281,
"learning_rate": 3e-06,
"loss": 27.6157,
"step": 2143
},
{
"epoch": 0.002145072536268134,
"grad_norm": 295.13995361328125,
"learning_rate": 3e-06,
"loss": 21.8875,
"step": 2144
},
{
"completion_length": 253.62500762939453,
"epoch": 0.002146073036518259,
"grad_norm": 271.6358337402344,
"learning_rate": 3e-06,
"loss": 1.5988,
"reward": 0.22103850543498993,
"reward_std": 0.17497707903385162,
"rewards/sudoku_reward_func": 0.22103849798440933,
"step": 2145,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0021470735367683843,
"grad_norm": 412.4436950683594,
"learning_rate": 3e-06,
"loss": 11.7546,
"step": 2146
},
{
"epoch": 0.002148074037018509,
"grad_norm": 187.46017456054688,
"learning_rate": 3e-06,
"loss": 3.9364,
"step": 2147
},
{
"epoch": 0.0021490745372686345,
"grad_norm": 193.0413055419922,
"learning_rate": 3e-06,
"loss": -0.1165,
"step": 2148
},
{
"epoch": 0.0021500750375187594,
"grad_norm": 305.0658874511719,
"learning_rate": 3e-06,
"loss": 1.0991,
"step": 2149
},
{
"epoch": 0.0021510755377688842,
"grad_norm": 441.3357849121094,
"learning_rate": 3e-06,
"loss": 10.5642,
"step": 2150
},
{
"epoch": 0.0021520760380190095,
"grad_norm": 196.80613708496094,
"learning_rate": 3e-06,
"loss": 2.8893,
"step": 2151
},
{
"epoch": 0.0021530765382691344,
"grad_norm": 197.22515869140625,
"learning_rate": 3e-06,
"loss": -0.7456,
"step": 2152
},
{
"completion_length": 250.6666717529297,
"epoch": 0.0021540770385192597,
"grad_norm": 162.09518432617188,
"learning_rate": 3e-06,
"loss": -16.1398,
"reward": 0.2230752632021904,
"reward_std": 0.15181417018175125,
"rewards/sudoku_reward_func": 0.2230752483010292,
"step": 2153,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0021550775387693846,
"grad_norm": 153.7283477783203,
"learning_rate": 3e-06,
"loss": -15.0541,
"step": 2154
},
{
"epoch": 0.00215607803901951,
"grad_norm": 188.96493530273438,
"learning_rate": 3e-06,
"loss": -26.0853,
"step": 2155
},
{
"epoch": 0.002157078539269635,
"grad_norm": 177.75912475585938,
"learning_rate": 3e-06,
"loss": -5.8482,
"step": 2156
},
{
"epoch": 0.0021580790395197597,
"grad_norm": 166.35536193847656,
"learning_rate": 3e-06,
"loss": -16.6153,
"step": 2157
},
{
"epoch": 0.002159079539769885,
"grad_norm": 130.83131408691406,
"learning_rate": 3e-06,
"loss": -15.7326,
"step": 2158
},
{
"epoch": 0.00216008004002001,
"grad_norm": 237.99929809570312,
"learning_rate": 3e-06,
"loss": -24.3056,
"step": 2159
},
{
"epoch": 0.002161080540270135,
"grad_norm": 227.08934020996094,
"learning_rate": 3e-06,
"loss": -7.8361,
"step": 2160
},
{
"completion_length": 255.9166717529297,
"epoch": 0.00216208104052026,
"grad_norm": 372.7931213378906,
"learning_rate": 3e-06,
"loss": -11.8087,
"reward": 0.2559824585914612,
"reward_std": 0.12336140125989914,
"rewards/sudoku_reward_func": 0.2559824511408806,
"step": 2161,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0021630815407703854,
"grad_norm": 267.86474609375,
"learning_rate": 3e-06,
"loss": -11.9906,
"step": 2162
},
{
"epoch": 0.0021640820410205102,
"grad_norm": 111.28511810302734,
"learning_rate": 3e-06,
"loss": -13.1757,
"step": 2163
},
{
"epoch": 0.002165082541270635,
"grad_norm": 184.33836364746094,
"learning_rate": 3e-06,
"loss": -12.4034,
"step": 2164
},
{
"epoch": 0.0021660830415207604,
"grad_norm": 167.9713592529297,
"learning_rate": 3e-06,
"loss": -15.7842,
"step": 2165
},
{
"epoch": 0.0021670835417708853,
"grad_norm": 193.59280395507812,
"learning_rate": 3e-06,
"loss": -12.9642,
"step": 2166
},
{
"epoch": 0.0021680840420210106,
"grad_norm": 100.986572265625,
"learning_rate": 3e-06,
"loss": -13.2477,
"step": 2167
},
{
"epoch": 0.0021690845422711355,
"grad_norm": 180.26425170898438,
"learning_rate": 3e-06,
"loss": -13.685,
"step": 2168
},
{
"completion_length": 255.81250762939453,
"epoch": 0.002170085042521261,
"grad_norm": 257.19793701171875,
"learning_rate": 3e-06,
"loss": 1.9957,
"reward": 0.25946594774723053,
"reward_std": 0.15108813345432281,
"rewards/sudoku_reward_func": 0.25946594774723053,
"step": 2169,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0021710855427713857,
"grad_norm": 203.61459350585938,
"learning_rate": 3e-06,
"loss": 8.0578,
"step": 2170
},
{
"epoch": 0.0021720860430215106,
"grad_norm": 166.8609619140625,
"learning_rate": 3e-06,
"loss": -4.9759,
"step": 2171
},
{
"epoch": 0.002173086543271636,
"grad_norm": 230.0701446533203,
"learning_rate": 3e-06,
"loss": -0.9933,
"step": 2172
},
{
"epoch": 0.0021740870435217607,
"grad_norm": 291.85162353515625,
"learning_rate": 3e-06,
"loss": 1.5621,
"step": 2173
},
{
"epoch": 0.002175087543771886,
"grad_norm": 176.3345184326172,
"learning_rate": 3e-06,
"loss": 6.0896,
"step": 2174
},
{
"epoch": 0.002176088044022011,
"grad_norm": 147.5749969482422,
"learning_rate": 3e-06,
"loss": -6.063,
"step": 2175
},
{
"epoch": 0.0021770885442721362,
"grad_norm": 166.72695922851562,
"learning_rate": 3e-06,
"loss": -1.8773,
"step": 2176
},
{
"completion_length": 255.89583587646484,
"epoch": 0.002178089044522261,
"grad_norm": 123.5069808959961,
"learning_rate": 3e-06,
"loss": -15.7872,
"reward": 0.19102109223604202,
"reward_std": 0.13286863267421722,
"rewards/sudoku_reward_func": 0.19102108478546143,
"step": 2177,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002179089544772386,
"grad_norm": 148.22042846679688,
"learning_rate": 3e-06,
"loss": -17.1538,
"step": 2178
},
{
"epoch": 0.0021800900450225113,
"grad_norm": 216.6683807373047,
"learning_rate": 3e-06,
"loss": -19.8152,
"step": 2179
},
{
"epoch": 0.002181090545272636,
"grad_norm": 206.0824737548828,
"learning_rate": 3e-06,
"loss": -20.6436,
"step": 2180
},
{
"epoch": 0.0021820910455227615,
"grad_norm": 155.43673706054688,
"learning_rate": 3e-06,
"loss": -17.2832,
"step": 2181
},
{
"epoch": 0.0021830915457728864,
"grad_norm": 108.36864471435547,
"learning_rate": 3e-06,
"loss": -17.956,
"step": 2182
},
{
"epoch": 0.0021840920460230117,
"grad_norm": 176.97877502441406,
"learning_rate": 3e-06,
"loss": -21.1888,
"step": 2183
},
{
"epoch": 0.0021850925462731366,
"grad_norm": 140.15602111816406,
"learning_rate": 3e-06,
"loss": -22.7284,
"step": 2184
},
{
"completion_length": 256.0,
"epoch": 0.0021860930465232614,
"grad_norm": 201.99246215820312,
"learning_rate": 3e-06,
"loss": -2.748,
"reward": 0.229414701461792,
"reward_std": 0.09967641159892082,
"rewards/sudoku_reward_func": 0.2294146865606308,
"step": 2185,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0021870935467733867,
"grad_norm": 114.0577621459961,
"learning_rate": 3e-06,
"loss": -4.1496,
"step": 2186
},
{
"epoch": 0.0021880940470235116,
"grad_norm": 118.37448120117188,
"learning_rate": 3e-06,
"loss": -9.0165,
"step": 2187
},
{
"epoch": 0.002189094547273637,
"grad_norm": 160.56996154785156,
"learning_rate": 3e-06,
"loss": -3.3018,
"step": 2188
},
{
"epoch": 0.002190095047523762,
"grad_norm": 196.15040588378906,
"learning_rate": 3e-06,
"loss": -5.2191,
"step": 2189
},
{
"epoch": 0.002191095547773887,
"grad_norm": 166.47886657714844,
"learning_rate": 3e-06,
"loss": -5.5121,
"step": 2190
},
{
"epoch": 0.002192096048024012,
"grad_norm": 133.74618530273438,
"learning_rate": 3e-06,
"loss": -10.0543,
"step": 2191
},
{
"epoch": 0.002193096548274137,
"grad_norm": 275.3774108886719,
"learning_rate": 3e-06,
"loss": -6.3413,
"step": 2192
},
{
"completion_length": 255.93750762939453,
"epoch": 0.002194097048524262,
"grad_norm": 241.90306091308594,
"learning_rate": 3e-06,
"loss": -25.2367,
"reward": 0.2638888955116272,
"reward_std": 0.1872684732079506,
"rewards/sudoku_reward_func": 0.2638888955116272,
"step": 2193,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002195097548774387,
"grad_norm": 348.1951904296875,
"learning_rate": 3e-06,
"loss": -10.1058,
"step": 2194
},
{
"epoch": 0.0021960980490245124,
"grad_norm": 345.2223205566406,
"learning_rate": 3e-06,
"loss": -28.1819,
"step": 2195
},
{
"epoch": 0.0021970985492746373,
"grad_norm": 194.86376953125,
"learning_rate": 3e-06,
"loss": -12.9991,
"step": 2196
},
{
"epoch": 0.0021980990495247626,
"grad_norm": 380.6177978515625,
"learning_rate": 3e-06,
"loss": -25.8226,
"step": 2197
},
{
"epoch": 0.0021990995497748874,
"grad_norm": 203.87657165527344,
"learning_rate": 3e-06,
"loss": -11.971,
"step": 2198
},
{
"epoch": 0.0022001000500250123,
"grad_norm": 202.9221649169922,
"learning_rate": 3e-06,
"loss": -32.1544,
"step": 2199
},
{
"epoch": 0.0022011005502751376,
"grad_norm": 315.40374755859375,
"learning_rate": 3e-06,
"loss": -13.7635,
"step": 2200
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0022021010505252625,
"grad_norm": 392.06121826171875,
"learning_rate": 3e-06,
"loss": -4.4377,
"reward": 0.26651185750961304,
"reward_std": 0.11879193782806396,
"rewards/sudoku_reward_func": 0.26651184260845184,
"step": 2201,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002203101550775388,
"grad_norm": 110.69683837890625,
"learning_rate": 3e-06,
"loss": -6.0209,
"step": 2202
},
{
"epoch": 0.0022041020510255127,
"grad_norm": 144.53733825683594,
"learning_rate": 3e-06,
"loss": -7.4633,
"step": 2203
},
{
"epoch": 0.002205102551275638,
"grad_norm": 192.95216369628906,
"learning_rate": 3e-06,
"loss": -7.0989,
"step": 2204
},
{
"epoch": 0.002206103051525763,
"grad_norm": 206.1815948486328,
"learning_rate": 3e-06,
"loss": -9.3645,
"step": 2205
},
{
"epoch": 0.0022071035517758878,
"grad_norm": 107.30200958251953,
"learning_rate": 3e-06,
"loss": -7.0976,
"step": 2206
},
{
"epoch": 0.002208104052026013,
"grad_norm": 149.86318969726562,
"learning_rate": 3e-06,
"loss": -9.3709,
"step": 2207
},
{
"epoch": 0.002209104552276138,
"grad_norm": 229.2109375,
"learning_rate": 3e-06,
"loss": -7.4561,
"step": 2208
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0022101050525262633,
"grad_norm": 305.0406188964844,
"learning_rate": 3e-06,
"loss": 2.6266,
"reward": 0.2063116356730461,
"reward_std": 0.1306929923593998,
"rewards/sudoku_reward_func": 0.2063116356730461,
"step": 2209,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002211105552776388,
"grad_norm": 203.2418670654297,
"learning_rate": 3e-06,
"loss": 6.8086,
"step": 2210
},
{
"epoch": 0.0022121060530265134,
"grad_norm": 121.47610473632812,
"learning_rate": 3e-06,
"loss": 6.2834,
"step": 2211
},
{
"epoch": 0.0022131065532766383,
"grad_norm": 227.87969970703125,
"learning_rate": 3e-06,
"loss": 11.4717,
"step": 2212
},
{
"epoch": 0.002214107053526763,
"grad_norm": 240.4669189453125,
"learning_rate": 3e-06,
"loss": 1.7777,
"step": 2213
},
{
"epoch": 0.0022151075537768885,
"grad_norm": 134.55894470214844,
"learning_rate": 3e-06,
"loss": 6.5632,
"step": 2214
},
{
"epoch": 0.0022161080540270134,
"grad_norm": 113.4019546508789,
"learning_rate": 3e-06,
"loss": 5.987,
"step": 2215
},
{
"epoch": 0.0022171085542771387,
"grad_norm": 127.93946075439453,
"learning_rate": 3e-06,
"loss": 10.0904,
"step": 2216
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0022181090545272636,
"grad_norm": 160.65890502929688,
"learning_rate": 3e-06,
"loss": -14.7841,
"reward": 0.25946594774723053,
"reward_std": 0.1456310823559761,
"rewards/sudoku_reward_func": 0.25946593284606934,
"step": 2217,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002219109554777389,
"grad_norm": 150.86474609375,
"learning_rate": 3e-06,
"loss": -12.5941,
"step": 2218
},
{
"epoch": 0.0022201100550275138,
"grad_norm": 130.49571228027344,
"learning_rate": 3e-06,
"loss": -6.8444,
"step": 2219
},
{
"epoch": 0.0022211105552776386,
"grad_norm": 211.9480743408203,
"learning_rate": 3e-06,
"loss": -9.5312,
"step": 2220
},
{
"epoch": 0.002222111055527764,
"grad_norm": 160.16744995117188,
"learning_rate": 3e-06,
"loss": -16.0951,
"step": 2221
},
{
"epoch": 0.002223111555777889,
"grad_norm": 140.18873596191406,
"learning_rate": 3e-06,
"loss": -13.801,
"step": 2222
},
{
"epoch": 0.002224112056028014,
"grad_norm": 130.58078002929688,
"learning_rate": 3e-06,
"loss": -8.401,
"step": 2223
},
{
"epoch": 0.002225112556278139,
"grad_norm": 153.523193359375,
"learning_rate": 3e-06,
"loss": -10.8995,
"step": 2224
},
{
"completion_length": 255.93750762939453,
"epoch": 0.0022261130565282643,
"grad_norm": 314.0987243652344,
"learning_rate": 3e-06,
"loss": 56.8432,
"reward": 0.2909226268529892,
"reward_std": 0.15687301754951477,
"rewards/sudoku_reward_func": 0.2909226268529892,
"step": 2225,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002227113556778389,
"grad_norm": 327.9870300292969,
"learning_rate": 3e-06,
"loss": 52.9292,
"step": 2226
},
{
"epoch": 0.002228114057028514,
"grad_norm": 322.2725830078125,
"learning_rate": 3e-06,
"loss": 50.9084,
"step": 2227
},
{
"epoch": 0.0022291145572786394,
"grad_norm": 303.08941650390625,
"learning_rate": 3e-06,
"loss": 49.6696,
"step": 2228
},
{
"epoch": 0.0022301150575287643,
"grad_norm": 304.82855224609375,
"learning_rate": 3e-06,
"loss": 55.3564,
"step": 2229
},
{
"epoch": 0.0022311155577788896,
"grad_norm": 333.8318786621094,
"learning_rate": 3e-06,
"loss": 49.5088,
"step": 2230
},
{
"epoch": 0.0022321160580290145,
"grad_norm": 315.6372375488281,
"learning_rate": 3e-06,
"loss": 46.0253,
"step": 2231
},
{
"epoch": 0.0022331165582791398,
"grad_norm": 282.1758117675781,
"learning_rate": 3e-06,
"loss": 43.9942,
"step": 2232
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0022341170585292646,
"grad_norm": 149.5388946533203,
"learning_rate": 3e-06,
"loss": 19.3477,
"reward": 0.30179397761821747,
"reward_std": 0.1400880292057991,
"rewards/sudoku_reward_func": 0.30179397761821747,
"step": 2233,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0022351175587793895,
"grad_norm": 148.94393920898438,
"learning_rate": 3e-06,
"loss": 18.7291,
"step": 2234
},
{
"epoch": 0.002236118059029515,
"grad_norm": 110.72798156738281,
"learning_rate": 3e-06,
"loss": 15.4445,
"step": 2235
},
{
"epoch": 0.0022371185592796397,
"grad_norm": 127.52186584472656,
"learning_rate": 3e-06,
"loss": 14.8732,
"step": 2236
},
{
"epoch": 0.002238119059529765,
"grad_norm": 142.9965057373047,
"learning_rate": 3e-06,
"loss": 17.0905,
"step": 2237
},
{
"epoch": 0.00223911955977989,
"grad_norm": 118.69670867919922,
"learning_rate": 3e-06,
"loss": 16.1556,
"step": 2238
},
{
"epoch": 0.002240120060030015,
"grad_norm": 104.7170639038086,
"learning_rate": 3e-06,
"loss": 13.162,
"step": 2239
},
{
"epoch": 0.00224112056028014,
"grad_norm": 127.44586944580078,
"learning_rate": 3e-06,
"loss": 12.5951,
"step": 2240
},
{
"completion_length": 255.9791717529297,
"epoch": 0.002242121060530265,
"grad_norm": 173.59176635742188,
"learning_rate": 3e-06,
"loss": -16.5816,
"reward": 0.25268685817718506,
"reward_std": 0.145609550178051,
"rewards/sudoku_reward_func": 0.25268684327602386,
"step": 2241,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0022431215607803903,
"grad_norm": 95.00807189941406,
"learning_rate": 3e-06,
"loss": -11.9075,
"step": 2242
},
{
"epoch": 0.002244122061030515,
"grad_norm": 94.31062316894531,
"learning_rate": 3e-06,
"loss": -14.3515,
"step": 2243
},
{
"epoch": 0.0022451225612806405,
"grad_norm": 93.2145767211914,
"learning_rate": 3e-06,
"loss": -17.7514,
"step": 2244
},
{
"epoch": 0.0022461230615307653,
"grad_norm": 176.99551391601562,
"learning_rate": 3e-06,
"loss": -16.3497,
"step": 2245
},
{
"epoch": 0.0022471235617808906,
"grad_norm": 84.77751159667969,
"learning_rate": 3e-06,
"loss": -12.8933,
"step": 2246
},
{
"epoch": 0.0022481240620310155,
"grad_norm": 103.69670104980469,
"learning_rate": 3e-06,
"loss": -15.0246,
"step": 2247
},
{
"epoch": 0.0022491245622811404,
"grad_norm": 102.71160888671875,
"learning_rate": 3e-06,
"loss": -18.5848,
"step": 2248
},
{
"completion_length": 256.0,
"epoch": 0.0022501250625312657,
"grad_norm": 118.6711196899414,
"learning_rate": 3e-06,
"loss": 23.5176,
"reward": 0.27385087311267853,
"reward_std": 0.16272394359111786,
"rewards/sudoku_reward_func": 0.27385087311267853,
"step": 2249,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0022511255627813906,
"grad_norm": 122.66015625,
"learning_rate": 3e-06,
"loss": 23.9114,
"step": 2250
},
{
"epoch": 0.002252126063031516,
"grad_norm": 324.5727844238281,
"learning_rate": 3e-06,
"loss": 23.4655,
"step": 2251
},
{
"epoch": 0.0022531265632816408,
"grad_norm": 139.6287384033203,
"learning_rate": 3e-06,
"loss": 21.8016,
"step": 2252
},
{
"epoch": 0.002254127063531766,
"grad_norm": 131.1999053955078,
"learning_rate": 3e-06,
"loss": 22.3795,
"step": 2253
},
{
"epoch": 0.002255127563781891,
"grad_norm": 122.19425964355469,
"learning_rate": 3e-06,
"loss": 21.7393,
"step": 2254
},
{
"epoch": 0.002256128064032016,
"grad_norm": 217.8507080078125,
"learning_rate": 3e-06,
"loss": 17.6008,
"step": 2255
},
{
"epoch": 0.002257128564282141,
"grad_norm": 320.115234375,
"learning_rate": 3e-06,
"loss": 19.0938,
"step": 2256
},
{
"completion_length": 254.8541717529297,
"epoch": 0.002258129064532266,
"grad_norm": 109.14167022705078,
"learning_rate": 3e-06,
"loss": -16.7314,
"reward": 0.22406356036663055,
"reward_std": 0.14896760880947113,
"rewards/sudoku_reward_func": 0.22406355291604996,
"step": 2257,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0022591295647823913,
"grad_norm": 276.05255126953125,
"learning_rate": 3e-06,
"loss": -10.0484,
"step": 2258
},
{
"epoch": 0.002260130065032516,
"grad_norm": 165.75491333007812,
"learning_rate": 3e-06,
"loss": -9.5876,
"step": 2259
},
{
"epoch": 0.0022611305652826415,
"grad_norm": 257.5172424316406,
"learning_rate": 3e-06,
"loss": -11.2125,
"step": 2260
},
{
"epoch": 0.0022621310655327664,
"grad_norm": 170.18350219726562,
"learning_rate": 3e-06,
"loss": -17.6049,
"step": 2261
},
{
"epoch": 0.0022631315657828913,
"grad_norm": 217.0912628173828,
"learning_rate": 3e-06,
"loss": -15.0741,
"step": 2262
},
{
"epoch": 0.0022641320660330166,
"grad_norm": 310.0478820800781,
"learning_rate": 3e-06,
"loss": -11.486,
"step": 2263
},
{
"epoch": 0.0022651325662831415,
"grad_norm": 233.90057373046875,
"learning_rate": 3e-06,
"loss": -17.2378,
"step": 2264
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0022661330665332668,
"grad_norm": 261.5735168457031,
"learning_rate": 3e-06,
"loss": 1.2377,
"reward": 0.2796379029750824,
"reward_std": 0.15749312937259674,
"rewards/sudoku_reward_func": 0.2796379029750824,
"step": 2265,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0022671335667833917,
"grad_norm": 186.02745056152344,
"learning_rate": 3e-06,
"loss": -0.9633,
"step": 2266
},
{
"epoch": 0.002268134067033517,
"grad_norm": 366.9317321777344,
"learning_rate": 3e-06,
"loss": 5.7376,
"step": 2267
},
{
"epoch": 0.002269134567283642,
"grad_norm": 285.3451843261719,
"learning_rate": 3e-06,
"loss": 11.1022,
"step": 2268
},
{
"epoch": 0.0022701350675337667,
"grad_norm": 247.7568359375,
"learning_rate": 3e-06,
"loss": -0.5247,
"step": 2269
},
{
"epoch": 0.002271135567783892,
"grad_norm": 214.0550079345703,
"learning_rate": 3e-06,
"loss": -4.5155,
"step": 2270
},
{
"epoch": 0.002272136068034017,
"grad_norm": 144.87843322753906,
"learning_rate": 3e-06,
"loss": 4.6787,
"step": 2271
},
{
"epoch": 0.0022731365682841422,
"grad_norm": 312.683837890625,
"learning_rate": 3e-06,
"loss": 6.7345,
"step": 2272
},
{
"completion_length": 255.87500762939453,
"epoch": 0.002274137068534267,
"grad_norm": 90.921630859375,
"learning_rate": 3e-06,
"loss": -17.0006,
"reward": 0.19374174624681473,
"reward_std": 0.09137369692325592,
"rewards/sudoku_reward_func": 0.19374173879623413,
"step": 2273,
"zero_std_ratio": 0.125
},
{
"epoch": 0.0022751375687843924,
"grad_norm": 156.21347045898438,
"learning_rate": 3e-06,
"loss": -14.9885,
"step": 2274
},
{
"epoch": 0.0022761380690345173,
"grad_norm": 146.11834716796875,
"learning_rate": 3e-06,
"loss": -11.7141,
"step": 2275
},
{
"epoch": 0.002277138569284642,
"grad_norm": 125.28398895263672,
"learning_rate": 3e-06,
"loss": -13.9846,
"step": 2276
},
{
"epoch": 0.0022781390695347675,
"grad_norm": 136.9147491455078,
"learning_rate": 3e-06,
"loss": -17.4603,
"step": 2277
},
{
"epoch": 0.0022791395697848923,
"grad_norm": 116.94140625,
"learning_rate": 3e-06,
"loss": -14.504,
"step": 2278
},
{
"epoch": 0.0022801400700350177,
"grad_norm": 135.58804321289062,
"learning_rate": 3e-06,
"loss": -13.5907,
"step": 2279
},
{
"epoch": 0.0022811405702851425,
"grad_norm": 88.2119369506836,
"learning_rate": 3e-06,
"loss": -14.4023,
"step": 2280
},
{
"completion_length": 255.45833587646484,
"epoch": 0.002282141070535268,
"grad_norm": 322.60455322265625,
"learning_rate": 3e-06,
"loss": 1.4808,
"reward": 0.28232476115226746,
"reward_std": 0.16080554574728012,
"rewards/sudoku_reward_func": 0.28232474625110626,
"step": 2281,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0022831415707853927,
"grad_norm": 130.08282470703125,
"learning_rate": 3e-06,
"loss": -8.7648,
"step": 2282
},
{
"epoch": 0.0022841420710355176,
"grad_norm": 200.008056640625,
"learning_rate": 3e-06,
"loss": 3.0487,
"step": 2283
},
{
"epoch": 0.002285142571285643,
"grad_norm": 159.8165283203125,
"learning_rate": 3e-06,
"loss": 5.1691,
"step": 2284
},
{
"epoch": 0.002286143071535768,
"grad_norm": 409.3653869628906,
"learning_rate": 3e-06,
"loss": 2.0149,
"step": 2285
},
{
"epoch": 0.002287143571785893,
"grad_norm": 121.8490219116211,
"learning_rate": 3e-06,
"loss": -9.2563,
"step": 2286
},
{
"epoch": 0.002288144072036018,
"grad_norm": 172.1815185546875,
"learning_rate": 3e-06,
"loss": 0.5314,
"step": 2287
},
{
"epoch": 0.002289144572286143,
"grad_norm": 169.503662109375,
"learning_rate": 3e-06,
"loss": 4.3363,
"step": 2288
},
{
"completion_length": 255.83333587646484,
"epoch": 0.002290145072536268,
"grad_norm": 158.5614776611328,
"learning_rate": 3e-06,
"loss": -3.7874,
"reward": 0.23768188804388046,
"reward_std": 0.13747499138116837,
"rewards/sudoku_reward_func": 0.23768187314271927,
"step": 2289,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002291145572786393,
"grad_norm": 243.24232482910156,
"learning_rate": 3e-06,
"loss": 2.901,
"step": 2290
},
{
"epoch": 0.0022921460730365184,
"grad_norm": 171.938720703125,
"learning_rate": 3e-06,
"loss": 3.3795,
"step": 2291
},
{
"epoch": 0.0022931465732866432,
"grad_norm": 224.5988311767578,
"learning_rate": 3e-06,
"loss": 3.9502,
"step": 2292
},
{
"epoch": 0.0022941470735367685,
"grad_norm": 158.8858642578125,
"learning_rate": 3e-06,
"loss": -4.3771,
"step": 2293
},
{
"epoch": 0.0022951475737868934,
"grad_norm": 196.80746459960938,
"learning_rate": 3e-06,
"loss": 2.5411,
"step": 2294
},
{
"epoch": 0.0022961480740370183,
"grad_norm": 126.51914978027344,
"learning_rate": 3e-06,
"loss": 4.0453,
"step": 2295
},
{
"epoch": 0.0022971485742871436,
"grad_norm": 195.58306884765625,
"learning_rate": 3e-06,
"loss": 2.4945,
"step": 2296
},
{
"completion_length": 255.89583587646484,
"epoch": 0.0022981490745372685,
"grad_norm": 200.89503479003906,
"learning_rate": 3e-06,
"loss": -19.6789,
"reward": 0.2456597313284874,
"reward_std": 0.16119515150785446,
"rewards/sudoku_reward_func": 0.2456597313284874,
"step": 2297,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002299149574787394,
"grad_norm": 190.7845001220703,
"learning_rate": 3e-06,
"loss": -20.1583,
"step": 2298
},
{
"epoch": 0.0023001500750375187,
"grad_norm": 211.36935424804688,
"learning_rate": 3e-06,
"loss": -17.8248,
"step": 2299
},
{
"epoch": 0.002301150575287644,
"grad_norm": 210.21969604492188,
"learning_rate": 3e-06,
"loss": -21.7427,
"step": 2300
},
{
"epoch": 0.002302151075537769,
"grad_norm": 282.9001159667969,
"learning_rate": 3e-06,
"loss": -21.4653,
"step": 2301
},
{
"epoch": 0.0023031515757878937,
"grad_norm": 310.6829833984375,
"learning_rate": 3e-06,
"loss": -22.4297,
"step": 2302
},
{
"epoch": 0.002304152076038019,
"grad_norm": 210.1687774658203,
"learning_rate": 3e-06,
"loss": -18.4755,
"step": 2303
},
{
"epoch": 0.002305152576288144,
"grad_norm": 190.2571258544922,
"learning_rate": 3e-06,
"loss": -24.6334,
"step": 2304
},
{
"completion_length": 255.81250762939453,
"epoch": 0.0023061530765382692,
"grad_norm": 242.8677215576172,
"learning_rate": 3e-06,
"loss": 8.2897,
"reward": 0.23668982088565826,
"reward_std": 0.144187830388546,
"rewards/sudoku_reward_func": 0.23668982088565826,
"step": 2305,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002307153576788394,
"grad_norm": 216.795654296875,
"learning_rate": 3e-06,
"loss": 3.4369,
"step": 2306
},
{
"epoch": 0.0023081540770385194,
"grad_norm": 218.32254028320312,
"learning_rate": 3e-06,
"loss": -1.9715,
"step": 2307
},
{
"epoch": 0.0023091545772886443,
"grad_norm": 195.0783233642578,
"learning_rate": 3e-06,
"loss": 5.2904,
"step": 2308
},
{
"epoch": 0.002310155077538769,
"grad_norm": 202.93618774414062,
"learning_rate": 3e-06,
"loss": 8.8413,
"step": 2309
},
{
"epoch": 0.0023111555777888945,
"grad_norm": 306.6898498535156,
"learning_rate": 3e-06,
"loss": 2.429,
"step": 2310
},
{
"epoch": 0.0023121560780390194,
"grad_norm": 217.85650634765625,
"learning_rate": 3e-06,
"loss": -3.4418,
"step": 2311
},
{
"epoch": 0.0023131565782891447,
"grad_norm": 293.9922790527344,
"learning_rate": 3e-06,
"loss": 4.8448,
"step": 2312
},
{
"completion_length": 255.7916717529297,
"epoch": 0.0023141570785392696,
"grad_norm": 298.2754211425781,
"learning_rate": 3e-06,
"loss": -6.4534,
"reward": 0.24974070489406586,
"reward_std": 0.1554877981543541,
"rewards/sudoku_reward_func": 0.24974069744348526,
"step": 2313,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002315157578789395,
"grad_norm": 471.14935302734375,
"learning_rate": 3e-06,
"loss": -15.3898,
"step": 2314
},
{
"epoch": 0.0023161580790395197,
"grad_norm": 694.3447265625,
"learning_rate": 3e-06,
"loss": -5.2157,
"step": 2315
},
{
"epoch": 0.0023171585792896446,
"grad_norm": 415.9455261230469,
"learning_rate": 3e-06,
"loss": -21.4852,
"step": 2316
},
{
"epoch": 0.00231815907953977,
"grad_norm": 309.5008850097656,
"learning_rate": 3e-06,
"loss": -7.4771,
"step": 2317
},
{
"epoch": 0.002319159579789895,
"grad_norm": 368.5068664550781,
"learning_rate": 3e-06,
"loss": -14.7027,
"step": 2318
},
{
"epoch": 0.00232016008004002,
"grad_norm": 254.92578125,
"learning_rate": 3e-06,
"loss": -3.7084,
"step": 2319
},
{
"epoch": 0.002321160580290145,
"grad_norm": 579.2901000976562,
"learning_rate": 3e-06,
"loss": -19.9798,
"step": 2320
},
{
"completion_length": 255.87500762939453,
"epoch": 0.0023221610805402703,
"grad_norm": 389.4748840332031,
"learning_rate": 3e-06,
"loss": -23.0951,
"reward": 0.2786458432674408,
"reward_std": 0.18757472187280655,
"rewards/sudoku_reward_func": 0.2786458134651184,
"step": 2321,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002323161580790395,
"grad_norm": 415.79949951171875,
"learning_rate": 3e-06,
"loss": -9.136,
"step": 2322
},
{
"epoch": 0.00232416208104052,
"grad_norm": 173.99655151367188,
"learning_rate": 3e-06,
"loss": -26.185,
"step": 2323
},
{
"epoch": 0.0023251625812906454,
"grad_norm": 273.4259338378906,
"learning_rate": 3e-06,
"loss": -19.4097,
"step": 2324
},
{
"epoch": 0.0023261630815407702,
"grad_norm": 391.3758239746094,
"learning_rate": 3e-06,
"loss": -25.6756,
"step": 2325
},
{
"epoch": 0.0023271635817908956,
"grad_norm": 349.4239501953125,
"learning_rate": 3e-06,
"loss": -12.373,
"step": 2326
},
{
"epoch": 0.0023281640820410204,
"grad_norm": 206.3245849609375,
"learning_rate": 3e-06,
"loss": -26.9021,
"step": 2327
},
{
"epoch": 0.0023291645822911457,
"grad_norm": 213.48085021972656,
"learning_rate": 3e-06,
"loss": -21.3958,
"step": 2328
},
{
"completion_length": 255.83333587646484,
"epoch": 0.0023301650825412706,
"grad_norm": 269.2904968261719,
"learning_rate": 3e-06,
"loss": 0.1758,
"reward": 0.2615740895271301,
"reward_std": 0.14590194076299667,
"rewards/sudoku_reward_func": 0.2615740895271301,
"step": 2329,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0023311655827913955,
"grad_norm": 259.6731262207031,
"learning_rate": 3e-06,
"loss": 3.9599,
"step": 2330
},
{
"epoch": 0.002332166083041521,
"grad_norm": 188.71871948242188,
"learning_rate": 3e-06,
"loss": 5.147,
"step": 2331
},
{
"epoch": 0.0023331665832916457,
"grad_norm": 129.87255859375,
"learning_rate": 3e-06,
"loss": -5.0955,
"step": 2332
},
{
"epoch": 0.002334167083541771,
"grad_norm": 193.02645874023438,
"learning_rate": 3e-06,
"loss": -3.1335,
"step": 2333
},
{
"epoch": 0.002335167583791896,
"grad_norm": 289.1039123535156,
"learning_rate": 3e-06,
"loss": -0.141,
"step": 2334
},
{
"epoch": 0.002336168084042021,
"grad_norm": 291.5890808105469,
"learning_rate": 3e-06,
"loss": 0.3956,
"step": 2335
},
{
"epoch": 0.002337168584292146,
"grad_norm": 284.3049011230469,
"learning_rate": 3e-06,
"loss": -5.9039,
"step": 2336
},
{
"completion_length": 255.9166717529297,
"epoch": 0.002338169084542271,
"grad_norm": 174.27735900878906,
"learning_rate": 3e-06,
"loss": 13.7326,
"reward": 0.23280423879623413,
"reward_std": 0.12118640542030334,
"rewards/sudoku_reward_func": 0.23280422389507294,
"step": 2337,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0023391695847923962,
"grad_norm": 557.00146484375,
"learning_rate": 3e-06,
"loss": 7.0329,
"step": 2338
},
{
"epoch": 0.002340170085042521,
"grad_norm": 125.9331283569336,
"learning_rate": 3e-06,
"loss": 11.8082,
"step": 2339
},
{
"epoch": 0.0023411705852926464,
"grad_norm": 334.607666015625,
"learning_rate": 3e-06,
"loss": 6.1679,
"step": 2340
},
{
"epoch": 0.0023421710855427713,
"grad_norm": 169.49258422851562,
"learning_rate": 3e-06,
"loss": 12.367,
"step": 2341
},
{
"epoch": 0.0023431715857928966,
"grad_norm": 193.27243041992188,
"learning_rate": 3e-06,
"loss": 3.7953,
"step": 2342
},
{
"epoch": 0.0023441720860430215,
"grad_norm": 172.27786254882812,
"learning_rate": 3e-06,
"loss": 10.0385,
"step": 2343
},
{
"epoch": 0.0023451725862931464,
"grad_norm": 208.5091552734375,
"learning_rate": 3e-06,
"loss": 6.3254,
"step": 2344
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0023461730865432717,
"grad_norm": 127.00196838378906,
"learning_rate": 3e-06,
"loss": -3.2699,
"reward": 0.2292906790971756,
"reward_std": 0.11905381456017494,
"rewards/sudoku_reward_func": 0.2292906790971756,
"step": 2345,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0023471735867933966,
"grad_norm": 210.29026794433594,
"learning_rate": 3e-06,
"loss": -4.7211,
"step": 2346
},
{
"epoch": 0.002348174087043522,
"grad_norm": 168.79876708984375,
"learning_rate": 3e-06,
"loss": -6.6915,
"step": 2347
},
{
"epoch": 0.0023491745872936468,
"grad_norm": 204.80516052246094,
"learning_rate": 3e-06,
"loss": -9.3385,
"step": 2348
},
{
"epoch": 0.002350175087543772,
"grad_norm": 187.41796875,
"learning_rate": 3e-06,
"loss": -5.4357,
"step": 2349
},
{
"epoch": 0.002351175587793897,
"grad_norm": 166.33779907226562,
"learning_rate": 3e-06,
"loss": -6.8552,
"step": 2350
},
{
"epoch": 0.002352176088044022,
"grad_norm": 246.25677490234375,
"learning_rate": 3e-06,
"loss": -9.6765,
"step": 2351
},
{
"epoch": 0.002353176588294147,
"grad_norm": 263.0240783691406,
"learning_rate": 3e-06,
"loss": -11.8961,
"step": 2352
},
{
"completion_length": 255.9375,
"epoch": 0.002354177088544272,
"grad_norm": 303.3554382324219,
"learning_rate": 3e-06,
"loss": -23.6641,
"reward": 0.24605431407690048,
"reward_std": 0.13267472386360168,
"rewards/sudoku_reward_func": 0.2460542991757393,
"step": 2353,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0023551775887943973,
"grad_norm": 218.33726501464844,
"learning_rate": 3e-06,
"loss": -23.8785,
"step": 2354
},
{
"epoch": 0.002356178089044522,
"grad_norm": 204.11302185058594,
"learning_rate": 3e-06,
"loss": -10.7146,
"step": 2355
},
{
"epoch": 0.0023571785892946475,
"grad_norm": 155.26736450195312,
"learning_rate": 3e-06,
"loss": -28.534,
"step": 2356
},
{
"epoch": 0.0023581790895447724,
"grad_norm": 188.44581604003906,
"learning_rate": 3e-06,
"loss": -22.4155,
"step": 2357
},
{
"epoch": 0.0023591795897948973,
"grad_norm": 332.4205017089844,
"learning_rate": 3e-06,
"loss": -23.8225,
"step": 2358
},
{
"epoch": 0.0023601800900450226,
"grad_norm": 246.54193115234375,
"learning_rate": 3e-06,
"loss": -13.5497,
"step": 2359
},
{
"epoch": 0.0023611805902951474,
"grad_norm": 184.51052856445312,
"learning_rate": 3e-06,
"loss": -29.9814,
"step": 2360
},
{
"completion_length": 256.0,
"epoch": 0.0023621810905452728,
"grad_norm": 254.96438598632812,
"learning_rate": 3e-06,
"loss": -9.346,
"reward": 0.2498760148882866,
"reward_std": 0.16076447814702988,
"rewards/sudoku_reward_func": 0.2498759999871254,
"step": 2361,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0023631815907953976,
"grad_norm": 354.0221252441406,
"learning_rate": 3e-06,
"loss": -2.763,
"step": 2362
},
{
"epoch": 0.002364182091045523,
"grad_norm": 464.5136413574219,
"learning_rate": 3e-06,
"loss": -4.9485,
"step": 2363
},
{
"epoch": 0.002365182591295648,
"grad_norm": 256.3653869628906,
"learning_rate": 3e-06,
"loss": -5.8165,
"step": 2364
},
{
"epoch": 0.0023661830915457727,
"grad_norm": 371.20037841796875,
"learning_rate": 3e-06,
"loss": -12.4337,
"step": 2365
},
{
"epoch": 0.002367183591795898,
"grad_norm": 299.24871826171875,
"learning_rate": 3e-06,
"loss": -4.3339,
"step": 2366
},
{
"epoch": 0.002368184092046023,
"grad_norm": 494.33038330078125,
"learning_rate": 3e-06,
"loss": -6.8858,
"step": 2367
},
{
"epoch": 0.002369184592296148,
"grad_norm": 150.08218383789062,
"learning_rate": 3e-06,
"loss": -6.5177,
"step": 2368
},
{
"completion_length": 255.95833587646484,
"epoch": 0.002370185092546273,
"grad_norm": 261.2372741699219,
"learning_rate": 3e-06,
"loss": 6.1855,
"reward": 0.2589285746216774,
"reward_std": 0.1584344282746315,
"rewards/sudoku_reward_func": 0.2589285746216774,
"step": 2369,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0023711855927963984,
"grad_norm": 194.16799926757812,
"learning_rate": 3e-06,
"loss": 11.0327,
"step": 2370
},
{
"epoch": 0.0023721860930465233,
"grad_norm": 173.90184020996094,
"learning_rate": 3e-06,
"loss": 10.0381,
"step": 2371
},
{
"epoch": 0.002373186593296648,
"grad_norm": 260.4774475097656,
"learning_rate": 3e-06,
"loss": 9.2442,
"step": 2372
},
{
"epoch": 0.0023741870935467735,
"grad_norm": 268.046630859375,
"learning_rate": 3e-06,
"loss": 3.7758,
"step": 2373
},
{
"epoch": 0.0023751875937968983,
"grad_norm": 159.11863708496094,
"learning_rate": 3e-06,
"loss": 9.8232,
"step": 2374
},
{
"epoch": 0.0023761880940470236,
"grad_norm": 267.4617614746094,
"learning_rate": 3e-06,
"loss": 9.251,
"step": 2375
},
{
"epoch": 0.0023771885942971485,
"grad_norm": 247.35926818847656,
"learning_rate": 3e-06,
"loss": 6.8656,
"step": 2376
},
{
"completion_length": 255.87500762939453,
"epoch": 0.002378189094547274,
"grad_norm": 296.4661560058594,
"learning_rate": 3e-06,
"loss": -17.1805,
"reward": 0.19931082427501678,
"reward_std": 0.1540454626083374,
"rewards/sudoku_reward_func": 0.1993108168244362,
"step": 2377,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0023791895947973987,
"grad_norm": 273.8221740722656,
"learning_rate": 3e-06,
"loss": -11.0449,
"step": 2378
},
{
"epoch": 0.0023801900950475236,
"grad_norm": 287.01287841796875,
"learning_rate": 3e-06,
"loss": -15.9991,
"step": 2379
},
{
"epoch": 0.002381190595297649,
"grad_norm": 322.6713562011719,
"learning_rate": 3e-06,
"loss": -13.4008,
"step": 2380
},
{
"epoch": 0.0023821910955477738,
"grad_norm": 227.85231018066406,
"learning_rate": 3e-06,
"loss": -18.8711,
"step": 2381
},
{
"epoch": 0.002383191595797899,
"grad_norm": 209.1007537841797,
"learning_rate": 3e-06,
"loss": -12.8625,
"step": 2382
},
{
"epoch": 0.002384192096048024,
"grad_norm": 264.0327453613281,
"learning_rate": 3e-06,
"loss": -15.8456,
"step": 2383
},
{
"epoch": 0.0023851925962981493,
"grad_norm": 170.41293334960938,
"learning_rate": 3e-06,
"loss": -13.2896,
"step": 2384
},
{
"completion_length": 256.0,
"epoch": 0.002386193096548274,
"grad_norm": 219.16714477539062,
"learning_rate": 3e-06,
"loss": -8.9917,
"reward": 0.20667991042137146,
"reward_std": 0.14874431490898132,
"rewards/sudoku_reward_func": 0.20667989552021027,
"step": 2385,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002387193596798399,
"grad_norm": 272.3582763671875,
"learning_rate": 3e-06,
"loss": 0.2413,
"step": 2386
},
{
"epoch": 0.0023881940970485243,
"grad_norm": 213.18934631347656,
"learning_rate": 3e-06,
"loss": -3.9258,
"step": 2387
},
{
"epoch": 0.002389194597298649,
"grad_norm": 190.1105194091797,
"learning_rate": 3e-06,
"loss": -6.8053,
"step": 2388
},
{
"epoch": 0.0023901950975487745,
"grad_norm": 157.9113311767578,
"learning_rate": 3e-06,
"loss": -11.0441,
"step": 2389
},
{
"epoch": 0.0023911955977988994,
"grad_norm": 364.7066955566406,
"learning_rate": 3e-06,
"loss": -2.5138,
"step": 2390
},
{
"epoch": 0.0023921960980490247,
"grad_norm": 288.8036804199219,
"learning_rate": 3e-06,
"loss": -4.8312,
"step": 2391
},
{
"epoch": 0.0023931965982991496,
"grad_norm": 206.28895568847656,
"learning_rate": 3e-06,
"loss": -10.4112,
"step": 2392
},
{
"completion_length": 256.0,
"epoch": 0.0023941970985492745,
"grad_norm": 234.87863159179688,
"learning_rate": 3e-06,
"loss": -29.7829,
"reward": 0.23030905425548553,
"reward_std": 0.1565355882048607,
"rewards/sudoku_reward_func": 0.23030905425548553,
"step": 2393,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0023951975987993998,
"grad_norm": 225.86874389648438,
"learning_rate": 3e-06,
"loss": -21.8689,
"step": 2394
},
{
"epoch": 0.0023961980990495246,
"grad_norm": 317.64678955078125,
"learning_rate": 3e-06,
"loss": -27.3714,
"step": 2395
},
{
"epoch": 0.00239719859929965,
"grad_norm": 273.0003356933594,
"learning_rate": 3e-06,
"loss": -35.7363,
"step": 2396
},
{
"epoch": 0.002398199099549775,
"grad_norm": 332.5831604003906,
"learning_rate": 3e-06,
"loss": -28.3564,
"step": 2397
},
{
"epoch": 0.0023991995997999,
"grad_norm": 258.6806640625,
"learning_rate": 3e-06,
"loss": -24.7617,
"step": 2398
},
{
"epoch": 0.002400200100050025,
"grad_norm": 270.16070556640625,
"learning_rate": 3e-06,
"loss": -27.9452,
"step": 2399
},
{
"epoch": 0.00240120060030015,
"grad_norm": 352.531494140625,
"learning_rate": 3e-06,
"loss": -36.7671,
"step": 2400
},
{
"completion_length": 255.9791717529297,
"epoch": 0.002402201100550275,
"grad_norm": 202.87521362304688,
"learning_rate": 3e-06,
"loss": -18.3767,
"reward": 0.26331019401550293,
"reward_std": 0.14331235736608505,
"rewards/sudoku_reward_func": 0.26331018656492233,
"step": 2401,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0024032016008004,
"grad_norm": 231.7044677734375,
"learning_rate": 3e-06,
"loss": -13.1758,
"step": 2402
},
{
"epoch": 0.0024042021010505254,
"grad_norm": 230.66302490234375,
"learning_rate": 3e-06,
"loss": -10.3043,
"step": 2403
},
{
"epoch": 0.0024052026013006503,
"grad_norm": 281.7815856933594,
"learning_rate": 3e-06,
"loss": -17.6013,
"step": 2404
},
{
"epoch": 0.0024062031015507756,
"grad_norm": 201.96786499023438,
"learning_rate": 3e-06,
"loss": -20.4518,
"step": 2405
},
{
"epoch": 0.0024072036018009005,
"grad_norm": 299.170166015625,
"learning_rate": 3e-06,
"loss": -15.459,
"step": 2406
},
{
"epoch": 0.0024082041020510253,
"grad_norm": 265.810546875,
"learning_rate": 3e-06,
"loss": -12.8288,
"step": 2407
},
{
"epoch": 0.0024092046023011507,
"grad_norm": 261.6772155761719,
"learning_rate": 3e-06,
"loss": -19.0075,
"step": 2408
},
{
"completion_length": 256.0,
"epoch": 0.0024102051025512755,
"grad_norm": 413.2362976074219,
"learning_rate": 3e-06,
"loss": 20.3793,
"reward": 0.232127845287323,
"reward_std": 0.146223783493042,
"rewards/sudoku_reward_func": 0.232127845287323,
"step": 2409,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002411205602801401,
"grad_norm": 314.0757751464844,
"learning_rate": 3e-06,
"loss": 15.9171,
"step": 2410
},
{
"epoch": 0.0024122061030515257,
"grad_norm": 390.8291931152344,
"learning_rate": 3e-06,
"loss": 16.0378,
"step": 2411
},
{
"epoch": 0.002413206603301651,
"grad_norm": 293.4915466308594,
"learning_rate": 3e-06,
"loss": 14.1956,
"step": 2412
},
{
"epoch": 0.002414207103551776,
"grad_norm": 233.78207397460938,
"learning_rate": 3e-06,
"loss": 17.9772,
"step": 2413
},
{
"epoch": 0.002415207603801901,
"grad_norm": 255.8182830810547,
"learning_rate": 3e-06,
"loss": 13.7614,
"step": 2414
},
{
"epoch": 0.002416208104052026,
"grad_norm": 353.9018859863281,
"learning_rate": 3e-06,
"loss": 13.5526,
"step": 2415
},
{
"epoch": 0.002417208604302151,
"grad_norm": 179.1558074951172,
"learning_rate": 3e-06,
"loss": 15.9556,
"step": 2416
},
{
"completion_length": 256.0,
"epoch": 0.0024182091045522763,
"grad_norm": 215.33831787109375,
"learning_rate": 3e-06,
"loss": -3.8544,
"reward": 0.25442297756671906,
"reward_std": 0.15895532071590424,
"rewards/sudoku_reward_func": 0.25442295521497726,
"step": 2417,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002419209604802401,
"grad_norm": 478.9371337890625,
"learning_rate": 3e-06,
"loss": -17.7737,
"step": 2418
},
{
"epoch": 0.0024202101050525265,
"grad_norm": 244.527099609375,
"learning_rate": 3e-06,
"loss": -0.1481,
"step": 2419
},
{
"epoch": 0.0024212106053026513,
"grad_norm": 146.69509887695312,
"learning_rate": 3e-06,
"loss": -5.8427,
"step": 2420
},
{
"epoch": 0.0024222111055527762,
"grad_norm": 217.09681701660156,
"learning_rate": 3e-06,
"loss": -4.5632,
"step": 2421
},
{
"epoch": 0.0024232116058029015,
"grad_norm": 437.3373107910156,
"learning_rate": 3e-06,
"loss": -17.72,
"step": 2422
},
{
"epoch": 0.0024242121060530264,
"grad_norm": 255.216064453125,
"learning_rate": 3e-06,
"loss": -0.7436,
"step": 2423
},
{
"epoch": 0.0024252126063031517,
"grad_norm": 164.95989990234375,
"learning_rate": 3e-06,
"loss": -7.5431,
"step": 2424
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0024262131065532766,
"grad_norm": 251.02464294433594,
"learning_rate": 3e-06,
"loss": 29.46,
"reward": 0.2371031865477562,
"reward_std": 0.13912386447191238,
"rewards/sudoku_reward_func": 0.2371031865477562,
"step": 2425,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002427213606803402,
"grad_norm": 178.2760467529297,
"learning_rate": 3e-06,
"loss": 17.2126,
"step": 2426
},
{
"epoch": 0.002428214107053527,
"grad_norm": 188.71414184570312,
"learning_rate": 3e-06,
"loss": 28.4213,
"step": 2427
},
{
"epoch": 0.0024292146073036517,
"grad_norm": 151.65293884277344,
"learning_rate": 3e-06,
"loss": 22.4749,
"step": 2428
},
{
"epoch": 0.002430215107553777,
"grad_norm": 320.76190185546875,
"learning_rate": 3e-06,
"loss": 27.6502,
"step": 2429
},
{
"epoch": 0.002431215607803902,
"grad_norm": 202.29144287109375,
"learning_rate": 3e-06,
"loss": 15.5383,
"step": 2430
},
{
"epoch": 0.002432216108054027,
"grad_norm": 199.7091064453125,
"learning_rate": 3e-06,
"loss": 25.4369,
"step": 2431
},
{
"epoch": 0.002433216608304152,
"grad_norm": 169.84693908691406,
"learning_rate": 3e-06,
"loss": 20.8165,
"step": 2432
},
{
"completion_length": 256.0,
"epoch": 0.0024342171085542774,
"grad_norm": 86.38954162597656,
"learning_rate": 3e-06,
"loss": -3.2782,
"reward": 0.23937289416790009,
"reward_std": 0.10821668431162834,
"rewards/sudoku_reward_func": 0.2393728867173195,
"step": 2433,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0024352176088044022,
"grad_norm": 210.60997009277344,
"learning_rate": 3e-06,
"loss": -1.2576,
"step": 2434
},
{
"epoch": 0.002436218109054527,
"grad_norm": 105.67581176757812,
"learning_rate": 3e-06,
"loss": -3.552,
"step": 2435
},
{
"epoch": 0.0024372186093046524,
"grad_norm": 146.38681030273438,
"learning_rate": 3e-06,
"loss": -2.7907,
"step": 2436
},
{
"epoch": 0.0024382191095547773,
"grad_norm": 103.02706146240234,
"learning_rate": 3e-06,
"loss": -4.2819,
"step": 2437
},
{
"epoch": 0.0024392196098049026,
"grad_norm": 108.49909973144531,
"learning_rate": 3e-06,
"loss": -2.6245,
"step": 2438
},
{
"epoch": 0.0024402201100550275,
"grad_norm": 120.60899353027344,
"learning_rate": 3e-06,
"loss": -3.8004,
"step": 2439
},
{
"epoch": 0.002441220610305153,
"grad_norm": 119.53984832763672,
"learning_rate": 3e-06,
"loss": -4.4491,
"step": 2440
},
{
"completion_length": 255.9791717529297,
"epoch": 0.0024422211105552777,
"grad_norm": 217.67234802246094,
"learning_rate": 3e-06,
"loss": 24.9726,
"reward": 0.22883598506450653,
"reward_std": 0.1351177804172039,
"rewards/sudoku_reward_func": 0.22883598506450653,
"step": 2441,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0024432216108054025,
"grad_norm": 234.8048095703125,
"learning_rate": 3e-06,
"loss": 25.3563,
"step": 2442
},
{
"epoch": 0.002444222111055528,
"grad_norm": 189.23411560058594,
"learning_rate": 3e-06,
"loss": 21.1638,
"step": 2443
},
{
"epoch": 0.0024452226113056527,
"grad_norm": 263.4746398925781,
"learning_rate": 3e-06,
"loss": 22.9557,
"step": 2444
},
{
"epoch": 0.002446223111555778,
"grad_norm": 185.4212646484375,
"learning_rate": 3e-06,
"loss": 22.3036,
"step": 2445
},
{
"epoch": 0.002447223611805903,
"grad_norm": 193.51065063476562,
"learning_rate": 3e-06,
"loss": 21.5142,
"step": 2446
},
{
"epoch": 0.002448224112056028,
"grad_norm": 157.37619018554688,
"learning_rate": 3e-06,
"loss": 18.6296,
"step": 2447
},
{
"epoch": 0.002449224612306153,
"grad_norm": 159.09471130371094,
"learning_rate": 3e-06,
"loss": 18.2331,
"step": 2448
},
{
"completion_length": 254.9375,
"epoch": 0.002450225112556278,
"grad_norm": 203.94557189941406,
"learning_rate": 3e-06,
"loss": -39.4361,
"reward": 0.24582882970571518,
"reward_std": 0.15585172921419144,
"rewards/sudoku_reward_func": 0.24582882970571518,
"step": 2449,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0024512256128064033,
"grad_norm": 126.65997314453125,
"learning_rate": 3e-06,
"loss": -35.301,
"step": 2450
},
{
"epoch": 0.002452226113056528,
"grad_norm": 237.62548828125,
"learning_rate": 3e-06,
"loss": -38.4792,
"step": 2451
},
{
"epoch": 0.0024532266133066535,
"grad_norm": 135.26449584960938,
"learning_rate": 3e-06,
"loss": -37.523,
"step": 2452
},
{
"epoch": 0.0024542271135567784,
"grad_norm": 249.38072204589844,
"learning_rate": 3e-06,
"loss": -39.7402,
"step": 2453
},
{
"epoch": 0.0024552276138069032,
"grad_norm": 126.46034240722656,
"learning_rate": 3e-06,
"loss": -36.0648,
"step": 2454
},
{
"epoch": 0.0024562281140570285,
"grad_norm": 189.29177856445312,
"learning_rate": 3e-06,
"loss": -39.9833,
"step": 2455
},
{
"epoch": 0.0024572286143071534,
"grad_norm": 107.6065902709961,
"learning_rate": 3e-06,
"loss": -38.7203,
"step": 2456
},
{
"completion_length": 250.68750762939453,
"epoch": 0.0024582291145572787,
"grad_norm": 99.97529602050781,
"learning_rate": 3e-06,
"loss": 15.4324,
"reward": 0.20304235816001892,
"reward_std": 0.13311458751559258,
"rewards/sudoku_reward_func": 0.20304234325885773,
"step": 2457,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0024592296148074036,
"grad_norm": 188.85643005371094,
"learning_rate": 3e-06,
"loss": 12.5375,
"step": 2458
},
{
"epoch": 0.002460230115057529,
"grad_norm": 142.94924926757812,
"learning_rate": 3e-06,
"loss": 9.5263,
"step": 2459
},
{
"epoch": 0.002461230615307654,
"grad_norm": 96.89965057373047,
"learning_rate": 3e-06,
"loss": 9.125,
"step": 2460
},
{
"epoch": 0.0024622311155577787,
"grad_norm": 103.0599136352539,
"learning_rate": 3e-06,
"loss": 14.582,
"step": 2461
},
{
"epoch": 0.002463231615807904,
"grad_norm": 136.94151306152344,
"learning_rate": 3e-06,
"loss": 12.7515,
"step": 2462
},
{
"epoch": 0.002464232116058029,
"grad_norm": 113.3371810913086,
"learning_rate": 3e-06,
"loss": 7.4074,
"step": 2463
},
{
"epoch": 0.002465232616308154,
"grad_norm": 230.5481719970703,
"learning_rate": 3e-06,
"loss": 7.5585,
"step": 2464
},
{
"completion_length": 255.9791717529297,
"epoch": 0.002466233116558279,
"grad_norm": 235.51907348632812,
"learning_rate": 3e-06,
"loss": -28.1552,
"reward": 0.25078538805246353,
"reward_std": 0.17135849595069885,
"rewards/sudoku_reward_func": 0.25078538805246353,
"step": 2465,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0024672336168084044,
"grad_norm": 226.1004638671875,
"learning_rate": 3e-06,
"loss": -22.9924,
"step": 2466
},
{
"epoch": 0.0024682341170585292,
"grad_norm": 418.1987609863281,
"learning_rate": 3e-06,
"loss": -30.068,
"step": 2467
},
{
"epoch": 0.002469234617308654,
"grad_norm": 180.6246795654297,
"learning_rate": 3e-06,
"loss": -13.8185,
"step": 2468
},
{
"epoch": 0.0024702351175587794,
"grad_norm": 309.594970703125,
"learning_rate": 3e-06,
"loss": -30.2354,
"step": 2469
},
{
"epoch": 0.0024712356178089043,
"grad_norm": 376.1004333496094,
"learning_rate": 3e-06,
"loss": -21.4337,
"step": 2470
},
{
"epoch": 0.0024722361180590296,
"grad_norm": 279.6305236816406,
"learning_rate": 3e-06,
"loss": -32.3534,
"step": 2471
},
{
"epoch": 0.0024732366183091545,
"grad_norm": 546.125732421875,
"learning_rate": 3e-06,
"loss": -17.4937,
"step": 2472
},
{
"completion_length": 253.62500762939453,
"epoch": 0.00247423711855928,
"grad_norm": 273.2940368652344,
"learning_rate": 3e-06,
"loss": 7.4418,
"reward": 0.2406994178891182,
"reward_std": 0.1260107159614563,
"rewards/sudoku_reward_func": 0.2406994178891182,
"step": 2473,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0024752376188094047,
"grad_norm": 169.90762329101562,
"learning_rate": 3e-06,
"loss": 14.9494,
"step": 2474
},
{
"epoch": 0.0024762381190595296,
"grad_norm": 231.994140625,
"learning_rate": 3e-06,
"loss": 5.9744,
"step": 2475
},
{
"epoch": 0.002477238619309655,
"grad_norm": 154.6354522705078,
"learning_rate": 3e-06,
"loss": 9.9395,
"step": 2476
},
{
"epoch": 0.0024782391195597797,
"grad_norm": 459.22686767578125,
"learning_rate": 3e-06,
"loss": 5.8099,
"step": 2477
},
{
"epoch": 0.002479239619809905,
"grad_norm": 220.0775909423828,
"learning_rate": 3e-06,
"loss": 13.5452,
"step": 2478
},
{
"epoch": 0.00248024012006003,
"grad_norm": 285.4416198730469,
"learning_rate": 3e-06,
"loss": 5.4125,
"step": 2479
},
{
"epoch": 0.0024812406203101552,
"grad_norm": 148.00022888183594,
"learning_rate": 3e-06,
"loss": 8.1059,
"step": 2480
},
{
"completion_length": 255.95834350585938,
"epoch": 0.00248224112056028,
"grad_norm": 267.9368591308594,
"learning_rate": 3e-06,
"loss": 7.4214,
"reward": 0.23054204881191254,
"reward_std": 0.1383199244737625,
"rewards/sudoku_reward_func": 0.23054202646017075,
"step": 2481,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002483241620810405,
"grad_norm": 284.4049072265625,
"learning_rate": 3e-06,
"loss": 2.4674,
"step": 2482
},
{
"epoch": 0.0024842421210605303,
"grad_norm": 191.92669677734375,
"learning_rate": 3e-06,
"loss": 3.3142,
"step": 2483
},
{
"epoch": 0.002485242621310655,
"grad_norm": 337.16033935546875,
"learning_rate": 3e-06,
"loss": 2.5026,
"step": 2484
},
{
"epoch": 0.0024862431215607805,
"grad_norm": 586.3452758789062,
"learning_rate": 3e-06,
"loss": 2.5096,
"step": 2485
},
{
"epoch": 0.0024872436218109054,
"grad_norm": 344.7769775390625,
"learning_rate": 3e-06,
"loss": -1.1625,
"step": 2486
},
{
"epoch": 0.0024882441220610307,
"grad_norm": 227.2752227783203,
"learning_rate": 3e-06,
"loss": 0.9586,
"step": 2487
},
{
"epoch": 0.0024892446223111556,
"grad_norm": 303.7214050292969,
"learning_rate": 3e-06,
"loss": -1.7598,
"step": 2488
},
{
"completion_length": 255.9166717529297,
"epoch": 0.0024902451225612804,
"grad_norm": 161.27249145507812,
"learning_rate": 3e-06,
"loss": -5.8149,
"reward": 0.2981564402580261,
"reward_std": 0.16518419981002808,
"rewards/sudoku_reward_func": 0.29815642535686493,
"step": 2489,
"zero_std_ratio": 0.0
},
{
"epoch": 0.0024912456228114058,
"grad_norm": 188.5457763671875,
"learning_rate": 3e-06,
"loss": -6.1626,
"step": 2490
},
{
"epoch": 0.0024922461230615306,
"grad_norm": 205.0718231201172,
"learning_rate": 3e-06,
"loss": -6.9568,
"step": 2491
},
{
"epoch": 0.002493246623311656,
"grad_norm": 155.51025390625,
"learning_rate": 3e-06,
"loss": -9.4797,
"step": 2492
},
{
"epoch": 0.002494247123561781,
"grad_norm": 152.71153259277344,
"learning_rate": 3e-06,
"loss": -8.4538,
"step": 2493
},
{
"epoch": 0.002495247623811906,
"grad_norm": 125.3695068359375,
"learning_rate": 3e-06,
"loss": -6.4118,
"step": 2494
},
{
"epoch": 0.002496248124062031,
"grad_norm": 128.93853759765625,
"learning_rate": 3e-06,
"loss": -6.2118,
"step": 2495
},
{
"epoch": 0.002497248624312156,
"grad_norm": 181.36378479003906,
"learning_rate": 3e-06,
"loss": -9.2876,
"step": 2496
},
{
"completion_length": 255.8541717529297,
"epoch": 0.002498249124562281,
"grad_norm": 154.56900024414062,
"learning_rate": 3e-06,
"loss": -5.0903,
"reward": 0.29621364921331406,
"reward_std": 0.14325331896543503,
"rewards/sudoku_reward_func": 0.29621363431215286,
"step": 2497,
"zero_std_ratio": 0.0
},
{
"epoch": 0.002499249624812406,
"grad_norm": 141.0087127685547,
"learning_rate": 3e-06,
"loss": -9.3757,
"step": 2498
},
{
"epoch": 0.0025002501250625314,
"grad_norm": 119.01480102539062,
"learning_rate": 3e-06,
"loss": -3.5487,
"step": 2499
},
{
"epoch": 0.0025012506253126563,
"grad_norm": 170.59515380859375,
"learning_rate": 3e-06,
"loss": -3.0218,
"step": 2500
}
],
"logging_steps": 1,
"max_steps": 9995000,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}