{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01773797392517833, "eval_steps": 500, "global_step": 19600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 134.2453125, "completions/mean_terminated_length": 134.2453125, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 4.524993348259778e-06, "frac_reward_zero_std": 0.3375, "grad_norm": 0.15314806997776031, "kl": 1.1162874025073995e-08, "learning_rate": 2.857142857142857e-10, "loss": 0.0, "num_tokens": 369818.0, "reward": -0.0953125, "reward_std": 0.6069470643997192, "rewards/verify_chess_move/mean": -0.0953125, "rewards/verify_chess_move/std": 0.9882838249206543, "step": 5 }, { "completion_length": 348.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 127.53671875, "completions/mean_terminated_length": 127.53671875, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 9.049986696519556e-06, "frac_reward_zero_std": 0.375, "grad_norm": 0.1271524280309677, "kl": 3.7305913474888426e-05, "learning_rate": 6.428571428571428e-10, "loss": 0.0, "num_tokens": 731449.0, "reward": -0.1328125, "reward_std": 0.5808529257774353, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9840809583663941, "step": 10 }, { "completion_length": 356.6, "completions/clipped_ratio": 0.0, "completions/max_length": 356.6, "completions/max_terminated_length": 356.6, "completions/mean_length": 131.0546875, "completions/mean_terminated_length": 131.0546875, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 1.3574980044779333e-05, "frac_reward_zero_std": 0.39375, "grad_norm": 0.13510127365589142, "kl": 0.00011166165880907668, "learning_rate": 1e-09, "loss": 0.0, "num_tokens": 1096831.0, "reward": -0.1078125, "reward_std": 0.5521861791610718, "rewards/verify_chess_move/mean": -0.1078125, "rewards/verify_chess_move/std": 0.9916645765304566, "step": 15 }, { "completion_length": 371.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 137.39765625, "completions/mean_terminated_length": 137.39765625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.8099973393039112e-05, "frac_reward_zero_std": 0.33125, "grad_norm": 0.13521867990493774, "kl": 0.00018904195022741987, "learning_rate": 1.357142857142857e-09, "loss": 0.0, "num_tokens": 1473820.0, "reward": -0.1875, "reward_std": 0.5972784519195556, "rewards/verify_chess_move/mean": -0.1875, "rewards/verify_chess_move/std": 0.9765503168106079, "step": 20 }, { "completion_length": 380.8, "completions/clipped_ratio": 0.0, "completions/max_length": 380.8, "completions/max_terminated_length": 380.8, "completions/mean_length": 131.4046875, "completions/mean_terminated_length": 131.4046875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.262496674129889e-05, "frac_reward_zero_std": 0.35, "grad_norm": 0.12960247695446014, "kl": 0.0003116093884273141, "learning_rate": 1.7142857142857142e-09, "loss": 0.0, "num_tokens": 1839682.0, "reward": -0.2859375, "reward_std": 0.5892244517803192, "rewards/verify_chess_move/mean": -0.2859375, "rewards/verify_chess_move/std": 0.9505633592605591, "step": 25 }, { "completion_length": 395.6, "completions/clipped_ratio": 0.0, "completions/max_length": 395.6, "completions/max_terminated_length": 395.6, "completions/mean_length": 129.984375, "completions/mean_terminated_length": 129.984375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 2.7149960089558667e-05, "frac_reward_zero_std": 0.30625, "grad_norm": 0.13179945945739746, "kl": 0.0003647734990408935, "learning_rate": 2.0714285714285713e-09, "loss": 0.0, "num_tokens": 2204614.0, "reward": -0.1296875, "reward_std": 0.6268324494361878, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.9822301268577576, "step": 30 }, { "completion_length": 364.6, "completions/clipped_ratio": 0.0, "completions/max_length": 364.6, "completions/max_terminated_length": 364.6, "completions/mean_length": 123.96796875, "completions/mean_terminated_length": 123.96796875, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 3.1674953437818444e-05, "frac_reward_zero_std": 0.3375, "grad_norm": 0.16639098525047302, "kl": 0.0003914118635293562, "learning_rate": 2.4285714285714285e-09, "loss": 0.0, "num_tokens": 2558813.0, "reward": -0.0421875, "reward_std": 0.5938471674919128, "rewards/verify_chess_move/mean": -0.0421875, "rewards/verify_chess_move/std": 0.9985790133476258, "step": 35 }, { "completion_length": 424.8, "completions/clipped_ratio": 0.0, "completions/max_length": 424.8, "completions/max_terminated_length": 424.8, "completions/mean_length": 140.1015625, "completions/mean_terminated_length": 140.1015625, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 3.6199946786078225e-05, "frac_reward_zero_std": 0.3375, "grad_norm": 0.1491273045539856, "kl": 0.00040266945552502875, "learning_rate": 2.7857142857142856e-09, "loss": 0.0, "num_tokens": 2938943.0, "reward": -0.1703125, "reward_std": 0.5951899290084839, "rewards/verify_chess_move/mean": -0.1703125, "rewards/verify_chess_move/std": 0.9850792288780212, "step": 40 }, { "completion_length": 328.8, "completions/clipped_ratio": 0.0, "completions/max_length": 328.8, "completions/max_terminated_length": 328.8, "completions/mean_length": 133.93828125, "completions/mean_terminated_length": 133.93828125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 4.0724940134338006e-05, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1635921597480774, "kl": 0.0005308275031893573, "learning_rate": 3.1428571428571428e-09, "loss": 0.0, "num_tokens": 3309400.0, "reward": -0.1328125, "reward_std": 0.6130090117454529, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9844282746315003, "step": 45 }, { "completion_length": 462.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 462.6, "completions/max_terminated_length": 375.2, "completions/mean_length": 128.9828125, "completions/mean_terminated_length": 128.4787582397461, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 4.524993348259778e-05, "frac_reward_zero_std": 0.33125, "grad_norm": 0.11537030339241028, "kl": 0.0003926850155039574, "learning_rate": 3.5e-09, "loss": 0.0, "num_tokens": 3671690.0, "reward": -0.171875, "reward_std": 0.5931281447410583, "rewards/verify_chess_move/mean": -0.171875, "rewards/verify_chess_move/std": 0.9803287863731385, "step": 50 }, { "completion_length": 332.6, "completions/clipped_ratio": 0.0, "completions/max_length": 332.6, "completions/max_terminated_length": 332.6, "completions/mean_length": 133.246875, "completions/mean_terminated_length": 133.246875, "completions/min_length": 41.4, "completions/min_terminated_length": 41.4, "epoch": 4.977492683085756e-05, "frac_reward_zero_std": 0.29375, "grad_norm": 0.14411744475364685, "kl": 0.0005153413497282599, "learning_rate": 3.857142857142857e-09, "loss": 0.0, "num_tokens": 4038158.0, "reward": -0.046875, "reward_std": 0.637720263004303, "rewards/verify_chess_move/mean": -0.046875, "rewards/verify_chess_move/std": 0.9889968037605286, "step": 55 }, { "completion_length": 366.8, "completions/clipped_ratio": 0.0, "completions/max_length": 366.8, "completions/max_terminated_length": 366.8, "completions/mean_length": 124.14296875, "completions/mean_terminated_length": 124.14296875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 5.4299920179117334e-05, "frac_reward_zero_std": 0.28125, "grad_norm": 0.11324295401573181, "kl": 0.0004699916483332345, "learning_rate": 4.214285714285714e-09, "loss": 0.0, "num_tokens": 4393669.0, "reward": -0.1046875, "reward_std": 0.654711389541626, "rewards/verify_chess_move/mean": -0.1046875, "rewards/verify_chess_move/std": 0.9957113623619079, "step": 60 }, { "completion_length": 360.6, "completions/clipped_ratio": 0.0, "completions/max_length": 360.6, "completions/max_terminated_length": 360.6, "completions/mean_length": 138.73828125, "completions/mean_terminated_length": 138.73828125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 5.8824913527377115e-05, "frac_reward_zero_std": 0.35625, "grad_norm": 0.12214316427707672, "kl": 0.0005132519132530433, "learning_rate": 4.571428571428571e-09, "loss": 0.0, "num_tokens": 4770222.0, "reward": -0.13125, "reward_std": 0.5812244176864624, "rewards/verify_chess_move/mean": -0.13125, "rewards/verify_chess_move/std": 0.9882860064506531, "step": 65 }, { "completion_length": 340.8, "completions/clipped_ratio": 0.0, "completions/max_length": 340.8, "completions/max_terminated_length": 340.8, "completions/mean_length": 132.4328125, "completions/mean_terminated_length": 132.4328125, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 6.334990687563689e-05, "frac_reward_zero_std": 0.29375, "grad_norm": 0.12794020771980286, "kl": 0.0005023449695727323, "learning_rate": 4.9285714285714285e-09, "loss": 0.0, "num_tokens": 5137008.0, "reward": 0.0125, "reward_std": 0.65308518409729, "rewards/verify_chess_move/mean": 0.0125, "rewards/verify_chess_move/std": 0.993426787853241, "step": 70 }, { "completion_length": 382.2, "completions/clipped_ratio": 0.0, "completions/max_length": 382.2, "completions/max_terminated_length": 382.2, "completions/mean_length": 134.50234375, "completions/mean_terminated_length": 134.50234375, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 6.787490022389667e-05, "frac_reward_zero_std": 0.25625, "grad_norm": 0.13456571102142334, "kl": 0.0004742340965549374, "learning_rate": 5.285714285714286e-09, "loss": 0.0, "num_tokens": 5506035.0, "reward": -0.1015625, "reward_std": 0.6746501922607422, "rewards/verify_chess_move/mean": -0.1015625, "rewards/verify_chess_move/std": 0.979106605052948, "step": 75 }, { "completion_length": 346.8, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/max_terminated_length": 346.8, "completions/mean_length": 135.02578125, "completions/mean_terminated_length": 135.02578125, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 7.239989357215645e-05, "frac_reward_zero_std": 0.33125, "grad_norm": 0.16221371293067932, "kl": 0.0005688568003279215, "learning_rate": 5.642857142857143e-09, "loss": 0.0, "num_tokens": 5874996.0, "reward": -0.084375, "reward_std": 0.6115776538848877, "rewards/verify_chess_move/mean": -0.084375, "rewards/verify_chess_move/std": 0.9785626411437989, "step": 80 }, { "completion_length": 383.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 136.0890625, "completions/mean_terminated_length": 136.0890625, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 7.692488692041623e-05, "frac_reward_zero_std": 0.38125, "grad_norm": 0.13259850442409515, "kl": 0.0005150648265043855, "learning_rate": 6e-09, "loss": 0.0, "num_tokens": 6247638.0, "reward": -0.1296875, "reward_std": 0.5565027534961701, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.9886666536331177, "step": 85 }, { "completion_length": 387.8, "completions/clipped_ratio": 0.0, "completions/max_length": 387.8, "completions/max_terminated_length": 387.8, "completions/mean_length": 132.08125, "completions/mean_terminated_length": 132.08125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 8.144988026867601e-05, "frac_reward_zero_std": 0.35, "grad_norm": 0.08173348754644394, "kl": 0.0004966315394995035, "learning_rate": 6.357142857142857e-09, "loss": 0.0, "num_tokens": 6616798.0, "reward": -0.2046875, "reward_std": 0.5830029368400573, "rewards/verify_chess_move/mean": -0.2046875, "rewards/verify_chess_move/std": 0.9725927114486694, "step": 90 }, { "completion_length": 360.4, "completions/clipped_ratio": 0.0, "completions/max_length": 360.4, "completions/max_terminated_length": 360.4, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 8.597487361693578e-05, "frac_reward_zero_std": 0.41875, "grad_norm": 0.14269593358039856, "kl": 0.0005439656340968213, "learning_rate": 6.714285714285714e-09, "loss": 0.0, "num_tokens": 6993870.0, "reward": -0.15, "reward_std": 0.5251529455184937, "rewards/verify_chess_move/mean": -0.15, "rewards/verify_chess_move/std": 0.9866483569145202, "step": 95 }, { "completion_length": 329.4, "completions/clipped_ratio": 0.0, "completions/max_length": 329.4, "completions/max_terminated_length": 329.4, "completions/mean_length": 131.49453125, "completions/mean_terminated_length": 131.49453125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 9.049986696519556e-05, "frac_reward_zero_std": 0.425, "grad_norm": 0.15986844897270203, "kl": 0.000558555932911986, "learning_rate": 7.071428571428571e-09, "loss": 0.0, "num_tokens": 7361039.0, "reward": -0.0984375, "reward_std": 0.5187281250953675, "rewards/verify_chess_move/mean": -0.0984375, "rewards/verify_chess_move/std": 0.9964276909828186, "step": 100 }, { "completion_length": 353.8, "completions/clipped_ratio": 0.0, "completions/max_length": 353.8, "completions/max_terminated_length": 353.8, "completions/mean_length": 120.503125, "completions/mean_terminated_length": 120.503125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 9.502486031345534e-05, "frac_reward_zero_std": 0.28125, "grad_norm": 0.1400572806596756, "kl": 0.0004383791202599241, "learning_rate": 7.428571428571428e-09, "loss": 0.0, "num_tokens": 7709963.0, "reward": -0.0703125, "reward_std": 0.6548296093940735, "rewards/verify_chess_move/mean": -0.0703125, "rewards/verify_chess_move/std": 0.9806808710098267, "step": 105 }, { "completion_length": 434.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 139.31328125, "completions/mean_terminated_length": 139.31328125, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 9.954985366171512e-05, "frac_reward_zero_std": 0.3375, "grad_norm": 0.14659151434898376, "kl": 0.0004960132519045146, "learning_rate": 7.785714285714286e-09, "loss": 0.0, "num_tokens": 8086700.0, "reward": -0.103125, "reward_std": 0.6089030265808105, "rewards/verify_chess_move/mean": -0.103125, "rewards/verify_chess_move/std": 0.9944238066673279, "step": 110 }, { "completion_length": 364.2, "completions/clipped_ratio": 0.0, "completions/max_length": 364.2, "completions/max_terminated_length": 364.2, "completions/mean_length": 143.06015625, "completions/mean_terminated_length": 143.06015625, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0001040748470099749, "frac_reward_zero_std": 0.39375, "grad_norm": 0.13564597070217133, "kl": 0.0005639061088004383, "learning_rate": 8.142857142857142e-09, "loss": 0.0, "num_tokens": 8468769.0, "reward": -0.1515625, "reward_std": 0.5312604129314422, "rewards/verify_chess_move/mean": -0.1515625, "rewards/verify_chess_move/std": 0.9878770709037781, "step": 115 }, { "completion_length": 339.8, "completions/clipped_ratio": 0.0, "completions/max_length": 339.8, "completions/max_terminated_length": 339.8, "completions/mean_length": 128.45234375, "completions/mean_terminated_length": 128.45234375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.00010859984035823467, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12440938502550125, "kl": 0.0005024509113354725, "learning_rate": 8.5e-09, "loss": 0.0, "num_tokens": 8829868.0, "reward": -0.0421875, "reward_std": 0.510055935382843, "rewards/verify_chess_move/mean": -0.0421875, "rewards/verify_chess_move/std": 0.9984874963760376, "step": 120 }, { "completion_length": 359.6, "completions/clipped_ratio": 0.0, "completions/max_length": 359.6, "completions/max_terminated_length": 359.6, "completions/mean_length": 143.5953125, "completions/mean_terminated_length": 143.5953125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.00011312483370649445, "frac_reward_zero_std": 0.39375, "grad_norm": 0.1376192420721054, "kl": 0.0005274909040963394, "learning_rate": 8.857142857142856e-09, "loss": 0.0, "num_tokens": 9215750.0, "reward": -0.1578125, "reward_std": 0.541982913017273, "rewards/verify_chess_move/mean": -0.1578125, "rewards/verify_chess_move/std": 0.9880678415298462, "step": 125 }, { "completion_length": 359.4, "completions/clipped_ratio": 0.0, "completions/max_length": 359.4, "completions/max_terminated_length": 359.4, "completions/mean_length": 141.78203125, "completions/mean_terminated_length": 141.78203125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00011764982705475423, "frac_reward_zero_std": 0.28125, "grad_norm": 0.16007082164287567, "kl": 0.0005671712748153368, "learning_rate": 9.214285714285714e-09, "loss": 0.0, "num_tokens": 9594031.0, "reward": -0.0703125, "reward_std": 0.6490343451499939, "rewards/verify_chess_move/mean": -0.0703125, "rewards/verify_chess_move/std": 0.9937982559204102, "step": 130 }, { "completion_length": 362.8, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/max_terminated_length": 362.8, "completions/mean_length": 134.271875, "completions/mean_terminated_length": 134.271875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.000122174820403014, "frac_reward_zero_std": 0.3, "grad_norm": 0.13897006213665009, "kl": 0.0004882710232777754, "learning_rate": 9.57142857142857e-09, "loss": 0.0, "num_tokens": 9963755.0, "reward": -0.128125, "reward_std": 0.6375644445419312, "rewards/verify_chess_move/mean": -0.128125, "rewards/verify_chess_move/std": 0.9903793573379517, "step": 135 }, { "completion_length": 369.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 133.190625, "completions/mean_terminated_length": 133.190625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00012669981375127378, "frac_reward_zero_std": 0.3375, "grad_norm": 0.15714672207832336, "kl": 0.0005160793528375506, "learning_rate": 9.928571428571429e-09, "loss": 0.0, "num_tokens": 10332615.0, "reward": -0.228125, "reward_std": 0.5908061385154724, "rewards/verify_chess_move/mean": -0.228125, "rewards/verify_chess_move/std": 0.9728738069534302, "step": 140 }, { "completion_length": 382.2, "completions/clipped_ratio": 0.0, "completions/max_length": 382.2, "completions/max_terminated_length": 382.2, "completions/mean_length": 132.175, "completions/mean_terminated_length": 132.175, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.00013122480709953357, "frac_reward_zero_std": 0.3625, "grad_norm": 0.13153095543384552, "kl": 0.0004978550153282412, "learning_rate": 1.0285714285714285e-08, "loss": 0.0, "num_tokens": 10699239.0, "reward": -0.140625, "reward_std": 0.5735992908477783, "rewards/verify_chess_move/mean": -0.140625, "rewards/verify_chess_move/std": 0.9806422352790832, "step": 145 }, { "completion_length": 440.6, "completions/clipped_ratio": 0.0, "completions/max_length": 440.6, "completions/max_terminated_length": 440.6, "completions/mean_length": 131.3265625, "completions/mean_terminated_length": 131.3265625, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.00013574980044779334, "frac_reward_zero_std": 0.4125, "grad_norm": 0.12787821888923645, "kl": 0.0004551921044367191, "learning_rate": 1.0642857142857143e-08, "loss": 0.0, "num_tokens": 11063913.0, "reward": -0.090625, "reward_std": 0.5184729754924774, "rewards/verify_chess_move/mean": -0.090625, "rewards/verify_chess_move/std": 0.9843689560890198, "step": 150 }, { "completion_length": 450.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 450.8, "completions/max_terminated_length": 378.4, "completions/mean_length": 131.915625, "completions/mean_terminated_length": 131.40939178466797, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00014027479379605313, "frac_reward_zero_std": 0.30625, "grad_norm": 0.1883859932422638, "kl": 0.00044423059653126985, "learning_rate": 1.1e-08, "loss": 0.0, "num_tokens": 11430413.0, "reward": -0.0625, "reward_std": 0.6231558918952942, "rewards/verify_chess_move/mean": -0.0625, "rewards/verify_chess_move/std": 0.985701858997345, "step": 155 }, { "completion_length": 389.4, "completions/clipped_ratio": 0.0, "completions/max_length": 389.4, "completions/max_terminated_length": 389.4, "completions/mean_length": 134.72890625, "completions/mean_terminated_length": 134.72890625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0001447997871443129, "frac_reward_zero_std": 0.25, "grad_norm": 0.1762891262769699, "kl": 0.00048411803390990826, "learning_rate": 1.1357142857142857e-08, "loss": 0.0, "num_tokens": 11799058.0, "reward": -0.0484375, "reward_std": 0.6860690951347351, "rewards/verify_chess_move/mean": -0.0484375, "rewards/verify_chess_move/std": 0.9925577878952027, "step": 160 }, { "completion_length": 422.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 130.77109375, "completions/mean_terminated_length": 130.77109375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.00014932478049257267, "frac_reward_zero_std": 0.39375, "grad_norm": 0.18413178622722626, "kl": 0.00046858095629431773, "learning_rate": 1.1714285714285714e-08, "loss": 0.0, "num_tokens": 12165853.0, "reward": -0.1734375, "reward_std": 0.5556020915508271, "rewards/verify_chess_move/mean": -0.1734375, "rewards/verify_chess_move/std": 0.9716871738433838, "step": 165 }, { "completion_length": 373.6, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 140.82265625, "completions/mean_terminated_length": 140.82265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.00015384977384083246, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14052613079547882, "kl": 0.0005593854582912172, "learning_rate": 1.2071428571428572e-08, "loss": 0.0, "num_tokens": 12545690.0, "reward": -0.028125, "reward_std": 0.5197370767593383, "rewards/verify_chess_move/mean": -0.028125, "rewards/verify_chess_move/std": 0.9818094372749329, "step": 170 }, { "completion_length": 336.8, "completions/clipped_ratio": 0.0, "completions/max_length": 336.8, "completions/max_terminated_length": 336.8, "completions/mean_length": 123.18515625, "completions/mean_terminated_length": 123.18515625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.00015837476718909223, "frac_reward_zero_std": 0.3875, "grad_norm": 0.12865349650382996, "kl": 0.0005256450299384596, "learning_rate": 1.2428571428571428e-08, "loss": 0.0, "num_tokens": 12899319.0, "reward": -0.0359375, "reward_std": 0.5433613240718842, "rewards/verify_chess_move/mean": -0.0359375, "rewards/verify_chess_move/std": 0.988101351261139, "step": 175 }, { "completion_length": 345.8, "completions/clipped_ratio": 0.0, "completions/max_length": 345.8, "completions/max_terminated_length": 345.8, "completions/mean_length": 124.36640625, "completions/mean_terminated_length": 124.36640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.00016289976053735202, "frac_reward_zero_std": 0.3375, "grad_norm": 0.13334743678569794, "kl": 0.0004787738229424576, "learning_rate": 1.2785714285714286e-08, "loss": 0.0, "num_tokens": 13255116.0, "reward": -0.0796875, "reward_std": 0.609423291683197, "rewards/verify_chess_move/mean": -0.0796875, "rewards/verify_chess_move/std": 0.9983352422714233, "step": 180 }, { "completion_length": 337.6, "completions/clipped_ratio": 0.0, "completions/max_length": 337.6, "completions/max_terminated_length": 337.6, "completions/mean_length": 122.5046875, "completions/mean_terminated_length": 122.5046875, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.0001674247538856118, "frac_reward_zero_std": 0.3875, "grad_norm": 0.12823162972927094, "kl": 0.0004965455504134297, "learning_rate": 1.3142857142857144e-08, "loss": 0.0, "num_tokens": 13610018.0, "reward": -0.1734375, "reward_std": 0.5750056803226471, "rewards/verify_chess_move/mean": -0.1734375, "rewards/verify_chess_move/std": 0.9841436147689819, "step": 185 }, { "completion_length": 334.2, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/max_terminated_length": 334.2, "completions/mean_length": 138.1046875, "completions/mean_terminated_length": 138.1046875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.00017194974723387156, "frac_reward_zero_std": 0.375, "grad_norm": 0.16016431152820587, "kl": 0.0005906468390094233, "learning_rate": 1.3499999999999998e-08, "loss": 0.0, "num_tokens": 13986328.0, "reward": -0.1328125, "reward_std": 0.5542501091957093, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9877089023590088, "step": 190 }, { "completion_length": 321.2, "completions/clipped_ratio": 0.0, "completions/max_length": 321.2, "completions/max_terminated_length": 321.2, "completions/mean_length": 127.30703125, "completions/mean_terminated_length": 127.30703125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.00017647474058213135, "frac_reward_zero_std": 0.43125, "grad_norm": 0.13012774288654327, "kl": 0.0005535139311177772, "learning_rate": 1.3857142857142856e-08, "loss": 0.0, "num_tokens": 14346489.0, "reward": -0.0796875, "reward_std": 0.5102582156658173, "rewards/verify_chess_move/mean": -0.0796875, "rewards/verify_chess_move/std": 0.9865104913711548, "step": 195 }, { "completion_length": 433.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 433.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 131.66953125, "completions/mean_terminated_length": 131.16730804443358, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.00018099973393039112, "frac_reward_zero_std": 0.325, "grad_norm": 0.12658877670764923, "kl": 0.00047979095525079173, "learning_rate": 1.4214285714285713e-08, "loss": 0.0, "num_tokens": 14715130.0, "reward": -0.159375, "reward_std": 0.6113808393478394, "rewards/verify_chess_move/mean": -0.159375, "rewards/verify_chess_move/std": 0.983587098121643, "step": 200 }, { "completion_length": 440.6, "completions/clipped_ratio": 0.0, "completions/max_length": 440.6, "completions/max_terminated_length": 440.6, "completions/mean_length": 141.909375, "completions/mean_terminated_length": 141.909375, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 0.0001855247272786509, "frac_reward_zero_std": 0.3125, "grad_norm": 0.09419268369674683, "kl": 0.0004560593707537919, "learning_rate": 1.457142857142857e-08, "loss": 0.0, "num_tokens": 15097374.0, "reward": -0.0953125, "reward_std": 0.6274051308631897, "rewards/verify_chess_move/mean": -0.0953125, "rewards/verify_chess_move/std": 0.9886943340301514, "step": 205 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 134.23984375, "completions/mean_terminated_length": 134.23984375, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.00019004972062691068, "frac_reward_zero_std": 0.325, "grad_norm": 0.13494615256786346, "kl": 0.0005142600552062504, "learning_rate": 1.4928571428571427e-08, "loss": 0.0, "num_tokens": 15466761.0, "reward": -0.0734375, "reward_std": 0.5967473268508912, "rewards/verify_chess_move/mean": -0.0734375, "rewards/verify_chess_move/std": 0.9972174644470215, "step": 210 }, { "completion_length": 424.4, "completions/clipped_ratio": 0.0, "completions/max_length": 424.4, "completions/max_terminated_length": 424.4, "completions/mean_length": 134.4203125, "completions/mean_terminated_length": 134.4203125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.00019457471397517045, "frac_reward_zero_std": 0.38125, "grad_norm": 0.11083868891000748, "kl": 0.000504795299275429, "learning_rate": 1.5285714285714285e-08, "loss": 0.0, "num_tokens": 15837875.0, "reward": -0.1625, "reward_std": 0.5680690228939056, "rewards/verify_chess_move/mean": -0.1625, "rewards/verify_chess_move/std": 0.9862509489059448, "step": 215 }, { "completion_length": 333.4, "completions/clipped_ratio": 0.0, "completions/max_length": 333.4, "completions/max_terminated_length": 333.4, "completions/mean_length": 126.71171875, "completions/mean_terminated_length": 126.71171875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.00019909970732343024, "frac_reward_zero_std": 0.30625, "grad_norm": 0.17613451182842255, "kl": 0.0005315284944117593, "learning_rate": 1.5642857142857143e-08, "loss": 0.0, "num_tokens": 16197322.0, "reward": -0.11875, "reward_std": 0.6311425566673279, "rewards/verify_chess_move/mean": -0.11875, "rewards/verify_chess_move/std": 0.9926429152488708, "step": 220 }, { "completion_length": 328.6, "completions/clipped_ratio": 0.0, "completions/max_length": 328.6, "completions/max_terminated_length": 328.6, "completions/mean_length": 121.63828125, "completions/mean_terminated_length": 121.63828125, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.00020362470067169, "frac_reward_zero_std": 0.3375, "grad_norm": 0.13512490689754486, "kl": 0.0005057926004155888, "learning_rate": 1.6e-08, "loss": 0.0, "num_tokens": 16548115.0, "reward": -0.128125, "reward_std": 0.5725697755813599, "rewards/verify_chess_move/mean": -0.128125, "rewards/verify_chess_move/std": 0.9853529453277587, "step": 225 }, { "completion_length": 368.4, "completions/clipped_ratio": 0.0, "completions/max_length": 368.4, "completions/max_terminated_length": 368.4, "completions/mean_length": 134.88125, "completions/mean_terminated_length": 134.88125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0002081496940199498, "frac_reward_zero_std": 0.4, "grad_norm": 0.1267419010400772, "kl": 0.0005006938030874153, "learning_rate": 1.635714285714286e-08, "loss": 0.0, "num_tokens": 16920099.0, "reward": -0.2515625, "reward_std": 0.5324725449085236, "rewards/verify_chess_move/mean": -0.2515625, "rewards/verify_chess_move/std": 0.9591513395309448, "step": 230 }, { "completion_length": 374.4, "completions/clipped_ratio": 0.0, "completions/max_length": 374.4, "completions/max_terminated_length": 374.4, "completions/mean_length": 133.70859375, "completions/mean_terminated_length": 133.70859375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.00021267468736820957, "frac_reward_zero_std": 0.3375, "grad_norm": 0.17367476224899292, "kl": 0.0005452288108244829, "learning_rate": 1.6714285714285714e-08, "loss": 0.0, "num_tokens": 17291326.0, "reward": -0.1671875, "reward_std": 0.598534107208252, "rewards/verify_chess_move/mean": -0.1671875, "rewards/verify_chess_move/std": 0.981266450881958, "step": 235 }, { "completion_length": 431.2, "completions/clipped_ratio": 0.0, "completions/max_length": 431.2, "completions/max_terminated_length": 431.2, "completions/mean_length": 131.4125, "completions/mean_terminated_length": 131.4125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.00021719968071646934, "frac_reward_zero_std": 0.3875, "grad_norm": 0.08549005538225174, "kl": 0.0004266782946615422, "learning_rate": 1.7071428571428568e-08, "loss": 0.0, "num_tokens": 17657574.0, "reward": -0.13125, "reward_std": 0.5547707617282868, "rewards/verify_chess_move/mean": -0.13125, "rewards/verify_chess_move/std": 0.9902655959129334, "step": 240 }, { "completion_length": 345.2, "completions/clipped_ratio": 0.0, "completions/max_length": 345.2, "completions/max_terminated_length": 345.2, "completions/mean_length": 133.85703125, "completions/mean_terminated_length": 133.85703125, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.00022172467406472913, "frac_reward_zero_std": 0.43125, "grad_norm": 0.15870779752731323, "kl": 0.0005168962332390947, "learning_rate": 1.7428571428571426e-08, "loss": 0.0, "num_tokens": 18028567.0, "reward": -0.103125, "reward_std": 0.5113190710544586, "rewards/verify_chess_move/mean": -0.103125, "rewards/verify_chess_move/std": 0.9911036968231202, "step": 245 }, { "completion_length": 324.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 126.6125, "completions/mean_terminated_length": 126.6125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0002262496674129889, "frac_reward_zero_std": 0.3625, "grad_norm": 0.17144867777824402, "kl": 0.0005444973166959244, "learning_rate": 1.7785714285714284e-08, "loss": 0.0, "num_tokens": 18387095.0, "reward": 0.021875, "reward_std": 0.5683930456638336, "rewards/verify_chess_move/mean": 0.021875, "rewards/verify_chess_move/std": 0.997988498210907, "step": 250 }, { "completion_length": 384.8, "completions/clipped_ratio": 0.0, "completions/max_length": 384.8, "completions/max_terminated_length": 384.8, "completions/mean_length": 134.2078125, "completions/mean_terminated_length": 134.2078125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0002307746607612487, "frac_reward_zero_std": 0.29375, "grad_norm": 0.15699464082717896, "kl": 0.000474457587733923, "learning_rate": 1.814285714285714e-08, "loss": 0.0, "num_tokens": 18759193.0, "reward": -0.1859375, "reward_std": 0.6310966312885284, "rewards/verify_chess_move/mean": -0.1859375, "rewards/verify_chess_move/std": 0.9766289234161377, "step": 255 }, { "completion_length": 347.4, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/max_terminated_length": 347.4, "completions/mean_length": 138.4046875, "completions/mean_terminated_length": 138.4046875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.00023529965410950846, "frac_reward_zero_std": 0.29375, "grad_norm": 0.1533270925283432, "kl": 0.0005329363713826752, "learning_rate": 1.8499999999999997e-08, "loss": 0.0, "num_tokens": 19136895.0, "reward": -0.1375, "reward_std": 0.6430931091308594, "rewards/verify_chess_move/mean": -0.1375, "rewards/verify_chess_move/std": 0.9909014463424682, "step": 260 }, { "completion_length": 372.8, "completions/clipped_ratio": 0.0, "completions/max_length": 372.8, "completions/max_terminated_length": 372.8, "completions/mean_length": 136.26875, "completions/mean_terminated_length": 136.26875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.00023982464745776823, "frac_reward_zero_std": 0.25625, "grad_norm": 0.17376801371574402, "kl": 0.0005278316754811386, "learning_rate": 1.8857142857142855e-08, "loss": 0.0, "num_tokens": 19508071.0, "reward": -0.015625, "reward_std": 0.6858425498008728, "rewards/verify_chess_move/mean": -0.015625, "rewards/verify_chess_move/std": 0.9975508928298951, "step": 265 }, { "completion_length": 336.6, "completions/clipped_ratio": 0.0, "completions/max_length": 336.6, "completions/max_terminated_length": 336.6, "completions/mean_length": 138.13515625, "completions/mean_terminated_length": 138.13515625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.000244349640806028, "frac_reward_zero_std": 0.40625, "grad_norm": 0.11872056126594543, "kl": 0.0005812303457787493, "learning_rate": 1.9214285714285713e-08, "loss": 0.0, "num_tokens": 19884660.0, "reward": -0.134375, "reward_std": 0.5399769186973572, "rewards/verify_chess_move/mean": -0.134375, "rewards/verify_chess_move/std": 0.9905327558517456, "step": 270 }, { "completion_length": 359.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 135.2515625, "completions/mean_terminated_length": 135.2515625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0002488746341542878, "frac_reward_zero_std": 0.29375, "grad_norm": 0.16553854942321777, "kl": 0.0005344869864529755, "learning_rate": 1.957142857142857e-08, "loss": 0.0, "num_tokens": 20256286.0, "reward": -0.0875, "reward_std": 0.6294739723205567, "rewards/verify_chess_move/mean": -0.0875, "rewards/verify_chess_move/std": 0.9920276761054992, "step": 275 }, { "completion_length": 388.6, "completions/clipped_ratio": 0.0, "completions/max_length": 388.6, "completions/max_terminated_length": 388.6, "completions/mean_length": 133.66640625, "completions/mean_terminated_length": 133.66640625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.00025339962750254755, "frac_reward_zero_std": 0.3875, "grad_norm": 0.17109662294387817, "kl": 0.0005051863118751499, "learning_rate": 1.9928571428571426e-08, "loss": 0.0, "num_tokens": 20627275.0, "reward": -0.08125, "reward_std": 0.5406303405761719, "rewards/verify_chess_move/mean": -0.08125, "rewards/verify_chess_move/std": 0.9957510828971863, "step": 280 }, { "completion_length": 341.8, "completions/clipped_ratio": 0.0, "completions/max_length": 341.8, "completions/max_terminated_length": 341.8, "completions/mean_length": 127.85234375, "completions/mean_terminated_length": 127.85234375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.0002579246208508074, "frac_reward_zero_std": 0.39375, "grad_norm": 0.1669045239686966, "kl": 0.0005321152788383188, "learning_rate": 2.0285714285714283e-08, "loss": 0.0, "num_tokens": 20988270.0, "reward": -0.10625, "reward_std": 0.5473548293113708, "rewards/verify_chess_move/mean": -0.10625, "rewards/verify_chess_move/std": 0.9906241655349731, "step": 285 }, { "completion_length": 347.2, "completions/clipped_ratio": 0.0, "completions/max_length": 347.2, "completions/max_terminated_length": 347.2, "completions/mean_length": 126.2578125, "completions/mean_terminated_length": 126.2578125, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "epoch": 0.00026244961419906714, "frac_reward_zero_std": 0.375, "grad_norm": 0.11751631647348404, "kl": 0.0005258973545096523, "learning_rate": 2.064285714285714e-08, "loss": 0.0, "num_tokens": 21348672.0, "reward": -0.115625, "reward_std": 0.5643380999565124, "rewards/verify_chess_move/mean": -0.115625, "rewards/verify_chess_move/std": 0.9919114589691163, "step": 290 }, { "completion_length": 377.8, "completions/clipped_ratio": 0.0, "completions/max_length": 377.8, "completions/max_terminated_length": 377.8, "completions/mean_length": 137.0265625, "completions/mean_terminated_length": 137.0265625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0002669746075473269, "frac_reward_zero_std": 0.35625, "grad_norm": 0.11491072922945023, "kl": 0.0004985606148693478, "learning_rate": 2.1e-08, "loss": 0.0, "num_tokens": 21722874.0, "reward": -0.159375, "reward_std": 0.5831096708774567, "rewards/verify_chess_move/mean": -0.159375, "rewards/verify_chess_move/std": 0.9860126495361328, "step": 295 }, { "completion_length": 436.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.2, "completions/max_terminated_length": 355.0, "completions/mean_length": 140.33984375, "completions/mean_terminated_length": 139.85765075683594, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0002714996008955867, "frac_reward_zero_std": 0.31875, "grad_norm": 0.14014610648155212, "kl": 0.0005102209881442832, "learning_rate": 2.1357142857142854e-08, "loss": 0.0, "num_tokens": 22101981.0, "reward": -0.1140625, "reward_std": 0.6208882927894592, "rewards/verify_chess_move/mean": -0.1140625, "rewards/verify_chess_move/std": 0.9906222820281982, "step": 300 }, { "completion_length": 390.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 129.290625, "completions/mean_terminated_length": 129.290625, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.00027602459424384644, "frac_reward_zero_std": 0.41875, "grad_norm": 0.1420086771249771, "kl": 0.0007733490509963303, "learning_rate": 2.1714285714285712e-08, "loss": 0.0, "num_tokens": 22467153.0, "reward": -0.121875, "reward_std": 0.5165288209915161, "rewards/verify_chess_move/mean": -0.121875, "rewards/verify_chess_move/std": 0.9819389820098877, "step": 305 }, { "completion_length": 479.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 479.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 138.40390625, "completions/mean_terminated_length": 137.89886474609375, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.00028054958759210627, "frac_reward_zero_std": 0.3, "grad_norm": 0.11088592559099197, "kl": 0.0004125326018311171, "learning_rate": 2.207142857142857e-08, "loss": 0.0, "num_tokens": 22841662.0, "reward": -0.1390625, "reward_std": 0.6268893957138062, "rewards/verify_chess_move/mean": -0.1390625, "rewards/verify_chess_move/std": 0.9799937605857849, "step": 310 }, { "completion_length": 359.8, "completions/clipped_ratio": 0.0, "completions/max_length": 359.8, "completions/max_terminated_length": 359.8, "completions/mean_length": 133.90703125, "completions/mean_terminated_length": 133.90703125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.00028507458094036603, "frac_reward_zero_std": 0.38125, "grad_norm": 0.12930209934711456, "kl": 0.0005243417936071637, "learning_rate": 2.2428571428571428e-08, "loss": 0.0, "num_tokens": 23211647.0, "reward": -0.1375, "reward_std": 0.5603400588035583, "rewards/verify_chess_move/mean": -0.1375, "rewards/verify_chess_move/std": 0.9823669552803039, "step": 315 }, { "completion_length": 354.4, "completions/clipped_ratio": 0.0, "completions/max_length": 354.4, "completions/max_terminated_length": 354.4, "completions/mean_length": 142.41953125, "completions/mean_terminated_length": 142.41953125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0002895995742886258, "frac_reward_zero_std": 0.35, "grad_norm": 0.13130724430084229, "kl": 0.0005479749903315678, "learning_rate": 2.2785714285714283e-08, "loss": 0.0, "num_tokens": 23593984.0, "reward": -0.096875, "reward_std": 0.5855518579483032, "rewards/verify_chess_move/mean": -0.096875, "rewards/verify_chess_move/std": 0.9845470547676086, "step": 320 }, { "completion_length": 400.8, "completions/clipped_ratio": 0.0, "completions/max_length": 400.8, "completions/max_terminated_length": 400.8, "completions/mean_length": 131.2953125, "completions/mean_terminated_length": 131.2953125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.00029412456763688557, "frac_reward_zero_std": 0.31875, "grad_norm": 0.16545194387435913, "kl": 0.00047776172104931904, "learning_rate": 2.314285714285714e-08, "loss": 0.0, "num_tokens": 23959530.0, "reward": -0.1390625, "reward_std": 0.6163660407066345, "rewards/verify_chess_move/mean": -0.1390625, "rewards/verify_chess_move/std": 0.9894736051559448, "step": 325 }, { "completion_length": 352.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 144.3890625, "completions/mean_terminated_length": 144.3890625, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.00029864956098514533, "frac_reward_zero_std": 0.3875, "grad_norm": 0.16169776022434235, "kl": 0.0006146742874079791, "learning_rate": 2.35e-08, "loss": 0.0, "num_tokens": 24345892.0, "reward": -0.165625, "reward_std": 0.5572895646095276, "rewards/verify_chess_move/mean": -0.165625, "rewards/verify_chess_move/std": 0.9859201073646545, "step": 330 }, { "completion_length": 387.8, "completions/clipped_ratio": 0.0, "completions/max_length": 387.8, "completions/max_terminated_length": 387.8, "completions/mean_length": 140.94296875, "completions/mean_terminated_length": 140.94296875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.00030317455433340515, "frac_reward_zero_std": 0.3625, "grad_norm": 0.13191354274749756, "kl": 0.0005175471806069254, "learning_rate": 2.3857142857142857e-08, "loss": 0.0, "num_tokens": 24727443.0, "reward": -0.190625, "reward_std": 0.582058709859848, "rewards/verify_chess_move/mean": -0.190625, "rewards/verify_chess_move/std": 0.9774404287338256, "step": 335 }, { "completion_length": 333.8, "completions/clipped_ratio": 0.0, "completions/max_length": 333.8, "completions/max_terminated_length": 333.8, "completions/mean_length": 124.7890625, "completions/mean_terminated_length": 124.7890625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0003076995476816649, "frac_reward_zero_std": 0.375, "grad_norm": 0.16765357553958893, "kl": 0.000551427713890007, "learning_rate": 2.4214285714285715e-08, "loss": 0.0, "num_tokens": 25085357.0, "reward": -0.0265625, "reward_std": 0.5631803512573242, "rewards/verify_chess_move/mean": -0.0265625, "rewards/verify_chess_move/std": 0.9955638170242309, "step": 340 }, { "completion_length": 367.2, "completions/clipped_ratio": 0.0, "completions/max_length": 367.2, "completions/max_terminated_length": 367.2, "completions/mean_length": 130.0640625, "completions/mean_terminated_length": 130.0640625, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.0003122245410299247, "frac_reward_zero_std": 0.34375, "grad_norm": 0.14440035820007324, "kl": 0.0005255147939351446, "learning_rate": 2.457142857142857e-08, "loss": 0.0, "num_tokens": 25449031.0, "reward": -0.040625, "reward_std": 0.6047887682914734, "rewards/verify_chess_move/mean": -0.040625, "rewards/verify_chess_move/std": 0.9913957476615906, "step": 345 }, { "completion_length": 325.4, "completions/clipped_ratio": 0.0, "completions/max_length": 325.4, "completions/max_terminated_length": 325.4, "completions/mean_length": 118.26640625, "completions/mean_terminated_length": 118.26640625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.00031674953437818446, "frac_reward_zero_std": 0.36875, "grad_norm": 0.13467754423618317, "kl": 0.0005174140748749779, "learning_rate": 2.4928571428571427e-08, "loss": 0.0, "num_tokens": 25794660.0, "reward": -0.125, "reward_std": 0.5618742406368256, "rewards/verify_chess_move/mean": -0.125, "rewards/verify_chess_move/std": 0.9868377566337585, "step": 350 }, { "completion_length": 439.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 439.2, "completions/max_terminated_length": 426.4, "completions/mean_length": 135.628125, "completions/mean_terminated_length": 135.14762268066406, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0003212745277264442, "frac_reward_zero_std": 0.35, "grad_norm": 0.1537379026412964, "kl": 0.00046782002573309, "learning_rate": 2.5285714285714285e-08, "loss": 0.0, "num_tokens": 26167568.0, "reward": -0.1125, "reward_std": 0.5744495213031768, "rewards/verify_chess_move/mean": -0.1125, "rewards/verify_chess_move/std": 0.9902488708496093, "step": 355 }, { "completion_length": 372.4, "completions/clipped_ratio": 0.0, "completions/max_length": 372.4, "completions/max_terminated_length": 372.4, "completions/mean_length": 136.69140625, "completions/mean_terminated_length": 136.69140625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.00032579952107470404, "frac_reward_zero_std": 0.34375, "grad_norm": 0.11923245340585709, "kl": 0.0005230741626292001, "learning_rate": 2.5642857142857143e-08, "loss": 0.0, "num_tokens": 26541189.0, "reward": -0.13125, "reward_std": 0.5920647501945495, "rewards/verify_chess_move/mean": -0.13125, "rewards/verify_chess_move/std": 0.987118148803711, "step": 360 }, { "completion_length": 364.8, "completions/clipped_ratio": 0.0, "completions/max_length": 364.8, "completions/max_terminated_length": 364.8, "completions/mean_length": 135.6203125, "completions/mean_terminated_length": 135.6203125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0003303245144229638, "frac_reward_zero_std": 0.40625, "grad_norm": 0.12758605182170868, "kl": 0.0005417275539002731, "learning_rate": 2.5999999999999998e-08, "loss": 0.0, "num_tokens": 26914079.0, "reward": -0.084375, "reward_std": 0.5465045988559722, "rewards/verify_chess_move/mean": -0.084375, "rewards/verify_chess_move/std": 0.996761679649353, "step": 365 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 126.36328125, "completions/mean_terminated_length": 126.36328125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0003348495077712236, "frac_reward_zero_std": 0.33125, "grad_norm": 0.14674222469329834, "kl": 0.0004758536094414012, "learning_rate": 2.6357142857142856e-08, "loss": 0.0, "num_tokens": 27271528.0, "reward": -0.0609375, "reward_std": 0.5927047967910767, "rewards/verify_chess_move/mean": -0.0609375, "rewards/verify_chess_move/std": 0.9928872108459472, "step": 370 }, { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 130.10859375, "completions/mean_terminated_length": 130.10859375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.00033937450111948335, "frac_reward_zero_std": 0.3375, "grad_norm": 0.15215511620044708, "kl": 0.00048488995062143656, "learning_rate": 2.6714285714285714e-08, "loss": 0.0, "num_tokens": 27634811.0, "reward": -0.1609375, "reward_std": 0.5821345329284668, "rewards/verify_chess_move/mean": -0.1609375, "rewards/verify_chess_move/std": 0.9811620950698853, "step": 375 }, { "completion_length": 330.2, "completions/clipped_ratio": 0.0, "completions/max_length": 330.2, "completions/max_terminated_length": 330.2, "completions/mean_length": 131.51796875, "completions/mean_terminated_length": 131.51796875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0003438994944677431, "frac_reward_zero_std": 0.35, "grad_norm": 0.1428312510251999, "kl": 0.0005702257540178834, "learning_rate": 2.7071428571428572e-08, "loss": 0.0, "num_tokens": 28003826.0, "reward": -0.0625, "reward_std": 0.5738624572753906, "rewards/verify_chess_move/mean": -0.0625, "rewards/verify_chess_move/std": 0.9934999108314514, "step": 380 }, { "completion_length": 406.2, "completions/clipped_ratio": 0.0, "completions/max_length": 406.2, "completions/max_terminated_length": 406.2, "completions/mean_length": 129.625, "completions/mean_terminated_length": 129.625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.00034842448781600293, "frac_reward_zero_std": 0.30625, "grad_norm": 0.1726021021604538, "kl": 0.00043868992979696487, "learning_rate": 2.7428571428571426e-08, "loss": 0.0, "num_tokens": 28365714.0, "reward": 0.0109375, "reward_std": 0.6258888483047486, "rewards/verify_chess_move/mean": 0.0109375, "rewards/verify_chess_move/std": 0.9884800791740418, "step": 385 }, { "completion_length": 392.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 140.3890625, "completions/mean_terminated_length": 140.3890625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0003529494811642627, "frac_reward_zero_std": 0.30625, "grad_norm": 0.13546356558799744, "kl": 0.0005149330100721272, "learning_rate": 2.7785714285714284e-08, "loss": 0.0, "num_tokens": 28742676.0, "reward": -0.05625, "reward_std": 0.6401909947395324, "rewards/verify_chess_move/mean": -0.05625, "rewards/verify_chess_move/std": 0.9982851505279541, "step": 390 }, { "completion_length": 334.2, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/max_terminated_length": 334.2, "completions/mean_length": 132.746875, "completions/mean_terminated_length": 132.746875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.00035747447451252247, "frac_reward_zero_std": 0.38125, "grad_norm": 0.17535258829593658, "kl": 0.0005464774749270873, "learning_rate": 2.8142857142857142e-08, "loss": 0.0, "num_tokens": 29113920.0, "reward": -0.234375, "reward_std": 0.5519291043281556, "rewards/verify_chess_move/mean": -0.234375, "rewards/verify_chess_move/std": 0.9616544246673584, "step": 395 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.00036199946786078224, "frac_reward_zero_std": 0.35625, "grad_norm": 0.12265703827142715, "kl": 0.000492412275980314, "learning_rate": 2.85e-08, "loss": 0.0, "num_tokens": 29490032.0, "reward": -0.01875, "reward_std": 0.58106170296669, "rewards/verify_chess_move/mean": -0.01875, "rewards/verify_chess_move/std": 0.9971416831016541, "step": 400 }, { "completion_length": 342.2, "completions/clipped_ratio": 0.0, "completions/max_length": 342.2, "completions/max_terminated_length": 342.2, "completions/mean_length": 137.03515625, "completions/mean_terminated_length": 137.03515625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.000366524461209042, "frac_reward_zero_std": 0.30625, "grad_norm": 0.16767562925815582, "kl": 0.0005507564524123154, "learning_rate": 2.8857142857142855e-08, "loss": 0.0, "num_tokens": 29864629.0, "reward": -0.146875, "reward_std": 0.6299412608146667, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9863567352294922, "step": 405 }, { "completion_length": 371.4, "completions/clipped_ratio": 0.0, "completions/max_length": 371.4, "completions/max_terminated_length": 371.4, "completions/mean_length": 133.615625, "completions/mean_terminated_length": 133.615625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0003710494545573018, "frac_reward_zero_std": 0.38125, "grad_norm": 0.1803787648677826, "kl": 0.0005433150503449724, "learning_rate": 2.9214285714285713e-08, "loss": 0.0, "num_tokens": 30236777.0, "reward": -0.175, "reward_std": 0.5470270097255707, "rewards/verify_chess_move/mean": -0.175, "rewards/verify_chess_move/std": 0.9848761320114136, "step": 410 }, { "completion_length": 342.8, "completions/clipped_ratio": 0.0, "completions/max_length": 342.8, "completions/max_terminated_length": 342.8, "completions/mean_length": 135.16875, "completions/mean_terminated_length": 135.16875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0003755744479055616, "frac_reward_zero_std": 0.36875, "grad_norm": 0.1678856909275055, "kl": 0.0005993315775413066, "learning_rate": 2.957142857142857e-08, "loss": 0.0, "num_tokens": 30606537.0, "reward": -0.0140625, "reward_std": 0.5743861079216004, "rewards/verify_chess_move/mean": -0.0140625, "rewards/verify_chess_move/std": 0.9974093437194824, "step": 415 }, { "completion_length": 377.4, "completions/clipped_ratio": 0.0, "completions/max_length": 377.4, "completions/max_terminated_length": 377.4, "completions/mean_length": 136.06328125, "completions/mean_terminated_length": 136.06328125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.00038009944125382136, "frac_reward_zero_std": 0.30625, "grad_norm": 0.14279493689537048, "kl": 0.0005302856213347696, "learning_rate": 2.992857142857143e-08, "loss": 0.0, "num_tokens": 30979058.0, "reward": -0.0890625, "reward_std": 0.6326257467269898, "rewards/verify_chess_move/mean": -0.0890625, "rewards/verify_chess_move/std": 0.993071448802948, "step": 420 }, { "completion_length": 372.6, "completions/clipped_ratio": 0.0, "completions/max_length": 372.6, "completions/max_terminated_length": 372.6, "completions/mean_length": 131.31640625, "completions/mean_terminated_length": 131.31640625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0003846244346020811, "frac_reward_zero_std": 0.35, "grad_norm": 0.15547646582126617, "kl": 0.000496222020501591, "learning_rate": 3.0285714285714284e-08, "loss": 0.0, "num_tokens": 31344015.0, "reward": -0.078125, "reward_std": 0.5692897796630859, "rewards/verify_chess_move/mean": -0.078125, "rewards/verify_chess_move/std": 0.9948380827903748, "step": 425 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 126.22890625, "completions/mean_terminated_length": 126.22890625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0003891494279503409, "frac_reward_zero_std": 0.35625, "grad_norm": 0.13576434552669525, "kl": 0.0005116719296893279, "learning_rate": 3.064285714285714e-08, "loss": 0.0, "num_tokens": 31702220.0, "reward": -0.128125, "reward_std": 0.5842160403728485, "rewards/verify_chess_move/mean": -0.128125, "rewards/verify_chess_move/std": 0.9895259976387024, "step": 430 }, { "completion_length": 401.2, "completions/clipped_ratio": 0.0, "completions/max_length": 401.2, "completions/max_terminated_length": 401.2, "completions/mean_length": 134.1234375, "completions/mean_terminated_length": 134.1234375, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.0003936744212986007, "frac_reward_zero_std": 0.3625, "grad_norm": 0.10280859470367432, "kl": 0.0005213167489273474, "learning_rate": 3.1e-08, "loss": 0.0, "num_tokens": 32075202.0, "reward": -0.2046875, "reward_std": 0.583119535446167, "rewards/verify_chess_move/mean": -0.2046875, "rewards/verify_chess_move/std": 0.9786143183708191, "step": 435 }, { "completion_length": 372.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 119.76171875, "completions/mean_terminated_length": 119.76171875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0003981994146468605, "frac_reward_zero_std": 0.33125, "grad_norm": 0.1641569435596466, "kl": 0.0004889068720331125, "learning_rate": 3.1357142857142854e-08, "loss": 0.0, "num_tokens": 32423745.0, "reward": -0.1203125, "reward_std": 0.616104805469513, "rewards/verify_chess_move/mean": -0.1203125, "rewards/verify_chess_move/std": 0.9919061779975891, "step": 440 }, { "completion_length": 508.8, "completions/clipped_ratio": 0.00234375, "completions/max_length": 508.8, "completions/max_terminated_length": 342.4, "completions/mean_length": 139.88671875, "completions/mean_terminated_length": 138.4181365966797, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00040272440799512025, "frac_reward_zero_std": 0.4, "grad_norm": 0.1221618726849556, "kl": 0.00043173503700018047, "learning_rate": 3.1714285714285716e-08, "loss": 0.0, "num_tokens": 32804888.0, "reward": -0.1203125, "reward_std": 0.5378241300582886, "rewards/verify_chess_move/mean": -0.1203125, "rewards/verify_chess_move/std": 0.9895591616630555, "step": 445 }, { "completion_length": 521.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 521.2, "completions/max_terminated_length": 453.4, "completions/mean_length": 122.4921875, "completions/mean_terminated_length": 121.97279205322266, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.00040724940134338, "frac_reward_zero_std": 0.26875, "grad_norm": 0.10401973873376846, "kl": 0.0006764737394860277, "learning_rate": 3.207142857142857e-08, "loss": 0.0, "num_tokens": 33155014.0, "reward": -0.05625, "reward_std": 0.6460898280143738, "rewards/verify_chess_move/mean": -0.05625, "rewards/verify_chess_move/std": 0.9981199622154235, "step": 450 }, { "completion_length": 360.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 134.1046875, "completions/mean_terminated_length": 134.1046875, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.0004117743946916398, "frac_reward_zero_std": 0.4125, "grad_norm": 0.14105719327926636, "kl": 0.0005613567346699711, "learning_rate": 3.242857142857143e-08, "loss": 0.0, "num_tokens": 33528228.0, "reward": -0.1140625, "reward_std": 0.529031777381897, "rewards/verify_chess_move/mean": -0.1140625, "rewards/verify_chess_move/std": 0.9862160325050354, "step": 455 }, { "completion_length": 334.2, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/max_terminated_length": 334.2, "completions/mean_length": 126.07265625, "completions/mean_terminated_length": 126.07265625, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0004162993880398996, "frac_reward_zero_std": 0.29375, "grad_norm": 0.14409011602401733, "kl": 0.0005468539143294038, "learning_rate": 3.2785714285714286e-08, "loss": 0.0, "num_tokens": 33887873.0, "reward": -0.16875, "reward_std": 0.6185847878456116, "rewards/verify_chess_move/mean": -0.16875, "rewards/verify_chess_move/std": 0.9822284817695618, "step": 460 }, { "completion_length": 325.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 127.70859375, "completions/mean_terminated_length": 127.70859375, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.00042082438138815937, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1491907835006714, "kl": 0.0005285769154397713, "learning_rate": 3.314285714285714e-08, "loss": 0.0, "num_tokens": 34249260.0, "reward": -0.06875, "reward_std": 0.6235193848609925, "rewards/verify_chess_move/mean": -0.06875, "rewards/verify_chess_move/std": 0.9993003487586976, "step": 465 }, { "completion_length": 373.4, "completions/clipped_ratio": 0.0, "completions/max_length": 373.4, "completions/max_terminated_length": 373.4, "completions/mean_length": 123.734375, "completions/mean_terminated_length": 123.734375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.00042534937473641914, "frac_reward_zero_std": 0.2625, "grad_norm": 0.13455362617969513, "kl": 0.0004650644102184742, "learning_rate": 3.35e-08, "loss": 0.0, "num_tokens": 34602248.0, "reward": -0.103125, "reward_std": 0.6683455228805542, "rewards/verify_chess_move/mean": -0.103125, "rewards/verify_chess_move/std": 0.995215904712677, "step": 470 }, { "completion_length": 343.8, "completions/clipped_ratio": 0.0, "completions/max_length": 343.8, "completions/max_terminated_length": 343.8, "completions/mean_length": 128.9078125, "completions/mean_terminated_length": 128.9078125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0004298743680846789, "frac_reward_zero_std": 0.40625, "grad_norm": 0.15551581978797913, "kl": 0.0005434768781924504, "learning_rate": 3.385714285714286e-08, "loss": 0.0, "num_tokens": 34966538.0, "reward": -0.2046875, "reward_std": 0.5529130935668946, "rewards/verify_chess_move/mean": -0.2046875, "rewards/verify_chess_move/std": 0.9777985334396362, "step": 475 }, { "completion_length": 356.8, "completions/clipped_ratio": 0.0, "completions/max_length": 356.8, "completions/max_terminated_length": 356.8, "completions/mean_length": 127.6296875, "completions/mean_terminated_length": 127.6296875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.00043439936143293867, "frac_reward_zero_std": 0.34375, "grad_norm": 0.15893498063087463, "kl": 0.00046945591138864985, "learning_rate": 3.421428571428571e-08, "loss": 0.0, "num_tokens": 35327128.0, "reward": -0.1296875, "reward_std": 0.6076903223991394, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.992763364315033, "step": 480 }, { "completion_length": 390.6, "completions/clipped_ratio": 0.0, "completions/max_length": 390.6, "completions/max_terminated_length": 390.6, "completions/mean_length": 126.5484375, "completions/mean_terminated_length": 126.5484375, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.0004389243547811985, "frac_reward_zero_std": 0.3, "grad_norm": 0.12033933401107788, "kl": 0.00045100612269379783, "learning_rate": 3.457142857142857e-08, "loss": 0.0, "num_tokens": 35685886.0, "reward": -0.1, "reward_std": 0.6341969847679139, "rewards/verify_chess_move/mean": -0.1, "rewards/verify_chess_move/std": 0.9948747396469116, "step": 485 }, { "completion_length": 395.4, "completions/clipped_ratio": 0.0, "completions/max_length": 395.4, "completions/max_terminated_length": 395.4, "completions/mean_length": 126.984375, "completions/mean_terminated_length": 126.984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.00044344934812945826, "frac_reward_zero_std": 0.3375, "grad_norm": 0.12334916740655899, "kl": 0.0004742785738926614, "learning_rate": 3.492857142857143e-08, "loss": 0.0, "num_tokens": 36047554.0, "reward": -0.109375, "reward_std": 0.6063590049743652, "rewards/verify_chess_move/mean": -0.109375, "rewards/verify_chess_move/std": 0.9926255941390991, "step": 490 }, { "completion_length": 436.6, "completions/clipped_ratio": 0.0, "completions/max_length": 436.6, "completions/max_terminated_length": 436.6, "completions/mean_length": 130.38359375, "completions/mean_terminated_length": 130.38359375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.00044797434147771803, "frac_reward_zero_std": 0.3875, "grad_norm": 0.11415991932153702, "kl": 0.00045180364922998707, "learning_rate": 3.528571428571429e-08, "loss": 0.0, "num_tokens": 36412853.0, "reward": -0.1984375, "reward_std": 0.5505245387554168, "rewards/verify_chess_move/mean": -0.1984375, "rewards/verify_chess_move/std": 0.974141788482666, "step": 495 }, { "completion_length": 392.6, "completions/clipped_ratio": 0.0, "completions/max_length": 392.6, "completions/max_terminated_length": 392.6, "completions/mean_length": 137.8, "completions/mean_terminated_length": 137.8, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0004524993348259778, "frac_reward_zero_std": 0.3625, "grad_norm": 0.12447461485862732, "kl": 0.0005110210940983961, "learning_rate": 3.5642857142857143e-08, "loss": 0.0, "num_tokens": 36787645.0, "reward": -0.1109375, "reward_std": 0.5906818389892579, "rewards/verify_chess_move/mean": -0.1109375, "rewards/verify_chess_move/std": 0.985427975654602, "step": 500 }, { "completion_length": 330.8, "completions/clipped_ratio": 0.0, "completions/max_length": 330.8, "completions/max_terminated_length": 330.8, "completions/mean_length": 140.86171875, "completions/mean_terminated_length": 140.86171875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.00045702432817423756, "frac_reward_zero_std": 0.36875, "grad_norm": 0.14481520652770996, "kl": 0.000606859936760884, "learning_rate": 3.6e-08, "loss": 0.0, "num_tokens": 37171372.0, "reward": -0.0875, "reward_std": 0.573703122138977, "rewards/verify_chess_move/mean": -0.0875, "rewards/verify_chess_move/std": 0.9953174352645874, "step": 505 }, { "completion_length": 367.4, "completions/clipped_ratio": 0.0, "completions/max_length": 367.4, "completions/max_terminated_length": 367.4, "completions/mean_length": 129.75234375, "completions/mean_terminated_length": 129.75234375, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "epoch": 0.0004615493215224974, "frac_reward_zero_std": 0.3375, "grad_norm": 0.1256173700094223, "kl": 0.0004863768282120873, "learning_rate": 3.635714285714286e-08, "loss": 0.0, "num_tokens": 37535063.0, "reward": -0.146875, "reward_std": 0.5990078687667847, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9898825407028198, "step": 510 }, { "completion_length": 350.4, "completions/clipped_ratio": 0.0, "completions/max_length": 350.4, "completions/max_terminated_length": 350.4, "completions/mean_length": 135.815625, "completions/mean_terminated_length": 135.815625, "completions/min_length": 43.4, "completions/min_terminated_length": 43.4, "epoch": 0.00046607431487075715, "frac_reward_zero_std": 0.40625, "grad_norm": 0.15724559128284454, "kl": 0.0005583493890298996, "learning_rate": 3.6714285714285714e-08, "loss": 0.0, "num_tokens": 37910771.0, "reward": -0.18125, "reward_std": 0.5278884291648864, "rewards/verify_chess_move/mean": -0.18125, "rewards/verify_chess_move/std": 0.9828167676925659, "step": 515 }, { "completion_length": 367.6, "completions/clipped_ratio": 0.0, "completions/max_length": 367.6, "completions/max_terminated_length": 367.6, "completions/mean_length": 137.071875, "completions/mean_terminated_length": 137.071875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0004705993082190169, "frac_reward_zero_std": 0.29375, "grad_norm": 0.15261414647102356, "kl": 0.0005160929951671279, "learning_rate": 3.7071428571428575e-08, "loss": 0.0, "num_tokens": 38282719.0, "reward": -0.0328125, "reward_std": 0.644834017753601, "rewards/verify_chess_move/mean": -0.0328125, "rewards/verify_chess_move/std": 0.9994126915931701, "step": 520 }, { "completion_length": 342.2, "completions/clipped_ratio": 0.0, "completions/max_length": 342.2, "completions/max_terminated_length": 342.2, "completions/mean_length": 127.89921875, "completions/mean_terminated_length": 127.89921875, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 0.0004751243015672767, "frac_reward_zero_std": 0.3, "grad_norm": 0.16111087799072266, "kl": 0.0005389956535509554, "learning_rate": 3.742857142857143e-08, "loss": 0.0, "num_tokens": 38641502.0, "reward": -0.040625, "reward_std": 0.6195837616920471, "rewards/verify_chess_move/mean": -0.040625, "rewards/verify_chess_move/std": 0.9934018731117249, "step": 525 }, { "completion_length": 381.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 136.88359375, "completions/mean_terminated_length": 136.88359375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00047964929491553645, "frac_reward_zero_std": 0.41875, "grad_norm": 0.14663848280906677, "kl": 0.0005495226124367037, "learning_rate": 3.7785714285714285e-08, "loss": 0.0, "num_tokens": 39015609.0, "reward": -0.1421875, "reward_std": 0.5312572717666626, "rewards/verify_chess_move/mean": -0.1421875, "rewards/verify_chess_move/std": 0.9828656077384949, "step": 530 }, { "completion_length": 445.6, "completions/clipped_ratio": 0.0, "completions/max_length": 445.6, "completions/max_terminated_length": 445.6, "completions/mean_length": 134.32109375, "completions/mean_terminated_length": 134.32109375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.00048417428826379627, "frac_reward_zero_std": 0.33125, "grad_norm": 0.12797488272190094, "kl": 0.0004920831667732273, "learning_rate": 3.8142857142857146e-08, "loss": 0.0, "num_tokens": 39385756.0, "reward": -0.159375, "reward_std": 0.6103066325187683, "rewards/verify_chess_move/mean": -0.159375, "rewards/verify_chess_move/std": 0.9681557178497314, "step": 535 }, { "completion_length": 339.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 135.16796875, "completions/mean_terminated_length": 135.16796875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.000488699281612056, "frac_reward_zero_std": 0.3375, "grad_norm": 0.1757209151983261, "kl": 0.0005705366025722469, "learning_rate": 3.85e-08, "loss": 0.0, "num_tokens": 39755803.0, "reward": -0.1609375, "reward_std": 0.58333780169487, "rewards/verify_chess_move/mean": -0.1609375, "rewards/verify_chess_move/std": 0.9844417095184326, "step": 540 }, { "completion_length": 375.8, "completions/clipped_ratio": 0.0, "completions/max_length": 375.8, "completions/max_terminated_length": 375.8, "completions/mean_length": 136.82421875, "completions/mean_terminated_length": 136.82421875, "completions/min_length": 42.2, "completions/min_terminated_length": 42.2, "epoch": 0.0004932242749603158, "frac_reward_zero_std": 0.41875, "grad_norm": 0.14006957411766052, "kl": 0.0005327944742930413, "learning_rate": 3.8857142857142855e-08, "loss": 0.0, "num_tokens": 40131954.0, "reward": -0.1609375, "reward_std": 0.5089635491371155, "rewards/verify_chess_move/mean": -0.1609375, "rewards/verify_chess_move/std": 0.9851698160171509, "step": 545 }, { "completion_length": 350.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 130.53828125, "completions/mean_terminated_length": 130.53828125, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0004977492683085756, "frac_reward_zero_std": 0.275, "grad_norm": 0.15691903233528137, "kl": 0.0005597562279035628, "learning_rate": 3.9214285714285717e-08, "loss": 0.0, "num_tokens": 40494811.0, "reward": -0.0703125, "reward_std": 0.6602381348609925, "rewards/verify_chess_move/mean": -0.0703125, "rewards/verify_chess_move/std": 0.9900437712669372, "step": 550 }, { "completion_length": 345.4, "completions/clipped_ratio": 0.0, "completions/max_length": 345.4, "completions/max_terminated_length": 345.4, "completions/mean_length": 137.34921875, "completions/mean_terminated_length": 137.34921875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0005022742616568353, "frac_reward_zero_std": 0.39375, "grad_norm": 0.1129101887345314, "kl": 0.0005751775957833161, "learning_rate": 3.957142857142857e-08, "loss": 0.0, "num_tokens": 40870346.0, "reward": -0.159375, "reward_std": 0.5398390233516693, "rewards/verify_chess_move/mean": -0.159375, "rewards/verify_chess_move/std": 0.974979567527771, "step": 555 }, { "completion_length": 393.8, "completions/clipped_ratio": 0.0, "completions/max_length": 393.8, "completions/max_terminated_length": 393.8, "completions/mean_length": 125.52109375, "completions/mean_terminated_length": 125.52109375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0005067992550050951, "frac_reward_zero_std": 0.36875, "grad_norm": 0.1305374950170517, "kl": 0.00046419130494541606, "learning_rate": 3.992857142857143e-08, "loss": 0.0, "num_tokens": 41227829.0, "reward": -0.159375, "reward_std": 0.5743851065635681, "rewards/verify_chess_move/mean": -0.159375, "rewards/verify_chess_move/std": 0.9832979440689087, "step": 560 }, { "completion_length": 412.4, "completions/clipped_ratio": 0.0, "completions/max_length": 412.4, "completions/max_terminated_length": 412.4, "completions/mean_length": 136.62734375, "completions/mean_terminated_length": 136.62734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0005113242483533549, "frac_reward_zero_std": 0.40625, "grad_norm": 0.09695765376091003, "kl": 0.0004767493748659035, "learning_rate": 4.028571428571429e-08, "loss": 0.0, "num_tokens": 41602288.0, "reward": -0.1984375, "reward_std": 0.5362558424472809, "rewards/verify_chess_move/mean": -0.1984375, "rewards/verify_chess_move/std": 0.9785617828369141, "step": 565 }, { "completion_length": 368.8, "completions/clipped_ratio": 0.0, "completions/max_length": 368.8, "completions/max_terminated_length": 368.8, "completions/mean_length": 140.5640625, "completions/mean_terminated_length": 140.5640625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0005158492417016148, "frac_reward_zero_std": 0.35625, "grad_norm": 0.12775756418704987, "kl": 0.0005368285715121601, "learning_rate": 4.0642857142857135e-08, "loss": 0.0, "num_tokens": 41984338.0, "reward": -0.14375, "reward_std": 0.5780661702156067, "rewards/verify_chess_move/mean": -0.14375, "rewards/verify_chess_move/std": 0.9849952220916748, "step": 570 }, { "completion_length": 429.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 125.5359375, "completions/mean_terminated_length": 125.5359375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0005203742350498745, "frac_reward_zero_std": 0.34375, "grad_norm": 0.07564494758844376, "kl": 0.0004280015476979315, "learning_rate": 4.1e-08, "loss": 0.0, "num_tokens": 42339040.0, "reward": -0.053125, "reward_std": 0.6120933890342712, "rewards/verify_chess_move/mean": -0.053125, "rewards/verify_chess_move/std": 0.9953701257705688, "step": 575 }, { "completion_length": 541.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 541.4, "completions/max_terminated_length": 374.6, "completions/mean_length": 134.3546875, "completions/mean_terminated_length": 133.35759735107422, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.0005248992283981343, "frac_reward_zero_std": 0.38125, "grad_norm": 0.06078275293111801, "kl": 0.00039743380484651424, "learning_rate": 4.135714285714285e-08, "loss": 0.0, "num_tokens": 42709814.0, "reward": -0.1984375, "reward_std": 0.5630749464035034, "rewards/verify_chess_move/mean": -0.1984375, "rewards/verify_chess_move/std": 0.9665329456329346, "step": 580 }, { "completion_length": 383.6, "completions/clipped_ratio": 0.0, "completions/max_length": 383.6, "completions/max_terminated_length": 383.6, "completions/mean_length": 144.83359375, "completions/mean_terminated_length": 144.83359375, "completions/min_length": 41.4, "completions/min_terminated_length": 41.4, "epoch": 0.000529424221746394, "frac_reward_zero_std": 0.4125, "grad_norm": 0.12648756802082062, "kl": 0.0005285642262606415, "learning_rate": 4.171428571428571e-08, "loss": 0.0, "num_tokens": 43096113.0, "reward": -0.115625, "reward_std": 0.5306312263011932, "rewards/verify_chess_move/mean": -0.115625, "rewards/verify_chess_move/std": 0.9917517185211182, "step": 585 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 143.89765625, "completions/mean_terminated_length": 143.89765625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0005339492150946538, "frac_reward_zero_std": 0.35625, "grad_norm": 0.13495396077632904, "kl": 0.0005651773795761983, "learning_rate": 4.207142857142857e-08, "loss": 0.0, "num_tokens": 43482142.0, "reward": -0.1828125, "reward_std": 0.5721284925937653, "rewards/verify_chess_move/mean": -0.1828125, "rewards/verify_chess_move/std": 0.9706456780433654, "step": 590 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 132.41875, "completions/mean_terminated_length": 132.41875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0005384742084429136, "frac_reward_zero_std": 0.3375, "grad_norm": 0.13313278555870056, "kl": 0.0004765903762290691, "learning_rate": 4.242857142857142e-08, "loss": 0.0, "num_tokens": 43849518.0, "reward": -0.078125, "reward_std": 0.5856464147567749, "rewards/verify_chess_move/mean": -0.078125, "rewards/verify_chess_move/std": 0.9967073798179626, "step": 595 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.0, "completions/max_length": 367.8, "completions/max_terminated_length": 367.8, "completions/mean_length": 133.175, "completions/mean_terminated_length": 133.175, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0005429992017911734, "frac_reward_zero_std": 0.325, "grad_norm": 0.11002358794212341, "kl": 0.0005243723637249787, "learning_rate": 4.278571428571428e-08, "loss": 0.0, "num_tokens": 44218798.0, "reward": -0.0984375, "reward_std": 0.6035326957702637, "rewards/verify_chess_move/mean": -0.0984375, "rewards/verify_chess_move/std": 0.9837772965431213, "step": 600 }, { "completion_length": 372.6, "completions/clipped_ratio": 0.0, "completions/max_length": 372.6, "completions/max_terminated_length": 372.6, "completions/mean_length": 133.39453125, "completions/mean_terminated_length": 133.39453125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0005475241951394331, "frac_reward_zero_std": 0.39375, "grad_norm": 0.12504805624485016, "kl": 0.0004969850325323933, "learning_rate": 4.314285714285714e-08, "loss": 0.0, "num_tokens": 44589079.0, "reward": -0.1765625, "reward_std": 0.5605990946292877, "rewards/verify_chess_move/mean": -0.1765625, "rewards/verify_chess_move/std": 0.9681035995483398, "step": 605 }, { "completion_length": 332.2, "completions/clipped_ratio": 0.0, "completions/max_length": 332.2, "completions/max_terminated_length": 332.2, "completions/mean_length": 128.5671875, "completions/mean_terminated_length": 128.5671875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0005520491884876929, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16168077290058136, "kl": 0.0005922111015024712, "learning_rate": 4.349999999999999e-08, "loss": 0.0, "num_tokens": 44950149.0, "reward": -0.11875, "reward_std": 0.6237770557403565, "rewards/verify_chess_move/mean": -0.11875, "rewards/verify_chess_move/std": 0.9867865681648255, "step": 610 }, { "completion_length": 336.8, "completions/clipped_ratio": 0.0, "completions/max_length": 336.8, "completions/max_terminated_length": 336.8, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.0005565741818359527, "frac_reward_zero_std": 0.39375, "grad_norm": 0.1413211077451706, "kl": 0.0005487902565619151, "learning_rate": 4.3857142857142854e-08, "loss": 0.0, "num_tokens": 45326333.0, "reward": -0.2796875, "reward_std": 0.5370576441287994, "rewards/verify_chess_move/mean": -0.2796875, "rewards/verify_chess_move/std": 0.9598218202590942, "step": 615 }, { "completion_length": 332.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 136.58515625, "completions/mean_terminated_length": 136.58515625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0005610991751842125, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14953583478927612, "kl": 0.0005646518400681088, "learning_rate": 4.421428571428571e-08, "loss": 0.0, "num_tokens": 45700938.0, "reward": -0.1125, "reward_std": 0.5351446211338043, "rewards/verify_chess_move/mean": -0.1125, "rewards/verify_chess_move/std": 0.9835181117057801, "step": 620 }, { "completion_length": 349.8, "completions/clipped_ratio": 0.0, "completions/max_length": 349.8, "completions/max_terminated_length": 349.8, "completions/mean_length": 143.4171875, "completions/mean_terminated_length": 143.4171875, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0005656241685324723, "frac_reward_zero_std": 0.4, "grad_norm": 0.17014265060424805, "kl": 0.0005822628350870218, "learning_rate": 4.457142857142857e-08, "loss": 0.0, "num_tokens": 46084872.0, "reward": -0.1296875, "reward_std": 0.5436658978462219, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.9925632119178772, "step": 625 }, { "completion_length": 357.6, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 134.278125, "completions/mean_terminated_length": 134.278125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0005701491618807321, "frac_reward_zero_std": 0.375, "grad_norm": 0.13609103858470917, "kl": 0.0005450526492495556, "learning_rate": 4.4928571428571424e-08, "loss": 0.0, "num_tokens": 46455836.0, "reward": -0.1890625, "reward_std": 0.5554533481597901, "rewards/verify_chess_move/mean": -0.1890625, "rewards/verify_chess_move/std": 0.9790793180465698, "step": 630 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 134.70546875, "completions/mean_terminated_length": 134.70546875, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 0.0005746741552289918, "frac_reward_zero_std": 0.38125, "grad_norm": 0.16034622490406036, "kl": 0.0005068842895070702, "learning_rate": 4.528571428571428e-08, "loss": 0.0, "num_tokens": 46827467.0, "reward": 0.0, "reward_std": 0.5495245456695557, "rewards/verify_chess_move/mean": 0.0, "rewards/verify_chess_move/std": 0.9967810988426209, "step": 635 }, { "completion_length": 370.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 136.915625, "completions/mean_terminated_length": 136.915625, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0005791991485772516, "frac_reward_zero_std": 0.39375, "grad_norm": 0.15594175457954407, "kl": 0.0005173275111701514, "learning_rate": 4.564285714285714e-08, "loss": 0.0, "num_tokens": 47203271.0, "reward": -0.1328125, "reward_std": 0.5612811088562012, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9890674829483033, "step": 640 }, { "completion_length": 362.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 134.23125, "completions/mean_terminated_length": 134.23125, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.0005837241419255114, "frac_reward_zero_std": 0.4, "grad_norm": 0.13353700935840607, "kl": 0.0006099846561482992, "learning_rate": 4.5999999999999995e-08, "loss": 0.0, "num_tokens": 47573591.0, "reward": -0.0921875, "reward_std": 0.545198506116867, "rewards/verify_chess_move/mean": -0.0921875, "rewards/verify_chess_move/std": 0.9905158042907715, "step": 645 }, { "completion_length": 345.2, "completions/clipped_ratio": 0.0, "completions/max_length": 345.2, "completions/max_terminated_length": 345.2, "completions/mean_length": 136.35625, "completions/mean_terminated_length": 136.35625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0005882491352737711, "frac_reward_zero_std": 0.375, "grad_norm": 0.14366188645362854, "kl": 0.0005598826778623334, "learning_rate": 4.635714285714285e-08, "loss": 0.0, "num_tokens": 47947775.0, "reward": -0.1109375, "reward_std": 0.5606111407279968, "rewards/verify_chess_move/mean": -0.1109375, "rewards/verify_chess_move/std": 0.9892132520675659, "step": 650 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 123.77421875, "completions/mean_terminated_length": 123.77421875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0005927741286220309, "frac_reward_zero_std": 0.45, "grad_norm": 0.15246500074863434, "kl": 0.00047534462401017664, "learning_rate": 4.671428571428571e-08, "loss": 0.0, "num_tokens": 48304126.0, "reward": -0.1765625, "reward_std": 0.4880222618579865, "rewards/verify_chess_move/mean": -0.1765625, "rewards/verify_chess_move/std": 0.9794639229774476, "step": 655 }, { "completion_length": 374.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 136.8328125, "completions/mean_terminated_length": 136.8328125, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.0005972991219702907, "frac_reward_zero_std": 0.36875, "grad_norm": 0.09949576109647751, "kl": 0.0005286243032969651, "learning_rate": 4.7071428571428566e-08, "loss": 0.0, "num_tokens": 48678792.0, "reward": -0.1859375, "reward_std": 0.5600829780101776, "rewards/verify_chess_move/mean": -0.1859375, "rewards/verify_chess_move/std": 0.9753221035003662, "step": 660 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 142.80625, "completions/mean_terminated_length": 142.80625, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0006018241153185504, "frac_reward_zero_std": 0.36875, "grad_norm": 0.13500961661338806, "kl": 0.0005317144304171961, "learning_rate": 4.742857142857143e-08, "loss": 0.0, "num_tokens": 49062768.0, "reward": -0.190625, "reward_std": 0.5716066598892212, "rewards/verify_chess_move/mean": -0.190625, "rewards/verify_chess_move/std": 0.9770471811294555, "step": 665 }, { "completion_length": 377.8, "completions/clipped_ratio": 0.0, "completions/max_length": 377.8, "completions/max_terminated_length": 377.8, "completions/mean_length": 131.01953125, "completions/mean_terminated_length": 131.01953125, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0006063491086668103, "frac_reward_zero_std": 0.35, "grad_norm": 0.12950246036052704, "kl": 0.0004904595043626614, "learning_rate": 4.778571428571428e-08, "loss": 0.0, "num_tokens": 49428937.0, "reward": -0.1234375, "reward_std": 0.5799183368682861, "rewards/verify_chess_move/mean": -0.1234375, "rewards/verify_chess_move/std": 0.9813547730445862, "step": 670 }, { "completion_length": 351.2, "completions/clipped_ratio": 0.0, "completions/max_length": 351.2, "completions/max_terminated_length": 351.2, "completions/mean_length": 131.553125, "completions/mean_terminated_length": 131.553125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0006108741020150701, "frac_reward_zero_std": 0.31875, "grad_norm": 0.18400916457176208, "kl": 0.0005502390984474914, "learning_rate": 4.8142857142857136e-08, "loss": 0.0, "num_tokens": 49793757.0, "reward": -0.0390625, "reward_std": 0.6106870293617248, "rewards/verify_chess_move/mean": -0.0390625, "rewards/verify_chess_move/std": 0.9944554448127747, "step": 675 }, { "completion_length": 398.4, "completions/clipped_ratio": 0.0, "completions/max_length": 398.4, "completions/max_terminated_length": 398.4, "completions/mean_length": 137.053125, "completions/mean_terminated_length": 137.053125, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0006153990953633298, "frac_reward_zero_std": 0.35, "grad_norm": 0.1039770096540451, "kl": 0.00046981033074189327, "learning_rate": 4.85e-08, "loss": 0.0, "num_tokens": 50167377.0, "reward": -0.2390625, "reward_std": 0.592384672164917, "rewards/verify_chess_move/mean": -0.2390625, "rewards/verify_chess_move/std": 0.9714247465133667, "step": 680 }, { "completion_length": 428.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 428.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 136.13203125, "completions/mean_terminated_length": 135.63777770996094, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0006199240887115896, "frac_reward_zero_std": 0.41875, "grad_norm": 0.15567028522491455, "kl": 0.0005013536791921069, "learning_rate": 4.885714285714285e-08, "loss": 0.0, "num_tokens": 50544306.0, "reward": -0.1765625, "reward_std": 0.5262622117996216, "rewards/verify_chess_move/mean": -0.1765625, "rewards/verify_chess_move/std": 0.9828842878341675, "step": 685 }, { "completion_length": 338.6, "completions/clipped_ratio": 0.0, "completions/max_length": 338.6, "completions/max_terminated_length": 338.6, "completions/mean_length": 128.990625, "completions/mean_terminated_length": 128.990625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0006244490820598494, "frac_reward_zero_std": 0.35625, "grad_norm": 0.12042675912380219, "kl": 0.000536596127312805, "learning_rate": 4.9214285714285713e-08, "loss": 0.0, "num_tokens": 50905750.0, "reward": -0.071875, "reward_std": 0.5818154156208039, "rewards/verify_chess_move/mean": -0.071875, "rewards/verify_chess_move/std": 0.9951167464256286, "step": 690 }, { "completion_length": 370.4, "completions/clipped_ratio": 0.0, "completions/max_length": 370.4, "completions/max_terminated_length": 370.4, "completions/mean_length": 131.06875, "completions/mean_terminated_length": 131.06875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0006289740754081091, "frac_reward_zero_std": 0.34375, "grad_norm": 0.17307518422603607, "kl": 0.0005130271422785881, "learning_rate": 4.957142857142857e-08, "loss": 0.0, "num_tokens": 51270678.0, "reward": -0.146875, "reward_std": 0.5989450693130494, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9813497066497803, "step": 695 }, { "completion_length": 369.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 131.296875, "completions/mean_terminated_length": 131.296875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0006334990687563689, "frac_reward_zero_std": 0.36875, "grad_norm": 0.14671878516674042, "kl": 0.0005215001993747137, "learning_rate": 4.992857142857142e-08, "loss": 0.0, "num_tokens": 51640314.0, "reward": -0.1546875, "reward_std": 0.5671279549598693, "rewards/verify_chess_move/mean": -0.1546875, "rewards/verify_chess_move/std": 0.9878567934036255, "step": 700 }, { "completion_length": 359.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 132.1609375, "completions/mean_terminated_length": 132.1609375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0006380240621046287, "frac_reward_zero_std": 0.35625, "grad_norm": 0.11073741316795349, "kl": 0.0005160043167961703, "learning_rate": 5.0285714285714284e-08, "loss": 0.0, "num_tokens": 52009040.0, "reward": -0.1375, "reward_std": 0.5793856501579284, "rewards/verify_chess_move/mean": -0.1375, "rewards/verify_chess_move/std": 0.9851471781730652, "step": 705 }, { "completion_length": 329.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 126.1390625, "completions/mean_terminated_length": 126.1390625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0006425490554528884, "frac_reward_zero_std": 0.24375, "grad_norm": 0.17039503157138824, "kl": 0.0005498141621501418, "learning_rate": 5.064285714285714e-08, "loss": 0.0, "num_tokens": 52365722.0, "reward": -0.0859375, "reward_std": 0.6854924917221069, "rewards/verify_chess_move/mean": -0.0859375, "rewards/verify_chess_move/std": 0.9973724722862244, "step": 710 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 126.62890625, "completions/mean_terminated_length": 126.62890625, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.0006470740488011482, "frac_reward_zero_std": 0.3375, "grad_norm": 0.14068225026130676, "kl": 0.00047503067908110096, "learning_rate": 5.0999999999999993e-08, "loss": 0.0, "num_tokens": 52725375.0, "reward": -0.18125, "reward_std": 0.6063105583190918, "rewards/verify_chess_move/mean": -0.18125, "rewards/verify_chess_move/std": 0.981126344203949, "step": 715 }, { "completion_length": 358.6, "completions/clipped_ratio": 0.0, "completions/max_length": 358.6, "completions/max_terminated_length": 358.6, "completions/mean_length": 136.5015625, "completions/mean_terminated_length": 136.5015625, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.0006515990421494081, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1409977376461029, "kl": 0.0005262560390292492, "learning_rate": 5.1357142857142855e-08, "loss": 0.0, "num_tokens": 53099921.0, "reward": -0.0796875, "reward_std": 0.6215168833732605, "rewards/verify_chess_move/mean": -0.0796875, "rewards/verify_chess_move/std": 0.9957218289375305, "step": 720 }, { "completion_length": 399.8, "completions/clipped_ratio": 0.0, "completions/max_length": 399.8, "completions/max_terminated_length": 399.8, "completions/mean_length": 143.33046875, "completions/mean_terminated_length": 143.33046875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0006561240354976679, "frac_reward_zero_std": 0.41875, "grad_norm": 0.08350160717964172, "kl": 0.00051996722295371, "learning_rate": 5.171428571428571e-08, "loss": 0.0, "num_tokens": 53486752.0, "reward": -0.209375, "reward_std": 0.5269917070865631, "rewards/verify_chess_move/mean": -0.209375, "rewards/verify_chess_move/std": 0.9766757726669312, "step": 725 }, { "completion_length": 337.2, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/max_terminated_length": 337.2, "completions/mean_length": 126.9109375, "completions/mean_terminated_length": 126.9109375, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0006606490288459276, "frac_reward_zero_std": 0.325, "grad_norm": 0.1733839511871338, "kl": 0.0005387489826716773, "learning_rate": 5.207142857142857e-08, "loss": 0.0, "num_tokens": 53847974.0, "reward": -0.11875, "reward_std": 0.5955470383167267, "rewards/verify_chess_move/mean": -0.11875, "rewards/verify_chess_move/std": 0.9896595001220703, "step": 730 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 136.04296875, "completions/mean_terminated_length": 136.04296875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0006651740221941874, "frac_reward_zero_std": 0.41875, "grad_norm": 0.10910322517156601, "kl": 0.0005558432540055946, "learning_rate": 5.2428571428571425e-08, "loss": 0.0, "num_tokens": 54222165.0, "reward": -0.228125, "reward_std": 0.5257187008857727, "rewards/verify_chess_move/mean": -0.228125, "rewards/verify_chess_move/std": 0.9691211938858032, "step": 735 }, { "completion_length": 337.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 122.91328125, "completions/mean_terminated_length": 122.91328125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0006696990155424472, "frac_reward_zero_std": 0.33125, "grad_norm": 0.11808203905820847, "kl": 0.000502585174763226, "learning_rate": 5.278571428571428e-08, "loss": 0.0, "num_tokens": 54574262.0, "reward": -0.0640625, "reward_std": 0.5818195343017578, "rewards/verify_chess_move/mean": -0.0640625, "rewards/verify_chess_move/std": 0.996541428565979, "step": 740 }, { "completion_length": 442.4, "completions/clipped_ratio": 0.0, "completions/max_length": 442.4, "completions/max_terminated_length": 442.4, "completions/mean_length": 136.6453125, "completions/mean_terminated_length": 136.6453125, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0006742240088907069, "frac_reward_zero_std": 0.35, "grad_norm": 0.14882437884807587, "kl": 0.0004635026997675595, "learning_rate": 5.314285714285714e-08, "loss": 0.0, "num_tokens": 54946000.0, "reward": -0.1015625, "reward_std": 0.5829099416732788, "rewards/verify_chess_move/mean": -0.1015625, "rewards/verify_chess_move/std": 0.9950744152069092, "step": 745 }, { "completion_length": 431.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 431.6, "completions/max_terminated_length": 361.4, "completions/mean_length": 133.00078125, "completions/mean_terminated_length": 132.5070037841797, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0006787490022389667, "frac_reward_zero_std": 0.3375, "grad_norm": 0.1654488891363144, "kl": 0.0005047027031650942, "learning_rate": 5.3499999999999996e-08, "loss": 0.0, "num_tokens": 55313569.0, "reward": -0.1078125, "reward_std": 0.6090009331703186, "rewards/verify_chess_move/mean": -0.1078125, "rewards/verify_chess_move/std": 0.9891697883605957, "step": 750 }, { "completion_length": 343.8, "completions/clipped_ratio": 0.0, "completions/max_length": 343.8, "completions/max_terminated_length": 343.8, "completions/mean_length": 136.5421875, "completions/mean_terminated_length": 136.5421875, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0006832739955872265, "frac_reward_zero_std": 0.39375, "grad_norm": 0.16494297981262207, "kl": 0.0005375655292482407, "learning_rate": 5.385714285714286e-08, "loss": 0.0, "num_tokens": 55688703.0, "reward": -0.15625, "reward_std": 0.5501836955547332, "rewards/verify_chess_move/mean": -0.15625, "rewards/verify_chess_move/std": 0.9861991047859192, "step": 755 }, { "completion_length": 323.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 128.9546875, "completions/mean_terminated_length": 128.9546875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0006877989889354862, "frac_reward_zero_std": 0.35625, "grad_norm": 0.19859080016613007, "kl": 0.0005828658881910087, "learning_rate": 5.421428571428571e-08, "loss": 0.0, "num_tokens": 56050789.0, "reward": -0.1328125, "reward_std": 0.5832269191741943, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9890452980995178, "step": 760 }, { "completion_length": 346.4, "completions/clipped_ratio": 0.0, "completions/max_length": 346.4, "completions/max_terminated_length": 346.4, "completions/mean_length": 135.8390625, "completions/mean_terminated_length": 135.8390625, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.000692323982283746, "frac_reward_zero_std": 0.35625, "grad_norm": 0.15496039390563965, "kl": 0.0005335558327715262, "learning_rate": 5.4571428571428567e-08, "loss": 0.0, "num_tokens": 56423583.0, "reward": -0.1203125, "reward_std": 0.5848040699958801, "rewards/verify_chess_move/mean": -0.1203125, "rewards/verify_chess_move/std": 0.9899220585823059, "step": 765 }, { "completion_length": 358.6, "completions/clipped_ratio": 0.0, "completions/max_length": 358.6, "completions/max_terminated_length": 358.6, "completions/mean_length": 140.67890625, "completions/mean_terminated_length": 140.67890625, "completions/min_length": 43.6, "completions/min_terminated_length": 43.6, "epoch": 0.0006968489756320059, "frac_reward_zero_std": 0.35625, "grad_norm": 0.13164451718330383, "kl": 0.0005623469066449616, "learning_rate": 5.492857142857143e-08, "loss": 0.0, "num_tokens": 56803012.0, "reward": -0.2203125, "reward_std": 0.5954316258430481, "rewards/verify_chess_move/mean": -0.2203125, "rewards/verify_chess_move/std": 0.9720540404319763, "step": 770 }, { "completion_length": 357.8, "completions/clipped_ratio": 0.0, "completions/max_length": 357.8, "completions/max_terminated_length": 357.8, "completions/mean_length": 130.4703125, "completions/mean_terminated_length": 130.4703125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0007013739689802656, "frac_reward_zero_std": 0.41875, "grad_norm": 0.11676247417926788, "kl": 0.0005479153825945105, "learning_rate": 5.528571428571428e-08, "loss": 0.0, "num_tokens": 57166942.0, "reward": -0.2453125, "reward_std": 0.5211470246315002, "rewards/verify_chess_move/mean": -0.2453125, "rewards/verify_chess_move/std": 0.9548951268196106, "step": 775 }, { "completion_length": 361.8, "completions/clipped_ratio": 0.0, "completions/max_length": 361.8, "completions/max_terminated_length": 361.8, "completions/mean_length": 130.72109375, "completions/mean_terminated_length": 130.72109375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0007058989623285254, "frac_reward_zero_std": 0.36875, "grad_norm": 0.12566863000392914, "kl": 0.0005261821476779005, "learning_rate": 5.564285714285714e-08, "loss": 0.0, "num_tokens": 57531393.0, "reward": -0.078125, "reward_std": 0.5513668417930603, "rewards/verify_chess_move/mean": -0.078125, "rewards/verify_chess_move/std": 0.9867147445678711, "step": 780 }, { "completion_length": 338.4, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 136.4359375, "completions/mean_terminated_length": 136.4359375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0007104239556767852, "frac_reward_zero_std": 0.36875, "grad_norm": 0.14385004341602325, "kl": 0.0005657665174112481, "learning_rate": 5.6e-08, "loss": 0.0, "num_tokens": 57908751.0, "reward": -0.15, "reward_std": 0.5707580208778381, "rewards/verify_chess_move/mean": -0.15, "rewards/verify_chess_move/std": 0.9781332612037659, "step": 785 }, { "completion_length": 360.8, "completions/clipped_ratio": 0.0, "completions/max_length": 360.8, "completions/max_terminated_length": 360.8, "completions/mean_length": 125.30390625, "completions/mean_terminated_length": 125.30390625, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0007149489490250449, "frac_reward_zero_std": 0.35, "grad_norm": 0.12224519997835159, "kl": 0.00047485184268225566, "learning_rate": 5.635714285714285e-08, "loss": 0.0, "num_tokens": 58265468.0, "reward": -0.059375, "reward_std": 0.5751315236091614, "rewards/verify_chess_move/mean": -0.059375, "rewards/verify_chess_move/std": 0.9949082493782043, "step": 790 }, { "completion_length": 321.8, "completions/clipped_ratio": 0.0, "completions/max_length": 321.8, "completions/max_terminated_length": 321.8, "completions/mean_length": 130.2, "completions/mean_terminated_length": 130.2, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0007194739423733047, "frac_reward_zero_std": 0.325, "grad_norm": 0.17941497266292572, "kl": 0.0005556191379582742, "learning_rate": 5.6714285714285714e-08, "loss": 0.0, "num_tokens": 58629572.0, "reward": -0.1015625, "reward_std": 0.6054179549217225, "rewards/verify_chess_move/mean": -0.1015625, "rewards/verify_chess_move/std": 0.9831313967704773, "step": 795 }, { "completion_length": 432.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 432.8, "completions/max_terminated_length": 350.8, "completions/mean_length": 142.934375, "completions/mean_terminated_length": 142.45282287597655, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0007239989357215645, "frac_reward_zero_std": 0.425, "grad_norm": 0.1290520876646042, "kl": 0.0005130383618961787, "learning_rate": 5.707142857142857e-08, "loss": 0.0, "num_tokens": 59014080.0, "reward": -0.1953125, "reward_std": 0.5290515184402466, "rewards/verify_chess_move/mean": -0.1953125, "rewards/verify_chess_move/std": 0.9777982354164123, "step": 800 }, { "completion_length": 373.2, "completions/clipped_ratio": 0.0, "completions/max_length": 373.2, "completions/max_terminated_length": 373.2, "completions/mean_length": 128.84140625, "completions/mean_terminated_length": 128.84140625, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 0.0007285239290698242, "frac_reward_zero_std": 0.35625, "grad_norm": 0.09747790545225143, "kl": 0.0004964436573573038, "learning_rate": 5.7428571428571424e-08, "loss": 0.0, "num_tokens": 59378901.0, "reward": -0.2140625, "reward_std": 0.5786561906337738, "rewards/verify_chess_move/mean": -0.2140625, "rewards/verify_chess_move/std": 0.971923828125, "step": 805 }, { "completion_length": 383.8, "completions/clipped_ratio": 0.0, "completions/max_length": 383.8, "completions/max_terminated_length": 383.8, "completions/mean_length": 137.63671875, "completions/mean_terminated_length": 137.63671875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.000733048922418084, "frac_reward_zero_std": 0.36875, "grad_norm": 0.15239937603473663, "kl": 0.0005215465913352091, "learning_rate": 5.7785714285714285e-08, "loss": 0.0, "num_tokens": 59754988.0, "reward": -0.0890625, "reward_std": 0.5702416777610779, "rewards/verify_chess_move/mean": -0.0890625, "rewards/verify_chess_move/std": 0.9954239249229431, "step": 810 }, { "completion_length": 357.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 130.65234375, "completions/mean_terminated_length": 130.65234375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0007375739157663438, "frac_reward_zero_std": 0.28125, "grad_norm": 0.1729770451784134, "kl": 0.00048621416708556354, "learning_rate": 5.814285714285714e-08, "loss": 0.0, "num_tokens": 60117447.0, "reward": -0.065625, "reward_std": 0.6538172245025635, "rewards/verify_chess_move/mean": -0.065625, "rewards/verify_chess_move/std": 0.9991641283035279, "step": 815 }, { "completion_length": 387.2, "completions/clipped_ratio": 0.0, "completions/max_length": 387.2, "completions/max_terminated_length": 387.2, "completions/mean_length": 124.6828125, "completions/mean_terminated_length": 124.6828125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0007420989091146036, "frac_reward_zero_std": 0.36875, "grad_norm": 0.13599546253681183, "kl": 0.0004683162029323285, "learning_rate": 5.85e-08, "loss": 0.0, "num_tokens": 60473313.0, "reward": -0.04375, "reward_std": 0.570971155166626, "rewards/verify_chess_move/mean": -0.04375, "rewards/verify_chess_move/std": 0.9979599356651306, "step": 820 }, { "completion_length": 349.6, "completions/clipped_ratio": 0.0, "completions/max_length": 349.6, "completions/max_terminated_length": 349.6, "completions/mean_length": 135.584375, "completions/mean_terminated_length": 135.584375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0007466239024628634, "frac_reward_zero_std": 0.35625, "grad_norm": 0.13587090373039246, "kl": 0.0005563602379879739, "learning_rate": 5.8857142857142856e-08, "loss": 0.0, "num_tokens": 60847045.0, "reward": -0.2875, "reward_std": 0.5736621141433715, "rewards/verify_chess_move/mean": -0.2875, "rewards/verify_chess_move/std": 0.9516097426414489, "step": 825 }, { "completion_length": 342.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 130.32578125, "completions/mean_terminated_length": 130.32578125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.0007511488958111232, "frac_reward_zero_std": 0.34375, "grad_norm": 0.12396746128797531, "kl": 0.0005163858724699822, "learning_rate": 5.921428571428571e-08, "loss": 0.0, "num_tokens": 61212734.0, "reward": -0.01875, "reward_std": 0.5718249261379242, "rewards/verify_chess_move/mean": -0.01875, "rewards/verify_chess_move/std": 0.9983915567398072, "step": 830 }, { "completion_length": 329.8, "completions/clipped_ratio": 0.0, "completions/max_length": 329.8, "completions/max_terminated_length": 329.8, "completions/mean_length": 126.621875, "completions/mean_terminated_length": 126.621875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.000755673889159383, "frac_reward_zero_std": 0.36875, "grad_norm": 0.1465359479188919, "kl": 0.000527058161333116, "learning_rate": 5.957142857142857e-08, "loss": 0.0, "num_tokens": 61571394.0, "reward": -0.140625, "reward_std": 0.5647031247615815, "rewards/verify_chess_move/mean": -0.140625, "rewards/verify_chess_move/std": 0.9841536045074463, "step": 835 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 132.60703125, "completions/mean_terminated_length": 132.60703125, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0007601988825076427, "frac_reward_zero_std": 0.375, "grad_norm": 0.09872814267873764, "kl": 0.0005028800147556467, "learning_rate": 5.992857142857142e-08, "loss": 0.0, "num_tokens": 61940467.0, "reward": -0.1109375, "reward_std": 0.5539924263954162, "rewards/verify_chess_move/mean": -0.1109375, "rewards/verify_chess_move/std": 0.9953439354896545, "step": 840 }, { "completion_length": 346.8, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/max_terminated_length": 346.8, "completions/mean_length": 121.190625, "completions/mean_terminated_length": 121.190625, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.0007647238758559025, "frac_reward_zero_std": 0.33125, "grad_norm": 0.154079869389534, "kl": 0.0004984573784895474, "learning_rate": 6.028571428571428e-08, "loss": 0.0, "num_tokens": 62291823.0, "reward": -0.09375, "reward_std": 0.5857052981853486, "rewards/verify_chess_move/mean": -0.09375, "rewards/verify_chess_move/std": 0.9918566107749939, "step": 845 }, { "completion_length": 388.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 136.98203125, "completions/mean_terminated_length": 136.98203125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0007692488692041623, "frac_reward_zero_std": 0.3375, "grad_norm": 0.12214067578315735, "kl": 0.0005218539720317494, "learning_rate": 6.064285714285714e-08, "loss": 0.0, "num_tokens": 62667320.0, "reward": -0.16875, "reward_std": 0.6017398357391357, "rewards/verify_chess_move/mean": -0.16875, "rewards/verify_chess_move/std": 0.9869036674499512, "step": 850 }, { "completion_length": 405.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 135.89453125, "completions/mean_terminated_length": 135.89453125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.000773773862552422, "frac_reward_zero_std": 0.36875, "grad_norm": 0.15757450461387634, "kl": 0.00047964114410206096, "learning_rate": 6.099999999999999e-08, "loss": 0.0, "num_tokens": 63039945.0, "reward": -0.1265625, "reward_std": 0.5515112280845642, "rewards/verify_chess_move/mean": -0.1265625, "rewards/verify_chess_move/std": 0.9915991306304932, "step": 855 }, { "completion_length": 348.4, "completions/clipped_ratio": 0.0, "completions/max_length": 348.4, "completions/max_terminated_length": 348.4, "completions/mean_length": 121.53515625, "completions/mean_terminated_length": 121.53515625, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0007782988559006818, "frac_reward_zero_std": 0.36875, "grad_norm": 0.11280359327793121, "kl": 0.0005118433857205674, "learning_rate": 6.135714285714285e-08, "loss": 0.0, "num_tokens": 63393166.0, "reward": -0.10625, "reward_std": 0.5618762195110321, "rewards/verify_chess_move/mean": -0.10625, "rewards/verify_chess_move/std": 0.9902596950531006, "step": 860 }, { "completion_length": 340.4, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/max_terminated_length": 340.4, "completions/mean_length": 131.56875, "completions/mean_terminated_length": 131.56875, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.0007828238492489416, "frac_reward_zero_std": 0.45, "grad_norm": 0.11449841409921646, "kl": 0.0005708107297323295, "learning_rate": 6.171428571428571e-08, "loss": 0.0, "num_tokens": 63762766.0, "reward": -0.1328125, "reward_std": 0.49711914658546447, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9909356832504272, "step": 865 }, { "completion_length": 375.6, "completions/clipped_ratio": 0.0, "completions/max_length": 375.6, "completions/max_terminated_length": 375.6, "completions/mean_length": 138.52890625, "completions/mean_terminated_length": 138.52890625, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0007873488425972014, "frac_reward_zero_std": 0.3125, "grad_norm": 0.17954076826572418, "kl": 0.0005222749657150417, "learning_rate": 6.207142857142856e-08, "loss": 0.0, "num_tokens": 64139203.0, "reward": -0.240625, "reward_std": 0.624834942817688, "rewards/verify_chess_move/mean": -0.240625, "rewards/verify_chess_move/std": 0.9615049719810486, "step": 870 }, { "completion_length": 377.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 137.7171875, "completions/mean_terminated_length": 137.7171875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0007918738359454612, "frac_reward_zero_std": 0.35, "grad_norm": 0.14227046072483063, "kl": 0.0005285792456561466, "learning_rate": 6.242857142857142e-08, "loss": 0.0, "num_tokens": 64515105.0, "reward": -0.19375, "reward_std": 0.5864915251731873, "rewards/verify_chess_move/mean": -0.19375, "rewards/verify_chess_move/std": 0.9803982377052307, "step": 875 }, { "completion_length": 379.8, "completions/clipped_ratio": 0.0, "completions/max_length": 379.8, "completions/max_terminated_length": 379.8, "completions/mean_length": 131.30859375, "completions/mean_terminated_length": 131.30859375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.000796398829293721, "frac_reward_zero_std": 0.3375, "grad_norm": 0.14164437353610992, "kl": 0.0005321157696926093, "learning_rate": 6.278571428571428e-08, "loss": 0.0, "num_tokens": 64881292.0, "reward": -0.0390625, "reward_std": 0.6086676001548768, "rewards/verify_chess_move/mean": -0.0390625, "rewards/verify_chess_move/std": 1.0002628326416017, "step": 880 }, { "completion_length": 456.2, "completions/clipped_ratio": 0.0, "completions/max_length": 456.2, "completions/max_terminated_length": 456.2, "completions/mean_length": 126.63671875, "completions/mean_terminated_length": 126.63671875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0008009238226419807, "frac_reward_zero_std": 0.4, "grad_norm": 0.10835513472557068, "kl": 0.0004232664104165451, "learning_rate": 6.314285714285713e-08, "loss": 0.0, "num_tokens": 65240075.0, "reward": -0.1671875, "reward_std": 0.558864164352417, "rewards/verify_chess_move/mean": -0.1671875, "rewards/verify_chess_move/std": 0.9841862320899963, "step": 885 }, { "completion_length": 473.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 473.6, "completions/max_terminated_length": 402.0, "completions/mean_length": 130.73203125, "completions/mean_terminated_length": 130.22475891113282, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.0008054488159902405, "frac_reward_zero_std": 0.4, "grad_norm": 0.14978323876857758, "kl": 0.00044608609032366077, "learning_rate": 6.349999999999999e-08, "loss": 0.0, "num_tokens": 65604580.0, "reward": -0.1875, "reward_std": 0.533721286058426, "rewards/verify_chess_move/mean": -0.1875, "rewards/verify_chess_move/std": 0.9647961616516113, "step": 890 }, { "completion_length": 528.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 528.6, "completions/max_terminated_length": 360.2, "completions/mean_length": 137.978125, "completions/mean_terminated_length": 136.98475799560546, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0008099738093385003, "frac_reward_zero_std": 0.35625, "grad_norm": 0.13109473884105682, "kl": 0.00042628177775441147, "learning_rate": 6.385714285714285e-08, "loss": 0.0, "num_tokens": 65979704.0, "reward": -0.078125, "reward_std": 0.5623021006584168, "rewards/verify_chess_move/mean": -0.078125, "rewards/verify_chess_move/std": 0.991110360622406, "step": 895 }, { "completion_length": 347.6, "completions/clipped_ratio": 0.0, "completions/max_length": 347.6, "completions/max_terminated_length": 347.6, "completions/mean_length": 128.396875, "completions/mean_terminated_length": 128.396875, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.00081449880268676, "frac_reward_zero_std": 0.38125, "grad_norm": 0.16596296429634094, "kl": 0.0005361448391340673, "learning_rate": 6.42142857142857e-08, "loss": 0.0, "num_tokens": 66341260.0, "reward": 0.0125, "reward_std": 0.555133831501007, "rewards/verify_chess_move/mean": 0.0125, "rewards/verify_chess_move/std": 0.9910654425621033, "step": 900 }, { "completion_length": 430.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 430.8, "completions/max_terminated_length": 348.2, "completions/mean_length": 129.30703125, "completions/mean_terminated_length": 128.81643676757812, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0008190237960350198, "frac_reward_zero_std": 0.35, "grad_norm": 0.13476188480854034, "kl": 0.0004840901979150658, "learning_rate": 6.457142857142856e-08, "loss": 0.0, "num_tokens": 66704253.0, "reward": -0.1265625, "reward_std": 0.5874806404113769, "rewards/verify_chess_move/mean": -0.1265625, "rewards/verify_chess_move/std": 0.9889323830604553, "step": 905 }, { "completion_length": 384.6, "completions/clipped_ratio": 0.0, "completions/max_length": 384.6, "completions/max_terminated_length": 384.6, "completions/mean_length": 135.92265625, "completions/mean_terminated_length": 135.92265625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0008235487893832796, "frac_reward_zero_std": 0.4625, "grad_norm": 0.15556073188781738, "kl": 0.0005052658944805444, "learning_rate": 6.492857142857142e-08, "loss": 0.0, "num_tokens": 67078282.0, "reward": -0.059375, "reward_std": 0.4867002248764038, "rewards/verify_chess_move/mean": -0.059375, "rewards/verify_chess_move/std": 0.9944942355155945, "step": 910 }, { "completion_length": 388.8, "completions/clipped_ratio": 0.0, "completions/max_length": 388.8, "completions/max_terminated_length": 388.8, "completions/mean_length": 139.0734375, "completions/mean_terminated_length": 139.0734375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0008280737827315393, "frac_reward_zero_std": 0.4125, "grad_norm": 0.1133640855550766, "kl": 0.0005115478706102294, "learning_rate": 6.528571428571427e-08, "loss": 0.0, "num_tokens": 67456800.0, "reward": -0.1515625, "reward_std": 0.5347805559635163, "rewards/verify_chess_move/mean": -0.1515625, "rewards/verify_chess_move/std": 0.9850387930870056, "step": 915 }, { "completion_length": 400.2, "completions/clipped_ratio": 0.0, "completions/max_length": 400.2, "completions/max_terminated_length": 400.2, "completions/mean_length": 129.80859375, "completions/mean_terminated_length": 129.80859375, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0008325987760797992, "frac_reward_zero_std": 0.3, "grad_norm": 0.15567034482955933, "kl": 0.0005062656261543453, "learning_rate": 6.564285714285713e-08, "loss": 0.0, "num_tokens": 67820035.0, "reward": -0.009375, "reward_std": 0.6285159826278687, "rewards/verify_chess_move/mean": -0.009375, "rewards/verify_chess_move/std": 0.9926192402839661, "step": 920 }, { "completion_length": 355.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 127.2046875, "completions/mean_terminated_length": 127.2046875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.000837123769428059, "frac_reward_zero_std": 0.3875, "grad_norm": 0.09753994643688202, "kl": 0.0005036998291870987, "learning_rate": 6.6e-08, "loss": 0.0, "num_tokens": 68179081.0, "reward": -0.046875, "reward_std": 0.5402060329914093, "rewards/verify_chess_move/mean": -0.046875, "rewards/verify_chess_move/std": 0.9877225041389466, "step": 925 }, { "completion_length": 371.8, "completions/clipped_ratio": 0.0, "completions/max_length": 371.8, "completions/max_terminated_length": 371.8, "completions/mean_length": 136.834375, "completions/mean_terminated_length": 136.834375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0008416487627763187, "frac_reward_zero_std": 0.3375, "grad_norm": 0.15159235894680023, "kl": 0.0005131910815180163, "learning_rate": 6.635714285714284e-08, "loss": 0.0, "num_tokens": 68552157.0, "reward": -0.1296875, "reward_std": 0.5899119853973389, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.9933515906333923, "step": 930 }, { "completion_length": 383.6, "completions/clipped_ratio": 0.0, "completions/max_length": 383.6, "completions/max_terminated_length": 383.6, "completions/mean_length": 133.60625, "completions/mean_terminated_length": 133.60625, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.0008461737561245785, "frac_reward_zero_std": 0.3375, "grad_norm": 0.09028126299381256, "kl": 0.0005075263233266014, "learning_rate": 6.67142857142857e-08, "loss": 0.0, "num_tokens": 68919421.0, "reward": -0.0703125, "reward_std": 0.5837581992149353, "rewards/verify_chess_move/mean": -0.0703125, "rewards/verify_chess_move/std": 0.9900108218193054, "step": 935 }, { "completion_length": 370.8, "completions/clipped_ratio": 0.0, "completions/max_length": 370.8, "completions/max_terminated_length": 370.8, "completions/mean_length": 133.5328125, "completions/mean_terminated_length": 133.5328125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0008506987494728383, "frac_reward_zero_std": 0.35625, "grad_norm": 0.14804455637931824, "kl": 0.0005006498735383502, "learning_rate": 6.707142857142857e-08, "loss": 0.0, "num_tokens": 69289263.0, "reward": -0.2140625, "reward_std": 0.5895879149436951, "rewards/verify_chess_move/mean": -0.2140625, "rewards/verify_chess_move/std": 0.97692711353302, "step": 940 }, { "completion_length": 391.4, "completions/clipped_ratio": 0.0, "completions/max_length": 391.4, "completions/max_terminated_length": 391.4, "completions/mean_length": 136.19609375, "completions/mean_terminated_length": 136.19609375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.000855223742821098, "frac_reward_zero_std": 0.31875, "grad_norm": 0.1884077936410904, "kl": 0.00050684166717474, "learning_rate": 6.742857142857143e-08, "loss": 0.0, "num_tokens": 69661242.0, "reward": -0.1265625, "reward_std": 0.6195223093032837, "rewards/verify_chess_move/mean": -0.1265625, "rewards/verify_chess_move/std": 0.988859486579895, "step": 945 }, { "completion_length": 380.8, "completions/clipped_ratio": 0.0, "completions/max_length": 380.8, "completions/max_terminated_length": 380.8, "completions/mean_length": 132.21796875, "completions/mean_terminated_length": 132.21796875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0008597487361693578, "frac_reward_zero_std": 0.34375, "grad_norm": 0.17804144322872162, "kl": 0.0005169742317775672, "learning_rate": 6.778571428571428e-08, "loss": 0.0, "num_tokens": 70026785.0, "reward": -0.1359375, "reward_std": 0.5937358975410462, "rewards/verify_chess_move/mean": -0.1359375, "rewards/verify_chess_move/std": 0.9805051445960998, "step": 950 }, { "completion_length": 367.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 128.971875, "completions/mean_terminated_length": 128.971875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0008642737295176176, "frac_reward_zero_std": 0.44375, "grad_norm": 0.1227385625243187, "kl": 0.0005154446832420945, "learning_rate": 6.814285714285714e-08, "loss": 0.0, "num_tokens": 70390085.0, "reward": -0.2515625, "reward_std": 0.5056374967098236, "rewards/verify_chess_move/mean": -0.2515625, "rewards/verify_chess_move/std": 0.9676732659339905, "step": 955 }, { "completion_length": 435.6, "completions/clipped_ratio": 0.0, "completions/max_length": 435.6, "completions/max_terminated_length": 435.6, "completions/mean_length": 133.6140625, "completions/mean_terminated_length": 133.6140625, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.0008687987228658773, "frac_reward_zero_std": 0.36875, "grad_norm": 0.15188060700893402, "kl": 0.00046512768412867447, "learning_rate": 6.85e-08, "loss": 0.0, "num_tokens": 70759855.0, "reward": -0.075, "reward_std": 0.5769562959671021, "rewards/verify_chess_move/mean": -0.075, "rewards/verify_chess_move/std": 0.9948126554489136, "step": 960 }, { "completion_length": 458.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 135.928125, "completions/mean_terminated_length": 135.928125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0008733237162141372, "frac_reward_zero_std": 0.30625, "grad_norm": 0.1382942795753479, "kl": 0.0004609661038557533, "learning_rate": 6.885714285714285e-08, "loss": 0.0, "num_tokens": 71132891.0, "reward": -0.1015625, "reward_std": 0.6190530896186829, "rewards/verify_chess_move/mean": -0.1015625, "rewards/verify_chess_move/std": 0.9948200583457947, "step": 965 }, { "completion_length": 447.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.6, "completions/max_terminated_length": 365.8, "completions/mean_length": 138.28828125, "completions/mean_terminated_length": 137.80589599609374, "completions/min_length": 42.8, "completions/min_terminated_length": 42.8, "epoch": 0.000877848709562397, "frac_reward_zero_std": 0.425, "grad_norm": 0.15925540030002594, "kl": 0.0005223496879807499, "learning_rate": 6.921428571428571e-08, "loss": 0.0, "num_tokens": 71511068.0, "reward": -0.134375, "reward_std": 0.5268378615379333, "rewards/verify_chess_move/mean": -0.134375, "rewards/verify_chess_move/std": 0.9887067079544067, "step": 970 }, { "completion_length": 364.6, "completions/clipped_ratio": 0.0, "completions/max_length": 364.6, "completions/max_terminated_length": 364.6, "completions/mean_length": 140.6203125, "completions/mean_terminated_length": 140.6203125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0008823737029106568, "frac_reward_zero_std": 0.3625, "grad_norm": 0.13523711264133453, "kl": 0.0005570327062741854, "learning_rate": 6.957142857142857e-08, "loss": 0.0, "num_tokens": 71892230.0, "reward": -0.1984375, "reward_std": 0.5615137338638305, "rewards/verify_chess_move/mean": -0.1984375, "rewards/verify_chess_move/std": 0.9763812065124512, "step": 975 }, { "completion_length": 361.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 130.8171875, "completions/mean_terminated_length": 130.8171875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0008868986962589165, "frac_reward_zero_std": 0.35625, "grad_norm": 0.1468433290719986, "kl": 0.000503795667009399, "learning_rate": 6.992857142857142e-08, "loss": 0.0, "num_tokens": 72254516.0, "reward": -0.0859375, "reward_std": 0.5719192802906037, "rewards/verify_chess_move/mean": -0.0859375, "rewards/verify_chess_move/std": 0.9955902576446534, "step": 980 }, { "completion_length": 333.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 126.59765625, "completions/mean_terminated_length": 126.59765625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0008914236896071763, "frac_reward_zero_std": 0.35625, "grad_norm": 0.13413649797439575, "kl": 0.000564046304589283, "learning_rate": 7.028571428571428e-08, "loss": 0.0, "num_tokens": 72614345.0, "reward": -0.1640625, "reward_std": 0.5798594415187835, "rewards/verify_chess_move/mean": -0.1640625, "rewards/verify_chess_move/std": 0.9844985008239746, "step": 985 }, { "completion_length": 391.2, "completions/clipped_ratio": 0.0, "completions/max_length": 391.2, "completions/max_terminated_length": 391.2, "completions/mean_length": 137.13203125, "completions/mean_terminated_length": 137.13203125, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0008959486829554361, "frac_reward_zero_std": 0.39375, "grad_norm": 0.13358400762081146, "kl": 0.0004998274958779803, "learning_rate": 7.064285714285714e-08, "loss": 0.0, "num_tokens": 72990546.0, "reward": -0.109375, "reward_std": 0.556071937084198, "rewards/verify_chess_move/mean": -0.109375, "rewards/verify_chess_move/std": 0.9862615823745727, "step": 990 }, { "completion_length": 350.8, "completions/clipped_ratio": 0.0, "completions/max_length": 350.8, "completions/max_terminated_length": 350.8, "completions/mean_length": 141.43984375, "completions/mean_terminated_length": 141.43984375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0009004736763036958, "frac_reward_zero_std": 0.30625, "grad_norm": 0.15050140023231506, "kl": 0.0005557332166063134, "learning_rate": 7.099999999999999e-08, "loss": 0.0, "num_tokens": 73374693.0, "reward": -0.1703125, "reward_std": 0.621318131685257, "rewards/verify_chess_move/mean": -0.1703125, "rewards/verify_chess_move/std": 0.977129864692688, "step": 995 }, { "completion_length": 343.6, "completions/clipped_ratio": 0.0, "completions/max_length": 343.6, "completions/max_terminated_length": 343.6, "completions/mean_length": 130.83203125, "completions/mean_terminated_length": 130.83203125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0009049986696519556, "frac_reward_zero_std": 0.34375, "grad_norm": 0.15061236917972565, "kl": 0.000547427215224161, "learning_rate": 7.135714285714285e-08, "loss": 0.0, "num_tokens": 73740086.0, "reward": -0.09375, "reward_std": 0.5971547484397888, "rewards/verify_chess_move/mean": -0.09375, "rewards/verify_chess_move/std": 0.9934443354606628, "step": 1000 }, { "completion_length": 346.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 125.9875, "completions/mean_terminated_length": 125.9875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0009095236630002154, "frac_reward_zero_std": 0.4625, "grad_norm": 0.16403420269489288, "kl": 0.0005453133479022653, "learning_rate": 7.171428571428571e-08, "loss": 0.0, "num_tokens": 74100270.0, "reward": -0.10625, "reward_std": 0.4792289316654205, "rewards/verify_chess_move/mean": -0.10625, "rewards/verify_chess_move/std": 0.9865412235260009, "step": 1005 }, { "completion_length": 471.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 471.4, "completions/max_terminated_length": 384.0, "completions/mean_length": 136.56328125, "completions/mean_terminated_length": 136.06571044921876, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0009140486563484751, "frac_reward_zero_std": 0.45625, "grad_norm": 0.11190203577280045, "kl": 0.00043647904685713, "learning_rate": 7.207142857142857e-08, "loss": 0.0, "num_tokens": 74475247.0, "reward": -0.125, "reward_std": 0.48855139017105104, "rewards/verify_chess_move/mean": -0.125, "rewards/verify_chess_move/std": 0.9865574240684509, "step": 1010 }, { "completion_length": 410.4, "completions/clipped_ratio": 0.0, "completions/max_length": 410.4, "completions/max_terminated_length": 410.4, "completions/mean_length": 131.34140625, "completions/mean_terminated_length": 131.34140625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.000918573649696735, "frac_reward_zero_std": 0.28125, "grad_norm": 0.16756542026996613, "kl": 0.0004636066268176364, "learning_rate": 7.242857142857142e-08, "loss": 0.0, "num_tokens": 74839212.0, "reward": -0.06875, "reward_std": 0.6538152694702148, "rewards/verify_chess_move/mean": -0.06875, "rewards/verify_chess_move/std": 0.998385488986969, "step": 1015 }, { "completion_length": 358.6, "completions/clipped_ratio": 0.0, "completions/max_length": 358.6, "completions/max_terminated_length": 358.6, "completions/mean_length": 133.8359375, "completions/mean_terminated_length": 133.8359375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0009230986430449948, "frac_reward_zero_std": 0.375, "grad_norm": 0.13480636477470398, "kl": 0.0005407270108662487, "learning_rate": 7.278571428571428e-08, "loss": 0.0, "num_tokens": 75208738.0, "reward": -0.16875, "reward_std": 0.5579286217689514, "rewards/verify_chess_move/mean": -0.16875, "rewards/verify_chess_move/std": 0.9869429230690002, "step": 1020 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 132.04375, "completions/mean_terminated_length": 132.04375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0009276236363932545, "frac_reward_zero_std": 0.4, "grad_norm": 0.16147522628307343, "kl": 0.0005440507645289472, "learning_rate": 7.314285714285714e-08, "loss": 0.0, "num_tokens": 75576314.0, "reward": -0.1109375, "reward_std": 0.5432415246963501, "rewards/verify_chess_move/mean": -0.1109375, "rewards/verify_chess_move/std": 0.9928311586380005, "step": 1025 }, { "completion_length": 356.4, "completions/clipped_ratio": 0.0, "completions/max_length": 356.4, "completions/max_terminated_length": 356.4, "completions/mean_length": 131.25859375, "completions/mean_terminated_length": 131.25859375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0009321486297415143, "frac_reward_zero_std": 0.31875, "grad_norm": 0.15971393883228302, "kl": 0.0005027012495702366, "learning_rate": 7.349999999999999e-08, "loss": 0.0, "num_tokens": 75941501.0, "reward": -0.19375, "reward_std": 0.6070104598999023, "rewards/verify_chess_move/mean": -0.19375, "rewards/verify_chess_move/std": 0.9685346007347106, "step": 1030 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 125.34765625, "completions/mean_terminated_length": 125.34765625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0009366736230897741, "frac_reward_zero_std": 0.3625, "grad_norm": 0.16320545971393585, "kl": 0.0005024338434850506, "learning_rate": 7.385714285714285e-08, "loss": 0.0, "num_tokens": 76298282.0, "reward": -0.0890625, "reward_std": 0.5761230111122131, "rewards/verify_chess_move/mean": -0.0890625, "rewards/verify_chess_move/std": 0.9902339339256286, "step": 1035 }, { "completion_length": 379.8, "completions/clipped_ratio": 0.0, "completions/max_length": 379.8, "completions/max_terminated_length": 379.8, "completions/mean_length": 139.24140625, "completions/mean_terminated_length": 139.24140625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0009411986164380338, "frac_reward_zero_std": 0.35625, "grad_norm": 0.12962330877780914, "kl": 0.0005337823933587061, "learning_rate": 7.421428571428571e-08, "loss": 0.0, "num_tokens": 76677103.0, "reward": -0.0171875, "reward_std": 0.5630335688591004, "rewards/verify_chess_move/mean": -0.0171875, "rewards/verify_chess_move/std": 0.996393084526062, "step": 1040 }, { "completion_length": 338.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 128.43984375, "completions/mean_terminated_length": 128.43984375, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 0.0009457236097862936, "frac_reward_zero_std": 0.4, "grad_norm": 0.17027801275253296, "kl": 0.0005758762135883444, "learning_rate": 7.457142857142856e-08, "loss": 0.0, "num_tokens": 77038882.0, "reward": -0.1625, "reward_std": 0.5409309804439545, "rewards/verify_chess_move/mean": -0.1625, "rewards/verify_chess_move/std": 0.9816642642021179, "step": 1045 }, { "completion_length": 356.2, "completions/clipped_ratio": 0.0, "completions/max_length": 356.2, "completions/max_terminated_length": 356.2, "completions/mean_length": 142.09296875, "completions/mean_terminated_length": 142.09296875, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0009502486031345534, "frac_reward_zero_std": 0.45625, "grad_norm": 0.14419004321098328, "kl": 0.0005607663399132435, "learning_rate": 7.492857142857142e-08, "loss": 0.0, "num_tokens": 77424489.0, "reward": -0.1015625, "reward_std": 0.4877501964569092, "rewards/verify_chess_move/mean": -0.1015625, "rewards/verify_chess_move/std": 0.9926938414573669, "step": 1050 }, { "completion_length": 368.2, "completions/clipped_ratio": 0.0, "completions/max_length": 368.2, "completions/max_terminated_length": 368.2, "completions/mean_length": 127.15, "completions/mean_terminated_length": 127.15, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0009547735964828131, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13973493874073029, "kl": 0.0005121916270582006, "learning_rate": 7.528571428571428e-08, "loss": 0.0, "num_tokens": 77785545.0, "reward": -0.1296875, "reward_std": 0.546811717748642, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.9866021871566772, "step": 1055 }, { "completion_length": 425.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 425.8, "completions/max_terminated_length": 344.8, "completions/mean_length": 134.515625, "completions/mean_terminated_length": 134.02525634765624, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0009592985898310729, "frac_reward_zero_std": 0.35625, "grad_norm": 0.06981560587882996, "kl": 0.0005083508559437178, "learning_rate": 7.564285714285713e-08, "loss": 0.0, "num_tokens": 78154461.0, "reward": -0.0515625, "reward_std": 0.5739653170108795, "rewards/verify_chess_move/mean": -0.0515625, "rewards/verify_chess_move/std": 0.9927350878715515, "step": 1060 }, { "completion_length": 419.4, "completions/clipped_ratio": 0.0, "completions/max_length": 419.4, "completions/max_terminated_length": 419.4, "completions/mean_length": 128.24921875, "completions/mean_terminated_length": 128.24921875, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.0009638235831793328, "frac_reward_zero_std": 0.3, "grad_norm": 0.15907180309295654, "kl": 0.0004732689075353846, "learning_rate": 7.599999999999999e-08, "loss": 0.0, "num_tokens": 78513732.0, "reward": -0.0578125, "reward_std": 0.6151089787483215, "rewards/verify_chess_move/mean": -0.0578125, "rewards/verify_chess_move/std": 0.9888334274291992, "step": 1065 }, { "completion_length": 422.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 422.8, "completions/max_terminated_length": 367.0, "completions/mean_length": 129.70234375, "completions/mean_terminated_length": 129.2083526611328, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0009683485765275925, "frac_reward_zero_std": 0.31875, "grad_norm": 0.13346613943576813, "kl": 0.0005097436536743771, "learning_rate": 7.635714285714285e-08, "loss": 0.0, "num_tokens": 78875391.0, "reward": -0.05625, "reward_std": 0.6168823599815368, "rewards/verify_chess_move/mean": -0.05625, "rewards/verify_chess_move/std": 0.9921198487281799, "step": 1070 }, { "completion_length": 428.2, "completions/clipped_ratio": 0.0, "completions/max_length": 428.2, "completions/max_terminated_length": 428.2, "completions/mean_length": 147.02578125, "completions/mean_terminated_length": 147.02578125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0009728735698758523, "frac_reward_zero_std": 0.3625, "grad_norm": 0.09851451218128204, "kl": 0.0005054260661381705, "learning_rate": 7.671428571428571e-08, "loss": 0.0, "num_tokens": 79265776.0, "reward": -0.2046875, "reward_std": 0.5777911722660065, "rewards/verify_chess_move/mean": -0.2046875, "rewards/verify_chess_move/std": 0.9728338599205018, "step": 1075 }, { "completion_length": 374.6, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 132.32421875, "completions/mean_terminated_length": 132.32421875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.000977398563224112, "frac_reward_zero_std": 0.35, "grad_norm": 0.2565191686153412, "kl": 0.0007154139208068955, "learning_rate": 7.707142857142856e-08, "loss": 0.0, "num_tokens": 79633335.0, "reward": -0.10625, "reward_std": 0.6058827042579651, "rewards/verify_chess_move/mean": -0.10625, "rewards/verify_chess_move/std": 0.985037338733673, "step": 1080 }, { "completion_length": 422.4, "completions/clipped_ratio": 0.0, "completions/max_length": 422.4, "completions/max_terminated_length": 422.4, "completions/mean_length": 132.753125, "completions/mean_terminated_length": 132.753125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0009819235565723717, "frac_reward_zero_std": 0.40625, "grad_norm": 0.13871613144874573, "kl": 0.0004902343363028195, "learning_rate": 7.742857142857142e-08, "loss": 0.0, "num_tokens": 80002819.0, "reward": -0.1625, "reward_std": 0.5508137226104737, "rewards/verify_chess_move/mean": -0.1625, "rewards/verify_chess_move/std": 0.9786080241203308, "step": 1085 }, { "completion_length": 333.4, "completions/clipped_ratio": 0.0, "completions/max_length": 333.4, "completions/max_terminated_length": 333.4, "completions/mean_length": 132.48828125, "completions/mean_terminated_length": 132.48828125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0009864485499206316, "frac_reward_zero_std": 0.3875, "grad_norm": 0.14237618446350098, "kl": 0.0006122174476331565, "learning_rate": 7.778571428571429e-08, "loss": 0.0, "num_tokens": 80370932.0, "reward": -0.1171875, "reward_std": 0.5522470355033875, "rewards/verify_chess_move/mean": -0.1171875, "rewards/verify_chess_move/std": 0.9830397486686706, "step": 1090 }, { "completion_length": 359.4, "completions/clipped_ratio": 0.0, "completions/max_length": 359.4, "completions/max_terminated_length": 359.4, "completions/mean_length": 134.7140625, "completions/mean_terminated_length": 134.7140625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0009909735432688915, "frac_reward_zero_std": 0.3, "grad_norm": 0.15656450390815735, "kl": 0.0005365884821003419, "learning_rate": 7.814285714285713e-08, "loss": 0.0, "num_tokens": 80742054.0, "reward": -0.0859375, "reward_std": 0.6405589818954468, "rewards/verify_chess_move/mean": -0.0859375, "rewards/verify_chess_move/std": 0.9926385998725891, "step": 1095 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/max_terminated_length": 359.2, "completions/mean_length": 141.20078125, "completions/mean_terminated_length": 141.20078125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0009954985366171511, "frac_reward_zero_std": 0.35625, "grad_norm": 0.14803890883922577, "kl": 0.0005815417171106674, "learning_rate": 7.85e-08, "loss": 0.0, "num_tokens": 81123191.0, "reward": -0.059375, "reward_std": 0.576392138004303, "rewards/verify_chess_move/mean": -0.059375, "rewards/verify_chess_move/std": 0.9867865562438964, "step": 1100 }, { "completion_length": 330.4, "completions/clipped_ratio": 0.0, "completions/max_length": 330.4, "completions/max_terminated_length": 330.4, "completions/mean_length": 133.55234375, "completions/mean_terminated_length": 133.55234375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.001000023529965411, "frac_reward_zero_std": 0.34375, "grad_norm": 0.16475848853588104, "kl": 0.0006426528723750379, "learning_rate": 7.885714285714286e-08, "loss": 0.0, "num_tokens": 81494794.0, "reward": -0.1421875, "reward_std": 0.6119064629077912, "rewards/verify_chess_move/mean": -0.1421875, "rewards/verify_chess_move/std": 0.9780222654342652, "step": 1105 }, { "completion_length": 397.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 137.603125, "completions/mean_terminated_length": 137.603125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0010045485233136707, "frac_reward_zero_std": 0.3875, "grad_norm": 0.17803263664245605, "kl": 0.0005437554624222685, "learning_rate": 7.92142857142857e-08, "loss": 0.0, "num_tokens": 81870910.0, "reward": -0.134375, "reward_std": 0.54378662109375, "rewards/verify_chess_move/mean": -0.134375, "rewards/verify_chess_move/std": 0.9817947387695313, "step": 1110 }, { "completion_length": 408.2, "completions/clipped_ratio": 0.0, "completions/max_length": 408.2, "completions/max_terminated_length": 408.2, "completions/mean_length": 141.8984375, "completions/mean_terminated_length": 141.8984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0010090735166619306, "frac_reward_zero_std": 0.30625, "grad_norm": 0.11227331310510635, "kl": 0.0005046506170401699, "learning_rate": 7.957142857142857e-08, "loss": 0.0, "num_tokens": 82250508.0, "reward": -0.0578125, "reward_std": 0.6268537521362305, "rewards/verify_chess_move/mean": -0.0578125, "rewards/verify_chess_move/std": 0.9902273178100586, "step": 1115 }, { "completion_length": 348.8, "completions/clipped_ratio": 0.0, "completions/max_length": 348.8, "completions/max_terminated_length": 348.8, "completions/mean_length": 134.87890625, "completions/mean_terminated_length": 134.87890625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0010135985100101902, "frac_reward_zero_std": 0.33125, "grad_norm": 0.1763082891702652, "kl": 0.0005487060273480892, "learning_rate": 7.992857142857143e-08, "loss": 0.0, "num_tokens": 82622145.0, "reward": 0.0, "reward_std": 0.5900163650512695, "rewards/verify_chess_move/mean": 0.0, "rewards/verify_chess_move/std": 0.9922482013702393, "step": 1120 }, { "completion_length": 337.6, "completions/clipped_ratio": 0.0, "completions/max_length": 337.6, "completions/max_terminated_length": 337.6, "completions/mean_length": 140.8375, "completions/mean_terminated_length": 140.8375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.00101812350335845, "frac_reward_zero_std": 0.34375, "grad_norm": 0.15848250687122345, "kl": 0.0006055619589460548, "learning_rate": 8.028571428571427e-08, "loss": 0.0, "num_tokens": 83004993.0, "reward": -0.03125, "reward_std": 0.5936439037322998, "rewards/verify_chess_move/mean": -0.03125, "rewards/verify_chess_move/std": 0.9951650023460388, "step": 1125 }, { "completion_length": 340.8, "completions/clipped_ratio": 0.0, "completions/max_length": 340.8, "completions/max_terminated_length": 340.8, "completions/mean_length": 126.47734375, "completions/mean_terminated_length": 126.47734375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0010226484967067098, "frac_reward_zero_std": 0.38125, "grad_norm": 0.1403663605451584, "kl": 0.0005389185069361701, "learning_rate": 8.064285714285714e-08, "loss": 0.0, "num_tokens": 83364748.0, "reward": -0.0515625, "reward_std": 0.5594013333320618, "rewards/verify_chess_move/mean": -0.0515625, "rewards/verify_chess_move/std": 0.9896413087844849, "step": 1130 }, { "completion_length": 341.4, "completions/clipped_ratio": 0.0, "completions/max_length": 341.4, "completions/max_terminated_length": 341.4, "completions/mean_length": 133.9265625, "completions/mean_terminated_length": 133.9265625, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0010271734900549696, "frac_reward_zero_std": 0.3, "grad_norm": 0.1593542844057083, "kl": 0.0005937999070738442, "learning_rate": 8.1e-08, "loss": 0.0, "num_tokens": 83735622.0, "reward": -0.1359375, "reward_std": 0.6365753293037415, "rewards/verify_chess_move/mean": -0.1359375, "rewards/verify_chess_move/std": 0.9863779902458191, "step": 1135 }, { "completion_length": 441.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 441.2, "completions/max_terminated_length": 355.6, "completions/mean_length": 134.771875, "completions/mean_terminated_length": 134.2862548828125, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.0010316984834032295, "frac_reward_zero_std": 0.36875, "grad_norm": 0.13736718893051147, "kl": 0.0004749976043967763, "learning_rate": 8.135714285714286e-08, "loss": 0.0, "num_tokens": 84108706.0, "reward": -0.240625, "reward_std": 0.5634514212608337, "rewards/verify_chess_move/mean": -0.240625, "rewards/verify_chess_move/std": 0.9677720189094543, "step": 1140 }, { "completion_length": 400.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 140.58984375, "completions/mean_terminated_length": 140.58984375, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.0010362234767514892, "frac_reward_zero_std": 0.35625, "grad_norm": 0.14307811856269836, "kl": 0.0005803378994642117, "learning_rate": 8.171428571428571e-08, "loss": 0.0, "num_tokens": 84487677.0, "reward": -0.15625, "reward_std": 0.5807031989097595, "rewards/verify_chess_move/mean": -0.15625, "rewards/verify_chess_move/std": 0.983121919631958, "step": 1145 }, { "completion_length": 386.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 131.640625, "completions/mean_terminated_length": 131.640625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.001040748470099749, "frac_reward_zero_std": 0.39375, "grad_norm": 0.15388289093971252, "kl": 0.0005103645823510306, "learning_rate": 8.207142857142857e-08, "loss": 0.0, "num_tokens": 84855137.0, "reward": -0.159375, "reward_std": 0.5564497828483581, "rewards/verify_chess_move/mean": -0.159375, "rewards/verify_chess_move/std": 0.9843539953231811, "step": 1150 }, { "completion_length": 394.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 133.484375, "completions/mean_terminated_length": 133.484375, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0010452734634480087, "frac_reward_zero_std": 0.45, "grad_norm": 0.14416436851024628, "kl": 0.0005074827191492659, "learning_rate": 8.242857142857143e-08, "loss": 0.0, "num_tokens": 85227101.0, "reward": -0.1828125, "reward_std": 0.494431746006012, "rewards/verify_chess_move/mean": -0.1828125, "rewards/verify_chess_move/std": 0.9696420192718506, "step": 1155 }, { "completion_length": 429.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 429.2, "completions/max_terminated_length": 347.2, "completions/mean_length": 136.40703125, "completions/mean_terminated_length": 135.92486572265625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0010497984567962686, "frac_reward_zero_std": 0.4125, "grad_norm": 0.15252959728240967, "kl": 0.0005181612609703734, "learning_rate": 8.278571428571428e-08, "loss": 0.0, "num_tokens": 85600438.0, "reward": -0.1640625, "reward_std": 0.5450302720069885, "rewards/verify_chess_move/mean": -0.1640625, "rewards/verify_chess_move/std": 0.9864589214324951, "step": 1160 }, { "completion_length": 422.8, "completions/clipped_ratio": 0.0, "completions/max_length": 422.8, "completions/max_terminated_length": 422.8, "completions/mean_length": 133.31328125, "completions/mean_terminated_length": 133.31328125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0010543234501445282, "frac_reward_zero_std": 0.4, "grad_norm": 0.1115526407957077, "kl": 0.0005024678336667421, "learning_rate": 8.314285714285714e-08, "loss": 0.0, "num_tokens": 85972311.0, "reward": -0.1921875, "reward_std": 0.548285037279129, "rewards/verify_chess_move/mean": -0.1921875, "rewards/verify_chess_move/std": 0.9769058465957642, "step": 1165 }, { "completion_length": 427.8, "completions/clipped_ratio": 0.0, "completions/max_length": 427.8, "completions/max_terminated_length": 427.8, "completions/mean_length": 135.8953125, "completions/mean_terminated_length": 135.8953125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.001058848443492788, "frac_reward_zero_std": 0.325, "grad_norm": 0.13560707867145538, "kl": 0.0005087394007205148, "learning_rate": 8.35e-08, "loss": 0.0, "num_tokens": 86345017.0, "reward": -0.14375, "reward_std": 0.6133126020431519, "rewards/verify_chess_move/mean": -0.14375, "rewards/verify_chess_move/std": 0.9869972944259644, "step": 1170 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 128.2328125, "completions/mean_terminated_length": 128.2328125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0010633734368410478, "frac_reward_zero_std": 0.4, "grad_norm": 0.11825694143772125, "kl": 0.0005375636550525087, "learning_rate": 8.385714285714285e-08, "loss": 0.0, "num_tokens": 86707987.0, "reward": -0.1578125, "reward_std": 0.5399922788143158, "rewards/verify_chess_move/mean": -0.1578125, "rewards/verify_chess_move/std": 0.980218255519867, "step": 1175 }, { "completion_length": 349.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 133.6609375, "completions/mean_terminated_length": 133.6609375, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0010678984301893076, "frac_reward_zero_std": 0.36875, "grad_norm": 0.122072234749794, "kl": 0.0006013072244968499, "learning_rate": 8.421428571428571e-08, "loss": 0.0, "num_tokens": 87079121.0, "reward": -0.10625, "reward_std": 0.5739162564277649, "rewards/verify_chess_move/mean": -0.10625, "rewards/verify_chess_move/std": 0.9913096308708191, "step": 1180 }, { "completion_length": 415.2, "completions/clipped_ratio": 0.0, "completions/max_length": 415.2, "completions/max_terminated_length": 415.2, "completions/mean_length": 132.84609375, "completions/mean_terminated_length": 132.84609375, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0010724234235375673, "frac_reward_zero_std": 0.39375, "grad_norm": 0.08336970210075378, "kl": 0.0005091223513773002, "learning_rate": 8.457142857142857e-08, "loss": 0.0, "num_tokens": 87447988.0, "reward": -0.1265625, "reward_std": 0.5439844250679016, "rewards/verify_chess_move/mean": -0.1265625, "rewards/verify_chess_move/std": 0.985704493522644, "step": 1185 }, { "completion_length": 451.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 451.6, "completions/max_terminated_length": 363.8, "completions/mean_length": 141.1109375, "completions/mean_terminated_length": 140.61900329589844, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0010769484168858272, "frac_reward_zero_std": 0.39375, "grad_norm": 0.14480894804000854, "kl": 0.000509630485521484, "learning_rate": 8.492857142857142e-08, "loss": 0.0, "num_tokens": 87830410.0, "reward": -0.1328125, "reward_std": 0.5525155663490295, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9893287301063538, "step": 1190 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 457.4, "completions/max_terminated_length": 372.6, "completions/mean_length": 126.68046875, "completions/mean_terminated_length": 126.18095397949219, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.001081473410234087, "frac_reward_zero_std": 0.375, "grad_norm": 0.10995178669691086, "kl": 0.0004614634064637357, "learning_rate": 8.528571428571428e-08, "loss": 0.0, "num_tokens": 88188337.0, "reward": -0.0640625, "reward_std": 0.5683420658111572, "rewards/verify_chess_move/mean": -0.0640625, "rewards/verify_chess_move/std": 0.9976949095726013, "step": 1195 }, { "completion_length": 427.2, "completions/clipped_ratio": 0.0, "completions/max_length": 427.2, "completions/max_terminated_length": 427.2, "completions/mean_length": 130.88359375, "completions/mean_terminated_length": 130.88359375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0010859984035823467, "frac_reward_zero_std": 0.34375, "grad_norm": 0.12160340696573257, "kl": 0.00045425660800901824, "learning_rate": 8.564285714285714e-08, "loss": 0.0, "num_tokens": 88552940.0, "reward": -0.0765625, "reward_std": 0.5796013355255127, "rewards/verify_chess_move/mean": -0.0765625, "rewards/verify_chess_move/std": 0.9864071011543274, "step": 1200 }, { "completion_length": 417.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 417.6, "completions/max_terminated_length": 330.8, "completions/mean_length": 130.2359375, "completions/mean_terminated_length": 129.72876434326173, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0010905233969306066, "frac_reward_zero_std": 0.35625, "grad_norm": 0.19692502915859222, "kl": 0.0005493954851317539, "learning_rate": 8.599999999999999e-08, "loss": 0.0, "num_tokens": 88919986.0, "reward": -0.1296875, "reward_std": 0.5700320601463318, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.9887852311134339, "step": 1205 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 150.5890625, "completions/mean_terminated_length": 150.5890625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0010950483902788662, "frac_reward_zero_std": 0.325, "grad_norm": 0.17169851064682007, "kl": 0.0006069650842619012, "learning_rate": 8.635714285714285e-08, "loss": 0.0, "num_tokens": 89314788.0, "reward": -0.08125, "reward_std": 0.6046400427818298, "rewards/verify_chess_move/mean": -0.08125, "rewards/verify_chess_move/std": 0.99577796459198, "step": 1210 }, { "completion_length": 373.6, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 139.70234375, "completions/mean_terminated_length": 139.70234375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.0010995733836271261, "frac_reward_zero_std": 0.35, "grad_norm": 0.14881503582000732, "kl": 0.0005601852471954771, "learning_rate": 8.671428571428571e-08, "loss": 0.0, "num_tokens": 89694567.0, "reward": -0.1578125, "reward_std": 0.5867017269134521, "rewards/verify_chess_move/mean": -0.1578125, "rewards/verify_chess_move/std": 0.9835136890411377, "step": 1215 }, { "completion_length": 430.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 430.2, "completions/max_terminated_length": 338.2, "completions/mean_length": 123.778125, "completions/mean_terminated_length": 123.27073364257812, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0011040983769753858, "frac_reward_zero_std": 0.33125, "grad_norm": 0.15342827141284943, "kl": 0.0005030564971093554, "learning_rate": 8.707142857142857e-08, "loss": 0.0, "num_tokens": 90048571.0, "reward": -0.0796875, "reward_std": 0.6154653906822205, "rewards/verify_chess_move/mean": -0.0796875, "rewards/verify_chess_move/std": 0.9923671126365662, "step": 1220 }, { "completion_length": 382.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 137.1578125, "completions/mean_terminated_length": 137.1578125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0011086233703236457, "frac_reward_zero_std": 0.325, "grad_norm": 0.12654584646224976, "kl": 0.0005525705240870593, "learning_rate": 8.742857142857142e-08, "loss": 0.0, "num_tokens": 90421045.0, "reward": -0.1921875, "reward_std": 0.6135247707366943, "rewards/verify_chess_move/mean": -0.1921875, "rewards/verify_chess_move/std": 0.9736244201660156, "step": 1225 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 134.64609375, "completions/mean_terminated_length": 134.64609375, "completions/min_length": 41.6, "completions/min_terminated_length": 41.6, "epoch": 0.0011131483636719053, "frac_reward_zero_std": 0.39375, "grad_norm": 0.12760767340660095, "kl": 0.0005736597171562607, "learning_rate": 8.778571428571428e-08, "loss": 0.0, "num_tokens": 90791336.0, "reward": -0.0890625, "reward_std": 0.5589754462242127, "rewards/verify_chess_move/mean": -0.0890625, "rewards/verify_chess_move/std": 0.9914473056793213, "step": 1230 }, { "completion_length": 362.6, "completions/clipped_ratio": 0.0, "completions/max_length": 362.6, "completions/max_terminated_length": 362.6, "completions/mean_length": 132.14609375, "completions/mean_terminated_length": 132.14609375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0011176733570201652, "frac_reward_zero_std": 0.39375, "grad_norm": 0.11022810637950897, "kl": 0.0005744385558500653, "learning_rate": 8.814285714285714e-08, "loss": 0.0, "num_tokens": 91158115.0, "reward": -0.0578125, "reward_std": 0.5476154565811158, "rewards/verify_chess_move/mean": -0.0578125, "rewards/verify_chess_move/std": 0.9873874306678772, "step": 1235 }, { "completion_length": 406.8, "completions/clipped_ratio": 0.0, "completions/max_length": 406.8, "completions/max_terminated_length": 406.8, "completions/mean_length": 130.03984375, "completions/mean_terminated_length": 130.03984375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.001122198350368425, "frac_reward_zero_std": 0.30625, "grad_norm": 0.15211078524589539, "kl": 0.0005544881743844599, "learning_rate": 8.849999999999999e-08, "loss": 0.0, "num_tokens": 91520558.0, "reward": -0.046875, "reward_std": 0.6206796646118165, "rewards/verify_chess_move/mean": -0.046875, "rewards/verify_chess_move/std": 0.995402717590332, "step": 1240 }, { "completion_length": 368.4, "completions/clipped_ratio": 0.0, "completions/max_length": 368.4, "completions/max_terminated_length": 368.4, "completions/mean_length": 136.940625, "completions/mean_terminated_length": 136.940625, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 0.0011267233437166847, "frac_reward_zero_std": 0.3375, "grad_norm": 0.17067250609397888, "kl": 0.000545699585654802, "learning_rate": 8.885714285714285e-08, "loss": 0.0, "num_tokens": 91894642.0, "reward": -0.1453125, "reward_std": 0.5964841723442078, "rewards/verify_chess_move/mean": -0.1453125, "rewards/verify_chess_move/std": 0.9858521342277526, "step": 1245 }, { "completion_length": 363.8, "completions/clipped_ratio": 0.0, "completions/max_length": 363.8, "completions/max_terminated_length": 363.8, "completions/mean_length": 124.7734375, "completions/mean_terminated_length": 124.7734375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0011312483370649446, "frac_reward_zero_std": 0.30625, "grad_norm": 0.14913058280944824, "kl": 0.0005420336322458752, "learning_rate": 8.921428571428571e-08, "loss": 0.0, "num_tokens": 92249040.0, "reward": -0.0171875, "reward_std": 0.6098863959312439, "rewards/verify_chess_move/mean": -0.0171875, "rewards/verify_chess_move/std": 1.0013042211532592, "step": 1250 }, { "completion_length": 388.6, "completions/clipped_ratio": 0.0, "completions/max_length": 388.6, "completions/max_terminated_length": 388.6, "completions/mean_length": 131.89609375, "completions/mean_terminated_length": 131.89609375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0011357733304132043, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1286267638206482, "kl": 0.0005580617901614459, "learning_rate": 8.957142857142856e-08, "loss": 0.0, "num_tokens": 92616331.0, "reward": -0.0671875, "reward_std": 0.5619341492652893, "rewards/verify_chess_move/mean": -0.0671875, "rewards/verify_chess_move/std": 0.9907590270042419, "step": 1255 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 126.79296875, "completions/mean_terminated_length": 126.79296875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0011402983237614641, "frac_reward_zero_std": 0.275, "grad_norm": 0.14209407567977905, "kl": 0.0005723544243664946, "learning_rate": 8.992857142857142e-08, "loss": 0.0, "num_tokens": 92975258.0, "reward": -0.1890625, "reward_std": 0.6600269496440887, "rewards/verify_chess_move/mean": -0.1890625, "rewards/verify_chess_move/std": 0.9739773035049438, "step": 1260 }, { "completion_length": 371.8, "completions/clipped_ratio": 0.0, "completions/max_length": 371.8, "completions/max_terminated_length": 371.8, "completions/mean_length": 138.89921875, "completions/mean_terminated_length": 138.89921875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0011448233171097238, "frac_reward_zero_std": 0.33125, "grad_norm": 0.1313125491142273, "kl": 0.0006007763966408674, "learning_rate": 9.028571428571428e-08, "loss": 0.0, "num_tokens": 93353809.0, "reward": -0.2453125, "reward_std": 0.606159245967865, "rewards/verify_chess_move/mean": -0.2453125, "rewards/verify_chess_move/std": 0.9676803231239319, "step": 1265 }, { "completion_length": 327.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 138.728125, "completions/mean_terminated_length": 138.728125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0011493483104579837, "frac_reward_zero_std": 0.3875, "grad_norm": 0.18960046768188477, "kl": 0.0006819841246397118, "learning_rate": 9.064285714285713e-08, "loss": 0.0, "num_tokens": 93734157.0, "reward": -0.146875, "reward_std": 0.537094247341156, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9882668018341064, "step": 1270 }, { "completion_length": 426.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.4, "completions/max_terminated_length": 342.0, "completions/mean_length": 127.528125, "completions/mean_terminated_length": 127.04064025878907, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0011538733038062433, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1199425607919693, "kl": 0.0004956723116265494, "learning_rate": 9.1e-08, "loss": 0.0, "num_tokens": 94094681.0, "reward": -0.2015625, "reward_std": 0.531987339258194, "rewards/verify_chess_move/mean": -0.2015625, "rewards/verify_chess_move/std": 0.9696355342864991, "step": 1275 }, { "completion_length": 340.6, "completions/clipped_ratio": 0.0, "completions/max_length": 340.6, "completions/max_terminated_length": 340.6, "completions/mean_length": 136.86328125, "completions/mean_terminated_length": 136.86328125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0011583982971545032, "frac_reward_zero_std": 0.39375, "grad_norm": 0.1543232500553131, "kl": 0.0006334073308607912, "learning_rate": 9.135714285714286e-08, "loss": 0.0, "num_tokens": 94470002.0, "reward": -0.2265625, "reward_std": 0.5479913532733918, "rewards/verify_chess_move/mean": -0.2265625, "rewards/verify_chess_move/std": 0.9723625779151917, "step": 1280 }, { "completion_length": 392.2, "completions/clipped_ratio": 0.0, "completions/max_length": 392.2, "completions/max_terminated_length": 392.2, "completions/mean_length": 129.23046875, "completions/mean_terminated_length": 129.23046875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0011629232905027629, "frac_reward_zero_std": 0.3875, "grad_norm": 0.12575185298919678, "kl": 0.0005508724245373742, "learning_rate": 9.171428571428572e-08, "loss": 0.0, "num_tokens": 94832785.0, "reward": -0.1625, "reward_std": 0.5377801954746246, "rewards/verify_chess_move/mean": -0.1625, "rewards/verify_chess_move/std": 0.97844557762146, "step": 1285 }, { "completion_length": 347.4, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/max_terminated_length": 347.4, "completions/mean_length": 131.865625, "completions/mean_terminated_length": 131.865625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0011674482838510227, "frac_reward_zero_std": 0.375, "grad_norm": 0.13457274436950684, "kl": 0.000620726354100043, "learning_rate": 9.207142857142856e-08, "loss": 0.0, "num_tokens": 95201541.0, "reward": -0.18125, "reward_std": 0.5494013786315918, "rewards/verify_chess_move/mean": -0.18125, "rewards/verify_chess_move/std": 0.97884042263031, "step": 1290 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 134.446875, "completions/mean_terminated_length": 134.446875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0011719732771992826, "frac_reward_zero_std": 0.40625, "grad_norm": 0.14454202353954315, "kl": 0.0006157371690278524, "learning_rate": 9.242857142857143e-08, "loss": 0.0, "num_tokens": 95573761.0, "reward": -0.1328125, "reward_std": 0.5333107590675354, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9911189794540405, "step": 1295 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 130.7578125, "completions/mean_terminated_length": 130.7578125, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0011764982705475423, "frac_reward_zero_std": 0.3625, "grad_norm": 0.17476123571395874, "kl": 0.0006412344133423176, "learning_rate": 9.278571428571429e-08, "loss": 0.0, "num_tokens": 95937859.0, "reward": -0.13125, "reward_std": 0.5883247971534729, "rewards/verify_chess_move/mean": -0.13125, "rewards/verify_chess_move/std": 0.9900909423828125, "step": 1300 }, { "completion_length": 362.8, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/max_terminated_length": 362.8, "completions/mean_length": 151.4140625, "completions/mean_terminated_length": 151.4140625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0011810232638958021, "frac_reward_zero_std": 0.31875, "grad_norm": 0.1266242414712906, "kl": 0.0005945709641309804, "learning_rate": 9.314285714285714e-08, "loss": 0.0, "num_tokens": 96335141.0, "reward": -0.215625, "reward_std": 0.6095331907272339, "rewards/verify_chess_move/mean": -0.215625, "rewards/verify_chess_move/std": 0.9701293468475342, "step": 1305 }, { "completion_length": 444.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.6, "completions/max_terminated_length": 365.4, "completions/mean_length": 130.68125, "completions/mean_terminated_length": 130.20010986328126, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0011855482572440618, "frac_reward_zero_std": 0.33125, "grad_norm": 0.13515536487102509, "kl": 0.0005086431759082188, "learning_rate": 9.35e-08, "loss": 0.0, "num_tokens": 96699885.0, "reward": -0.084375, "reward_std": 0.612098902463913, "rewards/verify_chess_move/mean": -0.084375, "rewards/verify_chess_move/std": 0.9953944444656372, "step": 1310 }, { "completion_length": 501.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 501.8, "completions/max_terminated_length": 413.0, "completions/mean_length": 133.65859375, "completions/mean_terminated_length": 133.1667938232422, "completions/min_length": 42.2, "completions/min_terminated_length": 42.2, "epoch": 0.0011900732505923217, "frac_reward_zero_std": 0.38125, "grad_norm": 0.07811259478330612, "kl": 0.00047374290825246134, "learning_rate": 9.385714285714286e-08, "loss": 0.0, "num_tokens": 97068232.0, "reward": -0.0484375, "reward_std": 0.541678375005722, "rewards/verify_chess_move/mean": -0.0484375, "rewards/verify_chess_move/std": 0.9981115102767945, "step": 1315 }, { "completion_length": 353.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 122.7640625, "completions/mean_terminated_length": 122.7640625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0011945982439405813, "frac_reward_zero_std": 0.3625, "grad_norm": 0.12699240446090698, "kl": 0.0005876494361473306, "learning_rate": 9.42142857142857e-08, "loss": 0.0, "num_tokens": 97423490.0, "reward": -0.15, "reward_std": 0.5863252639770508, "rewards/verify_chess_move/mean": -0.15, "rewards/verify_chess_move/std": 0.9833355784416199, "step": 1320 }, { "completion_length": 466.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 466.6, "completions/max_terminated_length": 455.8, "completions/mean_length": 129.39453125, "completions/mean_terminated_length": 128.89501037597657, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0011991232372888412, "frac_reward_zero_std": 0.35625, "grad_norm": 0.05973820388317108, "kl": 0.000505186113241507, "learning_rate": 9.457142857142857e-08, "loss": 0.0, "num_tokens": 97785859.0, "reward": -0.1078125, "reward_std": 0.5901071965694428, "rewards/verify_chess_move/mean": -0.1078125, "rewards/verify_chess_move/std": 0.9876677036285401, "step": 1325 }, { "completion_length": 369.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 129.64921875, "completions/mean_terminated_length": 129.64921875, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0012036482306371009, "frac_reward_zero_std": 0.375, "grad_norm": 0.15449018776416779, "kl": 0.0005990727531752782, "learning_rate": 9.492857142857143e-08, "loss": 0.0, "num_tokens": 98148730.0, "reward": 0.0125, "reward_std": 0.5697574734687805, "rewards/verify_chess_move/mean": 0.0125, "rewards/verify_chess_move/std": 0.9961340188980102, "step": 1330 }, { "completion_length": 369.6, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/max_terminated_length": 369.6, "completions/mean_length": 141.20390625, "completions/mean_terminated_length": 141.20390625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0012081732239853607, "frac_reward_zero_std": 0.4125, "grad_norm": 0.1322152465581894, "kl": 0.0006163175688925548, "learning_rate": 9.528571428571428e-08, "loss": 0.0, "num_tokens": 98532887.0, "reward": -0.1734375, "reward_std": 0.5131727874279022, "rewards/verify_chess_move/mean": -0.1734375, "rewards/verify_chess_move/std": 0.9789072036743164, "step": 1335 }, { "completion_length": 350.8, "completions/clipped_ratio": 0.0, "completions/max_length": 350.8, "completions/max_terminated_length": 350.8, "completions/mean_length": 148.309375, "completions/mean_terminated_length": 148.309375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0012126982173336206, "frac_reward_zero_std": 0.4, "grad_norm": 0.1478617787361145, "kl": 0.00067221667177364, "learning_rate": 9.564285714285714e-08, "loss": 0.0, "num_tokens": 98928515.0, "reward": -0.16875, "reward_std": 0.5381505787372589, "rewards/verify_chess_move/mean": -0.16875, "rewards/verify_chess_move/std": 0.9867131829261779, "step": 1340 }, { "completion_length": 335.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 134.09453125, "completions/mean_terminated_length": 134.09453125, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.0012172232106818803, "frac_reward_zero_std": 0.35625, "grad_norm": 0.15065155923366547, "kl": 0.0006899787864313112, "learning_rate": 9.6e-08, "loss": 0.0, "num_tokens": 99300860.0, "reward": -0.2234375, "reward_std": 0.5916863322257996, "rewards/verify_chess_move/mean": -0.2234375, "rewards/verify_chess_move/std": 0.9732407093048095, "step": 1345 }, { "completion_length": 416.8, "completions/clipped_ratio": 0.0, "completions/max_length": 416.8, "completions/max_terminated_length": 416.8, "completions/mean_length": 129.54609375, "completions/mean_terminated_length": 129.54609375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0012217482040301402, "frac_reward_zero_std": 0.36875, "grad_norm": 0.13212734460830688, "kl": 0.0005630667487821484, "learning_rate": 9.635714285714286e-08, "loss": 0.0, "num_tokens": 99662831.0, "reward": -0.1171875, "reward_std": 0.5743900418281556, "rewards/verify_chess_move/mean": -0.1171875, "rewards/verify_chess_move/std": 0.9897769927978516, "step": 1350 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 123.47265625, "completions/mean_terminated_length": 123.47265625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0012262731973783998, "frac_reward_zero_std": 0.38125, "grad_norm": 0.12020699679851532, "kl": 0.0005880271298337903, "learning_rate": 9.671428571428571e-08, "loss": 0.0, "num_tokens": 100014060.0, "reward": 0.0640625, "reward_std": 0.5623464465141297, "rewards/verify_chess_move/mean": 0.0640625, "rewards/verify_chess_move/std": 0.9994649291038513, "step": 1355 }, { "completion_length": 438.6, "completions/clipped_ratio": 0.0, "completions/max_length": 438.6, "completions/max_terminated_length": 438.6, "completions/mean_length": 132.421875, "completions/mean_terminated_length": 132.421875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0012307981907266597, "frac_reward_zero_std": 0.3875, "grad_norm": 0.09993390738964081, "kl": 0.0005792179285890597, "learning_rate": 9.707142857142857e-08, "loss": 0.0, "num_tokens": 100380704.0, "reward": -0.109375, "reward_std": 0.5416184902191162, "rewards/verify_chess_move/mean": -0.109375, "rewards/verify_chess_move/std": 0.9915328621864319, "step": 1360 }, { "completion_length": 342.6, "completions/clipped_ratio": 0.0, "completions/max_length": 342.6, "completions/max_terminated_length": 342.6, "completions/mean_length": 130.14921875, "completions/mean_terminated_length": 130.14921875, "completions/min_length": 41.6, "completions/min_terminated_length": 41.6, "epoch": 0.0012353231840749193, "frac_reward_zero_std": 0.35, "grad_norm": 0.18416163325309753, "kl": 0.000694863955686742, "learning_rate": 9.742857142857143e-08, "loss": 0.0, "num_tokens": 100743887.0, "reward": -0.128125, "reward_std": 0.5877625823020936, "rewards/verify_chess_move/mean": -0.128125, "rewards/verify_chess_move/std": 0.9877972483634949, "step": 1365 }, { "completion_length": 361.8, "completions/clipped_ratio": 0.0, "completions/max_length": 361.8, "completions/max_terminated_length": 361.8, "completions/mean_length": 126.9578125, "completions/mean_terminated_length": 126.9578125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0012398481774231792, "frac_reward_zero_std": 0.35625, "grad_norm": 0.15410692989826202, "kl": 0.0006066494595870608, "learning_rate": 9.778571428571428e-08, "loss": 0.0, "num_tokens": 101102041.0, "reward": -0.0796875, "reward_std": 0.5757575511932373, "rewards/verify_chess_move/mean": -0.0796875, "rewards/verify_chess_move/std": 0.9883603572845459, "step": 1370 }, { "completion_length": 359.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 134.38046875, "completions/mean_terminated_length": 134.38046875, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.0012443731707714389, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1546029895544052, "kl": 0.0006390255060978234, "learning_rate": 9.814285714285714e-08, "loss": 0.0, "num_tokens": 101471392.0, "reward": -0.065625, "reward_std": 0.5954846024513245, "rewards/verify_chess_move/mean": -0.065625, "rewards/verify_chess_move/std": 0.9933204889297486, "step": 1375 }, { "completion_length": 339.6, "completions/clipped_ratio": 0.0, "completions/max_length": 339.6, "completions/max_terminated_length": 339.6, "completions/mean_length": 129.54375, "completions/mean_terminated_length": 129.54375, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0012488981641196988, "frac_reward_zero_std": 0.38125, "grad_norm": 0.17107778787612915, "kl": 0.0006511830977615318, "learning_rate": 9.85e-08, "loss": 0.0, "num_tokens": 101834992.0, "reward": -0.1140625, "reward_std": 0.5469310820102692, "rewards/verify_chess_move/mean": -0.1140625, "rewards/verify_chess_move/std": 0.9924014806747437, "step": 1380 }, { "completion_length": 392.4, "completions/clipped_ratio": 0.0, "completions/max_length": 392.4, "completions/max_terminated_length": 392.4, "completions/mean_length": 132.465625, "completions/mean_terminated_length": 132.465625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0012534231574679584, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1665506809949875, "kl": 0.0006323497220364516, "learning_rate": 9.885714285714285e-08, "loss": 0.0, "num_tokens": 102203076.0, "reward": -0.1640625, "reward_std": 0.6094264209270477, "rewards/verify_chess_move/mean": -0.1640625, "rewards/verify_chess_move/std": 0.9855400085449219, "step": 1385 }, { "completion_length": 375.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 125.20234375, "completions/mean_terminated_length": 125.20234375, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0012579481508162183, "frac_reward_zero_std": 0.28125, "grad_norm": 0.13246527314186096, "kl": 0.000577746974886395, "learning_rate": 9.921428571428571e-08, "loss": 0.0, "num_tokens": 102558087.0, "reward": -0.08125, "reward_std": 0.6416802883148194, "rewards/verify_chess_move/mean": -0.08125, "rewards/verify_chess_move/std": 0.9967874646186828, "step": 1390 }, { "completion_length": 373.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 137.41953125, "completions/mean_terminated_length": 137.41953125, "completions/min_length": 42.6, "completions/min_terminated_length": 42.6, "epoch": 0.0012624731441644782, "frac_reward_zero_std": 0.40625, "grad_norm": 0.16030752658843994, "kl": 0.0006275798805290833, "learning_rate": 9.957142857142857e-08, "loss": 0.0, "num_tokens": 102934088.0, "reward": -0.0390625, "reward_std": 0.5446668267250061, "rewards/verify_chess_move/mean": -0.0390625, "rewards/verify_chess_move/std": 0.9954768300056458, "step": 1395 }, { "completion_length": 360.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 132.853125, "completions/mean_terminated_length": 132.853125, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.0012669981375127378, "frac_reward_zero_std": 0.38125, "grad_norm": 0.13327282667160034, "kl": 0.0009129870733886492, "learning_rate": 9.992857142857142e-08, "loss": 0.0, "num_tokens": 103301076.0, "reward": -0.05625, "reward_std": 0.5556026697158813, "rewards/verify_chess_move/mean": -0.05625, "rewards/verify_chess_move/std": 0.9771528601646423, "step": 1400 }, { "completion_length": 374.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 123.97109375, "completions/mean_terminated_length": 123.97109375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0012715231308609977, "frac_reward_zero_std": 0.33125, "grad_norm": 0.1092817485332489, "kl": 0.000630542076396523, "learning_rate": 1.0028571428571428e-07, "loss": 0.0, "num_tokens": 103655815.0, "reward": 0.0234375, "reward_std": 0.5961671948432923, "rewards/verify_chess_move/mean": 0.0234375, "rewards/verify_chess_move/std": 1.0000301361083985, "step": 1405 }, { "completion_length": 355.2, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 139.7296875, "completions/mean_terminated_length": 139.7296875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0012760481242092574, "frac_reward_zero_std": 0.3375, "grad_norm": 0.18331748247146606, "kl": 0.0006646625713983667, "learning_rate": 1.0064285714285714e-07, "loss": 0.0, "num_tokens": 104033693.0, "reward": -0.0625, "reward_std": 0.5759856760501861, "rewards/verify_chess_move/mean": -0.0625, "rewards/verify_chess_move/std": 0.998637855052948, "step": 1410 }, { "completion_length": 361.6, "completions/clipped_ratio": 0.0, "completions/max_length": 361.6, "completions/max_terminated_length": 361.6, "completions/mean_length": 140.45625, "completions/mean_terminated_length": 140.45625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0012805731175575172, "frac_reward_zero_std": 0.33125, "grad_norm": 0.16018252074718475, "kl": 0.0006916164831636707, "learning_rate": 1.01e-07, "loss": 0.0, "num_tokens": 104413677.0, "reward": -0.0640625, "reward_std": 0.6013269305229187, "rewards/verify_chess_move/mean": -0.0640625, "rewards/verify_chess_move/std": 0.9893305540084839, "step": 1415 }, { "completion_length": 362.8, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/max_terminated_length": 362.8, "completions/mean_length": 138.9796875, "completions/mean_terminated_length": 138.9796875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.001285098110905777, "frac_reward_zero_std": 0.36875, "grad_norm": 0.1076941192150116, "kl": 0.0006802010530009284, "learning_rate": 1.0135714285714285e-07, "loss": 0.0, "num_tokens": 104791619.0, "reward": 0.003125, "reward_std": 0.5738678336143493, "rewards/verify_chess_move/mean": 0.003125, "rewards/verify_chess_move/std": 1.0003056049346923, "step": 1420 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 127.7046875, "completions/mean_terminated_length": 127.7046875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0012896231042540368, "frac_reward_zero_std": 0.3625, "grad_norm": 0.15639910101890564, "kl": 0.0006345396801407333, "learning_rate": 1.0171428571428571e-07, "loss": 0.0, "num_tokens": 105149505.0, "reward": -0.053125, "reward_std": 0.571080482006073, "rewards/verify_chess_move/mean": -0.053125, "rewards/verify_chess_move/std": 0.9831048488616944, "step": 1425 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 130.00546875, "completions/mean_terminated_length": 130.00546875, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0012941480976022964, "frac_reward_zero_std": 0.43125, "grad_norm": 0.1310967206954956, "kl": 0.0006197612832693266, "learning_rate": 1.0207142857142858e-07, "loss": 0.0, "num_tokens": 105516656.0, "reward": -0.203125, "reward_std": 0.4926583468914032, "rewards/verify_chess_move/mean": -0.203125, "rewards/verify_chess_move/std": 0.9770609378814697, "step": 1430 }, { "completion_length": 442.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 442.8, "completions/max_terminated_length": 353.4, "completions/mean_length": 135.459375, "completions/mean_terminated_length": 134.9672058105469, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0012986730909505563, "frac_reward_zero_std": 0.3875, "grad_norm": 0.12909221649169922, "kl": 0.0006243745247047627, "learning_rate": 1.0242857142857142e-07, "loss": 0.0, "num_tokens": 105888628.0, "reward": -0.090625, "reward_std": 0.5334671437740326, "rewards/verify_chess_move/mean": -0.090625, "rewards/verify_chess_move/std": 0.9951507568359375, "step": 1435 }, { "completion_length": 349.8, "completions/clipped_ratio": 0.0, "completions/max_length": 349.8, "completions/max_terminated_length": 349.8, "completions/mean_length": 138.7140625, "completions/mean_terminated_length": 138.7140625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0013031980842988162, "frac_reward_zero_std": 0.35625, "grad_norm": 0.15850494801998138, "kl": 0.0007497490541936713, "learning_rate": 1.0278571428571428e-07, "loss": 0.0, "num_tokens": 106264646.0, "reward": -0.0890625, "reward_std": 0.5943717956542969, "rewards/verify_chess_move/mean": -0.0890625, "rewards/verify_chess_move/std": 0.9931583762168884, "step": 1440 }, { "completion_length": 367.4, "completions/clipped_ratio": 0.0, "completions/max_length": 367.4, "completions/max_terminated_length": 367.4, "completions/mean_length": 137.01875, "completions/mean_terminated_length": 137.01875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0013077230776470758, "frac_reward_zero_std": 0.39375, "grad_norm": 0.13038940727710724, "kl": 0.0007232879632283584, "learning_rate": 1.0314285714285715e-07, "loss": 0.0, "num_tokens": 106640438.0, "reward": -0.23125, "reward_std": 0.5428306043148041, "rewards/verify_chess_move/mean": -0.23125, "rewards/verify_chess_move/std": 0.9746591329574585, "step": 1445 }, { "completion_length": 384.6, "completions/clipped_ratio": 0.0, "completions/max_length": 384.6, "completions/max_terminated_length": 384.6, "completions/mean_length": 130.37265625, "completions/mean_terminated_length": 130.37265625, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0013122480709953357, "frac_reward_zero_std": 0.39375, "grad_norm": 0.15015292167663574, "kl": 0.0006805029650422511, "learning_rate": 1.035e-07, "loss": 0.0, "num_tokens": 107004579.0, "reward": -0.1265625, "reward_std": 0.5303672194480896, "rewards/verify_chess_move/mean": -0.1265625, "rewards/verify_chess_move/std": 0.9916138768196106, "step": 1450 }, { "completion_length": 368.4, "completions/clipped_ratio": 0.0, "completions/max_length": 368.4, "completions/max_terminated_length": 368.4, "completions/mean_length": 137.60390625, "completions/mean_terminated_length": 137.60390625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0013167730643435954, "frac_reward_zero_std": 0.38125, "grad_norm": 0.12852178514003754, "kl": 0.0007029026573945885, "learning_rate": 1.0385714285714286e-07, "loss": 0.0, "num_tokens": 107382416.0, "reward": -0.0671875, "reward_std": 0.5587638676166534, "rewards/verify_chess_move/mean": -0.0671875, "rewards/verify_chess_move/std": 0.9915683150291443, "step": 1455 }, { "completion_length": 367.2, "completions/clipped_ratio": 0.0, "completions/max_length": 367.2, "completions/max_terminated_length": 367.2, "completions/mean_length": 131.5109375, "completions/mean_terminated_length": 131.5109375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0013212980576918552, "frac_reward_zero_std": 0.36875, "grad_norm": 0.13200269639492035, "kl": 0.0006836685031885281, "learning_rate": 1.0421428571428572e-07, "loss": 0.0, "num_tokens": 107746822.0, "reward": -0.1546875, "reward_std": 0.5562427163124084, "rewards/verify_chess_move/mean": -0.1546875, "rewards/verify_chess_move/std": 0.9867491364479065, "step": 1460 }, { "completion_length": 398.8, "completions/clipped_ratio": 0.0, "completions/max_length": 398.8, "completions/max_terminated_length": 398.8, "completions/mean_length": 133.91015625, "completions/mean_terminated_length": 133.91015625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.001325823051040115, "frac_reward_zero_std": 0.3375, "grad_norm": 0.16785116493701935, "kl": 0.0006690549214908969, "learning_rate": 1.0457142857142856e-07, "loss": 0.0, "num_tokens": 108116923.0, "reward": -0.1234375, "reward_std": 0.5901251077651978, "rewards/verify_chess_move/mean": -0.1234375, "rewards/verify_chess_move/std": 0.9920803546905518, "step": 1465 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 129.0375, "completions/mean_terminated_length": 129.0375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0013303480443883748, "frac_reward_zero_std": 0.4125, "grad_norm": 0.16394656896591187, "kl": 0.0007440317740474711, "learning_rate": 1.0492857142857143e-07, "loss": 0.0, "num_tokens": 108481371.0, "reward": -0.1703125, "reward_std": 0.5197954058647156, "rewards/verify_chess_move/mean": -0.1703125, "rewards/verify_chess_move/std": 0.9844035267829895, "step": 1470 }, { "completion_length": 436.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.2, "completions/max_terminated_length": 350.6, "completions/mean_length": 130.44765625, "completions/mean_terminated_length": 129.96199645996094, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0013348730377366344, "frac_reward_zero_std": 0.36875, "grad_norm": 0.1654195934534073, "kl": 0.0006281973104705685, "learning_rate": 1.0528571428571429e-07, "loss": 0.0, "num_tokens": 108844128.0, "reward": -0.0046875, "reward_std": 0.5693930208683013, "rewards/verify_chess_move/mean": -0.0046875, "rewards/verify_chess_move/std": 0.9993272066116333, "step": 1475 }, { "completion_length": 367.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 129.7078125, "completions/mean_terminated_length": 129.7078125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0013393980310848943, "frac_reward_zero_std": 0.30625, "grad_norm": 0.15832817554473877, "kl": 0.0007455258588379366, "learning_rate": 1.0564285714285715e-07, "loss": 0.0, "num_tokens": 109205170.0, "reward": -0.115625, "reward_std": 0.6143186330795288, "rewards/verify_chess_move/mean": -0.115625, "rewards/verify_chess_move/std": 0.9866799235343933, "step": 1480 }, { "completion_length": 369.6, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/max_terminated_length": 369.6, "completions/mean_length": 132.128125, "completions/mean_terminated_length": 132.128125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0013439230244331542, "frac_reward_zero_std": 0.35, "grad_norm": 0.1786845624446869, "kl": 0.0007400696184049593, "learning_rate": 1.06e-07, "loss": 0.0, "num_tokens": 109571062.0, "reward": -0.078125, "reward_std": 0.5930657029151917, "rewards/verify_chess_move/mean": -0.078125, "rewards/verify_chess_move/std": 0.9933083772659301, "step": 1485 }, { "completion_length": 338.6, "completions/clipped_ratio": 0.0, "completions/max_length": 338.6, "completions/max_terminated_length": 338.6, "completions/mean_length": 123.91484375, "completions/mean_terminated_length": 123.91484375, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0013484480177814138, "frac_reward_zero_std": 0.3625, "grad_norm": 0.15832509100437164, "kl": 0.0007734519909718074, "learning_rate": 1.0635714285714286e-07, "loss": 0.0, "num_tokens": 109925497.0, "reward": -0.096875, "reward_std": 0.5751803576946258, "rewards/verify_chess_move/mean": -0.096875, "rewards/verify_chess_move/std": 0.9899497985839844, "step": 1490 }, { "completion_length": 345.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 138.96171875, "completions/mean_terminated_length": 138.96171875, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.0013529730111296737, "frac_reward_zero_std": 0.34375, "grad_norm": 0.18075479567050934, "kl": 0.0008765389437030535, "learning_rate": 1.0671428571428572e-07, "loss": 0.0, "num_tokens": 110304584.0, "reward": -0.1734375, "reward_std": 0.5860108315944672, "rewards/verify_chess_move/mean": -0.1734375, "rewards/verify_chess_move/std": 0.9810157537460327, "step": 1495 }, { "completion_length": 348.6, "completions/clipped_ratio": 0.0, "completions/max_length": 348.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 130.46640625, "completions/mean_terminated_length": 130.46640625, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.0013574980044779334, "frac_reward_zero_std": 0.38125, "grad_norm": 0.12134838104248047, "kl": 0.000734243297665671, "learning_rate": 1.0707142857142857e-07, "loss": 0.0, "num_tokens": 110668053.0, "reward": -0.1296875, "reward_std": 0.5521897077560425, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.981460440158844, "step": 1500 }, { "completion_length": 437.8, "completions/clipped_ratio": 0.0, "completions/max_length": 437.8, "completions/max_terminated_length": 437.8, "completions/mean_length": 142.29375, "completions/mean_terminated_length": 142.29375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0013620229978261933, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1725902408361435, "kl": 0.0006682152939902153, "learning_rate": 1.0742857142857143e-07, "loss": 0.0, "num_tokens": 111050077.0, "reward": -0.0640625, "reward_std": 0.6270970463752746, "rewards/verify_chess_move/mean": -0.0640625, "rewards/verify_chess_move/std": 0.9895126223564148, "step": 1505 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 130.81875, "completions/mean_terminated_length": 130.81875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.001366547991174453, "frac_reward_zero_std": 0.3375, "grad_norm": 0.16827508807182312, "kl": 0.0007925719724880764, "learning_rate": 1.0778571428571429e-07, "loss": 0.0, "num_tokens": 111412029.0, "reward": 0.0046875, "reward_std": 0.5913283348083496, "rewards/verify_chess_move/mean": 0.0046875, "rewards/verify_chess_move/std": 0.9988507032394409, "step": 1510 }, { "completion_length": 384.8, "completions/clipped_ratio": 0.0, "completions/max_length": 384.8, "completions/max_terminated_length": 384.8, "completions/mean_length": 134.64765625, "completions/mean_terminated_length": 134.64765625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0013710729845227128, "frac_reward_zero_std": 0.35, "grad_norm": 0.1806281954050064, "kl": 0.0007385652254015441, "learning_rate": 1.0814285714285714e-07, "loss": 0.0, "num_tokens": 111780698.0, "reward": -0.1921875, "reward_std": 0.5851730346679688, "rewards/verify_chess_move/mean": -0.1921875, "rewards/verify_chess_move/std": 0.9682012677192688, "step": 1515 }, { "completion_length": 419.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 419.4, "completions/max_terminated_length": 337.0, "completions/mean_length": 124.4390625, "completions/mean_terminated_length": 123.939794921875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0013755979778709725, "frac_reward_zero_std": 0.425, "grad_norm": 0.1479262113571167, "kl": 0.0007355279344665178, "learning_rate": 1.085e-07, "loss": 0.0, "num_tokens": 112135900.0, "reward": -0.103125, "reward_std": 0.5124184548854828, "rewards/verify_chess_move/mean": -0.103125, "rewards/verify_chess_move/std": 0.9840619683265686, "step": 1520 }, { "completion_length": 395.6, "completions/clipped_ratio": 0.0, "completions/max_length": 395.6, "completions/max_terminated_length": 395.6, "completions/mean_length": 135.22890625, "completions/mean_terminated_length": 135.22890625, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0013801229712192323, "frac_reward_zero_std": 0.375, "grad_norm": 0.09358714520931244, "kl": 0.0007041824496809568, "learning_rate": 1.0885714285714286e-07, "loss": 0.0, "num_tokens": 112508449.0, "reward": -0.115625, "reward_std": 0.5686007082462311, "rewards/verify_chess_move/mean": -0.115625, "rewards/verify_chess_move/std": 0.985379958152771, "step": 1525 }, { "completion_length": 385.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 139.68828125, "completions/mean_terminated_length": 139.68828125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.001384647964567492, "frac_reward_zero_std": 0.36875, "grad_norm": 0.14409980177879333, "kl": 0.0007145386713091284, "learning_rate": 1.0921428571428571e-07, "loss": 0.0, "num_tokens": 112887194.0, "reward": -0.1609375, "reward_std": 0.5734948635101318, "rewards/verify_chess_move/mean": -0.1609375, "rewards/verify_chess_move/std": 0.985887885093689, "step": 1530 }, { "completion_length": 342.8, "completions/clipped_ratio": 0.0, "completions/max_length": 342.8, "completions/max_terminated_length": 342.8, "completions/mean_length": 127.15, "completions/mean_terminated_length": 127.15, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0013891729579157519, "frac_reward_zero_std": 0.3375, "grad_norm": 0.16272620856761932, "kl": 0.0007971789236762561, "learning_rate": 1.0957142857142857e-07, "loss": 0.0, "num_tokens": 113247666.0, "reward": -0.165625, "reward_std": 0.6020014524459839, "rewards/verify_chess_move/mean": -0.165625, "rewards/verify_chess_move/std": 0.9783551096916199, "step": 1535 }, { "completion_length": 376.4, "completions/clipped_ratio": 0.0, "completions/max_length": 376.4, "completions/max_terminated_length": 376.4, "completions/mean_length": 135.49921875, "completions/mean_terminated_length": 135.49921875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0013936979512640117, "frac_reward_zero_std": 0.4125, "grad_norm": 0.15357854962348938, "kl": 0.00078734570852248, "learning_rate": 1.0992857142857143e-07, "loss": 0.0, "num_tokens": 113623569.0, "reward": -0.08125, "reward_std": 0.5450293004512787, "rewards/verify_chess_move/mean": -0.08125, "rewards/verify_chess_move/std": 0.996129322052002, "step": 1540 }, { "completion_length": 381.6, "completions/clipped_ratio": 0.0, "completions/max_length": 381.6, "completions/max_terminated_length": 381.6, "completions/mean_length": 142.2796875, "completions/mean_terminated_length": 142.2796875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0013982229446122714, "frac_reward_zero_std": 0.26875, "grad_norm": 0.1482146680355072, "kl": 0.0007394779604510404, "learning_rate": 1.1028571428571429e-07, "loss": 0.0, "num_tokens": 114005575.0, "reward": -0.11875, "reward_std": 0.6457817196846009, "rewards/verify_chess_move/mean": -0.11875, "rewards/verify_chess_move/std": 0.9926041603088379, "step": 1545 }, { "completion_length": 356.2, "completions/clipped_ratio": 0.0, "completions/max_length": 356.2, "completions/max_terminated_length": 356.2, "completions/mean_length": 140.8875, "completions/mean_terminated_length": 140.8875, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0014027479379605313, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1739072948694229, "kl": 0.0008637275741421035, "learning_rate": 1.1064285714285714e-07, "loss": 0.0, "num_tokens": 114384807.0, "reward": -0.1296875, "reward_std": 0.5742357909679413, "rewards/verify_chess_move/mean": -0.1296875, "rewards/verify_chess_move/std": 0.9893501400947571, "step": 1550 }, { "completion_length": 372.8, "completions/clipped_ratio": 0.0, "completions/max_length": 372.8, "completions/max_terminated_length": 372.8, "completions/mean_length": 137.1515625, "completions/mean_terminated_length": 137.1515625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.001407272931308791, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1190822646021843, "kl": 0.000782863233689568, "learning_rate": 1.11e-07, "loss": 0.0, "num_tokens": 114756849.0, "reward": -0.121875, "reward_std": 0.5620300531387329, "rewards/verify_chess_move/mean": -0.121875, "rewards/verify_chess_move/std": 0.9826922297477723, "step": 1555 }, { "completion_length": 367.6, "completions/clipped_ratio": 0.0, "completions/max_length": 367.6, "completions/max_terminated_length": 367.6, "completions/mean_length": 137.56171875, "completions/mean_terminated_length": 137.56171875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0014117979246570508, "frac_reward_zero_std": 0.28125, "grad_norm": 0.13339678943157196, "kl": 0.0008269624780950835, "learning_rate": 1.1135714285714286e-07, "loss": 0.0, "num_tokens": 115131280.0, "reward": -0.1140625, "reward_std": 0.6428340911865235, "rewards/verify_chess_move/mean": -0.1140625, "rewards/verify_chess_move/std": 0.9930252313613892, "step": 1560 }, { "completion_length": 331.4, "completions/clipped_ratio": 0.0, "completions/max_length": 331.4, "completions/max_terminated_length": 331.4, "completions/mean_length": 130.18515625, "completions/mean_terminated_length": 130.18515625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0014163229180053105, "frac_reward_zero_std": 0.31875, "grad_norm": 0.14474110305309296, "kl": 0.0009096044765101397, "learning_rate": 1.1171428571428571e-07, "loss": 0.0, "num_tokens": 115493597.0, "reward": -0.0421875, "reward_std": 0.6195707678794861, "rewards/verify_chess_move/mean": -0.0421875, "rewards/verify_chess_move/std": 0.9982029795646667, "step": 1565 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 457.4, "completions/max_terminated_length": 384.6, "completions/mean_length": 147.6296875, "completions/mean_terminated_length": 147.14019165039062, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0014208479113535703, "frac_reward_zero_std": 0.3, "grad_norm": 0.19227881729602814, "kl": 0.0007033408041024813, "learning_rate": 1.1207142857142857e-07, "loss": 0.0, "num_tokens": 115882795.0, "reward": -0.0640625, "reward_std": 0.6182652652263642, "rewards/verify_chess_move/mean": -0.0640625, "rewards/verify_chess_move/std": 0.9969972372055054, "step": 1570 }, { "completion_length": 328.8, "completions/clipped_ratio": 0.0, "completions/max_length": 328.8, "completions/max_terminated_length": 328.8, "completions/mean_length": 137.47265625, "completions/mean_terminated_length": 137.47265625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.00142537290470183, "frac_reward_zero_std": 0.38125, "grad_norm": 0.15481331944465637, "kl": 0.0009172900263365591, "learning_rate": 1.1242857142857143e-07, "loss": 0.0, "num_tokens": 116259352.0, "reward": -0.1578125, "reward_std": 0.5578687131404877, "rewards/verify_chess_move/mean": -0.1578125, "rewards/verify_chess_move/std": 0.9751262903213501, "step": 1575 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 122.034375, "completions/mean_terminated_length": 122.034375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0014298978980500899, "frac_reward_zero_std": 0.3375, "grad_norm": 0.1601172834634781, "kl": 0.0008761430481172283, "learning_rate": 1.1278571428571428e-07, "loss": 0.0, "num_tokens": 116609540.0, "reward": 0.0328125, "reward_std": 0.5829599499702454, "rewards/verify_chess_move/mean": 0.0328125, "rewards/verify_chess_move/std": 0.9950380086898803, "step": 1580 }, { "completion_length": 348.6, "completions/clipped_ratio": 0.0, "completions/max_length": 348.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 135.08046875, "completions/mean_terminated_length": 135.08046875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0014344228913983497, "frac_reward_zero_std": 0.35, "grad_norm": 0.15959174931049347, "kl": 0.0008328961637744214, "learning_rate": 1.1314285714285714e-07, "loss": 0.0, "num_tokens": 116981147.0, "reward": -0.14375, "reward_std": 0.5995216608047486, "rewards/verify_chess_move/mean": -0.14375, "rewards/verify_chess_move/std": 0.9881539821624756, "step": 1585 }, { "completion_length": 377.4, "completions/clipped_ratio": 0.0, "completions/max_length": 377.4, "completions/max_terminated_length": 377.4, "completions/mean_length": 135.725, "completions/mean_terminated_length": 135.725, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0014389478847466094, "frac_reward_zero_std": 0.31875, "grad_norm": 0.16230060160160065, "kl": 0.0008705789756277227, "learning_rate": 1.135e-07, "loss": 0.0, "num_tokens": 117355443.0, "reward": -0.190625, "reward_std": 0.6126875400543212, "rewards/verify_chess_move/mean": -0.190625, "rewards/verify_chess_move/std": 0.979674506187439, "step": 1590 }, { "completion_length": 373.6, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 138.134375, "completions/mean_terminated_length": 138.134375, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0014434728780948693, "frac_reward_zero_std": 0.39375, "grad_norm": 0.1466253399848938, "kl": 0.0008438326662144391, "learning_rate": 1.1385714285714285e-07, "loss": 0.0, "num_tokens": 117732519.0, "reward": -0.1, "reward_std": 0.5565001845359803, "rewards/verify_chess_move/mean": -0.1, "rewards/verify_chess_move/std": 0.9955476999282837, "step": 1595 }, { "completion_length": 362.2, "completions/clipped_ratio": 0.0, "completions/max_length": 362.2, "completions/max_terminated_length": 362.2, "completions/mean_length": 131.55390625, "completions/mean_terminated_length": 131.55390625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.001447997871443129, "frac_reward_zero_std": 0.3875, "grad_norm": 0.16216495633125305, "kl": 0.0009154481076620868, "learning_rate": 1.1421428571428571e-07, "loss": 0.0, "num_tokens": 118098756.0, "reward": -0.009375, "reward_std": 0.5297896146774292, "rewards/verify_chess_move/mean": -0.009375, "rewards/verify_chess_move/std": 0.981619906425476, "step": 1600 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 135.49921875, "completions/mean_terminated_length": 135.49921875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0014525228647913888, "frac_reward_zero_std": 0.4125, "grad_norm": 0.1442968100309372, "kl": 0.0009169134971671155, "learning_rate": 1.1457142857142857e-07, "loss": 0.0, "num_tokens": 118471763.0, "reward": -0.103125, "reward_std": 0.5279942274093627, "rewards/verify_chess_move/mean": -0.103125, "rewards/verify_chess_move/std": 0.9909327507019043, "step": 1605 }, { "completion_length": 336.2, "completions/clipped_ratio": 0.0, "completions/max_length": 336.2, "completions/max_terminated_length": 336.2, "completions/mean_length": 123.965625, "completions/mean_terminated_length": 123.965625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0014570478581396485, "frac_reward_zero_std": 0.325, "grad_norm": 0.1516593098640442, "kl": 0.0009780530534044373, "learning_rate": 1.1492857142857144e-07, "loss": 0.0, "num_tokens": 118825903.0, "reward": -0.109375, "reward_std": 0.6269298076629639, "rewards/verify_chess_move/mean": -0.109375, "rewards/verify_chess_move/std": 0.9856643319129944, "step": 1610 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 131.9828125, "completions/mean_terminated_length": 131.9828125, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0014615728514879084, "frac_reward_zero_std": 0.39375, "grad_norm": 0.12743079662322998, "kl": 0.0008432573476966354, "learning_rate": 1.1528571428571428e-07, "loss": 0.0, "num_tokens": 119193393.0, "reward": -0.1453125, "reward_std": 0.5416263937950134, "rewards/verify_chess_move/mean": -0.1453125, "rewards/verify_chess_move/std": 0.9881008863449097, "step": 1615 }, { "completion_length": 351.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 138.79609375, "completions/mean_terminated_length": 138.79609375, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.001466097844836168, "frac_reward_zero_std": 0.475, "grad_norm": 0.14057107269763947, "kl": 0.0008608948135588434, "learning_rate": 1.1564285714285715e-07, "loss": 0.0, "num_tokens": 119572132.0, "reward": -0.1609375, "reward_std": 0.48221994638442994, "rewards/verify_chess_move/mean": -0.1609375, "rewards/verify_chess_move/std": 0.9799022316932678, "step": 1620 }, { "completion_length": 414.8, "completions/clipped_ratio": 0.0, "completions/max_length": 414.8, "completions/max_terminated_length": 414.8, "completions/mean_length": 135.38046875, "completions/mean_terminated_length": 135.38046875, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0014706228381844279, "frac_reward_zero_std": 0.425, "grad_norm": 0.12702295184135437, "kl": 0.0007641415752004832, "learning_rate": 1.16e-07, "loss": 0.0, "num_tokens": 119945587.0, "reward": -0.1046875, "reward_std": 0.5200515270233155, "rewards/verify_chess_move/mean": -0.1046875, "rewards/verify_chess_move/std": 0.990627110004425, "step": 1625 }, { "completion_length": 393.8, "completions/clipped_ratio": 0.0, "completions/max_length": 393.8, "completions/max_terminated_length": 393.8, "completions/mean_length": 137.8671875, "completions/mean_terminated_length": 137.8671875, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.0014751478315326875, "frac_reward_zero_std": 0.39375, "grad_norm": 0.14567476511001587, "kl": 0.0008309457727591508, "learning_rate": 1.1635714285714285e-07, "loss": 0.0, "num_tokens": 120320529.0, "reward": -0.0484375, "reward_std": 0.5452341735363007, "rewards/verify_chess_move/mean": -0.0484375, "rewards/verify_chess_move/std": 0.9974678993225098, "step": 1630 }, { "completion_length": 343.6, "completions/clipped_ratio": 0.0, "completions/max_length": 343.6, "completions/max_terminated_length": 343.6, "completions/mean_length": 131.81796875, "completions/mean_terminated_length": 131.81796875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0014796728248809474, "frac_reward_zero_std": 0.3375, "grad_norm": 0.14238892495632172, "kl": 0.0009119808886680403, "learning_rate": 1.1671428571428572e-07, "loss": 0.0, "num_tokens": 120687032.0, "reward": 0.0234375, "reward_std": 0.5987492322921752, "rewards/verify_chess_move/mean": 0.0234375, "rewards/verify_chess_move/std": 0.9965833187103271, "step": 1635 }, { "completion_length": 369.8, "completions/clipped_ratio": 0.0, "completions/max_length": 369.8, "completions/max_terminated_length": 369.8, "completions/mean_length": 133.840625, "completions/mean_terminated_length": 133.840625, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.0014841978182292073, "frac_reward_zero_std": 0.35625, "grad_norm": 0.14068062603473663, "kl": 0.0008622196450232877, "learning_rate": 1.1707142857142858e-07, "loss": 0.0, "num_tokens": 121056620.0, "reward": -0.1046875, "reward_std": 0.5907446861267089, "rewards/verify_chess_move/mean": -0.1046875, "rewards/verify_chess_move/std": 0.9900789141654969, "step": 1640 }, { "completion_length": 370.8, "completions/clipped_ratio": 0.0, "completions/max_length": 370.8, "completions/max_terminated_length": 370.8, "completions/mean_length": 133.72265625, "completions/mean_terminated_length": 133.72265625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.001488722811577467, "frac_reward_zero_std": 0.38125, "grad_norm": 0.18116962909698486, "kl": 0.0009016309217258822, "learning_rate": 1.1742857142857143e-07, "loss": 0.0, "num_tokens": 121425481.0, "reward": -0.090625, "reward_std": 0.5535082161426544, "rewards/verify_chess_move/mean": -0.090625, "rewards/verify_chess_move/std": 0.9938021540641785, "step": 1645 }, { "completion_length": 352.2, "completions/clipped_ratio": 0.0, "completions/max_length": 352.2, "completions/max_terminated_length": 352.2, "completions/mean_length": 126.31484375, "completions/mean_terminated_length": 126.31484375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0014932478049257268, "frac_reward_zero_std": 0.34375, "grad_norm": 0.16118226945400238, "kl": 0.0009873548640825902, "learning_rate": 1.1778571428571429e-07, "loss": 0.0, "num_tokens": 121783244.0, "reward": -0.0234375, "reward_std": 0.5841275453567505, "rewards/verify_chess_move/mean": -0.0234375, "rewards/verify_chess_move/std": 0.9892277121543884, "step": 1650 }, { "completion_length": 338.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 127.3921875, "completions/mean_terminated_length": 127.3921875, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.0014977727982739865, "frac_reward_zero_std": 0.3375, "grad_norm": 0.16867804527282715, "kl": 0.0010523658585952945, "learning_rate": 1.1814285714285715e-07, "loss": 0.0, "num_tokens": 122142034.0, "reward": -0.0328125, "reward_std": 0.589487612247467, "rewards/verify_chess_move/mean": -0.0328125, "rewards/verify_chess_move/std": 0.9941859483718872, "step": 1655 }, { "completion_length": 375.6, "completions/clipped_ratio": 0.0, "completions/max_length": 375.6, "completions/max_terminated_length": 375.6, "completions/mean_length": 134.11484375, "completions/mean_terminated_length": 134.11484375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0015022977916222464, "frac_reward_zero_std": 0.35, "grad_norm": 0.1398046761751175, "kl": 0.0008614267810116872, "learning_rate": 1.1849999999999998e-07, "loss": 0.0, "num_tokens": 122510749.0, "reward": -0.0734375, "reward_std": 0.5794920563697815, "rewards/verify_chess_move/mean": -0.0734375, "rewards/verify_chess_move/std": 0.9954866647720337, "step": 1660 }, { "completion_length": 336.2, "completions/clipped_ratio": 0.0, "completions/max_length": 336.2, "completions/max_terminated_length": 336.2, "completions/mean_length": 136.46328125, "completions/mean_terminated_length": 136.46328125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.001506822784970506, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1646568924188614, "kl": 0.0010234430570562835, "learning_rate": 1.1885714285714284e-07, "loss": 0.0, "num_tokens": 122883886.0, "reward": -0.146875, "reward_std": 0.5700186967849732, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9869393348693848, "step": 1665 }, { "completion_length": 393.6, "completions/clipped_ratio": 0.0, "completions/max_length": 393.6, "completions/max_terminated_length": 393.6, "completions/mean_length": 133.78984375, "completions/mean_terminated_length": 133.78984375, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.001511347778318766, "frac_reward_zero_std": 0.35625, "grad_norm": 0.13542170822620392, "kl": 0.0008937468983276631, "learning_rate": 1.192142857142857e-07, "loss": 0.0, "num_tokens": 123252505.0, "reward": -0.0890625, "reward_std": 0.5950557351112366, "rewards/verify_chess_move/mean": -0.0890625, "rewards/verify_chess_move/std": 0.9959102630615234, "step": 1670 }, { "completion_length": 353.8, "completions/clipped_ratio": 0.0, "completions/max_length": 353.8, "completions/max_terminated_length": 353.8, "completions/mean_length": 140.25234375, "completions/mean_terminated_length": 140.25234375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0015158727716670256, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13786165416240692, "kl": 0.0009318273567259893, "learning_rate": 1.1957142857142857e-07, "loss": 0.0, "num_tokens": 123631100.0, "reward": -0.096875, "reward_std": 0.5045425832271576, "rewards/verify_chess_move/mean": -0.096875, "rewards/verify_chess_move/std": 0.9823588848114013, "step": 1675 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/max_terminated_length": 366.2, "completions/mean_length": 125.1671875, "completions/mean_terminated_length": 125.1671875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0015203977650152854, "frac_reward_zero_std": 0.33125, "grad_norm": 0.16006872057914734, "kl": 0.0010378550898167305, "learning_rate": 1.1992857142857143e-07, "loss": 0.0, "num_tokens": 123987842.0, "reward": -0.0578125, "reward_std": 0.5959559977054596, "rewards/verify_chess_move/mean": -0.0578125, "rewards/verify_chess_move/std": 0.9922518134117126, "step": 1680 }, { "completion_length": 345.2, "completions/clipped_ratio": 0.0, "completions/max_length": 345.2, "completions/max_terminated_length": 345.2, "completions/mean_length": 129.3953125, "completions/mean_terminated_length": 129.3953125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0015249227583635453, "frac_reward_zero_std": 0.3375, "grad_norm": 0.16059328615665436, "kl": 0.0011153091018059058, "learning_rate": 1.202857142857143e-07, "loss": 0.0, "num_tokens": 124350452.0, "reward": -0.0234375, "reward_std": 0.5856008887290954, "rewards/verify_chess_move/mean": -0.0234375, "rewards/verify_chess_move/std": 0.999508547782898, "step": 1685 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 143.73046875, "completions/mean_terminated_length": 143.73046875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.001529447751711805, "frac_reward_zero_std": 0.44375, "grad_norm": 0.15655188262462616, "kl": 0.0010233432863969937, "learning_rate": 1.2064285714285715e-07, "loss": 0.0, "num_tokens": 124737251.0, "reward": -0.2109375, "reward_std": 0.5106325387954712, "rewards/verify_chess_move/mean": -0.2109375, "rewards/verify_chess_move/std": 0.9747123837471008, "step": 1690 }, { "completion_length": 336.8, "completions/clipped_ratio": 0.0, "completions/max_length": 336.8, "completions/max_terminated_length": 336.8, "completions/mean_length": 132.55859375, "completions/mean_terminated_length": 132.55859375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0015339727450600648, "frac_reward_zero_std": 0.41875, "grad_norm": 0.15774673223495483, "kl": 0.0011099327261035797, "learning_rate": 1.2099999999999998e-07, "loss": 0.0, "num_tokens": 125105422.0, "reward": -0.0953125, "reward_std": 0.5297246456146241, "rewards/verify_chess_move/mean": -0.0953125, "rewards/verify_chess_move/std": 0.9953136205673218, "step": 1695 }, { "completion_length": 373.8, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 133.2109375, "completions/mean_terminated_length": 133.2109375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0015384977384083245, "frac_reward_zero_std": 0.425, "grad_norm": 0.16294576227664948, "kl": 0.0010439684614539147, "learning_rate": 1.2135714285714285e-07, "loss": 0.0, "num_tokens": 125475636.0, "reward": -0.146875, "reward_std": 0.510058468580246, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9879671096801758, "step": 1700 }, { "completion_length": 351.8, "completions/clipped_ratio": 0.0, "completions/max_length": 351.8, "completions/max_terminated_length": 351.8, "completions/mean_length": 142.4015625, "completions/mean_terminated_length": 142.4015625, "completions/min_length": 42.8, "completions/min_terminated_length": 42.8, "epoch": 0.0015430227317565844, "frac_reward_zero_std": 0.35, "grad_norm": 0.18489105999469757, "kl": 0.001013566885740147, "learning_rate": 1.217142857142857e-07, "loss": 0.0, "num_tokens": 125857990.0, "reward": -0.1109375, "reward_std": 0.6019989013671875, "rewards/verify_chess_move/mean": -0.1109375, "rewards/verify_chess_move/std": 0.9847619295120239, "step": 1705 }, { "completion_length": 328.4, "completions/clipped_ratio": 0.0, "completions/max_length": 328.4, "completions/max_terminated_length": 328.4, "completions/mean_length": 135.590625, "completions/mean_terminated_length": 135.590625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.001547547725104844, "frac_reward_zero_std": 0.39375, "grad_norm": 0.147421732544899, "kl": 0.0011734040035662474, "learning_rate": 1.2207142857142857e-07, "loss": 0.0, "num_tokens": 126233978.0, "reward": -0.146875, "reward_std": 0.5515012145042419, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9858897924423218, "step": 1710 }, { "completion_length": 487.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 487.0, "completions/max_terminated_length": 407.4, "completions/mean_length": 148.725, "completions/mean_terminated_length": 148.23528137207032, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.001552072718453104, "frac_reward_zero_std": 0.39375, "grad_norm": 0.11173372715711594, "kl": 0.0008130329702908057, "learning_rate": 1.2242857142857143e-07, "loss": 0.0, "num_tokens": 126627042.0, "reward": -0.234375, "reward_std": 0.5308370530605316, "rewards/verify_chess_move/mean": -0.234375, "rewards/verify_chess_move/std": 0.9731070160865783, "step": 1715 }, { "completion_length": 390.4, "completions/clipped_ratio": 0.0, "completions/max_length": 390.4, "completions/max_terminated_length": 390.4, "completions/mean_length": 134.6203125, "completions/mean_terminated_length": 134.6203125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0015565977118013636, "frac_reward_zero_std": 0.325, "grad_norm": 0.18473568558692932, "kl": 0.0009417135848707403, "learning_rate": 1.227857142857143e-07, "loss": 0.0, "num_tokens": 126997452.0, "reward": -0.125, "reward_std": 0.6191543459892273, "rewards/verify_chess_move/mean": -0.125, "rewards/verify_chess_move/std": 0.9888764381408691, "step": 1720 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 135.82421875, "completions/mean_terminated_length": 135.82421875, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0015611227051496234, "frac_reward_zero_std": 0.35625, "grad_norm": 0.15911369025707245, "kl": 0.0011201650975635857, "learning_rate": 1.2314285714285713e-07, "loss": 0.0, "num_tokens": 127369043.0, "reward": -0.1484375, "reward_std": 0.5734964489936829, "rewards/verify_chess_move/mean": -0.1484375, "rewards/verify_chess_move/std": 0.9814405083656311, "step": 1725 }, { "completion_length": 453.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 128.921875, "completions/mean_terminated_length": 128.921875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.001565647698497883, "frac_reward_zero_std": 0.34375, "grad_norm": 0.15901614725589752, "kl": 0.0008530205512215616, "learning_rate": 1.235e-07, "loss": 0.0, "num_tokens": 127733071.0, "reward": -0.078125, "reward_std": 0.5906987667083741, "rewards/verify_chess_move/mean": -0.078125, "rewards/verify_chess_move/std": 0.9894073009490967, "step": 1730 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 138.15859375, "completions/mean_terminated_length": 138.15859375, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.001570172691846143, "frac_reward_zero_std": 0.3875, "grad_norm": 0.14269843697547913, "kl": 0.00101411536870728, "learning_rate": 1.2385714285714285e-07, "loss": 0.0, "num_tokens": 128109218.0, "reward": -0.1453125, "reward_std": 0.5554033279418945, "rewards/verify_chess_move/mean": -0.1453125, "rewards/verify_chess_move/std": 0.9862972617149353, "step": 1735 }, { "completion_length": 379.8, "completions/clipped_ratio": 0.0, "completions/max_length": 379.8, "completions/max_terminated_length": 379.8, "completions/mean_length": 139.59765625, "completions/mean_terminated_length": 139.59765625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.0015746976851944029, "frac_reward_zero_std": 0.43125, "grad_norm": 0.12284737080335617, "kl": 0.0010809149258420802, "learning_rate": 1.242142857142857e-07, "loss": 0.0, "num_tokens": 128487751.0, "reward": -0.1484375, "reward_std": 0.5181518852710724, "rewards/verify_chess_move/mean": -0.1484375, "rewards/verify_chess_move/std": 0.9867670774459839, "step": 1740 }, { "completion_length": 439.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 439.6, "completions/max_terminated_length": 355.8, "completions/mean_length": 138.60859375, "completions/mean_terminated_length": 138.116552734375, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0015792226785426625, "frac_reward_zero_std": 0.29375, "grad_norm": 0.07490142434835434, "kl": 0.0011062448833399686, "learning_rate": 1.2457142857142857e-07, "loss": 0.0, "num_tokens": 128862202.0, "reward": -0.090625, "reward_std": 0.6335254073143005, "rewards/verify_chess_move/mean": -0.090625, "rewards/verify_chess_move/std": 0.9933177471160889, "step": 1745 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 141.41015625, "completions/mean_terminated_length": 141.41015625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0015837476718909224, "frac_reward_zero_std": 0.4, "grad_norm": 0.1386321634054184, "kl": 0.001159692829605774, "learning_rate": 1.2492857142857143e-07, "loss": 0.0, "num_tokens": 129244103.0, "reward": -0.2, "reward_std": 0.5488266110420227, "rewards/verify_chess_move/mean": -0.2, "rewards/verify_chess_move/std": 0.9797620177268982, "step": 1750 }, { "completion_length": 382.2, "completions/clipped_ratio": 0.0, "completions/max_length": 382.2, "completions/max_terminated_length": 382.2, "completions/mean_length": 131.80703125, "completions/mean_terminated_length": 131.80703125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.001588272665239182, "frac_reward_zero_std": 0.30625, "grad_norm": 0.15944474935531616, "kl": 0.001138340580291697, "learning_rate": 1.2528571428571427e-07, "loss": 0.0, "num_tokens": 129608104.0, "reward": -0.0859375, "reward_std": 0.6227286338806153, "rewards/verify_chess_move/mean": -0.0859375, "rewards/verify_chess_move/std": 0.9954705119132996, "step": 1755 }, { "completion_length": 386.4, "completions/clipped_ratio": 0.0, "completions/max_length": 386.4, "completions/max_terminated_length": 386.4, "completions/mean_length": 130.55859375, "completions/mean_terminated_length": 130.55859375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.001592797658587442, "frac_reward_zero_std": 0.34375, "grad_norm": 0.12873350083827972, "kl": 0.001198137544997735, "learning_rate": 1.2564285714285713e-07, "loss": 0.0, "num_tokens": 129973131.0, "reward": -0.146875, "reward_std": 0.5914757430553437, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9860428690910339, "step": 1760 }, { "completion_length": 443.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 443.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 135.5453125, "completions/mean_terminated_length": 135.0474822998047, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0015973226519357016, "frac_reward_zero_std": 0.2875, "grad_norm": 0.0724705383181572, "kl": 0.0012427671365003335, "learning_rate": 1.26e-07, "loss": 0.0, "num_tokens": 130345541.0, "reward": -0.028125, "reward_std": 0.6579744577407837, "rewards/verify_chess_move/mean": -0.028125, "rewards/verify_chess_move/std": 0.9926026940345765, "step": 1765 }, { "completion_length": 373.2, "completions/clipped_ratio": 0.0, "completions/max_length": 373.2, "completions/max_terminated_length": 373.2, "completions/mean_length": 143.44609375, "completions/mean_terminated_length": 143.44609375, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 0.0016018476452839615, "frac_reward_zero_std": 0.41875, "grad_norm": 0.12017706036567688, "kl": 0.001196693095334922, "learning_rate": 1.2635714285714285e-07, "loss": 0.0, "num_tokens": 130732136.0, "reward": -0.14375, "reward_std": 0.5251529395580292, "rewards/verify_chess_move/mean": -0.14375, "rewards/verify_chess_move/std": 0.9844402313232422, "step": 1770 }, { "completion_length": 381.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 143.48046875, "completions/mean_terminated_length": 143.48046875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0016063726386322211, "frac_reward_zero_std": 0.38125, "grad_norm": 0.1057048812508583, "kl": 0.0012217668338053045, "learning_rate": 1.267142857142857e-07, "loss": 0.0, "num_tokens": 131115991.0, "reward": -0.1671875, "reward_std": 0.5417268216609955, "rewards/verify_chess_move/mean": -0.1671875, "rewards/verify_chess_move/std": 0.9732113599777221, "step": 1775 }, { "completion_length": 346.8, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/max_terminated_length": 346.8, "completions/mean_length": 129.6890625, "completions/mean_terminated_length": 129.6890625, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.001610897631980481, "frac_reward_zero_std": 0.34375, "grad_norm": 0.16387926042079926, "kl": 0.001338371791280224, "learning_rate": 1.2707142857142857e-07, "loss": 0.0, "num_tokens": 131476905.0, "reward": -0.071875, "reward_std": 0.60167897939682, "rewards/verify_chess_move/mean": -0.071875, "rewards/verify_chess_move/std": 0.9929974675178528, "step": 1780 }, { "completion_length": 372.4, "completions/clipped_ratio": 0.0, "completions/max_length": 372.4, "completions/max_terminated_length": 372.4, "completions/mean_length": 142.3828125, "completions/mean_terminated_length": 142.3828125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0016154226253287409, "frac_reward_zero_std": 0.375, "grad_norm": 0.1543474942445755, "kl": 0.0011857742365464219, "learning_rate": 1.274285714285714e-07, "loss": 0.0, "num_tokens": 131860875.0, "reward": -0.1265625, "reward_std": 0.5766387701034545, "rewards/verify_chess_move/mean": -0.1265625, "rewards/verify_chess_move/std": 0.9810736060142518, "step": 1785 }, { "completion_length": 351.4, "completions/clipped_ratio": 0.0, "completions/max_length": 351.4, "completions/max_terminated_length": 351.4, "completions/mean_length": 134.03046875, "completions/mean_terminated_length": 134.03046875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0016199476186770005, "frac_reward_zero_std": 0.425, "grad_norm": 0.13754990696907043, "kl": 0.0013570610994065647, "learning_rate": 1.2778571428571427e-07, "loss": 0.0, "num_tokens": 132231522.0, "reward": -0.1484375, "reward_std": 0.5164689421653748, "rewards/verify_chess_move/mean": -0.1484375, "rewards/verify_chess_move/std": 0.9833133101463318, "step": 1790 }, { "completion_length": 356.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 141.928125, "completions/mean_terminated_length": 141.928125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0016244726120252604, "frac_reward_zero_std": 0.35625, "grad_norm": 0.1560053676366806, "kl": 0.001256775972797186, "learning_rate": 1.2814285714285713e-07, "loss": 0.0, "num_tokens": 132613846.0, "reward": -0.059375, "reward_std": 0.5766052603721619, "rewards/verify_chess_move/mean": -0.059375, "rewards/verify_chess_move/std": 0.9988855838775634, "step": 1795 }, { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 130.45078125, "completions/mean_terminated_length": 130.45078125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.00162899760537352, "frac_reward_zero_std": 0.40625, "grad_norm": 0.10075167566537857, "kl": 0.0014061290361496504, "learning_rate": 1.285e-07, "loss": 0.0, "num_tokens": 132977415.0, "reward": 0.0609375, "reward_std": 0.5304566562175751, "rewards/verify_chess_move/mean": 0.0609375, "rewards/verify_chess_move/std": 0.9894279479980469, "step": 1800 }, { "completion_length": 365.8, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 142.7140625, "completions/mean_terminated_length": 142.7140625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00163352259872178, "frac_reward_zero_std": 0.36875, "grad_norm": 0.16132640838623047, "kl": 0.001457459891025792, "learning_rate": 1.2885714285714285e-07, "loss": 0.0, "num_tokens": 133362217.0, "reward": -0.14375, "reward_std": 0.5643970012664795, "rewards/verify_chess_move/mean": -0.14375, "rewards/verify_chess_move/std": 0.9713255405426026, "step": 1805 }, { "completion_length": 348.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 137.09609375, "completions/mean_terminated_length": 137.09609375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0016380475920700396, "frac_reward_zero_std": 0.375, "grad_norm": 0.10216664522886276, "kl": 0.001518453665812558, "learning_rate": 1.2921428571428572e-07, "loss": 0.0, "num_tokens": 133737348.0, "reward": -0.2234375, "reward_std": 0.5588208079338074, "rewards/verify_chess_move/mean": -0.2234375, "rewards/verify_chess_move/std": 0.9691570639610291, "step": 1810 }, { "completion_length": 372.4, "completions/clipped_ratio": 0.0, "completions/max_length": 372.4, "completions/max_terminated_length": 372.4, "completions/mean_length": 141.26796875, "completions/mean_terminated_length": 141.26796875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0016425725854182995, "frac_reward_zero_std": 0.35625, "grad_norm": 0.16625341773033142, "kl": 0.002015843867775402, "learning_rate": 1.2957142857142855e-07, "loss": 0.0, "num_tokens": 134119043.0, "reward": -0.140625, "reward_std": 0.5836502432823181, "rewards/verify_chess_move/mean": -0.140625, "rewards/verify_chess_move/std": 0.9862741112709046, "step": 1815 }, { "completion_length": 356.8, "completions/clipped_ratio": 0.0, "completions/max_length": 356.8, "completions/max_terminated_length": 356.8, "completions/mean_length": 154.7421875, "completions/mean_terminated_length": 154.7421875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0016470975787665591, "frac_reward_zero_std": 0.2875, "grad_norm": 0.17970481514930725, "kl": 0.0020669876150350317, "learning_rate": 1.299285714285714e-07, "loss": 0.0, "num_tokens": 134521945.0, "reward": -0.1328125, "reward_std": 0.652813720703125, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9887855172157287, "step": 1820 }, { "completion_length": 448.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 448.2, "completions/max_terminated_length": 406.0, "completions/mean_length": 128.6875, "completions/mean_terminated_length": 128.1901062011719, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.001651622572114819, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16775307059288025, "kl": 0.0026568097915514953, "learning_rate": 1.3028571428571427e-07, "loss": 0.0, "num_tokens": 134881913.0, "reward": -0.0078125, "reward_std": 0.5935461401939393, "rewards/verify_chess_move/mean": -0.0078125, "rewards/verify_chess_move/std": 0.9950980067253112, "step": 1825 }, { "completion_length": 331.4, "completions/clipped_ratio": 0.0, "completions/max_length": 331.4, "completions/max_terminated_length": 331.4, "completions/mean_length": 132.1234375, "completions/mean_terminated_length": 132.1234375, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0016561475654630787, "frac_reward_zero_std": 0.3375, "grad_norm": 0.14951927959918976, "kl": 0.0027922932367800968, "learning_rate": 1.3064285714285713e-07, "loss": 0.0, "num_tokens": 135247231.0, "reward": -0.0140625, "reward_std": 0.5935410380363464, "rewards/verify_chess_move/mean": -0.0140625, "rewards/verify_chess_move/std": 0.9963250756263733, "step": 1830 }, { "completion_length": 340.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 131.1796875, "completions/mean_terminated_length": 131.1796875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0016606725588113385, "frac_reward_zero_std": 0.33125, "grad_norm": 0.13402821123600006, "kl": 0.002922878799290629, "learning_rate": 1.31e-07, "loss": 0.0, "num_tokens": 135613365.0, "reward": -0.0578125, "reward_std": 0.6061592340469361, "rewards/verify_chess_move/mean": -0.0578125, "rewards/verify_chess_move/std": 0.9788910627365113, "step": 1835 }, { "completion_length": 381.6, "completions/clipped_ratio": 0.0, "completions/max_length": 381.6, "completions/max_terminated_length": 381.6, "completions/mean_length": 131.17421875, "completions/mean_terminated_length": 131.17421875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0016651975521595984, "frac_reward_zero_std": 0.3625, "grad_norm": 0.15174351632595062, "kl": 0.0031729217149404575, "learning_rate": 1.3135714285714286e-07, "loss": 0.0, "num_tokens": 135977284.0, "reward": -0.1734375, "reward_std": 0.5711260080337525, "rewards/verify_chess_move/mean": -0.1734375, "rewards/verify_chess_move/std": 0.985247278213501, "step": 1840 }, { "completion_length": 384.2, "completions/clipped_ratio": 0.0, "completions/max_length": 384.2, "completions/max_terminated_length": 384.2, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.001669722545507858, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1314859688282013, "kl": 0.003101542201693519, "learning_rate": 1.317142857142857e-07, "loss": 0.0, "num_tokens": 136356508.0, "reward": -0.1515625, "reward_std": 0.562877756357193, "rewards/verify_chess_move/mean": -0.1515625, "rewards/verify_chess_move/std": 0.9829872846603394, "step": 1845 }, { "completion_length": 365.2, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 129.91875, "completions/mean_terminated_length": 129.91875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.001674247538856118, "frac_reward_zero_std": 0.35625, "grad_norm": 0.13334894180297852, "kl": 0.008154000094873482, "learning_rate": 1.3207142857142855e-07, "loss": 0.0, "num_tokens": 136720508.0, "reward": -0.053125, "reward_std": 0.5827570676803588, "rewards/verify_chess_move/mean": -0.053125, "rewards/verify_chess_move/std": 0.9964704632759094, "step": 1850 }, { "completion_length": 360.4, "completions/clipped_ratio": 0.0, "completions/max_length": 360.4, "completions/max_terminated_length": 360.4, "completions/mean_length": 137.3234375, "completions/mean_terminated_length": 137.3234375, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0016787725322043776, "frac_reward_zero_std": 0.3375, "grad_norm": 0.1807989627122879, "kl": 0.004819869511629804, "learning_rate": 1.3242857142857141e-07, "loss": 0.0, "num_tokens": 137095770.0, "reward": -0.1046875, "reward_std": 0.5920103549957275, "rewards/verify_chess_move/mean": -0.1046875, "rewards/verify_chess_move/std": 0.9848783254623413, "step": 1855 }, { "completion_length": 384.4, "completions/clipped_ratio": 0.0, "completions/max_length": 384.4, "completions/max_terminated_length": 384.4, "completions/mean_length": 137.43671875, "completions/mean_terminated_length": 137.43671875, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0016832975255526375, "frac_reward_zero_std": 0.36875, "grad_norm": 0.1653291881084442, "kl": 0.006084062709123828, "learning_rate": 1.3278571428571428e-07, "loss": 0.0, "num_tokens": 137469585.0, "reward": -0.0625, "reward_std": 0.586477530002594, "rewards/verify_chess_move/mean": -0.0625, "rewards/verify_chess_move/std": 0.986817467212677, "step": 1860 }, { "completion_length": 423.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 423.8, "completions/max_terminated_length": 343.2, "completions/mean_length": 136.171875, "completions/mean_terminated_length": 135.68587951660157, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0016878225189008971, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1904376745223999, "kl": 0.006030583879328333, "learning_rate": 1.3314285714285714e-07, "loss": 0.0, "num_tokens": 137842189.0, "reward": -0.125, "reward_std": 0.5816503167152405, "rewards/verify_chess_move/mean": -0.125, "rewards/verify_chess_move/std": 0.9887273430824279, "step": 1865 }, { "completion_length": 350.4, "completions/clipped_ratio": 0.0, "completions/max_length": 350.4, "completions/max_terminated_length": 350.4, "completions/mean_length": 135.825, "completions/mean_terminated_length": 135.825, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.001692347512249157, "frac_reward_zero_std": 0.325, "grad_norm": 0.19297200441360474, "kl": 0.0052128534302028125, "learning_rate": 1.335e-07, "loss": 0.0, "num_tokens": 138216093.0, "reward": -0.165625, "reward_std": 0.6241958856582641, "rewards/verify_chess_move/mean": -0.165625, "rewards/verify_chess_move/std": 0.9814623713493347, "step": 1870 }, { "completion_length": 357.8, "completions/clipped_ratio": 0.0, "completions/max_length": 357.8, "completions/max_terminated_length": 357.8, "completions/mean_length": 148.99140625, "completions/mean_terminated_length": 148.99140625, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.0016968725055974167, "frac_reward_zero_std": 0.34375, "grad_norm": 0.18946784734725952, "kl": 0.0051951219124021005, "learning_rate": 1.3385714285714286e-07, "loss": 0.0, "num_tokens": 138610042.0, "reward": -0.1953125, "reward_std": 0.5902289509773254, "rewards/verify_chess_move/mean": -0.1953125, "rewards/verify_chess_move/std": 0.9758653759956359, "step": 1875 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 144.071875, "completions/mean_terminated_length": 144.071875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0017013974989456766, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1316050887107849, "kl": 0.009102074824477313, "learning_rate": 1.342142857142857e-07, "loss": 0.0, "num_tokens": 138993358.0, "reward": -0.025, "reward_std": 0.575176453590393, "rewards/verify_chess_move/mean": -0.025, "rewards/verify_chess_move/std": 0.9945296287536621, "step": 1880 }, { "completion_length": 386.4, "completions/clipped_ratio": 0.0, "completions/max_length": 386.4, "completions/max_terminated_length": 386.4, "completions/mean_length": 137.575, "completions/mean_terminated_length": 137.575, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0017059224922939364, "frac_reward_zero_std": 0.275, "grad_norm": 0.21831904351711273, "kl": 0.014629513958425377, "learning_rate": 1.3457142857142856e-07, "loss": 0.0, "num_tokens": 139365686.0, "reward": -0.1234375, "reward_std": 0.6357782363891602, "rewards/verify_chess_move/mean": -0.1234375, "rewards/verify_chess_move/std": 0.9851063370704651, "step": 1885 }, { "completion_length": 343.8, "completions/clipped_ratio": 0.0, "completions/max_length": 343.8, "completions/max_terminated_length": 343.8, "completions/mean_length": 136.18359375, "completions/mean_terminated_length": 136.18359375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.001710447485642196, "frac_reward_zero_std": 0.35625, "grad_norm": 0.20880861580371857, "kl": 0.01764419991959585, "learning_rate": 1.3492857142857142e-07, "loss": 0.0, "num_tokens": 139738457.0, "reward": -0.175, "reward_std": 0.5898466050624848, "rewards/verify_chess_move/mean": -0.175, "rewards/verify_chess_move/std": 0.9716943383216858, "step": 1890 }, { "completion_length": 427.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 427.8, "completions/max_terminated_length": 342.8, "completions/mean_length": 135.2453125, "completions/mean_terminated_length": 134.75615844726562, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.001714972478990456, "frac_reward_zero_std": 0.325, "grad_norm": 0.3180710971355438, "kl": 0.04365742031895934, "learning_rate": 1.3528571428571428e-07, "loss": 0.0, "num_tokens": 140108971.0, "reward": -0.08125, "reward_std": 0.5996914505958557, "rewards/verify_chess_move/mean": -0.08125, "rewards/verify_chess_move/std": 0.9982188105583191, "step": 1895 }, { "completion_length": 355.4, "completions/clipped_ratio": 0.0, "completions/max_length": 355.4, "completions/max_terminated_length": 355.4, "completions/mean_length": 126.50546875, "completions/mean_terminated_length": 126.50546875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0017194974723387156, "frac_reward_zero_std": 0.3125, "grad_norm": 0.18655548989772797, "kl": 0.03579515859601088, "learning_rate": 1.3564285714285714e-07, "loss": 0.0, "num_tokens": 140466370.0, "reward": -0.11875, "reward_std": 0.6167320609092712, "rewards/verify_chess_move/mean": -0.11875, "rewards/verify_chess_move/std": 0.9928619027137756, "step": 1900 }, { "completion_length": 404.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 125.14296875, "completions/mean_terminated_length": 125.14296875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0017240224656869755, "frac_reward_zero_std": 0.35625, "grad_norm": 0.14543329179286957, "kl": 0.027980446206493072, "learning_rate": 1.36e-07, "loss": 0.0, "num_tokens": 140821289.0, "reward": 0.0203125, "reward_std": 0.5739188194274902, "rewards/verify_chess_move/mean": 0.0203125, "rewards/verify_chess_move/std": 0.9922362923622131, "step": 1905 }, { "completion_length": 425.8, "completions/clipped_ratio": 0.0, "completions/max_length": 425.8, "completions/max_terminated_length": 425.8, "completions/mean_length": 145.30078125, "completions/mean_terminated_length": 145.30078125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0017285474590352352, "frac_reward_zero_std": 0.275, "grad_norm": 0.17634274065494537, "kl": 0.014560357631853548, "learning_rate": 1.3635714285714284e-07, "loss": 0.0, "num_tokens": 141205690.0, "reward": -0.1125, "reward_std": 0.6627114534378051, "rewards/verify_chess_move/mean": -0.1125, "rewards/verify_chess_move/std": 0.9896098732948303, "step": 1910 }, { "completion_length": 498.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 498.2, "completions/max_terminated_length": 416.8, "completions/mean_length": 140.25234375, "completions/mean_terminated_length": 139.74407348632812, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.001733072452383495, "frac_reward_zero_std": 0.38125, "grad_norm": 0.1972043365240097, "kl": 0.021332638949752437, "learning_rate": 1.367142857142857e-07, "loss": 0.0, "num_tokens": 141586509.0, "reward": -0.046875, "reward_std": 0.564655065536499, "rewards/verify_chess_move/mean": -0.046875, "rewards/verify_chess_move/std": 0.995689845085144, "step": 1915 }, { "completion_length": 387.4, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/max_terminated_length": 387.4, "completions/mean_length": 141.23046875, "completions/mean_terminated_length": 141.23046875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0017375974457317547, "frac_reward_zero_std": 0.3875, "grad_norm": 0.08526518940925598, "kl": 0.0347375238823588, "learning_rate": 1.3707142857142856e-07, "loss": 0.0, "num_tokens": 141967172.0, "reward": -0.140625, "reward_std": 0.5540828585624695, "rewards/verify_chess_move/mean": -0.140625, "rewards/verify_chess_move/std": 0.9759856700897217, "step": 1920 }, { "completion_length": 388.6, "completions/clipped_ratio": 0.0, "completions/max_length": 388.6, "completions/max_terminated_length": 388.6, "completions/mean_length": 127.3234375, "completions/mean_terminated_length": 127.3234375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0017421224390800146, "frac_reward_zero_std": 0.375, "grad_norm": 0.13563749194145203, "kl": 0.05198759257546044, "learning_rate": 1.3742857142857142e-07, "loss": 0.0001, "num_tokens": 142326442.0, "reward": -0.0296875, "reward_std": 0.5629246592521667, "rewards/verify_chess_move/mean": -0.0296875, "rewards/verify_chess_move/std": 0.9837581872940063, "step": 1925 }, { "completion_length": 383.6, "completions/clipped_ratio": 0.0, "completions/max_length": 383.6, "completions/max_terminated_length": 383.6, "completions/mean_length": 132.58125, "completions/mean_terminated_length": 132.58125, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0017466474324282744, "frac_reward_zero_std": 0.31875, "grad_norm": 0.17186547815799713, "kl": 0.04796328176744282, "learning_rate": 1.3778571428571428e-07, "loss": 0.0, "num_tokens": 142693258.0, "reward": -0.0921875, "reward_std": 0.6192607402801513, "rewards/verify_chess_move/mean": -0.0921875, "rewards/verify_chess_move/std": 0.9926180720329285, "step": 1930 }, { "completion_length": 362.2, "completions/clipped_ratio": 0.0, "completions/max_length": 362.2, "completions/max_terminated_length": 362.2, "completions/mean_length": 138.2921875, "completions/mean_terminated_length": 138.2921875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.001751172425776534, "frac_reward_zero_std": 0.36875, "grad_norm": 0.3222127854824066, "kl": 0.03785144674075127, "learning_rate": 1.3814285714285714e-07, "loss": 0.0, "num_tokens": 143069472.0, "reward": -0.178125, "reward_std": 0.5729009509086609, "rewards/verify_chess_move/mean": -0.178125, "rewards/verify_chess_move/std": 0.9792868137359619, "step": 1935 }, { "completion_length": 431.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 141.15390625, "completions/mean_terminated_length": 141.15390625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.001755697419124794, "frac_reward_zero_std": 0.39375, "grad_norm": 0.18073780834674835, "kl": 0.056522931349172724, "learning_rate": 1.385e-07, "loss": 0.0001, "num_tokens": 143451493.0, "reward": -0.153125, "reward_std": 0.5501332819461823, "rewards/verify_chess_move/mean": -0.153125, "rewards/verify_chess_move/std": 0.9866564631462097, "step": 1940 }, { "completion_length": 370.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 135.19765625, "completions/mean_terminated_length": 135.19765625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0017602224124730536, "frac_reward_zero_std": 0.40625, "grad_norm": 0.227837473154068, "kl": 0.04958074933019816, "learning_rate": 1.3885714285714284e-07, "loss": 0.0, "num_tokens": 143823386.0, "reward": -0.1328125, "reward_std": 0.547022920846939, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9908116936683655, "step": 1945 }, { "completion_length": 328.4, "completions/clipped_ratio": 0.0, "completions/max_length": 328.4, "completions/max_terminated_length": 328.4, "completions/mean_length": 124.84453125, "completions/mean_terminated_length": 124.84453125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0017647474058213135, "frac_reward_zero_std": 0.35625, "grad_norm": 0.1868344396352768, "kl": 0.05303937436037813, "learning_rate": 1.392142857142857e-07, "loss": 0.0001, "num_tokens": 144179947.0, "reward": -0.0875, "reward_std": 0.5851111769676208, "rewards/verify_chess_move/mean": -0.0875, "rewards/verify_chess_move/std": 0.9960119605064393, "step": 1950 }, { "completion_length": 408.8, "completions/clipped_ratio": 0.0, "completions/max_length": 408.8, "completions/max_terminated_length": 408.8, "completions/mean_length": 140.69140625, "completions/mean_terminated_length": 140.69140625, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0017692723991695732, "frac_reward_zero_std": 0.35625, "grad_norm": 0.22212834656238556, "kl": 0.04874621890776325, "learning_rate": 1.3957142857142856e-07, "loss": 0.0, "num_tokens": 144558888.0, "reward": -0.0265625, "reward_std": 0.5835330367088318, "rewards/verify_chess_move/mean": -0.0265625, "rewards/verify_chess_move/std": 0.9928256154060364, "step": 1955 }, { "completion_length": 401.8, "completions/clipped_ratio": 0.0, "completions/max_length": 401.8, "completions/max_terminated_length": 401.8, "completions/mean_length": 136.6234375, "completions/mean_terminated_length": 136.6234375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.001773797392517833, "frac_reward_zero_std": 0.34375, "grad_norm": 0.14400586485862732, "kl": 0.02362661621773441, "learning_rate": 1.3992857142857142e-07, "loss": 0.0, "num_tokens": 144930606.0, "reward": 0.0296875, "reward_std": 0.5831800103187561, "rewards/verify_chess_move/mean": 0.0296875, "rewards/verify_chess_move/std": 0.9937467932701111, "step": 1960 }, { "completion_length": 349.8, "completions/clipped_ratio": 0.0, "completions/max_length": 349.8, "completions/max_terminated_length": 349.8, "completions/mean_length": 131.9703125, "completions/mean_terminated_length": 131.9703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0017783223858660927, "frac_reward_zero_std": 0.375, "grad_norm": 0.2628585398197174, "kl": 0.019777955814424784, "learning_rate": 1.4028571428571428e-07, "loss": 0.0, "num_tokens": 145295568.0, "reward": -0.0125, "reward_std": 0.5618657648563385, "rewards/verify_chess_move/mean": -0.0125, "rewards/verify_chess_move/std": 0.9964586496353149, "step": 1965 }, { "completion_length": 404.6, "completions/clipped_ratio": 0.0, "completions/max_length": 404.6, "completions/max_terminated_length": 404.6, "completions/mean_length": 136.8609375, "completions/mean_terminated_length": 136.8609375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0017828473792143526, "frac_reward_zero_std": 0.4, "grad_norm": 0.2056237757205963, "kl": 0.015995392886179616, "learning_rate": 1.4064285714285714e-07, "loss": 0.0, "num_tokens": 145669902.0, "reward": -0.146875, "reward_std": 0.5384102106094361, "rewards/verify_chess_move/mean": -0.146875, "rewards/verify_chess_move/std": 0.9895971894264222, "step": 1970 }, { "completion_length": 372.4, "completions/clipped_ratio": 0.0, "completions/max_length": 372.4, "completions/max_terminated_length": 372.4, "completions/mean_length": 134.2828125, "completions/mean_terminated_length": 134.2828125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0017873723725626122, "frac_reward_zero_std": 0.4125, "grad_norm": 0.15783990919589996, "kl": 0.012778067283943529, "learning_rate": 1.4099999999999998e-07, "loss": 0.0, "num_tokens": 146039344.0, "reward": 0.015625, "reward_std": 0.5172251999378205, "rewards/verify_chess_move/mean": 0.015625, "rewards/verify_chess_move/std": 0.9830448508262635, "step": 1975 }, { "completion_length": 344.2, "completions/clipped_ratio": 0.0, "completions/max_length": 344.2, "completions/max_terminated_length": 344.2, "completions/mean_length": 141.96640625, "completions/mean_terminated_length": 141.96640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0017918973659108721, "frac_reward_zero_std": 0.30625, "grad_norm": 0.17160938680171967, "kl": 0.009072994723828743, "learning_rate": 1.4135714285714284e-07, "loss": 0.0, "num_tokens": 146419597.0, "reward": -0.096875, "reward_std": 0.6531261801719666, "rewards/verify_chess_move/mean": -0.096875, "rewards/verify_chess_move/std": 0.9894146203994751, "step": 1980 }, { "completion_length": 373.8, "completions/clipped_ratio": 0.0, "completions/max_length": 373.8, "completions/max_terminated_length": 373.8, "completions/mean_length": 135.09375, "completions/mean_terminated_length": 135.09375, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.001796422359259132, "frac_reward_zero_std": 0.3625, "grad_norm": 0.20170912146568298, "kl": 0.01399392346284003, "learning_rate": 1.417142857142857e-07, "loss": 0.0, "num_tokens": 146792125.0, "reward": 0.003125, "reward_std": 0.5776991665363311, "rewards/verify_chess_move/mean": 0.003125, "rewards/verify_chess_move/std": 0.9922678112983704, "step": 1985 }, { "completion_length": 362.4, "completions/clipped_ratio": 0.0, "completions/max_length": 362.4, "completions/max_terminated_length": 362.4, "completions/mean_length": 146.17734375, "completions/mean_terminated_length": 146.17734375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0018009473526073916, "frac_reward_zero_std": 0.36875, "grad_norm": 0.12369384616613388, "kl": 0.007737601236294722, "learning_rate": 1.4207142857142856e-07, "loss": 0.0, "num_tokens": 147181448.0, "reward": -0.1390625, "reward_std": 0.5770735383033753, "rewards/verify_chess_move/mean": -0.1390625, "rewards/verify_chess_move/std": 0.9893765091896057, "step": 1990 }, { "completion_length": 352.2, "completions/clipped_ratio": 0.0, "completions/max_length": 352.2, "completions/max_terminated_length": 352.2, "completions/mean_length": 135.6125, "completions/mean_terminated_length": 135.6125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0018054723459556515, "frac_reward_zero_std": 0.30625, "grad_norm": 0.18155674636363983, "kl": 0.015425322682858678, "learning_rate": 1.4242857142857142e-07, "loss": 0.0, "num_tokens": 147552208.0, "reward": -0.03125, "reward_std": 0.634982806444168, "rewards/verify_chess_move/mean": -0.03125, "rewards/verify_chess_move/std": 0.990547239780426, "step": 1995 }, { "completion_length": 397.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 141.175, "completions/mean_terminated_length": 141.175, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.0018099973393039112, "frac_reward_zero_std": 0.33125, "grad_norm": 0.16278119385242462, "kl": 0.02054486894630827, "learning_rate": 1.4278571428571429e-07, "loss": 0.0, "num_tokens": 147932888.0, "reward": -0.040625, "reward_std": 0.6135152816772461, "rewards/verify_chess_move/mean": -0.040625, "rewards/verify_chess_move/std": 0.9939890027046203, "step": 2000 }, { "completion_length": 381.4, "completions/clipped_ratio": 0.0, "completions/max_length": 381.4, "completions/max_terminated_length": 381.4, "completions/mean_length": 137.63203125, "completions/mean_terminated_length": 137.63203125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.001814522332652171, "frac_reward_zero_std": 0.28125, "grad_norm": 0.17684204876422882, "kl": 0.0417679502574174, "learning_rate": 1.4314285714285715e-07, "loss": 0.0, "num_tokens": 148305321.0, "reward": -0.11875, "reward_std": 0.6367801666259766, "rewards/verify_chess_move/mean": -0.11875, "rewards/verify_chess_move/std": 0.9939594626426697, "step": 2005 }, { "completion_length": 346.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 143.43671875, "completions/mean_terminated_length": 143.43671875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0018190473260004307, "frac_reward_zero_std": 0.41875, "grad_norm": 0.16170567274093628, "kl": 0.05202995368272241, "learning_rate": 1.4349999999999998e-07, "loss": 0.0001, "num_tokens": 148691472.0, "reward": -0.1328125, "reward_std": 0.5317745864391327, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.986409866809845, "step": 2010 }, { "completion_length": 348.8, "completions/clipped_ratio": 0.0, "completions/max_length": 348.8, "completions/max_terminated_length": 348.8, "completions/mean_length": 141.5953125, "completions/mean_terminated_length": 141.5953125, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0018235723193486906, "frac_reward_zero_std": 0.3625, "grad_norm": 0.15995047986507416, "kl": 0.10290712802161579, "learning_rate": 1.4385714285714284e-07, "loss": 0.0001, "num_tokens": 149073170.0, "reward": -0.121875, "reward_std": 0.568393075466156, "rewards/verify_chess_move/mean": -0.121875, "rewards/verify_chess_move/std": 0.9924513339996338, "step": 2015 }, { "completion_length": 345.6, "completions/clipped_ratio": 0.0, "completions/max_length": 345.6, "completions/max_terminated_length": 345.6, "completions/mean_length": 141.42578125, "completions/mean_terminated_length": 141.42578125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0018280973126969502, "frac_reward_zero_std": 0.4375, "grad_norm": 0.25258418917655945, "kl": 0.10946741627412848, "learning_rate": 1.442142857142857e-07, "loss": 0.0001, "num_tokens": 149456067.0, "reward": -0.1109375, "reward_std": 0.5168916940689087, "rewards/verify_chess_move/mean": -0.1109375, "rewards/verify_chess_move/std": 0.9940158605575562, "step": 2020 }, { "completion_length": 443.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 443.4, "completions/max_terminated_length": 360.0, "completions/mean_length": 129.89921875, "completions/mean_terminated_length": 129.41012878417968, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0018326223060452101, "frac_reward_zero_std": 0.31875, "grad_norm": 0.08839476108551025, "kl": 0.08173496501185581, "learning_rate": 1.4457142857142857e-07, "loss": 0.0001, "num_tokens": 149818810.0, "reward": -0.015625, "reward_std": 0.6025327384471894, "rewards/verify_chess_move/mean": -0.015625, "rewards/verify_chess_move/std": 0.9898167848587036, "step": 2025 }, { "completion_length": 386.8, "completions/clipped_ratio": 0.0, "completions/max_length": 386.8, "completions/max_terminated_length": 386.8, "completions/mean_length": 128.81875, "completions/mean_terminated_length": 128.81875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00183714729939347, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1933668553829193, "kl": 0.06430532291487907, "learning_rate": 1.4492857142857143e-07, "loss": 0.0001, "num_tokens": 150180466.0, "reward": -0.0234375, "reward_std": 0.5980063557624817, "rewards/verify_chess_move/mean": -0.0234375, "rewards/verify_chess_move/std": 0.9991344451904297, "step": 2030 }, { "completion_length": 359.8, "completions/clipped_ratio": 0.0, "completions/max_length": 359.8, "completions/max_terminated_length": 359.8, "completions/mean_length": 140.078125, "completions/mean_terminated_length": 140.078125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0018416722927417297, "frac_reward_zero_std": 0.36875, "grad_norm": 0.18051379919052124, "kl": 0.031552208253197024, "learning_rate": 1.452857142857143e-07, "loss": 0.0, "num_tokens": 150557646.0, "reward": 0.003125, "reward_std": 0.5831061542034149, "rewards/verify_chess_move/mean": 0.003125, "rewards/verify_chess_move/std": 0.9931652307510376, "step": 2035 }, { "completion_length": 380.6, "completions/clipped_ratio": 0.0, "completions/max_length": 380.6, "completions/max_terminated_length": 380.6, "completions/mean_length": 148.05625, "completions/mean_terminated_length": 148.05625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0018461972860899895, "frac_reward_zero_std": 0.2875, "grad_norm": 0.2933424413204193, "kl": 0.014322874368372141, "learning_rate": 1.4564285714285712e-07, "loss": 0.0, "num_tokens": 150947822.0, "reward": -0.10625, "reward_std": 0.6493968367576599, "rewards/verify_chess_move/mean": -0.10625, "rewards/verify_chess_move/std": 0.9943345069885254, "step": 2040 }, { "completion_length": 369.4, "completions/clipped_ratio": 0.0, "completions/max_length": 369.4, "completions/max_terminated_length": 369.4, "completions/mean_length": 137.6203125, "completions/mean_terminated_length": 137.6203125, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.0018507222794382492, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1628894805908203, "kl": 0.024346129849436692, "learning_rate": 1.4599999999999998e-07, "loss": 0.0, "num_tokens": 151321376.0, "reward": -0.028125, "reward_std": 0.5800731718540192, "rewards/verify_chess_move/mean": -0.028125, "rewards/verify_chess_move/std": 0.9932409286499023, "step": 2045 }, { "completion_length": 409.2, "completions/clipped_ratio": 0.0, "completions/max_length": 409.2, "completions/max_terminated_length": 409.2, "completions/mean_length": 133.6890625, "completions/mean_terminated_length": 133.6890625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.001855247272786509, "frac_reward_zero_std": 0.3625, "grad_norm": 0.25338876247406006, "kl": 0.04130960040092759, "learning_rate": 1.4635714285714285e-07, "loss": 0.0, "num_tokens": 151689794.0, "reward": -0.096875, "reward_std": 0.5621927857398987, "rewards/verify_chess_move/mean": -0.096875, "rewards/verify_chess_move/std": 0.9867778778076172, "step": 2050 }, { "completion_length": 423.4, "completions/clipped_ratio": 0.0, "completions/max_length": 423.4, "completions/max_terminated_length": 423.4, "completions/mean_length": 141.9078125, "completions/mean_terminated_length": 141.9078125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0018597722661347687, "frac_reward_zero_std": 0.4, "grad_norm": 0.1067776158452034, "kl": 0.04808168435083644, "learning_rate": 1.467142857142857e-07, "loss": 0.0, "num_tokens": 152073836.0, "reward": -0.2140625, "reward_std": 0.5381999969482422, "rewards/verify_chess_move/mean": -0.2140625, "rewards/verify_chess_move/std": 0.9616376280784606, "step": 2055 }, { "completion_length": 361.4, "completions/clipped_ratio": 0.0, "completions/max_length": 361.4, "completions/max_terminated_length": 361.4, "completions/mean_length": 131.80546875, "completions/mean_terminated_length": 131.80546875, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.0018642972594830286, "frac_reward_zero_std": 0.33125, "grad_norm": 0.12190230935811996, "kl": 0.06736433590340311, "learning_rate": 1.4707142857142857e-07, "loss": 0.0001, "num_tokens": 152440315.0, "reward": -0.0734375, "reward_std": 0.6029545068740845, "rewards/verify_chess_move/mean": -0.0734375, "rewards/verify_chess_move/std": 0.9873651385307312, "step": 2060 }, { "completion_length": 422.2, "completions/clipped_ratio": 0.0, "completions/max_length": 422.2, "completions/max_terminated_length": 422.2, "completions/mean_length": 151.83125, "completions/mean_terminated_length": 151.83125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0018688222528312883, "frac_reward_zero_std": 0.375, "grad_norm": 0.23673109710216522, "kl": 0.060572026936642945, "learning_rate": 1.4742857142857143e-07, "loss": 0.0001, "num_tokens": 152835779.0, "reward": -0.0953125, "reward_std": 0.5697584569454193, "rewards/verify_chess_move/mean": -0.0953125, "rewards/verify_chess_move/std": 0.9789682507514954, "step": 2065 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 140.7, "completions/mean_terminated_length": 140.7, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0018733472461795481, "frac_reward_zero_std": 0.35625, "grad_norm": 0.18807759881019592, "kl": 0.08395765470777405, "learning_rate": 1.477857142857143e-07, "loss": 0.0001, "num_tokens": 153216539.0, "reward": -0.1625, "reward_std": 0.5910033345222473, "rewards/verify_chess_move/mean": -0.1625, "rewards/verify_chess_move/std": 0.9861393570899963, "step": 2070 }, { "completion_length": 425.6, "completions/clipped_ratio": 0.0, "completions/max_length": 425.6, "completions/max_terminated_length": 425.6, "completions/mean_length": 132.9609375, "completions/mean_terminated_length": 132.9609375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0018778722395278078, "frac_reward_zero_std": 0.325, "grad_norm": 0.2098475843667984, "kl": 0.08107565205791616, "learning_rate": 1.4814285714285713e-07, "loss": 0.0001, "num_tokens": 153583289.0, "reward": -0.071875, "reward_std": 0.6192008256912231, "rewards/verify_chess_move/mean": -0.071875, "rewards/verify_chess_move/std": 0.9974643468856812, "step": 2075 }, { "completion_length": 393.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 141.0546875, "completions/mean_terminated_length": 141.0546875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0018823972328760677, "frac_reward_zero_std": 0.3625, "grad_norm": 0.2875458896160126, "kl": 0.09106630016030977, "learning_rate": 1.4849999999999999e-07, "loss": 0.0001, "num_tokens": 153961567.0, "reward": -0.1375, "reward_std": 0.5869103729724884, "rewards/verify_chess_move/mean": -0.1375, "rewards/verify_chess_move/std": 0.9805881261825562, "step": 2080 }, { "completion_length": 393.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 138.1515625, "completions/mean_terminated_length": 138.1515625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0018869222262243275, "frac_reward_zero_std": 0.34375, "grad_norm": 0.34171316027641296, "kl": 0.11425254283458344, "learning_rate": 1.4885714285714285e-07, "loss": 0.0001, "num_tokens": 154337505.0, "reward": -0.1203125, "reward_std": 0.6072640180587768, "rewards/verify_chess_move/mean": -0.1203125, "rewards/verify_chess_move/std": 0.9933632135391235, "step": 2085 }, { "completion_length": 337.4, "completions/clipped_ratio": 0.0, "completions/max_length": 337.4, "completions/max_terminated_length": 337.4, "completions/mean_length": 135.22421875, "completions/mean_terminated_length": 135.22421875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0018914472195725872, "frac_reward_zero_std": 0.375, "grad_norm": 0.23675666749477386, "kl": 0.09498451640829444, "learning_rate": 1.492142857142857e-07, "loss": 0.0001, "num_tokens": 154710496.0, "reward": -0.1125, "reward_std": 0.5563475310802459, "rewards/verify_chess_move/mean": -0.1125, "rewards/verify_chess_move/std": 0.9853845119476319, "step": 2090 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 130.65, "completions/mean_terminated_length": 130.65, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.001895972212920847, "frac_reward_zero_std": 0.325, "grad_norm": 0.19187970459461212, "kl": 0.08849013524959445, "learning_rate": 1.4957142857142857e-07, "loss": 0.0001, "num_tokens": 155076000.0, "reward": 0.0171875, "reward_std": 0.5921262025833129, "rewards/verify_chess_move/mean": 0.0171875, "rewards/verify_chess_move/std": 1.0001153230667115, "step": 2095 }, { "completion_length": 337.8, "completions/clipped_ratio": 0.0, "completions/max_length": 337.8, "completions/max_terminated_length": 337.8, "completions/mean_length": 134.4140625, "completions/mean_terminated_length": 134.4140625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0019004972062691067, "frac_reward_zero_std": 0.3, "grad_norm": 0.2240484356880188, "kl": 0.023581505751644728, "learning_rate": 1.4992857142857143e-07, "loss": 0.0, "num_tokens": 155446554.0, "reward": -0.1125, "reward_std": 0.6139531970024109, "rewards/verify_chess_move/mean": -0.1125, "rewards/verify_chess_move/std": 0.9855868339538574, "step": 2100 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 133.59296875, "completions/mean_terminated_length": 133.59296875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0019050221996173666, "frac_reward_zero_std": 0.30625, "grad_norm": 0.3087317943572998, "kl": 0.02566397776681697, "learning_rate": 1.5028571428571427e-07, "loss": 0.0, "num_tokens": 155814249.0, "reward": 0.0546875, "reward_std": 0.623623788356781, "rewards/verify_chess_move/mean": 0.0546875, "rewards/verify_chess_move/std": 0.9983588814735412, "step": 2105 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/max_terminated_length": 358.4, "completions/mean_length": 132.80625, "completions/mean_terminated_length": 132.80625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0019095471929656263, "frac_reward_zero_std": 0.35, "grad_norm": 0.16785621643066406, "kl": 0.032421163622348106, "learning_rate": 1.5064285714285713e-07, "loss": 0.0, "num_tokens": 156181081.0, "reward": -0.0734375, "reward_std": 0.5744524478912354, "rewards/verify_chess_move/mean": -0.0734375, "rewards/verify_chess_move/std": 0.9956524968147278, "step": 2110 }, { "completion_length": 374.6, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 144.86640625, "completions/mean_terminated_length": 144.86640625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0019140721863138861, "frac_reward_zero_std": 0.39375, "grad_norm": 0.1971108615398407, "kl": 0.04108193610154558, "learning_rate": 1.51e-07, "loss": 0.0, "num_tokens": 156567302.0, "reward": -0.0375, "reward_std": 0.5445066213607788, "rewards/verify_chess_move/mean": -0.0375, "rewards/verify_chess_move/std": 0.9992249488830567, "step": 2115 }, { "completion_length": 407.4, "completions/clipped_ratio": 0.0, "completions/max_length": 407.4, "completions/max_terminated_length": 407.4, "completions/mean_length": 132.846875, "completions/mean_terminated_length": 132.846875, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0019185971796621458, "frac_reward_zero_std": 0.4, "grad_norm": 0.09913089126348495, "kl": 0.05889361157387611, "learning_rate": 1.5135714285714285e-07, "loss": 0.0001, "num_tokens": 156934538.0, "reward": -0.0203125, "reward_std": 0.535890418291092, "rewards/verify_chess_move/mean": -0.0203125, "rewards/verify_chess_move/std": 0.9950348496437073, "step": 2120 }, { "completion_length": 371.4, "completions/clipped_ratio": 0.0, "completions/max_length": 371.4, "completions/max_terminated_length": 371.4, "completions/mean_length": 138.44140625, "completions/mean_terminated_length": 138.44140625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0019231221730104057, "frac_reward_zero_std": 0.275, "grad_norm": 0.28708094358444214, "kl": 0.05580642764361983, "learning_rate": 1.517142857142857e-07, "loss": 0.0001, "num_tokens": 157308639.0, "reward": -0.0171875, "reward_std": 0.6905184864997864, "rewards/verify_chess_move/mean": -0.0171875, "rewards/verify_chess_move/std": 1.0010225534439088, "step": 2125 }, { "completion_length": 340.6, "completions/clipped_ratio": 0.0, "completions/max_length": 340.6, "completions/max_terminated_length": 340.6, "completions/mean_length": 134.103125, "completions/mean_terminated_length": 134.103125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0019276471663586656, "frac_reward_zero_std": 0.33125, "grad_norm": 0.24658681452274323, "kl": 0.09547559730708599, "learning_rate": 1.5207142857142857e-07, "loss": 0.0001, "num_tokens": 157677155.0, "reward": -0.0375, "reward_std": 0.5906983613967896, "rewards/verify_chess_move/mean": -0.0375, "rewards/verify_chess_move/std": 0.9924020171165466, "step": 2130 }, { "completion_length": 383.6, "completions/clipped_ratio": 0.0, "completions/max_length": 383.6, "completions/max_terminated_length": 383.6, "completions/mean_length": 133.94609375, "completions/mean_terminated_length": 133.94609375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0019321721597069252, "frac_reward_zero_std": 0.3375, "grad_norm": 0.1772029548883438, "kl": 0.08565423865620687, "learning_rate": 1.5242857142857143e-07, "loss": 0.0001, "num_tokens": 158044422.0, "reward": -0.109375, "reward_std": 0.6171977877616882, "rewards/verify_chess_move/mean": -0.109375, "rewards/verify_chess_move/std": 0.9862590789794922, "step": 2135 }, { "completion_length": 379.4, "completions/clipped_ratio": 0.0, "completions/max_length": 379.4, "completions/max_terminated_length": 379.4, "completions/mean_length": 131.48984375, "completions/mean_terminated_length": 131.48984375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.001936697153055185, "frac_reward_zero_std": 0.3125, "grad_norm": 0.15985898673534393, "kl": 0.06011968955208431, "learning_rate": 1.5278571428571427e-07, "loss": 0.0001, "num_tokens": 158409433.0, "reward": 0.04375, "reward_std": 0.6177221298217773, "rewards/verify_chess_move/mean": 0.04375, "rewards/verify_chess_move/std": 0.9991767168045044, "step": 2140 }, { "completion_length": 408.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 132.0328125, "completions/mean_terminated_length": 132.0328125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0019412221464034447, "frac_reward_zero_std": 0.35, "grad_norm": 0.15134263038635254, "kl": 0.04270337851557997, "learning_rate": 1.5314285714285713e-07, "loss": 0.0, "num_tokens": 158777195.0, "reward": -0.003125, "reward_std": 0.5726592361927032, "rewards/verify_chess_move/mean": -0.003125, "rewards/verify_chess_move/std": 0.9954805850982666, "step": 2145 }, { "completion_length": 381.8, "completions/clipped_ratio": 0.0, "completions/max_length": 381.8, "completions/max_terminated_length": 381.8, "completions/mean_length": 140.0625, "completions/mean_terminated_length": 140.0625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0019457471397517046, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1408434957265854, "kl": 0.03500581514454097, "learning_rate": 1.535e-07, "loss": 0.0, "num_tokens": 159158235.0, "reward": -0.1625, "reward_std": 0.5098902523517609, "rewards/verify_chess_move/mean": -0.1625, "rewards/verify_chess_move/std": 0.981385326385498, "step": 2150 }, { "completion_length": 405.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 136.21640625, "completions/mean_terminated_length": 136.21640625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0019502721330999643, "frac_reward_zero_std": 0.325, "grad_norm": 0.18035291135311127, "kl": 0.036961043057817736, "learning_rate": 1.5385714285714285e-07, "loss": 0.0, "num_tokens": 159528888.0, "reward": 0.021875, "reward_std": 0.6037468194961548, "rewards/verify_chess_move/mean": 0.021875, "rewards/verify_chess_move/std": 0.9931867122650146, "step": 2155 }, { "completion_length": 402.2, "completions/clipped_ratio": 0.0, "completions/max_length": 402.2, "completions/max_terminated_length": 402.2, "completions/mean_length": 137.91875, "completions/mean_terminated_length": 137.91875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.001954797126448224, "frac_reward_zero_std": 0.3625, "grad_norm": 0.20617489516735077, "kl": 0.04025931785727153, "learning_rate": 1.5421428571428571e-07, "loss": 0.0, "num_tokens": 159905688.0, "reward": -0.090625, "reward_std": 0.5806462526321411, "rewards/verify_chess_move/mean": -0.090625, "rewards/verify_chess_move/std": 0.9938525319099426, "step": 2160 }, { "completion_length": 374.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 138.0671875, "completions/mean_terminated_length": 138.0671875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.001959322119796484, "frac_reward_zero_std": 0.30625, "grad_norm": 0.13705703616142273, "kl": 0.035539388091274306, "learning_rate": 1.5457142857142858e-07, "loss": 0.0, "num_tokens": 160278654.0, "reward": -0.009375, "reward_std": 0.6459184408187866, "rewards/verify_chess_move/mean": -0.009375, "rewards/verify_chess_move/std": 0.9989862561225891, "step": 2165 }, { "completion_length": 469.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 469.6, "completions/max_terminated_length": 390.0, "completions/mean_length": 140.0765625, "completions/mean_terminated_length": 139.57142333984376, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0019638471131447435, "frac_reward_zero_std": 0.3625, "grad_norm": 0.13172757625579834, "kl": 0.06246026990556856, "learning_rate": 1.549285714285714e-07, "loss": 0.0001, "num_tokens": 160655752.0, "reward": -0.1, "reward_std": 0.5753431200981141, "rewards/verify_chess_move/mean": -0.1, "rewards/verify_chess_move/std": 0.9943399906158448, "step": 2170 }, { "completion_length": 382.2, "completions/clipped_ratio": 0.0, "completions/max_length": 382.2, "completions/max_terminated_length": 382.2, "completions/mean_length": 151.69453125, "completions/mean_terminated_length": 151.69453125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0019683721064930036, "frac_reward_zero_std": 0.36875, "grad_norm": 0.16713842749595642, "kl": 0.06233443283344968, "learning_rate": 1.5528571428571427e-07, "loss": 0.0001, "num_tokens": 161051233.0, "reward": -0.134375, "reward_std": 0.564139324426651, "rewards/verify_chess_move/mean": -0.134375, "rewards/verify_chess_move/std": 0.9753315329551697, "step": 2175 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 152.71484375, "completions/mean_terminated_length": 152.71484375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0019728970998412632, "frac_reward_zero_std": 0.3625, "grad_norm": 0.23978395760059357, "kl": 0.0634373755146953, "learning_rate": 1.5564285714285713e-07, "loss": 0.0001, "num_tokens": 161445156.0, "reward": -0.0375, "reward_std": 0.5719717264175415, "rewards/verify_chess_move/mean": -0.0375, "rewards/verify_chess_move/std": 0.9961151719093323, "step": 2180 }, { "completion_length": 421.8, "completions/clipped_ratio": 0.0, "completions/max_length": 421.8, "completions/max_terminated_length": 421.8, "completions/mean_length": 139.85, "completions/mean_terminated_length": 139.85, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.001977422093189523, "frac_reward_zero_std": 0.31875, "grad_norm": 0.20420295000076294, "kl": 0.05294802484713727, "learning_rate": 1.56e-07, "loss": 0.0001, "num_tokens": 161821204.0, "reward": 0.00625, "reward_std": 0.603734016418457, "rewards/verify_chess_move/mean": 0.00625, "rewards/verify_chess_move/std": 0.9993088245391846, "step": 2185 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 148.71328125, "completions/mean_terminated_length": 148.71328125, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.001981947086537783, "frac_reward_zero_std": 0.3375, "grad_norm": 0.18987488746643066, "kl": 0.042692229871317974, "learning_rate": 1.5635714285714286e-07, "loss": 0.0, "num_tokens": 162212021.0, "reward": -0.1171875, "reward_std": 0.6090435266494751, "rewards/verify_chess_move/mean": -0.1171875, "rewards/verify_chess_move/std": 0.9914653420448303, "step": 2190 }, { "completion_length": 352.6, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/max_terminated_length": 352.6, "completions/mean_length": 147.234375, "completions/mean_terminated_length": 147.234375, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0019864720798860426, "frac_reward_zero_std": 0.3875, "grad_norm": 0.2287667840719223, "kl": 0.05317384878653684, "learning_rate": 1.5671428571428572e-07, "loss": 0.0001, "num_tokens": 162602953.0, "reward": -0.1453125, "reward_std": 0.5474651396274567, "rewards/verify_chess_move/mean": -0.1453125, "rewards/verify_chess_move/std": 0.9837769389152526, "step": 2195 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 142.4359375, "completions/mean_terminated_length": 142.4359375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0019909970732343023, "frac_reward_zero_std": 0.30625, "grad_norm": 0.17703230679035187, "kl": 0.06738999664812581, "learning_rate": 1.5707142857142858e-07, "loss": 0.0001, "num_tokens": 162984607.0, "reward": -0.0578125, "reward_std": 0.6222132802009582, "rewards/verify_chess_move/mean": -0.0578125, "rewards/verify_chess_move/std": 0.9896243929862976, "step": 2200 }, { "completion_length": 440.2, "completions/clipped_ratio": 0.0, "completions/max_length": 440.2, "completions/max_terminated_length": 440.2, "completions/mean_length": 134.9296875, "completions/mean_terminated_length": 134.9296875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.001995522066582562, "frac_reward_zero_std": 0.31875, "grad_norm": 0.26846206188201904, "kl": 0.06679066854703705, "learning_rate": 1.574285714285714e-07, "loss": 0.0001, "num_tokens": 163355085.0, "reward": -0.090625, "reward_std": 0.6147859215736389, "rewards/verify_chess_move/mean": -0.090625, "rewards/verify_chess_move/std": 0.9976799488067627, "step": 2205 }, { "completion_length": 387.2, "completions/clipped_ratio": 0.0, "completions/max_length": 387.2, "completions/max_terminated_length": 387.2, "completions/mean_length": 136.27578125, "completions/mean_terminated_length": 136.27578125, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.002000047059930822, "frac_reward_zero_std": 0.36875, "grad_norm": 0.19936731457710266, "kl": 0.07418139193905518, "learning_rate": 1.5778571428571427e-07, "loss": 0.0001, "num_tokens": 163728342.0, "reward": -0.06875, "reward_std": 0.5757530570030213, "rewards/verify_chess_move/mean": -0.06875, "rewards/verify_chess_move/std": 0.9889754533767701, "step": 2210 }, { "completion_length": 407.2, "completions/clipped_ratio": 0.0, "completions/max_length": 407.2, "completions/max_terminated_length": 407.2, "completions/mean_length": 148.571875, "completions/mean_terminated_length": 148.571875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0020045720532790817, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1427839994430542, "kl": 0.047899645686993605, "learning_rate": 1.5814285714285714e-07, "loss": 0.0, "num_tokens": 164121674.0, "reward": -0.0875, "reward_std": 0.5842268705368042, "rewards/verify_chess_move/mean": -0.0875, "rewards/verify_chess_move/std": 0.9945414066314697, "step": 2215 }, { "completion_length": 427.4, "completions/clipped_ratio": 0.0, "completions/max_length": 427.4, "completions/max_terminated_length": 427.4, "completions/mean_length": 147.31015625, "completions/mean_terminated_length": 147.31015625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0020090970466273414, "frac_reward_zero_std": 0.38125, "grad_norm": 0.1498345285654068, "kl": 0.030737308484822277, "learning_rate": 1.585e-07, "loss": 0.0, "num_tokens": 164510135.0, "reward": -0.0625, "reward_std": 0.5617544889450073, "rewards/verify_chess_move/mean": -0.0625, "rewards/verify_chess_move/std": 0.9941455483436584, "step": 2220 }, { "completion_length": 344.6, "completions/clipped_ratio": 0.0, "completions/max_length": 344.6, "completions/max_terminated_length": 344.6, "completions/mean_length": 135.21328125, "completions/mean_terminated_length": 135.21328125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.002013622039975601, "frac_reward_zero_std": 0.3375, "grad_norm": 0.22058655321598053, "kl": 0.055202579129399965, "learning_rate": 1.5885714285714286e-07, "loss": 0.0001, "num_tokens": 164882784.0, "reward": -0.0875, "reward_std": 0.5905949532985687, "rewards/verify_chess_move/mean": -0.0875, "rewards/verify_chess_move/std": 0.9914514899253846, "step": 2225 }, { "completion_length": 337.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 138.65, "completions/mean_terminated_length": 138.65, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.002018147033323861, "frac_reward_zero_std": 0.35625, "grad_norm": 0.2225487381219864, "kl": 0.03940031766469474, "learning_rate": 1.5921428571428572e-07, "loss": 0.0, "num_tokens": 165258800.0, "reward": -0.0640625, "reward_std": 0.5861255288124084, "rewards/verify_chess_move/mean": -0.0640625, "rewards/verify_chess_move/std": 0.9952776551246643, "step": 2230 }, { "completion_length": 466.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 466.6, "completions/max_terminated_length": 387.0, "completions/mean_length": 135.1875, "completions/mean_terminated_length": 134.7048583984375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0020226720266721208, "frac_reward_zero_std": 0.34375, "grad_norm": 0.15809251368045807, "kl": 0.04426259661267977, "learning_rate": 1.5957142857142855e-07, "loss": 0.0, "num_tokens": 165629608.0, "reward": -0.2140625, "reward_std": 0.5998411893844604, "rewards/verify_chess_move/mean": -0.2140625, "rewards/verify_chess_move/std": 0.9713527917861938, "step": 2235 }, { "completion_length": 357.8, "completions/clipped_ratio": 0.0, "completions/max_length": 357.8, "completions/max_terminated_length": 357.8, "completions/mean_length": 139.55859375, "completions/mean_terminated_length": 139.55859375, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0020271970200203804, "frac_reward_zero_std": 0.39375, "grad_norm": 0.30248209834098816, "kl": 0.07106042691739275, "learning_rate": 1.5992857142857142e-07, "loss": 0.0001, "num_tokens": 166010155.0, "reward": -0.1390625, "reward_std": 0.546720278263092, "rewards/verify_chess_move/mean": -0.1390625, "rewards/verify_chess_move/std": 0.9839111208915711, "step": 2240 }, { "completion_length": 429.6, "completions/clipped_ratio": 0.0, "completions/max_length": 429.6, "completions/max_terminated_length": 429.6, "completions/mean_length": 136.078125, "completions/mean_terminated_length": 136.078125, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0020317220133686405, "frac_reward_zero_std": 0.29375, "grad_norm": 0.16728034615516663, "kl": 0.07382683661853662, "learning_rate": 1.6028571428571428e-07, "loss": 0.0001, "num_tokens": 166379583.0, "reward": -0.0203125, "reward_std": 0.6558161616325379, "rewards/verify_chess_move/mean": -0.0203125, "rewards/verify_chess_move/std": 1.0009002447128297, "step": 2245 }, { "completion_length": 362.2, "completions/clipped_ratio": 0.0, "completions/max_length": 362.2, "completions/max_terminated_length": 362.2, "completions/mean_length": 128.17578125, "completions/mean_terminated_length": 128.17578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0020362470067169, "frac_reward_zero_std": 0.31875, "grad_norm": 0.12522172927856445, "kl": 0.10737579746346455, "learning_rate": 1.6064285714285714e-07, "loss": 0.0001, "num_tokens": 166740384.0, "reward": -0.05, "reward_std": 0.6176845312118531, "rewards/verify_chess_move/mean": -0.05, "rewards/verify_chess_move/std": 0.9893251180648803, "step": 2250 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.0, "completions/max_length": 367.8, "completions/max_terminated_length": 367.8, "completions/mean_length": 133.28046875, "completions/mean_terminated_length": 133.28046875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.00204077200006516, "frac_reward_zero_std": 0.35, "grad_norm": 0.157683327794075, "kl": 0.11592374939500587, "learning_rate": 1.61e-07, "loss": 0.0001, "num_tokens": 167106831.0, "reward": -0.134375, "reward_std": 0.6020404815673828, "rewards/verify_chess_move/mean": -0.134375, "rewards/verify_chess_move/std": 0.9894103288650513, "step": 2255 }, { "completion_length": 348.6, "completions/clipped_ratio": 0.0, "completions/max_length": 348.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 137.41640625, "completions/mean_terminated_length": 137.41640625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0020452969934134195, "frac_reward_zero_std": 0.38125, "grad_norm": 0.29197606444358826, "kl": 0.1945369095024944, "learning_rate": 1.6135714285714286e-07, "loss": 0.0002, "num_tokens": 167481420.0, "reward": -0.0984375, "reward_std": 0.5728073835372924, "rewards/verify_chess_move/mean": -0.0984375, "rewards/verify_chess_move/std": 0.9809035897254944, "step": 2260 }, { "completion_length": 424.2, "completions/clipped_ratio": 0.0, "completions/max_length": 424.2, "completions/max_terminated_length": 424.2, "completions/mean_length": 143.07578125, "completions/mean_terminated_length": 143.07578125, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0020498219867616796, "frac_reward_zero_std": 0.39375, "grad_norm": 0.13538064062595367, "kl": 0.03705068154449691, "learning_rate": 1.6171428571428572e-07, "loss": 0.0, "num_tokens": 167864821.0, "reward": -0.1484375, "reward_std": 0.5741717875003814, "rewards/verify_chess_move/mean": -0.1484375, "rewards/verify_chess_move/std": 0.9782132506370544, "step": 2265 }, { "completion_length": 394.2, "completions/clipped_ratio": 0.0, "completions/max_length": 394.2, "completions/max_terminated_length": 394.2, "completions/mean_length": 133.071875, "completions/mean_terminated_length": 133.071875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0020543469801099393, "frac_reward_zero_std": 0.35, "grad_norm": 0.1639159619808197, "kl": 0.04367964485209086, "learning_rate": 1.6207142857142856e-07, "loss": 0.0, "num_tokens": 168233257.0, "reward": 0.0328125, "reward_std": 0.5912279188632965, "rewards/verify_chess_move/mean": 0.0328125, "rewards/verify_chess_move/std": 0.9942065238952636, "step": 2270 }, { "completion_length": 397.6, "completions/clipped_ratio": 0.0, "completions/max_length": 397.6, "completions/max_terminated_length": 397.6, "completions/mean_length": 133.8421875, "completions/mean_terminated_length": 133.8421875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.002058871973458199, "frac_reward_zero_std": 0.3, "grad_norm": 0.27845603227615356, "kl": 0.028281817739480176, "learning_rate": 1.6242857142857142e-07, "loss": 0.0, "num_tokens": 168599919.0, "reward": -0.003125, "reward_std": 0.6433403611183166, "rewards/verify_chess_move/mean": -0.003125, "rewards/verify_chess_move/std": 0.997649085521698, "step": 2275 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 142.90390625, "completions/mean_terminated_length": 142.90390625, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.002063396966806459, "frac_reward_zero_std": 0.38125, "grad_norm": 0.2045123130083084, "kl": 0.020943244238151238, "learning_rate": 1.6278571428571428e-07, "loss": 0.0, "num_tokens": 168983676.0, "reward": -0.1171875, "reward_std": 0.5508217632770538, "rewards/verify_chess_move/mean": -0.1171875, "rewards/verify_chess_move/std": 0.9889607906341553, "step": 2280 }, { "completion_length": 370.6, "completions/clipped_ratio": 0.0, "completions/max_length": 370.6, "completions/max_terminated_length": 370.6, "completions/mean_length": 133.93671875, "completions/mean_terminated_length": 133.93671875, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0020679219601547187, "frac_reward_zero_std": 0.36875, "grad_norm": 0.1488858014345169, "kl": 0.016782278374012093, "learning_rate": 1.6314285714285714e-07, "loss": 0.0, "num_tokens": 169352667.0, "reward": 0.0140625, "reward_std": 0.5911635398864746, "rewards/verify_chess_move/mean": 0.0140625, "rewards/verify_chess_move/std": 0.9937710285186767, "step": 2285 }, { "completion_length": 392.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 146.4359375, "completions/mean_terminated_length": 146.4359375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0020724469535029783, "frac_reward_zero_std": 0.35625, "grad_norm": 0.1634940505027771, "kl": 0.016658728088805218, "learning_rate": 1.635e-07, "loss": 0.0, "num_tokens": 169740913.0, "reward": -0.0109375, "reward_std": 0.5768639266490936, "rewards/verify_chess_move/mean": -0.0109375, "rewards/verify_chess_move/std": 0.9911780714988708, "step": 2290 }, { "completion_length": 381.6, "completions/clipped_ratio": 0.0, "completions/max_length": 381.6, "completions/max_terminated_length": 381.6, "completions/mean_length": 134.62890625, "completions/mean_terminated_length": 134.62890625, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.002076971946851238, "frac_reward_zero_std": 0.3125, "grad_norm": 0.17834247648715973, "kl": 0.02037836761883227, "learning_rate": 1.6385714285714286e-07, "loss": 0.0, "num_tokens": 170108686.0, "reward": -0.065625, "reward_std": 0.6316687345504761, "rewards/verify_chess_move/mean": -0.065625, "rewards/verify_chess_move/std": 0.9887609362602234, "step": 2295 }, { "completion_length": 416.8, "completions/clipped_ratio": 0.0, "completions/max_length": 416.8, "completions/max_terminated_length": 416.8, "completions/mean_length": 141.59453125, "completions/mean_terminated_length": 141.59453125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.002081496940199498, "frac_reward_zero_std": 0.3375, "grad_norm": 0.17254780232906342, "kl": 0.01862644655338954, "learning_rate": 1.642142857142857e-07, "loss": 0.0, "num_tokens": 170488207.0, "reward": -0.0609375, "reward_std": 0.6159955143928528, "rewards/verify_chess_move/mean": -0.0609375, "rewards/verify_chess_move/std": 0.9907169222831727, "step": 2300 }, { "completion_length": 374.6, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 138.14296875, "completions/mean_terminated_length": 138.14296875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0020860219335477577, "frac_reward_zero_std": 0.3, "grad_norm": 0.21174676716327667, "kl": 0.028794095613557148, "learning_rate": 1.6457142857142856e-07, "loss": 0.0, "num_tokens": 170862662.0, "reward": -0.0859375, "reward_std": 0.6298364520072937, "rewards/verify_chess_move/mean": -0.0859375, "rewards/verify_chess_move/std": 0.9942379474639893, "step": 2305 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 149.5890625, "completions/mean_terminated_length": 149.5890625, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.0020905469268960174, "frac_reward_zero_std": 0.4, "grad_norm": 0.1989012062549591, "kl": 0.0491991466464242, "learning_rate": 1.6492857142857142e-07, "loss": 0.0, "num_tokens": 171257792.0, "reward": -0.209375, "reward_std": 0.5306327939033508, "rewards/verify_chess_move/mean": -0.209375, "rewards/verify_chess_move/std": 0.9685253024101257, "step": 2310 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 133.3234375, "completions/mean_terminated_length": 133.3234375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.002095071920244277, "frac_reward_zero_std": 0.36875, "grad_norm": 0.1699744611978531, "kl": 0.05092649572179653, "learning_rate": 1.6528571428571428e-07, "loss": 0.0001, "num_tokens": 171627006.0, "reward": -0.178125, "reward_std": 0.5705933213233948, "rewards/verify_chess_move/mean": -0.178125, "rewards/verify_chess_move/std": 0.9791673183441162, "step": 2315 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 136.040625, "completions/mean_terminated_length": 136.040625, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.002099596913592537, "frac_reward_zero_std": 0.2875, "grad_norm": 0.17840708792209625, "kl": 0.06553510643570917, "learning_rate": 1.6564285714285714e-07, "loss": 0.0001, "num_tokens": 171998754.0, "reward": -0.0375, "reward_std": 0.6443048715591431, "rewards/verify_chess_move/mean": -0.0375, "rewards/verify_chess_move/std": 0.9948637127876282, "step": 2320 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 141.01875, "completions/mean_terminated_length": 141.01875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.002104121906940797, "frac_reward_zero_std": 0.3625, "grad_norm": 0.17580485343933105, "kl": 0.06230300142487977, "learning_rate": 1.66e-07, "loss": 0.0001, "num_tokens": 172377642.0, "reward": -0.0421875, "reward_std": 0.5674484729766845, "rewards/verify_chess_move/mean": -0.0421875, "rewards/verify_chess_move/std": 0.993936049938202, "step": 2325 }, { "completion_length": 404.6, "completions/clipped_ratio": 0.0, "completions/max_length": 404.6, "completions/max_terminated_length": 404.6, "completions/mean_length": 140.003125, "completions/mean_terminated_length": 140.003125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0021086469002890565, "frac_reward_zero_std": 0.39375, "grad_norm": 0.10333205759525299, "kl": 0.04393911853840109, "learning_rate": 1.6635714285714287e-07, "loss": 0.0, "num_tokens": 172754174.0, "reward": -0.0109375, "reward_std": 0.5449764788150787, "rewards/verify_chess_move/mean": -0.0109375, "rewards/verify_chess_move/std": 0.9967106819152832, "step": 2330 }, { "completion_length": 353.8, "completions/clipped_ratio": 0.0, "completions/max_length": 353.8, "completions/max_terminated_length": 353.8, "completions/mean_length": 133.4375, "completions/mean_terminated_length": 133.4375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0021131718936373165, "frac_reward_zero_std": 0.40625, "grad_norm": 0.31981948018074036, "kl": 0.05700149239200982, "learning_rate": 1.667142857142857e-07, "loss": 0.0001, "num_tokens": 173121998.0, "reward": -0.090625, "reward_std": 0.5529140830039978, "rewards/verify_chess_move/mean": -0.090625, "rewards/verify_chess_move/std": 0.995103657245636, "step": 2335 }, { "completion_length": 453.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 128.378125, "completions/mean_terminated_length": 128.378125, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.002117696886985576, "frac_reward_zero_std": 0.3625, "grad_norm": 0.2245410531759262, "kl": 0.08611296221861267, "learning_rate": 1.6707142857142856e-07, "loss": 0.0001, "num_tokens": 173484618.0, "reward": -0.1421875, "reward_std": 0.5879014611244202, "rewards/verify_chess_move/mean": -0.1421875, "rewards/verify_chess_move/std": 0.9836718082427979, "step": 2340 }, { "completion_length": 361.6, "completions/clipped_ratio": 0.0, "completions/max_length": 361.6, "completions/max_terminated_length": 361.6, "completions/mean_length": 148.803125, "completions/mean_terminated_length": 148.803125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.002122221880333836, "frac_reward_zero_std": 0.29375, "grad_norm": 0.39416298270225525, "kl": 0.14235143752011936, "learning_rate": 1.6742857142857142e-07, "loss": 0.0001, "num_tokens": 173875838.0, "reward": -0.028125, "reward_std": 0.6628621697425843, "rewards/verify_chess_move/mean": -0.028125, "rewards/verify_chess_move/std": 0.9950923800468445, "step": 2345 }, { "completion_length": 378.4, "completions/clipped_ratio": 0.0, "completions/max_length": 378.4, "completions/max_terminated_length": 378.4, "completions/mean_length": 134.35390625, "completions/mean_terminated_length": 134.35390625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0021267468736820955, "frac_reward_zero_std": 0.375, "grad_norm": 0.1907842606306076, "kl": 0.15849912481935463, "learning_rate": 1.6778571428571428e-07, "loss": 0.0002, "num_tokens": 174244931.0, "reward": 0.04375, "reward_std": 0.5622842073440552, "rewards/verify_chess_move/mean": 0.04375, "rewards/verify_chess_move/std": 0.9954211473464966, "step": 2350 }, { "completion_length": 411.2, "completions/clipped_ratio": 0.0, "completions/max_length": 411.2, "completions/max_terminated_length": 411.2, "completions/mean_length": 144.05703125, "completions/mean_terminated_length": 144.05703125, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0021312718670303556, "frac_reward_zero_std": 0.375, "grad_norm": 0.12392512708902359, "kl": 0.04415841953887138, "learning_rate": 1.6814285714285715e-07, "loss": 0.0, "num_tokens": 174630892.0, "reward": -0.0109375, "reward_std": 0.5644533514976502, "rewards/verify_chess_move/mean": -0.0109375, "rewards/verify_chess_move/std": 0.9893957376480103, "step": 2355 }, { "completion_length": 407.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 144.62734375, "completions/mean_terminated_length": 144.62734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0021357968603786153, "frac_reward_zero_std": 0.35, "grad_norm": 0.18857407569885254, "kl": 0.029857688724587206, "learning_rate": 1.685e-07, "loss": 0.0, "num_tokens": 175015743.0, "reward": -0.0421875, "reward_std": 0.5760761141777039, "rewards/verify_chess_move/mean": -0.0421875, "rewards/verify_chess_move/std": 0.9991489171981811, "step": 2360 }, { "completion_length": 412.8, "completions/clipped_ratio": 0.0, "completions/max_length": 412.8, "completions/max_terminated_length": 412.8, "completions/mean_length": 138.67578125, "completions/mean_terminated_length": 138.67578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.002140321853726875, "frac_reward_zero_std": 0.35625, "grad_norm": 0.12650324404239655, "kl": 0.018615932990360306, "learning_rate": 1.6885714285714284e-07, "loss": 0.0, "num_tokens": 175390552.0, "reward": 0.0109375, "reward_std": 0.5741804122924805, "rewards/verify_chess_move/mean": 0.0109375, "rewards/verify_chess_move/std": 0.9962981581687927, "step": 2365 }, { "completion_length": 385.2, "completions/clipped_ratio": 0.0, "completions/max_length": 385.2, "completions/max_terminated_length": 385.2, "completions/mean_length": 136.0609375, "completions/mean_terminated_length": 136.0609375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0021448468470751346, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1662798523902893, "kl": 0.017805217629211258, "learning_rate": 1.692142857142857e-07, "loss": 0.0, "num_tokens": 175761054.0, "reward": -0.0140625, "reward_std": 0.5854431092739105, "rewards/verify_chess_move/mean": -0.0140625, "rewards/verify_chess_move/std": 0.9981855750083923, "step": 2370 }, { "completion_length": 373.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 141.22890625, "completions/mean_terminated_length": 141.22890625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0021493718404233947, "frac_reward_zero_std": 0.3625, "grad_norm": 0.21990065276622772, "kl": 0.025186145595216657, "learning_rate": 1.6957142857142856e-07, "loss": 0.0, "num_tokens": 176142331.0, "reward": -0.0375, "reward_std": 0.5860152006149292, "rewards/verify_chess_move/mean": -0.0375, "rewards/verify_chess_move/std": 0.9880841374397278, "step": 2375 }, { "completion_length": 441.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 441.2, "completions/max_terminated_length": 355.4, "completions/mean_length": 141.7140625, "completions/mean_terminated_length": 141.22726440429688, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0021538968337716543, "frac_reward_zero_std": 0.35625, "grad_norm": 0.18965326249599457, "kl": 0.015621185221971246, "learning_rate": 1.6992857142857143e-07, "loss": 0.0, "num_tokens": 176521901.0, "reward": -0.0015625, "reward_std": 0.5786542057991028, "rewards/verify_chess_move/mean": -0.0015625, "rewards/verify_chess_move/std": 0.9912512063980102, "step": 2380 }, { "completion_length": 434.8, "completions/clipped_ratio": 0.0, "completions/max_length": 434.8, "completions/max_terminated_length": 434.8, "completions/mean_length": 146.30859375, "completions/mean_terminated_length": 146.30859375, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.002158421827119914, "frac_reward_zero_std": 0.38125, "grad_norm": 0.15389403700828552, "kl": 0.031943568375572794, "learning_rate": 1.7028571428571429e-07, "loss": 0.0, "num_tokens": 176907088.0, "reward": 0.009375, "reward_std": 0.5512470781803132, "rewards/verify_chess_move/mean": 0.009375, "rewards/verify_chess_move/std": 0.9900226593017578, "step": 2385 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.0, "completions/max_length": 367.8, "completions/max_terminated_length": 367.8, "completions/mean_length": 137.14765625, "completions/mean_terminated_length": 137.14765625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.002162946820468174, "frac_reward_zero_std": 0.31875, "grad_norm": 0.3516212999820709, "kl": 0.031426040398946496, "learning_rate": 1.7064285714285715e-07, "loss": 0.0, "num_tokens": 177282213.0, "reward": -0.0796875, "reward_std": 0.6143141150474548, "rewards/verify_chess_move/mean": -0.0796875, "rewards/verify_chess_move/std": 0.9914002537727356, "step": 2390 }, { "completion_length": 364.4, "completions/clipped_ratio": 0.0, "completions/max_length": 364.4, "completions/max_terminated_length": 364.4, "completions/mean_length": 138.63828125, "completions/mean_terminated_length": 138.63828125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0021674718138164338, "frac_reward_zero_std": 0.38125, "grad_norm": 0.14376574754714966, "kl": 0.03991924791771453, "learning_rate": 1.71e-07, "loss": 0.0, "num_tokens": 177657750.0, "reward": -0.1046875, "reward_std": 0.5730185389518738, "rewards/verify_chess_move/mean": -0.1046875, "rewards/verify_chess_move/std": 0.9951687455177307, "step": 2395 }, { "completion_length": 406.2, "completions/clipped_ratio": 0.0, "completions/max_length": 406.2, "completions/max_terminated_length": 406.2, "completions/mean_length": 141.22421875, "completions/mean_terminated_length": 141.22421875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0021719968071646934, "frac_reward_zero_std": 0.3375, "grad_norm": 0.21666622161865234, "kl": 0.06000828236356028, "learning_rate": 1.7135714285714284e-07, "loss": 0.0001, "num_tokens": 178035861.0, "reward": 0.0015625, "reward_std": 0.596437680721283, "rewards/verify_chess_move/mean": 0.0015625, "rewards/verify_chess_move/std": 1.0003605604171752, "step": 2400 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 137.5484375, "completions/mean_terminated_length": 137.5484375, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.002176521800512953, "frac_reward_zero_std": 0.35625, "grad_norm": 0.21265779435634613, "kl": 0.06627305677757249, "learning_rate": 1.717142857142857e-07, "loss": 0.0001, "num_tokens": 178412427.0, "reward": -0.1375, "reward_std": 0.5873258054256439, "rewards/verify_chess_move/mean": -0.1375, "rewards/verify_chess_move/std": 0.9887895941734314, "step": 2405 }, { "completion_length": 426.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.8, "completions/max_terminated_length": 395.4, "completions/mean_length": 133.23203125, "completions/mean_terminated_length": 132.75115356445312, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.002181046793861213, "frac_reward_zero_std": 0.33125, "grad_norm": 0.18731476366519928, "kl": 0.04334643893234898, "learning_rate": 1.7207142857142857e-07, "loss": 0.0, "num_tokens": 178779476.0, "reward": -0.0359375, "reward_std": 0.6050044536590576, "rewards/verify_chess_move/mean": -0.0359375, "rewards/verify_chess_move/std": 0.9818778753280639, "step": 2410 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.002185571787209473, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1673417091369629, "kl": 0.04066256330333999, "learning_rate": 1.7242857142857143e-07, "loss": 0.0, "num_tokens": 179158868.0, "reward": -0.05625, "reward_std": 0.5876892924308776, "rewards/verify_chess_move/mean": -0.05625, "rewards/verify_chess_move/std": 0.9929763317108155, "step": 2415 }, { "completion_length": 350.6, "completions/clipped_ratio": 0.0, "completions/max_length": 350.6, "completions/max_terminated_length": 350.6, "completions/mean_length": 144.6390625, "completions/mean_terminated_length": 144.6390625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0021900967805577325, "frac_reward_zero_std": 0.4, "grad_norm": 0.16284912824630737, "kl": 0.0406235070367984, "learning_rate": 1.727857142857143e-07, "loss": 0.0, "num_tokens": 179545774.0, "reward": -0.1328125, "reward_std": 0.5547177970409394, "rewards/verify_chess_move/mean": -0.1328125, "rewards/verify_chess_move/std": 0.9828023552894593, "step": 2420 }, { "completion_length": 450.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 138.60859375, "completions/mean_terminated_length": 138.60859375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0021946217739059926, "frac_reward_zero_std": 0.3375, "grad_norm": 0.15192678570747375, "kl": 0.04336375297789345, "learning_rate": 1.7314285714285715e-07, "loss": 0.0, "num_tokens": 179919713.0, "reward": 0.00625, "reward_std": 0.5901221692562103, "rewards/verify_chess_move/mean": 0.00625, "rewards/verify_chess_move/std": 0.9912027359008789, "step": 2425 }, { "completion_length": 360.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 143.7875, "completions/mean_terminated_length": 143.7875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0021991467672542522, "frac_reward_zero_std": 0.35, "grad_norm": 0.16651703417301178, "kl": 0.08267807831580284, "learning_rate": 1.7349999999999999e-07, "loss": 0.0001, "num_tokens": 180305593.0, "reward": -0.1, "reward_std": 0.5746606945991516, "rewards/verify_chess_move/mean": -0.1, "rewards/verify_chess_move/std": 0.9960073709487915, "step": 2430 }, { "completion_length": 379.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 142.440625, "completions/mean_terminated_length": 142.440625, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.002203671760602512, "frac_reward_zero_std": 0.35625, "grad_norm": 0.28379684686660767, "kl": 0.06198802032740787, "learning_rate": 1.7385714285714285e-07, "loss": 0.0001, "num_tokens": 180688733.0, "reward": -0.06875, "reward_std": 0.6051650524139405, "rewards/verify_chess_move/mean": -0.06875, "rewards/verify_chess_move/std": 0.9975046515464783, "step": 2435 }, { "completion_length": 404.6, "completions/clipped_ratio": 0.0, "completions/max_length": 404.6, "completions/max_terminated_length": 404.6, "completions/mean_length": 136.0515625, "completions/mean_terminated_length": 136.0515625, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0022081967539507716, "frac_reward_zero_std": 0.3, "grad_norm": 0.24054737389087677, "kl": 0.1109378658598871, "learning_rate": 1.742142857142857e-07, "loss": 0.0001, "num_tokens": 181059351.0, "reward": 0.0078125, "reward_std": 0.6288443803787231, "rewards/verify_chess_move/mean": 0.0078125, "rewards/verify_chess_move/std": 0.9957873463630676, "step": 2440 }, { "completion_length": 357.6, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 133.00546875, "completions/mean_terminated_length": 133.00546875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0022127217472990316, "frac_reward_zero_std": 0.3375, "grad_norm": 0.1869378387928009, "kl": 0.10018589260871522, "learning_rate": 1.7457142857142857e-07, "loss": 0.0001, "num_tokens": 181424062.0, "reward": 0.0921875, "reward_std": 0.5982744872570038, "rewards/verify_chess_move/mean": 0.0921875, "rewards/verify_chess_move/std": 0.9938002586364746, "step": 2445 }, { "completion_length": 449.6, "completions/clipped_ratio": 0.0, "completions/max_length": 449.6, "completions/max_terminated_length": 449.6, "completions/mean_length": 137.90546875, "completions/mean_terminated_length": 137.90546875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0022172467406472913, "frac_reward_zero_std": 0.4, "grad_norm": 0.1186194121837616, "kl": 0.07087443573473137, "learning_rate": 1.7492857142857143e-07, "loss": 0.0001, "num_tokens": 181800277.0, "reward": -0.0140625, "reward_std": 0.5446559667587281, "rewards/verify_chess_move/mean": -0.0140625, "rewards/verify_chess_move/std": 1.0011082887649536, "step": 2450 }, { "completion_length": 425.6, "completions/clipped_ratio": 0.0, "completions/max_length": 425.6, "completions/max_terminated_length": 425.6, "completions/mean_length": 151.0078125, "completions/mean_terminated_length": 151.0078125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.002221771733995551, "frac_reward_zero_std": 0.2875, "grad_norm": 0.17296360433101654, "kl": 0.04915364078478888, "learning_rate": 1.752857142857143e-07, "loss": 0.0, "num_tokens": 182193023.0, "reward": -0.11875, "reward_std": 0.6595981001853943, "rewards/verify_chess_move/mean": -0.11875, "rewards/verify_chess_move/std": 0.9703645348548889, "step": 2455 }, { "completion_length": 355.2, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 143.18515625, "completions/mean_terminated_length": 143.18515625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0022262967273438106, "frac_reward_zero_std": 0.375, "grad_norm": 0.16637375950813293, "kl": 0.060657396135502493, "learning_rate": 1.7564285714285715e-07, "loss": 0.0001, "num_tokens": 182577556.0, "reward": -0.053125, "reward_std": 0.5641734004020691, "rewards/verify_chess_move/mean": -0.053125, "rewards/verify_chess_move/std": 0.9929107308387757, "step": 2460 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 146.79609375, "completions/mean_terminated_length": 146.79609375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0022308217206920707, "frac_reward_zero_std": 0.38125, "grad_norm": 0.25024861097335815, "kl": 0.043270996279170505, "learning_rate": 1.76e-07, "loss": 0.0, "num_tokens": 182969447.0, "reward": -0.140625, "reward_std": 0.545352965593338, "rewards/verify_chess_move/mean": -0.140625, "rewards/verify_chess_move/std": 0.9805113077163696, "step": 2465 }, { "completion_length": 458.2, "completions/clipped_ratio": 0.0, "completions/max_length": 458.2, "completions/max_terminated_length": 458.2, "completions/mean_length": 152.02109375, "completions/mean_terminated_length": 152.02109375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0022353467140403304, "frac_reward_zero_std": 0.35, "grad_norm": 0.12720827758312225, "kl": 0.030746048553555737, "learning_rate": 1.7635714285714285e-07, "loss": 0.0, "num_tokens": 183365370.0, "reward": -0.003125, "reward_std": 0.5834979772567749, "rewards/verify_chess_move/mean": -0.003125, "rewards/verify_chess_move/std": 0.9949919700622558, "step": 2470 }, { "completion_length": 354.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 141.27578125, "completions/mean_terminated_length": 141.27578125, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.00223987170738859, "frac_reward_zero_std": 0.36875, "grad_norm": 0.19385069608688354, "kl": 0.017374995633144864, "learning_rate": 1.767142857142857e-07, "loss": 0.0, "num_tokens": 183745467.0, "reward": -0.0875, "reward_std": 0.5773826003074646, "rewards/verify_chess_move/mean": -0.0875, "rewards/verify_chess_move/std": 0.9953070282936096, "step": 2475 }, { "completion_length": 408.4, "completions/clipped_ratio": 0.0, "completions/max_length": 408.4, "completions/max_terminated_length": 408.4, "completions/mean_length": 142.203125, "completions/mean_terminated_length": 142.203125, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.00224439670073685, "frac_reward_zero_std": 0.35625, "grad_norm": 0.15478824079036713, "kl": 0.015487169927655486, "learning_rate": 1.7707142857142857e-07, "loss": 0.0, "num_tokens": 184125295.0, "reward": -0.0796875, "reward_std": 0.586476182937622, "rewards/verify_chess_move/mean": -0.0796875, "rewards/verify_chess_move/std": 0.9865691065788269, "step": 2480 }, { "completion_length": 348.6, "completions/clipped_ratio": 0.0, "completions/max_length": 348.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 134.425, "completions/mean_terminated_length": 134.425, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0022489216940851098, "frac_reward_zero_std": 0.34375, "grad_norm": 0.14823517203330994, "kl": 0.018497400223714065, "learning_rate": 1.7742857142857143e-07, "loss": 0.0, "num_tokens": 184493479.0, "reward": -0.0578125, "reward_std": 0.5833951473236084, "rewards/verify_chess_move/mean": -0.0578125, "rewards/verify_chess_move/std": 0.9957728862762452, "step": 2485 }, { "completion_length": 404.8, "completions/clipped_ratio": 0.0, "completions/max_length": 404.8, "completions/max_terminated_length": 404.8, "completions/mean_length": 142.68203125, "completions/mean_terminated_length": 142.68203125, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0022534466874333694, "frac_reward_zero_std": 0.26875, "grad_norm": 0.14274871349334717, "kl": 0.016892510449542895, "learning_rate": 1.777857142857143e-07, "loss": 0.0, "num_tokens": 184875648.0, "reward": -0.0859375, "reward_std": 0.6455715417861938, "rewards/verify_chess_move/mean": -0.0859375, "rewards/verify_chess_move/std": 0.9977437734603882, "step": 2490 }, { "completion_length": 386.4, "completions/clipped_ratio": 0.0, "completions/max_length": 386.4, "completions/max_terminated_length": 386.4, "completions/mean_length": 142.18203125, "completions/mean_terminated_length": 142.18203125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.002257971680781629, "frac_reward_zero_std": 0.3375, "grad_norm": 0.15131144225597382, "kl": 0.029513477742148096, "learning_rate": 1.7814285714285713e-07, "loss": 0.0, "num_tokens": 185255721.0, "reward": -0.071875, "reward_std": 0.6044717907905579, "rewards/verify_chess_move/mean": -0.071875, "rewards/verify_chess_move/std": 0.9943570017814636, "step": 2495 }, { "completion_length": 466.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 466.2, "completions/max_terminated_length": 380.0, "completions/mean_length": 132.3203125, "completions/mean_terminated_length": 131.81613159179688, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.002262496674129889, "frac_reward_zero_std": 0.375, "grad_norm": 0.19048747420310974, "kl": 0.03727421307557961, "learning_rate": 1.785e-07, "loss": 0.0, "num_tokens": 185624243.0, "reward": -0.0625, "reward_std": 0.5637703597545624, "rewards/verify_chess_move/mean": -0.0625, "rewards/verify_chess_move/std": 0.9925297856330871, "step": 2500 }, { "completion_length": 405.4, "completions/clipped_ratio": 0.0, "completions/max_length": 405.4, "completions/max_terminated_length": 405.4, "completions/mean_length": 145.6765625, "completions/mean_terminated_length": 145.6765625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.002267021667478149, "frac_reward_zero_std": 0.35, "grad_norm": 0.1551334410905838, "kl": 0.03573697673564311, "learning_rate": 1.7885714285714285e-07, "loss": 0.0, "num_tokens": 186010437.0, "reward": -0.06875, "reward_std": 0.5960572957992554, "rewards/verify_chess_move/mean": -0.06875, "rewards/verify_chess_move/std": 0.9952069044113159, "step": 2505 }, { "completion_length": 462.6, "completions/clipped_ratio": 0.0, "completions/max_length": 462.6, "completions/max_terminated_length": 462.6, "completions/mean_length": 144.5671875, "completions/mean_terminated_length": 144.5671875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0022715466608264085, "frac_reward_zero_std": 0.29375, "grad_norm": 0.19588109850883484, "kl": 0.03569626504831831, "learning_rate": 1.792142857142857e-07, "loss": 0.0, "num_tokens": 186393075.0, "reward": -0.06875, "reward_std": 0.6237930059432983, "rewards/verify_chess_move/mean": -0.06875, "rewards/verify_chess_move/std": 0.9965782761573792, "step": 2510 }, { "completion_length": 389.2, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/max_terminated_length": 389.2, "completions/mean_length": 138.32578125, "completions/mean_terminated_length": 138.32578125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.002276071654174668, "frac_reward_zero_std": 0.3375, "grad_norm": 0.2564217746257782, "kl": 0.09180202467759954, "learning_rate": 1.7957142857142857e-07, "loss": 0.0001, "num_tokens": 186770044.0, "reward": -0.1125, "reward_std": 0.5976399540901184, "rewards/verify_chess_move/mean": -0.1125, "rewards/verify_chess_move/std": 0.9921355485916138, "step": 2515 }, { "completion_length": 350.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 144.7828125, "completions/mean_terminated_length": 144.7828125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0022805966475229283, "frac_reward_zero_std": 0.3375, "grad_norm": 0.26169925928115845, "kl": 0.12151081559131853, "learning_rate": 1.7992857142857144e-07, "loss": 0.0001, "num_tokens": 187157334.0, "reward": -0.075, "reward_std": 0.5923872351646423, "rewards/verify_chess_move/mean": -0.075, "rewards/verify_chess_move/std": 0.9869626879692077, "step": 2520 }, { "completion_length": 376.8, "completions/clipped_ratio": 0.0, "completions/max_length": 376.8, "completions/max_terminated_length": 376.8, "completions/mean_length": 139.71875, "completions/mean_terminated_length": 139.71875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.002285121640871188, "frac_reward_zero_std": 0.35625, "grad_norm": 0.14380745589733124, "kl": 0.12696503077386295, "learning_rate": 1.802857142857143e-07, "loss": 0.0001, "num_tokens": 187532606.0, "reward": 0.009375, "reward_std": 0.5819064319133759, "rewards/verify_chess_move/mean": 0.009375, "rewards/verify_chess_move/std": 1.0007464408874511, "step": 2525 }, { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 141.0921875, "completions/mean_terminated_length": 141.0921875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0022896466342194476, "frac_reward_zero_std": 0.39375, "grad_norm": 0.16349877417087555, "kl": 0.08795331978108152, "learning_rate": 1.8064285714285713e-07, "loss": 0.0001, "num_tokens": 187914308.0, "reward": -0.1421875, "reward_std": 0.5608587563037872, "rewards/verify_chess_move/mean": -0.1421875, "rewards/verify_chess_move/std": 0.990114939212799, "step": 2530 }, { "completion_length": 356.8, "completions/clipped_ratio": 0.0, "completions/max_length": 356.8, "completions/max_terminated_length": 356.8, "completions/mean_length": 140.58203125, "completions/mean_terminated_length": 140.58203125, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0022941716275677077, "frac_reward_zero_std": 0.3625, "grad_norm": 0.16653874516487122, "kl": 0.040791677461675134, "learning_rate": 1.81e-07, "loss": 0.0, "num_tokens": 188291301.0, "reward": 0.0359375, "reward_std": 0.5803836584091187, "rewards/verify_chess_move/mean": 0.0359375, "rewards/verify_chess_move/std": 0.9952909708023071, "step": 2535 }, { "completion_length": 342.6, "completions/clipped_ratio": 0.0, "completions/max_length": 342.6, "completions/max_terminated_length": 342.6, "completions/mean_length": 134.66953125, "completions/mean_terminated_length": 134.66953125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0022986966209159673, "frac_reward_zero_std": 0.34375, "grad_norm": 0.19218222796916962, "kl": 0.0405906157757272, "learning_rate": 1.8135714285714285e-07, "loss": 0.0, "num_tokens": 188661622.0, "reward": -0.0546875, "reward_std": 0.5964272379875183, "rewards/verify_chess_move/mean": -0.0546875, "rewards/verify_chess_move/std": 0.9998948693275451, "step": 2540 }, { "completion_length": 330.8, "completions/clipped_ratio": 0.0, "completions/max_length": 330.8, "completions/max_terminated_length": 330.8, "completions/mean_length": 131.96171875, "completions/mean_terminated_length": 131.96171875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.002303221614264227, "frac_reward_zero_std": 0.325, "grad_norm": 0.17830054461956024, "kl": 0.04630861049809028, "learning_rate": 1.8171428571428572e-07, "loss": 0.0, "num_tokens": 189028013.0, "reward": -0.090625, "reward_std": 0.6151009321212768, "rewards/verify_chess_move/mean": -0.090625, "rewards/verify_chess_move/std": 0.9878741025924682, "step": 2545 }, { "completion_length": 385.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 141.77265625, "completions/mean_terminated_length": 141.77265625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0023077466076124866, "frac_reward_zero_std": 0.34375, "grad_norm": 0.1833927035331726, "kl": 0.03755513953074115, "learning_rate": 1.8207142857142858e-07, "loss": 0.0, "num_tokens": 189408298.0, "reward": -0.0078125, "reward_std": 0.5930558323860169, "rewards/verify_chess_move/mean": -0.0078125, "rewards/verify_chess_move/std": 0.9948360204696656, "step": 2550 }, { "completion_length": 420.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 140.79921875, "completions/mean_terminated_length": 140.79921875, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0023122716009607467, "frac_reward_zero_std": 0.375, "grad_norm": 0.13506396114826202, "kl": 0.043262213192065246, "learning_rate": 1.8242857142857144e-07, "loss": 0.0, "num_tokens": 189789449.0, "reward": -0.11875, "reward_std": 0.5670700311660767, "rewards/verify_chess_move/mean": -0.11875, "rewards/verify_chess_move/std": 0.9883346676826477, "step": 2555 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 150.15078125, "completions/mean_terminated_length": 150.15078125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0023167965943090064, "frac_reward_zero_std": 0.3375, "grad_norm": 0.16321884095668793, "kl": 0.02108868226132472, "learning_rate": 1.8278571428571427e-07, "loss": 0.0, "num_tokens": 190181058.0, "reward": -0.0328125, "reward_std": 0.5888075590133667, "rewards/verify_chess_move/mean": -0.0328125, "rewards/verify_chess_move/std": 0.9985413908958435, "step": 2560 }, { "completion_length": 397.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 139.75390625, "completions/mean_terminated_length": 139.75390625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.002321321587657266, "frac_reward_zero_std": 0.30625, "grad_norm": 0.3139757513999939, "kl": 0.019675533611007268, "learning_rate": 1.8314285714285713e-07, "loss": 0.0, "num_tokens": 190558791.0, "reward": -0.1484375, "reward_std": 0.6338734984397888, "rewards/verify_chess_move/mean": -0.1484375, "rewards/verify_chess_move/std": 0.988506019115448, "step": 2565 }, { "completion_length": 347.2, "completions/clipped_ratio": 0.0, "completions/max_length": 347.2, "completions/max_terminated_length": 347.2, "completions/mean_length": 137.12890625, "completions/mean_terminated_length": 137.12890625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0023258465810055257, "frac_reward_zero_std": 0.275, "grad_norm": 0.1745087057352066, "kl": 0.023386424405907748, "learning_rate": 1.835e-07, "loss": 0.0, "num_tokens": 190931924.0, "reward": -0.115625, "reward_std": 0.6722307085990906, "rewards/verify_chess_move/mean": -0.115625, "rewards/verify_chess_move/std": 0.9892145395278931, "step": 2570 }, { "completion_length": 405.8, "completions/clipped_ratio": 0.0, "completions/max_length": 405.8, "completions/max_terminated_length": 405.8, "completions/mean_length": 137.51015625, "completions/mean_terminated_length": 137.51015625, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.002330371574353786, "frac_reward_zero_std": 0.3625, "grad_norm": 0.39373648166656494, "kl": 0.02902988793939585, "learning_rate": 1.8385714285714286e-07, "loss": 0.0, "num_tokens": 191305553.0, "reward": -0.0453125, "reward_std": 0.5717615187168121, "rewards/verify_chess_move/mean": -0.0453125, "rewards/verify_chess_move/std": 0.9939318418502807, "step": 2575 }, { "completion_length": 446.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 446.0, "completions/max_terminated_length": 361.8, "completions/mean_length": 138.32578125, "completions/mean_terminated_length": 137.8467041015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0023348965677020455, "frac_reward_zero_std": 0.3875, "grad_norm": 0.24905090034008026, "kl": 0.028324963420891434, "learning_rate": 1.8421428571428572e-07, "loss": 0.0, "num_tokens": 191682650.0, "reward": -0.10625, "reward_std": 0.5542959809303284, "rewards/verify_chess_move/mean": -0.10625, "rewards/verify_chess_move/std": 0.9893331408500672, "step": 2580 }, { "completion_length": 357.6, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 146.99375, "completions/mean_terminated_length": 146.99375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.002339421561050305, "frac_reward_zero_std": 0.31875, "grad_norm": 0.19923627376556396, "kl": 0.02849295602281927, "learning_rate": 1.8457142857142858e-07, "loss": 0.0, "num_tokens": 192071322.0, "reward": -0.0109375, "reward_std": 0.6017548203468323, "rewards/verify_chess_move/mean": -0.0109375, "rewards/verify_chess_move/std": 0.9998579859733582, "step": 2585 }, { "completion_length": 395.8, "completions/clipped_ratio": 0.0, "completions/max_length": 395.8, "completions/max_terminated_length": 395.8, "completions/mean_length": 146.50859375, "completions/mean_terminated_length": 146.50859375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.002343946554398565, "frac_reward_zero_std": 0.3375, "grad_norm": 0.20349228382110596, "kl": 0.04275867652322631, "learning_rate": 1.8492857142857144e-07, "loss": 0.0, "num_tokens": 192457909.0, "reward": 0.015625, "reward_std": 0.597215610742569, "rewards/verify_chess_move/mean": 0.015625, "rewards/verify_chess_move/std": 0.9929068326950073, "step": 2590 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 148.91875, "completions/mean_terminated_length": 148.91875, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.002348471547746825, "frac_reward_zero_std": 0.3625, "grad_norm": 0.20350264012813568, "kl": 0.03345810080500087, "learning_rate": 1.8528571428571428e-07, "loss": 0.0, "num_tokens": 192848653.0, "reward": -0.05, "reward_std": 0.5674979150295257, "rewards/verify_chess_move/mean": -0.05, "rewards/verify_chess_move/std": 0.9904330611228943, "step": 2595 }, { "completion_length": 382.8, "completions/clipped_ratio": 0.0, "completions/max_length": 382.8, "completions/max_terminated_length": 382.8, "completions/mean_length": 137.0609375, "completions/mean_terminated_length": 137.0609375, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0023529965410950845, "frac_reward_zero_std": 0.40625, "grad_norm": 0.17795556783676147, "kl": 0.03779394396551652, "learning_rate": 1.8564285714285714e-07, "loss": 0.0, "num_tokens": 193223003.0, "reward": -0.075, "reward_std": 0.5331431090831756, "rewards/verify_chess_move/mean": -0.075, "rewards/verify_chess_move/std": 0.9857029676437378, "step": 2600 }, { "completion_length": 436.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 143.39765625, "completions/mean_terminated_length": 143.39765625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.002357521534443344, "frac_reward_zero_std": 0.4125, "grad_norm": 0.10576451569795609, "kl": 0.018983066227519885, "learning_rate": 1.86e-07, "loss": 0.0, "num_tokens": 193606248.0, "reward": -0.078125, "reward_std": 0.5400807678699493, "rewards/verify_chess_move/mean": -0.078125, "rewards/verify_chess_move/std": 0.9966934442520141, "step": 2605 }, { "completion_length": 465.4, "completions/clipped_ratio": 0.0, "completions/max_length": 465.4, "completions/max_terminated_length": 465.4, "completions/mean_length": 138.44921875, "completions/mean_terminated_length": 138.44921875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0023620465277916043, "frac_reward_zero_std": 0.325, "grad_norm": 0.2537280321121216, "kl": 0.023633995014824903, "learning_rate": 1.8635714285714286e-07, "loss": 0.0, "num_tokens": 193980583.0, "reward": -0.0046875, "reward_std": 0.6107423901557922, "rewards/verify_chess_move/mean": -0.0046875, "rewards/verify_chess_move/std": 0.9995226502418518, "step": 2610 }, { "completion_length": 330.2, "completions/clipped_ratio": 0.0, "completions/max_length": 330.2, "completions/max_terminated_length": 330.2, "completions/mean_length": 137.92109375, "completions/mean_terminated_length": 137.92109375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.002366571521139864, "frac_reward_zero_std": 0.30625, "grad_norm": 0.18980392813682556, "kl": 0.022726716480974574, "learning_rate": 1.8671428571428572e-07, "loss": 0.0, "num_tokens": 194357562.0, "reward": -0.065625, "reward_std": 0.6244734287261963, "rewards/verify_chess_move/mean": -0.065625, "rewards/verify_chess_move/std": 0.9975497603416443, "step": 2615 }, { "completion_length": 385.6, "completions/clipped_ratio": 0.0, "completions/max_length": 385.6, "completions/max_terminated_length": 385.6, "completions/mean_length": 133.396875, "completions/mean_terminated_length": 133.396875, "completions/min_length": 41.6, "completions/min_terminated_length": 41.6, "epoch": 0.0023710965144881236, "frac_reward_zero_std": 0.375, "grad_norm": 0.23039275407791138, "kl": 0.024107029433071147, "learning_rate": 1.8707142857142858e-07, "loss": 0.0, "num_tokens": 194723742.0, "reward": 0.0140625, "reward_std": 0.5650191128253936, "rewards/verify_chess_move/mean": 0.0140625, "rewards/verify_chess_move/std": 0.9892754673957824, "step": 2620 }, { "completion_length": 398.4, "completions/clipped_ratio": 0.0, "completions/max_length": 398.4, "completions/max_terminated_length": 398.4, "completions/mean_length": 148.5671875, "completions/mean_terminated_length": 148.5671875, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0023756215078363837, "frac_reward_zero_std": 0.34375, "grad_norm": 0.18760375678539276, "kl": 0.013502047365909675, "learning_rate": 1.8742857142857142e-07, "loss": 0.0, "num_tokens": 195116380.0, "reward": 0.0171875, "reward_std": 0.5757610917091369, "rewards/verify_chess_move/mean": 0.0171875, "rewards/verify_chess_move/std": 0.9977030873298645, "step": 2625 }, { "completion_length": 351.6, "completions/clipped_ratio": 0.0, "completions/max_length": 351.6, "completions/max_terminated_length": 351.6, "completions/mean_length": 134.0328125, "completions/mean_terminated_length": 134.0328125, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.0023801465011846434, "frac_reward_zero_std": 0.375, "grad_norm": 0.15079563856124878, "kl": 0.037773804060998374, "learning_rate": 1.8778571428571428e-07, "loss": 0.0, "num_tokens": 195486278.0, "reward": -0.1703125, "reward_std": 0.5837980270385742, "rewards/verify_chess_move/mean": -0.1703125, "rewards/verify_chess_move/std": 0.9798417687416077, "step": 2630 }, { "completion_length": 414.8, "completions/clipped_ratio": 0.0, "completions/max_length": 414.8, "completions/max_terminated_length": 414.8, "completions/mean_length": 143.8203125, "completions/mean_terminated_length": 143.8203125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.002384671494532903, "frac_reward_zero_std": 0.29375, "grad_norm": 0.14810222387313843, "kl": 0.03574077554658288, "learning_rate": 1.8814285714285714e-07, "loss": 0.0, "num_tokens": 195867088.0, "reward": 0.0390625, "reward_std": 0.6353631973266601, "rewards/verify_chess_move/mean": 0.0390625, "rewards/verify_chess_move/std": 0.9999918699264526, "step": 2635 }, { "completion_length": 484.4, "completions/clipped_ratio": 0.0, "completions/max_length": 484.4, "completions/max_terminated_length": 484.4, "completions/mean_length": 143.590625, "completions/mean_terminated_length": 143.590625, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.0023891964878811627, "frac_reward_zero_std": 0.4125, "grad_norm": 0.11644193530082703, "kl": 0.03330253482054104, "learning_rate": 1.885e-07, "loss": 0.0, "num_tokens": 196250012.0, "reward": 0.01875, "reward_std": 0.5209027230739594, "rewards/verify_chess_move/mean": 0.01875, "rewards/verify_chess_move/std": 1.000059974193573, "step": 2640 }, { "completion_length": 360.6, "completions/clipped_ratio": 0.0, "completions/max_length": 360.6, "completions/max_terminated_length": 360.6, "completions/mean_length": 145.06484375, "completions/mean_terminated_length": 145.06484375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0023937214812294228, "frac_reward_zero_std": 0.3625, "grad_norm": 0.20598198473453522, "kl": 0.05739295100502204, "learning_rate": 1.8885714285714286e-07, "loss": 0.0001, "num_tokens": 196636151.0, "reward": -0.0609375, "reward_std": 0.582811462879181, "rewards/verify_chess_move/mean": -0.0609375, "rewards/verify_chess_move/std": 0.9956138134002686, "step": 2645 }, { "completion_length": 373.4, "completions/clipped_ratio": 0.0, "completions/max_length": 373.4, "completions/max_terminated_length": 373.4, "completions/mean_length": 147.8125, "completions/mean_terminated_length": 147.8125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0023982464745776824, "frac_reward_zero_std": 0.36875, "grad_norm": 0.20672066509723663, "kl": 0.053189598519384165, "learning_rate": 1.8921428571428572e-07, "loss": 0.0001, "num_tokens": 197025903.0, "reward": -0.05, "reward_std": 0.5819532990455627, "rewards/verify_chess_move/mean": -0.05, "rewards/verify_chess_move/std": 0.9952781796455383, "step": 2650 }, { "completion_length": 405.8, "completions/clipped_ratio": 0.0, "completions/max_length": 405.8, "completions/max_terminated_length": 405.8, "completions/mean_length": 144.1796875, "completions/mean_terminated_length": 144.1796875, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.002402771467925942, "frac_reward_zero_std": 0.3, "grad_norm": 0.216613307595253, "kl": 0.08049721754941856, "learning_rate": 1.8957142857142858e-07, "loss": 0.0001, "num_tokens": 197410853.0, "reward": -0.0078125, "reward_std": 0.6477221727371216, "rewards/verify_chess_move/mean": -0.0078125, "rewards/verify_chess_move/std": 0.9997222900390625, "step": 2655 }, { "completion_length": 399.6, "completions/clipped_ratio": 0.0, "completions/max_length": 399.6, "completions/max_terminated_length": 399.6, "completions/mean_length": 145.21640625, "completions/mean_terminated_length": 145.21640625, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0024072964612742017, "frac_reward_zero_std": 0.39375, "grad_norm": 0.17246128618717194, "kl": 0.07235924137785332, "learning_rate": 1.8992857142857142e-07, "loss": 0.0001, "num_tokens": 197796130.0, "reward": -0.0453125, "reward_std": 0.5574873685836792, "rewards/verify_chess_move/mean": -0.0453125, "rewards/verify_chess_move/std": 0.9856555104255676, "step": 2660 }, { "completion_length": 430.6, "completions/clipped_ratio": 0.0, "completions/max_length": 430.6, "completions/max_terminated_length": 430.6, "completions/mean_length": 147.7140625, "completions/mean_terminated_length": 147.7140625, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.002411821454622462, "frac_reward_zero_std": 0.3375, "grad_norm": 0.2031717300415039, "kl": 0.09240497320133727, "learning_rate": 1.9028571428571428e-07, "loss": 0.0001, "num_tokens": 198185524.0, "reward": -0.0359375, "reward_std": 0.6026379466056824, "rewards/verify_chess_move/mean": -0.0359375, "rewards/verify_chess_move/std": 1.0007648229599, "step": 2665 }, { "completion_length": 438.4, "completions/clipped_ratio": 0.0, "completions/max_length": 438.4, "completions/max_terminated_length": 438.4, "completions/mean_length": 138.76953125, "completions/mean_terminated_length": 138.76953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0024163464479707215, "frac_reward_zero_std": 0.35, "grad_norm": 0.10905912518501282, "kl": 0.0966279417516489, "learning_rate": 1.9064285714285714e-07, "loss": 0.0001, "num_tokens": 198559821.0, "reward": 0.0671875, "reward_std": 0.5833827257156372, "rewards/verify_chess_move/mean": 0.0671875, "rewards/verify_chess_move/std": 0.9989222884178162, "step": 2670 }, { "completion_length": 342.2, "completions/clipped_ratio": 0.0, "completions/max_length": 342.2, "completions/max_terminated_length": 342.2, "completions/mean_length": 140.0171875, "completions/mean_terminated_length": 140.0171875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.002420871441318981, "frac_reward_zero_std": 0.3, "grad_norm": 0.36390164494514465, "kl": 0.07598571239650483, "learning_rate": 1.91e-07, "loss": 0.0001, "num_tokens": 198938131.0, "reward": -0.021875, "reward_std": 0.6323562502861023, "rewards/verify_chess_move/mean": -0.021875, "rewards/verify_chess_move/std": 0.9940041422843933, "step": 2675 }, { "completion_length": 388.2, "completions/clipped_ratio": 0.0, "completions/max_length": 388.2, "completions/max_terminated_length": 388.2, "completions/mean_length": 143.10625, "completions/mean_terminated_length": 143.10625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0024253964346672412, "frac_reward_zero_std": 0.38125, "grad_norm": 0.17446011304855347, "kl": 0.029929279201314784, "learning_rate": 1.9135714285714286e-07, "loss": 0.0, "num_tokens": 199320203.0, "reward": -0.0234375, "reward_std": 0.5564542889595032, "rewards/verify_chess_move/mean": -0.0234375, "rewards/verify_chess_move/std": 1.0001853108406067, "step": 2680 }, { "completion_length": 431.6, "completions/clipped_ratio": 0.0, "completions/max_length": 431.6, "completions/max_terminated_length": 431.6, "completions/mean_length": 143.6609375, "completions/mean_terminated_length": 143.6609375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.002429921428015501, "frac_reward_zero_std": 0.33125, "grad_norm": 0.20405372977256775, "kl": 0.05094424001144944, "learning_rate": 1.9171428571428573e-07, "loss": 0.0001, "num_tokens": 199701649.0, "reward": 0.0578125, "reward_std": 0.5919045686721802, "rewards/verify_chess_move/mean": 0.0578125, "rewards/verify_chess_move/std": 0.9887238264083862, "step": 2685 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 145.1421875, "completions/mean_terminated_length": 145.1421875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0024344464213637606, "frac_reward_zero_std": 0.40625, "grad_norm": 0.17931269109249115, "kl": 0.02410589205828728, "learning_rate": 1.9207142857142856e-07, "loss": 0.0, "num_tokens": 200087239.0, "reward": -0.0546875, "reward_std": 0.5335664629936219, "rewards/verify_chess_move/mean": -0.0546875, "rewards/verify_chess_move/std": 0.9943621039390564, "step": 2690 }, { "completion_length": 434.6, "completions/clipped_ratio": 0.0, "completions/max_length": 434.6, "completions/max_terminated_length": 434.6, "completions/mean_length": 134.87890625, "completions/mean_terminated_length": 134.87890625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.00243897141471202, "frac_reward_zero_std": 0.3, "grad_norm": 0.1284271627664566, "kl": 0.024255791356699773, "learning_rate": 1.9242857142857142e-07, "loss": 0.0, "num_tokens": 200457812.0, "reward": 0.0234375, "reward_std": 0.645032799243927, "rewards/verify_chess_move/mean": 0.0234375, "rewards/verify_chess_move/std": 0.9972763538360596, "step": 2695 }, { "completion_length": 413.2, "completions/clipped_ratio": 0.0, "completions/max_length": 413.2, "completions/max_terminated_length": 413.2, "completions/mean_length": 149.865625, "completions/mean_terminated_length": 149.865625, "completions/min_length": 43.4, "completions/min_terminated_length": 43.4, "epoch": 0.0024434964080602803, "frac_reward_zero_std": 0.33125, "grad_norm": 0.22371402382850647, "kl": 0.026011108353122835, "learning_rate": 1.9278571428571428e-07, "loss": 0.0, "num_tokens": 200848536.0, "reward": 0.015625, "reward_std": 0.6004773139953613, "rewards/verify_chess_move/mean": 0.015625, "rewards/verify_chess_move/std": 0.9988304495811462, "step": 2700 }, { "completion_length": 405.8, "completions/clipped_ratio": 0.0, "completions/max_length": 405.8, "completions/max_terminated_length": 405.8, "completions/mean_length": 138.37109375, "completions/mean_terminated_length": 138.37109375, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.00244802140140854, "frac_reward_zero_std": 0.36875, "grad_norm": 0.18731056153774261, "kl": 0.02869696469279006, "learning_rate": 1.9314285714285714e-07, "loss": 0.0, "num_tokens": 201224467.0, "reward": -0.0375, "reward_std": 0.5754953980445862, "rewards/verify_chess_move/mean": -0.0375, "rewards/verify_chess_move/std": 0.9898350954055786, "step": 2705 }, { "completion_length": 362.8, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/max_terminated_length": 362.8, "completions/mean_length": 141.60859375, "completions/mean_terminated_length": 141.60859375, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0024525463947567996, "frac_reward_zero_std": 0.39375, "grad_norm": 0.16130127012729645, "kl": 0.05598037891904824, "learning_rate": 1.935e-07, "loss": 0.0001, "num_tokens": 201606038.0, "reward": -0.0140625, "reward_std": 0.5430408239364624, "rewards/verify_chess_move/mean": -0.0140625, "rewards/verify_chess_move/std": 0.9982327103614808, "step": 2710 }, { "completion_length": 482.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 482.2, "completions/max_terminated_length": 413.2, "completions/mean_length": 149.48125, "completions/mean_terminated_length": 149.0028045654297, "completions/min_length": 42.8, "completions/min_terminated_length": 42.8, "epoch": 0.0024570713881050593, "frac_reward_zero_std": 0.3125, "grad_norm": 0.21296991407871246, "kl": 0.03980097898529493, "learning_rate": 1.9385714285714287e-07, "loss": 0.0, "num_tokens": 201994006.0, "reward": -0.0015625, "reward_std": 0.6411909699440003, "rewards/verify_chess_move/mean": -0.0015625, "rewards/verify_chess_move/std": 0.9971160650253296, "step": 2715 }, { "completion_length": 373.2, "completions/clipped_ratio": 0.0, "completions/max_length": 373.2, "completions/max_terminated_length": 373.2, "completions/mean_length": 140.47109375, "completions/mean_terminated_length": 140.47109375, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0024615963814533194, "frac_reward_zero_std": 0.3375, "grad_norm": 0.18846730887889862, "kl": 0.060577877188916315, "learning_rate": 1.942142857142857e-07, "loss": 0.0001, "num_tokens": 202371113.0, "reward": 0.0640625, "reward_std": 0.596488106250763, "rewards/verify_chess_move/mean": 0.0640625, "rewards/verify_chess_move/std": 0.9964776515960694, "step": 2720 }, { "completion_length": 341.8, "completions/clipped_ratio": 0.0, "completions/max_length": 341.8, "completions/max_terminated_length": 341.8, "completions/mean_length": 146.19296875, "completions/mean_terminated_length": 146.19296875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.002466121374801579, "frac_reward_zero_std": 0.35, "grad_norm": 0.2086666375398636, "kl": 0.05834838872542605, "learning_rate": 1.9457142857142856e-07, "loss": 0.0001, "num_tokens": 202759088.0, "reward": -0.0359375, "reward_std": 0.5819663286209107, "rewards/verify_chess_move/mean": -0.0359375, "rewards/verify_chess_move/std": 0.9982444763183593, "step": 2725 }, { "completion_length": 347.6, "completions/clipped_ratio": 0.0, "completions/max_length": 347.6, "completions/max_terminated_length": 347.6, "completions/mean_length": 147.728125, "completions/mean_terminated_length": 147.728125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0024706463681498387, "frac_reward_zero_std": 0.4, "grad_norm": 0.18475885689258575, "kl": 0.0675810810585972, "learning_rate": 1.9492857142857142e-07, "loss": 0.0001, "num_tokens": 203151028.0, "reward": -0.0625, "reward_std": 0.5471292555332183, "rewards/verify_chess_move/mean": -0.0625, "rewards/verify_chess_move/std": 0.993757140636444, "step": 2730 }, { "completion_length": 367.4, "completions/clipped_ratio": 0.0, "completions/max_length": 367.4, "completions/max_terminated_length": 367.4, "completions/mean_length": 143.5921875, "completions/mean_terminated_length": 143.5921875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.002475171361498099, "frac_reward_zero_std": 0.38125, "grad_norm": 0.2833133637905121, "kl": 0.058094062504824254, "learning_rate": 1.9528571428571429e-07, "loss": 0.0001, "num_tokens": 203534362.0, "reward": -0.04375, "reward_std": 0.5683266997337342, "rewards/verify_chess_move/mean": -0.04375, "rewards/verify_chess_move/std": 0.9992406845092774, "step": 2735 }, { "completion_length": 399.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 146.28828125, "completions/mean_terminated_length": 146.28828125, "completions/min_length": 43.4, "completions/min_terminated_length": 43.4, "epoch": 0.0024796963548463584, "frac_reward_zero_std": 0.34375, "grad_norm": 0.15490157902240753, "kl": 0.049454071387299334, "learning_rate": 1.9564285714285715e-07, "loss": 0.0, "num_tokens": 203921059.0, "reward": -0.003125, "reward_std": 0.5953179597854614, "rewards/verify_chess_move/mean": -0.003125, "rewards/verify_chess_move/std": 0.9919876337051392, "step": 2740 }, { "completion_length": 452.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 452.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 137.25390625, "completions/mean_terminated_length": 136.75524291992187, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "epoch": 0.002484221348194618, "frac_reward_zero_std": 0.36875, "grad_norm": 0.17347609996795654, "kl": 0.049162076583161254, "learning_rate": 1.96e-07, "loss": 0.0, "num_tokens": 204295008.0, "reward": 0.021875, "reward_std": 0.5757085263729096, "rewards/verify_chess_move/mean": 0.021875, "rewards/verify_chess_move/std": 0.9982569932937622, "step": 2745 }, { "completion_length": 357.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 144.8296875, "completions/mean_terminated_length": 144.8296875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0024887463415428778, "frac_reward_zero_std": 0.31875, "grad_norm": 0.3226628005504608, "kl": 0.07286704948637635, "learning_rate": 1.9635714285714287e-07, "loss": 0.0001, "num_tokens": 204681014.0, "reward": -0.059375, "reward_std": 0.6195717334747315, "rewards/verify_chess_move/mean": -0.059375, "rewards/verify_chess_move/std": 0.9940801143646241, "step": 2750 }, { "completion_length": 438.2, "completions/clipped_ratio": 0.0, "completions/max_length": 438.2, "completions/max_terminated_length": 438.2, "completions/mean_length": 136.71328125, "completions/mean_terminated_length": 136.71328125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.002493271334891138, "frac_reward_zero_std": 0.4, "grad_norm": 0.30899307131767273, "kl": 0.09741827514444594, "learning_rate": 1.967142857142857e-07, "loss": 0.0001, "num_tokens": 205054095.0, "reward": 0.025, "reward_std": 0.5345719158649445, "rewards/verify_chess_move/mean": 0.025, "rewards/verify_chess_move/std": 0.9963056921958924, "step": 2755 }, { "completion_length": 370.4, "completions/clipped_ratio": 0.0, "completions/max_length": 370.4, "completions/max_terminated_length": 370.4, "completions/mean_length": 149.7640625, "completions/mean_terminated_length": 149.7640625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0024977963282393975, "frac_reward_zero_std": 0.38125, "grad_norm": 0.5056033730506897, "kl": 0.1271527669217903, "learning_rate": 1.9707142857142857e-07, "loss": 0.0001, "num_tokens": 205447529.0, "reward": -0.084375, "reward_std": 0.5610259890556335, "rewards/verify_chess_move/mean": -0.084375, "rewards/verify_chess_move/std": 0.9974273562431335, "step": 2760 }, { "completion_length": 377.2, "completions/clipped_ratio": 0.0, "completions/max_length": 377.2, "completions/max_terminated_length": 377.2, "completions/mean_length": 139.99765625, "completions/mean_terminated_length": 139.99765625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.002502321321587657, "frac_reward_zero_std": 0.275, "grad_norm": 0.24881166219711304, "kl": 0.11695199709938606, "learning_rate": 1.9742857142857143e-07, "loss": 0.0001, "num_tokens": 205822414.0, "reward": 0.003125, "reward_std": 0.6403034567832947, "rewards/verify_chess_move/mean": 0.003125, "rewards/verify_chess_move/std": 0.9925192832946778, "step": 2765 }, { "completion_length": 373.6, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 143.55390625, "completions/mean_terminated_length": 143.55390625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.002506846314935917, "frac_reward_zero_std": 0.4125, "grad_norm": 0.16854356229305267, "kl": 0.07761681096089887, "learning_rate": 1.977857142857143e-07, "loss": 0.0001, "num_tokens": 206205859.0, "reward": 0.0921875, "reward_std": 0.5171060144901276, "rewards/verify_chess_move/mean": 0.0921875, "rewards/verify_chess_move/std": 0.9928213000297547, "step": 2770 }, { "completion_length": 462.4, "completions/clipped_ratio": 0.0, "completions/max_length": 462.4, "completions/max_terminated_length": 462.4, "completions/mean_length": 138.2171875, "completions/mean_terminated_length": 138.2171875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.002511371308284177, "frac_reward_zero_std": 0.33125, "grad_norm": 0.2315710335969925, "kl": 0.07388390338601311, "learning_rate": 1.9814285714285715e-07, "loss": 0.0001, "num_tokens": 206580153.0, "reward": 0.0109375, "reward_std": 0.6075717329978942, "rewards/verify_chess_move/mean": 0.0109375, "rewards/verify_chess_move/std": 0.9950387597084045, "step": 2775 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 149.23671875, "completions/mean_terminated_length": 149.23671875, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.0025158963016324366, "frac_reward_zero_std": 0.425, "grad_norm": 0.2003902643918991, "kl": 0.04785031103820074, "learning_rate": 1.985e-07, "loss": 0.0, "num_tokens": 206971416.0, "reward": -0.0515625, "reward_std": 0.53204505443573, "rewards/verify_chess_move/mean": -0.0515625, "rewards/verify_chess_move/std": 0.9920813560485839, "step": 2780 }, { "completion_length": 365.2, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 146.78515625, "completions/mean_terminated_length": 146.78515625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0025204212949806962, "frac_reward_zero_std": 0.35, "grad_norm": 0.18888838589191437, "kl": 0.042442763596773146, "learning_rate": 1.9885714285714285e-07, "loss": 0.0, "num_tokens": 207360189.0, "reward": -0.09375, "reward_std": 0.5835948944091797, "rewards/verify_chess_move/mean": -0.09375, "rewards/verify_chess_move/std": 0.9766398906707764, "step": 2785 }, { "completion_length": 397.2, "completions/clipped_ratio": 0.0, "completions/max_length": 397.2, "completions/max_terminated_length": 397.2, "completions/mean_length": 139.19453125, "completions/mean_terminated_length": 139.19453125, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.0025249462883289563, "frac_reward_zero_std": 0.3875, "grad_norm": 0.15699411928653717, "kl": 0.030140263588691596, "learning_rate": 1.992142857142857e-07, "loss": 0.0, "num_tokens": 207734614.0, "reward": 0.015625, "reward_std": 0.5556639552116394, "rewards/verify_chess_move/mean": 0.015625, "rewards/verify_chess_move/std": 0.9949331641197204, "step": 2790 }, { "completion_length": 393.4, "completions/clipped_ratio": 0.0, "completions/max_length": 393.4, "completions/max_terminated_length": 393.4, "completions/mean_length": 138.615625, "completions/mean_terminated_length": 138.615625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.002529471281677216, "frac_reward_zero_std": 0.375, "grad_norm": 0.24698932468891144, "kl": 0.03475859792233678, "learning_rate": 1.9957142857142857e-07, "loss": 0.0, "num_tokens": 208110434.0, "reward": 0.0609375, "reward_std": 0.5677530169487, "rewards/verify_chess_move/mean": 0.0609375, "rewards/verify_chess_move/std": 0.9970146059989929, "step": 2795 }, { "completion_length": 392.2, "completions/clipped_ratio": 0.0, "completions/max_length": 392.2, "completions/max_terminated_length": 392.2, "completions/mean_length": 151.01015625, "completions/mean_terminated_length": 151.01015625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0025339962750254756, "frac_reward_zero_std": 0.3, "grad_norm": 0.20458786189556122, "kl": 0.02387430270173354, "learning_rate": 1.9992857142857143e-07, "loss": 0.0, "num_tokens": 208504655.0, "reward": -0.0765625, "reward_std": 0.6430797338485718, "rewards/verify_chess_move/mean": -0.0765625, "rewards/verify_chess_move/std": 0.9914902329444886, "step": 2800 }, { "completion_length": 369.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 144.4171875, "completions/mean_terminated_length": 144.4171875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0025385212683737353, "frac_reward_zero_std": 0.36875, "grad_norm": 0.17395378649234772, "kl": 0.022817260175361297, "learning_rate": 2.002857142857143e-07, "loss": 0.0, "num_tokens": 208889341.0, "reward": -0.034375, "reward_std": 0.5698628544807434, "rewards/verify_chess_move/mean": -0.034375, "rewards/verify_chess_move/std": 0.9924721479415893, "step": 2805 }, { "completion_length": 446.4, "completions/clipped_ratio": 0.0, "completions/max_length": 446.4, "completions/max_terminated_length": 446.4, "completions/mean_length": 143.18203125, "completions/mean_terminated_length": 143.18203125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0025430462617219954, "frac_reward_zero_std": 0.35625, "grad_norm": 0.2596970498561859, "kl": 0.01595851122328895, "learning_rate": 2.0064285714285715e-07, "loss": 0.0, "num_tokens": 209269062.0, "reward": -0.0078125, "reward_std": 0.5725548028945923, "rewards/verify_chess_move/mean": -0.0078125, "rewards/verify_chess_move/std": 0.9964093089103698, "step": 2810 }, { "completion_length": 505.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 505.0, "completions/max_terminated_length": 415.6, "completions/mean_length": 132.24765625, "completions/mean_terminated_length": 131.75084228515624, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.002547571255070255, "frac_reward_zero_std": 0.38125, "grad_norm": 0.13862615823745728, "kl": 0.016627658079960382, "learning_rate": 2.01e-07, "loss": 0.0, "num_tokens": 209633459.0, "reward": 0.1109375, "reward_std": 0.5748088836669922, "rewards/verify_chess_move/mean": 0.1109375, "rewards/verify_chess_move/std": 0.9938464999198914, "step": 2815 }, { "completion_length": 377.8, "completions/clipped_ratio": 0.0, "completions/max_length": 377.8, "completions/max_terminated_length": 377.8, "completions/mean_length": 140.88125, "completions/mean_terminated_length": 140.88125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0025520962484185147, "frac_reward_zero_std": 0.35, "grad_norm": 0.20981717109680176, "kl": 0.02025348859024234, "learning_rate": 2.0135714285714285e-07, "loss": 0.0, "num_tokens": 210011275.0, "reward": 0.0484375, "reward_std": 0.5818055331707, "rewards/verify_chess_move/mean": 0.0484375, "rewards/verify_chess_move/std": 0.9985977649688721, "step": 2820 }, { "completion_length": 372.2, "completions/clipped_ratio": 0.0, "completions/max_length": 372.2, "completions/max_terminated_length": 372.2, "completions/mean_length": 135.33359375, "completions/mean_terminated_length": 135.33359375, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.002556621241766775, "frac_reward_zero_std": 0.3375, "grad_norm": 0.14590561389923096, "kl": 0.033414466273097784, "learning_rate": 2.017142857142857e-07, "loss": 0.0, "num_tokens": 210381638.0, "reward": -0.0515625, "reward_std": 0.6126241207122802, "rewards/verify_chess_move/mean": -0.0515625, "rewards/verify_chess_move/std": 0.9958585500717163, "step": 2825 }, { "completion_length": 410.4, "completions/clipped_ratio": 0.0, "completions/max_length": 410.4, "completions/max_terminated_length": 410.4, "completions/mean_length": 142.81953125, "completions/mean_terminated_length": 142.81953125, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0025611462351150345, "frac_reward_zero_std": 0.2625, "grad_norm": 0.17935006320476532, "kl": 0.04466946073880536, "learning_rate": 2.0207142857142857e-07, "loss": 0.0, "num_tokens": 210759375.0, "reward": 0.0578125, "reward_std": 0.6867010951042175, "rewards/verify_chess_move/mean": 0.0578125, "rewards/verify_chess_move/std": 0.9966297507286072, "step": 2830 }, { "completion_length": 352.4, "completions/clipped_ratio": 0.0, "completions/max_length": 352.4, "completions/max_terminated_length": 352.4, "completions/mean_length": 142.2640625, "completions/mean_terminated_length": 142.2640625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.002565671228463294, "frac_reward_zero_std": 0.33125, "grad_norm": 0.17951402068138123, "kl": 0.09587420194147853, "learning_rate": 2.0242857142857143e-07, "loss": 0.0001, "num_tokens": 211140945.0, "reward": -0.1578125, "reward_std": 0.6077383875846862, "rewards/verify_chess_move/mean": -0.1578125, "rewards/verify_chess_move/std": 0.9828914523124694, "step": 2835 }, { "completion_length": 424.8, "completions/clipped_ratio": 0.0, "completions/max_length": 424.8, "completions/max_terminated_length": 424.8, "completions/mean_length": 144.83515625, "completions/mean_terminated_length": 144.83515625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.002570196221811554, "frac_reward_zero_std": 0.33125, "grad_norm": 0.16370247304439545, "kl": 0.04532385691054515, "learning_rate": 2.027857142857143e-07, "loss": 0.0, "num_tokens": 211525734.0, "reward": 0.0640625, "reward_std": 0.5949678838253021, "rewards/verify_chess_move/mean": 0.0640625, "rewards/verify_chess_move/std": 0.9929860949516296, "step": 2840 }, { "completion_length": 450.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 144.4609375, "completions/mean_terminated_length": 144.4609375, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.002574721215159814, "frac_reward_zero_std": 0.4, "grad_norm": 0.17820541560649872, "kl": 0.0747794393770164, "learning_rate": 2.0314285714285715e-07, "loss": 0.0001, "num_tokens": 211911692.0, "reward": -0.134375, "reward_std": 0.5572375774383544, "rewards/verify_chess_move/mean": -0.134375, "rewards/verify_chess_move/std": 0.990887713432312, "step": 2845 }, { "completion_length": 424.4, "completions/clipped_ratio": 0.0, "completions/max_length": 424.4, "completions/max_terminated_length": 424.4, "completions/mean_length": 140.440625, "completions/mean_terminated_length": 140.440625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0025792462085080735, "frac_reward_zero_std": 0.35625, "grad_norm": 0.1966959834098816, "kl": 0.06008198050549254, "learning_rate": 2.035e-07, "loss": 0.0001, "num_tokens": 212289936.0, "reward": 0.0234375, "reward_std": 0.5904346287250519, "rewards/verify_chess_move/mean": 0.0234375, "rewards/verify_chess_move/std": 0.99773188829422, "step": 2850 }, { "completion_length": 329.8, "completions/clipped_ratio": 0.0, "completions/max_length": 329.8, "completions/max_terminated_length": 329.8, "completions/mean_length": 140.878125, "completions/mean_terminated_length": 140.878125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.002583771201856333, "frac_reward_zero_std": 0.40625, "grad_norm": 0.18217605352401733, "kl": 0.07074110386165558, "learning_rate": 2.0385714285714285e-07, "loss": 0.0001, "num_tokens": 212671052.0, "reward": -0.0421875, "reward_std": 0.538563483953476, "rewards/verify_chess_move/mean": -0.0421875, "rewards/verify_chess_move/std": 0.9969262957572937, "step": 2855 }, { "completion_length": 428.6, "completions/clipped_ratio": 0.0, "completions/max_length": 428.6, "completions/max_terminated_length": 428.6, "completions/mean_length": 129.54453125, "completions/mean_terminated_length": 129.54453125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.002588296195204593, "frac_reward_zero_std": 0.3875, "grad_norm": 0.12653885781764984, "kl": 0.053352383107994686, "learning_rate": 2.042142857142857e-07, "loss": 0.0001, "num_tokens": 213031109.0, "reward": -0.00625, "reward_std": 0.5568167805671692, "rewards/verify_chess_move/mean": -0.00625, "rewards/verify_chess_move/std": 0.9993711829185485, "step": 2860 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.0, "completions/max_length": 367.8, "completions/max_terminated_length": 367.8, "completions/mean_length": 142.784375, "completions/mean_terminated_length": 142.784375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.002592821188552853, "frac_reward_zero_std": 0.3375, "grad_norm": 0.18236978352069855, "kl": 0.04436097483994672, "learning_rate": 2.0457142857142857e-07, "loss": 0.0, "num_tokens": 213411553.0, "reward": 0.0234375, "reward_std": 0.599902069568634, "rewards/verify_chess_move/mean": 0.0234375, "rewards/verify_chess_move/std": 0.9892151474952697, "step": 2865 }, { "completion_length": 329.6, "completions/clipped_ratio": 0.0, "completions/max_length": 329.6, "completions/max_terminated_length": 329.6, "completions/mean_length": 142.328125, "completions/mean_terminated_length": 142.328125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0025973461819011126, "frac_reward_zero_std": 0.33125, "grad_norm": 0.2387070506811142, "kl": 0.039315757786971516, "learning_rate": 2.0492857142857143e-07, "loss": 0.0, "num_tokens": 213794173.0, "reward": -0.00625, "reward_std": 0.6053115844726562, "rewards/verify_chess_move/mean": -0.00625, "rewards/verify_chess_move/std": 1.001652979850769, "step": 2870 }, { "completion_length": 418.2, "completions/clipped_ratio": 0.0, "completions/max_length": 418.2, "completions/max_terminated_length": 418.2, "completions/mean_length": 148.59375, "completions/mean_terminated_length": 148.59375, "completions/min_length": 41.4, "completions/min_terminated_length": 41.4, "epoch": 0.0026018711752493723, "frac_reward_zero_std": 0.39375, "grad_norm": 0.20256061851978302, "kl": 0.03846399152826052, "learning_rate": 2.052857142857143e-07, "loss": 0.0, "num_tokens": 214184269.0, "reward": -0.084375, "reward_std": 0.5459869027137756, "rewards/verify_chess_move/mean": -0.084375, "rewards/verify_chess_move/std": 0.988305675983429, "step": 2875 }, { "completion_length": 389.2, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/max_terminated_length": 389.2, "completions/mean_length": 134.93828125, "completions/mean_terminated_length": 134.93828125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0026063961685976324, "frac_reward_zero_std": 0.34375, "grad_norm": 0.174128457903862, "kl": 0.06979416034882888, "learning_rate": 2.0564285714285716e-07, "loss": 0.0001, "num_tokens": 214554142.0, "reward": 0.0359375, "reward_std": 0.5788708806037903, "rewards/verify_chess_move/mean": 0.0359375, "rewards/verify_chess_move/std": 0.9891647338867188, "step": 2880 }, { "completion_length": 404.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 135.82109375, "completions/mean_terminated_length": 135.82109375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.002610921161945892, "frac_reward_zero_std": 0.4125, "grad_norm": 0.3268451988697052, "kl": 0.06669816772773629, "learning_rate": 2.06e-07, "loss": 0.0001, "num_tokens": 214926985.0, "reward": -0.040625, "reward_std": 0.536569881439209, "rewards/verify_chess_move/mean": -0.040625, "rewards/verify_chess_move/std": 0.9887813091278076, "step": 2885 }, { "completion_length": 422.2, "completions/clipped_ratio": 0.0, "completions/max_length": 422.2, "completions/max_terminated_length": 422.2, "completions/mean_length": 140.0234375, "completions/mean_terminated_length": 140.0234375, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0026154461552941517, "frac_reward_zero_std": 0.3625, "grad_norm": 0.16672419011592865, "kl": 0.07627755088469712, "learning_rate": 2.0635714285714285e-07, "loss": 0.0001, "num_tokens": 215304015.0, "reward": 0.1171875, "reward_std": 0.5779113411903382, "rewards/verify_chess_move/mean": 0.1171875, "rewards/verify_chess_move/std": 0.9915220141410828, "step": 2890 }, { "completion_length": 374.8, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 141.821875, "completions/mean_terminated_length": 141.821875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0026199711486424113, "frac_reward_zero_std": 0.4, "grad_norm": 0.26435720920562744, "kl": 0.08524454001744744, "learning_rate": 2.0671428571428571e-07, "loss": 0.0001, "num_tokens": 215686795.0, "reward": -0.059375, "reward_std": 0.5438780426979065, "rewards/verify_chess_move/mean": -0.059375, "rewards/verify_chess_move/std": 0.9950489521026611, "step": 2895 }, { "completion_length": 390.8, "completions/clipped_ratio": 0.0, "completions/max_length": 390.8, "completions/max_terminated_length": 390.8, "completions/mean_length": 142.075, "completions/mean_terminated_length": 142.075, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.0026244961419906714, "frac_reward_zero_std": 0.36875, "grad_norm": 0.19306524097919464, "kl": 0.07979007469984936, "learning_rate": 2.0707142857142858e-07, "loss": 0.0001, "num_tokens": 216067747.0, "reward": -0.0359375, "reward_std": 0.5660216212272644, "rewards/verify_chess_move/mean": -0.0359375, "rewards/verify_chess_move/std": 0.9926186203956604, "step": 2900 }, { "completion_length": 387.4, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/max_terminated_length": 387.4, "completions/mean_length": 139.3984375, "completions/mean_terminated_length": 139.3984375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.002629021135338931, "frac_reward_zero_std": 0.38125, "grad_norm": 0.12943032383918762, "kl": 0.06389008084952366, "learning_rate": 2.0742857142857144e-07, "loss": 0.0001, "num_tokens": 216444265.0, "reward": 0.034375, "reward_std": 0.5380948066711426, "rewards/verify_chess_move/mean": 0.034375, "rewards/verify_chess_move/std": 0.9947614312171936, "step": 2905 }, { "completion_length": 408.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 148.8875, "completions/mean_terminated_length": 148.8875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.0026335461286871907, "frac_reward_zero_std": 0.4125, "grad_norm": 0.2849118709564209, "kl": 0.04444600228889613, "learning_rate": 2.077857142857143e-07, "loss": 0.0, "num_tokens": 216836649.0, "reward": 0.021875, "reward_std": 0.5389259696006775, "rewards/verify_chess_move/mean": 0.021875, "rewards/verify_chess_move/std": 0.9968002200126648, "step": 2910 }, { "completion_length": 395.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 134.07109375, "completions/mean_terminated_length": 134.07109375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0026380711220354504, "frac_reward_zero_std": 0.2875, "grad_norm": 0.19844554364681244, "kl": 0.06021101249498315, "learning_rate": 2.0814285714285713e-07, "loss": 0.0001, "num_tokens": 217205780.0, "reward": 0.05625, "reward_std": 0.6563953280448913, "rewards/verify_chess_move/mean": 0.05625, "rewards/verify_chess_move/std": 0.9973775625228882, "step": 2915 }, { "completion_length": 363.8, "completions/clipped_ratio": 0.0, "completions/max_length": 363.8, "completions/max_terminated_length": 363.8, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0026425961153837105, "frac_reward_zero_std": 0.36875, "grad_norm": 0.16116921603679657, "kl": 0.06553687562700361, "learning_rate": 2.085e-07, "loss": 0.0001, "num_tokens": 217592092.0, "reward": -0.0625, "reward_std": 0.5671289443969727, "rewards/verify_chess_move/mean": -0.0625, "rewards/verify_chess_move/std": 0.9991269588470459, "step": 2920 }, { "completion_length": 365.8, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 134.8640625, "completions/mean_terminated_length": 134.8640625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.00264712110873197, "frac_reward_zero_std": 0.25, "grad_norm": 0.39239028096199036, "kl": 0.12853862479241798, "learning_rate": 2.0885714285714286e-07, "loss": 0.0001, "num_tokens": 217960230.0, "reward": -0.003125, "reward_std": 0.6961066603660584, "rewards/verify_chess_move/mean": -0.003125, "rewards/verify_chess_move/std": 0.9974455118179322, "step": 2925 }, { "completion_length": 346.8, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/max_terminated_length": 346.8, "completions/mean_length": 146.17890625, "completions/mean_terminated_length": 146.17890625, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.00265164610208023, "frac_reward_zero_std": 0.3, "grad_norm": 0.21026825904846191, "kl": 0.0873032370058354, "learning_rate": 2.0921428571428572e-07, "loss": 0.0001, "num_tokens": 218345843.0, "reward": 0.14375, "reward_std": 0.6343112230300904, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.989188802242279, "step": 2930 }, { "completion_length": 356.6, "completions/clipped_ratio": 0.0, "completions/max_length": 356.6, "completions/max_terminated_length": 356.6, "completions/mean_length": 145.51015625, "completions/mean_terminated_length": 145.51015625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.00265617109542849, "frac_reward_zero_std": 0.33125, "grad_norm": 0.16753827035427094, "kl": 0.10922853374795523, "learning_rate": 2.0957142857142858e-07, "loss": 0.0001, "num_tokens": 218734288.0, "reward": -0.021875, "reward_std": 0.5965925216674804, "rewards/verify_chess_move/mean": -0.021875, "rewards/verify_chess_move/std": 0.9952378034591675, "step": 2935 }, { "completion_length": 389.8, "completions/clipped_ratio": 0.0, "completions/max_length": 389.8, "completions/max_terminated_length": 389.8, "completions/mean_length": 138.9546875, "completions/mean_terminated_length": 138.9546875, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.0026606960887767496, "frac_reward_zero_std": 0.41875, "grad_norm": 0.14352142810821533, "kl": 0.07111701992835151, "learning_rate": 2.0992857142857144e-07, "loss": 0.0001, "num_tokens": 219108966.0, "reward": 0.028125, "reward_std": 0.5220431387424469, "rewards/verify_chess_move/mean": 0.028125, "rewards/verify_chess_move/std": 0.9907069683074952, "step": 2940 }, { "completion_length": 451.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 451.4, "completions/max_terminated_length": 361.0, "completions/mean_length": 141.2796875, "completions/mean_terminated_length": 140.78466796875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0026652210821250092, "frac_reward_zero_std": 0.3375, "grad_norm": 0.15334859490394592, "kl": 0.029485700624354648, "learning_rate": 2.102857142857143e-07, "loss": 0.0, "num_tokens": 219489604.0, "reward": 0.05625, "reward_std": 0.5887522757053375, "rewards/verify_chess_move/mean": 0.05625, "rewards/verify_chess_move/std": 0.9983514189720154, "step": 2945 }, { "completion_length": 386.2, "completions/clipped_ratio": 0.0, "completions/max_length": 386.2, "completions/max_terminated_length": 386.2, "completions/mean_length": 136.5484375, "completions/mean_terminated_length": 136.5484375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.002669746075473269, "frac_reward_zero_std": 0.46875, "grad_norm": 0.17584915459156036, "kl": 0.01874641769682057, "learning_rate": 2.106428571428571e-07, "loss": 0.0, "num_tokens": 219862650.0, "reward": -0.015625, "reward_std": 0.4926942527294159, "rewards/verify_chess_move/mean": -0.015625, "rewards/verify_chess_move/std": 0.99698566198349, "step": 2950 }, { "completion_length": 407.8, "completions/clipped_ratio": 0.0, "completions/max_length": 407.8, "completions/max_terminated_length": 407.8, "completions/mean_length": 144.3359375, "completions/mean_terminated_length": 144.3359375, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.002674271068821529, "frac_reward_zero_std": 0.3375, "grad_norm": 0.12964916229248047, "kl": 0.020956725762516726, "learning_rate": 2.1099999999999997e-07, "loss": 0.0, "num_tokens": 220246336.0, "reward": 0.0359375, "reward_std": 0.593068253993988, "rewards/verify_chess_move/mean": 0.0359375, "rewards/verify_chess_move/std": 0.9999672651290894, "step": 2955 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.0, "completions/max_length": 457.4, "completions/max_terminated_length": 457.4, "completions/mean_length": 142.3, "completions/mean_terminated_length": 142.3, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0026787960621697886, "frac_reward_zero_std": 0.35, "grad_norm": 0.1739894449710846, "kl": 0.017766965735063422, "learning_rate": 2.1135714285714286e-07, "loss": 0.0, "num_tokens": 220626856.0, "reward": 0.05, "reward_std": 0.5726127028465271, "rewards/verify_chess_move/mean": 0.05, "rewards/verify_chess_move/std": 0.9914204835891723, "step": 2960 }, { "completion_length": 389.8, "completions/clipped_ratio": 0.0, "completions/max_length": 389.8, "completions/max_terminated_length": 389.8, "completions/mean_length": 140.85859375, "completions/mean_terminated_length": 140.85859375, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0026833210555180483, "frac_reward_zero_std": 0.4, "grad_norm": 0.18998293578624725, "kl": 0.01504813435167307, "learning_rate": 2.1171428571428572e-07, "loss": 0.0, "num_tokens": 221007731.0, "reward": -0.0375, "reward_std": 0.5375170171260834, "rewards/verify_chess_move/mean": -0.0375, "rewards/verify_chess_move/std": 0.9988823890686035, "step": 2965 }, { "completion_length": 370.4, "completions/clipped_ratio": 0.0, "completions/max_length": 370.4, "completions/max_terminated_length": 370.4, "completions/mean_length": 142.1125, "completions/mean_terminated_length": 142.1125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0026878460488663084, "frac_reward_zero_std": 0.38125, "grad_norm": 0.17969419062137604, "kl": 0.010800863977783593, "learning_rate": 2.1207142857142858e-07, "loss": 0.0, "num_tokens": 221389691.0, "reward": -0.0421875, "reward_std": 0.5646521270275116, "rewards/verify_chess_move/mean": -0.0421875, "rewards/verify_chess_move/std": 0.9993422269821167, "step": 2970 }, { "completion_length": 419.4, "completions/clipped_ratio": 0.0, "completions/max_length": 419.4, "completions/max_terminated_length": 419.4, "completions/mean_length": 144.740625, "completions/mean_terminated_length": 144.740625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.002692371042214568, "frac_reward_zero_std": 0.33125, "grad_norm": 0.18455561995506287, "kl": 0.011072560973116197, "learning_rate": 2.1242857142857144e-07, "loss": 0.0, "num_tokens": 221776567.0, "reward": -0.034375, "reward_std": 0.6030019998550415, "rewards/verify_chess_move/mean": -0.034375, "rewards/verify_chess_move/std": 0.9827834606170655, "step": 2975 }, { "completion_length": 354.8, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/max_terminated_length": 354.8, "completions/mean_length": 136.59375, "completions/mean_terminated_length": 136.59375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0026968960355628277, "frac_reward_zero_std": 0.3, "grad_norm": 0.203921377658844, "kl": 0.013390718854498117, "learning_rate": 2.1278571428571425e-07, "loss": 0.0, "num_tokens": 222146751.0, "reward": 0.015625, "reward_std": 0.6399941980838776, "rewards/verify_chess_move/mean": 0.015625, "rewards/verify_chess_move/std": 0.9964565992355346, "step": 2980 }, { "completion_length": 344.6, "completions/clipped_ratio": 0.0, "completions/max_length": 344.6, "completions/max_terminated_length": 344.6, "completions/mean_length": 140.1515625, "completions/mean_terminated_length": 140.1515625, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.0027014210289110874, "frac_reward_zero_std": 0.34375, "grad_norm": 0.23696918785572052, "kl": 0.02167613956844434, "learning_rate": 2.131428571428571e-07, "loss": 0.0, "num_tokens": 222524689.0, "reward": -0.0390625, "reward_std": 0.5936933279037475, "rewards/verify_chess_move/mean": -0.0390625, "rewards/verify_chess_move/std": 0.9960961818695069, "step": 2985 }, { "completion_length": 417.4, "completions/clipped_ratio": 0.0, "completions/max_length": 417.4, "completions/max_terminated_length": 417.4, "completions/mean_length": 131.6078125, "completions/mean_terminated_length": 131.6078125, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0027059460222593474, "frac_reward_zero_std": 0.36875, "grad_norm": 0.2401231825351715, "kl": 0.059703217800415584, "learning_rate": 2.1349999999999997e-07, "loss": 0.0001, "num_tokens": 222888355.0, "reward": 0.0515625, "reward_std": 0.5783697366714478, "rewards/verify_chess_move/mean": 0.0515625, "rewards/verify_chess_move/std": 0.9890750527381897, "step": 2990 }, { "completion_length": 345.2, "completions/clipped_ratio": 0.0, "completions/max_length": 345.2, "completions/max_terminated_length": 345.2, "completions/mean_length": 134.853125, "completions/mean_terminated_length": 134.853125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.002710471015607607, "frac_reward_zero_std": 0.36875, "grad_norm": 0.24345435202121735, "kl": 0.09164392510574544, "learning_rate": 2.1385714285714283e-07, "loss": 0.0001, "num_tokens": 223260239.0, "reward": 0.0578125, "reward_std": 0.5611912786960602, "rewards/verify_chess_move/mean": 0.0578125, "rewards/verify_chess_move/std": 0.993207323551178, "step": 2995 }, { "completion_length": 436.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.4, "completions/max_terminated_length": 417.0, "completions/mean_length": 144.884375, "completions/mean_terminated_length": 144.39236450195312, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0027149960089558668, "frac_reward_zero_std": 0.375, "grad_norm": 0.2187476009130478, "kl": 0.12055416969233193, "learning_rate": 2.142142857142857e-07, "loss": 0.0001, "num_tokens": 223645075.0, "reward": -0.0328125, "reward_std": 0.5599775791168213, "rewards/verify_chess_move/mean": -0.0328125, "rewards/verify_chess_move/std": 1.00100998878479, "step": 3000 }, { "completion_length": 350.8, "completions/clipped_ratio": 0.0, "completions/max_length": 350.8, "completions/max_terminated_length": 350.8, "completions/mean_length": 138.73671875, "completions/mean_terminated_length": 138.73671875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0027195210023041264, "frac_reward_zero_std": 0.41875, "grad_norm": 0.1891598105430603, "kl": 0.1324120300647337, "learning_rate": 2.1457142857142856e-07, "loss": 0.0001, "num_tokens": 224022730.0, "reward": 0.0359375, "reward_std": 0.5232221841812134, "rewards/verify_chess_move/mean": 0.0359375, "rewards/verify_chess_move/std": 0.9987273573875427, "step": 3005 }, { "completion_length": 405.2, "completions/clipped_ratio": 0.0, "completions/max_length": 405.2, "completions/max_terminated_length": 405.2, "completions/mean_length": 139.72109375, "completions/mean_terminated_length": 139.72109375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0027240459956523865, "frac_reward_zero_std": 0.41875, "grad_norm": 0.24805189669132233, "kl": 0.1077723662747303, "learning_rate": 2.1492857142857144e-07, "loss": 0.0001, "num_tokens": 224402037.0, "reward": -0.0359375, "reward_std": 0.5258843779563904, "rewards/verify_chess_move/mean": -0.0359375, "rewards/verify_chess_move/std": 0.9929163098335266, "step": 3010 }, { "completion_length": 391.8, "completions/clipped_ratio": 0.0, "completions/max_length": 391.8, "completions/max_terminated_length": 391.8, "completions/mean_length": 149.91953125, "completions/mean_terminated_length": 149.91953125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.002728570989000646, "frac_reward_zero_std": 0.39375, "grad_norm": 0.18304993212223053, "kl": 0.06489462577737867, "learning_rate": 2.1528571428571425e-07, "loss": 0.0001, "num_tokens": 224797334.0, "reward": -0.0234375, "reward_std": 0.5394602000713349, "rewards/verify_chess_move/mean": -0.0234375, "rewards/verify_chess_move/std": 0.9980499505996704, "step": 3015 }, { "completion_length": 355.2, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 133.03203125, "completions/mean_terminated_length": 133.03203125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.002733095982348906, "frac_reward_zero_std": 0.33125, "grad_norm": 0.28365835547447205, "kl": 0.08090295134461485, "learning_rate": 2.1564285714285711e-07, "loss": 0.0001, "num_tokens": 225165319.0, "reward": 0.01875, "reward_std": 0.6111068487167358, "rewards/verify_chess_move/mean": 0.01875, "rewards/verify_chess_move/std": 0.9912403106689454, "step": 3020 }, { "completion_length": 364.6, "completions/clipped_ratio": 0.0, "completions/max_length": 364.6, "completions/max_terminated_length": 364.6, "completions/mean_length": 142.290625, "completions/mean_terminated_length": 142.290625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.002737620975697166, "frac_reward_zero_std": 0.35625, "grad_norm": 0.18517979979515076, "kl": 0.023159492699778638, "learning_rate": 2.1599999999999998e-07, "loss": 0.0, "num_tokens": 225545683.0, "reward": -0.0453125, "reward_std": 0.5871136605739593, "rewards/verify_chess_move/mean": -0.0453125, "rewards/verify_chess_move/std": 0.9950727820396423, "step": 3025 }, { "completion_length": 387.8, "completions/clipped_ratio": 0.0, "completions/max_length": 387.8, "completions/max_terminated_length": 387.8, "completions/mean_length": 140.11484375, "completions/mean_terminated_length": 140.11484375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0027421459690454256, "frac_reward_zero_std": 0.3, "grad_norm": 0.2585054934024811, "kl": 0.02625478764093714, "learning_rate": 2.1635714285714284e-07, "loss": 0.0, "num_tokens": 225923070.0, "reward": 0.021875, "reward_std": 0.6407672047615052, "rewards/verify_chess_move/mean": 0.021875, "rewards/verify_chess_move/std": 0.9993953108787537, "step": 3030 }, { "completion_length": 402.4, "completions/clipped_ratio": 0.0, "completions/max_length": 402.4, "completions/max_terminated_length": 402.4, "completions/mean_length": 140.02578125, "completions/mean_terminated_length": 140.02578125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0027466709623936852, "frac_reward_zero_std": 0.3375, "grad_norm": 0.13793717324733734, "kl": 0.014745806399878348, "learning_rate": 2.167142857142857e-07, "loss": 0.0, "num_tokens": 226300503.0, "reward": 0.0328125, "reward_std": 0.6013164937496185, "rewards/verify_chess_move/mean": 0.0328125, "rewards/verify_chess_move/std": 0.9986316084861755, "step": 3035 }, { "completion_length": 380.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 140.41171875, "completions/mean_terminated_length": 140.41171875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.002751195955741945, "frac_reward_zero_std": 0.3375, "grad_norm": 0.15842267870903015, "kl": 0.016224302038608585, "learning_rate": 2.1707142857142856e-07, "loss": 0.0, "num_tokens": 226678518.0, "reward": 0.10625, "reward_std": 0.5827012658119202, "rewards/verify_chess_move/mean": 0.10625, "rewards/verify_chess_move/std": 0.9939791917800903, "step": 3040 }, { "completion_length": 356.4, "completions/clipped_ratio": 0.0, "completions/max_length": 356.4, "completions/max_terminated_length": 356.4, "completions/mean_length": 138.37890625, "completions/mean_terminated_length": 138.37890625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.002755720949090205, "frac_reward_zero_std": 0.35, "grad_norm": 0.19781652092933655, "kl": 0.018781442166073248, "learning_rate": 2.174285714285714e-07, "loss": 0.0, "num_tokens": 227053387.0, "reward": 0.078125, "reward_std": 0.586584496498108, "rewards/verify_chess_move/mean": 0.078125, "rewards/verify_chess_move/std": 0.9908719420433044, "step": 3045 }, { "completion_length": 420.8, "completions/clipped_ratio": 0.0, "completions/max_length": 420.8, "completions/max_terminated_length": 420.8, "completions/mean_length": 140.0546875, "completions/mean_terminated_length": 140.0546875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0027602459424384647, "frac_reward_zero_std": 0.34375, "grad_norm": 0.12847565114498138, "kl": 0.02135376266669482, "learning_rate": 2.1778571428571426e-07, "loss": 0.0, "num_tokens": 227429465.0, "reward": 0.08125, "reward_std": 0.5854441106319428, "rewards/verify_chess_move/mean": 0.08125, "rewards/verify_chess_move/std": 0.9903626322746277, "step": 3050 }, { "completion_length": 375.8, "completions/clipped_ratio": 0.0, "completions/max_length": 375.8, "completions/max_terminated_length": 375.8, "completions/mean_length": 131.1015625, "completions/mean_terminated_length": 131.1015625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0027647709357867243, "frac_reward_zero_std": 0.3625, "grad_norm": 0.19002167880535126, "kl": 0.046779983464512044, "learning_rate": 2.1814285714285712e-07, "loss": 0.0, "num_tokens": 227793859.0, "reward": 0.1125, "reward_std": 0.5729133486747742, "rewards/verify_chess_move/mean": 0.1125, "rewards/verify_chess_move/std": 0.9926398396492004, "step": 3055 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 129.934375, "completions/mean_terminated_length": 129.934375, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.002769295929134984, "frac_reward_zero_std": 0.35, "grad_norm": 0.17588397860527039, "kl": 0.07308277377596824, "learning_rate": 2.1849999999999998e-07, "loss": 0.0001, "num_tokens": 228157367.0, "reward": 0.053125, "reward_std": 0.5672398209571838, "rewards/verify_chess_move/mean": 0.053125, "rewards/verify_chess_move/std": 0.9951992869377136, "step": 3060 }, { "completion_length": 364.4, "completions/clipped_ratio": 0.0, "completions/max_length": 364.4, "completions/max_terminated_length": 364.4, "completions/mean_length": 140.02265625, "completions/mean_terminated_length": 140.02265625, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.002773820922483244, "frac_reward_zero_std": 0.35, "grad_norm": 0.5620772242546082, "kl": 0.0898710622350336, "learning_rate": 2.1885714285714284e-07, "loss": 0.0001, "num_tokens": 228536332.0, "reward": 0.0234375, "reward_std": 0.594314444065094, "rewards/verify_chess_move/mean": 0.0234375, "rewards/verify_chess_move/std": 0.9954816699028015, "step": 3065 }, { "completion_length": 448.2, "completions/clipped_ratio": 0.0, "completions/max_length": 448.2, "completions/max_terminated_length": 448.2, "completions/mean_length": 139.5078125, "completions/mean_terminated_length": 139.5078125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0027783459158315037, "frac_reward_zero_std": 0.35625, "grad_norm": 0.1805059015750885, "kl": 0.1290460809104843, "learning_rate": 2.192142857142857e-07, "loss": 0.0001, "num_tokens": 228912638.0, "reward": 0.025, "reward_std": 0.5705503523349762, "rewards/verify_chess_move/mean": 0.025, "rewards/verify_chess_move/std": 0.9949711561203003, "step": 3070 }, { "completion_length": 352.4, "completions/clipped_ratio": 0.0, "completions/max_length": 352.4, "completions/max_terminated_length": 352.4, "completions/mean_length": 134.425, "completions/mean_terminated_length": 134.425, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0027828709091797634, "frac_reward_zero_std": 0.30625, "grad_norm": 0.2207699865102768, "kl": 0.16861430535500405, "learning_rate": 2.1957142857142856e-07, "loss": 0.0002, "num_tokens": 229282190.0, "reward": 0.04375, "reward_std": 0.6155896425247193, "rewards/verify_chess_move/mean": 0.04375, "rewards/verify_chess_move/std": 0.9951652407646179, "step": 3075 }, { "completion_length": 339.4, "completions/clipped_ratio": 0.0, "completions/max_length": 339.4, "completions/max_terminated_length": 339.4, "completions/mean_length": 136.05859375, "completions/mean_terminated_length": 136.05859375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0027873959025280235, "frac_reward_zero_std": 0.40625, "grad_norm": 0.2668861746788025, "kl": 0.05729155665030703, "learning_rate": 2.199285714285714e-07, "loss": 0.0001, "num_tokens": 229657313.0, "reward": 0.0203125, "reward_std": 0.5334017395973205, "rewards/verify_chess_move/mean": 0.0203125, "rewards/verify_chess_move/std": 0.9967538595199585, "step": 3080 }, { "completion_length": 329.4, "completions/clipped_ratio": 0.0, "completions/max_length": 329.4, "completions/max_terminated_length": 329.4, "completions/mean_length": 134.23046875, "completions/mean_terminated_length": 134.23046875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.002791920895876283, "frac_reward_zero_std": 0.3625, "grad_norm": 0.21197527647018433, "kl": 0.036126045043056366, "learning_rate": 2.2028571428571426e-07, "loss": 0.0, "num_tokens": 230028784.0, "reward": 0.021875, "reward_std": 0.5556225478649139, "rewards/verify_chess_move/mean": 0.021875, "rewards/verify_chess_move/std": 0.9884798765182495, "step": 3085 }, { "completion_length": 429.2, "completions/clipped_ratio": 0.0, "completions/max_length": 429.2, "completions/max_terminated_length": 429.2, "completions/mean_length": 138.9265625, "completions/mean_terminated_length": 138.9265625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.002796445889224543, "frac_reward_zero_std": 0.425, "grad_norm": 0.1931169182062149, "kl": 0.010415082401596009, "learning_rate": 2.2064285714285712e-07, "loss": 0.0, "num_tokens": 230408578.0, "reward": 0.009375, "reward_std": 0.5166810989379883, "rewards/verify_chess_move/mean": 0.009375, "rewards/verify_chess_move/std": 0.9935093522071838, "step": 3090 }, { "completion_length": 444.8, "completions/clipped_ratio": 0.0, "completions/max_length": 444.8, "completions/max_terminated_length": 444.8, "completions/mean_length": 135.80078125, "completions/mean_terminated_length": 135.80078125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0028009708825728024, "frac_reward_zero_std": 0.35625, "grad_norm": 0.2030038833618164, "kl": 0.011350121034047334, "learning_rate": 2.2099999999999998e-07, "loss": 0.0, "num_tokens": 230779563.0, "reward": 0.11875, "reward_std": 0.5709262371063233, "rewards/verify_chess_move/mean": 0.11875, "rewards/verify_chess_move/std": 0.9820788979530335, "step": 3095 }, { "completion_length": 388.2, "completions/clipped_ratio": 0.0, "completions/max_length": 388.2, "completions/max_terminated_length": 388.2, "completions/mean_length": 136.1921875, "completions/mean_terminated_length": 136.1921875, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.0028054958759210625, "frac_reward_zero_std": 0.4125, "grad_norm": 0.11673879623413086, "kl": 0.012210474860330578, "learning_rate": 2.2135714285714284e-07, "loss": 0.0, "num_tokens": 231154073.0, "reward": -0.0609375, "reward_std": 0.537514466047287, "rewards/verify_chess_move/mean": -0.0609375, "rewards/verify_chess_move/std": 0.9878202319145203, "step": 3100 }, { "completion_length": 354.4, "completions/clipped_ratio": 0.0, "completions/max_length": 354.4, "completions/max_terminated_length": 354.4, "completions/mean_length": 135.415625, "completions/mean_terminated_length": 135.415625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.002810020869269322, "frac_reward_zero_std": 0.36875, "grad_norm": 0.3569614291191101, "kl": 0.010729004527092911, "learning_rate": 2.217142857142857e-07, "loss": 0.0, "num_tokens": 231525773.0, "reward": -0.015625, "reward_std": 0.5646566390991211, "rewards/verify_chess_move/mean": -0.015625, "rewards/verify_chess_move/std": 0.9939613223075867, "step": 3105 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 139.03203125, "completions/mean_terminated_length": 139.03203125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.002814545862617582, "frac_reward_zero_std": 0.39375, "grad_norm": 0.231692835688591, "kl": 0.01007627112121554, "learning_rate": 2.2207142857142854e-07, "loss": 0.0, "num_tokens": 231905614.0, "reward": -0.1203125, "reward_std": 0.5590180099010468, "rewards/verify_chess_move/mean": -0.1203125, "rewards/verify_chess_move/std": 0.9826468110084534, "step": 3110 }, { "completion_length": 416.2, "completions/clipped_ratio": 0.0, "completions/max_length": 416.2, "completions/max_terminated_length": 416.2, "completions/mean_length": 137.0, "completions/mean_terminated_length": 137.0, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0028190708559658415, "frac_reward_zero_std": 0.425, "grad_norm": 0.19261106848716736, "kl": 0.013780201121699065, "learning_rate": 2.224285714285714e-07, "loss": 0.0, "num_tokens": 232280054.0, "reward": 0.0390625, "reward_std": 0.5155253350734711, "rewards/verify_chess_move/mean": 0.0390625, "rewards/verify_chess_move/std": 0.9979750394821167, "step": 3115 }, { "completion_length": 371.2, "completions/clipped_ratio": 0.0, "completions/max_length": 371.2, "completions/max_terminated_length": 371.2, "completions/mean_length": 131.41796875, "completions/mean_terminated_length": 131.41796875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0028235958493141016, "frac_reward_zero_std": 0.3625, "grad_norm": 0.2588920295238495, "kl": 0.030976229975931347, "learning_rate": 2.2278571428571426e-07, "loss": 0.0, "num_tokens": 232646085.0, "reward": 0.071875, "reward_std": 0.5683446049690246, "rewards/verify_chess_move/mean": 0.071875, "rewards/verify_chess_move/std": 0.9958941698074341, "step": 3120 }, { "completion_length": 383.6, "completions/clipped_ratio": 0.0, "completions/max_length": 383.6, "completions/max_terminated_length": 383.6, "completions/mean_length": 142.8828125, "completions/mean_terminated_length": 142.8828125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0028281208426623613, "frac_reward_zero_std": 0.36875, "grad_norm": 0.16442057490348816, "kl": 0.039747505285777154, "learning_rate": 2.2314285714285712e-07, "loss": 0.0, "num_tokens": 233029575.0, "reward": -0.1109375, "reward_std": 0.579807460308075, "rewards/verify_chess_move/mean": -0.1109375, "rewards/verify_chess_move/std": 0.991046404838562, "step": 3125 }, { "completion_length": 457.2, "completions/clipped_ratio": 0.0, "completions/max_length": 457.2, "completions/max_terminated_length": 457.2, "completions/mean_length": 143.865625, "completions/mean_terminated_length": 143.865625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.002832645836010621, "frac_reward_zero_std": 0.39375, "grad_norm": 0.22042037546634674, "kl": 0.05761983355041593, "learning_rate": 2.2349999999999998e-07, "loss": 0.0001, "num_tokens": 233414243.0, "reward": 0.0375, "reward_std": 0.5426174640655518, "rewards/verify_chess_move/mean": 0.0375, "rewards/verify_chess_move/std": 0.9929689526557922, "step": 3130 }, { "completion_length": 423.6, "completions/clipped_ratio": 0.0, "completions/max_length": 423.6, "completions/max_terminated_length": 423.6, "completions/mean_length": 141.07734375, "completions/mean_terminated_length": 141.07734375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.002837170829358881, "frac_reward_zero_std": 0.35, "grad_norm": 0.45639437437057495, "kl": 0.11183355804168968, "learning_rate": 2.2385714285714284e-07, "loss": 0.0001, "num_tokens": 233794774.0, "reward": -0.0359375, "reward_std": 0.5974242627620697, "rewards/verify_chess_move/mean": -0.0359375, "rewards/verify_chess_move/std": 0.9956906080245972, "step": 3135 }, { "completion_length": 337.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 131.19296875, "completions/mean_terminated_length": 131.19296875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0028416958227071407, "frac_reward_zero_std": 0.36875, "grad_norm": 0.27350664138793945, "kl": 0.06731074250856181, "learning_rate": 2.242142857142857e-07, "loss": 0.0001, "num_tokens": 234158909.0, "reward": -0.059375, "reward_std": 0.5660710573196411, "rewards/verify_chess_move/mean": -0.059375, "rewards/verify_chess_move/std": 0.9949981808662415, "step": 3140 }, { "completion_length": 363.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 142.64609375, "completions/mean_terminated_length": 142.64609375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0028462208160554003, "frac_reward_zero_std": 0.3625, "grad_norm": 0.20077593624591827, "kl": 0.04869036047894042, "learning_rate": 2.2457142857142854e-07, "loss": 0.0, "num_tokens": 234542776.0, "reward": 0.0046875, "reward_std": 0.5771808743476867, "rewards/verify_chess_move/mean": 0.0046875, "rewards/verify_chess_move/std": 0.9977485775947571, "step": 3145 }, { "completion_length": 372.4, "completions/clipped_ratio": 0.0, "completions/max_length": 372.4, "completions/max_terminated_length": 372.4, "completions/mean_length": 136.7453125, "completions/mean_terminated_length": 136.7453125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.00285074580940366, "frac_reward_zero_std": 0.375, "grad_norm": 0.22835826873779297, "kl": 0.017532855132594706, "learning_rate": 2.249285714285714e-07, "loss": 0.0, "num_tokens": 234917434.0, "reward": -0.00625, "reward_std": 0.5514009177684784, "rewards/verify_chess_move/mean": -0.00625, "rewards/verify_chess_move/std": 0.9959778785705566, "step": 3150 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 132.80234375, "completions/mean_terminated_length": 132.80234375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.00285527080275192, "frac_reward_zero_std": 0.43125, "grad_norm": 0.2990169823169708, "kl": 0.030767897369514685, "learning_rate": 2.2528571428571426e-07, "loss": 0.0, "num_tokens": 235286501.0, "reward": -0.0421875, "reward_std": 0.504743903875351, "rewards/verify_chess_move/mean": -0.0421875, "rewards/verify_chess_move/std": 0.999023711681366, "step": 3155 }, { "completion_length": 458.4, "completions/clipped_ratio": 0.0, "completions/max_length": 458.4, "completions/max_terminated_length": 458.4, "completions/mean_length": 129.04375, "completions/mean_terminated_length": 129.04375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0028597957961001797, "frac_reward_zero_std": 0.31875, "grad_norm": 0.22178441286087036, "kl": 0.02377816159860231, "learning_rate": 2.2564285714285712e-07, "loss": 0.0, "num_tokens": 235648621.0, "reward": -0.0125, "reward_std": 0.6350257635116577, "rewards/verify_chess_move/mean": -0.0125, "rewards/verify_chess_move/std": 0.9995776057243347, "step": 3160 }, { "completion_length": 392.4, "completions/clipped_ratio": 0.0, "completions/max_length": 392.4, "completions/max_terminated_length": 392.4, "completions/mean_length": 133.58046875, "completions/mean_terminated_length": 133.58046875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0028643207894484394, "frac_reward_zero_std": 0.3625, "grad_norm": 0.23960106074810028, "kl": 0.02867113903630525, "learning_rate": 2.2599999999999999e-07, "loss": 0.0, "num_tokens": 236016484.0, "reward": 0.0140625, "reward_std": 0.5697580575942993, "rewards/verify_chess_move/mean": 0.0140625, "rewards/verify_chess_move/std": 1.0002609014511108, "step": 3165 }, { "completion_length": 403.8, "completions/clipped_ratio": 0.0, "completions/max_length": 403.8, "completions/max_terminated_length": 403.8, "completions/mean_length": 138.51015625, "completions/mean_terminated_length": 138.51015625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0028688457827966995, "frac_reward_zero_std": 0.40625, "grad_norm": 0.22883929312229156, "kl": 0.026469117210945116, "learning_rate": 2.2635714285714285e-07, "loss": 0.0, "num_tokens": 236391657.0, "reward": 0.08125, "reward_std": 0.53988196849823, "rewards/verify_chess_move/mean": 0.08125, "rewards/verify_chess_move/std": 0.9880556344985962, "step": 3170 }, { "completion_length": 394.6, "completions/clipped_ratio": 0.0, "completions/max_length": 394.6, "completions/max_terminated_length": 394.6, "completions/mean_length": 139.95, "completions/mean_terminated_length": 139.95, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.002873370776144959, "frac_reward_zero_std": 0.4, "grad_norm": 0.2167138010263443, "kl": 0.045371617353521285, "learning_rate": 2.2671428571428568e-07, "loss": 0.0, "num_tokens": 236769425.0, "reward": 0.05, "reward_std": 0.5522909820079803, "rewards/verify_chess_move/mean": 0.05, "rewards/verify_chess_move/std": 0.9972497701644898, "step": 3175 }, { "completion_length": 458.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 139.746875, "completions/mean_terminated_length": 139.746875, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.002877895769493219, "frac_reward_zero_std": 0.4125, "grad_norm": 0.3276597261428833, "kl": 0.14889208230160875, "learning_rate": 2.2707142857142854e-07, "loss": 0.0001, "num_tokens": 237152133.0, "reward": -0.0484375, "reward_std": 0.5243661224842071, "rewards/verify_chess_move/mean": -0.0484375, "rewards/verify_chess_move/std": 0.989346706867218, "step": 3180 }, { "completion_length": 445.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 445.0, "completions/max_terminated_length": 369.8, "completions/mean_length": 133.23203125, "completions/mean_terminated_length": 132.7248046875, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0028824207628414785, "frac_reward_zero_std": 0.375, "grad_norm": 0.41899573802948, "kl": 0.14627119614888215, "learning_rate": 2.274285714285714e-07, "loss": 0.0001, "num_tokens": 237521422.0, "reward": 0.0609375, "reward_std": 0.5590824127197266, "rewards/verify_chess_move/mean": 0.0609375, "rewards/verify_chess_move/std": 0.9993169665336609, "step": 3185 }, { "completion_length": 356.2, "completions/clipped_ratio": 0.0, "completions/max_length": 356.2, "completions/max_terminated_length": 356.2, "completions/mean_length": 144.93203125, "completions/mean_terminated_length": 144.93203125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0028869457561897386, "frac_reward_zero_std": 0.3875, "grad_norm": 0.29395657777786255, "kl": 0.12191052369598765, "learning_rate": 2.2778571428571427e-07, "loss": 0.0001, "num_tokens": 237907151.0, "reward": -0.046875, "reward_std": 0.5366234242916107, "rewards/verify_chess_move/mean": -0.046875, "rewards/verify_chess_move/std": 1.0000962734222412, "step": 3190 }, { "completion_length": 440.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 440.6, "completions/max_terminated_length": 367.4, "completions/mean_length": 125.67421875, "completions/mean_terminated_length": 125.17823486328125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0028914707495379982, "frac_reward_zero_std": 0.375, "grad_norm": 0.2232106775045395, "kl": 0.10432845698815071, "learning_rate": 2.2814285714285713e-07, "loss": 0.0001, "num_tokens": 238264718.0, "reward": 0.046875, "reward_std": 0.5667639136314392, "rewards/verify_chess_move/mean": 0.046875, "rewards/verify_chess_move/std": 0.9948697447776794, "step": 3195 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 457.4, "completions/max_terminated_length": 366.4, "completions/mean_length": 137.08203125, "completions/mean_terminated_length": 136.58493041992188, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.002895995742886258, "frac_reward_zero_std": 0.45, "grad_norm": 0.17866390943527222, "kl": 0.0586459168640431, "learning_rate": 2.285e-07, "loss": 0.0001, "num_tokens": 238639519.0, "reward": 0.00625, "reward_std": 0.4966453969478607, "rewards/verify_chess_move/mean": 0.00625, "rewards/verify_chess_move/std": 0.9965755939483643, "step": 3200 }, { "completion_length": 420.2, "completions/clipped_ratio": 0.0, "completions/max_length": 420.2, "completions/max_terminated_length": 420.2, "completions/mean_length": 131.975, "completions/mean_terminated_length": 131.975, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0029005207362345175, "frac_reward_zero_std": 0.38125, "grad_norm": 0.18370305001735687, "kl": 0.06475974107452202, "learning_rate": 2.2885714285714285e-07, "loss": 0.0001, "num_tokens": 239005887.0, "reward": 0.0078125, "reward_std": 0.5622747361660003, "rewards/verify_chess_move/mean": 0.0078125, "rewards/verify_chess_move/std": 0.9909906148910522, "step": 3205 }, { "completion_length": 426.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 148.46953125, "completions/mean_terminated_length": 148.46953125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0029050457295827776, "frac_reward_zero_std": 0.38125, "grad_norm": 0.125040203332901, "kl": 0.03734533486131113, "learning_rate": 2.2921428571428568e-07, "loss": 0.0, "num_tokens": 239399160.0, "reward": 0.0578125, "reward_std": 0.5489849627017975, "rewards/verify_chess_move/mean": 0.0578125, "rewards/verify_chess_move/std": 0.9899937510490417, "step": 3210 }, { "completion_length": 378.8, "completions/clipped_ratio": 0.0, "completions/max_length": 378.8, "completions/max_terminated_length": 378.8, "completions/mean_length": 134.61484375, "completions/mean_terminated_length": 134.61484375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0029095707229310373, "frac_reward_zero_std": 0.3625, "grad_norm": 0.18911363184452057, "kl": 0.040553475185879505, "learning_rate": 2.2957142857142855e-07, "loss": 0.0, "num_tokens": 239765899.0, "reward": 0.09375, "reward_std": 0.5699721693992614, "rewards/verify_chess_move/mean": 0.09375, "rewards/verify_chess_move/std": 0.9956663846969604, "step": 3215 }, { "completion_length": 377.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 127.48671875, "completions/mean_terminated_length": 127.48671875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.002914095716279297, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1531221717596054, "kl": 0.028640648545115256, "learning_rate": 2.299285714285714e-07, "loss": 0.0, "num_tokens": 240124882.0, "reward": 0.053125, "reward_std": 0.5194754898548126, "rewards/verify_chess_move/mean": 0.053125, "rewards/verify_chess_move/std": 0.9941048622131348, "step": 3220 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/max_terminated_length": 358.4, "completions/mean_length": 127.98984375, "completions/mean_terminated_length": 127.98984375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.002918620709627557, "frac_reward_zero_std": 0.4375, "grad_norm": 0.37126949429512024, "kl": 0.03290068541100481, "learning_rate": 2.3028571428571427e-07, "loss": 0.0, "num_tokens": 240486245.0, "reward": 0.06875, "reward_std": 0.5248773515224456, "rewards/verify_chess_move/mean": 0.06875, "rewards/verify_chess_move/std": 0.9959948897361756, "step": 3225 }, { "completion_length": 449.8, "completions/clipped_ratio": 0.0, "completions/max_length": 449.8, "completions/max_terminated_length": 449.8, "completions/mean_length": 139.321875, "completions/mean_terminated_length": 139.321875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0029231457029758167, "frac_reward_zero_std": 0.43125, "grad_norm": 0.2431028187274933, "kl": 0.018672385018726347, "learning_rate": 2.3064285714285713e-07, "loss": 0.0, "num_tokens": 240863705.0, "reward": 0.103125, "reward_std": 0.5219931185245514, "rewards/verify_chess_move/mean": 0.103125, "rewards/verify_chess_move/std": 0.9938048005104065, "step": 3230 }, { "completion_length": 429.2, "completions/clipped_ratio": 0.0, "completions/max_length": 429.2, "completions/max_terminated_length": 429.2, "completions/mean_length": 143.66875, "completions/mean_terminated_length": 143.66875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.0029276706963240764, "frac_reward_zero_std": 0.3625, "grad_norm": 0.1700858473777771, "kl": 0.029049599052086705, "learning_rate": 2.31e-07, "loss": 0.0, "num_tokens": 241245441.0, "reward": -0.009375, "reward_std": 0.5781235218048095, "rewards/verify_chess_move/mean": -0.009375, "rewards/verify_chess_move/std": 0.9941948175430297, "step": 3235 }, { "completion_length": 381.2, "completions/clipped_ratio": 0.0, "completions/max_length": 381.2, "completions/max_terminated_length": 381.2, "completions/mean_length": 136.1515625, "completions/mean_terminated_length": 136.1515625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.002932195689672336, "frac_reward_zero_std": 0.38125, "grad_norm": 0.11887414008378983, "kl": 0.025427801786281633, "learning_rate": 2.3135714285714283e-07, "loss": 0.0, "num_tokens": 241617699.0, "reward": 0.0625, "reward_std": 0.5619191884994507, "rewards/verify_chess_move/mean": 0.0625, "rewards/verify_chess_move/std": 0.993757688999176, "step": 3240 }, { "completion_length": 397.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 139.2015625, "completions/mean_terminated_length": 139.2015625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.002936720683020596, "frac_reward_zero_std": 0.33125, "grad_norm": 0.7705086469650269, "kl": 0.137481205817312, "learning_rate": 2.3171428571428569e-07, "loss": 0.0001, "num_tokens": 241993341.0, "reward": 0.0484375, "reward_std": 0.6020613312721252, "rewards/verify_chess_move/mean": 0.0484375, "rewards/verify_chess_move/std": 0.9901392698287964, "step": 3245 }, { "completion_length": 422.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 138.69453125, "completions/mean_terminated_length": 138.69453125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0029412456763688558, "frac_reward_zero_std": 0.38125, "grad_norm": 0.1749819964170456, "kl": 0.042236417885578706, "learning_rate": 2.3207142857142855e-07, "loss": 0.0, "num_tokens": 242368254.0, "reward": -0.025, "reward_std": 0.5641357839107514, "rewards/verify_chess_move/mean": -0.025, "rewards/verify_chess_move/std": 0.9806150794029236, "step": 3250 }, { "completion_length": 455.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 135.58203125, "completions/mean_terminated_length": 135.58203125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0029457706697171154, "frac_reward_zero_std": 0.29375, "grad_norm": 0.10367705672979355, "kl": 0.057621258369181305, "learning_rate": 2.324285714285714e-07, "loss": 0.0001, "num_tokens": 242736455.0, "reward": 0.0546875, "reward_std": 0.6261015892028808, "rewards/verify_chess_move/mean": 0.0546875, "rewards/verify_chess_move/std": 0.9926410675048828, "step": 3255 }, { "completion_length": 392.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 145.5578125, "completions/mean_terminated_length": 145.5578125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.002950295663065375, "frac_reward_zero_std": 0.425, "grad_norm": 0.31186380982398987, "kl": 0.02561829904007027, "learning_rate": 2.3278571428571427e-07, "loss": 0.0, "num_tokens": 243123825.0, "reward": 0.0359375, "reward_std": 0.5158373236656189, "rewards/verify_chess_move/mean": 0.0359375, "rewards/verify_chess_move/std": 0.9921561002731323, "step": 3260 }, { "completion_length": 460.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 460.4, "completions/max_terminated_length": 381.4, "completions/mean_length": 145.03046875, "completions/mean_terminated_length": 144.54332275390624, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.002954820656413635, "frac_reward_zero_std": 0.375, "grad_norm": 0.3378784656524658, "kl": 0.025468236418964806, "learning_rate": 2.3314285714285713e-07, "loss": 0.0, "num_tokens": 243507728.0, "reward": 0.140625, "reward_std": 0.5662911057472229, "rewards/verify_chess_move/mean": 0.140625, "rewards/verify_chess_move/std": 0.9834042072296143, "step": 3265 }, { "completion_length": 356.4, "completions/clipped_ratio": 0.0, "completions/max_length": 356.4, "completions/max_terminated_length": 356.4, "completions/mean_length": 139.91953125, "completions/mean_terminated_length": 139.91953125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.002959345649761895, "frac_reward_zero_std": 0.4, "grad_norm": 0.20487269759178162, "kl": 0.04202262380858883, "learning_rate": 2.335e-07, "loss": 0.0, "num_tokens": 243885737.0, "reward": 0.05625, "reward_std": 0.5370926976203918, "rewards/verify_chess_move/mean": 0.05625, "rewards/verify_chess_move/std": 0.9974122285842896, "step": 3270 }, { "completion_length": 351.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 137.5609375, "completions/mean_terminated_length": 137.5609375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0029638706431101545, "frac_reward_zero_std": 0.45625, "grad_norm": 0.2150009125471115, "kl": 0.08110373482923024, "learning_rate": 2.3385714285714283e-07, "loss": 0.0001, "num_tokens": 244261327.0, "reward": -0.0578125, "reward_std": 0.4768610239028931, "rewards/verify_chess_move/mean": -0.0578125, "rewards/verify_chess_move/std": 0.9972123146057129, "step": 3275 }, { "completion_length": 372.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 135.06328125, "completions/mean_terminated_length": 135.06328125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0029683956364584146, "frac_reward_zero_std": 0.44375, "grad_norm": 0.2053702473640442, "kl": 0.07565115298348246, "learning_rate": 2.342142857142857e-07, "loss": 0.0001, "num_tokens": 244633336.0, "reward": -0.1109375, "reward_std": 0.4963313639163971, "rewards/verify_chess_move/mean": -0.1109375, "rewards/verify_chess_move/std": 0.9880969166755676, "step": 3280 }, { "completion_length": 377.6, "completions/clipped_ratio": 0.0, "completions/max_length": 377.6, "completions/max_terminated_length": 377.6, "completions/mean_length": 137.95546875, "completions/mean_terminated_length": 137.95546875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0029729206298066743, "frac_reward_zero_std": 0.40625, "grad_norm": 0.20514163374900818, "kl": 0.0538238707522396, "learning_rate": 2.3457142857142855e-07, "loss": 0.0001, "num_tokens": 245010927.0, "reward": 0.0, "reward_std": 0.5155402779579162, "rewards/verify_chess_move/mean": 0.0, "rewards/verify_chess_move/std": 0.9935560345649719, "step": 3285 }, { "completion_length": 418.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 139.2671875, "completions/mean_terminated_length": 139.2671875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.002977445623154934, "frac_reward_zero_std": 0.4, "grad_norm": 0.2779700756072998, "kl": 0.05707559121801751, "learning_rate": 2.349285714285714e-07, "loss": 0.0001, "num_tokens": 245390693.0, "reward": 0.0421875, "reward_std": 0.5489670693874359, "rewards/verify_chess_move/mean": 0.0421875, "rewards/verify_chess_move/std": 0.9994574666023255, "step": 3290 }, { "completion_length": 412.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 132.01171875, "completions/mean_terminated_length": 132.01171875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0029819706165031936, "frac_reward_zero_std": 0.44375, "grad_norm": 0.20607174932956696, "kl": 0.043117678734415674, "learning_rate": 2.3528571428571427e-07, "loss": 0.0, "num_tokens": 245758124.0, "reward": 0.053125, "reward_std": 0.49322057962417604, "rewards/verify_chess_move/mean": 0.053125, "rewards/verify_chess_move/std": 0.9965461492538452, "step": 3295 }, { "completion_length": 374.6, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 136.30546875, "completions/mean_terminated_length": 136.30546875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0029864956098514537, "frac_reward_zero_std": 0.39375, "grad_norm": 0.16912294924259186, "kl": 0.05578517341928091, "learning_rate": 2.3564285714285713e-07, "loss": 0.0001, "num_tokens": 246129555.0, "reward": 0.0953125, "reward_std": 0.5494542121887207, "rewards/verify_chess_move/mean": 0.0953125, "rewards/verify_chess_move/std": 0.9894268870353699, "step": 3300 }, { "completion_length": 337.4, "completions/clipped_ratio": 0.0, "completions/max_length": 337.4, "completions/max_terminated_length": 337.4, "completions/mean_length": 133.8390625, "completions/mean_terminated_length": 133.8390625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0029910206031997133, "frac_reward_zero_std": 0.36875, "grad_norm": 0.23725105822086334, "kl": 0.06457369424169883, "learning_rate": 2.3599999999999997e-07, "loss": 0.0001, "num_tokens": 246501109.0, "reward": 0.0046875, "reward_std": 0.5629331290721893, "rewards/verify_chess_move/mean": 0.0046875, "rewards/verify_chess_move/std": 0.9905322790145874, "step": 3305 }, { "completion_length": 382.6, "completions/clipped_ratio": 0.0, "completions/max_length": 382.6, "completions/max_terminated_length": 382.6, "completions/mean_length": 140.56171875, "completions/mean_terminated_length": 140.56171875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.002995545596547973, "frac_reward_zero_std": 0.35625, "grad_norm": 0.3673396110534668, "kl": 0.06636852108349559, "learning_rate": 2.3635714285714283e-07, "loss": 0.0001, "num_tokens": 246882508.0, "reward": 0.05, "reward_std": 0.5821195840835571, "rewards/verify_chess_move/mean": 0.05, "rewards/verify_chess_move/std": 0.997470247745514, "step": 3310 }, { "completion_length": 410.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 138.38984375, "completions/mean_terminated_length": 138.38984375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0030000705898962326, "frac_reward_zero_std": 0.375, "grad_norm": 0.10805315524339676, "kl": 0.09622836937196552, "learning_rate": 2.367142857142857e-07, "loss": 0.0001, "num_tokens": 247257503.0, "reward": 0.015625, "reward_std": 0.5592480838298798, "rewards/verify_chess_move/mean": 0.015625, "rewards/verify_chess_move/std": 0.9973136067390442, "step": 3315 }, { "completion_length": 401.6, "completions/clipped_ratio": 0.0, "completions/max_length": 401.6, "completions/max_terminated_length": 401.6, "completions/mean_length": 139.54375, "completions/mean_terminated_length": 139.54375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0030045955832444927, "frac_reward_zero_std": 0.43125, "grad_norm": 0.2544986605644226, "kl": 0.040348457609070465, "learning_rate": 2.3707142857142855e-07, "loss": 0.0, "num_tokens": 247634847.0, "reward": 0.040625, "reward_std": 0.5213556528091431, "rewards/verify_chess_move/mean": 0.040625, "rewards/verify_chess_move/std": 0.9821455359458924, "step": 3320 }, { "completion_length": 377.8, "completions/clipped_ratio": 0.0, "completions/max_length": 377.8, "completions/max_terminated_length": 377.8, "completions/mean_length": 130.19921875, "completions/mean_terminated_length": 130.19921875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0030091205765927524, "frac_reward_zero_std": 0.45625, "grad_norm": 0.1223650798201561, "kl": 0.03352993177686585, "learning_rate": 2.3742857142857141e-07, "loss": 0.0, "num_tokens": 247999350.0, "reward": -0.025, "reward_std": 0.4974816620349884, "rewards/verify_chess_move/mean": -0.025, "rewards/verify_chess_move/std": 0.9837981700897217, "step": 3325 }, { "completion_length": 441.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 441.8, "completions/max_terminated_length": 354.4, "completions/mean_length": 139.740625, "completions/mean_terminated_length": 139.25010681152344, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.003013645569941012, "frac_reward_zero_std": 0.45, "grad_norm": 0.1658814549446106, "kl": 0.02773150939756306, "learning_rate": 2.3778571428571428e-07, "loss": 0.0, "num_tokens": 248376842.0, "reward": 0.1828125, "reward_std": 0.4801750659942627, "rewards/verify_chess_move/mean": 0.1828125, "rewards/verify_chess_move/std": 0.9772915124893189, "step": 3330 }, { "completion_length": 379.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 137.6546875, "completions/mean_terminated_length": 137.6546875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.003018170563289272, "frac_reward_zero_std": 0.3875, "grad_norm": 0.22626745700836182, "kl": 0.04046775334863924, "learning_rate": 2.3814285714285714e-07, "loss": 0.0, "num_tokens": 248749896.0, "reward": 0.090625, "reward_std": 0.5670180678367615, "rewards/verify_chess_move/mean": 0.090625, "rewards/verify_chess_move/std": 0.9825337529182434, "step": 3335 }, { "completion_length": 403.2, "completions/clipped_ratio": 0.0, "completions/max_length": 403.2, "completions/max_terminated_length": 403.2, "completions/mean_length": 139.4375, "completions/mean_terminated_length": 139.4375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.003022695556637532, "frac_reward_zero_std": 0.425, "grad_norm": 0.2307920902967453, "kl": 0.02794586987583898, "learning_rate": 2.3849999999999997e-07, "loss": 0.0, "num_tokens": 249129632.0, "reward": 0.0234375, "reward_std": 0.5170579552650452, "rewards/verify_chess_move/mean": 0.0234375, "rewards/verify_chess_move/std": 0.9945760726928711, "step": 3340 }, { "completion_length": 418.8, "completions/clipped_ratio": 0.0, "completions/max_length": 418.8, "completions/max_terminated_length": 418.8, "completions/mean_length": 137.24140625, "completions/mean_terminated_length": 137.24140625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0030272205499857915, "frac_reward_zero_std": 0.34375, "grad_norm": 0.12894843518733978, "kl": 0.0226960562315071, "learning_rate": 2.3885714285714283e-07, "loss": 0.0, "num_tokens": 249503621.0, "reward": 0.125, "reward_std": 0.5719178915023804, "rewards/verify_chess_move/mean": 0.125, "rewards/verify_chess_move/std": 0.9847584128379822, "step": 3345 }, { "completion_length": 392.4, "completions/clipped_ratio": 0.0, "completions/max_length": 392.4, "completions/max_terminated_length": 392.4, "completions/mean_length": 132.44453125, "completions/mean_terminated_length": 132.44453125, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.003031745543334051, "frac_reward_zero_std": 0.39375, "grad_norm": 0.18994827568531036, "kl": 0.01868231147964252, "learning_rate": 2.392142857142857e-07, "loss": 0.0, "num_tokens": 249868766.0, "reward": 0.14375, "reward_std": 0.5482964754104614, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.9900466799736023, "step": 3350 }, { "completion_length": 352.4, "completions/clipped_ratio": 0.0, "completions/max_length": 352.4, "completions/max_terminated_length": 352.4, "completions/mean_length": 129.99609375, "completions/mean_terminated_length": 129.99609375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.003036270536682311, "frac_reward_zero_std": 0.38125, "grad_norm": 0.18611709773540497, "kl": 0.021787286180187947, "learning_rate": 2.3957142857142856e-07, "loss": 0.0, "num_tokens": 250234049.0, "reward": 0.1328125, "reward_std": 0.5464157462120056, "rewards/verify_chess_move/mean": 0.1328125, "rewards/verify_chess_move/std": 0.9820151925086975, "step": 3355 }, { "completion_length": 348.8, "completions/clipped_ratio": 0.0, "completions/max_length": 348.8, "completions/max_terminated_length": 348.8, "completions/mean_length": 129.959375, "completions/mean_terminated_length": 129.959375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.003040795530030571, "frac_reward_zero_std": 0.3375, "grad_norm": 0.2041739970445633, "kl": 0.062021219957387075, "learning_rate": 2.399285714285714e-07, "loss": 0.0001, "num_tokens": 250596477.0, "reward": 0.2421875, "reward_std": 0.5921730995178223, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9629348993301392, "step": 3360 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 136.19296875, "completions/mean_terminated_length": 136.19296875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0030453205233788305, "frac_reward_zero_std": 0.3875, "grad_norm": 0.38753023743629456, "kl": 0.039607075127423744, "learning_rate": 2.402857142857143e-07, "loss": 0.0, "num_tokens": 250967604.0, "reward": 0.10625, "reward_std": 0.537942910194397, "rewards/verify_chess_move/mean": 0.10625, "rewards/verify_chess_move/std": 0.9925128102302552, "step": 3365 }, { "completion_length": 354.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 127.646875, "completions/mean_terminated_length": 127.646875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0030498455167270906, "frac_reward_zero_std": 0.43125, "grad_norm": 0.3317696750164032, "kl": 0.052394932782044636, "learning_rate": 2.4064285714285714e-07, "loss": 0.0001, "num_tokens": 251328792.0, "reward": 0.0875, "reward_std": 0.49581463932991027, "rewards/verify_chess_move/mean": 0.0875, "rewards/verify_chess_move/std": 0.9833384871482849, "step": 3370 }, { "completion_length": 431.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 431.6, "completions/max_terminated_length": 372.8, "completions/mean_length": 137.6984375, "completions/mean_terminated_length": 137.2123779296875, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0030543705100753503, "frac_reward_zero_std": 0.34375, "grad_norm": 0.24349860846996307, "kl": 0.08551917564327596, "learning_rate": 2.41e-07, "loss": 0.0001, "num_tokens": 251700606.0, "reward": 0.125, "reward_std": 0.5877536535263062, "rewards/verify_chess_move/mean": 0.125, "rewards/verify_chess_move/std": 0.9768988251686096, "step": 3375 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 129.9125, "completions/mean_terminated_length": 129.9125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.00305889550342361, "frac_reward_zero_std": 0.45, "grad_norm": 0.275729238986969, "kl": 0.10644119236967527, "learning_rate": 2.4135714285714286e-07, "loss": 0.0001, "num_tokens": 252065134.0, "reward": 0.1109375, "reward_std": 0.4842284798622131, "rewards/verify_chess_move/mean": 0.1109375, "rewards/verify_chess_move/std": 0.9945269584655761, "step": 3380 }, { "completion_length": 352.2, "completions/clipped_ratio": 0.0, "completions/max_length": 352.2, "completions/max_terminated_length": 352.2, "completions/mean_length": 138.73046875, "completions/mean_terminated_length": 138.73046875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0030634204967718696, "frac_reward_zero_std": 0.375, "grad_norm": 0.14710499346256256, "kl": 0.04831665401725331, "learning_rate": 2.417142857142857e-07, "loss": 0.0, "num_tokens": 252442117.0, "reward": 0.10625, "reward_std": 0.5583994269371033, "rewards/verify_chess_move/mean": 0.10625, "rewards/verify_chess_move/std": 0.9926542162895202, "step": 3385 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 138.93046875, "completions/mean_terminated_length": 138.93046875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0030679454901201297, "frac_reward_zero_std": 0.4125, "grad_norm": 0.20134352147579193, "kl": 0.07851204607577529, "learning_rate": 2.420714285714286e-07, "loss": 0.0001, "num_tokens": 252821844.0, "reward": 0.015625, "reward_std": 0.5311969876289367, "rewards/verify_chess_move/mean": 0.015625, "rewards/verify_chess_move/std": 0.9893004059791565, "step": 3390 }, { "completion_length": 379.6, "completions/clipped_ratio": 0.0, "completions/max_length": 379.6, "completions/max_terminated_length": 379.6, "completions/mean_length": 128.7578125, "completions/mean_terminated_length": 128.7578125, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0030724704834683893, "frac_reward_zero_std": 0.44375, "grad_norm": 0.1883792281150818, "kl": 0.04271078906604089, "learning_rate": 2.4242857142857145e-07, "loss": 0.0, "num_tokens": 253186078.0, "reward": -0.028125, "reward_std": 0.48676264882087705, "rewards/verify_chess_move/mean": -0.028125, "rewards/verify_chess_move/std": 0.9980377554893494, "step": 3395 }, { "completion_length": 357.6, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 143.18828125, "completions/mean_terminated_length": 143.18828125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.003076995476816649, "frac_reward_zero_std": 0.475, "grad_norm": 0.17710141837596893, "kl": 0.03579606867569964, "learning_rate": 2.427857142857143e-07, "loss": 0.0, "num_tokens": 253570671.0, "reward": 0.0171875, "reward_std": 0.46812993884086607, "rewards/verify_chess_move/mean": 0.0171875, "rewards/verify_chess_move/std": 0.98848477602005, "step": 3400 }, { "completion_length": 349.4, "completions/clipped_ratio": 0.0, "completions/max_length": 349.4, "completions/max_terminated_length": 349.4, "completions/mean_length": 148.09140625, "completions/mean_terminated_length": 148.09140625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0030815204701649087, "frac_reward_zero_std": 0.45625, "grad_norm": 0.2239820659160614, "kl": 0.06820237229403574, "learning_rate": 2.431428571428571e-07, "loss": 0.0001, "num_tokens": 253964204.0, "reward": 0.0375, "reward_std": 0.4954316973686218, "rewards/verify_chess_move/mean": 0.0375, "rewards/verify_chess_move/std": 0.9983790159225464, "step": 3405 }, { "completion_length": 346.2, "completions/clipped_ratio": 0.0, "completions/max_length": 346.2, "completions/max_terminated_length": 346.2, "completions/mean_length": 142.2984375, "completions/mean_terminated_length": 142.2984375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0030860454635131688, "frac_reward_zero_std": 0.39375, "grad_norm": 0.1567464917898178, "kl": 0.042106247961055485, "learning_rate": 2.435e-07, "loss": 0.0, "num_tokens": 254348242.0, "reward": -0.0765625, "reward_std": 0.5476134896278382, "rewards/verify_chess_move/mean": -0.0765625, "rewards/verify_chess_move/std": 0.9954261898994445, "step": 3410 }, { "completion_length": 345.2, "completions/clipped_ratio": 0.0, "completions/max_length": 345.2, "completions/max_terminated_length": 345.2, "completions/mean_length": 133.46328125, "completions/mean_terminated_length": 133.46328125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0030905704568614284, "frac_reward_zero_std": 0.38125, "grad_norm": 0.19472965598106384, "kl": 0.048875164068886076, "learning_rate": 2.4385714285714284e-07, "loss": 0.0, "num_tokens": 254718011.0, "reward": -0.00625, "reward_std": 0.5489839673042297, "rewards/verify_chess_move/mean": -0.00625, "rewards/verify_chess_move/std": 0.9885016560554505, "step": 3415 }, { "completion_length": 341.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 132.92890625, "completions/mean_terminated_length": 132.92890625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.003095095450209688, "frac_reward_zero_std": 0.4125, "grad_norm": 0.20400285720825195, "kl": 0.03944881568895653, "learning_rate": 2.442142857142857e-07, "loss": 0.0, "num_tokens": 255084736.0, "reward": 0.1546875, "reward_std": 0.4964418649673462, "rewards/verify_chess_move/mean": 0.1546875, "rewards/verify_chess_move/std": 0.9675344586372375, "step": 3420 }, { "completion_length": 370.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 133.89140625, "completions/mean_terminated_length": 133.89140625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003099620443557948, "frac_reward_zero_std": 0.41875, "grad_norm": 0.14796492457389832, "kl": 0.07914040791802109, "learning_rate": 2.4457142857142856e-07, "loss": 0.0001, "num_tokens": 255451317.0, "reward": 0.065625, "reward_std": 0.5324091136455535, "rewards/verify_chess_move/mean": 0.065625, "rewards/verify_chess_move/std": 0.9957627534866333, "step": 3425 }, { "completion_length": 371.6, "completions/clipped_ratio": 0.0, "completions/max_length": 371.6, "completions/max_terminated_length": 371.6, "completions/mean_length": 127.99609375, "completions/mean_terminated_length": 127.99609375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.003104145436906208, "frac_reward_zero_std": 0.46875, "grad_norm": 0.21034184098243713, "kl": 0.07751639831403737, "learning_rate": 2.449285714285714e-07, "loss": 0.0001, "num_tokens": 255812448.0, "reward": 0.1109375, "reward_std": 0.4762259066104889, "rewards/verify_chess_move/mean": 0.1109375, "rewards/verify_chess_move/std": 0.9944922804832459, "step": 3430 }, { "completion_length": 415.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 415.0, "completions/max_terminated_length": 331.8, "completions/mean_length": 129.2875, "completions/mean_terminated_length": 128.78783264160157, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0031086704302544675, "frac_reward_zero_std": 0.4625, "grad_norm": 0.19621241092681885, "kl": 0.19980379868939052, "learning_rate": 2.452857142857143e-07, "loss": 0.0002, "num_tokens": 256177376.0, "reward": 0.0890625, "reward_std": 0.4848140001296997, "rewards/verify_chess_move/mean": 0.0890625, "rewards/verify_chess_move/std": 0.9879819035530091, "step": 3435 }, { "completion_length": 359.6, "completions/clipped_ratio": 0.0, "completions/max_length": 359.6, "completions/max_terminated_length": 359.6, "completions/mean_length": 128.865625, "completions/mean_terminated_length": 128.865625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.003113195423602727, "frac_reward_zero_std": 0.45625, "grad_norm": 0.1724279671907425, "kl": 0.051244030459201895, "learning_rate": 2.4564285714285714e-07, "loss": 0.0001, "num_tokens": 256538740.0, "reward": 0.0828125, "reward_std": 0.47182143926620485, "rewards/verify_chess_move/mean": 0.0828125, "rewards/verify_chess_move/std": 0.9956011772155762, "step": 3440 }, { "completion_length": 396.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 131.71484375, "completions/mean_terminated_length": 131.71484375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0031177204169509872, "frac_reward_zero_std": 0.45, "grad_norm": 0.2183787077665329, "kl": 0.03348133610124933, "learning_rate": 2.46e-07, "loss": 0.0, "num_tokens": 256905615.0, "reward": 0.1078125, "reward_std": 0.4798689603805542, "rewards/verify_chess_move/mean": 0.1078125, "rewards/verify_chess_move/std": 0.9930726885795593, "step": 3445 }, { "completion_length": 364.8, "completions/clipped_ratio": 0.0, "completions/max_length": 364.8, "completions/max_terminated_length": 364.8, "completions/mean_length": 138.15703125, "completions/mean_terminated_length": 138.15703125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.003122245410299247, "frac_reward_zero_std": 0.40625, "grad_norm": 0.24373553693294525, "kl": 0.04127616075857077, "learning_rate": 2.4635714285714287e-07, "loss": 0.0, "num_tokens": 257281376.0, "reward": 0.0140625, "reward_std": 0.5280541062355042, "rewards/verify_chess_move/mean": 0.0140625, "rewards/verify_chess_move/std": 0.9916175842285156, "step": 3450 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 132.7703125, "completions/mean_terminated_length": 132.7703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0031267704036475065, "frac_reward_zero_std": 0.3875, "grad_norm": 0.20954716205596924, "kl": 0.05533330391626805, "learning_rate": 2.4671428571428573e-07, "loss": 0.0001, "num_tokens": 257648754.0, "reward": 0.0453125, "reward_std": 0.5256906986236572, "rewards/verify_chess_move/mean": 0.0453125, "rewards/verify_chess_move/std": 0.9871896266937256, "step": 3455 }, { "completion_length": 387.4, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/max_terminated_length": 387.4, "completions/mean_length": 136.3234375, "completions/mean_terminated_length": 136.3234375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.003131295396995766, "frac_reward_zero_std": 0.5, "grad_norm": 0.14253945648670197, "kl": 0.07143862499215174, "learning_rate": 2.470714285714286e-07, "loss": 0.0001, "num_tokens": 258022640.0, "reward": -0.075, "reward_std": 0.457496303319931, "rewards/verify_chess_move/mean": -0.075, "rewards/verify_chess_move/std": 0.9919223070144654, "step": 3460 }, { "completion_length": 403.2, "completions/clipped_ratio": 0.0, "completions/max_length": 403.2, "completions/max_terminated_length": 403.2, "completions/mean_length": 129.4671875, "completions/mean_terminated_length": 129.4671875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0031358203903440263, "frac_reward_zero_std": 0.41875, "grad_norm": 0.25622960925102234, "kl": 0.12421487208921463, "learning_rate": 2.4742857142857145e-07, "loss": 0.0001, "num_tokens": 258386094.0, "reward": 0.0, "reward_std": 0.5149477243423461, "rewards/verify_chess_move/mean": 0.0, "rewards/verify_chess_move/std": 1.0008936882019044, "step": 3465 }, { "completion_length": 349.4, "completions/clipped_ratio": 0.0, "completions/max_length": 349.4, "completions/max_terminated_length": 349.4, "completions/mean_length": 133.13671875, "completions/mean_terminated_length": 133.13671875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.003140345383692286, "frac_reward_zero_std": 0.49375, "grad_norm": 0.13677722215652466, "kl": 0.04492823786567897, "learning_rate": 2.4778571428571426e-07, "loss": 0.0, "num_tokens": 258755405.0, "reward": 0.13125, "reward_std": 0.43941375613212585, "rewards/verify_chess_move/mean": 0.13125, "rewards/verify_chess_move/std": 0.9814045906066895, "step": 3470 }, { "completion_length": 413.2, "completions/clipped_ratio": 0.0, "completions/max_length": 413.2, "completions/max_terminated_length": 413.2, "completions/mean_length": 141.475, "completions/mean_terminated_length": 141.475, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0031448703770405456, "frac_reward_zero_std": 0.41875, "grad_norm": 0.21032355725765228, "kl": 0.07280164198455168, "learning_rate": 2.481428571428571e-07, "loss": 0.0001, "num_tokens": 259137061.0, "reward": 0.0296875, "reward_std": 0.5309259355068207, "rewards/verify_chess_move/mean": 0.0296875, "rewards/verify_chess_move/std": 0.9960774302482605, "step": 3475 }, { "completion_length": 425.2, "completions/clipped_ratio": 0.0, "completions/max_length": 425.2, "completions/max_terminated_length": 425.2, "completions/mean_length": 130.3125, "completions/mean_terminated_length": 130.3125, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0031493953703888057, "frac_reward_zero_std": 0.44375, "grad_norm": 0.14390721917152405, "kl": 0.04441176382824778, "learning_rate": 2.485e-07, "loss": 0.0, "num_tokens": 259502413.0, "reward": 0.1390625, "reward_std": 0.48749408721923826, "rewards/verify_chess_move/mean": 0.1390625, "rewards/verify_chess_move/std": 0.9882688879966736, "step": 3480 }, { "completion_length": 343.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 141.1421875, "completions/mean_terminated_length": 141.1421875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0031539203637370654, "frac_reward_zero_std": 0.375, "grad_norm": 0.1941126137971878, "kl": 0.07619354896014556, "learning_rate": 2.4885714285714284e-07, "loss": 0.0001, "num_tokens": 259884435.0, "reward": 0.0078125, "reward_std": 0.5440468549728393, "rewards/verify_chess_move/mean": 0.0078125, "rewards/verify_chess_move/std": 0.9908973455429078, "step": 3485 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 133.37265625, "completions/mean_terminated_length": 133.37265625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.003158445357085325, "frac_reward_zero_std": 0.41875, "grad_norm": 0.22652797400951385, "kl": 0.19969056600821206, "learning_rate": 2.492142857142857e-07, "loss": 0.0002, "num_tokens": 260253160.0, "reward": 0.1203125, "reward_std": 0.521782523393631, "rewards/verify_chess_move/mean": 0.1203125, "rewards/verify_chess_move/std": 0.9873779535293579, "step": 3490 }, { "completion_length": 387.2, "completions/clipped_ratio": 0.0, "completions/max_length": 387.2, "completions/max_terminated_length": 387.2, "completions/mean_length": 126.31953125, "completions/mean_terminated_length": 126.31953125, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.0031629703504335847, "frac_reward_zero_std": 0.39375, "grad_norm": 0.23610709607601166, "kl": 0.09870569678023458, "learning_rate": 2.4957142857142857e-07, "loss": 0.0001, "num_tokens": 260609017.0, "reward": 0.265625, "reward_std": 0.5326777756214142, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9626694917678833, "step": 3495 }, { "completion_length": 343.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 130.9640625, "completions/mean_terminated_length": 130.9640625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0031674953437818448, "frac_reward_zero_std": 0.475, "grad_norm": 0.21019834280014038, "kl": 0.05646478665003087, "learning_rate": 2.4992857142857143e-07, "loss": 0.0001, "num_tokens": 260976131.0, "reward": 0.15625, "reward_std": 0.4608727812767029, "rewards/verify_chess_move/mean": 0.15625, "rewards/verify_chess_move/std": 0.9746751189231873, "step": 3500 }, { "completion_length": 388.8, "completions/clipped_ratio": 0.0, "completions/max_length": 388.8, "completions/max_terminated_length": 388.8, "completions/mean_length": 138.2, "completions/mean_terminated_length": 138.2, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0031720203371301044, "frac_reward_zero_std": 0.50625, "grad_norm": 0.2330554872751236, "kl": 0.03768327236321056, "learning_rate": 2.502857142857143e-07, "loss": 0.0, "num_tokens": 261353531.0, "reward": 0.0203125, "reward_std": 0.44756255149841306, "rewards/verify_chess_move/mean": 0.0203125, "rewards/verify_chess_move/std": 0.9995351195335388, "step": 3505 }, { "completion_length": 440.6, "completions/clipped_ratio": 0.0, "completions/max_length": 440.6, "completions/max_terminated_length": 440.6, "completions/mean_length": 135.87421875, "completions/mean_terminated_length": 135.87421875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.003176545330478364, "frac_reward_zero_std": 0.4625, "grad_norm": 0.23773600161075592, "kl": 0.03209684228932019, "learning_rate": 2.5064285714285715e-07, "loss": 0.0, "num_tokens": 261729522.0, "reward": 0.121875, "reward_std": 0.4735983610153198, "rewards/verify_chess_move/mean": 0.121875, "rewards/verify_chess_move/std": 0.989178204536438, "step": 3510 }, { "completion_length": 486.8, "completions/clipped_ratio": 0.0, "completions/max_length": 486.8, "completions/max_terminated_length": 486.8, "completions/mean_length": 134.4078125, "completions/mean_terminated_length": 134.4078125, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.003181070323826624, "frac_reward_zero_std": 0.40625, "grad_norm": 0.1810966432094574, "kl": 0.04331377757625887, "learning_rate": 2.51e-07, "loss": 0.0, "num_tokens": 262100268.0, "reward": 0.1046875, "reward_std": 0.5467613101005554, "rewards/verify_chess_move/mean": 0.1046875, "rewards/verify_chess_move/std": 0.9949828386306763, "step": 3515 }, { "completion_length": 339.6, "completions/clipped_ratio": 0.0, "completions/max_length": 339.6, "completions/max_terminated_length": 339.6, "completions/mean_length": 124.20625, "completions/mean_terminated_length": 124.20625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.003185595317174884, "frac_reward_zero_std": 0.45625, "grad_norm": 0.4732564389705658, "kl": 0.6103190346242627, "learning_rate": 2.513571428571428e-07, "loss": 0.0006, "num_tokens": 262453580.0, "reward": 0.1765625, "reward_std": 0.486806583404541, "rewards/verify_chess_move/mean": 0.1765625, "rewards/verify_chess_move/std": 0.9850785851478576, "step": 3520 }, { "completion_length": 469.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 469.6, "completions/max_terminated_length": 381.8, "completions/mean_length": 129.9546875, "completions/mean_terminated_length": 129.4494155883789, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0031901203105231435, "frac_reward_zero_std": 0.5125, "grad_norm": 0.11410211771726608, "kl": 0.12223192483215825, "learning_rate": 2.5171428571428573e-07, "loss": 0.0001, "num_tokens": 262822426.0, "reward": 0.0484375, "reward_std": 0.42499876022338867, "rewards/verify_chess_move/mean": 0.0484375, "rewards/verify_chess_move/std": 0.9991464018821716, "step": 3525 }, { "completion_length": 351.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 129.17734375, "completions/mean_terminated_length": 129.17734375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003194645303871403, "frac_reward_zero_std": 0.49375, "grad_norm": 1.2343440055847168, "kl": 0.2321312947460683, "learning_rate": 2.5207142857142854e-07, "loss": 0.0002, "num_tokens": 263187989.0, "reward": 0.0703125, "reward_std": 0.44214866161346433, "rewards/verify_chess_move/mean": 0.0703125, "rewards/verify_chess_move/std": 0.988346517086029, "step": 3530 }, { "completion_length": 378.6, "completions/clipped_ratio": 0.0, "completions/max_length": 378.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 133.053125, "completions/mean_terminated_length": 133.053125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0031991702972196633, "frac_reward_zero_std": 0.5125, "grad_norm": 0.18847574293613434, "kl": 0.20761907416017494, "learning_rate": 2.5242857142857146e-07, "loss": 0.0002, "num_tokens": 263557009.0, "reward": 0.0921875, "reward_std": 0.4354616403579712, "rewards/verify_chess_move/mean": 0.0921875, "rewards/verify_chess_move/std": 0.99115469455719, "step": 3535 }, { "completion_length": 375.4, "completions/clipped_ratio": 0.0, "completions/max_length": 375.4, "completions/max_terminated_length": 375.4, "completions/mean_length": 132.21015625, "completions/mean_terminated_length": 132.21015625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.003203695290567923, "frac_reward_zero_std": 0.475, "grad_norm": 0.20509693026542664, "kl": 0.04015889941947535, "learning_rate": 2.5278571428571426e-07, "loss": 0.0, "num_tokens": 263925022.0, "reward": 0.075, "reward_std": 0.46603056192398074, "rewards/verify_chess_move/mean": 0.075, "rewards/verify_chess_move/std": 0.9924092054367065, "step": 3540 }, { "completion_length": 441.8, "completions/clipped_ratio": 0.0, "completions/max_length": 441.8, "completions/max_terminated_length": 441.8, "completions/mean_length": 130.934375, "completions/mean_terminated_length": 130.934375, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0032082202839161826, "frac_reward_zero_std": 0.525, "grad_norm": 0.11209287494421005, "kl": 0.03034162342955824, "learning_rate": 2.531428571428572e-07, "loss": 0.0, "num_tokens": 264291186.0, "reward": 0.13125, "reward_std": 0.4184704840183258, "rewards/verify_chess_move/mean": 0.13125, "rewards/verify_chess_move/std": 0.9792698860168457, "step": 3545 }, { "completion_length": 386.2, "completions/clipped_ratio": 0.0, "completions/max_length": 386.2, "completions/max_terminated_length": 386.2, "completions/mean_length": 139.33828125, "completions/mean_terminated_length": 139.33828125, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0032127452772644422, "frac_reward_zero_std": 0.44375, "grad_norm": 0.19799479842185974, "kl": 0.017169577220920475, "learning_rate": 2.535e-07, "loss": 0.0, "num_tokens": 264670187.0, "reward": 0.0921875, "reward_std": 0.49880170822143555, "rewards/verify_chess_move/mean": 0.0921875, "rewards/verify_chess_move/std": 0.9783776521682739, "step": 3550 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 130.70390625, "completions/mean_terminated_length": 130.70390625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0032172702706127023, "frac_reward_zero_std": 0.39375, "grad_norm": 0.16080330312252045, "kl": 0.025136705383192747, "learning_rate": 2.5385714285714285e-07, "loss": 0.0, "num_tokens": 265035440.0, "reward": 0.05, "reward_std": 0.5142262816429138, "rewards/verify_chess_move/mean": 0.05, "rewards/verify_chess_move/std": 0.9910440802574157, "step": 3555 }, { "completion_length": 430.6, "completions/clipped_ratio": 0.0, "completions/max_length": 430.6, "completions/max_terminated_length": 430.6, "completions/mean_length": 133.321875, "completions/mean_terminated_length": 133.321875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.003221795263960962, "frac_reward_zero_std": 0.48125, "grad_norm": 0.17717088758945465, "kl": 0.013843431949499062, "learning_rate": 2.542142857142857e-07, "loss": 0.0, "num_tokens": 265404932.0, "reward": 0.1625, "reward_std": 0.460032993555069, "rewards/verify_chess_move/mean": 0.1625, "rewards/verify_chess_move/std": 0.986160933971405, "step": 3560 }, { "completion_length": 397.6, "completions/clipped_ratio": 0.0, "completions/max_length": 397.6, "completions/max_terminated_length": 397.6, "completions/mean_length": 130.55078125, "completions/mean_terminated_length": 130.55078125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0032263202573092216, "frac_reward_zero_std": 0.45625, "grad_norm": 0.18321888148784637, "kl": 0.022028705677075777, "learning_rate": 2.5457142857142857e-07, "loss": 0.0, "num_tokens": 265767565.0, "reward": 0.1, "reward_std": 0.48612165451049805, "rewards/verify_chess_move/mean": 0.1, "rewards/verify_chess_move/std": 0.9940768480300903, "step": 3565 }, { "completion_length": 359.8, "completions/clipped_ratio": 0.0, "completions/max_length": 359.8, "completions/max_terminated_length": 359.8, "completions/mean_length": 138.6671875, "completions/mean_terminated_length": 138.6671875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0032308452506574817, "frac_reward_zero_std": 0.4125, "grad_norm": 0.19599959254264832, "kl": 0.030162154277786613, "learning_rate": 2.5492857142857143e-07, "loss": 0.0, "num_tokens": 266142979.0, "reward": 0.096875, "reward_std": 0.5032282054424286, "rewards/verify_chess_move/mean": 0.096875, "rewards/verify_chess_move/std": 0.989920151233673, "step": 3570 }, { "completion_length": 385.4, "completions/clipped_ratio": 0.0, "completions/max_length": 385.4, "completions/max_terminated_length": 385.4, "completions/mean_length": 125.0484375, "completions/mean_terminated_length": 125.0484375, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0032353702440057414, "frac_reward_zero_std": 0.4375, "grad_norm": 0.20891189575195312, "kl": 0.045201419151271696, "learning_rate": 2.552857142857143e-07, "loss": 0.0, "num_tokens": 266498577.0, "reward": 0.0375, "reward_std": 0.5032696008682251, "rewards/verify_chess_move/mean": 0.0375, "rewards/verify_chess_move/std": 0.9956509470939636, "step": 3575 }, { "completion_length": 394.6, "completions/clipped_ratio": 0.0, "completions/max_length": 394.6, "completions/max_terminated_length": 394.6, "completions/mean_length": 132.01015625, "completions/mean_terminated_length": 132.01015625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003239895237354001, "frac_reward_zero_std": 0.39375, "grad_norm": 0.16387629508972168, "kl": 0.07843731167085935, "learning_rate": 2.556428571428571e-07, "loss": 0.0001, "num_tokens": 266863430.0, "reward": 0.19375, "reward_std": 0.5231545686721801, "rewards/verify_chess_move/mean": 0.19375, "rewards/verify_chess_move/std": 0.9752360224723816, "step": 3580 }, { "completion_length": 376.4, "completions/clipped_ratio": 0.0, "completions/max_length": 376.4, "completions/max_terminated_length": 376.4, "completions/mean_length": 135.65078125, "completions/mean_terminated_length": 135.65078125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0032444202307022607, "frac_reward_zero_std": 0.40625, "grad_norm": 0.2515929937362671, "kl": 0.0730273297871463, "learning_rate": 2.56e-07, "loss": 0.0001, "num_tokens": 267238751.0, "reward": -0.0078125, "reward_std": 0.5023929238319397, "rewards/verify_chess_move/mean": -0.0078125, "rewards/verify_chess_move/std": 0.9906600117683411, "step": 3585 }, { "completion_length": 345.8, "completions/clipped_ratio": 0.0, "completions/max_length": 345.8, "completions/max_terminated_length": 345.8, "completions/mean_length": 133.91328125, "completions/mean_terminated_length": 133.91328125, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.003248945224050521, "frac_reward_zero_std": 0.5125, "grad_norm": 0.2125893086194992, "kl": 0.07950470771465916, "learning_rate": 2.563571428571428e-07, "loss": 0.0001, "num_tokens": 267612408.0, "reward": -0.0609375, "reward_std": 0.4333652019500732, "rewards/verify_chess_move/mean": -0.0609375, "rewards/verify_chess_move/std": 0.992380928993225, "step": 3590 }, { "completion_length": 362.2, "completions/clipped_ratio": 0.0, "completions/max_length": 362.2, "completions/max_terminated_length": 362.2, "completions/mean_length": 131.43359375, "completions/mean_terminated_length": 131.43359375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0032534702173987805, "frac_reward_zero_std": 0.48125, "grad_norm": 0.1510070562362671, "kl": 0.04359829891764093, "learning_rate": 2.5671428571428574e-07, "loss": 0.0, "num_tokens": 267978043.0, "reward": 0.175, "reward_std": 0.46123428344726564, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9707242250442505, "step": 3595 }, { "completion_length": 331.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 127.19609375, "completions/mean_terminated_length": 127.19609375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.00325799521074704, "frac_reward_zero_std": 0.45, "grad_norm": 0.2013392448425293, "kl": 0.08938809296814725, "learning_rate": 2.5707142857142855e-07, "loss": 0.0001, "num_tokens": 268341350.0, "reward": 0.0484375, "reward_std": 0.4878071486949921, "rewards/verify_chess_move/mean": 0.0484375, "rewards/verify_chess_move/std": 0.9951591730117798, "step": 3600 }, { "completion_length": 323.4, "completions/clipped_ratio": 0.0, "completions/max_length": 323.4, "completions/max_terminated_length": 323.4, "completions/mean_length": 132.0890625, "completions/mean_terminated_length": 132.0890625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0032625202040952998, "frac_reward_zero_std": 0.38125, "grad_norm": 0.38020244240760803, "kl": 0.09602129414852242, "learning_rate": 2.5742857142857146e-07, "loss": 0.0001, "num_tokens": 268710056.0, "reward": 0.0046875, "reward_std": 0.5432594239711761, "rewards/verify_chess_move/mean": 0.0046875, "rewards/verify_chess_move/std": 0.9841393113136292, "step": 3605 }, { "completion_length": 416.8, "completions/clipped_ratio": 0.0, "completions/max_length": 416.8, "completions/max_terminated_length": 416.8, "completions/mean_length": 131.61796875, "completions/mean_terminated_length": 131.61796875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.00326704519744356, "frac_reward_zero_std": 0.425, "grad_norm": 0.1708999127149582, "kl": 0.04449192004685756, "learning_rate": 2.5778571428571427e-07, "loss": 0.0, "num_tokens": 269077327.0, "reward": 0.0203125, "reward_std": 0.5259417355060577, "rewards/verify_chess_move/mean": 0.0203125, "rewards/verify_chess_move/std": 0.9984849095344543, "step": 3610 }, { "completion_length": 426.6, "completions/clipped_ratio": 0.0, "completions/max_length": 426.6, "completions/max_terminated_length": 426.6, "completions/mean_length": 137.0546875, "completions/mean_terminated_length": 137.0546875, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0032715701907918195, "frac_reward_zero_std": 0.4375, "grad_norm": 0.159941628575325, "kl": 0.041427257671603004, "learning_rate": 2.5814285714285713e-07, "loss": 0.0, "num_tokens": 269451005.0, "reward": 0.1390625, "reward_std": 0.5007498085498809, "rewards/verify_chess_move/mean": 0.1390625, "rewards/verify_chess_move/std": 0.9888753294944763, "step": 3615 }, { "completion_length": 453.8, "completions/clipped_ratio": 0.0, "completions/max_length": 453.8, "completions/max_terminated_length": 453.8, "completions/mean_length": 136.71328125, "completions/mean_terminated_length": 136.71328125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.003276095184140079, "frac_reward_zero_std": 0.475, "grad_norm": 0.19963116943836212, "kl": 0.028362040780484675, "learning_rate": 2.585e-07, "loss": 0.0, "num_tokens": 269824726.0, "reward": 0.0109375, "reward_std": 0.44920762777328493, "rewards/verify_chess_move/mean": 0.0109375, "rewards/verify_chess_move/std": 0.9980051159858704, "step": 3620 }, { "completion_length": 346.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 134.83359375, "completions/mean_terminated_length": 134.83359375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0032806201774883393, "frac_reward_zero_std": 0.48125, "grad_norm": 0.16404050588607788, "kl": 0.05291722640104126, "learning_rate": 2.5885714285714285e-07, "loss": 0.0001, "num_tokens": 270198545.0, "reward": 0.0875, "reward_std": 0.4562392234802246, "rewards/verify_chess_move/mean": 0.0875, "rewards/verify_chess_move/std": 0.9968879818916321, "step": 3625 }, { "completion_length": 377.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 132.2921875, "completions/mean_terminated_length": 132.2921875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.003285145170836599, "frac_reward_zero_std": 0.475, "grad_norm": 0.5850141048431396, "kl": 0.0773287016985705, "learning_rate": 2.592142857142857e-07, "loss": 0.0001, "num_tokens": 270565695.0, "reward": 0.1203125, "reward_std": 0.465182888507843, "rewards/verify_chess_move/mean": 0.1203125, "rewards/verify_chess_move/std": 0.9838302850723266, "step": 3630 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 135.46171875, "completions/mean_terminated_length": 135.46171875, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.0032896701641848586, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3574429452419281, "kl": 0.10739010778488592, "learning_rate": 2.595714285714286e-07, "loss": 0.0001, "num_tokens": 270937358.0, "reward": -0.0046875, "reward_std": 0.4976400136947632, "rewards/verify_chess_move/mean": -0.0046875, "rewards/verify_chess_move/std": 0.9973701715469361, "step": 3635 }, { "completion_length": 390.4, "completions/clipped_ratio": 0.0, "completions/max_length": 390.4, "completions/max_terminated_length": 390.4, "completions/mean_length": 133.53046875, "completions/mean_terminated_length": 133.53046875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0032941951575331183, "frac_reward_zero_std": 0.45625, "grad_norm": 0.28825515508651733, "kl": 0.0772758503415389, "learning_rate": 2.5992857142857144e-07, "loss": 0.0001, "num_tokens": 271308405.0, "reward": 0.128125, "reward_std": 0.48322696089744566, "rewards/verify_chess_move/mean": 0.128125, "rewards/verify_chess_move/std": 0.9810174345970154, "step": 3640 }, { "completion_length": 333.8, "completions/clipped_ratio": 0.0, "completions/max_length": 333.8, "completions/max_terminated_length": 333.8, "completions/mean_length": 124.728125, "completions/mean_terminated_length": 124.728125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0032987201508813783, "frac_reward_zero_std": 0.5, "grad_norm": 0.2801654636859894, "kl": 0.07879499716800638, "learning_rate": 2.602857142857143e-07, "loss": 0.0001, "num_tokens": 271664265.0, "reward": 0.1078125, "reward_std": 0.4297861516475677, "rewards/verify_chess_move/mean": 0.1078125, "rewards/verify_chess_move/std": 0.9834717035293579, "step": 3645 }, { "completion_length": 341.6, "completions/clipped_ratio": 0.0, "completions/max_length": 341.6, "completions/max_terminated_length": 341.6, "completions/mean_length": 126.80390625, "completions/mean_terminated_length": 126.80390625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.003303245144229638, "frac_reward_zero_std": 0.4375, "grad_norm": 0.27132901549339294, "kl": 0.030653813484241253, "learning_rate": 2.606428571428571e-07, "loss": 0.0, "num_tokens": 272023870.0, "reward": 0.146875, "reward_std": 0.5062650799751282, "rewards/verify_chess_move/mean": 0.146875, "rewards/verify_chess_move/std": 0.9851505994796753, "step": 3650 }, { "completion_length": 338.4, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 134.684375, "completions/mean_terminated_length": 134.684375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0033077701375778977, "frac_reward_zero_std": 0.45625, "grad_norm": 0.16357094049453735, "kl": 0.020809845149051397, "learning_rate": 2.61e-07, "loss": 0.0, "num_tokens": 272396562.0, "reward": 0.034375, "reward_std": 0.46576557159423826, "rewards/verify_chess_move/mean": 0.034375, "rewards/verify_chess_move/std": 0.9971606492996216, "step": 3655 }, { "completion_length": 326.4, "completions/clipped_ratio": 0.0, "completions/max_length": 326.4, "completions/max_terminated_length": 326.4, "completions/mean_length": 123.23671875, "completions/mean_terminated_length": 123.23671875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0033122951309261573, "frac_reward_zero_std": 0.4, "grad_norm": 0.19303728640079498, "kl": 0.031073224989813752, "learning_rate": 2.6135714285714283e-07, "loss": 0.0, "num_tokens": 272750977.0, "reward": 0.2234375, "reward_std": 0.5102292358875274, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9734926104545594, "step": 3660 }, { "completion_length": 434.8, "completions/clipped_ratio": 0.0, "completions/max_length": 434.8, "completions/max_terminated_length": 434.8, "completions/mean_length": 131.3, "completions/mean_terminated_length": 131.3, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0033168201242744174, "frac_reward_zero_std": 0.44375, "grad_norm": 0.20812733471393585, "kl": 0.02484153092955239, "learning_rate": 2.6171428571428574e-07, "loss": 0.0, "num_tokens": 273117257.0, "reward": 0.1140625, "reward_std": 0.49213260412216187, "rewards/verify_chess_move/mean": 0.1140625, "rewards/verify_chess_move/std": 0.9859790325164794, "step": 3665 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.0, "completions/max_length": 367.8, "completions/max_terminated_length": 367.8, "completions/mean_length": 134.34453125, "completions/mean_terminated_length": 134.34453125, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.003321345117622677, "frac_reward_zero_std": 0.45625, "grad_norm": 0.2120746374130249, "kl": 0.06237712503352668, "learning_rate": 2.6207142857142855e-07, "loss": 0.0001, "num_tokens": 273489138.0, "reward": 0.04375, "reward_std": 0.4849688053131104, "rewards/verify_chess_move/mean": 0.04375, "rewards/verify_chess_move/std": 0.9913721799850463, "step": 3670 }, { "completion_length": 321.8, "completions/clipped_ratio": 0.0, "completions/max_length": 321.8, "completions/max_terminated_length": 321.8, "completions/mean_length": 129.1296875, "completions/mean_terminated_length": 129.1296875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0033258701109709367, "frac_reward_zero_std": 0.4875, "grad_norm": 0.246012344956398, "kl": 0.061707334741367956, "learning_rate": 2.6242857142857147e-07, "loss": 0.0001, "num_tokens": 273852872.0, "reward": 0.0546875, "reward_std": 0.45182177424430847, "rewards/verify_chess_move/mean": 0.0546875, "rewards/verify_chess_move/std": 0.9955825448036194, "step": 3675 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 132.1015625, "completions/mean_terminated_length": 132.1015625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.003330395104319197, "frac_reward_zero_std": 0.43125, "grad_norm": 0.39692744612693787, "kl": 0.1555466435936978, "learning_rate": 2.627857142857143e-07, "loss": 0.0002, "num_tokens": 274220122.0, "reward": 0.1515625, "reward_std": 0.5097893655300141, "rewards/verify_chess_move/mean": 0.1515625, "rewards/verify_chess_move/std": 0.9864456176757812, "step": 3680 }, { "completion_length": 378.2, "completions/clipped_ratio": 0.0, "completions/max_length": 378.2, "completions/max_terminated_length": 378.2, "completions/mean_length": 141.13046875, "completions/mean_terminated_length": 141.13046875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0033349200976674565, "frac_reward_zero_std": 0.46875, "grad_norm": 0.12657688558101654, "kl": 0.05319224104750901, "learning_rate": 2.6314285714285714e-07, "loss": 0.0001, "num_tokens": 274601945.0, "reward": 0.0625, "reward_std": 0.47002860307693484, "rewards/verify_chess_move/mean": 0.0625, "rewards/verify_chess_move/std": 0.9874049186706543, "step": 3685 }, { "completion_length": 450.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 450.2, "completions/max_terminated_length": 386.8, "completions/mean_length": 130.25859375, "completions/mean_terminated_length": 129.7710968017578, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.003339445091015716, "frac_reward_zero_std": 0.46875, "grad_norm": 0.2541447877883911, "kl": 0.06041401948605198, "learning_rate": 2.635e-07, "loss": 0.0001, "num_tokens": 274964524.0, "reward": 0.071875, "reward_std": 0.4745993077754974, "rewards/verify_chess_move/mean": 0.071875, "rewards/verify_chess_move/std": 0.9922198176383972, "step": 3690 }, { "completion_length": 345.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 119.86953125, "completions/mean_terminated_length": 119.86953125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.003343970084363976, "frac_reward_zero_std": 0.4, "grad_norm": 0.17292329668998718, "kl": 0.1119830575567903, "learning_rate": 2.6385714285714286e-07, "loss": 0.0001, "num_tokens": 275313485.0, "reward": 0.078125, "reward_std": 0.5400862455368042, "rewards/verify_chess_move/mean": 0.078125, "rewards/verify_chess_move/std": 0.996781837940216, "step": 3695 }, { "completion_length": 354.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 122.140625, "completions/mean_terminated_length": 122.140625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.003348495077712236, "frac_reward_zero_std": 0.45625, "grad_norm": 0.44002294540405273, "kl": 0.22966190109436865, "learning_rate": 2.642142857142857e-07, "loss": 0.0002, "num_tokens": 275665057.0, "reward": 0.0765625, "reward_std": 0.4868570029735565, "rewards/verify_chess_move/mean": 0.0765625, "rewards/verify_chess_move/std": 0.9843416452407837, "step": 3700 }, { "completion_length": 324.2, "completions/clipped_ratio": 0.0, "completions/max_length": 324.2, "completions/max_terminated_length": 324.2, "completions/mean_length": 130.88671875, "completions/mean_terminated_length": 130.88671875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0033530200710604956, "frac_reward_zero_std": 0.475, "grad_norm": 0.24187657237052917, "kl": 0.14211635031970218, "learning_rate": 2.645714285714286e-07, "loss": 0.0001, "num_tokens": 276029352.0, "reward": 0.08125, "reward_std": 0.45971603989601134, "rewards/verify_chess_move/mean": 0.08125, "rewards/verify_chess_move/std": 0.9731966495513916, "step": 3705 }, { "completion_length": 466.4, "completions/clipped_ratio": 0.0, "completions/max_length": 466.4, "completions/max_terminated_length": 466.4, "completions/mean_length": 140.30703125, "completions/mean_terminated_length": 140.30703125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.003357545064408755, "frac_reward_zero_std": 0.4375, "grad_norm": 0.17184355854988098, "kl": 0.09072314701334108, "learning_rate": 2.649285714285714e-07, "loss": 0.0001, "num_tokens": 276408809.0, "reward": 0.0453125, "reward_std": 0.48618505597114564, "rewards/verify_chess_move/mean": 0.0453125, "rewards/verify_chess_move/std": 0.9972105860710144, "step": 3710 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 130.321875, "completions/mean_terminated_length": 130.321875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0033620700577570153, "frac_reward_zero_std": 0.4625, "grad_norm": 0.244135320186615, "kl": 0.10961883424024563, "learning_rate": 2.652857142857143e-07, "loss": 0.0001, "num_tokens": 276776013.0, "reward": 0.0453125, "reward_std": 0.48422693014144896, "rewards/verify_chess_move/mean": 0.0453125, "rewards/verify_chess_move/std": 0.9909975409507752, "step": 3715 }, { "completion_length": 422.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 128.7203125, "completions/mean_terminated_length": 128.7203125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.003366595051105275, "frac_reward_zero_std": 0.4375, "grad_norm": 0.19124442338943481, "kl": 0.1310083607546403, "learning_rate": 2.656428571428571e-07, "loss": 0.0001, "num_tokens": 277136871.0, "reward": 0.14375, "reward_std": 0.49917166233062743, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.9904894948005676, "step": 3720 }, { "completion_length": 370.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 127.77421875, "completions/mean_terminated_length": 127.77421875, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0033711200444535346, "frac_reward_zero_std": 0.425, "grad_norm": 0.2998805046081543, "kl": 0.09983646924956702, "learning_rate": 2.66e-07, "loss": 0.0001, "num_tokens": 277498558.0, "reward": 0.115625, "reward_std": 0.5062686562538147, "rewards/verify_chess_move/mean": 0.115625, "rewards/verify_chess_move/std": 0.9920193552970886, "step": 3725 }, { "completion_length": 357.8, "completions/clipped_ratio": 0.0, "completions/max_length": 357.8, "completions/max_terminated_length": 357.8, "completions/mean_length": 119.11484375, "completions/mean_terminated_length": 119.11484375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0033756450378017943, "frac_reward_zero_std": 0.43125, "grad_norm": 0.2547791302204132, "kl": 0.1282760868809419, "learning_rate": 2.6635714285714283e-07, "loss": 0.0001, "num_tokens": 277845641.0, "reward": 0.090625, "reward_std": 0.5058996617794037, "rewards/verify_chess_move/mean": 0.090625, "rewards/verify_chess_move/std": 0.9963653922080994, "step": 3730 }, { "completion_length": 415.2, "completions/clipped_ratio": 0.0, "completions/max_length": 415.2, "completions/max_terminated_length": 415.2, "completions/mean_length": 130.18046875, "completions/mean_terminated_length": 130.18046875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0033801700311500544, "frac_reward_zero_std": 0.45, "grad_norm": 0.40852493047714233, "kl": 0.26605777302465866, "learning_rate": 2.6671428571428575e-07, "loss": 0.0003, "num_tokens": 278206248.0, "reward": 0.1515625, "reward_std": 0.48701086044311526, "rewards/verify_chess_move/mean": 0.1515625, "rewards/verify_chess_move/std": 0.9826949834823608, "step": 3735 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/max_terminated_length": 359.2, "completions/mean_length": 137.05625, "completions/mean_terminated_length": 137.05625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.003384695024498314, "frac_reward_zero_std": 0.4625, "grad_norm": 0.20577304065227509, "kl": 0.06239780875330325, "learning_rate": 2.6707142857142856e-07, "loss": 0.0001, "num_tokens": 278581200.0, "reward": 0.071875, "reward_std": 0.47154841423034666, "rewards/verify_chess_move/mean": 0.071875, "rewards/verify_chess_move/std": 0.9916368007659913, "step": 3740 }, { "completion_length": 351.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 132.4390625, "completions/mean_terminated_length": 132.4390625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0033892200178465737, "frac_reward_zero_std": 0.5125, "grad_norm": 0.22319765388965607, "kl": 0.04127831525693182, "learning_rate": 2.674285714285714e-07, "loss": 0.0, "num_tokens": 278947938.0, "reward": 0.1015625, "reward_std": 0.44413224458694456, "rewards/verify_chess_move/mean": 0.1015625, "rewards/verify_chess_move/std": 0.989684271812439, "step": 3745 }, { "completion_length": 337.2, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/max_terminated_length": 337.2, "completions/mean_length": 132.33046875, "completions/mean_terminated_length": 132.33046875, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0033937450111948333, "frac_reward_zero_std": 0.4875, "grad_norm": 0.17847700417041779, "kl": 0.031648548212251625, "learning_rate": 2.677857142857143e-07, "loss": 0.0, "num_tokens": 279316889.0, "reward": 0.05, "reward_std": 0.45566301941871645, "rewards/verify_chess_move/mean": 0.05, "rewards/verify_chess_move/std": 0.9872287750244141, "step": 3750 }, { "completion_length": 380.6, "completions/clipped_ratio": 0.0, "completions/max_length": 380.6, "completions/max_terminated_length": 380.6, "completions/mean_length": 128.3828125, "completions/mean_terminated_length": 128.3828125, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0033982700045430934, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3519718050956726, "kl": 0.05340172098949551, "learning_rate": 2.6814285714285714e-07, "loss": 0.0001, "num_tokens": 279678083.0, "reward": 0.09375, "reward_std": 0.4828166127204895, "rewards/verify_chess_move/mean": 0.09375, "rewards/verify_chess_move/std": 0.9917262315750122, "step": 3755 }, { "completion_length": 390.2, "completions/clipped_ratio": 0.0, "completions/max_length": 390.2, "completions/max_terminated_length": 390.2, "completions/mean_length": 131.35390625, "completions/mean_terminated_length": 131.35390625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.003402794997891353, "frac_reward_zero_std": 0.46875, "grad_norm": 0.21772494912147522, "kl": 0.017944681461085566, "learning_rate": 2.685e-07, "loss": 0.0, "num_tokens": 280047016.0, "reward": 0.0578125, "reward_std": 0.4681888520717621, "rewards/verify_chess_move/mean": 0.0578125, "rewards/verify_chess_move/std": 0.9993403792381287, "step": 3760 }, { "completion_length": 379.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 137.0921875, "completions/mean_terminated_length": 137.0921875, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0034073199912396128, "frac_reward_zero_std": 0.5375, "grad_norm": 0.197189062833786, "kl": 0.015151255048112943, "learning_rate": 2.6885714285714286e-07, "loss": 0.0, "num_tokens": 280423462.0, "reward": -0.0453125, "reward_std": 0.41915388107299806, "rewards/verify_chess_move/mean": -0.0453125, "rewards/verify_chess_move/std": 0.9947416424751282, "step": 3765 }, { "completion_length": 359.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 127.1796875, "completions/mean_terminated_length": 127.1796875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003411844984587873, "frac_reward_zero_std": 0.5125, "grad_norm": 0.18294267356395721, "kl": 0.026825577276758848, "learning_rate": 2.692142857142857e-07, "loss": 0.0, "num_tokens": 280783972.0, "reward": 0.0640625, "reward_std": 0.4383602440357208, "rewards/verify_chess_move/mean": 0.0640625, "rewards/verify_chess_move/std": 0.9970897197723388, "step": 3770 }, { "completion_length": 398.4, "completions/clipped_ratio": 0.0, "completions/max_length": 398.4, "completions/max_terminated_length": 398.4, "completions/mean_length": 135.02734375, "completions/mean_terminated_length": 135.02734375, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.0034163699779361325, "frac_reward_zero_std": 0.45625, "grad_norm": 0.4199885129928589, "kl": 0.016294949979055673, "learning_rate": 2.695714285714286e-07, "loss": 0.0, "num_tokens": 281157047.0, "reward": 0.0421875, "reward_std": 0.4700291812419891, "rewards/verify_chess_move/mean": 0.0421875, "rewards/verify_chess_move/std": 0.993982446193695, "step": 3775 }, { "completion_length": 426.4, "completions/clipped_ratio": 0.0, "completions/max_length": 426.4, "completions/max_terminated_length": 426.4, "completions/mean_length": 132.43828125, "completions/mean_terminated_length": 132.43828125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.003420894971284392, "frac_reward_zero_std": 0.50625, "grad_norm": 0.09257037937641144, "kl": 0.026281632026075384, "learning_rate": 2.699285714285714e-07, "loss": 0.0, "num_tokens": 281525816.0, "reward": 0.21875, "reward_std": 0.42958483695983884, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9704480290412902, "step": 3780 }, { "completion_length": 481.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 481.4, "completions/max_terminated_length": 417.8, "completions/mean_length": 131.04609375, "completions/mean_terminated_length": 130.5396926879883, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.003425419964632652, "frac_reward_zero_std": 0.49375, "grad_norm": 0.1768520623445511, "kl": 0.019436012879305055, "learning_rate": 2.702857142857143e-07, "loss": 0.0, "num_tokens": 281894419.0, "reward": 0.025, "reward_std": 0.44004730582237245, "rewards/verify_chess_move/mean": 0.025, "rewards/verify_chess_move/std": 0.9797350287437439, "step": 3785 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.0, "completions/max_length": 367.8, "completions/max_terminated_length": 367.8, "completions/mean_length": 123.69453125, "completions/mean_terminated_length": 123.69453125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.003429944957980912, "frac_reward_zero_std": 0.475, "grad_norm": 0.2237735390663147, "kl": 0.03383632690238301, "learning_rate": 2.706428571428571e-07, "loss": 0.0, "num_tokens": 282250484.0, "reward": 0.090625, "reward_std": 0.4636531710624695, "rewards/verify_chess_move/mean": 0.090625, "rewards/verify_chess_move/std": 0.9949608206748962, "step": 3790 }, { "completion_length": 454.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 454.2, "completions/max_terminated_length": 363.8, "completions/mean_length": 120.94296875, "completions/mean_terminated_length": 120.43126983642578, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0034344699513291716, "frac_reward_zero_std": 0.5, "grad_norm": 0.10063744336366653, "kl": 0.0756409093271941, "learning_rate": 2.7100000000000003e-07, "loss": 0.0001, "num_tokens": 282600835.0, "reward": 0.0984375, "reward_std": 0.44003589153289796, "rewards/verify_chess_move/mean": 0.0984375, "rewards/verify_chess_move/std": 0.9795098066329956, "step": 3795 }, { "completion_length": 335.4, "completions/clipped_ratio": 0.0, "completions/max_length": 335.4, "completions/max_terminated_length": 335.4, "completions/mean_length": 124.0234375, "completions/mean_terminated_length": 124.0234375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0034389949446774312, "frac_reward_zero_std": 0.4875, "grad_norm": 0.27668583393096924, "kl": 0.03532692875014618, "learning_rate": 2.7135714285714284e-07, "loss": 0.0, "num_tokens": 282957569.0, "reward": 0.0625, "reward_std": 0.44609529376029966, "rewards/verify_chess_move/mean": 0.0625, "rewards/verify_chess_move/std": 0.9949660062789917, "step": 3800 }, { "completion_length": 358.6, "completions/clipped_ratio": 0.0, "completions/max_length": 358.6, "completions/max_terminated_length": 358.6, "completions/mean_length": 135.17890625, "completions/mean_terminated_length": 135.17890625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003443519938025691, "frac_reward_zero_std": 0.45625, "grad_norm": 0.20338250696659088, "kl": 0.04151144202623982, "learning_rate": 2.7171428571428575e-07, "loss": 0.0, "num_tokens": 283331974.0, "reward": 0.071875, "reward_std": 0.47481401562690734, "rewards/verify_chess_move/mean": 0.071875, "rewards/verify_chess_move/std": 0.98915194272995, "step": 3805 }, { "completion_length": 353.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 127.23515625, "completions/mean_terminated_length": 127.23515625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.003448044931373951, "frac_reward_zero_std": 0.51875, "grad_norm": 0.3755124807357788, "kl": 0.036283066461328416, "learning_rate": 2.7207142857142856e-07, "loss": 0.0, "num_tokens": 283694203.0, "reward": 0.2390625, "reward_std": 0.4031208992004395, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9692847728729248, "step": 3810 }, { "completion_length": 343.2, "completions/clipped_ratio": 0.0, "completions/max_length": 343.2, "completions/max_terminated_length": 343.2, "completions/mean_length": 120.21328125, "completions/mean_terminated_length": 120.21328125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0034525699247222106, "frac_reward_zero_std": 0.5625, "grad_norm": 0.19813330471515656, "kl": 0.14591198005946354, "learning_rate": 2.7242857142857137e-07, "loss": 0.0001, "num_tokens": 284046780.0, "reward": 0.1359375, "reward_std": 0.38234073519706724, "rewards/verify_chess_move/mean": 0.1359375, "rewards/verify_chess_move/std": 0.9913856148719787, "step": 3815 }, { "completion_length": 525.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 525.8, "completions/max_terminated_length": 427.0, "completions/mean_length": 133.4734375, "completions/mean_terminated_length": 132.96047973632812, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0034570949180704703, "frac_reward_zero_std": 0.46875, "grad_norm": 0.16850562393665314, "kl": 0.4150769894768018, "learning_rate": 2.727857142857143e-07, "loss": 0.0004, "num_tokens": 284417674.0, "reward": 0.025, "reward_std": 0.45414631962776186, "rewards/verify_chess_move/mean": 0.025, "rewards/verify_chess_move/std": 0.9953279733657837, "step": 3820 }, { "completion_length": 408.6, "completions/clipped_ratio": 0.0, "completions/max_length": 408.6, "completions/max_terminated_length": 408.6, "completions/mean_length": 128.4234375, "completions/mean_terminated_length": 128.4234375, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0034616199114187304, "frac_reward_zero_std": 0.525, "grad_norm": 0.31801095604896545, "kl": 0.11550430002680515, "learning_rate": 2.731428571428571e-07, "loss": 0.0001, "num_tokens": 284781216.0, "reward": 0.090625, "reward_std": 0.4220511019229889, "rewards/verify_chess_move/mean": 0.090625, "rewards/verify_chess_move/std": 0.9892750382423401, "step": 3825 }, { "completion_length": 494.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 494.6, "completions/max_terminated_length": 438.0, "completions/mean_length": 135.53046875, "completions/mean_terminated_length": 135.0380096435547, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.00346614490476699, "frac_reward_zero_std": 0.49375, "grad_norm": 0.4301111400127411, "kl": 0.04284508056734922, "learning_rate": 2.735e-07, "loss": 0.0, "num_tokens": 285152519.0, "reward": 0.034375, "reward_std": 0.4366333544254303, "rewards/verify_chess_move/mean": 0.034375, "rewards/verify_chess_move/std": 0.9975394129753112, "step": 3830 }, { "completion_length": 369.4, "completions/clipped_ratio": 0.0, "completions/max_length": 369.4, "completions/max_terminated_length": 369.4, "completions/mean_length": 124.90234375, "completions/mean_terminated_length": 124.90234375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0034706698981152497, "frac_reward_zero_std": 0.50625, "grad_norm": 0.44511160254478455, "kl": 0.0663584387715673, "learning_rate": 2.7385714285714287e-07, "loss": 0.0001, "num_tokens": 285509122.0, "reward": 0.15, "reward_std": 0.43137513399124144, "rewards/verify_chess_move/mean": 0.15, "rewards/verify_chess_move/std": 0.9880056858062745, "step": 3835 }, { "completion_length": 345.8, "completions/clipped_ratio": 0.0, "completions/max_length": 345.8, "completions/max_terminated_length": 345.8, "completions/mean_length": 132.32734375, "completions/mean_terminated_length": 132.32734375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0034751948914635094, "frac_reward_zero_std": 0.44375, "grad_norm": 0.6567649841308594, "kl": 0.04528543037886266, "learning_rate": 2.742142857142857e-07, "loss": 0.0, "num_tokens": 285879413.0, "reward": -0.0375, "reward_std": 0.479810631275177, "rewards/verify_chess_move/mean": -0.0375, "rewards/verify_chess_move/std": 0.99055016040802, "step": 3840 }, { "completion_length": 397.4, "completions/clipped_ratio": 0.0, "completions/max_length": 397.4, "completions/max_terminated_length": 397.4, "completions/mean_length": 114.428125, "completions/mean_terminated_length": 114.428125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0034797198848117695, "frac_reward_zero_std": 0.425, "grad_norm": 0.21836808323860168, "kl": 0.0553420003503561, "learning_rate": 2.745714285714286e-07, "loss": 0.0001, "num_tokens": 286220601.0, "reward": 0.2234375, "reward_std": 0.4908057928085327, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9653066039085388, "step": 3845 }, { "completion_length": 361.6, "completions/clipped_ratio": 0.0, "completions/max_length": 361.6, "completions/max_terminated_length": 361.6, "completions/mean_length": 141.315625, "completions/mean_terminated_length": 141.315625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.003484244878160029, "frac_reward_zero_std": 0.54375, "grad_norm": 0.28613680601119995, "kl": 0.10583260919083841, "learning_rate": 2.749285714285714e-07, "loss": 0.0001, "num_tokens": 286605125.0, "reward": 0.0125, "reward_std": 0.40742588639259336, "rewards/verify_chess_move/mean": 0.0125, "rewards/verify_chess_move/std": 0.9994549870491027, "step": 3850 }, { "completion_length": 385.4, "completions/clipped_ratio": 0.0, "completions/max_length": 385.4, "completions/max_terminated_length": 385.4, "completions/mean_length": 135.9640625, "completions/mean_terminated_length": 135.9640625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.003488769871508289, "frac_reward_zero_std": 0.45, "grad_norm": 0.5224851965904236, "kl": 0.2618451435555471, "learning_rate": 2.752857142857143e-07, "loss": 0.0003, "num_tokens": 286977991.0, "reward": 0.01875, "reward_std": 0.49332440495491026, "rewards/verify_chess_move/mean": 0.01875, "rewards/verify_chess_move/std": 0.9981158971786499, "step": 3855 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 130.2375, "completions/mean_terminated_length": 130.2375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.003493294864856549, "frac_reward_zero_std": 0.5375, "grad_norm": 0.26215067505836487, "kl": 0.2671811778767733, "learning_rate": 2.756428571428571e-07, "loss": 0.0003, "num_tokens": 287341983.0, "reward": 0.1953125, "reward_std": 0.4032211899757385, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9770781755447387, "step": 3860 }, { "completion_length": 328.6, "completions/clipped_ratio": 0.0, "completions/max_length": 328.6, "completions/max_terminated_length": 328.6, "completions/mean_length": 125.13984375, "completions/mean_terminated_length": 125.13984375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0034978198582048085, "frac_reward_zero_std": 0.5375, "grad_norm": 0.14651863276958466, "kl": 0.06699127441388555, "learning_rate": 2.7600000000000004e-07, "loss": 0.0001, "num_tokens": 287700058.0, "reward": 0.1296875, "reward_std": 0.38912923336029054, "rewards/verify_chess_move/mean": 0.1296875, "rewards/verify_chess_move/std": 0.9771017909049988, "step": 3865 }, { "completion_length": 369.6, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/max_terminated_length": 369.6, "completions/mean_length": 126.91015625, "completions/mean_terminated_length": 126.91015625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.003502344851553068, "frac_reward_zero_std": 0.54375, "grad_norm": 0.30467885732650757, "kl": 0.0590732087061042, "learning_rate": 2.7635714285714284e-07, "loss": 0.0001, "num_tokens": 288059783.0, "reward": 0.1203125, "reward_std": 0.39590218663215637, "rewards/verify_chess_move/mean": 0.1203125, "rewards/verify_chess_move/std": 0.9801393747329712, "step": 3870 }, { "completion_length": 385.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 141.11953125, "completions/mean_terminated_length": 141.11953125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.003506869844901328, "frac_reward_zero_std": 0.5, "grad_norm": 0.24906566739082336, "kl": 0.04608185955439694, "learning_rate": 2.7671428571428565e-07, "loss": 0.0, "num_tokens": 288440160.0, "reward": 0.0203125, "reward_std": 0.4368311405181885, "rewards/verify_chess_move/mean": 0.0203125, "rewards/verify_chess_move/std": 0.9879185438156128, "step": 3875 }, { "completion_length": 373.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 124.4171875, "completions/mean_terminated_length": 124.4171875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.003511394838249588, "frac_reward_zero_std": 0.45, "grad_norm": 0.25827860832214355, "kl": 0.042465720398467965, "learning_rate": 2.7707142857142857e-07, "loss": 0.0, "num_tokens": 288794174.0, "reward": 0.1375, "reward_std": 0.47513256669044496, "rewards/verify_chess_move/mean": 0.1375, "rewards/verify_chess_move/std": 0.9822194933891296, "step": 3880 }, { "completion_length": 426.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.2, "completions/max_terminated_length": 342.0, "completions/mean_length": 123.27265625, "completions/mean_terminated_length": 122.76705780029297, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.0035159198315978476, "frac_reward_zero_std": 0.51875, "grad_norm": 0.1704990118741989, "kl": 0.04953849818557501, "learning_rate": 2.774285714285714e-07, "loss": 0.0, "num_tokens": 289148547.0, "reward": 0.10625, "reward_std": 0.4219472765922546, "rewards/verify_chess_move/mean": 0.10625, "rewards/verify_chess_move/std": 0.9912440180778503, "step": 3885 }, { "completion_length": 339.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 130.08671875, "completions/mean_terminated_length": 130.08671875, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.0035204448249461073, "frac_reward_zero_std": 0.49375, "grad_norm": 0.29801779985427856, "kl": 0.06519894828961696, "learning_rate": 2.777857142857143e-07, "loss": 0.0001, "num_tokens": 289513146.0, "reward": 0.025, "reward_std": 0.46271690130233767, "rewards/verify_chess_move/mean": 0.025, "rewards/verify_chess_move/std": 1.000048005580902, "step": 3890 }, { "completion_length": 426.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.4, "completions/max_terminated_length": 360.8, "completions/mean_length": 129.34296875, "completions/mean_terminated_length": 128.84929809570312, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.003524969818294367, "frac_reward_zero_std": 0.4875, "grad_norm": 0.14066451787948608, "kl": 0.03844898488605395, "learning_rate": 2.781428571428571e-07, "loss": 0.0, "num_tokens": 289877721.0, "reward": 0.1265625, "reward_std": 0.44388163685798643, "rewards/verify_chess_move/mean": 0.1265625, "rewards/verify_chess_move/std": 0.9870067477226258, "step": 3895 }, { "completion_length": 372.2, "completions/clipped_ratio": 0.0, "completions/max_length": 372.2, "completions/max_terminated_length": 372.2, "completions/mean_length": 130.88671875, "completions/mean_terminated_length": 130.88671875, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.003529494811642627, "frac_reward_zero_std": 0.45, "grad_norm": 0.2202732414007187, "kl": 0.032360901750507765, "learning_rate": 2.785e-07, "loss": 0.0, "num_tokens": 290241952.0, "reward": 0.1234375, "reward_std": 0.47692384719848635, "rewards/verify_chess_move/mean": 0.1234375, "rewards/verify_chess_move/std": 0.987653386592865, "step": 3900 }, { "completion_length": 356.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 127.23828125, "completions/mean_terminated_length": 127.23828125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0035340198049908867, "frac_reward_zero_std": 0.4875, "grad_norm": 0.2415076047182083, "kl": 0.04814516774204094, "learning_rate": 2.788571428571428e-07, "loss": 0.0, "num_tokens": 290601569.0, "reward": 0.1453125, "reward_std": 0.45382524132728574, "rewards/verify_chess_move/mean": 0.1453125, "rewards/verify_chess_move/std": 0.9807045340538025, "step": 3905 }, { "completion_length": 357.8, "completions/clipped_ratio": 0.0, "completions/max_length": 357.8, "completions/max_terminated_length": 357.8, "completions/mean_length": 133.51953125, "completions/mean_terminated_length": 133.51953125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0035385447983391463, "frac_reward_zero_std": 0.4125, "grad_norm": 0.2273244708776474, "kl": 0.07240807771449909, "learning_rate": 2.792142857142857e-07, "loss": 0.0001, "num_tokens": 290972474.0, "reward": 0.1234375, "reward_std": 0.5247904539108277, "rewards/verify_chess_move/mean": 0.1234375, "rewards/verify_chess_move/std": 0.9922232151031494, "step": 3910 }, { "completion_length": 460.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 460.6, "completions/max_terminated_length": 390.0, "completions/mean_length": 126.98046875, "completions/mean_terminated_length": 126.49681091308594, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0035430697916874064, "frac_reward_zero_std": 0.51875, "grad_norm": 0.280865877866745, "kl": 0.0580914468242554, "learning_rate": 2.7957142857142854e-07, "loss": 0.0001, "num_tokens": 291334393.0, "reward": 0.2171875, "reward_std": 0.42194628715515137, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9762595653533935, "step": 3915 }, { "completion_length": 473.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 473.8, "completions/max_terminated_length": 398.6, "completions/mean_length": 143.79765625, "completions/mean_terminated_length": 143.3334228515625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.003547594785035666, "frac_reward_zero_std": 0.45, "grad_norm": 0.11423412710428238, "kl": 0.048488945548888295, "learning_rate": 2.799285714285714e-07, "loss": 0.0, "num_tokens": 291719310.0, "reward": 0.115625, "reward_std": 0.4724471092224121, "rewards/verify_chess_move/mean": 0.115625, "rewards/verify_chess_move/std": 0.9916316628456116, "step": 3920 }, { "completion_length": 367.4, "completions/clipped_ratio": 0.0, "completions/max_length": 367.4, "completions/max_terminated_length": 367.4, "completions/mean_length": 125.746875, "completions/mean_terminated_length": 125.746875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0035521197783839257, "frac_reward_zero_std": 0.525, "grad_norm": 0.12448619306087494, "kl": 0.03675382022338454, "learning_rate": 2.8028571428571427e-07, "loss": 0.0, "num_tokens": 292079330.0, "reward": 0.0, "reward_std": 0.40869157314300536, "rewards/verify_chess_move/mean": 0.0, "rewards/verify_chess_move/std": 0.9730927348136902, "step": 3925 }, { "completion_length": 397.2, "completions/clipped_ratio": 0.0, "completions/max_length": 397.2, "completions/max_terminated_length": 397.2, "completions/mean_length": 125.0640625, "completions/mean_terminated_length": 125.0640625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0035566447717321854, "frac_reward_zero_std": 0.51875, "grad_norm": 0.20631501078605652, "kl": 0.020401117447181604, "learning_rate": 2.8064285714285713e-07, "loss": 0.0, "num_tokens": 292438372.0, "reward": 0.1046875, "reward_std": 0.42736568450927737, "rewards/verify_chess_move/mean": 0.1046875, "rewards/verify_chess_move/std": 0.9927342176437378, "step": 3930 }, { "completion_length": 355.8, "completions/clipped_ratio": 0.0, "completions/max_length": 355.8, "completions/max_terminated_length": 355.8, "completions/mean_length": 132.36328125, "completions/mean_terminated_length": 132.36328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0035611697650804455, "frac_reward_zero_std": 0.48125, "grad_norm": 0.18605946004390717, "kl": 0.022236078951391392, "learning_rate": 2.8100000000000004e-07, "loss": 0.0, "num_tokens": 292806509.0, "reward": 0.1359375, "reward_std": 0.44594202637672425, "rewards/verify_chess_move/mean": 0.1359375, "rewards/verify_chess_move/std": 0.9915688037872314, "step": 3935 }, { "completion_length": 433.6, "completions/clipped_ratio": 0.0, "completions/max_length": 433.6, "completions/max_terminated_length": 433.6, "completions/mean_length": 125.446875, "completions/mean_terminated_length": 125.446875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.003565694758428705, "frac_reward_zero_std": 0.525, "grad_norm": 0.173467755317688, "kl": 0.014619255112484098, "learning_rate": 2.8135714285714285e-07, "loss": 0.0, "num_tokens": 293166393.0, "reward": 0.1203125, "reward_std": 0.4126975178718567, "rewards/verify_chess_move/mean": 0.1203125, "rewards/verify_chess_move/std": 0.9916386127471923, "step": 3940 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 132.56484375, "completions/mean_terminated_length": 132.56484375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.003570219751776965, "frac_reward_zero_std": 0.5375, "grad_norm": 0.12596724927425385, "kl": 0.02063074832840357, "learning_rate": 2.8171428571428566e-07, "loss": 0.0, "num_tokens": 293535236.0, "reward": 0.0609375, "reward_std": 0.39916975498199464, "rewards/verify_chess_move/mean": 0.0609375, "rewards/verify_chess_move/std": 0.9971395015716553, "step": 3945 }, { "completion_length": 338.2, "completions/clipped_ratio": 0.0, "completions/max_length": 338.2, "completions/max_terminated_length": 338.2, "completions/mean_length": 123.65, "completions/mean_terminated_length": 123.65, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0035747447451252245, "frac_reward_zero_std": 0.45625, "grad_norm": 0.23275478184223175, "kl": 0.05027842346171383, "learning_rate": 2.8207142857142857e-07, "loss": 0.0001, "num_tokens": 293889692.0, "reward": 0.1953125, "reward_std": 0.4804475247859955, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9774452924728394, "step": 3950 }, { "completion_length": 413.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 126.6, "completions/mean_terminated_length": 126.6, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0035792697384734846, "frac_reward_zero_std": 0.53125, "grad_norm": 0.18972691893577576, "kl": 0.02815484840539284, "learning_rate": 2.824285714285714e-07, "loss": 0.0, "num_tokens": 294247652.0, "reward": 0.109375, "reward_std": 0.4115293025970459, "rewards/verify_chess_move/mean": 0.109375, "rewards/verify_chess_move/std": 0.981152868270874, "step": 3955 }, { "completion_length": 338.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 135.7859375, "completions/mean_terminated_length": 135.7859375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0035837947318217442, "frac_reward_zero_std": 0.59375, "grad_norm": 0.23152227699756622, "kl": 0.03339834965881892, "learning_rate": 2.827857142857143e-07, "loss": 0.0, "num_tokens": 294620210.0, "reward": 0.109375, "reward_std": 0.36250282526016236, "rewards/verify_chess_move/mean": 0.109375, "rewards/verify_chess_move/std": 0.9796836614608765, "step": 3960 }, { "completion_length": 342.6, "completions/clipped_ratio": 0.0, "completions/max_length": 342.6, "completions/max_terminated_length": 342.6, "completions/mean_length": 137.07109375, "completions/mean_terminated_length": 137.07109375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.003588319725170004, "frac_reward_zero_std": 0.4375, "grad_norm": 0.31313595175743103, "kl": 0.027350232825847344, "learning_rate": 2.831428571428571e-07, "loss": 0.0, "num_tokens": 294996437.0, "reward": 0.090625, "reward_std": 0.5026805520057678, "rewards/verify_chess_move/mean": 0.090625, "rewards/verify_chess_move/std": 0.9896253824234009, "step": 3965 }, { "completion_length": 383.6, "completions/clipped_ratio": 0.0, "completions/max_length": 383.6, "completions/max_terminated_length": 383.6, "completions/mean_length": 130.51328125, "completions/mean_terminated_length": 130.51328125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.003592844718518264, "frac_reward_zero_std": 0.5, "grad_norm": 0.14125114679336548, "kl": 0.02069546107086353, "learning_rate": 2.8349999999999996e-07, "loss": 0.0, "num_tokens": 295359894.0, "reward": 0.175, "reward_std": 0.4454077839851379, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.982622754573822, "step": 3970 }, { "completion_length": 385.6, "completions/clipped_ratio": 0.0, "completions/max_length": 385.6, "completions/max_terminated_length": 385.6, "completions/mean_length": 120.36875, "completions/mean_terminated_length": 120.36875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0035973697118665236, "frac_reward_zero_std": 0.50625, "grad_norm": 0.1378290057182312, "kl": 0.025832336515304633, "learning_rate": 2.838571428571428e-07, "loss": 0.0, "num_tokens": 295710518.0, "reward": 0.19375, "reward_std": 0.43015059232711794, "rewards/verify_chess_move/mean": 0.19375, "rewards/verify_chess_move/std": 0.9669874548912049, "step": 3975 }, { "completion_length": 354.8, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/max_terminated_length": 354.8, "completions/mean_length": 143.9921875, "completions/mean_terminated_length": 143.9921875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0036018947052147833, "frac_reward_zero_std": 0.44375, "grad_norm": 0.17232327163219452, "kl": 0.027501331217354163, "learning_rate": 2.842142857142857e-07, "loss": 0.0, "num_tokens": 296095820.0, "reward": 0.040625, "reward_std": 0.498331880569458, "rewards/verify_chess_move/mean": 0.040625, "rewards/verify_chess_move/std": 0.9960699081420898, "step": 3980 }, { "completion_length": 341.2, "completions/clipped_ratio": 0.0, "completions/max_length": 341.2, "completions/max_terminated_length": 341.2, "completions/mean_length": 132.1359375, "completions/mean_terminated_length": 132.1359375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.003606419698563043, "frac_reward_zero_std": 0.4125, "grad_norm": 0.16627560555934906, "kl": 0.018301051680464298, "learning_rate": 2.8457142857142855e-07, "loss": 0.0, "num_tokens": 296463698.0, "reward": 0.021875, "reward_std": 0.5205694198608398, "rewards/verify_chess_move/mean": 0.021875, "rewards/verify_chess_move/std": 0.9924443125724792, "step": 3985 }, { "completion_length": 358.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 127.74140625, "completions/mean_terminated_length": 127.74140625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.003610944691911303, "frac_reward_zero_std": 0.55625, "grad_norm": 0.1467762589454651, "kl": 0.03065905330295209, "learning_rate": 2.849285714285714e-07, "loss": 0.0, "num_tokens": 296825663.0, "reward": 0.1015625, "reward_std": 0.38385859727859495, "rewards/verify_chess_move/mean": 0.1015625, "rewards/verify_chess_move/std": 0.9901143908500671, "step": 3990 }, { "completion_length": 350.4, "completions/clipped_ratio": 0.0, "completions/max_length": 350.4, "completions/max_terminated_length": 350.4, "completions/mean_length": 129.5484375, "completions/mean_terminated_length": 129.5484375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0036154696852595627, "frac_reward_zero_std": 0.5125, "grad_norm": 0.13416752219200134, "kl": 0.07509074535337276, "learning_rate": 2.8528571428571427e-07, "loss": 0.0001, "num_tokens": 297191437.0, "reward": 0.0765625, "reward_std": 0.43178410530090333, "rewards/verify_chess_move/mean": 0.0765625, "rewards/verify_chess_move/std": 0.9874271273612976, "step": 3995 }, { "completion_length": 384.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 137.99453125, "completions/mean_terminated_length": 137.99453125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0036199946786078224, "frac_reward_zero_std": 0.49375, "grad_norm": 0.17422668635845184, "kl": 0.050145735091064125, "learning_rate": 2.8564285714285713e-07, "loss": 0.0001, "num_tokens": 297569182.0, "reward": 0.13125, "reward_std": 0.44056854844093324, "rewards/verify_chess_move/mean": 0.13125, "rewards/verify_chess_move/std": 0.9914987683296204, "step": 4000 }, { "completion_length": 409.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 409.6, "completions/max_terminated_length": 339.2, "completions/mean_length": 141.72265625, "completions/mean_terminated_length": 141.24432983398438, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.003624519671956082, "frac_reward_zero_std": 0.4875, "grad_norm": 0.19767312705516815, "kl": 0.03630173655983526, "learning_rate": 2.8599999999999994e-07, "loss": 0.0, "num_tokens": 297950763.0, "reward": 0.0703125, "reward_std": 0.44340885877609254, "rewards/verify_chess_move/mean": 0.0703125, "rewards/verify_chess_move/std": 0.995475685596466, "step": 4005 }, { "completion_length": 395.4, "completions/clipped_ratio": 0.0, "completions/max_length": 395.4, "completions/max_terminated_length": 395.4, "completions/mean_length": 130.25546875, "completions/mean_terminated_length": 130.25546875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.003629044665304342, "frac_reward_zero_std": 0.55625, "grad_norm": 0.1424729824066162, "kl": 0.04675099955056794, "learning_rate": 2.8635714285714285e-07, "loss": 0.0, "num_tokens": 298315690.0, "reward": 0.134375, "reward_std": 0.3701454699039459, "rewards/verify_chess_move/mean": 0.134375, "rewards/verify_chess_move/std": 0.9856500267982483, "step": 4010 }, { "completion_length": 374.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 123.83515625, "completions/mean_terminated_length": 123.83515625, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.0036335696586526018, "frac_reward_zero_std": 0.5125, "grad_norm": 0.1730610728263855, "kl": 0.05446979990811087, "learning_rate": 2.8671428571428566e-07, "loss": 0.0001, "num_tokens": 298671055.0, "reward": 0.2234375, "reward_std": 0.43114858865737915, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9709560394287109, "step": 4015 }, { "completion_length": 349.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 129.3671875, "completions/mean_terminated_length": 129.3671875, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0036380946520008614, "frac_reward_zero_std": 0.51875, "grad_norm": 0.21033768355846405, "kl": 0.040829136632964946, "learning_rate": 2.870714285714286e-07, "loss": 0.0, "num_tokens": 299036709.0, "reward": 0.1203125, "reward_std": 0.4292044460773468, "rewards/verify_chess_move/mean": 0.1203125, "rewards/verify_chess_move/std": 0.9871881008148193, "step": 4020 }, { "completion_length": 381.2, "completions/clipped_ratio": 0.0, "completions/max_length": 381.2, "completions/max_terminated_length": 381.2, "completions/mean_length": 133.44609375, "completions/mean_terminated_length": 133.44609375, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.0036426196453491215, "frac_reward_zero_std": 0.53125, "grad_norm": 0.38384583592414856, "kl": 0.02787007091101259, "learning_rate": 2.874285714285714e-07, "loss": 0.0, "num_tokens": 299407768.0, "reward": 0.0859375, "reward_std": 0.3847608149051666, "rewards/verify_chess_move/mean": 0.0859375, "rewards/verify_chess_move/std": 0.9920645594596863, "step": 4025 }, { "completion_length": 419.6, "completions/clipped_ratio": 0.0, "completions/max_length": 419.6, "completions/max_terminated_length": 419.6, "completions/mean_length": 128.890625, "completions/mean_terminated_length": 128.890625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003647144638697381, "frac_reward_zero_std": 0.55625, "grad_norm": 0.39679449796676636, "kl": 0.036663099672296084, "learning_rate": 2.877857142857143e-07, "loss": 0.0, "num_tokens": 299774636.0, "reward": 0.0859375, "reward_std": 0.38360289931297303, "rewards/verify_chess_move/mean": 0.0859375, "rewards/verify_chess_move/std": 0.9729389905929565, "step": 4030 }, { "completion_length": 426.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.8, "completions/max_terminated_length": 351.8, "completions/mean_length": 138.17109375, "completions/mean_terminated_length": 137.69571838378906, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.003651669632045641, "frac_reward_zero_std": 0.49375, "grad_norm": 0.2701779901981354, "kl": 0.047278601766447534, "learning_rate": 2.881428571428571e-07, "loss": 0.0, "num_tokens": 300150127.0, "reward": 0.1765625, "reward_std": 0.4561902046203613, "rewards/verify_chess_move/mean": 0.1765625, "rewards/verify_chess_move/std": 0.9556947469711303, "step": 4035 }, { "completion_length": 419.4, "completions/clipped_ratio": 0.0, "completions/max_length": 419.4, "completions/max_terminated_length": 419.4, "completions/mean_length": 127.3640625, "completions/mean_terminated_length": 127.3640625, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.0036561946253939005, "frac_reward_zero_std": 0.46875, "grad_norm": 0.20579275488853455, "kl": 0.045581045569269917, "learning_rate": 2.8849999999999997e-07, "loss": 0.0, "num_tokens": 300510649.0, "reward": 0.171875, "reward_std": 0.4497383415699005, "rewards/verify_chess_move/mean": 0.171875, "rewards/verify_chess_move/std": 0.9760148048400878, "step": 4040 }, { "completion_length": 456.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 456.6, "completions/max_terminated_length": 386.8, "completions/mean_length": 134.63125, "completions/mean_terminated_length": 134.13746948242186, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0036607196187421606, "frac_reward_zero_std": 0.425, "grad_norm": 0.5541471242904663, "kl": 0.49073665858013554, "learning_rate": 2.8885714285714283e-07, "loss": 0.0005, "num_tokens": 300883057.0, "reward": 0.1109375, "reward_std": 0.49592297077178954, "rewards/verify_chess_move/mean": 0.1109375, "rewards/verify_chess_move/std": 0.9879557967185975, "step": 4045 }, { "completion_length": 338.8, "completions/clipped_ratio": 0.0, "completions/max_length": 338.8, "completions/max_terminated_length": 338.8, "completions/mean_length": 130.48125, "completions/mean_terminated_length": 130.48125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0036652446120904202, "frac_reward_zero_std": 0.45, "grad_norm": 0.3090466260910034, "kl": 0.11444262269069441, "learning_rate": 2.892142857142857e-07, "loss": 0.0001, "num_tokens": 301250345.0, "reward": 0.165625, "reward_std": 0.47291791439056396, "rewards/verify_chess_move/mean": 0.165625, "rewards/verify_chess_move/std": 0.975056791305542, "step": 4050 }, { "completion_length": 336.2, "completions/clipped_ratio": 0.0, "completions/max_length": 336.2, "completions/max_terminated_length": 336.2, "completions/mean_length": 132.9421875, "completions/mean_terminated_length": 132.9421875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.00366976960543868, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1943763643503189, "kl": 0.0740189810632728, "learning_rate": 2.8957142857142855e-07, "loss": 0.0001, "num_tokens": 301622031.0, "reward": 0.0515625, "reward_std": 0.38002920150756836, "rewards/verify_chess_move/mean": 0.0515625, "rewards/verify_chess_move/std": 0.9986043930053711, "step": 4055 }, { "completion_length": 395.2, "completions/clipped_ratio": 0.0, "completions/max_length": 395.2, "completions/max_terminated_length": 395.2, "completions/mean_length": 125.01875, "completions/mean_terminated_length": 125.01875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.00367429459878694, "frac_reward_zero_std": 0.5, "grad_norm": 0.2655045986175537, "kl": 0.06512042230751831, "learning_rate": 2.899285714285714e-07, "loss": 0.0001, "num_tokens": 301977671.0, "reward": 0.1296875, "reward_std": 0.4454552710056305, "rewards/verify_chess_move/mean": 0.1296875, "rewards/verify_chess_move/std": 0.9875566959381104, "step": 4060 }, { "completion_length": 393.6, "completions/clipped_ratio": 0.0, "completions/max_length": 393.6, "completions/max_terminated_length": 393.6, "completions/mean_length": 130.446875, "completions/mean_terminated_length": 130.446875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0036788195921351997, "frac_reward_zero_std": 0.55, "grad_norm": 0.09814323484897614, "kl": 0.02543923429620918, "learning_rate": 2.902857142857142e-07, "loss": 0.0, "num_tokens": 302345835.0, "reward": 0.11875, "reward_std": 0.3937478005886078, "rewards/verify_chess_move/mean": 0.11875, "rewards/verify_chess_move/std": 0.9918523073196411, "step": 4065 }, { "completion_length": 444.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.4, "completions/max_terminated_length": 366.6, "completions/mean_length": 125.79921875, "completions/mean_terminated_length": 125.290966796875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0036833445854834593, "frac_reward_zero_std": 0.475, "grad_norm": 0.17486463487148285, "kl": 0.025309367694717366, "learning_rate": 2.9064285714285714e-07, "loss": 0.0, "num_tokens": 302703562.0, "reward": 0.0171875, "reward_std": 0.4620983123779297, "rewards/verify_chess_move/mean": 0.0171875, "rewards/verify_chess_move/std": 0.997198474407196, "step": 4070 }, { "completion_length": 376.4, "completions/clipped_ratio": 0.0, "completions/max_length": 376.4, "completions/max_terminated_length": 376.4, "completions/mean_length": 135.65, "completions/mean_terminated_length": 135.65, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.003687869578831719, "frac_reward_zero_std": 0.575, "grad_norm": 0.14715427160263062, "kl": 0.02463620200578589, "learning_rate": 2.9099999999999995e-07, "loss": 0.0, "num_tokens": 303078082.0, "reward": 0.096875, "reward_std": 0.37965075969696044, "rewards/verify_chess_move/mean": 0.096875, "rewards/verify_chess_move/std": 0.9912105202674866, "step": 4075 }, { "completion_length": 349.8, "completions/clipped_ratio": 0.0, "completions/max_length": 349.8, "completions/max_terminated_length": 349.8, "completions/mean_length": 124.09296875, "completions/mean_terminated_length": 124.09296875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.003692394572179979, "frac_reward_zero_std": 0.51875, "grad_norm": 0.281037300825119, "kl": 0.029292282002279534, "learning_rate": 2.9135714285714286e-07, "loss": 0.0, "num_tokens": 303435297.0, "reward": 0.11875, "reward_std": 0.41306352615356445, "rewards/verify_chess_move/mean": 0.11875, "rewards/verify_chess_move/std": 0.9904058337211609, "step": 4080 }, { "completion_length": 352.2, "completions/clipped_ratio": 0.0, "completions/max_length": 352.2, "completions/max_terminated_length": 352.2, "completions/mean_length": 123.4578125, "completions/mean_terminated_length": 123.4578125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0036969195655282387, "frac_reward_zero_std": 0.5875, "grad_norm": 0.161737859249115, "kl": 0.04184608473151456, "learning_rate": 2.9171428571428567e-07, "loss": 0.0, "num_tokens": 303790123.0, "reward": 0.209375, "reward_std": 0.36056219339370726, "rewards/verify_chess_move/mean": 0.209375, "rewards/verify_chess_move/std": 0.9778414964675903, "step": 4085 }, { "completion_length": 354.8, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/max_terminated_length": 354.8, "completions/mean_length": 117.63515625, "completions/mean_terminated_length": 117.63515625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0037014445588764984, "frac_reward_zero_std": 0.46875, "grad_norm": 0.7179434895515442, "kl": 0.1135266064782627, "learning_rate": 2.920714285714286e-07, "loss": 0.0001, "num_tokens": 304137832.0, "reward": 0.0546875, "reward_std": 0.4527793645858765, "rewards/verify_chess_move/mean": 0.0546875, "rewards/verify_chess_move/std": 0.9814206480979919, "step": 4090 }, { "completion_length": 365.8, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/max_terminated_length": 365.8, "completions/mean_length": 132.84140625, "completions/mean_terminated_length": 132.84140625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.003705969552224758, "frac_reward_zero_std": 0.55625, "grad_norm": 0.4360557198524475, "kl": 0.09966555833234451, "learning_rate": 2.924285714285714e-07, "loss": 0.0001, "num_tokens": 304507029.0, "reward": 0.16875, "reward_std": 0.3771924257278442, "rewards/verify_chess_move/mean": 0.16875, "rewards/verify_chess_move/std": 0.9804231882095337, "step": 4095 }, { "completion_length": 462.6, "completions/clipped_ratio": 0.0, "completions/max_length": 462.6, "completions/max_terminated_length": 462.6, "completions/mean_length": 131.83125, "completions/mean_terminated_length": 131.83125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.003710494545573018, "frac_reward_zero_std": 0.53125, "grad_norm": 0.15244966745376587, "kl": 0.20108691847417504, "learning_rate": 2.9278571428571425e-07, "loss": 0.0002, "num_tokens": 304874773.0, "reward": 0.1078125, "reward_std": 0.4169981241226196, "rewards/verify_chess_move/mean": 0.1078125, "rewards/verify_chess_move/std": 0.9905053973197937, "step": 4100 }, { "completion_length": 352.6, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/max_terminated_length": 352.6, "completions/mean_length": 130.196875, "completions/mean_terminated_length": 130.196875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.003715019538921278, "frac_reward_zero_std": 0.55625, "grad_norm": 0.16388629376888275, "kl": 0.11031227066414431, "learning_rate": 2.931428571428571e-07, "loss": 0.0001, "num_tokens": 305237929.0, "reward": 0.109375, "reward_std": 0.3792888641357422, "rewards/verify_chess_move/mean": 0.109375, "rewards/verify_chess_move/std": 0.9902008414268494, "step": 4105 }, { "completion_length": 348.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 134.7546875, "completions/mean_terminated_length": 134.7546875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0037195445322695374, "frac_reward_zero_std": 0.53125, "grad_norm": 0.16612187027931213, "kl": 0.04353899033158086, "learning_rate": 2.935e-07, "loss": 0.0, "num_tokens": 305610975.0, "reward": 0.0328125, "reward_std": 0.40469648241996764, "rewards/verify_chess_move/mean": 0.0328125, "rewards/verify_chess_move/std": 0.9936658382415772, "step": 4110 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 137.3984375, "completions/mean_terminated_length": 137.3984375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0037240695256177975, "frac_reward_zero_std": 0.4875, "grad_norm": 0.18231232464313507, "kl": 0.022203177894698455, "learning_rate": 2.9385714285714284e-07, "loss": 0.0, "num_tokens": 305986381.0, "reward": 0.1703125, "reward_std": 0.4561793565750122, "rewards/verify_chess_move/mean": 0.1703125, "rewards/verify_chess_move/std": 0.9737695097923279, "step": 4115 }, { "completion_length": 346.6, "completions/clipped_ratio": 0.0, "completions/max_length": 346.6, "completions/max_terminated_length": 346.6, "completions/mean_length": 125.4734375, "completions/mean_terminated_length": 125.4734375, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.003728594518966057, "frac_reward_zero_std": 0.6125, "grad_norm": 0.4961639642715454, "kl": 0.01931017726310529, "learning_rate": 2.942142857142857e-07, "loss": 0.0, "num_tokens": 306345947.0, "reward": 0.121875, "reward_std": 0.35082663893699645, "rewards/verify_chess_move/mean": 0.121875, "rewards/verify_chess_move/std": 0.991531765460968, "step": 4120 }, { "completion_length": 369.8, "completions/clipped_ratio": 0.0, "completions/max_length": 369.8, "completions/max_terminated_length": 369.8, "completions/mean_length": 131.82890625, "completions/mean_terminated_length": 131.82890625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.003733119512314317, "frac_reward_zero_std": 0.44375, "grad_norm": 0.3142687976360321, "kl": 0.02058941992581822, "learning_rate": 2.9457142857142856e-07, "loss": 0.0, "num_tokens": 306713440.0, "reward": 0.1984375, "reward_std": 0.47771322131156924, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.9750577092170716, "step": 4125 }, { "completion_length": 435.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 127.3390625, "completions/mean_terminated_length": 127.3390625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0037376445056625765, "frac_reward_zero_std": 0.49375, "grad_norm": 0.359960675239563, "kl": 0.0256281300316914, "learning_rate": 2.949285714285714e-07, "loss": 0.0, "num_tokens": 307074610.0, "reward": 0.1453125, "reward_std": 0.4366808235645294, "rewards/verify_chess_move/mean": 0.1453125, "rewards/verify_chess_move/std": 0.9885692477226258, "step": 4130 }, { "completion_length": 379.6, "completions/clipped_ratio": 0.0, "completions/max_length": 379.6, "completions/max_terminated_length": 379.6, "completions/mean_length": 131.94765625, "completions/mean_terminated_length": 131.94765625, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.0037421694990108366, "frac_reward_zero_std": 0.525, "grad_norm": 0.1367947906255722, "kl": 0.05675532007298898, "learning_rate": 2.9528571428571423e-07, "loss": 0.0001, "num_tokens": 307443663.0, "reward": 0.0296875, "reward_std": 0.418636167049408, "rewards/verify_chess_move/mean": 0.0296875, "rewards/verify_chess_move/std": 0.969741940498352, "step": 4135 }, { "completion_length": 374.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 133.30390625, "completions/mean_terminated_length": 133.30390625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0037466944923590963, "frac_reward_zero_std": 0.54375, "grad_norm": 0.2704377770423889, "kl": 0.10795636521652341, "learning_rate": 2.9564285714285714e-07, "loss": 0.0001, "num_tokens": 307813804.0, "reward": 0.10625, "reward_std": 0.38949565291404725, "rewards/verify_chess_move/mean": 0.10625, "rewards/verify_chess_move/std": 0.9887852191925048, "step": 4140 }, { "completion_length": 374.2, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 132.1734375, "completions/mean_terminated_length": 132.1734375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.003751219485707356, "frac_reward_zero_std": 0.6, "grad_norm": 0.2555225193500519, "kl": 0.11795258750207722, "learning_rate": 2.9599999999999995e-07, "loss": 0.0001, "num_tokens": 308181474.0, "reward": 0.153125, "reward_std": 0.3456200182437897, "rewards/verify_chess_move/mean": 0.153125, "rewards/verify_chess_move/std": 0.9820945978164672, "step": 4145 }, { "completion_length": 349.6, "completions/clipped_ratio": 0.0, "completions/max_length": 349.6, "completions/max_terminated_length": 349.6, "completions/mean_length": 131.00234375, "completions/mean_terminated_length": 131.00234375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0037557444790556156, "frac_reward_zero_std": 0.51875, "grad_norm": 0.2596726417541504, "kl": 0.11338542541780043, "learning_rate": 2.9635714285714286e-07, "loss": 0.0001, "num_tokens": 308545789.0, "reward": 0.23125, "reward_std": 0.41648141741752626, "rewards/verify_chess_move/mean": 0.23125, "rewards/verify_chess_move/std": 0.9555915474891663, "step": 4150 }, { "completion_length": 370.2, "completions/clipped_ratio": 0.0, "completions/max_length": 370.2, "completions/max_terminated_length": 370.2, "completions/mean_length": 125.0765625, "completions/mean_terminated_length": 125.0765625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0037602694724038757, "frac_reward_zero_std": 0.5, "grad_norm": 0.2215457558631897, "kl": 0.07356231135199778, "learning_rate": 2.9671428571428567e-07, "loss": 0.0001, "num_tokens": 308900335.0, "reward": 0.3125, "reward_std": 0.4311066150665283, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9515922784805297, "step": 4155 }, { "completion_length": 378.6, "completions/clipped_ratio": 0.0, "completions/max_length": 378.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 139.2078125, "completions/mean_terminated_length": 139.2078125, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.0037647944657521353, "frac_reward_zero_std": 0.54375, "grad_norm": 0.4924596846103668, "kl": 0.15745465075597168, "learning_rate": 2.970714285714286e-07, "loss": 0.0002, "num_tokens": 309278969.0, "reward": 0.15625, "reward_std": 0.40184378027915957, "rewards/verify_chess_move/mean": 0.15625, "rewards/verify_chess_move/std": 0.983738899230957, "step": 4160 }, { "completion_length": 453.4, "completions/clipped_ratio": 0.0, "completions/max_length": 453.4, "completions/max_terminated_length": 453.4, "completions/mean_length": 127.49375, "completions/mean_terminated_length": 127.49375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.003769319459100395, "frac_reward_zero_std": 0.525, "grad_norm": 0.20855887234210968, "kl": 0.22762025849951897, "learning_rate": 2.974285714285714e-07, "loss": 0.0002, "num_tokens": 309640673.0, "reward": 0.11875, "reward_std": 0.415052604675293, "rewards/verify_chess_move/mean": 0.11875, "rewards/verify_chess_move/std": 0.9862456679344177, "step": 4165 }, { "completion_length": 378.4, "completions/clipped_ratio": 0.0, "completions/max_length": 378.4, "completions/max_terminated_length": 378.4, "completions/mean_length": 135.08671875, "completions/mean_terminated_length": 135.08671875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.003773844452448655, "frac_reward_zero_std": 0.55, "grad_norm": 0.3255317807197571, "kl": 0.08644258508575149, "learning_rate": 2.9778571428571426e-07, "loss": 0.0001, "num_tokens": 310012136.0, "reward": 0.2171875, "reward_std": 0.40305688977241516, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9635229468345642, "step": 4170 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 133.51640625, "completions/mean_terminated_length": 133.51640625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0037783694457969147, "frac_reward_zero_std": 0.54375, "grad_norm": 0.19369569420814514, "kl": 0.04694212122703902, "learning_rate": 2.981428571428571e-07, "loss": 0.0, "num_tokens": 310383949.0, "reward": 0.06875, "reward_std": 0.4051627993583679, "rewards/verify_chess_move/mean": 0.06875, "rewards/verify_chess_move/std": 0.9940328478813172, "step": 4175 }, { "completion_length": 374.2, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 137.4140625, "completions/mean_terminated_length": 137.4140625, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0037828944391451744, "frac_reward_zero_std": 0.475, "grad_norm": 0.309211403131485, "kl": 0.04619880973477848, "learning_rate": 2.985e-07, "loss": 0.0, "num_tokens": 310757775.0, "reward": 0.1796875, "reward_std": 0.4545088171958923, "rewards/verify_chess_move/mean": 0.1796875, "rewards/verify_chess_move/std": 0.9768483757972717, "step": 4180 }, { "completion_length": 342.4, "completions/clipped_ratio": 0.0, "completions/max_length": 342.4, "completions/max_terminated_length": 342.4, "completions/mean_length": 129.55625, "completions/mean_terminated_length": 129.55625, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.003787419432493434, "frac_reward_zero_std": 0.59375, "grad_norm": 0.12167324870824814, "kl": 0.02735235498403199, "learning_rate": 2.9885714285714284e-07, "loss": 0.0, "num_tokens": 311122335.0, "reward": 0.1703125, "reward_std": 0.357557213306427, "rewards/verify_chess_move/mean": 0.1703125, "rewards/verify_chess_move/std": 0.9690253734588623, "step": 4185 }, { "completion_length": 352.6, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/max_terminated_length": 352.6, "completions/mean_length": 126.90390625, "completions/mean_terminated_length": 126.90390625, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.003791944425841694, "frac_reward_zero_std": 0.59375, "grad_norm": 0.24257461726665497, "kl": 0.02195331607363187, "learning_rate": 2.992142857142857e-07, "loss": 0.0, "num_tokens": 311482180.0, "reward": 0.0828125, "reward_std": 0.33820860385894774, "rewards/verify_chess_move/mean": 0.0828125, "rewards/verify_chess_move/std": 0.9816713571548462, "step": 4190 }, { "completion_length": 370.6, "completions/clipped_ratio": 0.0, "completions/max_length": 370.6, "completions/max_terminated_length": 370.6, "completions/mean_length": 146.24375, "completions/mean_terminated_length": 146.24375, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.003796469419189954, "frac_reward_zero_std": 0.575, "grad_norm": 0.17443066835403442, "kl": 0.024377868289593606, "learning_rate": 2.995714285714285e-07, "loss": 0.0, "num_tokens": 311872268.0, "reward": 0.1421875, "reward_std": 0.3580885171890259, "rewards/verify_chess_move/mean": 0.1421875, "rewards/verify_chess_move/std": 0.985818064212799, "step": 4195 }, { "completion_length": 370.4, "completions/clipped_ratio": 0.0, "completions/max_length": 370.4, "completions/max_terminated_length": 370.4, "completions/mean_length": 130.23125, "completions/mean_terminated_length": 130.23125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0038009944125382135, "frac_reward_zero_std": 0.54375, "grad_norm": 0.19318798184394836, "kl": 0.015932929699192754, "learning_rate": 2.999285714285714e-07, "loss": 0.0, "num_tokens": 312238820.0, "reward": 0.0421875, "reward_std": 0.39385418891906737, "rewards/verify_chess_move/mean": 0.0421875, "rewards/verify_chess_move/std": 0.9934941291809082, "step": 4200 }, { "completion_length": 363.6, "completions/clipped_ratio": 0.0, "completions/max_length": 363.6, "completions/max_terminated_length": 363.6, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.003805519405886473, "frac_reward_zero_std": 0.55, "grad_norm": 0.14454197883605957, "kl": 0.03366526065219659, "learning_rate": 3.0028571428571423e-07, "loss": 0.0, "num_tokens": 312593364.0, "reward": 0.1515625, "reward_std": 0.39190807938575745, "rewards/verify_chess_move/mean": 0.1515625, "rewards/verify_chess_move/std": 0.9623745679855347, "step": 4205 }, { "completion_length": 405.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 120.328125, "completions/mean_terminated_length": 120.328125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0038100443992347332, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13393232226371765, "kl": 0.01978624928742647, "learning_rate": 3.0064285714285715e-07, "loss": 0.0, "num_tokens": 312944656.0, "reward": 0.090625, "reward_std": 0.35866453051567077, "rewards/verify_chess_move/mean": 0.090625, "rewards/verify_chess_move/std": 0.9911751866340637, "step": 4210 }, { "completion_length": 397.6, "completions/clipped_ratio": 0.0, "completions/max_length": 397.6, "completions/max_terminated_length": 397.6, "completions/mean_length": 131.4828125, "completions/mean_terminated_length": 131.4828125, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.003814569392582993, "frac_reward_zero_std": 0.54375, "grad_norm": 0.18536852300167084, "kl": 0.025358216217136943, "learning_rate": 3.0099999999999996e-07, "loss": 0.0, "num_tokens": 313310370.0, "reward": 0.19375, "reward_std": 0.4038452684879303, "rewards/verify_chess_move/mean": 0.19375, "rewards/verify_chess_move/std": 0.9642593502998352, "step": 4215 }, { "completion_length": 343.2, "completions/clipped_ratio": 0.0, "completions/max_length": 343.2, "completions/max_terminated_length": 343.2, "completions/mean_length": 131.415625, "completions/mean_terminated_length": 131.415625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0038190943859312525, "frac_reward_zero_std": 0.525, "grad_norm": 0.6096295714378357, "kl": 0.06286015465739184, "learning_rate": 3.0135714285714287e-07, "loss": 0.0001, "num_tokens": 313679494.0, "reward": 0.20625, "reward_std": 0.39118369221687316, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9716227412223816, "step": 4220 }, { "completion_length": 398.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 398.6, "completions/max_terminated_length": 309.0, "completions/mean_length": 112.35, "completions/mean_terminated_length": 111.82397155761718, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0038236193792795126, "frac_reward_zero_std": 0.575, "grad_norm": 0.6803514957427979, "kl": 0.04884022121113958, "learning_rate": 3.017142857142857e-07, "loss": 0.0, "num_tokens": 314019094.0, "reward": 0.190625, "reward_std": 0.36355535984039306, "rewards/verify_chess_move/mean": 0.190625, "rewards/verify_chess_move/std": 0.97894366979599, "step": 4225 }, { "completion_length": 405.6, "completions/clipped_ratio": 0.0, "completions/max_length": 405.6, "completions/max_terminated_length": 405.6, "completions/mean_length": 122.27421875, "completions/mean_terminated_length": 122.27421875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0038281443726277723, "frac_reward_zero_std": 0.5875, "grad_norm": 0.1887495368719101, "kl": 0.06798865353339352, "learning_rate": 3.0207142857142854e-07, "loss": 0.0001, "num_tokens": 314373397.0, "reward": 0.14375, "reward_std": 0.3626566767692566, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.9788601279258728, "step": 4230 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 117.065625, "completions/mean_terminated_length": 117.065625, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.003832669365976032, "frac_reward_zero_std": 0.55625, "grad_norm": 0.257133424282074, "kl": 0.1548882760340348, "learning_rate": 3.024285714285714e-07, "loss": 0.0002, "num_tokens": 314718353.0, "reward": 0.2765625, "reward_std": 0.38355444371700287, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9535418152809143, "step": 4235 }, { "completion_length": 527.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 527.0, "completions/max_terminated_length": 376.6, "completions/mean_length": 128.7796875, "completions/mean_terminated_length": 127.7827392578125, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.0038371943593242916, "frac_reward_zero_std": 0.525, "grad_norm": 0.21992632746696472, "kl": 0.19723955739464144, "learning_rate": 3.0278571428571426e-07, "loss": 0.0002, "num_tokens": 315083471.0, "reward": 0.1296875, "reward_std": 0.4091633677482605, "rewards/verify_chess_move/mean": 0.1296875, "rewards/verify_chess_move/std": 0.9911430835723877, "step": 4240 }, { "completion_length": 361.8, "completions/clipped_ratio": 0.0, "completions/max_length": 361.8, "completions/max_terminated_length": 361.8, "completions/mean_length": 136.796875, "completions/mean_terminated_length": 136.796875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0038417193526725517, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1967713087797165, "kl": 0.21146836576517672, "learning_rate": 3.031428571428571e-07, "loss": 0.0002, "num_tokens": 315459067.0, "reward": 0.1953125, "reward_std": 0.3637730062007904, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9491534471511841, "step": 4245 }, { "completion_length": 327.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 126.61953125, "completions/mean_terminated_length": 126.61953125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0038462443460208114, "frac_reward_zero_std": 0.55625, "grad_norm": 0.19893379509449005, "kl": 0.3453013674065005, "learning_rate": 3.035e-07, "loss": 0.0003, "num_tokens": 315820524.0, "reward": 0.1359375, "reward_std": 0.38833633065223694, "rewards/verify_chess_move/mean": 0.1359375, "rewards/verify_chess_move/std": 0.971797502040863, "step": 4250 }, { "completion_length": 343.8, "completions/clipped_ratio": 0.0, "completions/max_length": 343.8, "completions/max_terminated_length": 343.8, "completions/mean_length": 126.57265625, "completions/mean_terminated_length": 126.57265625, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.003850769339369071, "frac_reward_zero_std": 0.5375, "grad_norm": 0.2033442109823227, "kl": 0.08974608082789928, "learning_rate": 3.0385714285714285e-07, "loss": 0.0001, "num_tokens": 316181537.0, "reward": 0.2015625, "reward_std": 0.4043294847011566, "rewards/verify_chess_move/mean": 0.2015625, "rewards/verify_chess_move/std": 0.9779186010360718, "step": 4255 }, { "completion_length": 394.6, "completions/clipped_ratio": 0.0, "completions/max_length": 394.6, "completions/max_terminated_length": 394.6, "completions/mean_length": 124.74453125, "completions/mean_terminated_length": 124.74453125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.003855294332717331, "frac_reward_zero_std": 0.5875, "grad_norm": 0.18727318942546844, "kl": 0.02148106796958018, "learning_rate": 3.042142857142857e-07, "loss": 0.0, "num_tokens": 316542346.0, "reward": 0.0328125, "reward_std": 0.3687629699707031, "rewards/verify_chess_move/mean": 0.0328125, "rewards/verify_chess_move/std": 0.987628984451294, "step": 4260 }, { "completion_length": 396.8, "completions/clipped_ratio": 0.0, "completions/max_length": 396.8, "completions/max_terminated_length": 396.8, "completions/mean_length": 135.334375, "completions/mean_terminated_length": 135.334375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0038598193260655908, "frac_reward_zero_std": 0.5875, "grad_norm": 0.18056625127792358, "kl": 0.019804499816382305, "learning_rate": 3.045714285714285e-07, "loss": 0.0, "num_tokens": 316915350.0, "reward": 0.075, "reward_std": 0.35971548557281496, "rewards/verify_chess_move/mean": 0.075, "rewards/verify_chess_move/std": 0.9865724563598632, "step": 4265 }, { "completion_length": 352.4, "completions/clipped_ratio": 0.0, "completions/max_length": 352.4, "completions/max_terminated_length": 352.4, "completions/mean_length": 121.0921875, "completions/mean_terminated_length": 121.0921875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0038643443194138504, "frac_reward_zero_std": 0.6125, "grad_norm": 0.18397389352321625, "kl": 0.01697146727819927, "learning_rate": 3.0492857142857143e-07, "loss": 0.0, "num_tokens": 317267468.0, "reward": 0.228125, "reward_std": 0.32063926458358766, "rewards/verify_chess_move/mean": 0.228125, "rewards/verify_chess_move/std": 0.9631399512290955, "step": 4270 }, { "completion_length": 399.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 136.46015625, "completions/mean_terminated_length": 136.46015625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.00386886931276211, "frac_reward_zero_std": 0.5375, "grad_norm": 0.2714367210865021, "kl": 0.022810809104703365, "learning_rate": 3.0528571428571424e-07, "loss": 0.0, "num_tokens": 317641385.0, "reward": 0.1921875, "reward_std": 0.41274242401123046, "rewards/verify_chess_move/mean": 0.1921875, "rewards/verify_chess_move/std": 0.9666921257972717, "step": 4275 }, { "completion_length": 334.8, "completions/clipped_ratio": 0.0, "completions/max_length": 334.8, "completions/max_terminated_length": 334.8, "completions/mean_length": 126.49609375, "completions/mean_terminated_length": 126.49609375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00387339430611037, "frac_reward_zero_std": 0.625, "grad_norm": 0.22184030711650848, "kl": 0.022773366828914732, "learning_rate": 3.0564285714285715e-07, "loss": 0.0, "num_tokens": 317999812.0, "reward": 0.21875, "reward_std": 0.3227361023426056, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9700302720069885, "step": 4280 }, { "completion_length": 390.8, "completions/clipped_ratio": 0.0, "completions/max_length": 390.8, "completions/max_terminated_length": 390.8, "completions/mean_length": 135.06796875, "completions/mean_terminated_length": 135.06796875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.00387791929945863, "frac_reward_zero_std": 0.56875, "grad_norm": 0.28255122900009155, "kl": 0.026293712784536183, "learning_rate": 3.0599999999999996e-07, "loss": 0.0, "num_tokens": 318373555.0, "reward": 0.025, "reward_std": 0.36724530458450316, "rewards/verify_chess_move/mean": 0.025, "rewards/verify_chess_move/std": 0.9795449256896973, "step": 4285 }, { "completion_length": 344.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 130.78125, "completions/mean_terminated_length": 130.78125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0038824442928068895, "frac_reward_zero_std": 0.6125, "grad_norm": 0.4044862985610962, "kl": 0.1081321434583515, "learning_rate": 3.063571428571429e-07, "loss": 0.0001, "num_tokens": 318740115.0, "reward": 0.259375, "reward_std": 0.33567286729812623, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.95827556848526, "step": 4290 }, { "completion_length": 451.4, "completions/clipped_ratio": 0.0, "completions/max_length": 451.4, "completions/max_terminated_length": 451.4, "completions/mean_length": 122.321875, "completions/mean_terminated_length": 122.321875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.003886969286155149, "frac_reward_zero_std": 0.6, "grad_norm": 1.2954223155975342, "kl": 0.28442116124206224, "learning_rate": 3.067142857142857e-07, "loss": 0.0003, "num_tokens": 319092223.0, "reward": 0.165625, "reward_std": 0.35356212258338926, "rewards/verify_chess_move/mean": 0.165625, "rewards/verify_chess_move/std": 0.9800842642784119, "step": 4295 }, { "completion_length": 337.4, "completions/clipped_ratio": 0.0, "completions/max_length": 337.4, "completions/max_terminated_length": 337.4, "completions/mean_length": 129.0296875, "completions/mean_terminated_length": 129.0296875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.0038914942795034092, "frac_reward_zero_std": 0.58125, "grad_norm": 0.4163007438182831, "kl": 0.08943171259597875, "learning_rate": 3.0707142857142854e-07, "loss": 0.0001, "num_tokens": 319455933.0, "reward": 0.0859375, "reward_std": 0.35067847967147825, "rewards/verify_chess_move/mean": 0.0859375, "rewards/verify_chess_move/std": 0.9868924140930175, "step": 4300 }, { "completion_length": 424.4, "completions/clipped_ratio": 0.0, "completions/max_length": 424.4, "completions/max_terminated_length": 424.4, "completions/mean_length": 129.79296875, "completions/mean_terminated_length": 129.79296875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003896019272851669, "frac_reward_zero_std": 0.575, "grad_norm": 0.2828281819820404, "kl": 0.08260645577684045, "learning_rate": 3.074285714285714e-07, "loss": 0.0001, "num_tokens": 319821532.0, "reward": 0.109375, "reward_std": 0.36471211314201357, "rewards/verify_chess_move/mean": 0.109375, "rewards/verify_chess_move/std": 0.975961971282959, "step": 4305 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 120.23125, "completions/mean_terminated_length": 120.23125, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0039005442661999286, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1537982076406479, "kl": 0.04206502094166353, "learning_rate": 3.0778571428571427e-07, "loss": 0.0, "num_tokens": 320173972.0, "reward": 0.2265625, "reward_std": 0.2981608182191849, "rewards/verify_chess_move/mean": 0.2265625, "rewards/verify_chess_move/std": 0.960320246219635, "step": 4310 }, { "completion_length": 378.8, "completions/clipped_ratio": 0.0, "completions/max_length": 378.8, "completions/max_terminated_length": 378.8, "completions/mean_length": 130.2140625, "completions/mean_terminated_length": 130.2140625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0039050692595481887, "frac_reward_zero_std": 0.625, "grad_norm": 0.40260910987854004, "kl": 0.03825732415425591, "learning_rate": 3.0814285714285713e-07, "loss": 0.0, "num_tokens": 320541254.0, "reward": 0.1640625, "reward_std": 0.32478506565093995, "rewards/verify_chess_move/mean": 0.1640625, "rewards/verify_chess_move/std": 0.9651480436325073, "step": 4315 }, { "completion_length": 429.4, "completions/clipped_ratio": 0.0, "completions/max_length": 429.4, "completions/max_terminated_length": 429.4, "completions/mean_length": 119.83671875, "completions/mean_terminated_length": 119.83671875, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.003909594252896448, "frac_reward_zero_std": 0.56875, "grad_norm": 0.25389790534973145, "kl": 0.05690183723054361, "learning_rate": 3.085e-07, "loss": 0.0001, "num_tokens": 320891301.0, "reward": 0.06875, "reward_std": 0.3933288395404816, "rewards/verify_chess_move/mean": 0.06875, "rewards/verify_chess_move/std": 0.9712810158729553, "step": 4320 }, { "completion_length": 429.6, "completions/clipped_ratio": 0.0, "completions/max_length": 429.6, "completions/max_terminated_length": 429.6, "completions/mean_length": 132.0203125, "completions/mean_terminated_length": 132.0203125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.003914119246244708, "frac_reward_zero_std": 0.55, "grad_norm": 0.394802987575531, "kl": 0.03975557539379224, "learning_rate": 3.088571428571428e-07, "loss": 0.0, "num_tokens": 321257383.0, "reward": 0.1375, "reward_std": 0.3980589032173157, "rewards/verify_chess_move/mean": 0.1375, "rewards/verify_chess_move/std": 0.9892168760299682, "step": 4325 }, { "completion_length": 398.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 398.2, "completions/max_terminated_length": 320.6, "completions/mean_length": 124.659375, "completions/mean_terminated_length": 124.15840454101563, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.003918644239592968, "frac_reward_zero_std": 0.60625, "grad_norm": 0.18671132624149323, "kl": 0.055737059921375474, "learning_rate": 3.092142857142857e-07, "loss": 0.0001, "num_tokens": 321616667.0, "reward": 0.20625, "reward_std": 0.3235270261764526, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9697999238967896, "step": 4330 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 130.4828125, "completions/mean_terminated_length": 130.4828125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003923169232941228, "frac_reward_zero_std": 0.60625, "grad_norm": 0.1769411414861679, "kl": 0.17421298100380228, "learning_rate": 3.095714285714285e-07, "loss": 0.0002, "num_tokens": 321985597.0, "reward": 0.0375, "reward_std": 0.3362065255641937, "rewards/verify_chess_move/mean": 0.0375, "rewards/verify_chess_move/std": 0.9891274690628051, "step": 4335 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/max_terminated_length": 366.2, "completions/mean_length": 127.82109375, "completions/mean_terminated_length": 127.82109375, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.003927694226289487, "frac_reward_zero_std": 0.525, "grad_norm": 0.26253557205200195, "kl": 0.19506275925668887, "learning_rate": 3.0992857142857143e-07, "loss": 0.0002, "num_tokens": 322347280.0, "reward": 0.1421875, "reward_std": 0.4097988963127136, "rewards/verify_chess_move/mean": 0.1421875, "rewards/verify_chess_move/std": 0.9881285309791565, "step": 4340 }, { "completion_length": 319.2, "completions/clipped_ratio": 0.0, "completions/max_length": 319.2, "completions/max_terminated_length": 319.2, "completions/mean_length": 123.9078125, "completions/mean_terminated_length": 123.9078125, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.003932219219637747, "frac_reward_zero_std": 0.6, "grad_norm": 0.19001901149749756, "kl": 0.106804431846831, "learning_rate": 3.1028571428571424e-07, "loss": 0.0001, "num_tokens": 322705098.0, "reward": 0.159375, "reward_std": 0.3476699531078339, "rewards/verify_chess_move/mean": 0.159375, "rewards/verify_chess_move/std": 0.9822484254837036, "step": 4345 }, { "completion_length": 462.6, "completions/clipped_ratio": 0.0, "completions/max_length": 462.6, "completions/max_terminated_length": 462.6, "completions/mean_length": 127.2828125, "completions/mean_terminated_length": 127.2828125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.003936744212986007, "frac_reward_zero_std": 0.59375, "grad_norm": 0.16251106560230255, "kl": 0.044481942558195445, "learning_rate": 3.1064285714285716e-07, "loss": 0.0, "num_tokens": 323067164.0, "reward": 0.15, "reward_std": 0.3650255620479584, "rewards/verify_chess_move/mean": 0.15, "rewards/verify_chess_move/std": 0.9823464035987854, "step": 4350 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.0, "completions/max_length": 367.8, "completions/max_terminated_length": 367.8, "completions/mean_length": 132.13984375, "completions/mean_terminated_length": 132.13984375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.003941269206334266, "frac_reward_zero_std": 0.5875, "grad_norm": 0.2421264499425888, "kl": 0.045111143140820785, "learning_rate": 3.1099999999999997e-07, "loss": 0.0, "num_tokens": 323439039.0, "reward": 0.015625, "reward_std": 0.3608198642730713, "rewards/verify_chess_move/mean": 0.015625, "rewards/verify_chess_move/std": 0.9935002446174621, "step": 4355 }, { "completion_length": 367.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 128.0328125, "completions/mean_terminated_length": 128.0328125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0039457941996825265, "frac_reward_zero_std": 0.55625, "grad_norm": 0.19923348724842072, "kl": 0.06798890152131207, "learning_rate": 3.1135714285714283e-07, "loss": 0.0001, "num_tokens": 323801729.0, "reward": 0.10625, "reward_std": 0.38628738522529604, "rewards/verify_chess_move/mean": 0.10625, "rewards/verify_chess_move/std": 0.9956441879272461, "step": 4360 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 121.02734375, "completions/mean_terminated_length": 121.02734375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0039503191930307865, "frac_reward_zero_std": 0.64375, "grad_norm": 0.19314493238925934, "kl": 0.05056453211582266, "learning_rate": 3.117142857142857e-07, "loss": 0.0001, "num_tokens": 324154060.0, "reward": 0.1359375, "reward_std": 0.3124694496393204, "rewards/verify_chess_move/mean": 0.1359375, "rewards/verify_chess_move/std": 0.9869500279426575, "step": 4365 }, { "completion_length": 380.2, "completions/clipped_ratio": 0.0, "completions/max_length": 380.2, "completions/max_terminated_length": 380.2, "completions/mean_length": 121.059375, "completions/mean_terminated_length": 121.059375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.003954844186379046, "frac_reward_zero_std": 0.525, "grad_norm": 0.25455063581466675, "kl": 0.049614271434256806, "learning_rate": 3.1207142857142855e-07, "loss": 0.0, "num_tokens": 324507064.0, "reward": 0.1140625, "reward_std": 0.4114225447177887, "rewards/verify_chess_move/mean": 0.1140625, "rewards/verify_chess_move/std": 0.9853638529777526, "step": 4370 }, { "completion_length": 396.2, "completions/clipped_ratio": 0.0, "completions/max_length": 396.2, "completions/max_terminated_length": 396.2, "completions/mean_length": 129.4203125, "completions/mean_terminated_length": 129.4203125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.003959369179727306, "frac_reward_zero_std": 0.56875, "grad_norm": 0.13372957706451416, "kl": 0.05749381818459369, "learning_rate": 3.124285714285714e-07, "loss": 0.0001, "num_tokens": 324870802.0, "reward": 0.153125, "reward_std": 0.36413549184799193, "rewards/verify_chess_move/mean": 0.153125, "rewards/verify_chess_move/std": 0.9752146124839782, "step": 4375 }, { "completion_length": 419.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 419.6, "completions/max_terminated_length": 325.8, "completions/mean_length": 131.975, "completions/mean_terminated_length": 131.46302795410156, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.003963894173075566, "frac_reward_zero_std": 0.6125, "grad_norm": 0.26879245042800903, "kl": 0.05348224918998312, "learning_rate": 3.1278571428571427e-07, "loss": 0.0001, "num_tokens": 325240730.0, "reward": 0.128125, "reward_std": 0.34865848422050477, "rewards/verify_chess_move/mean": 0.128125, "rewards/verify_chess_move/std": 0.986147427558899, "step": 4380 }, { "completion_length": 389.2, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/max_terminated_length": 389.2, "completions/mean_length": 129.86171875, "completions/mean_terminated_length": 129.86171875, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.003968419166423825, "frac_reward_zero_std": 0.6, "grad_norm": 0.2972905933856964, "kl": 0.15138432285166345, "learning_rate": 3.1314285714285713e-07, "loss": 0.0002, "num_tokens": 325608849.0, "reward": 0.25, "reward_std": 0.3444652080535889, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.956236469745636, "step": 4385 }, { "completion_length": 339.8, "completions/clipped_ratio": 0.0, "completions/max_length": 339.8, "completions/max_terminated_length": 339.8, "completions/mean_length": 130.0421875, "completions/mean_terminated_length": 130.0421875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.003972944159772085, "frac_reward_zero_std": 0.625, "grad_norm": 0.25922247767448425, "kl": 0.03212727386271581, "learning_rate": 3.135e-07, "loss": 0.0, "num_tokens": 325975119.0, "reward": 0.0453125, "reward_std": 0.32568023204803465, "rewards/verify_chess_move/mean": 0.0453125, "rewards/verify_chess_move/std": 0.9794247031211853, "step": 4390 }, { "completion_length": 425.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 425.4, "completions/max_terminated_length": 350.4, "completions/mean_length": 125.49453125, "completions/mean_terminated_length": 124.99411010742188, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0039774691531203445, "frac_reward_zero_std": 0.5875, "grad_norm": 0.10607033967971802, "kl": 0.030750531700323335, "learning_rate": 3.138571428571428e-07, "loss": 0.0, "num_tokens": 326333288.0, "reward": 0.2, "reward_std": 0.36287177801132203, "rewards/verify_chess_move/mean": 0.2, "rewards/verify_chess_move/std": 0.9720124959945678, "step": 4395 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 130.89375, "completions/mean_terminated_length": 130.89375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.003981994146468605, "frac_reward_zero_std": 0.58125, "grad_norm": 0.3473818004131317, "kl": 0.043555659760022535, "learning_rate": 3.142142857142857e-07, "loss": 0.0, "num_tokens": 326701408.0, "reward": 0.0578125, "reward_std": 0.3721215546131134, "rewards/verify_chess_move/mean": 0.0578125, "rewards/verify_chess_move/std": 0.9957449436187744, "step": 4400 }, { "completion_length": 399.6, "completions/clipped_ratio": 0.0, "completions/max_length": 399.6, "completions/max_terminated_length": 399.6, "completions/mean_length": 130.41953125, "completions/mean_terminated_length": 130.41953125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.003986519139816865, "frac_reward_zero_std": 0.5875, "grad_norm": 0.21102747321128845, "kl": 0.08681065179989673, "learning_rate": 3.145714285714285e-07, "loss": 0.0001, "num_tokens": 327066505.0, "reward": 0.1078125, "reward_std": 0.36697069406509397, "rewards/verify_chess_move/mean": 0.1078125, "rewards/verify_chess_move/std": 0.9911404848098755, "step": 4405 }, { "completion_length": 426.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 123.42265625, "completions/mean_terminated_length": 123.42265625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.003991044133165124, "frac_reward_zero_std": 0.66875, "grad_norm": 0.583440363407135, "kl": 0.14928857510676607, "learning_rate": 3.1492857142857144e-07, "loss": 0.0001, "num_tokens": 327421638.0, "reward": 0.2015625, "reward_std": 0.2897947609424591, "rewards/verify_chess_move/mean": 0.2015625, "rewards/verify_chess_move/std": 0.9650066614151, "step": 4410 }, { "completion_length": 340.4, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/max_terminated_length": 340.4, "completions/mean_length": 119.50625, "completions/mean_terminated_length": 119.50625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.003995569126513384, "frac_reward_zero_std": 0.63125, "grad_norm": 0.27918753027915955, "kl": 0.2341082679340616, "learning_rate": 3.1528571428571425e-07, "loss": 0.0002, "num_tokens": 327772406.0, "reward": 0.08125, "reward_std": 0.3151574581861496, "rewards/verify_chess_move/mean": 0.08125, "rewards/verify_chess_move/std": 0.9873324632644653, "step": 4415 }, { "completion_length": 361.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 119.6796875, "completions/mean_terminated_length": 119.6796875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.004000094119861644, "frac_reward_zero_std": 0.64375, "grad_norm": 0.34047940373420715, "kl": 0.10616380396531895, "learning_rate": 3.1564285714285716e-07, "loss": 0.0001, "num_tokens": 328124836.0, "reward": 0.175, "reward_std": 0.3057993590831757, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9761346459388733, "step": 4420 }, { "completion_length": 333.8, "completions/clipped_ratio": 0.0, "completions/max_length": 333.8, "completions/max_terminated_length": 333.8, "completions/mean_length": 128.94921875, "completions/mean_terminated_length": 128.94921875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004004619113209903, "frac_reward_zero_std": 0.58125, "grad_norm": 0.17021997272968292, "kl": 0.05295643209828995, "learning_rate": 3.1599999999999997e-07, "loss": 0.0001, "num_tokens": 328486291.0, "reward": 0.25, "reward_std": 0.36203553676605227, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9628273367881774, "step": 4425 }, { "completion_length": 356.8, "completions/clipped_ratio": 0.0, "completions/max_length": 356.8, "completions/max_terminated_length": 356.8, "completions/mean_length": 131.740625, "completions/mean_terminated_length": 131.740625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004009144106558163, "frac_reward_zero_std": 0.56875, "grad_norm": 0.25222983956336975, "kl": 0.02802420999505557, "learning_rate": 3.1635714285714283e-07, "loss": 0.0, "num_tokens": 328854847.0, "reward": 0.1359375, "reward_std": 0.37139266133308413, "rewards/verify_chess_move/mean": 0.1359375, "rewards/verify_chess_move/std": 0.9834214925765992, "step": 4430 }, { "completion_length": 468.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 468.2, "completions/max_terminated_length": 378.4, "completions/mean_length": 123.2140625, "completions/mean_terminated_length": 122.71322174072266, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0040136690999064235, "frac_reward_zero_std": 0.49375, "grad_norm": 0.30253374576568604, "kl": 0.01988327673461754, "learning_rate": 3.167142857142857e-07, "loss": 0.0, "num_tokens": 329209129.0, "reward": 0.25, "reward_std": 0.43441674709320066, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.964669668674469, "step": 4435 }, { "completion_length": 348.6, "completions/clipped_ratio": 0.0, "completions/max_length": 348.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 121.3515625, "completions/mean_terminated_length": 121.3515625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.004018194093254683, "frac_reward_zero_std": 0.55625, "grad_norm": 0.21611104905605316, "kl": 0.02238572689238936, "learning_rate": 3.1707142857142855e-07, "loss": 0.0, "num_tokens": 329563299.0, "reward": 0.2921875, "reward_std": 0.372407591342926, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9447964072227478, "step": 4440 }, { "completion_length": 410.8, "completions/clipped_ratio": 0.0, "completions/max_length": 410.8, "completions/max_terminated_length": 410.8, "completions/mean_length": 126.05390625, "completions/mean_terminated_length": 126.05390625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.004022719086602943, "frac_reward_zero_std": 0.63125, "grad_norm": 0.1680254489183426, "kl": 0.019199829030549155, "learning_rate": 3.174285714285714e-07, "loss": 0.0, "num_tokens": 329922648.0, "reward": 0.2546875, "reward_std": 0.3205778241157532, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9589967727661133, "step": 4445 }, { "completion_length": 475.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 126.671875, "completions/mean_terminated_length": 126.671875, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.004027244079951202, "frac_reward_zero_std": 0.66875, "grad_norm": 0.16809745132923126, "kl": 0.02336173568619415, "learning_rate": 3.177857142857143e-07, "loss": 0.0, "num_tokens": 330283404.0, "reward": 0.2203125, "reward_std": 0.2831276148557663, "rewards/verify_chess_move/mean": 0.2203125, "rewards/verify_chess_move/std": 0.9745964407920837, "step": 4450 }, { "completion_length": 447.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.0, "completions/max_terminated_length": 357.8, "completions/mean_length": 122.775, "completions/mean_terminated_length": 122.27013244628907, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.004031769073299462, "frac_reward_zero_std": 0.61875, "grad_norm": 0.17588631808757782, "kl": 0.06918657756177708, "learning_rate": 3.181428571428571e-07, "loss": 0.0001, "num_tokens": 330637940.0, "reward": 0.2515625, "reward_std": 0.3339408844709396, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9555272817611694, "step": 4455 }, { "completion_length": 386.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 127.5140625, "completions/mean_terminated_length": 127.5140625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004036294066647722, "frac_reward_zero_std": 0.6, "grad_norm": 0.16031436622142792, "kl": 0.0829255159071181, "learning_rate": 3.185e-07, "loss": 0.0001, "num_tokens": 330997846.0, "reward": 0.221875, "reward_std": 0.3454088240861893, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9608400583267211, "step": 4460 }, { "completion_length": 397.2, "completions/clipped_ratio": 0.0, "completions/max_length": 397.2, "completions/max_terminated_length": 397.2, "completions/mean_length": 120.3984375, "completions/mean_terminated_length": 120.3984375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0040408190599959815, "frac_reward_zero_std": 0.5875, "grad_norm": 0.16140469908714294, "kl": 0.11904776582377963, "learning_rate": 3.188571428571428e-07, "loss": 0.0001, "num_tokens": 331348764.0, "reward": 0.1890625, "reward_std": 0.3635063201189041, "rewards/verify_chess_move/mean": 0.1890625, "rewards/verify_chess_move/std": 0.9822257399559021, "step": 4465 }, { "completion_length": 333.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 122.33203125, "completions/mean_terminated_length": 122.33203125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0040453440533442415, "frac_reward_zero_std": 0.625, "grad_norm": 0.19125457108020782, "kl": 0.04605120184132829, "learning_rate": 3.192142857142857e-07, "loss": 0.0, "num_tokens": 331703869.0, "reward": 0.1171875, "reward_std": 0.32683503031730654, "rewards/verify_chess_move/mean": 0.1171875, "rewards/verify_chess_move/std": 0.9909471154212952, "step": 4470 }, { "completion_length": 386.4, "completions/clipped_ratio": 0.0, "completions/max_length": 386.4, "completions/max_terminated_length": 386.4, "completions/mean_length": 124.28828125, "completions/mean_terminated_length": 124.28828125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.004049869046692502, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4320034980773926, "kl": 0.040332361741457136, "learning_rate": 3.1957142857142853e-07, "loss": 0.0, "num_tokens": 332060846.0, "reward": 0.2359375, "reward_std": 0.3928966045379639, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9537748694419861, "step": 4475 }, { "completion_length": 331.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 118.80546875, "completions/mean_terminated_length": 118.80546875, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.004054394040040761, "frac_reward_zero_std": 0.625, "grad_norm": 0.11770159751176834, "kl": 0.07945252388599329, "learning_rate": 3.1992857142857145e-07, "loss": 0.0001, "num_tokens": 332411557.0, "reward": 0.153125, "reward_std": 0.3447177976369858, "rewards/verify_chess_move/mean": 0.153125, "rewards/verify_chess_move/std": 0.981763482093811, "step": 4480 }, { "completion_length": 363.6, "completions/clipped_ratio": 0.0, "completions/max_length": 363.6, "completions/max_terminated_length": 363.6, "completions/mean_length": 123.92109375, "completions/mean_terminated_length": 123.92109375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004058919033389021, "frac_reward_zero_std": 0.64375, "grad_norm": 0.19174638390541077, "kl": 0.042310038831783456, "learning_rate": 3.2028571428571425e-07, "loss": 0.0, "num_tokens": 332768024.0, "reward": 0.24375, "reward_std": 0.30990121364593504, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9626854181289672, "step": 4485 }, { "completion_length": 346.4, "completions/clipped_ratio": 0.0, "completions/max_length": 346.4, "completions/max_terminated_length": 346.4, "completions/mean_length": 122.66328125, "completions/mean_terminated_length": 122.66328125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.004063444026737281, "frac_reward_zero_std": 0.60625, "grad_norm": 0.3542250692844391, "kl": 0.05387488328269683, "learning_rate": 3.206428571428571e-07, "loss": 0.0001, "num_tokens": 333126065.0, "reward": 0.0390625, "reward_std": 0.3257426589727402, "rewards/verify_chess_move/mean": 0.0390625, "rewards/verify_chess_move/std": 0.9961562991142273, "step": 4490 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/max_terminated_length": 358.4, "completions/mean_length": 124.9484375, "completions/mean_terminated_length": 124.9484375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.00406796902008554, "frac_reward_zero_std": 0.5625, "grad_norm": 0.17680592834949493, "kl": 0.04275085240660701, "learning_rate": 3.21e-07, "loss": 0.0, "num_tokens": 333484159.0, "reward": 0.178125, "reward_std": 0.383023738861084, "rewards/verify_chess_move/mean": 0.178125, "rewards/verify_chess_move/std": 0.9685844659805298, "step": 4495 }, { "completion_length": 345.2, "completions/clipped_ratio": 0.0, "completions/max_length": 345.2, "completions/max_terminated_length": 345.2, "completions/mean_length": 123.67109375, "completions/mean_terminated_length": 123.67109375, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.0040724940134338, "frac_reward_zero_std": 0.625, "grad_norm": 0.12447645515203476, "kl": 0.03532054392853752, "learning_rate": 3.2135714285714284e-07, "loss": 0.0, "num_tokens": 333840634.0, "reward": 0.203125, "reward_std": 0.31978904604911806, "rewards/verify_chess_move/mean": 0.203125, "rewards/verify_chess_move/std": 0.966597318649292, "step": 4500 }, { "completion_length": 405.6, "completions/clipped_ratio": 0.0, "completions/max_length": 405.6, "completions/max_terminated_length": 405.6, "completions/mean_length": 112.71328125, "completions/mean_terminated_length": 112.71328125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0040770190067820605, "frac_reward_zero_std": 0.61875, "grad_norm": 0.163923978805542, "kl": 0.03114930085139349, "learning_rate": 3.217142857142857e-07, "loss": 0.0, "num_tokens": 334178691.0, "reward": 0.221875, "reward_std": 0.31737563014030457, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9753228545188903, "step": 4505 }, { "completion_length": 437.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 437.8, "completions/max_terminated_length": 391.4, "completions/mean_length": 123.88828125, "completions/mean_terminated_length": 123.37960357666016, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.00408154400013032, "frac_reward_zero_std": 0.61875, "grad_norm": 0.15111424028873444, "kl": 0.025621916723321193, "learning_rate": 3.2207142857142856e-07, "loss": 0.0, "num_tokens": 334535516.0, "reward": 0.0625, "reward_std": 0.33167681097984314, "rewards/verify_chess_move/mean": 0.0625, "rewards/verify_chess_move/std": 0.9887690901756286, "step": 4510 }, { "completion_length": 315.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 111.3890625, "completions/mean_terminated_length": 111.3890625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.00408606899347858, "frac_reward_zero_std": 0.61875, "grad_norm": 0.3946858048439026, "kl": 0.027370755997253582, "learning_rate": 3.224285714285714e-07, "loss": 0.0, "num_tokens": 334874118.0, "reward": 0.184375, "reward_std": 0.3282608866691589, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.9820080518722534, "step": 4515 }, { "completion_length": 341.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 125.86015625, "completions/mean_terminated_length": 125.86015625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004090593986826839, "frac_reward_zero_std": 0.65625, "grad_norm": 0.19269534945487976, "kl": 0.021095103857805952, "learning_rate": 3.227857142857143e-07, "loss": 0.0, "num_tokens": 335235363.0, "reward": 0.14375, "reward_std": 0.29043478667736056, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.9894597649574279, "step": 4520 }, { "completion_length": 328.4, "completions/clipped_ratio": 0.0, "completions/max_length": 328.4, "completions/max_terminated_length": 328.4, "completions/mean_length": 124.0109375, "completions/mean_terminated_length": 124.0109375, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.004095118980175099, "frac_reward_zero_std": 0.58125, "grad_norm": 0.2084505558013916, "kl": 0.03412425384740345, "learning_rate": 3.231428571428571e-07, "loss": 0.0, "num_tokens": 335591905.0, "reward": 0.2203125, "reward_std": 0.3559311985969543, "rewards/verify_chess_move/mean": 0.2203125, "rewards/verify_chess_move/std": 0.9686151385307312, "step": 4525 }, { "completion_length": 351.6, "completions/clipped_ratio": 0.0, "completions/max_length": 351.6, "completions/max_terminated_length": 351.6, "completions/mean_length": 125.74375, "completions/mean_terminated_length": 125.74375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004099643973523359, "frac_reward_zero_std": 0.5875, "grad_norm": 0.18412518501281738, "kl": 0.028551904112100603, "learning_rate": 3.235e-07, "loss": 0.0, "num_tokens": 335951905.0, "reward": 0.2125, "reward_std": 0.3680780053138733, "rewards/verify_chess_move/mean": 0.2125, "rewards/verify_chess_move/std": 0.9775470137596131, "step": 4530 }, { "completion_length": 433.6, "completions/clipped_ratio": 0.0, "completions/max_length": 433.6, "completions/max_terminated_length": 433.6, "completions/mean_length": 125.8640625, "completions/mean_terminated_length": 125.8640625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004104168966871618, "frac_reward_zero_std": 0.6625, "grad_norm": 0.1906985342502594, "kl": 0.030381098433281295, "learning_rate": 3.238571428571428e-07, "loss": 0.0, "num_tokens": 336311291.0, "reward": 0.146875, "reward_std": 0.30347381830215453, "rewards/verify_chess_move/mean": 0.146875, "rewards/verify_chess_move/std": 0.98592689037323, "step": 4535 }, { "completion_length": 350.6, "completions/clipped_ratio": 0.0, "completions/max_length": 350.6, "completions/max_terminated_length": 350.6, "completions/mean_length": 123.03359375, "completions/mean_terminated_length": 123.03359375, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0041086939602198785, "frac_reward_zero_std": 0.65625, "grad_norm": 0.1767202764749527, "kl": 0.09957525377394631, "learning_rate": 3.2421428571428573e-07, "loss": 0.0001, "num_tokens": 336667854.0, "reward": 0.0984375, "reward_std": 0.2965351969003677, "rewards/verify_chess_move/mean": 0.0984375, "rewards/verify_chess_move/std": 0.9900255084037781, "step": 4540 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 126.20546875, "completions/mean_terminated_length": 126.20546875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004113218953568139, "frac_reward_zero_std": 0.61875, "grad_norm": 0.1800951510667801, "kl": 0.06274105471093208, "learning_rate": 3.2457142857142854e-07, "loss": 0.0001, "num_tokens": 337029517.0, "reward": 0.1484375, "reward_std": 0.32825990319252013, "rewards/verify_chess_move/mean": 0.1484375, "rewards/verify_chess_move/std": 0.970782196521759, "step": 4545 }, { "completion_length": 327.4, "completions/clipped_ratio": 0.0, "completions/max_length": 327.4, "completions/max_terminated_length": 327.4, "completions/mean_length": 119.78671875, "completions/mean_terminated_length": 119.78671875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.004117743946916398, "frac_reward_zero_std": 0.6625, "grad_norm": 0.2069573849439621, "kl": 0.06872379513224587, "learning_rate": 3.2492857142857145e-07, "loss": 0.0001, "num_tokens": 337383444.0, "reward": 0.1484375, "reward_std": 0.29395553171634675, "rewards/verify_chess_move/mean": 0.1484375, "rewards/verify_chess_move/std": 0.9853020548820496, "step": 4550 }, { "completion_length": 318.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 120.85625, "completions/mean_terminated_length": 120.85625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004122268940264658, "frac_reward_zero_std": 0.6, "grad_norm": 0.3185000419616699, "kl": 0.0570410571526736, "learning_rate": 3.2528571428571426e-07, "loss": 0.0001, "num_tokens": 337737724.0, "reward": 0.1765625, "reward_std": 0.33610171675682066, "rewards/verify_chess_move/mean": 0.1765625, "rewards/verify_chess_move/std": 0.969069492816925, "step": 4555 }, { "completion_length": 456.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 456.6, "completions/max_terminated_length": 367.2, "completions/mean_length": 123.89140625, "completions/mean_terminated_length": 123.37765197753906, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004126793933612918, "frac_reward_zero_std": 0.61875, "grad_norm": 0.1510261446237564, "kl": 0.03572095212584827, "learning_rate": 3.256428571428571e-07, "loss": 0.0, "num_tokens": 338094465.0, "reward": 0.25, "reward_std": 0.3389349579811096, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9487573385238648, "step": 4560 }, { "completion_length": 318.4, "completions/clipped_ratio": 0.0, "completions/max_length": 318.4, "completions/max_terminated_length": 318.4, "completions/mean_length": 118.1078125, "completions/mean_terminated_length": 118.1078125, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.004131318926961177, "frac_reward_zero_std": 0.675, "grad_norm": 0.37803003191947937, "kl": 0.04474702068255283, "learning_rate": 3.26e-07, "loss": 0.0, "num_tokens": 338444507.0, "reward": 0.11875, "reward_std": 0.27554994225502016, "rewards/verify_chess_move/mean": 0.11875, "rewards/verify_chess_move/std": 0.9928122758865356, "step": 4565 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 118.74609375, "completions/mean_terminated_length": 118.74609375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.004135843920309437, "frac_reward_zero_std": 0.64375, "grad_norm": 0.12473364174365997, "kl": 0.028382449480704963, "learning_rate": 3.2635714285714284e-07, "loss": 0.0, "num_tokens": 338794174.0, "reward": 0.3078125, "reward_std": 0.3050698757171631, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9474204659461976, "step": 4570 }, { "completion_length": 346.2, "completions/clipped_ratio": 0.0, "completions/max_length": 346.2, "completions/max_terminated_length": 346.2, "completions/mean_length": 123.28046875, "completions/mean_terminated_length": 123.28046875, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.0041403689136576965, "frac_reward_zero_std": 0.6375, "grad_norm": 0.38912642002105713, "kl": 0.03316824423382059, "learning_rate": 3.267142857142857e-07, "loss": 0.0, "num_tokens": 339151797.0, "reward": 0.1421875, "reward_std": 0.3218829333782196, "rewards/verify_chess_move/mean": 0.1421875, "rewards/verify_chess_move/std": 0.9887999296188354, "step": 4575 }, { "completion_length": 408.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 408.6, "completions/max_terminated_length": 317.0, "completions/mean_length": 118.4046875, "completions/mean_terminated_length": 117.38118896484374, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004144893907005957, "frac_reward_zero_std": 0.64375, "grad_norm": 0.2374524623155594, "kl": 0.020692580763716252, "learning_rate": 3.2707142857142856e-07, "loss": 0.0, "num_tokens": 339499691.0, "reward": 0.2015625, "reward_std": 0.31011140942573545, "rewards/verify_chess_move/mean": 0.2015625, "rewards/verify_chess_move/std": 0.9760565996170044, "step": 4580 }, { "completion_length": 320.8, "completions/clipped_ratio": 0.0, "completions/max_length": 320.8, "completions/max_terminated_length": 320.8, "completions/mean_length": 115.2234375, "completions/mean_terminated_length": 115.2234375, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.004149418900354217, "frac_reward_zero_std": 0.64375, "grad_norm": 0.24225252866744995, "kl": 0.02153780372464098, "learning_rate": 3.2742857142857137e-07, "loss": 0.0, "num_tokens": 339844545.0, "reward": 0.115625, "reward_std": 0.2962316244840622, "rewards/verify_chess_move/mean": 0.115625, "rewards/verify_chess_move/std": 0.9911910653114319, "step": 4585 }, { "completion_length": 352.4, "completions/clipped_ratio": 0.0, "completions/max_length": 352.4, "completions/max_terminated_length": 352.4, "completions/mean_length": 115.8640625, "completions/mean_terminated_length": 115.8640625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004153943893702476, "frac_reward_zero_std": 0.60625, "grad_norm": 0.17564135789871216, "kl": 0.024458767601754517, "learning_rate": 3.277857142857143e-07, "loss": 0.0, "num_tokens": 340189307.0, "reward": 0.209375, "reward_std": 0.3400467813014984, "rewards/verify_chess_move/mean": 0.209375, "rewards/verify_chess_move/std": 0.9674764275550842, "step": 4590 }, { "completion_length": 328.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 127.22265625, "completions/mean_terminated_length": 127.22265625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.004158468887050736, "frac_reward_zero_std": 0.6625, "grad_norm": 0.27429455518722534, "kl": 0.025570290806354024, "learning_rate": 3.281428571428571e-07, "loss": 0.0, "num_tokens": 340552128.0, "reward": 0.1890625, "reward_std": 0.29664099514484404, "rewards/verify_chess_move/mean": 0.1890625, "rewards/verify_chess_move/std": 0.9793807625770569, "step": 4595 }, { "completion_length": 350.8, "completions/clipped_ratio": 0.0, "completions/max_length": 350.8, "completions/max_terminated_length": 350.8, "completions/mean_length": 123.596875, "completions/mean_terminated_length": 123.596875, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.004162993880398996, "frac_reward_zero_std": 0.58125, "grad_norm": 0.16143538057804108, "kl": 0.02412077671906445, "learning_rate": 3.285e-07, "loss": 0.0, "num_tokens": 340909172.0, "reward": 0.1796875, "reward_std": 0.3629316747188568, "rewards/verify_chess_move/mean": 0.1796875, "rewards/verify_chess_move/std": 0.976927387714386, "step": 4600 }, { "completion_length": 318.6, "completions/clipped_ratio": 0.0, "completions/max_length": 318.6, "completions/max_terminated_length": 318.6, "completions/mean_length": 127.92578125, "completions/mean_terminated_length": 127.92578125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.004167518873747255, "frac_reward_zero_std": 0.59375, "grad_norm": 0.1557893007993698, "kl": 0.02591217905282974, "learning_rate": 3.288571428571428e-07, "loss": 0.0, "num_tokens": 341274061.0, "reward": 0.11875, "reward_std": 0.33769031167030333, "rewards/verify_chess_move/mean": 0.11875, "rewards/verify_chess_move/std": 0.9897127747535706, "step": 4605 }, { "completion_length": 387.4, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/max_terminated_length": 387.4, "completions/mean_length": 131.40078125, "completions/mean_terminated_length": 131.40078125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0041720438670955155, "frac_reward_zero_std": 0.58125, "grad_norm": 0.14758436381816864, "kl": 0.028115819027880205, "learning_rate": 3.2921428571428573e-07, "loss": 0.0, "num_tokens": 341643910.0, "reward": 0.0953125, "reward_std": 0.3625538110733032, "rewards/verify_chess_move/mean": 0.0953125, "rewards/verify_chess_move/std": 0.983694314956665, "step": 4610 }, { "completion_length": 413.8, "completions/clipped_ratio": 0.0, "completions/max_length": 413.8, "completions/max_terminated_length": 413.8, "completions/mean_length": 118.3125, "completions/mean_terminated_length": 118.3125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0041765688604437756, "frac_reward_zero_std": 0.64375, "grad_norm": 0.1284889280796051, "kl": 0.02462222593021579, "learning_rate": 3.2957142857142854e-07, "loss": 0.0, "num_tokens": 341993622.0, "reward": 0.2875, "reward_std": 0.297434875369072, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9526752948760986, "step": 4615 }, { "completion_length": 338.8, "completions/clipped_ratio": 0.0, "completions/max_length": 338.8, "completions/max_terminated_length": 338.8, "completions/mean_length": 128.859375, "completions/mean_terminated_length": 128.859375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004181093853792035, "frac_reward_zero_std": 0.65, "grad_norm": 0.5600035190582275, "kl": 0.10456571358954533, "learning_rate": 3.299285714285714e-07, "loss": 0.0001, "num_tokens": 342359522.0, "reward": 0.1203125, "reward_std": 0.30915539264678954, "rewards/verify_chess_move/mean": 0.1203125, "rewards/verify_chess_move/std": 0.9879956126213074, "step": 4620 }, { "completion_length": 323.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 118.98359375, "completions/mean_terminated_length": 118.98359375, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.004185618847140295, "frac_reward_zero_std": 0.6625, "grad_norm": 0.24904923141002655, "kl": 0.0806541275116615, "learning_rate": 3.3028571428571426e-07, "loss": 0.0001, "num_tokens": 342709733.0, "reward": 0.215625, "reward_std": 0.2941211938858032, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9780671954154968, "step": 4625 }, { "completion_length": 312.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 116.56796875, "completions/mean_terminated_length": 116.56796875, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.004190143840488554, "frac_reward_zero_std": 0.6875, "grad_norm": 0.20340076088905334, "kl": 0.05632038920884952, "learning_rate": 3.306428571428571e-07, "loss": 0.0001, "num_tokens": 343058500.0, "reward": 0.15, "reward_std": 0.2701270252466202, "rewards/verify_chess_move/mean": 0.15, "rewards/verify_chess_move/std": 0.9823262214660644, "step": 4630 }, { "completion_length": 456.6, "completions/clipped_ratio": 0.0, "completions/max_length": 456.6, "completions/max_terminated_length": 456.6, "completions/mean_length": 120.78671875, "completions/mean_terminated_length": 120.78671875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.004194668833836814, "frac_reward_zero_std": 0.60625, "grad_norm": 0.4053203761577606, "kl": 0.03770174743840471, "learning_rate": 3.31e-07, "loss": 0.0, "num_tokens": 343411347.0, "reward": 0.1953125, "reward_std": 0.3280522465705872, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9645248889923096, "step": 4635 }, { "completion_length": 330.8, "completions/clipped_ratio": 0.0, "completions/max_length": 330.8, "completions/max_terminated_length": 330.8, "completions/mean_length": 118.234375, "completions/mean_terminated_length": 118.234375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004199193827185074, "frac_reward_zero_std": 0.66875, "grad_norm": 0.25051796436309814, "kl": 0.026687402866082267, "learning_rate": 3.3135714285714285e-07, "loss": 0.0, "num_tokens": 343761607.0, "reward": 0.23125, "reward_std": 0.2945311665534973, "rewards/verify_chess_move/mean": 0.23125, "rewards/verify_chess_move/std": 0.9695397257804871, "step": 4640 }, { "completion_length": 376.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 126.21953125, "completions/mean_terminated_length": 126.21953125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0042037188205333335, "frac_reward_zero_std": 0.64375, "grad_norm": 0.16102761030197144, "kl": 0.030149899097159504, "learning_rate": 3.317142857142857e-07, "loss": 0.0, "num_tokens": 344122920.0, "reward": 0.2328125, "reward_std": 0.2967518657445908, "rewards/verify_chess_move/mean": 0.2328125, "rewards/verify_chess_move/std": 0.9582330584526062, "step": 4645 }, { "completion_length": 333.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 116.74140625, "completions/mean_terminated_length": 116.74140625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004208243813881594, "frac_reward_zero_std": 0.6375, "grad_norm": 0.2949542999267578, "kl": 0.03848379196133465, "learning_rate": 3.3207142857142857e-07, "loss": 0.0, "num_tokens": 344470517.0, "reward": 0.0078125, "reward_std": 0.32047046422958375, "rewards/verify_chess_move/mean": 0.0078125, "rewards/verify_chess_move/std": 0.9927641987800598, "step": 4650 }, { "completion_length": 393.8, "completions/clipped_ratio": 0.0, "completions/max_length": 393.8, "completions/max_terminated_length": 393.8, "completions/mean_length": 123.90625, "completions/mean_terminated_length": 123.90625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004212768807229854, "frac_reward_zero_std": 0.625, "grad_norm": 0.1433904469013214, "kl": 0.025777766312239693, "learning_rate": 3.324285714285714e-07, "loss": 0.0, "num_tokens": 344826741.0, "reward": 0.334375, "reward_std": 0.32042455673217773, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9412930011749268, "step": 4655 }, { "completion_length": 320.6, "completions/clipped_ratio": 0.0, "completions/max_length": 320.6, "completions/max_terminated_length": 320.6, "completions/mean_length": 121.2578125, "completions/mean_terminated_length": 121.2578125, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.004217293800578113, "frac_reward_zero_std": 0.63125, "grad_norm": 0.2354012280702591, "kl": 0.03998441194999032, "learning_rate": 3.327857142857143e-07, "loss": 0.0, "num_tokens": 345183231.0, "reward": 0.1359375, "reward_std": 0.32147297263145447, "rewards/verify_chess_move/mean": 0.1359375, "rewards/verify_chess_move/std": 0.9853208065032959, "step": 4660 }, { "completion_length": 324.8, "completions/clipped_ratio": 0.0, "completions/max_length": 324.8, "completions/max_terminated_length": 324.8, "completions/mean_length": 123.63125, "completions/mean_terminated_length": 123.63125, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.004221818793926373, "frac_reward_zero_std": 0.6625, "grad_norm": 0.3195115923881531, "kl": 0.03211738386307843, "learning_rate": 3.331428571428571e-07, "loss": 0.0, "num_tokens": 345538159.0, "reward": 0.1890625, "reward_std": 0.3045811355113983, "rewards/verify_chess_move/mean": 0.1890625, "rewards/verify_chess_move/std": 0.9819552659988403, "step": 4665 }, { "completion_length": 327.6, "completions/clipped_ratio": 0.0, "completions/max_length": 327.6, "completions/max_terminated_length": 327.6, "completions/mean_length": 125.09765625, "completions/mean_terminated_length": 125.09765625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.004226343787274633, "frac_reward_zero_std": 0.66875, "grad_norm": 0.5439039468765259, "kl": 0.0499537393276114, "learning_rate": 3.335e-07, "loss": 0.0, "num_tokens": 345897892.0, "reward": 0.275, "reward_std": 0.2797571986913681, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9545600414276123, "step": 4670 }, { "completion_length": 342.8, "completions/clipped_ratio": 0.0, "completions/max_length": 342.8, "completions/max_terminated_length": 342.8, "completions/mean_length": 119.74921875, "completions/mean_terminated_length": 119.74921875, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.004230868780622892, "frac_reward_zero_std": 0.60625, "grad_norm": 0.5135943293571472, "kl": 0.09751264741062186, "learning_rate": 3.338571428571428e-07, "loss": 0.0001, "num_tokens": 346247067.0, "reward": 0.2109375, "reward_std": 0.3250102400779724, "rewards/verify_chess_move/mean": 0.2109375, "rewards/verify_chess_move/std": 0.9648400664329528, "step": 4675 }, { "completion_length": 388.2, "completions/clipped_ratio": 0.0, "completions/max_length": 388.2, "completions/max_terminated_length": 388.2, "completions/mean_length": 126.33515625, "completions/mean_terminated_length": 126.33515625, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.004235393773971152, "frac_reward_zero_std": 0.6125, "grad_norm": 0.3594011664390564, "kl": 0.06289221409824677, "learning_rate": 3.3421428571428574e-07, "loss": 0.0001, "num_tokens": 346609952.0, "reward": 0.0609375, "reward_std": 0.35065900087356566, "rewards/verify_chess_move/mean": 0.0609375, "rewards/verify_chess_move/std": 0.9756619572639466, "step": 4680 }, { "completion_length": 419.4, "completions/clipped_ratio": 0.0, "completions/max_length": 419.4, "completions/max_terminated_length": 419.4, "completions/mean_length": 117.71484375, "completions/mean_terminated_length": 117.71484375, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.004239918767319412, "frac_reward_zero_std": 0.63125, "grad_norm": 0.2824150025844574, "kl": 0.06273081686813384, "learning_rate": 3.3457142857142855e-07, "loss": 0.0001, "num_tokens": 346957907.0, "reward": 0.2078125, "reward_std": 0.32420492768287656, "rewards/verify_chess_move/mean": 0.2078125, "rewards/verify_chess_move/std": 0.9722478151321411, "step": 4685 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 114.01171875, "completions/mean_terminated_length": 114.01171875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004244443760667672, "frac_reward_zero_std": 0.675, "grad_norm": 0.38325342535972595, "kl": 0.04095816551707685, "learning_rate": 3.349285714285714e-07, "loss": 0.0, "num_tokens": 347300858.0, "reward": 0.225, "reward_std": 0.2718724191188812, "rewards/verify_chess_move/mean": 0.225, "rewards/verify_chess_move/std": 0.972052538394928, "step": 4690 }, { "completion_length": 374.2, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 119.74921875, "completions/mean_terminated_length": 119.74921875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.004248968754015932, "frac_reward_zero_std": 0.6125, "grad_norm": 0.5040019750595093, "kl": 0.48029972686781547, "learning_rate": 3.3528571428571427e-07, "loss": 0.0005, "num_tokens": 347652777.0, "reward": 0.240625, "reward_std": 0.3349424242973328, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9655497670173645, "step": 4695 }, { "completion_length": 382.8, "completions/clipped_ratio": 0.0, "completions/max_length": 382.8, "completions/max_terminated_length": 382.8, "completions/mean_length": 117.7875, "completions/mean_terminated_length": 117.7875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.004253493747364191, "frac_reward_zero_std": 0.65, "grad_norm": 0.2584799528121948, "kl": 0.11348415916436352, "learning_rate": 3.3564285714285713e-07, "loss": 0.0001, "num_tokens": 348001665.0, "reward": 0.1609375, "reward_std": 0.3000139623880386, "rewards/verify_chess_move/mean": 0.1609375, "rewards/verify_chess_move/std": 0.9801913142204285, "step": 4700 }, { "completion_length": 439.6, "completions/clipped_ratio": 0.0, "completions/max_length": 439.6, "completions/max_terminated_length": 439.6, "completions/mean_length": 119.84140625, "completions/mean_terminated_length": 119.84140625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.004258018740712451, "frac_reward_zero_std": 0.625, "grad_norm": 0.21159674227237701, "kl": 0.0567615640728036, "learning_rate": 3.36e-07, "loss": 0.0001, "num_tokens": 348353254.0, "reward": 0.1515625, "reward_std": 0.3147930145263672, "rewards/verify_chess_move/mean": 0.1515625, "rewards/verify_chess_move/std": 0.9836025357246398, "step": 4705 }, { "completion_length": 356.8, "completions/clipped_ratio": 0.0, "completions/max_length": 356.8, "completions/max_terminated_length": 356.8, "completions/mean_length": 118.43671875, "completions/mean_terminated_length": 118.43671875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.004262543734060711, "frac_reward_zero_std": 0.7375, "grad_norm": 0.3915605843067169, "kl": 0.04731674803770147, "learning_rate": 3.3635714285714285e-07, "loss": 0.0, "num_tokens": 348704837.0, "reward": 0.1890625, "reward_std": 0.22247102856636047, "rewards/verify_chess_move/mean": 0.1890625, "rewards/verify_chess_move/std": 0.9709698796272278, "step": 4710 }, { "completion_length": 320.4, "completions/clipped_ratio": 0.0, "completions/max_length": 320.4, "completions/max_terminated_length": 320.4, "completions/mean_length": 124.275, "completions/mean_terminated_length": 124.275, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0042670687274089705, "frac_reward_zero_std": 0.63125, "grad_norm": 0.1857142597436905, "kl": 0.08388297606725245, "learning_rate": 3.3671428571428566e-07, "loss": 0.0001, "num_tokens": 349065285.0, "reward": 0.11875, "reward_std": 0.3010170519351959, "rewards/verify_chess_move/mean": 0.11875, "rewards/verify_chess_move/std": 0.9746312975883484, "step": 4715 }, { "completion_length": 319.6, "completions/clipped_ratio": 0.0, "completions/max_length": 319.6, "completions/max_terminated_length": 319.6, "completions/mean_length": 118.1078125, "completions/mean_terminated_length": 118.1078125, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0042715937207572306, "frac_reward_zero_std": 0.68125, "grad_norm": 0.1501009315252304, "kl": 0.1513606802560389, "learning_rate": 3.370714285714286e-07, "loss": 0.0002, "num_tokens": 349416071.0, "reward": 0.15625, "reward_std": 0.27796531021595, "rewards/verify_chess_move/mean": 0.15625, "rewards/verify_chess_move/std": 0.9802200078964234, "step": 4720 }, { "completion_length": 322.8, "completions/clipped_ratio": 0.0, "completions/max_length": 322.8, "completions/max_terminated_length": 322.8, "completions/mean_length": 123.1328125, "completions/mean_terminated_length": 123.1328125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.004276118714105491, "frac_reward_zero_std": 0.63125, "grad_norm": 0.6879270672798157, "kl": 0.10762545613688417, "learning_rate": 3.374285714285714e-07, "loss": 0.0001, "num_tokens": 349774065.0, "reward": 0.109375, "reward_std": 0.3271510124206543, "rewards/verify_chess_move/mean": 0.109375, "rewards/verify_chess_move/std": 0.9866475224494934, "step": 4725 }, { "completion_length": 372.6, "completions/clipped_ratio": 0.0, "completions/max_length": 372.6, "completions/max_terminated_length": 372.6, "completions/mean_length": 116.89296875, "completions/mean_terminated_length": 116.89296875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.00428064370745375, "frac_reward_zero_std": 0.575, "grad_norm": 0.18476304411888123, "kl": 0.06631483209202997, "learning_rate": 3.377857142857143e-07, "loss": 0.0001, "num_tokens": 350120744.0, "reward": 0.1953125, "reward_std": 0.3708164393901825, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9710379004478454, "step": 4730 }, { "completion_length": 304.2, "completions/clipped_ratio": 0.0, "completions/max_length": 304.2, "completions/max_terminated_length": 304.2, "completions/mean_length": 118.5671875, "completions/mean_terminated_length": 118.5671875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.00428516870080201, "frac_reward_zero_std": 0.63125, "grad_norm": 0.1694999635219574, "kl": 0.04937950554303825, "learning_rate": 3.381428571428571e-07, "loss": 0.0, "num_tokens": 350472342.0, "reward": 0.1671875, "reward_std": 0.32173261046409607, "rewards/verify_chess_move/mean": 0.1671875, "rewards/verify_chess_move/std": 0.9806502819061279, "step": 4735 }, { "completion_length": 327.6, "completions/clipped_ratio": 0.0, "completions/max_length": 327.6, "completions/max_terminated_length": 327.6, "completions/mean_length": 114.196875, "completions/mean_terminated_length": 114.196875, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.004289693694150269, "frac_reward_zero_std": 0.65, "grad_norm": 0.25105130672454834, "kl": 0.02540331208729185, "learning_rate": 3.385e-07, "loss": 0.0, "num_tokens": 350814970.0, "reward": 0.203125, "reward_std": 0.30300652384758, "rewards/verify_chess_move/mean": 0.203125, "rewards/verify_chess_move/std": 0.9698148608207703, "step": 4740 }, { "completion_length": 396.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 396.8, "completions/max_terminated_length": 307.2, "completions/mean_length": 125.93671875, "completions/mean_terminated_length": 125.43539123535156, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.004294218687498529, "frac_reward_zero_std": 0.6875, "grad_norm": 0.18743358552455902, "kl": 0.025500100693898277, "learning_rate": 3.3885714285714283e-07, "loss": 0.0, "num_tokens": 351175481.0, "reward": 0.0546875, "reward_std": 0.27328626811504364, "rewards/verify_chess_move/mean": 0.0546875, "rewards/verify_chess_move/std": 0.9894884824752808, "step": 4745 }, { "completion_length": 369.6, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/max_terminated_length": 369.6, "completions/mean_length": 123.96484375, "completions/mean_terminated_length": 123.96484375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004298743680846789, "frac_reward_zero_std": 0.59375, "grad_norm": 0.13662753999233246, "kl": 0.025846136052859948, "learning_rate": 3.392142857142857e-07, "loss": 0.0, "num_tokens": 351532324.0, "reward": 0.1234375, "reward_std": 0.351147735118866, "rewards/verify_chess_move/mean": 0.1234375, "rewards/verify_chess_move/std": 0.9908722877502442, "step": 4750 }, { "completion_length": 332.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 123.4875, "completions/mean_terminated_length": 123.4875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004303268674195049, "frac_reward_zero_std": 0.65, "grad_norm": 0.15752820670604706, "kl": 0.02451645758119412, "learning_rate": 3.3957142857142855e-07, "loss": 0.0, "num_tokens": 351890428.0, "reward": 0.2609375, "reward_std": 0.29869446754455564, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9638716340065002, "step": 4755 }, { "completion_length": 401.6, "completions/clipped_ratio": 0.0, "completions/max_length": 401.6, "completions/max_terminated_length": 401.6, "completions/mean_length": 128.75625, "completions/mean_terminated_length": 128.75625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.004307793667543309, "frac_reward_zero_std": 0.65, "grad_norm": 0.1516522318124771, "kl": 0.02382672555395402, "learning_rate": 3.399285714285714e-07, "loss": 0.0, "num_tokens": 352256380.0, "reward": 0.1734375, "reward_std": 0.3036895155906677, "rewards/verify_chess_move/mean": 0.1734375, "rewards/verify_chess_move/std": 0.9810305118560791, "step": 4760 }, { "completion_length": 309.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 117.315625, "completions/mean_terminated_length": 117.315625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004312318660891569, "frac_reward_zero_std": 0.64375, "grad_norm": 0.21902956068515778, "kl": 0.02880285815917887, "learning_rate": 3.402857142857143e-07, "loss": 0.0, "num_tokens": 352604560.0, "reward": 0.1953125, "reward_std": 0.30170044004917146, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9796490550041199, "step": 4765 }, { "completion_length": 384.4, "completions/clipped_ratio": 0.0, "completions/max_length": 384.4, "completions/max_terminated_length": 384.4, "completions/mean_length": 120.5859375, "completions/mean_terminated_length": 120.5859375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004316843654239828, "frac_reward_zero_std": 0.64375, "grad_norm": 0.15267416834831238, "kl": 0.02785990494303405, "learning_rate": 3.4064285714285713e-07, "loss": 0.0, "num_tokens": 352955502.0, "reward": 0.1578125, "reward_std": 0.31126424074172976, "rewards/verify_chess_move/mean": 0.1578125, "rewards/verify_chess_move/std": 0.9780982375144959, "step": 4770 }, { "completion_length": 326.2, "completions/clipped_ratio": 0.0, "completions/max_length": 326.2, "completions/max_terminated_length": 326.2, "completions/mean_length": 124.396875, "completions/mean_terminated_length": 124.396875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004321368647588088, "frac_reward_zero_std": 0.675, "grad_norm": 0.6859766244888306, "kl": 0.02868145835818723, "learning_rate": 3.41e-07, "loss": 0.0, "num_tokens": 353313994.0, "reward": 0.1234375, "reward_std": 0.27976705729961393, "rewards/verify_chess_move/mean": 0.1234375, "rewards/verify_chess_move/std": 0.9866770505905151, "step": 4775 }, { "completion_length": 357.2, "completions/clipped_ratio": 0.0, "completions/max_length": 357.2, "completions/max_terminated_length": 357.2, "completions/mean_length": 125.69765625, "completions/mean_terminated_length": 125.69765625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.004325893640936348, "frac_reward_zero_std": 0.6875, "grad_norm": 0.26415061950683594, "kl": 0.026786200917558744, "learning_rate": 3.4135714285714286e-07, "loss": 0.0, "num_tokens": 353675111.0, "reward": 0.2171875, "reward_std": 0.27853899598121645, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9723373413085937, "step": 4780 }, { "completion_length": 318.8, "completions/clipped_ratio": 0.0, "completions/max_length": 318.8, "completions/max_terminated_length": 318.8, "completions/mean_length": 113.21484375, "completions/mean_terminated_length": 113.21484375, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.004330418634284607, "frac_reward_zero_std": 0.6875, "grad_norm": 0.7889008522033691, "kl": 0.058365633222274484, "learning_rate": 3.4171428571428567e-07, "loss": 0.0001, "num_tokens": 354016818.0, "reward": 0.275, "reward_std": 0.2566725671291351, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9397473573684693, "step": 4785 }, { "completion_length": 426.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.8, "completions/max_terminated_length": 336.2, "completions/mean_length": 120.9015625, "completions/mean_terminated_length": 120.40607604980468, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.0043349436276328675, "frac_reward_zero_std": 0.65625, "grad_norm": 0.13525770604610443, "kl": 0.027718722150893882, "learning_rate": 3.420714285714286e-07, "loss": 0.0, "num_tokens": 354366236.0, "reward": 0.2703125, "reward_std": 0.2978101551532745, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9437273859977722, "step": 4790 }, { "completion_length": 449.4, "completions/clipped_ratio": 0.0, "completions/max_length": 449.4, "completions/max_terminated_length": 449.4, "completions/mean_length": 118.76796875, "completions/mean_terminated_length": 118.76796875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.004339468620981127, "frac_reward_zero_std": 0.68125, "grad_norm": 0.17094357311725616, "kl": 0.027057796815643086, "learning_rate": 3.424285714285714e-07, "loss": 0.0, "num_tokens": 354715683.0, "reward": 0.184375, "reward_std": 0.2731329917907715, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.9750310897827148, "step": 4795 }, { "completion_length": 342.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 122.6890625, "completions/mean_terminated_length": 122.6890625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004343993614329387, "frac_reward_zero_std": 0.6625, "grad_norm": 0.2299318164587021, "kl": 0.02932522139744833, "learning_rate": 3.427857142857143e-07, "loss": 0.0, "num_tokens": 355073221.0, "reward": 0.1515625, "reward_std": 0.29096393287181854, "rewards/verify_chess_move/mean": 0.1515625, "rewards/verify_chess_move/std": 0.9880384087562561, "step": 4800 }, { "completion_length": 317.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 120.3671875, "completions/mean_terminated_length": 120.3671875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.004348518607677647, "frac_reward_zero_std": 0.66875, "grad_norm": 0.19301952421665192, "kl": 0.029117460845736787, "learning_rate": 3.431428571428571e-07, "loss": 0.0, "num_tokens": 355425491.0, "reward": 0.1578125, "reward_std": 0.27996936440467834, "rewards/verify_chess_move/mean": 0.1578125, "rewards/verify_chess_move/std": 0.9689990520477295, "step": 4805 }, { "completion_length": 360.8, "completions/clipped_ratio": 0.0, "completions/max_length": 360.8, "completions/max_terminated_length": 360.8, "completions/mean_length": 121.46484375, "completions/mean_terminated_length": 121.46484375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.004353043601025906, "frac_reward_zero_std": 0.65625, "grad_norm": 0.30461132526397705, "kl": 0.024508816143497824, "learning_rate": 3.435e-07, "loss": 0.0, "num_tokens": 355780566.0, "reward": 0.221875, "reward_std": 0.29696052372455595, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9593791246414185, "step": 4810 }, { "completion_length": 384.4, "completions/clipped_ratio": 0.0, "completions/max_length": 384.4, "completions/max_terminated_length": 384.4, "completions/mean_length": 119.9859375, "completions/mean_terminated_length": 119.9859375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004357568594374166, "frac_reward_zero_std": 0.6125, "grad_norm": 0.2659713923931122, "kl": 0.031031454470939933, "learning_rate": 3.4385714285714283e-07, "loss": 0.0, "num_tokens": 356133548.0, "reward": 0.2375, "reward_std": 0.3231115996837616, "rewards/verify_chess_move/mean": 0.2375, "rewards/verify_chess_move/std": 0.9578399896621704, "step": 4815 }, { "completion_length": 312.8, "completions/clipped_ratio": 0.0, "completions/max_length": 312.8, "completions/max_terminated_length": 312.8, "completions/mean_length": 128.50625, "completions/mean_terminated_length": 128.50625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.004362093587722426, "frac_reward_zero_std": 0.6125, "grad_norm": 0.19220410287380219, "kl": 0.0478420720086433, "learning_rate": 3.442142857142857e-07, "loss": 0.0, "num_tokens": 356501484.0, "reward": 0.1375, "reward_std": 0.3340472638607025, "rewards/verify_chess_move/mean": 0.1375, "rewards/verify_chess_move/std": 0.9825525999069213, "step": 4820 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 120.728125, "completions/mean_terminated_length": 120.728125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.0043666185810706856, "frac_reward_zero_std": 0.6625, "grad_norm": 0.2796441316604614, "kl": 0.10084423668449745, "learning_rate": 3.4457142857142856e-07, "loss": 0.0001, "num_tokens": 356854480.0, "reward": 0.1453125, "reward_std": 0.2911731481552124, "rewards/verify_chess_move/mean": 0.1453125, "rewards/verify_chess_move/std": 0.9861366987228394, "step": 4825 }, { "completion_length": 380.6, "completions/clipped_ratio": 0.0, "completions/max_length": 380.6, "completions/max_terminated_length": 380.6, "completions/mean_length": 127.415625, "completions/mean_terminated_length": 127.415625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.004371143574418946, "frac_reward_zero_std": 0.675, "grad_norm": 0.27116310596466064, "kl": 0.09711991600343026, "learning_rate": 3.449285714285714e-07, "loss": 0.0001, "num_tokens": 357221572.0, "reward": 0.1046875, "reward_std": 0.2821715921163559, "rewards/verify_chess_move/mean": 0.1046875, "rewards/verify_chess_move/std": 0.9844091773033142, "step": 4830 }, { "completion_length": 390.4, "completions/clipped_ratio": 0.0, "completions/max_length": 390.4, "completions/max_terminated_length": 390.4, "completions/mean_length": 114.0390625, "completions/mean_terminated_length": 114.0390625, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.004375668567767206, "frac_reward_zero_std": 0.6375, "grad_norm": 0.1699310839176178, "kl": 0.20521426723571495, "learning_rate": 3.452857142857143e-07, "loss": 0.0002, "num_tokens": 357565094.0, "reward": 0.2171875, "reward_std": 0.32388638854026797, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9698048949241638, "step": 4835 }, { "completion_length": 429.6, "completions/clipped_ratio": 0.0, "completions/max_length": 429.6, "completions/max_terminated_length": 429.6, "completions/mean_length": 133.16015625, "completions/mean_terminated_length": 133.16015625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.004380193561115465, "frac_reward_zero_std": 0.63125, "grad_norm": 0.15907655656337738, "kl": 0.04051080831559375, "learning_rate": 3.4564285714285714e-07, "loss": 0.0, "num_tokens": 357934827.0, "reward": 0.159375, "reward_std": 0.3389314204454422, "rewards/verify_chess_move/mean": 0.159375, "rewards/verify_chess_move/std": 0.9828718066215515, "step": 4840 }, { "completion_length": 343.4, "completions/clipped_ratio": 0.0, "completions/max_length": 343.4, "completions/max_terminated_length": 343.4, "completions/mean_length": 116.403125, "completions/mean_terminated_length": 116.403125, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.004384718554463725, "frac_reward_zero_std": 0.65625, "grad_norm": 0.19375644624233246, "kl": 0.03696030200808309, "learning_rate": 3.4599999999999995e-07, "loss": 0.0, "num_tokens": 358281079.0, "reward": 0.2640625, "reward_std": 0.2928616017103195, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9636032342910766, "step": 4845 }, { "completion_length": 330.2, "completions/clipped_ratio": 0.0, "completions/max_length": 330.2, "completions/max_terminated_length": 330.2, "completions/mean_length": 128.475, "completions/mean_terminated_length": 128.475, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.004389243547811985, "frac_reward_zero_std": 0.63125, "grad_norm": 0.1769915521144867, "kl": 0.0342183725675568, "learning_rate": 3.4635714285714286e-07, "loss": 0.0, "num_tokens": 358645087.0, "reward": 0.265625, "reward_std": 0.30464808344841005, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9586694717407227, "step": 4850 }, { "completion_length": 331.6, "completions/clipped_ratio": 0.0, "completions/max_length": 331.6, "completions/max_terminated_length": 331.6, "completions/mean_length": 124.6046875, "completions/mean_terminated_length": 124.6046875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004393768541160244, "frac_reward_zero_std": 0.6375, "grad_norm": 0.17002391815185547, "kl": 0.04060615268535912, "learning_rate": 3.4671428571428567e-07, "loss": 0.0, "num_tokens": 359004757.0, "reward": 0.2171875, "reward_std": 0.29980434477329254, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9596474766731262, "step": 4855 }, { "completion_length": 323.2, "completions/clipped_ratio": 0.0, "completions/max_length": 323.2, "completions/max_terminated_length": 323.2, "completions/mean_length": 120.90546875, "completions/mean_terminated_length": 120.90546875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0043982935345085045, "frac_reward_zero_std": 0.625, "grad_norm": 0.39160221815109253, "kl": 0.03308986708289012, "learning_rate": 3.470714285714286e-07, "loss": 0.0, "num_tokens": 359355452.0, "reward": 0.2859375, "reward_std": 0.32721286416053774, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9559532880783081, "step": 4860 }, { "completion_length": 427.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 427.4, "completions/max_terminated_length": 354.6, "completions/mean_length": 130.37734375, "completions/mean_terminated_length": 129.87757263183593, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004402818527856764, "frac_reward_zero_std": 0.6375, "grad_norm": 0.4985176920890808, "kl": 0.07748159277543891, "learning_rate": 3.474285714285714e-07, "loss": 0.0001, "num_tokens": 359724143.0, "reward": 0.0859375, "reward_std": 0.31111392974853513, "rewards/verify_chess_move/mean": 0.0859375, "rewards/verify_chess_move/std": 0.987647819519043, "step": 4865 }, { "completion_length": 405.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 129.65078125, "completions/mean_terminated_length": 129.65078125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004407343521205024, "frac_reward_zero_std": 0.60625, "grad_norm": 0.31388482451438904, "kl": 0.03031153375050053, "learning_rate": 3.477857142857143e-07, "loss": 0.0, "num_tokens": 360090760.0, "reward": 0.2125, "reward_std": 0.3441466748714447, "rewards/verify_chess_move/mean": 0.2125, "rewards/verify_chess_move/std": 0.9767943143844604, "step": 4870 }, { "completion_length": 352.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 123.9109375, "completions/mean_terminated_length": 123.9109375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.004411868514553284, "frac_reward_zero_std": 0.6, "grad_norm": 0.20448662340641022, "kl": 0.06255380563670769, "learning_rate": 3.481428571428571e-07, "loss": 0.0001, "num_tokens": 360448518.0, "reward": 0.1296875, "reward_std": 0.3392560422420502, "rewards/verify_chess_move/mean": 0.1296875, "rewards/verify_chess_move/std": 0.9839545726776123, "step": 4875 }, { "completion_length": 372.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 126.65234375, "completions/mean_terminated_length": 126.65234375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004416393507901543, "frac_reward_zero_std": 0.675, "grad_norm": 0.1483006477355957, "kl": 0.03245960942585953, "learning_rate": 3.485e-07, "loss": 0.0, "num_tokens": 360810745.0, "reward": 0.175, "reward_std": 0.27550149261951445, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9775237083435059, "step": 4880 }, { "completion_length": 355.2, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 126.75078125, "completions/mean_terminated_length": 126.75078125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.004420918501249803, "frac_reward_zero_std": 0.68125, "grad_norm": 0.15564563870429993, "kl": 0.09140813069534488, "learning_rate": 3.4885714285714284e-07, "loss": 0.0001, "num_tokens": 361172586.0, "reward": 0.1578125, "reward_std": 0.27108207643032073, "rewards/verify_chess_move/mean": 0.1578125, "rewards/verify_chess_move/std": 0.9881150364875794, "step": 4885 }, { "completion_length": 393.8, "completions/clipped_ratio": 0.0, "completions/max_length": 393.8, "completions/max_terminated_length": 393.8, "completions/mean_length": 115.534375, "completions/mean_terminated_length": 115.534375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.004425443494598063, "frac_reward_zero_std": 0.775, "grad_norm": 0.12218930572271347, "kl": 0.05711695878999308, "learning_rate": 3.492142857142857e-07, "loss": 0.0001, "num_tokens": 361515966.0, "reward": 0.33125, "reward_std": 0.19616669565439224, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9326194047927856, "step": 4890 }, { "completion_length": 344.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 126.5578125, "completions/mean_terminated_length": 126.5578125, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.0044299684879463225, "frac_reward_zero_std": 0.675, "grad_norm": 0.1538800299167633, "kl": 0.15718928287387826, "learning_rate": 3.4957142857142856e-07, "loss": 0.0002, "num_tokens": 361878680.0, "reward": 0.1203125, "reward_std": 0.28327793478965757, "rewards/verify_chess_move/mean": 0.1203125, "rewards/verify_chess_move/std": 0.9865529298782348, "step": 4895 }, { "completion_length": 359.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 127.24140625, "completions/mean_terminated_length": 127.24140625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.004434493481294583, "frac_reward_zero_std": 0.64375, "grad_norm": 0.20451806485652924, "kl": 0.07871960076736287, "learning_rate": 3.499285714285714e-07, "loss": 0.0001, "num_tokens": 362241277.0, "reward": 0.2015625, "reward_std": 0.310795384645462, "rewards/verify_chess_move/mean": 0.2015625, "rewards/verify_chess_move/std": 0.9736518502235413, "step": 4900 }, { "completion_length": 325.6, "completions/clipped_ratio": 0.0, "completions/max_length": 325.6, "completions/max_terminated_length": 325.6, "completions/mean_length": 114.434375, "completions/mean_terminated_length": 114.434375, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.004439018474642843, "frac_reward_zero_std": 0.6125, "grad_norm": 0.2310693860054016, "kl": 0.0325574820919428, "learning_rate": 3.502857142857143e-07, "loss": 0.0, "num_tokens": 362584137.0, "reward": 0.103125, "reward_std": 0.32268922328948973, "rewards/verify_chess_move/mean": 0.103125, "rewards/verify_chess_move/std": 0.9959015250205994, "step": 4905 }, { "completion_length": 342.4, "completions/clipped_ratio": 0.0, "completions/max_length": 342.4, "completions/max_terminated_length": 342.4, "completions/mean_length": 107.546875, "completions/mean_terminated_length": 107.546875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004443543467991102, "frac_reward_zero_std": 0.74375, "grad_norm": 0.06723391264677048, "kl": 0.024016893620137127, "learning_rate": 3.5064285714285715e-07, "loss": 0.0, "num_tokens": 362917309.0, "reward": 0.240625, "reward_std": 0.2119017779827118, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9637383222579956, "step": 4910 }, { "completion_length": 399.2, "completions/clipped_ratio": 0.0, "completions/max_length": 399.2, "completions/max_terminated_length": 399.2, "completions/mean_length": 123.52265625, "completions/mean_terminated_length": 123.52265625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.004448068461339362, "frac_reward_zero_std": 0.70625, "grad_norm": 0.22626528143882751, "kl": 0.02292147313710302, "learning_rate": 3.5099999999999995e-07, "loss": 0.0, "num_tokens": 363277122.0, "reward": 0.1859375, "reward_std": 0.24708987176418304, "rewards/verify_chess_move/mean": 0.1859375, "rewards/verify_chess_move/std": 0.9792798399925232, "step": 4915 }, { "completion_length": 312.6, "completions/clipped_ratio": 0.0, "completions/max_length": 312.6, "completions/max_terminated_length": 312.6, "completions/mean_length": 112.8734375, "completions/mean_terminated_length": 112.8734375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004452593454687621, "frac_reward_zero_std": 0.6625, "grad_norm": 0.21274860203266144, "kl": 0.025628323794808238, "learning_rate": 3.5135714285714287e-07, "loss": 0.0, "num_tokens": 363616680.0, "reward": 0.375, "reward_std": 0.28434032797813413, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9250186920166016, "step": 4920 }, { "completion_length": 352.2, "completions/clipped_ratio": 0.0, "completions/max_length": 352.2, "completions/max_terminated_length": 352.2, "completions/mean_length": 114.74921875, "completions/mean_terminated_length": 114.74921875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.004457118448035881, "frac_reward_zero_std": 0.74375, "grad_norm": 0.14164212346076965, "kl": 0.022796355572063477, "learning_rate": 3.517142857142857e-07, "loss": 0.0, "num_tokens": 363960703.0, "reward": 0.2703125, "reward_std": 0.22462677359580993, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9487927317619324, "step": 4925 }, { "completion_length": 484.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 484.2, "completions/max_terminated_length": 390.6, "completions/mean_length": 126.2578125, "completions/mean_terminated_length": 125.75506896972657, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.004461643441384141, "frac_reward_zero_std": 0.70625, "grad_norm": 0.16604533791542053, "kl": 0.019884531252318993, "learning_rate": 3.520714285714286e-07, "loss": 0.0, "num_tokens": 364320505.0, "reward": 0.3203125, "reward_std": 0.25140291452407837, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9371677875518799, "step": 4930 }, { "completion_length": 419.6, "completions/clipped_ratio": 0.0, "completions/max_length": 419.6, "completions/max_terminated_length": 419.6, "completions/mean_length": 117.03828125, "completions/mean_terminated_length": 117.03828125, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.004466168434732401, "frac_reward_zero_std": 0.66875, "grad_norm": 0.09203781932592392, "kl": 0.02057165756414179, "learning_rate": 3.524285714285714e-07, "loss": 0.0, "num_tokens": 364666434.0, "reward": 0.253125, "reward_std": 0.289065283536911, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9585360646247864, "step": 4935 }, { "completion_length": 354.8, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/max_terminated_length": 354.8, "completions/mean_length": 123.82421875, "completions/mean_terminated_length": 123.82421875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004470693428080661, "frac_reward_zero_std": 0.625, "grad_norm": 0.12338090687990189, "kl": 0.025609644403448328, "learning_rate": 3.5278571428571426e-07, "loss": 0.0, "num_tokens": 365020689.0, "reward": 0.275, "reward_std": 0.3202133685350418, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9620990633964539, "step": 4940 }, { "completion_length": 328.8, "completions/clipped_ratio": 0.0, "completions/max_length": 328.8, "completions/max_terminated_length": 328.8, "completions/mean_length": 114.42890625, "completions/mean_terminated_length": 114.42890625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.004475218421428921, "frac_reward_zero_std": 0.66875, "grad_norm": 0.1928068995475769, "kl": 0.02275678248843178, "learning_rate": 3.531428571428571e-07, "loss": 0.0, "num_tokens": 365363758.0, "reward": 0.1984375, "reward_std": 0.28312564790248873, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.964760422706604, "step": 4945 }, { "completion_length": 420.4, "completions/clipped_ratio": 0.0, "completions/max_length": 420.4, "completions/max_terminated_length": 420.4, "completions/mean_length": 124.32109375, "completions/mean_terminated_length": 124.32109375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.00447974341477718, "frac_reward_zero_std": 0.675, "grad_norm": 0.1645708829164505, "kl": 0.024849997088313102, "learning_rate": 3.535e-07, "loss": 0.0, "num_tokens": 365721777.0, "reward": 0.2796875, "reward_std": 0.2800751537084579, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9539040207862854, "step": 4950 }, { "completion_length": 355.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 129.54140625, "completions/mean_terminated_length": 129.54140625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.00448426840812544, "frac_reward_zero_std": 0.6, "grad_norm": 0.17600543797016144, "kl": 0.02525382788735442, "learning_rate": 3.5385714285714284e-07, "loss": 0.0, "num_tokens": 366088662.0, "reward": 0.175, "reward_std": 0.3408361434936523, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9843490719795227, "step": 4955 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 119.696875, "completions/mean_terminated_length": 119.696875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0044887934014737, "frac_reward_zero_std": 0.65, "grad_norm": 0.1381489336490631, "kl": 0.02099358119885437, "learning_rate": 3.542142857142857e-07, "loss": 0.0, "num_tokens": 366439434.0, "reward": 0.1703125, "reward_std": 0.29984925389289857, "rewards/verify_chess_move/mean": 0.1703125, "rewards/verify_chess_move/std": 0.9821064233779907, "step": 4960 }, { "completion_length": 428.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 428.0, "completions/max_terminated_length": 354.2, "completions/mean_length": 121.0984375, "completions/mean_terminated_length": 120.59951477050781, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0044933183948219595, "frac_reward_zero_std": 0.65625, "grad_norm": 0.14181005954742432, "kl": 0.02016644751711283, "learning_rate": 3.5457142857142857e-07, "loss": 0.0, "num_tokens": 366792608.0, "reward": 0.14375, "reward_std": 0.29648772478103635, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.9841787338256835, "step": 4965 }, { "completion_length": 322.6, "completions/clipped_ratio": 0.0, "completions/max_length": 322.6, "completions/max_terminated_length": 322.6, "completions/mean_length": 122.17578125, "completions/mean_terminated_length": 122.17578125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0044978433881702196, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22302144765853882, "kl": 0.024366421828744933, "learning_rate": 3.5492857142857143e-07, "loss": 0.0, "num_tokens": 367148881.0, "reward": 0.159375, "reward_std": 0.2582981824874878, "rewards/verify_chess_move/mean": 0.159375, "rewards/verify_chess_move/std": 0.9603287816047669, "step": 4970 }, { "completion_length": 399.2, "completions/clipped_ratio": 0.0, "completions/max_length": 399.2, "completions/max_terminated_length": 399.2, "completions/mean_length": 126.6890625, "completions/mean_terminated_length": 126.6890625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004502368381518479, "frac_reward_zero_std": 0.6375, "grad_norm": 0.11710292845964432, "kl": 0.021423478343058376, "learning_rate": 3.5528571428571424e-07, "loss": 0.0, "num_tokens": 367511971.0, "reward": 0.175, "reward_std": 0.32409659028053284, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9736912727355957, "step": 4975 }, { "completion_length": 334.8, "completions/clipped_ratio": 0.0, "completions/max_length": 334.8, "completions/max_terminated_length": 334.8, "completions/mean_length": 123.94765625, "completions/mean_terminated_length": 123.94765625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004506893374866739, "frac_reward_zero_std": 0.73125, "grad_norm": 0.20527076721191406, "kl": 0.025969205936416984, "learning_rate": 3.5564285714285715e-07, "loss": 0.0, "num_tokens": 367868488.0, "reward": 0.1390625, "reward_std": 0.24077024757862092, "rewards/verify_chess_move/mean": 0.1390625, "rewards/verify_chess_move/std": 0.9775426506996154, "step": 4980 }, { "completion_length": 469.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 126.990625, "completions/mean_terminated_length": 126.990625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.004511418368214999, "frac_reward_zero_std": 0.6125, "grad_norm": 0.10344571620225906, "kl": 0.01997962761670351, "learning_rate": 3.5599999999999996e-07, "loss": 0.0, "num_tokens": 368229068.0, "reward": 0.1765625, "reward_std": 0.33883014023303987, "rewards/verify_chess_move/mean": 0.1765625, "rewards/verify_chess_move/std": 0.9823518872261048, "step": 4985 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 129.04453125, "completions/mean_terminated_length": 129.04453125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.004515943361563258, "frac_reward_zero_std": 0.58125, "grad_norm": 0.30811256170272827, "kl": 0.03168453160906211, "learning_rate": 3.5635714285714287e-07, "loss": 0.0, "num_tokens": 368592749.0, "reward": 0.2578125, "reward_std": 0.34841538071632383, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9604550123214721, "step": 4990 }, { "completion_length": 355.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 121.99453125, "completions/mean_terminated_length": 121.99453125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004520468354911518, "frac_reward_zero_std": 0.69375, "grad_norm": 0.2720293402671814, "kl": 0.03934267368749715, "learning_rate": 3.567142857142857e-07, "loss": 0.0, "num_tokens": 368949790.0, "reward": 0.1515625, "reward_std": 0.26591880321502687, "rewards/verify_chess_move/mean": 0.1515625, "rewards/verify_chess_move/std": 0.9831811308860778, "step": 4995 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 114.68203125, "completions/mean_terminated_length": 114.68203125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004524993348259778, "frac_reward_zero_std": 0.63125, "grad_norm": 0.3388347327709198, "kl": 0.061753738508559766, "learning_rate": 3.570714285714286e-07, "loss": 0.0001, "num_tokens": 369293687.0, "reward": 0.1796875, "reward_std": 0.31400364339351655, "rewards/verify_chess_move/mean": 0.1796875, "rewards/verify_chess_move/std": 0.9823304057121277, "step": 5000 }, { "completion_length": 385.8, "completions/clipped_ratio": 0.0, "completions/max_length": 385.8, "completions/max_terminated_length": 385.8, "completions/mean_length": 125.6421875, "completions/mean_terminated_length": 125.6421875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.004529518341608038, "frac_reward_zero_std": 0.7, "grad_norm": 0.07771598547697067, "kl": 0.07275769503321498, "learning_rate": 3.574285714285714e-07, "loss": 0.0001, "num_tokens": 369653597.0, "reward": 0.2078125, "reward_std": 0.25881490111351013, "rewards/verify_chess_move/mean": 0.2078125, "rewards/verify_chess_move/std": 0.9713160276412964, "step": 5005 }, { "completion_length": 409.4, "completions/clipped_ratio": 0.0, "completions/max_length": 409.4, "completions/max_terminated_length": 409.4, "completions/mean_length": 117.37890625, "completions/mean_terminated_length": 117.37890625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.004534043334956298, "frac_reward_zero_std": 0.69375, "grad_norm": 0.2002081274986267, "kl": 0.03549813170102425, "learning_rate": 3.5778571428571426e-07, "loss": 0.0, "num_tokens": 370002258.0, "reward": 0.2046875, "reward_std": 0.2663935422897339, "rewards/verify_chess_move/mean": 0.2046875, "rewards/verify_chess_move/std": 0.9740064859390258, "step": 5010 }, { "completion_length": 334.8, "completions/clipped_ratio": 0.0, "completions/max_length": 334.8, "completions/max_terminated_length": 334.8, "completions/mean_length": 130.17265625, "completions/mean_terminated_length": 130.17265625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004538568328304558, "frac_reward_zero_std": 0.74375, "grad_norm": 0.14366759359836578, "kl": 0.04156534283538349, "learning_rate": 3.581428571428571e-07, "loss": 0.0, "num_tokens": 370372063.0, "reward": 0.1546875, "reward_std": 0.222621351480484, "rewards/verify_chess_move/mean": 0.1546875, "rewards/verify_chess_move/std": 0.9864946246147156, "step": 5015 }, { "completion_length": 343.6, "completions/clipped_ratio": 0.0, "completions/max_length": 343.6, "completions/max_terminated_length": 343.6, "completions/mean_length": 118.43515625, "completions/mean_terminated_length": 118.43515625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.004543093321652817, "frac_reward_zero_std": 0.725, "grad_norm": 0.1761097013950348, "kl": 0.03045652155415155, "learning_rate": 3.585e-07, "loss": 0.0, "num_tokens": 370721084.0, "reward": 0.425, "reward_std": 0.2369414120912552, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.8896543860435486, "step": 5020 }, { "completion_length": 335.4, "completions/clipped_ratio": 0.0, "completions/max_length": 335.4, "completions/max_terminated_length": 335.4, "completions/mean_length": 125.15234375, "completions/mean_terminated_length": 125.15234375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004547618315001077, "frac_reward_zero_std": 0.75625, "grad_norm": 0.1260533481836319, "kl": 0.03306016460992396, "learning_rate": 3.5885714285714285e-07, "loss": 0.0, "num_tokens": 371082255.0, "reward": 0.14375, "reward_std": 0.2117355138063431, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.9900677919387817, "step": 5025 }, { "completion_length": 343.8, "completions/clipped_ratio": 0.0, "completions/max_length": 343.8, "completions/max_terminated_length": 343.8, "completions/mean_length": 131.28046875, "completions/mean_terminated_length": 131.28046875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.004552143308349336, "frac_reward_zero_std": 0.64375, "grad_norm": 0.14874449372291565, "kl": 0.03101182640530169, "learning_rate": 3.592142857142857e-07, "loss": 0.0, "num_tokens": 371453214.0, "reward": 0.23125, "reward_std": 0.30238145887851714, "rewards/verify_chess_move/mean": 0.23125, "rewards/verify_chess_move/std": 0.9610414385795594, "step": 5030 }, { "completion_length": 324.2, "completions/clipped_ratio": 0.0, "completions/max_length": 324.2, "completions/max_terminated_length": 324.2, "completions/mean_length": 117.3328125, "completions/mean_terminated_length": 117.3328125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.004556668301697596, "frac_reward_zero_std": 0.6625, "grad_norm": 0.21664959192276, "kl": 0.029924280795967206, "learning_rate": 3.5957142857142857e-07, "loss": 0.0, "num_tokens": 371800632.0, "reward": 0.1796875, "reward_std": 0.29642980694770815, "rewards/verify_chess_move/mean": 0.1796875, "rewards/verify_chess_move/std": 0.9771271228790284, "step": 5035 }, { "completion_length": 348.8, "completions/clipped_ratio": 0.0, "completions/max_length": 348.8, "completions/max_terminated_length": 348.8, "completions/mean_length": 116.02578125, "completions/mean_terminated_length": 116.02578125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0045611932950458565, "frac_reward_zero_std": 0.7625, "grad_norm": 0.16923940181732178, "kl": 0.02928433066117577, "learning_rate": 3.5992857142857143e-07, "loss": 0.0, "num_tokens": 372147513.0, "reward": 0.278125, "reward_std": 0.19979733228683472, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.959678304195404, "step": 5040 }, { "completion_length": 407.4, "completions/clipped_ratio": 0.0, "completions/max_length": 407.4, "completions/max_terminated_length": 407.4, "completions/mean_length": 119.5890625, "completions/mean_terminated_length": 119.5890625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004565718288394116, "frac_reward_zero_std": 0.75, "grad_norm": 0.19948381185531616, "kl": 0.025273365815519356, "learning_rate": 3.6028571428571424e-07, "loss": 0.0, "num_tokens": 372500323.0, "reward": 0.1265625, "reward_std": 0.2113690972328186, "rewards/verify_chess_move/mean": 0.1265625, "rewards/verify_chess_move/std": 0.9921292781829834, "step": 5045 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 113.565625, "completions/mean_terminated_length": 113.565625, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.004570243281742376, "frac_reward_zero_std": 0.73125, "grad_norm": 0.18753020465373993, "kl": 0.03310952302999794, "learning_rate": 3.6064285714285716e-07, "loss": 0.0, "num_tokens": 372843287.0, "reward": 0.215625, "reward_std": 0.2264196276664734, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9622879385948181, "step": 5050 }, { "completion_length": 356.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 133.9640625, "completions/mean_terminated_length": 133.9640625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.004574768275090636, "frac_reward_zero_std": 0.6375, "grad_norm": 0.3535173833370209, "kl": 0.04059263615636155, "learning_rate": 3.6099999999999996e-07, "loss": 0.0, "num_tokens": 373215465.0, "reward": 0.0953125, "reward_std": 0.3124818682670593, "rewards/verify_chess_move/mean": 0.0953125, "rewards/verify_chess_move/std": 0.9839309811592102, "step": 5055 }, { "completion_length": 338.8, "completions/clipped_ratio": 0.0, "completions/max_length": 338.8, "completions/max_terminated_length": 338.8, "completions/mean_length": 116.37578125, "completions/mean_terminated_length": 116.37578125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.004579293268438895, "frac_reward_zero_std": 0.8, "grad_norm": 0.1796426624059677, "kl": 0.05700292250839993, "learning_rate": 3.613571428571429e-07, "loss": 0.0001, "num_tokens": 373563466.0, "reward": 0.2515625, "reward_std": 0.17690891921520233, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9639847636222839, "step": 5060 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 120.4515625, "completions/mean_terminated_length": 120.4515625, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.004583818261787155, "frac_reward_zero_std": 0.7125, "grad_norm": 0.16340476274490356, "kl": 0.05466707438463345, "learning_rate": 3.617142857142857e-07, "loss": 0.0001, "num_tokens": 373917844.0, "reward": 0.171875, "reward_std": 0.25329803228378295, "rewards/verify_chess_move/mean": 0.171875, "rewards/verify_chess_move/std": 0.9832727670669555, "step": 5065 }, { "completion_length": 341.2, "completions/clipped_ratio": 0.0, "completions/max_length": 341.2, "completions/max_terminated_length": 341.2, "completions/mean_length": 116.44765625, "completions/mean_terminated_length": 116.44765625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.004588343255135415, "frac_reward_zero_std": 0.64375, "grad_norm": 0.15338006615638733, "kl": 0.033714137418428436, "learning_rate": 3.6207142857142855e-07, "loss": 0.0, "num_tokens": 374264593.0, "reward": 0.1953125, "reward_std": 0.2985886722803116, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9654932022094727, "step": 5070 }, { "completion_length": 335.8, "completions/clipped_ratio": 0.0, "completions/max_length": 335.8, "completions/max_terminated_length": 335.8, "completions/mean_length": 123.7484375, "completions/mean_terminated_length": 123.7484375, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.0045928682484836746, "frac_reward_zero_std": 0.6125, "grad_norm": 0.2901405394077301, "kl": 0.0319310990977101, "learning_rate": 3.624285714285714e-07, "loss": 0.0, "num_tokens": 374622839.0, "reward": 0.0953125, "reward_std": 0.32973324656486513, "rewards/verify_chess_move/mean": 0.0953125, "rewards/verify_chess_move/std": 0.9957481861114502, "step": 5075 }, { "completion_length": 331.2, "completions/clipped_ratio": 0.0, "completions/max_length": 331.2, "completions/max_terminated_length": 331.2, "completions/mean_length": 112.38828125, "completions/mean_terminated_length": 112.38828125, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.004597393241831935, "frac_reward_zero_std": 0.725, "grad_norm": 0.19807754456996918, "kl": 0.02914001264725812, "learning_rate": 3.6278571428571427e-07, "loss": 0.0, "num_tokens": 374964448.0, "reward": 0.2703125, "reward_std": 0.23220502138137816, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9594493508338928, "step": 5080 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 119.40078125, "completions/mean_terminated_length": 119.40078125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004601918235180194, "frac_reward_zero_std": 0.61875, "grad_norm": 0.12254414707422256, "kl": 0.031364492629654704, "learning_rate": 3.6314285714285713e-07, "loss": 0.0, "num_tokens": 375314545.0, "reward": 0.1984375, "reward_std": 0.3346248507499695, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.978834593296051, "step": 5085 }, { "completion_length": 307.2, "completions/clipped_ratio": 0.0, "completions/max_length": 307.2, "completions/max_terminated_length": 307.2, "completions/mean_length": 114.965625, "completions/mean_terminated_length": 114.965625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.004606443228528454, "frac_reward_zero_std": 0.69375, "grad_norm": 0.25088298320770264, "kl": 0.03165360139682889, "learning_rate": 3.635e-07, "loss": 0.0, "num_tokens": 375659517.0, "reward": 0.1703125, "reward_std": 0.2745448887348175, "rewards/verify_chess_move/mean": 0.1703125, "rewards/verify_chess_move/std": 0.9812090277671814, "step": 5090 }, { "completion_length": 416.6, "completions/clipped_ratio": 0.0, "completions/max_length": 416.6, "completions/max_terminated_length": 416.6, "completions/mean_length": 130.6625, "completions/mean_terminated_length": 130.6625, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.004610968221876714, "frac_reward_zero_std": 0.775, "grad_norm": 0.08224131166934967, "kl": 0.02398587965290062, "learning_rate": 3.6385714285714285e-07, "loss": 0.0, "num_tokens": 376030093.0, "reward": 0.2296875, "reward_std": 0.19164148569107056, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9677716970443726, "step": 5095 }, { "completion_length": 375.4, "completions/clipped_ratio": 0.0, "completions/max_length": 375.4, "completions/max_terminated_length": 375.4, "completions/mean_length": 111.1875, "completions/mean_terminated_length": 111.1875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.004615493215224973, "frac_reward_zero_std": 0.70625, "grad_norm": 0.09350692480802536, "kl": 0.03071149073075503, "learning_rate": 3.642142857142857e-07, "loss": 0.0, "num_tokens": 376369325.0, "reward": 0.1390625, "reward_std": 0.25387522876262664, "rewards/verify_chess_move/mean": 0.1390625, "rewards/verify_chess_move/std": 0.9660219073295593, "step": 5100 }, { "completion_length": 394.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 130.20859375, "completions/mean_terminated_length": 130.20859375, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.004620018208573233, "frac_reward_zero_std": 0.7, "grad_norm": 0.2152191549539566, "kl": 0.03465105260838754, "learning_rate": 3.645714285714285e-07, "loss": 0.0, "num_tokens": 376737592.0, "reward": 0.175, "reward_std": 0.2601334065198898, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9668094277381897, "step": 5105 }, { "completion_length": 365.2, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 122.1921875, "completions/mean_terminated_length": 122.1921875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0046245432019214935, "frac_reward_zero_std": 0.7875, "grad_norm": 0.20561771094799042, "kl": 0.028259670542320238, "learning_rate": 3.6492857142857144e-07, "loss": 0.0, "num_tokens": 377092558.0, "reward": 0.2015625, "reward_std": 0.18848167657852172, "rewards/verify_chess_move/mean": 0.2015625, "rewards/verify_chess_move/std": 0.971585726737976, "step": 5110 }, { "completion_length": 311.2, "completions/clipped_ratio": 0.0, "completions/max_length": 311.2, "completions/max_terminated_length": 311.2, "completions/mean_length": 110.85625, "completions/mean_terminated_length": 110.85625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.004629068195269753, "frac_reward_zero_std": 0.63125, "grad_norm": 0.41580188274383545, "kl": 0.06219124213093892, "learning_rate": 3.6528571428571425e-07, "loss": 0.0001, "num_tokens": 377431942.0, "reward": 0.1046875, "reward_std": 0.32326328158378603, "rewards/verify_chess_move/mean": 0.1046875, "rewards/verify_chess_move/std": 0.9818721771240234, "step": 5115 }, { "completion_length": 350.2, "completions/clipped_ratio": 0.0, "completions/max_length": 350.2, "completions/max_terminated_length": 350.2, "completions/mean_length": 125.90390625, "completions/mean_terminated_length": 125.90390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.004633593188618013, "frac_reward_zero_std": 0.68125, "grad_norm": 0.2756979167461395, "kl": 0.05270316328969784, "learning_rate": 3.6564285714285716e-07, "loss": 0.0001, "num_tokens": 377795035.0, "reward": 0.078125, "reward_std": 0.29047619700431826, "rewards/verify_chess_move/mean": 0.078125, "rewards/verify_chess_move/std": 0.9859324097633362, "step": 5120 }, { "completion_length": 421.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 421.4, "completions/max_terminated_length": 331.0, "completions/mean_length": 116.8859375, "completions/mean_terminated_length": 116.37747039794922, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.004638118181966273, "frac_reward_zero_std": 0.60625, "grad_norm": 0.22118476033210754, "kl": 0.03412119592831005, "learning_rate": 3.6599999999999997e-07, "loss": 0.0, "num_tokens": 378140601.0, "reward": 0.2671875, "reward_std": 0.33325846791267394, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.9576258659362793, "step": 5125 }, { "completion_length": 344.8, "completions/clipped_ratio": 0.0, "completions/max_length": 344.8, "completions/max_terminated_length": 344.8, "completions/mean_length": 113.09453125, "completions/mean_terminated_length": 113.09453125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.004642643175314532, "frac_reward_zero_std": 0.69375, "grad_norm": 0.3139582574367523, "kl": 0.04019221222843043, "learning_rate": 3.663571428571429e-07, "loss": 0.0, "num_tokens": 378483482.0, "reward": 0.1609375, "reward_std": 0.260026627779007, "rewards/verify_chess_move/mean": 0.1609375, "rewards/verify_chess_move/std": 0.9752798318862915, "step": 5130 }, { "completion_length": 323.6, "completions/clipped_ratio": 0.0, "completions/max_length": 323.6, "completions/max_terminated_length": 323.6, "completions/mean_length": 113.46328125, "completions/mean_terminated_length": 113.46328125, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.004647168168662792, "frac_reward_zero_std": 0.7, "grad_norm": 0.17758296430110931, "kl": 0.036609881225740534, "learning_rate": 3.667142857142857e-07, "loss": 0.0, "num_tokens": 378825299.0, "reward": 0.2546875, "reward_std": 0.2557748585939407, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9634385704994202, "step": 5135 }, { "completion_length": 398.2, "completions/clipped_ratio": 0.0, "completions/max_length": 398.2, "completions/max_terminated_length": 398.2, "completions/mean_length": 119.77265625, "completions/mean_terminated_length": 119.77265625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.004651693162011051, "frac_reward_zero_std": 0.7, "grad_norm": 0.13346832990646362, "kl": 0.030318683426594363, "learning_rate": 3.6707142857142855e-07, "loss": 0.0, "num_tokens": 379175272.0, "reward": 0.2875, "reward_std": 0.2562911778688431, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9511526584625244, "step": 5140 }, { "completion_length": 354.6, "completions/clipped_ratio": 0.0, "completions/max_length": 354.6, "completions/max_terminated_length": 354.6, "completions/mean_length": 117.80625, "completions/mean_terminated_length": 117.80625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0046562181553593115, "frac_reward_zero_std": 0.7875, "grad_norm": 0.1553613841533661, "kl": 0.02913574404665269, "learning_rate": 3.674285714285714e-07, "loss": 0.0, "num_tokens": 379524400.0, "reward": 0.334375, "reward_std": 0.1875390440225601, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9254793167114258, "step": 5145 }, { "completion_length": 478.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 478.0, "completions/max_terminated_length": 386.6, "completions/mean_length": 115.3453125, "completions/mean_terminated_length": 114.82426605224609, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.004660743148707572, "frac_reward_zero_std": 0.71875, "grad_norm": 0.08992978930473328, "kl": 0.022517650152440182, "learning_rate": 3.677857142857143e-07, "loss": 0.0, "num_tokens": 379868394.0, "reward": 0.4484375, "reward_std": 0.24393005073070526, "rewards/verify_chess_move/mean": 0.4484375, "rewards/verify_chess_move/std": 0.8826447248458862, "step": 5150 }, { "completion_length": 319.4, "completions/clipped_ratio": 0.0, "completions/max_length": 319.4, "completions/max_terminated_length": 319.4, "completions/mean_length": 111.6703125, "completions/mean_terminated_length": 111.6703125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.004665268142055831, "frac_reward_zero_std": 0.7, "grad_norm": 0.1713801622390747, "kl": 0.025947796879336238, "learning_rate": 3.6814285714285714e-07, "loss": 0.0, "num_tokens": 380206236.0, "reward": 0.3, "reward_std": 0.2626561373472214, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9491666197776795, "step": 5155 }, { "completion_length": 318.2, "completions/clipped_ratio": 0.0, "completions/max_length": 318.2, "completions/max_terminated_length": 318.2, "completions/mean_length": 118.9546875, "completions/mean_terminated_length": 118.9546875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.004669793135404091, "frac_reward_zero_std": 0.7, "grad_norm": 0.159429132938385, "kl": 0.029767941194586457, "learning_rate": 3.685e-07, "loss": 0.0, "num_tokens": 380557346.0, "reward": 0.253125, "reward_std": 0.263502836227417, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9572718739509583, "step": 5160 }, { "completion_length": 365.2, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 116.69609375, "completions/mean_terminated_length": 116.69609375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.004674318128752351, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2241302877664566, "kl": 0.026591296907281504, "learning_rate": 3.6885714285714286e-07, "loss": 0.0, "num_tokens": 380904629.0, "reward": 0.2390625, "reward_std": 0.2953823655843735, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9694697380065918, "step": 5165 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 120.13828125, "completions/mean_terminated_length": 120.13828125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.00467884312210061, "frac_reward_zero_std": 0.79375, "grad_norm": 0.04830751568078995, "kl": 0.028316354693379252, "learning_rate": 3.692142857142857e-07, "loss": 0.0, "num_tokens": 381260590.0, "reward": 0.2109375, "reward_std": 0.17565029673278332, "rewards/verify_chess_move/mean": 0.2109375, "rewards/verify_chess_move/std": 0.973319947719574, "step": 5170 }, { "completion_length": 325.8, "completions/clipped_ratio": 0.0, "completions/max_length": 325.8, "completions/max_terminated_length": 325.8, "completions/mean_length": 114.26015625, "completions/mean_terminated_length": 114.26015625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.00468336811544887, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1945696622133255, "kl": 0.033948714390862736, "learning_rate": 3.6957142857142853e-07, "loss": 0.0, "num_tokens": 381603163.0, "reward": 0.346875, "reward_std": 0.24845525920391082, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9256885409355163, "step": 5175 }, { "completion_length": 342.6, "completions/clipped_ratio": 0.0, "completions/max_length": 342.6, "completions/max_terminated_length": 342.6, "completions/mean_length": 116.48984375, "completions/mean_terminated_length": 116.48984375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.00468789310879713, "frac_reward_zero_std": 0.7, "grad_norm": 0.12726479768753052, "kl": 0.02643561115837656, "learning_rate": 3.6992857142857144e-07, "loss": 0.0, "num_tokens": 381947678.0, "reward": 0.265625, "reward_std": 0.268122011423111, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9538906335830688, "step": 5180 }, { "completion_length": 345.8, "completions/clipped_ratio": 0.0, "completions/max_length": 345.8, "completions/max_terminated_length": 345.8, "completions/mean_length": 124.92109375, "completions/mean_terminated_length": 124.92109375, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.00469241810214539, "frac_reward_zero_std": 0.68125, "grad_norm": 0.18761643767356873, "kl": 0.028125718451337888, "learning_rate": 3.7028571428571425e-07, "loss": 0.0, "num_tokens": 382308697.0, "reward": 0.3171875, "reward_std": 0.2779643416404724, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9402961015701294, "step": 5185 }, { "completion_length": 364.2, "completions/clipped_ratio": 0.0, "completions/max_length": 364.2, "completions/max_terminated_length": 364.2, "completions/mean_length": 115.14609375, "completions/mean_terminated_length": 115.14609375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.00469694309549365, "frac_reward_zero_std": 0.73125, "grad_norm": 0.36220136284828186, "kl": 0.03364190079155378, "learning_rate": 3.7064285714285717e-07, "loss": 0.0, "num_tokens": 382654436.0, "reward": 0.21875, "reward_std": 0.22936472594738005, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9691309809684754, "step": 5190 }, { "completion_length": 331.8, "completions/clipped_ratio": 0.0, "completions/max_length": 331.8, "completions/max_terminated_length": 331.8, "completions/mean_length": 118.0625, "completions/mean_terminated_length": 118.0625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.004701468088841909, "frac_reward_zero_std": 0.70625, "grad_norm": 0.11366303265094757, "kl": 0.07338130541611462, "learning_rate": 3.71e-07, "loss": 0.0001, "num_tokens": 383003620.0, "reward": 0.19375, "reward_std": 0.25250826627016065, "rewards/verify_chess_move/mean": 0.19375, "rewards/verify_chess_move/std": 0.9719558119773865, "step": 5195 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 125.6171875, "completions/mean_terminated_length": 125.6171875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.004705993082190169, "frac_reward_zero_std": 0.7125, "grad_norm": 0.17759642004966736, "kl": 0.17081750418874436, "learning_rate": 3.7135714285714283e-07, "loss": 0.0002, "num_tokens": 383365690.0, "reward": 0.2953125, "reward_std": 0.24745332598686218, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9310245037078857, "step": 5200 }, { "completion_length": 327.8, "completions/clipped_ratio": 0.0, "completions/max_length": 327.8, "completions/max_terminated_length": 327.8, "completions/mean_length": 113.61171875, "completions/mean_terminated_length": 113.61171875, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.004710518075538429, "frac_reward_zero_std": 0.6625, "grad_norm": 0.35126054286956787, "kl": 0.04413633279036731, "learning_rate": 3.717142857142857e-07, "loss": 0.0, "num_tokens": 383708105.0, "reward": 0.1859375, "reward_std": 0.288441202044487, "rewards/verify_chess_move/mean": 0.1859375, "rewards/verify_chess_move/std": 0.9774745106697083, "step": 5205 }, { "completion_length": 308.6, "completions/clipped_ratio": 0.0, "completions/max_length": 308.6, "completions/max_terminated_length": 308.6, "completions/mean_length": 119.13671875, "completions/mean_terminated_length": 119.13671875, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.004715043068886688, "frac_reward_zero_std": 0.675, "grad_norm": 0.30468007922172546, "kl": 0.0727979720570147, "learning_rate": 3.7207142857142856e-07, "loss": 0.0001, "num_tokens": 384059448.0, "reward": 0.3015625, "reward_std": 0.27529128193855285, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9449558615684509, "step": 5210 }, { "completion_length": 308.2, "completions/clipped_ratio": 0.0, "completions/max_length": 308.2, "completions/max_terminated_length": 308.2, "completions/mean_length": 111.596875, "completions/mean_terminated_length": 111.596875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0047195680622349485, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2297283113002777, "kl": 0.07975355096859857, "learning_rate": 3.724285714285714e-07, "loss": 0.0001, "num_tokens": 384398316.0, "reward": 0.3421875, "reward_std": 0.2712343633174896, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9327500939369202, "step": 5215 }, { "completion_length": 406.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 114.734375, "completions/mean_terminated_length": 114.734375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0047240930555832086, "frac_reward_zero_std": 0.675, "grad_norm": 0.19337564706802368, "kl": 0.09788072486990132, "learning_rate": 3.727857142857143e-07, "loss": 0.0001, "num_tokens": 384742328.0, "reward": 0.2078125, "reward_std": 0.273287832736969, "rewards/verify_chess_move/mean": 0.2078125, "rewards/verify_chess_move/std": 0.9772798895835877, "step": 5220 }, { "completion_length": 395.2, "completions/clipped_ratio": 0.0, "completions/max_length": 395.2, "completions/max_terminated_length": 395.2, "completions/mean_length": 120.2359375, "completions/mean_terminated_length": 120.2359375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004728618048931468, "frac_reward_zero_std": 0.70625, "grad_norm": 0.2316080927848816, "kl": 0.14164348092162982, "learning_rate": 3.7314285714285714e-07, "loss": 0.0001, "num_tokens": 385097190.0, "reward": 0.1875, "reward_std": 0.2505067676305771, "rewards/verify_chess_move/mean": 0.1875, "rewards/verify_chess_move/std": 0.9832631707191467, "step": 5225 }, { "completion_length": 355.8, "completions/clipped_ratio": 0.0, "completions/max_length": 355.8, "completions/max_terminated_length": 355.8, "completions/mean_length": 118.57421875, "completions/mean_terminated_length": 118.57421875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.004733143042279728, "frac_reward_zero_std": 0.65625, "grad_norm": 0.24460335075855255, "kl": 0.062445785576710475, "learning_rate": 3.735e-07, "loss": 0.0001, "num_tokens": 385447661.0, "reward": 0.203125, "reward_std": 0.30464298725128175, "rewards/verify_chess_move/mean": 0.203125, "rewards/verify_chess_move/std": 0.9793812155723571, "step": 5230 }, { "completion_length": 310.6, "completions/clipped_ratio": 0.0, "completions/max_length": 310.6, "completions/max_terminated_length": 310.6, "completions/mean_length": 115.403125, "completions/mean_terminated_length": 115.403125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.004737668035627988, "frac_reward_zero_std": 0.75, "grad_norm": 0.1496516317129135, "kl": 0.0297993132728152, "learning_rate": 3.738571428571428e-07, "loss": 0.0, "num_tokens": 385794385.0, "reward": 0.284375, "reward_std": 0.22157333195209503, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9262513518333435, "step": 5235 }, { "completion_length": 324.2, "completions/clipped_ratio": 0.0, "completions/max_length": 324.2, "completions/max_terminated_length": 324.2, "completions/mean_length": 118.20703125, "completions/mean_terminated_length": 118.20703125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004742193028976247, "frac_reward_zero_std": 0.75, "grad_norm": 0.09230130165815353, "kl": 0.029695481027010827, "learning_rate": 3.742142857142857e-07, "loss": 0.0, "num_tokens": 386144962.0, "reward": 0.2234375, "reward_std": 0.22503672093153, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9733914971351624, "step": 5240 }, { "completion_length": 307.6, "completions/clipped_ratio": 0.0, "completions/max_length": 307.6, "completions/max_terminated_length": 307.6, "completions/mean_length": 112.74375, "completions/mean_terminated_length": 112.74375, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.004746718022324507, "frac_reward_zero_std": 0.6875, "grad_norm": 0.18408459424972534, "kl": 0.030247687420342118, "learning_rate": 3.7457142857142853e-07, "loss": 0.0, "num_tokens": 386486074.0, "reward": 0.1984375, "reward_std": 0.2605137974023819, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.9735508918762207, "step": 5245 }, { "completion_length": 367.6, "completions/clipped_ratio": 0.0, "completions/max_length": 367.6, "completions/max_terminated_length": 367.6, "completions/mean_length": 121.146875, "completions/mean_terminated_length": 121.146875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.004751243015672767, "frac_reward_zero_std": 0.65625, "grad_norm": 0.21385473012924194, "kl": 0.02990500954329036, "learning_rate": 3.7492857142857145e-07, "loss": 0.0, "num_tokens": 386843070.0, "reward": 0.178125, "reward_std": 0.28060742020606994, "rewards/verify_chess_move/mean": 0.178125, "rewards/verify_chess_move/std": 0.9642848491668701, "step": 5250 }, { "completion_length": 305.8, "completions/clipped_ratio": 0.0, "completions/max_length": 305.8, "completions/max_terminated_length": 305.8, "completions/mean_length": 122.0625, "completions/mean_terminated_length": 122.0625, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.004755768009021027, "frac_reward_zero_std": 0.7125, "grad_norm": 0.22701191902160645, "kl": 0.032207366998773065, "learning_rate": 3.7528571428571426e-07, "loss": 0.0, "num_tokens": 387198774.0, "reward": 0.2859375, "reward_std": 0.2551338464021683, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9571017026901245, "step": 5255 }, { "completion_length": 299.6, "completions/clipped_ratio": 0.0, "completions/max_length": 299.6, "completions/max_terminated_length": 299.6, "completions/mean_length": 109.45703125, "completions/mean_terminated_length": 109.45703125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.004760293002369287, "frac_reward_zero_std": 0.66875, "grad_norm": 0.32059553265571594, "kl": 0.030458484980044886, "learning_rate": 3.7564285714285717e-07, "loss": 0.0, "num_tokens": 387534887.0, "reward": 0.184375, "reward_std": 0.28675374388694763, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.9709455013275147, "step": 5260 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 121.73125, "completions/mean_terminated_length": 121.73125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004764817995717546, "frac_reward_zero_std": 0.7, "grad_norm": 0.20164726674556732, "kl": 0.026293742793495768, "learning_rate": 3.76e-07, "loss": 0.0, "num_tokens": 387891503.0, "reward": 0.178125, "reward_std": 0.26586087942123415, "rewards/verify_chess_move/mean": 0.178125, "rewards/verify_chess_move/std": 0.9747266888618469, "step": 5265 }, { "completion_length": 324.6, "completions/clipped_ratio": 0.0, "completions/max_length": 324.6, "completions/max_terminated_length": 324.6, "completions/mean_length": 116.0171875, "completions/mean_terminated_length": 116.0171875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.004769342989065806, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3604830801486969, "kl": 0.02901392780477181, "learning_rate": 3.7635714285714284e-07, "loss": 0.0, "num_tokens": 388236189.0, "reward": 0.178125, "reward_std": 0.2919654667377472, "rewards/verify_chess_move/mean": 0.178125, "rewards/verify_chess_move/std": 0.96893230676651, "step": 5270 }, { "completion_length": 386.4, "completions/clipped_ratio": 0.0, "completions/max_length": 386.4, "completions/max_terminated_length": 386.4, "completions/mean_length": 118.3484375, "completions/mean_terminated_length": 118.3484375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.004773867982414066, "frac_reward_zero_std": 0.69375, "grad_norm": 0.11676835268735886, "kl": 0.026632721349596978, "learning_rate": 3.767142857142857e-07, "loss": 0.0, "num_tokens": 388586539.0, "reward": 0.25, "reward_std": 0.2700196862220764, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9549375891685485, "step": 5275 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 119.87265625, "completions/mean_terminated_length": 119.87265625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.004778392975762325, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1418093591928482, "kl": 0.02457806941238232, "learning_rate": 3.7707142857142856e-07, "loss": 0.0, "num_tokens": 388938760.0, "reward": 0.2890625, "reward_std": 0.2682892560958862, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9489325165748597, "step": 5280 }, { "completion_length": 368.6, "completions/clipped_ratio": 0.0, "completions/max_length": 368.6, "completions/max_terminated_length": 368.6, "completions/mean_length": 117.65078125, "completions/mean_terminated_length": 117.65078125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004782917969110585, "frac_reward_zero_std": 0.7375, "grad_norm": 0.09339983761310577, "kl": 0.02617076426977292, "learning_rate": 3.774285714285714e-07, "loss": 0.0, "num_tokens": 389285721.0, "reward": 0.26875, "reward_std": 0.22110407203435897, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9505125403404235, "step": 5285 }, { "completion_length": 309.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 120.575, "completions/mean_terminated_length": 120.575, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.0047874429624588455, "frac_reward_zero_std": 0.725, "grad_norm": 0.16697411239147186, "kl": 0.030775365734007208, "learning_rate": 3.777857142857143e-07, "loss": 0.0, "num_tokens": 389641033.0, "reward": 0.115625, "reward_std": 0.25239935517311096, "rewards/verify_chess_move/mean": 0.115625, "rewards/verify_chess_move/std": 0.9930174231529236, "step": 5290 }, { "completion_length": 313.8, "completions/clipped_ratio": 0.0, "completions/max_length": 313.8, "completions/max_terminated_length": 313.8, "completions/mean_length": 111.15, "completions/mean_terminated_length": 111.15, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.004791967955807105, "frac_reward_zero_std": 0.7125, "grad_norm": 0.16118212044239044, "kl": 0.02780548920854926, "learning_rate": 3.7814285714285715e-07, "loss": 0.0, "num_tokens": 389980489.0, "reward": 0.2421875, "reward_std": 0.24382426738739013, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9609784007072448, "step": 5295 }, { "completion_length": 443.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 443.6, "completions/max_terminated_length": 363.2, "completions/mean_length": 122.475, "completions/mean_terminated_length": 121.97630004882812, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.004796492949155365, "frac_reward_zero_std": 0.725, "grad_norm": 0.14792346954345703, "kl": 0.023000887082889675, "learning_rate": 3.785e-07, "loss": 0.0, "num_tokens": 390335753.0, "reward": 0.3171875, "reward_std": 0.24334990680217744, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9459123849868775, "step": 5300 }, { "completion_length": 337.8, "completions/clipped_ratio": 0.0, "completions/max_length": 337.8, "completions/max_terminated_length": 337.8, "completions/mean_length": 124.76328125, "completions/mean_terminated_length": 124.76328125, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.004801017942503625, "frac_reward_zero_std": 0.70625, "grad_norm": 0.16741497814655304, "kl": 0.027766130556119605, "learning_rate": 3.788571428571428e-07, "loss": 0.0, "num_tokens": 390695714.0, "reward": 0.259375, "reward_std": 0.2581872880458832, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9531257152557373, "step": 5305 }, { "completion_length": 336.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 115.2484375, "completions/mean_terminated_length": 115.2484375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.004805542935851884, "frac_reward_zero_std": 0.70625, "grad_norm": 0.18039299547672272, "kl": 0.030347889004042373, "learning_rate": 3.7921428571428573e-07, "loss": 0.0, "num_tokens": 391040304.0, "reward": 0.1421875, "reward_std": 0.23957405090332032, "rewards/verify_chess_move/mean": 0.1421875, "rewards/verify_chess_move/std": 0.9775810480117798, "step": 5310 }, { "completion_length": 474.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 474.8, "completions/max_terminated_length": 397.0, "completions/mean_length": 128.5453125, "completions/mean_terminated_length": 128.0446304321289, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.004810067929200144, "frac_reward_zero_std": 0.66875, "grad_norm": 0.16761566698551178, "kl": 0.022608006664086133, "learning_rate": 3.7957142857142854e-07, "loss": 0.0, "num_tokens": 391406938.0, "reward": 0.2296875, "reward_std": 0.2835964739322662, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9594085097312928, "step": 5315 }, { "completion_length": 327.4, "completions/clipped_ratio": 0.0, "completions/max_length": 327.4, "completions/max_terminated_length": 327.4, "completions/mean_length": 115.5328125, "completions/mean_terminated_length": 115.5328125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0048145929225484035, "frac_reward_zero_std": 0.68125, "grad_norm": 0.17314986884593964, "kl": 0.027801228634780274, "learning_rate": 3.7992857142857145e-07, "loss": 0.0, "num_tokens": 391750692.0, "reward": 0.2859375, "reward_std": 0.2722368657588959, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.948217236995697, "step": 5320 }, { "completion_length": 336.6, "completions/clipped_ratio": 0.0, "completions/max_length": 336.6, "completions/max_terminated_length": 336.6, "completions/mean_length": 112.23125, "completions/mean_terminated_length": 112.23125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.0048191179158966636, "frac_reward_zero_std": 0.73125, "grad_norm": 0.19320513308048248, "kl": 0.02544321439345367, "learning_rate": 3.8028571428571426e-07, "loss": 0.0, "num_tokens": 392092316.0, "reward": 0.2453125, "reward_std": 0.235302397608757, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9631398797035218, "step": 5325 }, { "completion_length": 320.4, "completions/clipped_ratio": 0.0, "completions/max_length": 320.4, "completions/max_terminated_length": 320.4, "completions/mean_length": 118.18125, "completions/mean_terminated_length": 118.18125, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.004823642909244924, "frac_reward_zero_std": 0.7125, "grad_norm": 0.16098946332931519, "kl": 0.02612112942151725, "learning_rate": 3.806428571428571e-07, "loss": 0.0, "num_tokens": 392443764.0, "reward": 0.05, "reward_std": 0.2556076169013977, "rewards/verify_chess_move/mean": 0.05, "rewards/verify_chess_move/std": 0.995753014087677, "step": 5330 }, { "completion_length": 355.6, "completions/clipped_ratio": 0.0, "completions/max_length": 355.6, "completions/max_terminated_length": 355.6, "completions/mean_length": 128.1140625, "completions/mean_terminated_length": 128.1140625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.004828167902593183, "frac_reward_zero_std": 0.71875, "grad_norm": 0.16510716080665588, "kl": 0.026581750815967097, "learning_rate": 3.81e-07, "loss": 0.0, "num_tokens": 392810222.0, "reward": 0.1875, "reward_std": 0.2472539871931076, "rewards/verify_chess_move/mean": 0.1875, "rewards/verify_chess_move/std": 0.9643969535827637, "step": 5335 }, { "completion_length": 354.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 118.2890625, "completions/mean_terminated_length": 118.2890625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.004832692895941443, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1552479863166809, "kl": 0.031786034302785995, "learning_rate": 3.8135714285714285e-07, "loss": 0.0, "num_tokens": 393161152.0, "reward": 0.3203125, "reward_std": 0.2491382509469986, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9440613508224487, "step": 5340 }, { "completion_length": 327.8, "completions/clipped_ratio": 0.0, "completions/max_length": 327.8, "completions/max_terminated_length": 327.8, "completions/mean_length": 113.45, "completions/mean_terminated_length": 113.45, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.004837217889289703, "frac_reward_zero_std": 0.69375, "grad_norm": 0.13635091483592987, "kl": 0.03093277771258727, "learning_rate": 3.817142857142857e-07, "loss": 0.0, "num_tokens": 393502328.0, "reward": 0.2765625, "reward_std": 0.26618040204048155, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.959795105457306, "step": 5345 }, { "completion_length": 314.4, "completions/clipped_ratio": 0.0, "completions/max_length": 314.4, "completions/max_terminated_length": 314.4, "completions/mean_length": 115.69375, "completions/mean_terminated_length": 115.69375, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.004841742882637962, "frac_reward_zero_std": 0.6875, "grad_norm": 0.13980889320373535, "kl": 0.03271045224391855, "learning_rate": 3.8207142857142857e-07, "loss": 0.0, "num_tokens": 393847136.0, "reward": 0.278125, "reward_std": 0.2696077644824982, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9561749815940856, "step": 5350 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 120.86953125, "completions/mean_terminated_length": 120.86953125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004846267875986222, "frac_reward_zero_std": 0.60625, "grad_norm": 0.19007931649684906, "kl": 0.043126688187476246, "learning_rate": 3.8242857142857143e-07, "loss": 0.0, "num_tokens": 394199953.0, "reward": 0.2734375, "reward_std": 0.35276980996131896, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9548792243003845, "step": 5355 }, { "completion_length": 314.4, "completions/clipped_ratio": 0.0, "completions/max_length": 314.4, "completions/max_terminated_length": 314.4, "completions/mean_length": 116.38984375, "completions/mean_terminated_length": 116.38984375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0048507928693344825, "frac_reward_zero_std": 0.68125, "grad_norm": 0.19883790612220764, "kl": 0.0644281099725049, "learning_rate": 3.827857142857143e-07, "loss": 0.0001, "num_tokens": 394548028.0, "reward": 0.2421875, "reward_std": 0.26929176449775694, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9589576363563538, "step": 5360 }, { "completion_length": 338.8, "completions/clipped_ratio": 0.0, "completions/max_length": 338.8, "completions/max_terminated_length": 338.8, "completions/mean_length": 116.3328125, "completions/mean_terminated_length": 116.3328125, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.004855317862682742, "frac_reward_zero_std": 0.725, "grad_norm": 0.33212777972221375, "kl": 0.05592283469159156, "learning_rate": 3.831428571428571e-07, "loss": 0.0001, "num_tokens": 394893534.0, "reward": 0.2671875, "reward_std": 0.2381456345319748, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.9456420183181763, "step": 5365 }, { "completion_length": 351.4, "completions/clipped_ratio": 0.0, "completions/max_length": 351.4, "completions/max_terminated_length": 351.4, "completions/mean_length": 119.44921875, "completions/mean_terminated_length": 119.44921875, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.004859842856031002, "frac_reward_zero_std": 0.76875, "grad_norm": 0.1867237091064453, "kl": 0.06932451129541732, "learning_rate": 3.835e-07, "loss": 0.0001, "num_tokens": 395245237.0, "reward": 0.2671875, "reward_std": 0.20468600392341613, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.9562150239944458, "step": 5370 }, { "completion_length": 302.2, "completions/clipped_ratio": 0.0, "completions/max_length": 302.2, "completions/max_terminated_length": 302.2, "completions/mean_length": 108.61171875, "completions/mean_terminated_length": 108.61171875, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.004864367849379261, "frac_reward_zero_std": 0.6875, "grad_norm": 0.25921833515167236, "kl": 0.08832105058827437, "learning_rate": 3.838571428571428e-07, "loss": 0.0001, "num_tokens": 395579004.0, "reward": 0.3, "reward_std": 0.2737096220254898, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9413762331008911, "step": 5375 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 119.48671875, "completions/mean_terminated_length": 119.48671875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.004868892842727521, "frac_reward_zero_std": 0.725, "grad_norm": 0.15683984756469727, "kl": 0.03192242771619931, "learning_rate": 3.8421428571428574e-07, "loss": 0.0, "num_tokens": 395931747.0, "reward": 0.2796875, "reward_std": 0.22994388043880462, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9498264789581299, "step": 5380 }, { "completion_length": 337.8, "completions/clipped_ratio": 0.0, "completions/max_length": 337.8, "completions/max_terminated_length": 337.8, "completions/mean_length": 120.6484375, "completions/mean_terminated_length": 120.6484375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.004873417836075781, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0857415720820427, "kl": 0.03148364176158793, "learning_rate": 3.8457142857142854e-07, "loss": 0.0, "num_tokens": 396285273.0, "reward": 0.2359375, "reward_std": 0.1842294991016388, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9658228516578674, "step": 5385 }, { "completion_length": 355.4, "completions/clipped_ratio": 0.0, "completions/max_length": 355.4, "completions/max_terminated_length": 355.4, "completions/mean_length": 125.39609375, "completions/mean_terminated_length": 125.39609375, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.00487794282942404, "frac_reward_zero_std": 0.66875, "grad_norm": 0.1358407884836197, "kl": 0.03036493586259894, "learning_rate": 3.8492857142857146e-07, "loss": 0.0, "num_tokens": 396645324.0, "reward": 0.26875, "reward_std": 0.286755707859993, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.955088198184967, "step": 5390 }, { "completion_length": 358.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 115.365625, "completions/mean_terminated_length": 115.365625, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "epoch": 0.0048824678227723005, "frac_reward_zero_std": 0.725, "grad_norm": 0.1735745370388031, "kl": 0.024881377606652676, "learning_rate": 3.8528571428571427e-07, "loss": 0.0, "num_tokens": 396987936.0, "reward": 0.2671875, "reward_std": 0.23788404166698457, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.950343382358551, "step": 5395 }, { "completion_length": 310.8, "completions/clipped_ratio": 0.0, "completions/max_length": 310.8, "completions/max_terminated_length": 310.8, "completions/mean_length": 124.03203125, "completions/mean_terminated_length": 124.03203125, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.004886992816120561, "frac_reward_zero_std": 0.71875, "grad_norm": 0.19294250011444092, "kl": 0.0308192229247652, "learning_rate": 3.8564285714285713e-07, "loss": 0.0, "num_tokens": 397346105.0, "reward": 0.178125, "reward_std": 0.25681778192520144, "rewards/verify_chess_move/mean": 0.178125, "rewards/verify_chess_move/std": 0.9746527552604676, "step": 5400 }, { "completion_length": 369.6, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/max_terminated_length": 369.6, "completions/mean_length": 124.75546875, "completions/mean_terminated_length": 124.75546875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.00489151780946882, "frac_reward_zero_std": 0.7, "grad_norm": 0.13219328224658966, "kl": 0.025071002746699378, "learning_rate": 3.86e-07, "loss": 0.0, "num_tokens": 397704376.0, "reward": 0.2796875, "reward_std": 0.26334109008312223, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9485198020935058, "step": 5405 }, { "completion_length": 351.2, "completions/clipped_ratio": 0.0, "completions/max_length": 351.2, "completions/max_terminated_length": 351.2, "completions/mean_length": 119.90234375, "completions/mean_terminated_length": 119.90234375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.00489604280281708, "frac_reward_zero_std": 0.7, "grad_norm": 0.3285316526889801, "kl": 0.02689366050180979, "learning_rate": 3.8635714285714285e-07, "loss": 0.0, "num_tokens": 398056395.0, "reward": 0.221875, "reward_std": 0.251720467209816, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9635333895683289, "step": 5410 }, { "completion_length": 335.4, "completions/clipped_ratio": 0.0, "completions/max_length": 335.4, "completions/max_terminated_length": 335.4, "completions/mean_length": 114.0796875, "completions/mean_terminated_length": 114.0796875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.00490056779616534, "frac_reward_zero_std": 0.7375, "grad_norm": 0.08633870631456375, "kl": 0.02996192780556157, "learning_rate": 3.867142857142857e-07, "loss": 0.0, "num_tokens": 398397985.0, "reward": 0.425, "reward_std": 0.22909267544746398, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.8915756940841675, "step": 5415 }, { "completion_length": 370.4, "completions/clipped_ratio": 0.0, "completions/max_length": 370.4, "completions/max_terminated_length": 370.4, "completions/mean_length": 115.26796875, "completions/mean_terminated_length": 115.26796875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.004905092789513599, "frac_reward_zero_std": 0.75, "grad_norm": 0.23377814888954163, "kl": 0.02987289713928476, "learning_rate": 3.8707142857142857e-07, "loss": 0.0, "num_tokens": 398741816.0, "reward": 0.4046875, "reward_std": 0.20658720433712005, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9098605990409852, "step": 5420 }, { "completion_length": 328.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 124.66953125, "completions/mean_terminated_length": 124.66953125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.004909617782861859, "frac_reward_zero_std": 0.74375, "grad_norm": 0.27858880162239075, "kl": 0.03431749726296403, "learning_rate": 3.8742857142857143e-07, "loss": 0.0, "num_tokens": 399101009.0, "reward": 0.2, "reward_std": 0.23282557129859924, "rewards/verify_chess_move/mean": 0.2, "rewards/verify_chess_move/std": 0.9772661447525024, "step": 5425 }, { "completion_length": 324.8, "completions/clipped_ratio": 0.0, "completions/max_length": 324.8, "completions/max_terminated_length": 324.8, "completions/mean_length": 126.2375, "completions/mean_terminated_length": 126.2375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0049141427762101186, "frac_reward_zero_std": 0.71875, "grad_norm": 0.176677867770195, "kl": 0.06469816530006937, "learning_rate": 3.877857142857143e-07, "loss": 0.0001, "num_tokens": 399462777.0, "reward": 0.2609375, "reward_std": 0.25114170610904696, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9458821535110473, "step": 5430 }, { "completion_length": 350.6, "completions/clipped_ratio": 0.0, "completions/max_length": 350.6, "completions/max_terminated_length": 350.6, "completions/mean_length": 118.58984375, "completions/mean_terminated_length": 118.58984375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004918667769558379, "frac_reward_zero_std": 0.70625, "grad_norm": 0.13150237500667572, "kl": 0.03144784336909652, "learning_rate": 3.881428571428571e-07, "loss": 0.0, "num_tokens": 399810772.0, "reward": 0.2609375, "reward_std": 0.24777384102344513, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9553216218948364, "step": 5435 }, { "completion_length": 360.8, "completions/clipped_ratio": 0.0, "completions/max_length": 360.8, "completions/max_terminated_length": 360.8, "completions/mean_length": 122.3390625, "completions/mean_terminated_length": 122.3390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.004923192762906639, "frac_reward_zero_std": 0.76875, "grad_norm": 0.33058181405067444, "kl": 0.03164730229182169, "learning_rate": 3.885e-07, "loss": 0.0, "num_tokens": 400165990.0, "reward": 0.246875, "reward_std": 0.2016914665699005, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9589142203330994, "step": 5440 }, { "completion_length": 427.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 427.8, "completions/max_terminated_length": 396.0, "completions/mean_length": 119.63203125, "completions/mean_terminated_length": 119.12748260498047, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.004927717756254898, "frac_reward_zero_std": 0.69375, "grad_norm": 0.2734813988208771, "kl": 0.02753305652004201, "learning_rate": 3.888571428571428e-07, "loss": 0.0, "num_tokens": 400517255.0, "reward": 0.275, "reward_std": 0.2615602433681488, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9520767807960511, "step": 5445 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/max_terminated_length": 366.2, "completions/mean_length": 122.796875, "completions/mean_terminated_length": 122.796875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.004932242749603158, "frac_reward_zero_std": 0.70625, "grad_norm": 0.08702127635478973, "kl": 0.03430520973051898, "learning_rate": 3.8921428571428574e-07, "loss": 0.0, "num_tokens": 400876299.0, "reward": 0.2046875, "reward_std": 0.2547723472118378, "rewards/verify_chess_move/mean": 0.2046875, "rewards/verify_chess_move/std": 0.9590554594993591, "step": 5450 }, { "completion_length": 306.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 119.671875, "completions/mean_terminated_length": 119.671875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.004936767742951418, "frac_reward_zero_std": 0.69375, "grad_norm": 0.21060705184936523, "kl": 0.03853051676996984, "learning_rate": 3.8957142857142855e-07, "loss": 0.0, "num_tokens": 401228023.0, "reward": 0.084375, "reward_std": 0.26650881469249726, "rewards/verify_chess_move/mean": 0.084375, "rewards/verify_chess_move/std": 0.9950085639953613, "step": 5455 }, { "completion_length": 404.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 404.0, "completions/max_terminated_length": 308.8, "completions/mean_length": 121.1859375, "completions/mean_terminated_length": 120.6673568725586, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.004941292736299677, "frac_reward_zero_std": 0.65625, "grad_norm": 0.180568665266037, "kl": 0.03522726218507159, "learning_rate": 3.899285714285714e-07, "loss": 0.0, "num_tokens": 401582357.0, "reward": 0.15, "reward_std": 0.3028991729021072, "rewards/verify_chess_move/mean": 0.15, "rewards/verify_chess_move/std": 0.9699558615684509, "step": 5460 }, { "completion_length": 329.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 123.0515625, "completions/mean_terminated_length": 123.0515625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.0049458177296479375, "frac_reward_zero_std": 0.78125, "grad_norm": 0.11168897151947021, "kl": 0.03185519895632751, "learning_rate": 3.9028571428571427e-07, "loss": 0.0, "num_tokens": 401939183.0, "reward": 0.39375, "reward_std": 0.19195944368839263, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.9054647088050842, "step": 5465 }, { "completion_length": 379.8, "completions/clipped_ratio": 0.0, "completions/max_length": 379.8, "completions/max_terminated_length": 379.8, "completions/mean_length": 122.8703125, "completions/mean_terminated_length": 122.8703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.004950342722996198, "frac_reward_zero_std": 0.7375, "grad_norm": 0.14418014883995056, "kl": 0.029593749984633176, "learning_rate": 3.9064285714285713e-07, "loss": 0.0, "num_tokens": 402294793.0, "reward": 0.353125, "reward_std": 0.22199922800064087, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9134045004844665, "step": 5470 }, { "completion_length": 448.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 448.2, "completions/max_terminated_length": 383.4, "completions/mean_length": 124.821875, "completions/mean_terminated_length": 124.30731811523438, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.004954867716344457, "frac_reward_zero_std": 0.675, "grad_norm": 0.23819634318351746, "kl": 0.03781101090135053, "learning_rate": 3.91e-07, "loss": 0.0, "num_tokens": 402652669.0, "reward": 0.2984375, "reward_std": 0.28007317781448365, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9438048124313354, "step": 5475 }, { "completion_length": 322.2, "completions/clipped_ratio": 0.0, "completions/max_length": 322.2, "completions/max_terminated_length": 322.2, "completions/mean_length": 125.940625, "completions/mean_terminated_length": 125.940625, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.004959392709692717, "frac_reward_zero_std": 0.74375, "grad_norm": 0.1221386268734932, "kl": 0.0694065876188688, "learning_rate": 3.9135714285714286e-07, "loss": 0.0001, "num_tokens": 403013177.0, "reward": 0.309375, "reward_std": 0.22535624504089355, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9454953789710998, "step": 5480 }, { "completion_length": 333.4, "completions/clipped_ratio": 0.0, "completions/max_length": 333.4, "completions/max_terminated_length": 333.4, "completions/mean_length": 132.05234375, "completions/mean_terminated_length": 132.05234375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.004963917703040976, "frac_reward_zero_std": 0.675, "grad_norm": 0.2630668580532074, "kl": 0.053284262627130374, "learning_rate": 3.917142857142857e-07, "loss": 0.0001, "num_tokens": 403382820.0, "reward": 0.265625, "reward_std": 0.2835365891456604, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9521185040473938, "step": 5485 }, { "completion_length": 332.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 117.503125, "completions/mean_terminated_length": 117.503125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.004968442696389236, "frac_reward_zero_std": 0.74375, "grad_norm": 0.21503491699695587, "kl": 0.04586763355182484, "learning_rate": 3.920714285714286e-07, "loss": 0.0, "num_tokens": 403731888.0, "reward": 0.209375, "reward_std": 0.21937110424041747, "rewards/verify_chess_move/mean": 0.209375, "rewards/verify_chess_move/std": 0.9435391187667846, "step": 5490 }, { "completion_length": 350.2, "completions/clipped_ratio": 0.0, "completions/max_length": 350.2, "completions/max_terminated_length": 350.2, "completions/mean_length": 124.22421875, "completions/mean_terminated_length": 124.22421875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.004972967689737496, "frac_reward_zero_std": 0.69375, "grad_norm": 0.27728286385536194, "kl": 0.042217688908567655, "learning_rate": 3.924285714285714e-07, "loss": 0.0, "num_tokens": 404089167.0, "reward": 0.2984375, "reward_std": 0.2611369013786316, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9541737794876098, "step": 5495 }, { "completion_length": 338.2, "completions/clipped_ratio": 0.0, "completions/max_length": 338.2, "completions/max_terminated_length": 338.2, "completions/mean_length": 120.8125, "completions/mean_terminated_length": 120.8125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.0049774926830857555, "frac_reward_zero_std": 0.70625, "grad_norm": 0.28093039989471436, "kl": 0.05955203539924696, "learning_rate": 3.927857142857143e-07, "loss": 0.0001, "num_tokens": 404443207.0, "reward": 0.175, "reward_std": 0.2563969761133194, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9798243403434753, "step": 5500 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 123.38984375, "completions/mean_terminated_length": 123.38984375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.004982017676434016, "frac_reward_zero_std": 0.7125, "grad_norm": 0.1641901433467865, "kl": 0.041470447852043436, "learning_rate": 3.931428571428571e-07, "loss": 0.0, "num_tokens": 404801194.0, "reward": 0.2734375, "reward_std": 0.2501407593488693, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9280389547348022, "step": 5505 }, { "completion_length": 402.8, "completions/clipped_ratio": 0.0, "completions/max_length": 402.8, "completions/max_terminated_length": 402.8, "completions/mean_length": 118.5734375, "completions/mean_terminated_length": 118.5734375, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.004986542669782276, "frac_reward_zero_std": 0.7, "grad_norm": 0.1670205146074295, "kl": 0.0235928803042043, "learning_rate": 3.935e-07, "loss": 0.0, "num_tokens": 405148640.0, "reward": 0.178125, "reward_std": 0.2489885151386261, "rewards/verify_chess_move/mean": 0.178125, "rewards/verify_chess_move/std": 0.9695302367210388, "step": 5510 }, { "completion_length": 392.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 119.53828125, "completions/mean_terminated_length": 119.53828125, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.004991067663130535, "frac_reward_zero_std": 0.75, "grad_norm": 0.18732377886772156, "kl": 0.035100203880574554, "learning_rate": 3.9385714285714283e-07, "loss": 0.0, "num_tokens": 405499425.0, "reward": 0.3109375, "reward_std": 0.21341904997825623, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9502930045127869, "step": 5515 }, { "completion_length": 388.8, "completions/clipped_ratio": 0.0, "completions/max_length": 388.8, "completions/max_terminated_length": 388.8, "completions/mean_length": 115.725, "completions/mean_terminated_length": 115.725, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.004995592656478795, "frac_reward_zero_std": 0.825, "grad_norm": 0.11453662812709808, "kl": 0.023701717611402273, "learning_rate": 3.9421428571428575e-07, "loss": 0.0, "num_tokens": 405844609.0, "reward": 0.3828125, "reward_std": 0.1521397650241852, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.906894862651825, "step": 5520 }, { "completion_length": 336.4, "completions/clipped_ratio": 0.0, "completions/max_length": 336.4, "completions/max_terminated_length": 336.4, "completions/mean_length": 130.1984375, "completions/mean_terminated_length": 130.1984375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.005000117649827055, "frac_reward_zero_std": 0.71875, "grad_norm": 0.17056606709957123, "kl": 0.044694295269437134, "learning_rate": 3.9457142857142855e-07, "loss": 0.0, "num_tokens": 406213183.0, "reward": 0.2234375, "reward_std": 0.24277526438236235, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9677288889884949, "step": 5525 }, { "completion_length": 333.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 135.3484375, "completions/mean_terminated_length": 135.3484375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005004642643175314, "frac_reward_zero_std": 0.81875, "grad_norm": 0.41988083720207214, "kl": 0.03340821152669378, "learning_rate": 3.949285714285714e-07, "loss": 0.0, "num_tokens": 406590973.0, "reward": 0.215625, "reward_std": 0.16070556417107582, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9674758911132812, "step": 5530 }, { "completion_length": 491.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 491.8, "completions/max_terminated_length": 454.6, "completions/mean_length": 113.08515625, "completions/mean_terminated_length": 112.56505584716797, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.005009167636523574, "frac_reward_zero_std": 0.73125, "grad_norm": 0.0734872967004776, "kl": 0.027723643399076535, "learning_rate": 3.952857142857143e-07, "loss": 0.0, "num_tokens": 406930434.0, "reward": 0.3421875, "reward_std": 0.23073169142007827, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9343894004821778, "step": 5535 }, { "completion_length": 331.4, "completions/clipped_ratio": 0.0, "completions/max_length": 331.4, "completions/max_terminated_length": 331.4, "completions/mean_length": 118.95390625, "completions/mean_terminated_length": 118.95390625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005013692629871834, "frac_reward_zero_std": 0.7, "grad_norm": 0.20348666608333588, "kl": 0.05275744081591256, "learning_rate": 3.9564285714285714e-07, "loss": 0.0001, "num_tokens": 407280887.0, "reward": 0.2390625, "reward_std": 0.2630329966545105, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9552846193313599, "step": 5540 }, { "completion_length": 357.6, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 117.903125, "completions/mean_terminated_length": 117.903125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.005018217623220094, "frac_reward_zero_std": 0.7, "grad_norm": 0.109247125685215, "kl": 0.053922815568512306, "learning_rate": 3.96e-07, "loss": 0.0001, "num_tokens": 407629755.0, "reward": 0.2953125, "reward_std": 0.2591907799243927, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9521932244300843, "step": 5545 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 120.65546875, "completions/mean_terminated_length": 120.65546875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.005022742616568354, "frac_reward_zero_std": 0.75625, "grad_norm": 0.1761004626750946, "kl": 0.15244417879730462, "learning_rate": 3.9635714285714286e-07, "loss": 0.0002, "num_tokens": 407982170.0, "reward": 0.221875, "reward_std": 0.19917423278093338, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9642848134040832, "step": 5550 }, { "completion_length": 403.6, "completions/clipped_ratio": 0.0, "completions/max_length": 403.6, "completions/max_terminated_length": 403.6, "completions/mean_length": 115.36171875, "completions/mean_terminated_length": 115.36171875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.005027267609916613, "frac_reward_zero_std": 0.71875, "grad_norm": 0.36875906586647034, "kl": 0.2622437178622931, "learning_rate": 3.967142857142857e-07, "loss": 0.0003, "num_tokens": 408328001.0, "reward": 0.1625, "reward_std": 0.24051314890384673, "rewards/verify_chess_move/mean": 0.1625, "rewards/verify_chess_move/std": 0.9872451424598694, "step": 5555 }, { "completion_length": 348.8, "completions/clipped_ratio": 0.0, "completions/max_length": 348.8, "completions/max_terminated_length": 348.8, "completions/mean_length": 118.55703125, "completions/mean_terminated_length": 118.55703125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.005031792603264873, "frac_reward_zero_std": 0.74375, "grad_norm": 0.16454653441905975, "kl": 0.2356713611749001, "learning_rate": 3.970714285714286e-07, "loss": 0.0002, "num_tokens": 408678650.0, "reward": 0.153125, "reward_std": 0.22530975937843323, "rewards/verify_chess_move/mean": 0.153125, "rewards/verify_chess_move/std": 0.9734599113464355, "step": 5560 }, { "completion_length": 308.2, "completions/clipped_ratio": 0.0, "completions/max_length": 308.2, "completions/max_terminated_length": 308.2, "completions/mean_length": 112.11171875, "completions/mean_terminated_length": 112.11171875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.005036317596613133, "frac_reward_zero_std": 0.75625, "grad_norm": 0.28548768162727356, "kl": 0.6207250385312364, "learning_rate": 3.974285714285714e-07, "loss": 0.0006, "num_tokens": 409020569.0, "reward": 0.2390625, "reward_std": 0.20784581899642945, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9593113541603089, "step": 5565 }, { "completion_length": 430.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 124.021875, "completions/mean_terminated_length": 124.021875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0050408425899613925, "frac_reward_zero_std": 0.7375, "grad_norm": 0.34464266896247864, "kl": 0.14748012393247337, "learning_rate": 3.977857142857143e-07, "loss": 0.0001, "num_tokens": 409379245.0, "reward": 0.125, "reward_std": 0.234512060880661, "rewards/verify_chess_move/mean": 0.125, "rewards/verify_chess_move/std": 0.9918551802635193, "step": 5570 }, { "completion_length": 350.4, "completions/clipped_ratio": 0.0, "completions/max_length": 350.4, "completions/max_terminated_length": 350.4, "completions/mean_length": 118.49375, "completions/mean_terminated_length": 118.49375, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.005045367583309653, "frac_reward_zero_std": 0.7625, "grad_norm": 0.3825739622116089, "kl": 0.06318774434039369, "learning_rate": 3.981428571428571e-07, "loss": 0.0001, "num_tokens": 409729029.0, "reward": 0.1109375, "reward_std": 0.2104723870754242, "rewards/verify_chess_move/mean": 0.1109375, "rewards/verify_chess_move/std": 0.9866410970687867, "step": 5575 }, { "completion_length": 345.8, "completions/clipped_ratio": 0.0, "completions/max_length": 345.8, "completions/max_terminated_length": 345.8, "completions/mean_length": 119.8296875, "completions/mean_terminated_length": 119.8296875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005049892576657913, "frac_reward_zero_std": 0.65, "grad_norm": 0.23371043801307678, "kl": 0.05792719548335299, "learning_rate": 3.9850000000000003e-07, "loss": 0.0001, "num_tokens": 410077659.0, "reward": 0.365625, "reward_std": 0.29407431483268737, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9302193760871887, "step": 5580 }, { "completion_length": 312.4, "completions/clipped_ratio": 0.0, "completions/max_length": 312.4, "completions/max_terminated_length": 312.4, "completions/mean_length": 116.19765625, "completions/mean_terminated_length": 116.19765625, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.005054417570006172, "frac_reward_zero_std": 0.74375, "grad_norm": 0.16122226417064667, "kl": 0.047684451175155115, "learning_rate": 3.9885714285714284e-07, "loss": 0.0, "num_tokens": 410424864.0, "reward": 0.1859375, "reward_std": 0.23393289744853973, "rewards/verify_chess_move/mean": 0.1859375, "rewards/verify_chess_move/std": 0.9805099248886109, "step": 5585 }, { "completion_length": 379.4, "completions/clipped_ratio": 0.0, "completions/max_length": 379.4, "completions/max_terminated_length": 379.4, "completions/mean_length": 123.4109375, "completions/mean_terminated_length": 123.4109375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.005058942563354432, "frac_reward_zero_std": 0.7, "grad_norm": 0.2564846873283386, "kl": 0.06562476714025252, "learning_rate": 3.9921428571428564e-07, "loss": 0.0001, "num_tokens": 410783406.0, "reward": 0.140625, "reward_std": 0.2582481563091278, "rewards/verify_chess_move/mean": 0.140625, "rewards/verify_chess_move/std": 0.9856392741203308, "step": 5590 }, { "completion_length": 325.8, "completions/clipped_ratio": 0.0, "completions/max_length": 325.8, "completions/max_terminated_length": 325.8, "completions/mean_length": 120.5953125, "completions/mean_terminated_length": 120.5953125, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.005063467556702692, "frac_reward_zero_std": 0.68125, "grad_norm": 0.14505484700202942, "kl": 0.10165955026168376, "learning_rate": 3.9957142857142856e-07, "loss": 0.0001, "num_tokens": 411135656.0, "reward": 0.2640625, "reward_std": 0.26655784249305725, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9418067216873169, "step": 5595 }, { "completion_length": 322.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 120.8296875, "completions/mean_terminated_length": 120.8296875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005067992550050951, "frac_reward_zero_std": 0.7, "grad_norm": 0.19128964841365814, "kl": 0.08176624607294798, "learning_rate": 3.9992857142857137e-07, "loss": 0.0001, "num_tokens": 411490606.0, "reward": 0.190625, "reward_std": 0.25718829929828646, "rewards/verify_chess_move/mean": 0.190625, "rewards/verify_chess_move/std": 0.9793577790260315, "step": 5600 }, { "completion_length": 333.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 129.20703125, "completions/mean_terminated_length": 129.20703125, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.005072517543399211, "frac_reward_zero_std": 0.69375, "grad_norm": 0.2619400918483734, "kl": 0.07962114625843242, "learning_rate": 4.002857142857143e-07, "loss": 0.0001, "num_tokens": 411855175.0, "reward": 0.2703125, "reward_std": 0.2693347245454788, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.955954396724701, "step": 5605 }, { "completion_length": 458.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 458.6, "completions/max_terminated_length": 362.2, "completions/mean_length": 114.87578125, "completions/mean_terminated_length": 114.36638031005859, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005077042536747471, "frac_reward_zero_std": 0.70625, "grad_norm": 0.2033275067806244, "kl": 0.03877245663315989, "learning_rate": 4.006428571428571e-07, "loss": 0.0, "num_tokens": 412196640.0, "reward": 0.1875, "reward_std": 0.25093111097812654, "rewards/verify_chess_move/mean": 0.1875, "rewards/verify_chess_move/std": 0.9774510145187378, "step": 5610 }, { "completion_length": 325.8, "completions/clipped_ratio": 0.0, "completions/max_length": 325.8, "completions/max_terminated_length": 325.8, "completions/mean_length": 123.19921875, "completions/mean_terminated_length": 123.19921875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.005081567530095731, "frac_reward_zero_std": 0.69375, "grad_norm": 0.16221855580806732, "kl": 0.05444416821701452, "learning_rate": 4.01e-07, "loss": 0.0001, "num_tokens": 412553791.0, "reward": 0.184375, "reward_std": 0.24999103248119353, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.9755961179733277, "step": 5615 }, { "completion_length": 344.6, "completions/clipped_ratio": 0.0, "completions/max_length": 344.6, "completions/max_terminated_length": 344.6, "completions/mean_length": 128.721875, "completions/mean_terminated_length": 128.721875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005086092523443991, "frac_reward_zero_std": 0.66875, "grad_norm": 0.2529352009296417, "kl": 0.06550867677433417, "learning_rate": 4.013571428571428e-07, "loss": 0.0001, "num_tokens": 412919763.0, "reward": 0.2328125, "reward_std": 0.2863323539495468, "rewards/verify_chess_move/mean": 0.2328125, "rewards/verify_chess_move/std": 0.9607446074485779, "step": 5620 }, { "completion_length": 388.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 126.9828125, "completions/mean_terminated_length": 126.9828125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.00509061751679225, "frac_reward_zero_std": 0.71875, "grad_norm": 0.13480578362941742, "kl": 0.06657156677683815, "learning_rate": 4.017142857142857e-07, "loss": 0.0001, "num_tokens": 413283325.0, "reward": 0.24375, "reward_std": 0.23499882817268372, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9588164687156677, "step": 5625 }, { "completion_length": 439.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 439.6, "completions/max_terminated_length": 367.6, "completions/mean_length": 122.05546875, "completions/mean_terminated_length": 121.54538116455078, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.00509514251014051, "frac_reward_zero_std": 0.71875, "grad_norm": 0.12731613218784332, "kl": 0.05867455046973191, "learning_rate": 4.0207142857142854e-07, "loss": 0.0001, "num_tokens": 413639556.0, "reward": 0.2875, "reward_std": 0.24951314628124238, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9560391068458557, "step": 5630 }, { "completion_length": 345.8, "completions/clipped_ratio": 0.0, "completions/max_length": 345.8, "completions/max_terminated_length": 345.8, "completions/mean_length": 117.790625, "completions/mean_terminated_length": 117.790625, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.00509966750348877, "frac_reward_zero_std": 0.74375, "grad_norm": 0.20672093331813812, "kl": 0.07528008128283545, "learning_rate": 4.024285714285714e-07, "loss": 0.0001, "num_tokens": 413989504.0, "reward": 0.2, "reward_std": 0.2225758448243141, "rewards/verify_chess_move/mean": 0.2, "rewards/verify_chess_move/std": 0.9789739847183228, "step": 5635 }, { "completion_length": 317.2, "completions/clipped_ratio": 0.0, "completions/max_length": 317.2, "completions/max_terminated_length": 317.2, "completions/mean_length": 121.0640625, "completions/mean_terminated_length": 121.0640625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.0051041924968370294, "frac_reward_zero_std": 0.7125, "grad_norm": 0.26990601420402527, "kl": 0.09484423111425713, "learning_rate": 4.027857142857143e-07, "loss": 0.0001, "num_tokens": 414344858.0, "reward": 0.28125, "reward_std": 0.240314382314682, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9591798901557922, "step": 5640 }, { "completion_length": 425.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 425.4, "completions/max_terminated_length": 337.0, "completions/mean_length": 125.56171875, "completions/mean_terminated_length": 125.05253448486329, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0051087174901852895, "frac_reward_zero_std": 0.71875, "grad_norm": 0.18717272579669952, "kl": 0.04704623870784417, "learning_rate": 4.031428571428571e-07, "loss": 0.0, "num_tokens": 414706025.0, "reward": 0.2875, "reward_std": 0.23188902735710143, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9528264164924621, "step": 5645 }, { "completion_length": 319.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 121.55390625, "completions/mean_terminated_length": 121.55390625, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.00511324248353355, "frac_reward_zero_std": 0.75625, "grad_norm": 0.21138149499893188, "kl": 0.05790773534681648, "learning_rate": 4.0350000000000003e-07, "loss": 0.0001, "num_tokens": 415062166.0, "reward": 0.171875, "reward_std": 0.2098483055830002, "rewards/verify_chess_move/mean": 0.171875, "rewards/verify_chess_move/std": 0.9715186476707458, "step": 5650 }, { "completion_length": 335.8, "completions/clipped_ratio": 0.0, "completions/max_length": 335.8, "completions/max_terminated_length": 335.8, "completions/mean_length": 116.8453125, "completions/mean_terminated_length": 116.8453125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.005117767476881809, "frac_reward_zero_std": 0.73125, "grad_norm": 0.2457953840494156, "kl": 0.06449222308001482, "learning_rate": 4.0385714285714284e-07, "loss": 0.0001, "num_tokens": 415410072.0, "reward": 0.1578125, "reward_std": 0.2280462220311165, "rewards/verify_chess_move/mean": 0.1578125, "rewards/verify_chess_move/std": 0.9710365653038024, "step": 5655 }, { "completion_length": 322.2, "completions/clipped_ratio": 0.0, "completions/max_length": 322.2, "completions/max_terminated_length": 322.2, "completions/mean_length": 113.9515625, "completions/mean_terminated_length": 113.9515625, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.005122292470230069, "frac_reward_zero_std": 0.70625, "grad_norm": 0.24189935624599457, "kl": 0.050888565054629, "learning_rate": 4.0421428571428565e-07, "loss": 0.0001, "num_tokens": 415754306.0, "reward": 0.278125, "reward_std": 0.25434703230857847, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9586215257644654, "step": 5660 }, { "completion_length": 415.8, "completions/clipped_ratio": 0.0, "completions/max_length": 415.8, "completions/max_terminated_length": 415.8, "completions/mean_length": 119.2875, "completions/mean_terminated_length": 119.2875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005126817463578328, "frac_reward_zero_std": 0.78125, "grad_norm": 0.08552037924528122, "kl": 0.034425482689403, "learning_rate": 4.0457142857142856e-07, "loss": 0.0, "num_tokens": 416107666.0, "reward": 0.25625, "reward_std": 0.18086103796958924, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9666689753532409, "step": 5665 }, { "completion_length": 433.6, "completions/clipped_ratio": 0.0, "completions/max_length": 433.6, "completions/max_terminated_length": 433.6, "completions/mean_length": 110.75625, "completions/mean_terminated_length": 110.75625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005131342456926588, "frac_reward_zero_std": 0.75625, "grad_norm": 0.12889419496059418, "kl": 0.035181520361220464, "learning_rate": 4.0492857142857137e-07, "loss": 0.0, "num_tokens": 416446954.0, "reward": 0.290625, "reward_std": 0.20695164799690247, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9401789426803588, "step": 5670 }, { "completion_length": 381.4, "completions/clipped_ratio": 0.0, "completions/max_length": 381.4, "completions/max_terminated_length": 381.4, "completions/mean_length": 109.73203125, "completions/mean_terminated_length": 109.73203125, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.005135867450274848, "frac_reward_zero_std": 0.71875, "grad_norm": 0.13211338222026825, "kl": 0.041146397113334385, "learning_rate": 4.052857142857143e-07, "loss": 0.0, "num_tokens": 416782667.0, "reward": 0.3125, "reward_std": 0.252719846367836, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9506789207458496, "step": 5675 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 116.4640625, "completions/mean_terminated_length": 116.4640625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.005140392443623108, "frac_reward_zero_std": 0.7, "grad_norm": 0.14690522849559784, "kl": 0.04381465918850154, "learning_rate": 4.056428571428571e-07, "loss": 0.0, "num_tokens": 417129205.0, "reward": 0.121875, "reward_std": 0.2592382520437241, "rewards/verify_chess_move/mean": 0.121875, "rewards/verify_chess_move/std": 0.9938594579696656, "step": 5680 }, { "completion_length": 345.6, "completions/clipped_ratio": 0.0, "completions/max_length": 345.6, "completions/max_terminated_length": 345.6, "completions/mean_length": 115.153125, "completions/mean_terminated_length": 115.153125, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.005144917436971368, "frac_reward_zero_std": 0.775, "grad_norm": 0.14618276059627533, "kl": 0.0424336796218995, "learning_rate": 4.06e-07, "loss": 0.0, "num_tokens": 417476369.0, "reward": 0.33125, "reward_std": 0.19322159886360168, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.931894838809967, "step": 5685 }, { "completion_length": 337.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 111.99609375, "completions/mean_terminated_length": 111.99609375, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005149442430319628, "frac_reward_zero_std": 0.75625, "grad_norm": 0.3313443660736084, "kl": 0.03468288439325988, "learning_rate": 4.063571428571428e-07, "loss": 0.0, "num_tokens": 417817100.0, "reward": 0.203125, "reward_std": 0.20400458574295044, "rewards/verify_chess_move/mean": 0.203125, "rewards/verify_chess_move/std": 0.9784074783325195, "step": 5690 }, { "completion_length": 344.4, "completions/clipped_ratio": 0.0, "completions/max_length": 344.4, "completions/max_terminated_length": 344.4, "completions/mean_length": 117.32265625, "completions/mean_terminated_length": 117.32265625, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.005153967423667887, "frac_reward_zero_std": 0.7625, "grad_norm": 0.22117552161216736, "kl": 0.03423902881331742, "learning_rate": 4.067142857142857e-07, "loss": 0.0, "num_tokens": 418166377.0, "reward": 0.3125, "reward_std": 0.20001048147678374, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9497768759727478, "step": 5695 }, { "completion_length": 322.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 110.0515625, "completions/mean_terminated_length": 110.0515625, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.005158492417016147, "frac_reward_zero_std": 0.7625, "grad_norm": 0.25740712881088257, "kl": 0.034350941015873104, "learning_rate": 4.0707142857142854e-07, "loss": 0.0, "num_tokens": 418504267.0, "reward": 0.2578125, "reward_std": 0.20978841185569763, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9598998665809632, "step": 5700 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 114.44375, "completions/mean_terminated_length": 114.44375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.005163017410364407, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17791686952114105, "kl": 0.027636262809392066, "learning_rate": 4.074285714285714e-07, "loss": 0.0, "num_tokens": 418847843.0, "reward": 0.1578125, "reward_std": 0.26939559578895567, "rewards/verify_chess_move/mean": 0.1578125, "rewards/verify_chess_move/std": 0.9779787421226501, "step": 5705 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 113.3484375, "completions/mean_terminated_length": 113.3484375, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.005167542403712666, "frac_reward_zero_std": 0.7, "grad_norm": 0.2569639980792999, "kl": 0.029744492226745932, "learning_rate": 4.0778571428571426e-07, "loss": 0.0, "num_tokens": 419189545.0, "reward": 0.309375, "reward_std": 0.2680735617876053, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9489718198776245, "step": 5710 }, { "completion_length": 339.2, "completions/clipped_ratio": 0.0, "completions/max_length": 339.2, "completions/max_terminated_length": 339.2, "completions/mean_length": 120.62109375, "completions/mean_terminated_length": 120.62109375, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0051720673970609265, "frac_reward_zero_std": 0.76875, "grad_norm": 0.3161187767982483, "kl": 0.03059627439943142, "learning_rate": 4.081428571428571e-07, "loss": 0.0, "num_tokens": 419543820.0, "reward": 0.2546875, "reward_std": 0.19511827528476716, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9599462747573853, "step": 5715 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 114.96875, "completions/mean_terminated_length": 114.96875, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.005176592390409186, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2083972692489624, "kl": 0.03578794110217132, "learning_rate": 4.0849999999999993e-07, "loss": 0.0, "num_tokens": 419889932.0, "reward": 0.2515625, "reward_std": 0.1926908865571022, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9601124405860901, "step": 5720 }, { "completion_length": 325.4, "completions/clipped_ratio": 0.0, "completions/max_length": 325.4, "completions/max_terminated_length": 325.4, "completions/mean_length": 122.4203125, "completions/mean_terminated_length": 122.4203125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.005181117383757446, "frac_reward_zero_std": 0.7125, "grad_norm": 0.19817689061164856, "kl": 0.045096542505780234, "learning_rate": 4.0885714285714285e-07, "loss": 0.0, "num_tokens": 420248246.0, "reward": 0.1765625, "reward_std": 0.24761803150177003, "rewards/verify_chess_move/mean": 0.1765625, "rewards/verify_chess_move/std": 0.9837303400039673, "step": 5725 }, { "completion_length": 305.4, "completions/clipped_ratio": 0.0, "completions/max_length": 305.4, "completions/max_terminated_length": 305.4, "completions/mean_length": 110.2921875, "completions/mean_terminated_length": 110.2921875, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005185642377105706, "frac_reward_zero_std": 0.73125, "grad_norm": 0.1581297665834427, "kl": 0.04759661984280683, "learning_rate": 4.0921428571428565e-07, "loss": 0.0, "num_tokens": 420585708.0, "reward": 0.3640625, "reward_std": 0.23483354449272156, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.917686927318573, "step": 5730 }, { "completion_length": 302.8, "completions/clipped_ratio": 0.0, "completions/max_length": 302.8, "completions/max_terminated_length": 302.8, "completions/mean_length": 108.8515625, "completions/mean_terminated_length": 108.8515625, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.005190167370453965, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2318313717842102, "kl": 0.040494993701577185, "learning_rate": 4.0957142857142857e-07, "loss": 0.0, "num_tokens": 420921950.0, "reward": 0.3546875, "reward_std": 0.1890133649110794, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9255098342895508, "step": 5735 }, { "completion_length": 383.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 124.3625, "completions/mean_terminated_length": 124.3625, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.005194692363802225, "frac_reward_zero_std": 0.73125, "grad_norm": 0.19138184189796448, "kl": 0.04549395125941373, "learning_rate": 4.099285714285714e-07, "loss": 0.0, "num_tokens": 421280718.0, "reward": 0.2234375, "reward_std": 0.23619755506515502, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9705299496650696, "step": 5740 }, { "completion_length": 331.8, "completions/clipped_ratio": 0.0, "completions/max_length": 331.8, "completions/max_terminated_length": 331.8, "completions/mean_length": 116.7828125, "completions/mean_terminated_length": 116.7828125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.005199217357150485, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2505626082420349, "kl": 0.06519930572248996, "learning_rate": 4.102857142857143e-07, "loss": 0.0001, "num_tokens": 421629760.0, "reward": 0.2625, "reward_std": 0.18927202224731446, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9632384419441223, "step": 5745 }, { "completion_length": 310.8, "completions/clipped_ratio": 0.0, "completions/max_length": 310.8, "completions/max_terminated_length": 310.8, "completions/mean_length": 118.17890625, "completions/mean_terminated_length": 118.17890625, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.0052037423504987445, "frac_reward_zero_std": 0.74375, "grad_norm": 0.2336340993642807, "kl": 0.09618097397033125, "learning_rate": 4.106428571428571e-07, "loss": 0.0001, "num_tokens": 421981101.0, "reward": 0.3015625, "reward_std": 0.2257815569639206, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9505411505699157, "step": 5750 }, { "completion_length": 411.8, "completions/clipped_ratio": 0.0, "completions/max_length": 411.8, "completions/max_terminated_length": 411.8, "completions/mean_length": 117.63125, "completions/mean_terminated_length": 117.63125, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.005208267343847005, "frac_reward_zero_std": 0.75625, "grad_norm": 0.13805167376995087, "kl": 0.07477716397261247, "learning_rate": 4.1099999999999996e-07, "loss": 0.0001, "num_tokens": 422332053.0, "reward": 0.321875, "reward_std": 0.21967369318008423, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9360445737838745, "step": 5755 }, { "completion_length": 347.4, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/max_terminated_length": 347.4, "completions/mean_length": 115.8265625, "completions/mean_terminated_length": 115.8265625, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.005212792337195265, "frac_reward_zero_std": 0.775, "grad_norm": 0.22366322576999664, "kl": 0.09606207086471841, "learning_rate": 4.113571428571428e-07, "loss": 0.0001, "num_tokens": 422679959.0, "reward": 0.16875, "reward_std": 0.1950603485107422, "rewards/verify_chess_move/mean": 0.16875, "rewards/verify_chess_move/std": 0.9802753925323486, "step": 5760 }, { "completion_length": 325.6, "completions/clipped_ratio": 0.0, "completions/max_length": 325.6, "completions/max_terminated_length": 325.6, "completions/mean_length": 122.0609375, "completions/mean_terminated_length": 122.0609375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.005217317330543524, "frac_reward_zero_std": 0.75625, "grad_norm": 0.3091264069080353, "kl": 0.1526487820665352, "learning_rate": 4.117142857142857e-07, "loss": 0.0002, "num_tokens": 423036005.0, "reward": 0.209375, "reward_std": 0.21877854466438293, "rewards/verify_chess_move/mean": 0.209375, "rewards/verify_chess_move/std": 0.9562015414237977, "step": 5765 }, { "completion_length": 402.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 120.34296875, "completions/mean_terminated_length": 120.34296875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.005221842323891784, "frac_reward_zero_std": 0.73125, "grad_norm": 0.5081127285957336, "kl": 0.10924529629410244, "learning_rate": 4.1207142857142855e-07, "loss": 0.0001, "num_tokens": 423391084.0, "reward": 0.1703125, "reward_std": 0.23141369223594666, "rewards/verify_chess_move/mean": 0.1703125, "rewards/verify_chess_move/std": 0.9818762540817261, "step": 5770 }, { "completion_length": 465.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 465.2, "completions/max_terminated_length": 366.8, "completions/mean_length": 110.66640625, "completions/mean_terminated_length": 110.13920440673829, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.005226367317240043, "frac_reward_zero_std": 0.7875, "grad_norm": 0.16567453742027283, "kl": 0.17396164991660043, "learning_rate": 4.124285714285714e-07, "loss": 0.0002, "num_tokens": 423730857.0, "reward": 0.2765625, "reward_std": 0.17071611434221268, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9578110814094544, "step": 5775 }, { "completion_length": 394.2, "completions/clipped_ratio": 0.0, "completions/max_length": 394.2, "completions/max_terminated_length": 394.2, "completions/mean_length": 117.7984375, "completions/mean_terminated_length": 117.7984375, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.005230892310588303, "frac_reward_zero_std": 0.8, "grad_norm": 0.6362207531929016, "kl": 0.2380146705894731, "learning_rate": 4.1278571428571427e-07, "loss": 0.0002, "num_tokens": 424082615.0, "reward": 0.2890625, "reward_std": 0.1737061396241188, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9529647707939148, "step": 5780 }, { "completion_length": 445.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 445.8, "completions/max_terminated_length": 400.6, "completions/mean_length": 114.7375, "completions/mean_terminated_length": 114.23478698730469, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0052354173039365634, "frac_reward_zero_std": 0.69375, "grad_norm": 0.44374892115592957, "kl": 0.15090776041033677, "learning_rate": 4.1314285714285713e-07, "loss": 0.0002, "num_tokens": 424427839.0, "reward": 0.2265625, "reward_std": 0.2643416315317154, "rewards/verify_chess_move/mean": 0.2265625, "rewards/verify_chess_move/std": 0.961353600025177, "step": 5785 }, { "completion_length": 426.8, "completions/clipped_ratio": 0.0, "completions/max_length": 426.8, "completions/max_terminated_length": 426.8, "completions/mean_length": 115.4046875, "completions/mean_terminated_length": 115.4046875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.005239942297284823, "frac_reward_zero_std": 0.73125, "grad_norm": 0.23643846809864044, "kl": 0.06361408436787315, "learning_rate": 4.1349999999999994e-07, "loss": 0.0001, "num_tokens": 424775477.0, "reward": 0.284375, "reward_std": 0.2362450271844864, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9499392509460449, "step": 5790 }, { "completion_length": 346.6, "completions/clipped_ratio": 0.0, "completions/max_length": 346.6, "completions/max_terminated_length": 346.6, "completions/mean_length": 124.553125, "completions/mean_terminated_length": 124.553125, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.005244467290633083, "frac_reward_zero_std": 0.75625, "grad_norm": 0.2518150508403778, "kl": 0.08315404888708144, "learning_rate": 4.1385714285714285e-07, "loss": 0.0001, "num_tokens": 425138017.0, "reward": 0.265625, "reward_std": 0.21967566907405853, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9526559591293335, "step": 5795 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 118.603125, "completions/mean_terminated_length": 118.603125, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.005248992283981343, "frac_reward_zero_std": 0.7375, "grad_norm": 0.4860943555831909, "kl": 0.07288487928453832, "learning_rate": 4.1421428571428566e-07, "loss": 0.0001, "num_tokens": 425488341.0, "reward": 0.2703125, "reward_std": 0.23835329711437225, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9639590382575989, "step": 5800 }, { "completion_length": 374.8, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 115.93671875, "completions/mean_terminated_length": 115.93671875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.005253517277329602, "frac_reward_zero_std": 0.78125, "grad_norm": 0.3516562879085541, "kl": 0.1057541981106624, "learning_rate": 4.145714285714286e-07, "loss": 0.0001, "num_tokens": 425837004.0, "reward": 0.046875, "reward_std": 0.1817562073469162, "rewards/verify_chess_move/mean": 0.046875, "rewards/verify_chess_move/std": 0.9970943331718445, "step": 5805 }, { "completion_length": 315.4, "completions/clipped_ratio": 0.0, "completions/max_length": 315.4, "completions/max_terminated_length": 315.4, "completions/mean_length": 109.95390625, "completions/mean_terminated_length": 109.95390625, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.005258042270677862, "frac_reward_zero_std": 0.7625, "grad_norm": 0.19201725721359253, "kl": 0.041567096777725965, "learning_rate": 4.149285714285714e-07, "loss": 0.0, "num_tokens": 426175561.0, "reward": 0.2765625, "reward_std": 0.2052641898393631, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9535407185554504, "step": 5810 }, { "completion_length": 346.6, "completions/clipped_ratio": 0.0, "completions/max_length": 346.6, "completions/max_terminated_length": 346.6, "completions/mean_length": 118.15625, "completions/mean_terminated_length": 118.15625, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.005262567264026122, "frac_reward_zero_std": 0.70625, "grad_norm": 0.2806057035923004, "kl": 0.035507672547828405, "learning_rate": 4.1528571428571424e-07, "loss": 0.0, "num_tokens": 426527329.0, "reward": 0.303125, "reward_std": 0.25020064115524293, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9422365307807923, "step": 5815 }, { "completion_length": 308.2, "completions/clipped_ratio": 0.0, "completions/max_length": 308.2, "completions/max_terminated_length": 308.2, "completions/mean_length": 115.2125, "completions/mean_terminated_length": 115.2125, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.0052670922573743815, "frac_reward_zero_std": 0.74375, "grad_norm": 0.2351110577583313, "kl": 0.03891870540101081, "learning_rate": 4.156428571428571e-07, "loss": 0.0, "num_tokens": 426872209.0, "reward": 0.31875, "reward_std": 0.22373063266277313, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9383200407028198, "step": 5820 }, { "completion_length": 317.2, "completions/clipped_ratio": 0.0, "completions/max_length": 317.2, "completions/max_terminated_length": 317.2, "completions/mean_length": 111.34296875, "completions/mean_terminated_length": 111.34296875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.005271617250722642, "frac_reward_zero_std": 0.76875, "grad_norm": 0.09887982159852982, "kl": 0.03694148645736277, "learning_rate": 4.1599999999999997e-07, "loss": 0.0, "num_tokens": 427211336.0, "reward": 0.26875, "reward_std": 0.20195307284593583, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9573778510093689, "step": 5825 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 122.403125, "completions/mean_terminated_length": 122.403125, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.005276142244070901, "frac_reward_zero_std": 0.75625, "grad_norm": 0.1572813242673874, "kl": 0.051392863382352515, "learning_rate": 4.1635714285714283e-07, "loss": 0.0001, "num_tokens": 427567628.0, "reward": 0.2625, "reward_std": 0.20690319538116456, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9589514136314392, "step": 5830 }, { "completion_length": 358.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 122.428125, "completions/mean_terminated_length": 122.428125, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.005280667237419161, "frac_reward_zero_std": 0.7875, "grad_norm": 0.15168406069278717, "kl": 0.033048360573593526, "learning_rate": 4.167142857142857e-07, "loss": 0.0, "num_tokens": 427925440.0, "reward": 0.196875, "reward_std": 0.1882230132818222, "rewards/verify_chess_move/mean": 0.196875, "rewards/verify_chess_move/std": 0.9787122130393981, "step": 5835 }, { "completion_length": 353.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 119.96796875, "completions/mean_terminated_length": 119.96796875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.005285192230767421, "frac_reward_zero_std": 0.7625, "grad_norm": 0.13271868228912354, "kl": 0.035230932419653985, "learning_rate": 4.1707142857142855e-07, "loss": 0.0, "num_tokens": 428277143.0, "reward": 0.2328125, "reward_std": 0.2093175858259201, "rewards/verify_chess_move/mean": 0.2328125, "rewards/verify_chess_move/std": 0.9487215042114258, "step": 5840 }, { "completion_length": 314.6, "completions/clipped_ratio": 0.0, "completions/max_length": 314.6, "completions/max_terminated_length": 314.6, "completions/mean_length": 115.309375, "completions/mean_terminated_length": 115.309375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.00528971722411568, "frac_reward_zero_std": 0.6625, "grad_norm": 0.5499518513679504, "kl": 0.054517581965774296, "learning_rate": 4.174285714285714e-07, "loss": 0.0001, "num_tokens": 428622259.0, "reward": 0.1921875, "reward_std": 0.2903264552354813, "rewards/verify_chess_move/mean": 0.1921875, "rewards/verify_chess_move/std": 0.9600184202194214, "step": 5845 }, { "completion_length": 338.4, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 117.3390625, "completions/mean_terminated_length": 117.3390625, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.00529424221746394, "frac_reward_zero_std": 0.7375, "grad_norm": 0.2013290524482727, "kl": 0.06738642976852134, "learning_rate": 4.177857142857142e-07, "loss": 0.0001, "num_tokens": 428969453.0, "reward": 0.303125, "reward_std": 0.2347232460975647, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9366185903549195, "step": 5850 }, { "completion_length": 291.6, "completions/clipped_ratio": 0.0, "completions/max_length": 291.6, "completions/max_terminated_length": 291.6, "completions/mean_length": 110.1171875, "completions/mean_terminated_length": 110.1171875, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.0052987672108122, "frac_reward_zero_std": 0.76875, "grad_norm": 0.2713363766670227, "kl": 0.04768659648252651, "learning_rate": 4.1814285714285713e-07, "loss": 0.0, "num_tokens": 429306619.0, "reward": 0.2140625, "reward_std": 0.2037443622946739, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.9685346484184265, "step": 5855 }, { "completion_length": 310.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 115.61875, "completions/mean_terminated_length": 115.61875, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.00530329220416046, "frac_reward_zero_std": 0.75625, "grad_norm": 0.19018524885177612, "kl": 0.03850295851007104, "learning_rate": 4.1849999999999994e-07, "loss": 0.0, "num_tokens": 429654475.0, "reward": 0.240625, "reward_std": 0.20605453252792358, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.96402028799057, "step": 5860 }, { "completion_length": 323.2, "completions/clipped_ratio": 0.0, "completions/max_length": 323.2, "completions/max_terminated_length": 323.2, "completions/mean_length": 119.15234375, "completions/mean_terminated_length": 119.15234375, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.00530781719750872, "frac_reward_zero_std": 0.73125, "grad_norm": 0.19829271733760834, "kl": 0.04323049462400377, "learning_rate": 4.1885714285714286e-07, "loss": 0.0, "num_tokens": 430007478.0, "reward": 0.2296875, "reward_std": 0.22936374545097352, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9695867538452149, "step": 5865 }, { "completion_length": 342.4, "completions/clipped_ratio": 0.0, "completions/max_length": 342.4, "completions/max_terminated_length": 342.4, "completions/mean_length": 112.6875, "completions/mean_terminated_length": 112.6875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.00531234219085698, "frac_reward_zero_std": 0.73125, "grad_norm": 0.26809537410736084, "kl": 0.03514871663646772, "learning_rate": 4.1921428571428566e-07, "loss": 0.0, "num_tokens": 430349038.0, "reward": 0.3578125, "reward_std": 0.23115601539611816, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9223249912261963, "step": 5870 }, { "completion_length": 323.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 114.03046875, "completions/mean_terminated_length": 114.03046875, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005316867184205239, "frac_reward_zero_std": 0.7625, "grad_norm": 0.29343387484550476, "kl": 0.042835700453724715, "learning_rate": 4.195714285714286e-07, "loss": 0.0, "num_tokens": 430693925.0, "reward": 0.321875, "reward_std": 0.20568949580192566, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9354507327079773, "step": 5875 }, { "completion_length": 330.6, "completions/clipped_ratio": 0.0, "completions/max_length": 330.6, "completions/max_terminated_length": 330.6, "completions/mean_length": 116.68671875, "completions/mean_terminated_length": 116.68671875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.005321392177553499, "frac_reward_zero_std": 0.76875, "grad_norm": 0.18312208354473114, "kl": 0.03639656474115327, "learning_rate": 4.199285714285714e-07, "loss": 0.0, "num_tokens": 431042780.0, "reward": 0.3640625, "reward_std": 0.19290364384651185, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9227221846580506, "step": 5880 }, { "completion_length": 379.8, "completions/clipped_ratio": 0.0, "completions/max_length": 379.8, "completions/max_terminated_length": 379.8, "completions/mean_length": 111.12734375, "completions/mean_terminated_length": 111.12734375, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.005325917170901758, "frac_reward_zero_std": 0.725, "grad_norm": 0.1454046070575714, "kl": 0.03324137639137916, "learning_rate": 4.2028571428571425e-07, "loss": 0.0, "num_tokens": 431380279.0, "reward": 0.2703125, "reward_std": 0.23493893444538116, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9441238403320312, "step": 5885 }, { "completion_length": 386.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 110.1375, "completions/mean_terminated_length": 110.1375, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.0053304421642500184, "frac_reward_zero_std": 0.73125, "grad_norm": 0.1814977079629898, "kl": 0.04211494080373086, "learning_rate": 4.206428571428571e-07, "loss": 0.0, "num_tokens": 431717919.0, "reward": 0.215625, "reward_std": 0.23009519279003143, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9717147946357727, "step": 5890 }, { "completion_length": 327.6, "completions/clipped_ratio": 0.0, "completions/max_length": 327.6, "completions/max_terminated_length": 327.6, "completions/mean_length": 113.3140625, "completions/mean_terminated_length": 113.3140625, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.0053349671575982785, "frac_reward_zero_std": 0.7625, "grad_norm": 0.1412525326013565, "kl": 0.03390540758264251, "learning_rate": 4.2099999999999997e-07, "loss": 0.0, "num_tokens": 432060809.0, "reward": 0.2484375, "reward_std": 0.2004803240299225, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9515222907066345, "step": 5895 }, { "completion_length": 318.6, "completions/clipped_ratio": 0.0, "completions/max_length": 318.6, "completions/max_terminated_length": 318.6, "completions/mean_length": 114.4953125, "completions/mean_terminated_length": 114.4953125, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005339492150946538, "frac_reward_zero_std": 0.725, "grad_norm": 0.1745930314064026, "kl": 0.03706031548790634, "learning_rate": 4.2135714285714283e-07, "loss": 0.0, "num_tokens": 432403987.0, "reward": 0.3765625, "reward_std": 0.22742311656475067, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9252823114395141, "step": 5900 }, { "completion_length": 332.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 110.36640625, "completions/mean_terminated_length": 110.36640625, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.005344017144294798, "frac_reward_zero_std": 0.79375, "grad_norm": 0.09505756944417953, "kl": 0.03240258102305234, "learning_rate": 4.217142857142857e-07, "loss": 0.0, "num_tokens": 432743720.0, "reward": 0.35, "reward_std": 0.17496730983257294, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.918940794467926, "step": 5905 }, { "completion_length": 321.4, "completions/clipped_ratio": 0.0, "completions/max_length": 321.4, "completions/max_terminated_length": 321.4, "completions/mean_length": 115.703125, "completions/mean_terminated_length": 115.703125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.005348542137643058, "frac_reward_zero_std": 0.7625, "grad_norm": 0.18413202464580536, "kl": 0.0320882112137042, "learning_rate": 4.2207142857142856e-07, "loss": 0.0, "num_tokens": 433092836.0, "reward": 0.209375, "reward_std": 0.21063610017299653, "rewards/verify_chess_move/mean": 0.209375, "rewards/verify_chess_move/std": 0.9647917509078979, "step": 5910 }, { "completion_length": 328.8, "completions/clipped_ratio": 0.0, "completions/max_length": 328.8, "completions/max_terminated_length": 328.8, "completions/mean_length": 118.6640625, "completions/mean_terminated_length": 118.6640625, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.005353067130991317, "frac_reward_zero_std": 0.75, "grad_norm": 0.09832140058279037, "kl": 0.03464841804234311, "learning_rate": 4.224285714285714e-07, "loss": 0.0, "num_tokens": 433442030.0, "reward": 0.30625, "reward_std": 0.21726028621196747, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9430613040924072, "step": 5915 }, { "completion_length": 318.6, "completions/clipped_ratio": 0.0, "completions/max_length": 318.6, "completions/max_terminated_length": 318.6, "completions/mean_length": 118.21953125, "completions/mean_terminated_length": 118.21953125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.005357592124339577, "frac_reward_zero_std": 0.725, "grad_norm": 0.19349268078804016, "kl": 0.03599890378536656, "learning_rate": 4.227857142857142e-07, "loss": 0.0, "num_tokens": 433793743.0, "reward": 0.1171875, "reward_std": 0.23173418939113616, "rewards/verify_chess_move/mean": 0.1171875, "rewards/verify_chess_move/std": 0.9904076457023621, "step": 5920 }, { "completion_length": 327.2, "completions/clipped_ratio": 0.0, "completions/max_length": 327.2, "completions/max_terminated_length": 327.2, "completions/mean_length": 103.3359375, "completions/mean_terminated_length": 103.3359375, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.005362117117687837, "frac_reward_zero_std": 0.775, "grad_norm": 0.20177200436592102, "kl": 0.03264011992723681, "learning_rate": 4.2314285714285714e-07, "loss": 0.0, "num_tokens": 434121141.0, "reward": 0.2890625, "reward_std": 0.18207572102546693, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9361400246620178, "step": 5925 }, { "completion_length": 361.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 113.3203125, "completions/mean_terminated_length": 113.3203125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.005366642111036097, "frac_reward_zero_std": 0.80625, "grad_norm": 0.19482198357582092, "kl": 0.0327100396330934, "learning_rate": 4.2349999999999995e-07, "loss": 0.0, "num_tokens": 434465295.0, "reward": 0.31875, "reward_std": 0.17270265221595765, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9246659994125366, "step": 5930 }, { "completion_length": 313.2, "completions/clipped_ratio": 0.0, "completions/max_length": 313.2, "completions/max_terminated_length": 313.2, "completions/mean_length": 114.090625, "completions/mean_terminated_length": 114.090625, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.005371167104384357, "frac_reward_zero_std": 0.75, "grad_norm": 0.2137754261493683, "kl": 0.03838317213812843, "learning_rate": 4.2385714285714286e-07, "loss": 0.0, "num_tokens": 434807491.0, "reward": 0.3625, "reward_std": 0.22183099389076233, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.929034149646759, "step": 5935 }, { "completion_length": 344.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 126.240625, "completions/mean_terminated_length": 126.240625, "completions/min_length": 41.6, "completions/min_terminated_length": 41.6, "epoch": 0.005375692097732617, "frac_reward_zero_std": 0.79375, "grad_norm": 0.1331167370080948, "kl": 0.036395743174944074, "learning_rate": 4.2421428571428567e-07, "loss": 0.0, "num_tokens": 435172087.0, "reward": 0.18125, "reward_std": 0.17859442234039308, "rewards/verify_chess_move/mean": 0.18125, "rewards/verify_chess_move/std": 0.9628275156021118, "step": 5940 }, { "completion_length": 357.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 107.27109375, "completions/mean_terminated_length": 107.27109375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.005380217091080876, "frac_reward_zero_std": 0.75, "grad_norm": 0.42165911197662354, "kl": 0.033318683493416754, "learning_rate": 4.2457142857142853e-07, "loss": 0.0, "num_tokens": 435505410.0, "reward": 0.2890625, "reward_std": 0.22409508526325225, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9491365075111389, "step": 5945 }, { "completion_length": 315.2, "completions/clipped_ratio": 0.0, "completions/max_length": 315.2, "completions/max_terminated_length": 315.2, "completions/mean_length": 118.6578125, "completions/mean_terminated_length": 118.6578125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.005384742084429136, "frac_reward_zero_std": 0.725, "grad_norm": 0.2997364401817322, "kl": 0.04172541204607114, "learning_rate": 4.249285714285714e-07, "loss": 0.0, "num_tokens": 435857828.0, "reward": 0.1734375, "reward_std": 0.23993398547172545, "rewards/verify_chess_move/mean": 0.1734375, "rewards/verify_chess_move/std": 0.9802643775939941, "step": 5950 }, { "completion_length": 343.4, "completions/clipped_ratio": 0.0, "completions/max_length": 343.4, "completions/max_terminated_length": 343.4, "completions/mean_length": 116.73046875, "completions/mean_terminated_length": 116.73046875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005389267077777395, "frac_reward_zero_std": 0.725, "grad_norm": 0.2610864043235779, "kl": 0.03994203155161813, "learning_rate": 4.2528571428571425e-07, "loss": 0.0, "num_tokens": 436205507.0, "reward": 0.309375, "reward_std": 0.24339738190174104, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9356810092926026, "step": 5955 }, { "completion_length": 327.4, "completions/clipped_ratio": 0.0, "completions/max_length": 327.4, "completions/max_terminated_length": 327.4, "completions/mean_length": 112.65078125, "completions/mean_terminated_length": 112.65078125, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.005393792071125655, "frac_reward_zero_std": 0.85625, "grad_norm": 0.13261981308460236, "kl": 0.06082789284409955, "learning_rate": 4.256428571428571e-07, "loss": 0.0001, "num_tokens": 436549052.0, "reward": 0.3171875, "reward_std": 0.1250466451048851, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9355594873428345, "step": 5960 }, { "completion_length": 371.8, "completions/clipped_ratio": 0.0, "completions/max_length": 371.8, "completions/max_terminated_length": 371.8, "completions/mean_length": 124.746875, "completions/mean_terminated_length": 124.746875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0053983170644739155, "frac_reward_zero_std": 0.79375, "grad_norm": 0.23815308511257172, "kl": 0.1269283509347588, "learning_rate": 4.26e-07, "loss": 0.0001, "num_tokens": 436909432.0, "reward": 0.315625, "reward_std": 0.17133823931217193, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9462652683258057, "step": 5965 }, { "completion_length": 354.6, "completions/clipped_ratio": 0.0, "completions/max_length": 354.6, "completions/max_terminated_length": 354.6, "completions/mean_length": 113.81640625, "completions/mean_terminated_length": 113.81640625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005402842057822175, "frac_reward_zero_std": 0.78125, "grad_norm": 0.20252059400081635, "kl": 0.1833012020913884, "learning_rate": 4.2635714285714284e-07, "loss": 0.0002, "num_tokens": 437252661.0, "reward": 0.4390625, "reward_std": 0.18764541447162628, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.875691831111908, "step": 5970 }, { "completion_length": 341.6, "completions/clipped_ratio": 0.0, "completions/max_length": 341.6, "completions/max_terminated_length": 341.6, "completions/mean_length": 113.87734375, "completions/mean_terminated_length": 113.87734375, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.005407367051170435, "frac_reward_zero_std": 0.7625, "grad_norm": 0.29612860083580017, "kl": 0.3693465417716652, "learning_rate": 4.267142857142857e-07, "loss": 0.0004, "num_tokens": 437596144.0, "reward": 0.2796875, "reward_std": 0.20594814866781236, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9431578755378723, "step": 5975 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 114.97578125, "completions/mean_terminated_length": 114.97578125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005411892044518695, "frac_reward_zero_std": 0.65625, "grad_norm": 0.337520569562912, "kl": 0.1496275174198672, "learning_rate": 4.270714285714285e-07, "loss": 0.0001, "num_tokens": 437940025.0, "reward": 0.24375, "reward_std": 0.30353665053844453, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9690696239471436, "step": 5980 }, { "completion_length": 328.2, "completions/clipped_ratio": 0.0, "completions/max_length": 328.2, "completions/max_terminated_length": 328.2, "completions/mean_length": 119.890625, "completions/mean_terminated_length": 119.890625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.005416417037866954, "frac_reward_zero_std": 0.83125, "grad_norm": 0.286182701587677, "kl": 0.06952031928813085, "learning_rate": 4.274285714285714e-07, "loss": 0.0001, "num_tokens": 438294645.0, "reward": 0.2890625, "reward_std": 0.14844982028007508, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9542667865753174, "step": 5985 }, { "completion_length": 310.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 116.6515625, "completions/mean_terminated_length": 116.6515625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.005420942031215214, "frac_reward_zero_std": 0.8, "grad_norm": 0.2039734125137329, "kl": 0.14149507391266525, "learning_rate": 4.2778571428571423e-07, "loss": 0.0001, "num_tokens": 438642679.0, "reward": 0.3, "reward_std": 0.18138567209243775, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9377940177917481, "step": 5990 }, { "completion_length": 332.4, "completions/clipped_ratio": 0.0, "completions/max_length": 332.4, "completions/max_terminated_length": 332.4, "completions/mean_length": 113.140625, "completions/mean_terminated_length": 113.140625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.005425467024563474, "frac_reward_zero_std": 0.8125, "grad_norm": 0.1507950872182846, "kl": 0.055594176752492785, "learning_rate": 4.2814285714285714e-07, "loss": 0.0001, "num_tokens": 438984315.0, "reward": 0.246875, "reward_std": 0.1635468363761902, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9637229442596436, "step": 5995 }, { "completion_length": 336.6, "completions/clipped_ratio": 0.0, "completions/max_length": 336.6, "completions/max_terminated_length": 336.6, "completions/mean_length": 118.25234375, "completions/mean_terminated_length": 118.25234375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0054299920179117335, "frac_reward_zero_std": 0.78125, "grad_norm": 0.32550475001335144, "kl": 0.07417388391913846, "learning_rate": 4.2849999999999995e-07, "loss": 0.0001, "num_tokens": 439336774.0, "reward": 0.20625, "reward_std": 0.18585609197616576, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9622326970100403, "step": 6000 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 115.9921875, "completions/mean_terminated_length": 115.9921875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.005434517011259994, "frac_reward_zero_std": 0.8, "grad_norm": 0.13169892132282257, "kl": 0.08850631377426907, "learning_rate": 4.2885714285714287e-07, "loss": 0.0001, "num_tokens": 439686964.0, "reward": 0.165625, "reward_std": 0.17669872269034387, "rewards/verify_chess_move/mean": 0.165625, "rewards/verify_chess_move/std": 0.9679984450340271, "step": 6005 }, { "completion_length": 373.2, "completions/clipped_ratio": 0.0, "completions/max_length": 373.2, "completions/max_terminated_length": 373.2, "completions/mean_length": 113.19140625, "completions/mean_terminated_length": 113.19140625, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.005439042004608253, "frac_reward_zero_std": 0.78125, "grad_norm": 0.22091110050678253, "kl": 0.12311725131003186, "learning_rate": 4.292142857142857e-07, "loss": 0.0001, "num_tokens": 440029001.0, "reward": 0.3328125, "reward_std": 0.18995696157217026, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9304391741752625, "step": 6010 }, { "completion_length": 327.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 120.73828125, "completions/mean_terminated_length": 120.73828125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.005443566997956513, "frac_reward_zero_std": 0.775, "grad_norm": 0.4461183547973633, "kl": 0.24505877266637982, "learning_rate": 4.2957142857142854e-07, "loss": 0.0002, "num_tokens": 440384058.0, "reward": 0.196875, "reward_std": 0.19958458244800567, "rewards/verify_chess_move/mean": 0.196875, "rewards/verify_chess_move/std": 0.9763614773750305, "step": 6015 }, { "completion_length": 324.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 113.17421875, "completions/mean_terminated_length": 113.17421875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005448091991304773, "frac_reward_zero_std": 0.8375, "grad_norm": 0.2702443599700928, "kl": 0.4728744267486036, "learning_rate": 4.299285714285714e-07, "loss": 0.0005, "num_tokens": 440729721.0, "reward": 0.1359375, "reward_std": 0.13766840174794198, "rewards/verify_chess_move/mean": 0.1359375, "rewards/verify_chess_move/std": 0.9877361536026001, "step": 6020 }, { "completion_length": 436.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.6, "completions/max_terminated_length": 352.2, "completions/mean_length": 114.3640625, "completions/mean_terminated_length": 113.86036834716796, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.005452616984653032, "frac_reward_zero_std": 0.70625, "grad_norm": 0.5354937314987183, "kl": 4.084268877306021, "learning_rate": 4.3028571428571426e-07, "loss": 0.0041, "num_tokens": 441073379.0, "reward": 0.3140625, "reward_std": 0.26087372899055483, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9466499924659729, "step": 6025 }, { "completion_length": 308.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 119.0703125, "completions/mean_terminated_length": 119.0703125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.005457141978001292, "frac_reward_zero_std": 0.775, "grad_norm": 0.5410548448562622, "kl": 0.3136285826098174, "learning_rate": 4.306428571428571e-07, "loss": 0.0003, "num_tokens": 441427101.0, "reward": 0.2140625, "reward_std": 0.19842880666255952, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.9733586430549621, "step": 6030 }, { "completion_length": 333.6, "completions/clipped_ratio": 0.0, "completions/max_length": 333.6, "completions/max_terminated_length": 333.6, "completions/mean_length": 115.66953125, "completions/mean_terminated_length": 115.66953125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0054616669713495524, "frac_reward_zero_std": 0.7875, "grad_norm": 0.19375824928283691, "kl": 0.17227176192682236, "learning_rate": 4.31e-07, "loss": 0.0002, "num_tokens": 441773302.0, "reward": 0.3015625, "reward_std": 0.18737336695194245, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9439716339111328, "step": 6035 }, { "completion_length": 332.8, "completions/clipped_ratio": 0.0, "completions/max_length": 332.8, "completions/max_terminated_length": 332.8, "completions/mean_length": 114.17890625, "completions/mean_terminated_length": 114.17890625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.005466191964697812, "frac_reward_zero_std": 0.80625, "grad_norm": 0.38244545459747314, "kl": 0.14030933397589251, "learning_rate": 4.3135714285714284e-07, "loss": 0.0001, "num_tokens": 442117323.0, "reward": 0.2875, "reward_std": 0.16928476095199585, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9343019962310791, "step": 6040 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 118.83984375, "completions/mean_terminated_length": 118.83984375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.005470716958046072, "frac_reward_zero_std": 0.75625, "grad_norm": 0.4905371069908142, "kl": 0.05069697416620329, "learning_rate": 4.317142857142857e-07, "loss": 0.0001, "num_tokens": 442465902.0, "reward": 0.3828125, "reward_std": 0.20332159399986266, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.920637333393097, "step": 6045 }, { "completion_length": 377.4, "completions/clipped_ratio": 0.0, "completions/max_length": 377.4, "completions/max_terminated_length": 377.4, "completions/mean_length": 112.12734375, "completions/mean_terminated_length": 112.12734375, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005475241951394332, "frac_reward_zero_std": 0.78125, "grad_norm": 0.09377816319465637, "kl": 0.03772723762667738, "learning_rate": 4.320714285714285e-07, "loss": 0.0, "num_tokens": 442805657.0, "reward": 0.2828125, "reward_std": 0.1831231564283371, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9485804080963135, "step": 6050 }, { "completion_length": 294.2, "completions/clipped_ratio": 0.0, "completions/max_length": 294.2, "completions/max_terminated_length": 294.2, "completions/mean_length": 110.98671875, "completions/mean_terminated_length": 110.98671875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.005479766944742591, "frac_reward_zero_std": 0.75625, "grad_norm": 0.4282471239566803, "kl": 0.04331001670798287, "learning_rate": 4.3242857142857143e-07, "loss": 0.0, "num_tokens": 443145176.0, "reward": 0.240625, "reward_std": 0.2148917943239212, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9689464449882508, "step": 6055 }, { "completion_length": 387.4, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/max_terminated_length": 387.4, "completions/mean_length": 124.4515625, "completions/mean_terminated_length": 124.4515625, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.005484291938090851, "frac_reward_zero_std": 0.75, "grad_norm": 0.32559478282928467, "kl": 0.0643580871692393, "learning_rate": 4.3278571428571424e-07, "loss": 0.0001, "num_tokens": 443505570.0, "reward": 0.21875, "reward_std": 0.2108992636203766, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9718159794807434, "step": 6060 }, { "completion_length": 418.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 418.0, "completions/max_terminated_length": 329.4, "completions/mean_length": 118.71796875, "completions/mean_terminated_length": 118.20745239257812, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.00548881693143911, "frac_reward_zero_std": 0.74375, "grad_norm": 0.25103768706321716, "kl": 0.03255897685303353, "learning_rate": 4.3314285714285715e-07, "loss": 0.0, "num_tokens": 443857617.0, "reward": 0.14375, "reward_std": 0.2279952198266983, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.9881165027618408, "step": 6065 }, { "completion_length": 428.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 428.2, "completions/max_terminated_length": 341.0, "completions/mean_length": 119.29921875, "completions/mean_terminated_length": 118.78775329589844, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0054933419247873705, "frac_reward_zero_std": 0.75, "grad_norm": 0.20447592437267303, "kl": 0.03257891954272054, "learning_rate": 4.3349999999999996e-07, "loss": 0.0, "num_tokens": 444210720.0, "reward": 0.290625, "reward_std": 0.22630874514579774, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9512238740921021, "step": 6070 }, { "completion_length": 320.6, "completions/clipped_ratio": 0.0, "completions/max_length": 320.6, "completions/max_terminated_length": 320.6, "completions/mean_length": 113.9484375, "completions/mean_terminated_length": 113.9484375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.005497866918135631, "frac_reward_zero_std": 0.80625, "grad_norm": 0.21755565702915192, "kl": 0.04709612008882687, "learning_rate": 4.338571428571428e-07, "loss": 0.0, "num_tokens": 444556046.0, "reward": 0.2140625, "reward_std": 0.17022739350795746, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.9594256401062011, "step": 6075 }, { "completion_length": 344.2, "completions/clipped_ratio": 0.0, "completions/max_length": 344.2, "completions/max_terminated_length": 344.2, "completions/mean_length": 116.69921875, "completions/mean_terminated_length": 116.69921875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.00550239191148389, "frac_reward_zero_std": 0.84375, "grad_norm": 0.12301802635192871, "kl": 0.04498158200294711, "learning_rate": 4.342142857142857e-07, "loss": 0.0, "num_tokens": 444905517.0, "reward": 0.2875, "reward_std": 0.14418268203735352, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9572460174560546, "step": 6080 }, { "completion_length": 317.2, "completions/clipped_ratio": 0.0, "completions/max_length": 317.2, "completions/max_terminated_length": 317.2, "completions/mean_length": 129.065625, "completions/mean_terminated_length": 129.065625, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.00550691690483215, "frac_reward_zero_std": 0.7375, "grad_norm": 0.1551414281129837, "kl": 0.047143833013251424, "learning_rate": 4.3457142857142854e-07, "loss": 0.0, "num_tokens": 445273833.0, "reward": 0.1109375, "reward_std": 0.22110505700111388, "rewards/verify_chess_move/mean": 0.1109375, "rewards/verify_chess_move/std": 0.9856562376022339, "step": 6085 }, { "completion_length": 324.8, "completions/clipped_ratio": 0.0, "completions/max_length": 324.8, "completions/max_terminated_length": 324.8, "completions/mean_length": 117.25625, "completions/mean_terminated_length": 117.25625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.00551144189818041, "frac_reward_zero_std": 0.81875, "grad_norm": 0.12150520831346512, "kl": 0.0730203058803454, "learning_rate": 4.349285714285714e-07, "loss": 0.0001, "num_tokens": 445624913.0, "reward": 0.2703125, "reward_std": 0.14840489774942398, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9547615885734558, "step": 6090 }, { "completion_length": 329.4, "completions/clipped_ratio": 0.0, "completions/max_length": 329.4, "completions/max_terminated_length": 329.4, "completions/mean_length": 112.00234375, "completions/mean_terminated_length": 112.00234375, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.005515966891528669, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2215033322572708, "kl": 0.06268228882690892, "learning_rate": 4.3528571428571426e-07, "loss": 0.0001, "num_tokens": 445964636.0, "reward": 0.4625, "reward_std": 0.1906379997730255, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.8490774393081665, "step": 6095 }, { "completion_length": 334.2, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/max_terminated_length": 334.2, "completions/mean_length": 122.31640625, "completions/mean_terminated_length": 122.31640625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005520491884876929, "frac_reward_zero_std": 0.7375, "grad_norm": 0.38926902413368225, "kl": 0.070642520126421, "learning_rate": 4.356428571428571e-07, "loss": 0.0001, "num_tokens": 446323537.0, "reward": 0.2296875, "reward_std": 0.22562927901744842, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9651400089263916, "step": 6100 }, { "completion_length": 313.8, "completions/clipped_ratio": 0.0, "completions/max_length": 313.8, "completions/max_terminated_length": 313.8, "completions/mean_length": 111.37265625, "completions/mean_terminated_length": 111.37265625, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.005525016878225189, "frac_reward_zero_std": 0.84375, "grad_norm": 0.29619720578193665, "kl": 0.06195159724447876, "learning_rate": 4.36e-07, "loss": 0.0001, "num_tokens": 446665214.0, "reward": 0.359375, "reward_std": 0.1339329481124878, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9335727095603943, "step": 6105 }, { "completion_length": 350.4, "completions/clipped_ratio": 0.0, "completions/max_length": 350.4, "completions/max_terminated_length": 350.4, "completions/mean_length": 115.528125, "completions/mean_terminated_length": 115.528125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005529541871573449, "frac_reward_zero_std": 0.75, "grad_norm": 0.2936720550060272, "kl": 0.09808058181079105, "learning_rate": 4.363571428571428e-07, "loss": 0.0001, "num_tokens": 447013690.0, "reward": 0.184375, "reward_std": 0.20906049609184266, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.9778099775314331, "step": 6110 }, { "completion_length": 324.8, "completions/clipped_ratio": 0.0, "completions/max_length": 324.8, "completions/max_terminated_length": 324.8, "completions/mean_length": 115.3125, "completions/mean_terminated_length": 115.3125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005534066864921709, "frac_reward_zero_std": 0.79375, "grad_norm": 0.4759315252304077, "kl": 0.07524420151021331, "learning_rate": 4.367142857142857e-07, "loss": 0.0001, "num_tokens": 447358122.0, "reward": 0.33125, "reward_std": 0.17838323265314102, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.932015061378479, "step": 6115 }, { "completion_length": 409.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 409.0, "completions/max_terminated_length": 318.2, "completions/mean_length": 112.38046875, "completions/mean_terminated_length": 111.85384979248047, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.005538591858269968, "frac_reward_zero_std": 0.8125, "grad_norm": 0.33573493361473083, "kl": 0.05173891222220846, "learning_rate": 4.370714285714285e-07, "loss": 0.0001, "num_tokens": 447700921.0, "reward": 0.315625, "reward_std": 0.16192122399806977, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9423964381217956, "step": 6120 }, { "completion_length": 325.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 117.0453125, "completions/mean_terminated_length": 117.0453125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.005543116851618228, "frac_reward_zero_std": 0.81875, "grad_norm": 0.27720996737480164, "kl": 0.11752792784245684, "learning_rate": 4.3742857142857143e-07, "loss": 0.0001, "num_tokens": 448050203.0, "reward": 0.3421875, "reward_std": 0.16728072315454484, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9213981986045837, "step": 6125 }, { "completion_length": 382.4, "completions/clipped_ratio": 0.0, "completions/max_length": 382.4, "completions/max_terminated_length": 382.4, "completions/mean_length": 115.54296875, "completions/mean_terminated_length": 115.54296875, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.005547641844966488, "frac_reward_zero_std": 0.7875, "grad_norm": 0.20251277089118958, "kl": 0.09598855139338411, "learning_rate": 4.3778571428571424e-07, "loss": 0.0001, "num_tokens": 448394498.0, "reward": 0.4546875, "reward_std": 0.18391095399856566, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8656901597976685, "step": 6130 }, { "completion_length": 371.8, "completions/clipped_ratio": 0.0, "completions/max_length": 371.8, "completions/max_terminated_length": 371.8, "completions/mean_length": 116.2421875, "completions/mean_terminated_length": 116.2421875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.005552166838314747, "frac_reward_zero_std": 0.81875, "grad_norm": 0.4194822907447815, "kl": 0.10902901845984161, "learning_rate": 4.3814285714285715e-07, "loss": 0.0001, "num_tokens": 448741352.0, "reward": 0.340625, "reward_std": 0.15156413316726686, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9257425308227539, "step": 6135 }, { "completion_length": 350.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 120.03828125, "completions/mean_terminated_length": 120.03828125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0055566918316630074, "frac_reward_zero_std": 0.7875, "grad_norm": 0.2241380512714386, "kl": 0.09187019031960517, "learning_rate": 4.3849999999999996e-07, "loss": 0.0001, "num_tokens": 449094305.0, "reward": 0.303125, "reward_std": 0.18186002373695373, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9469608426094055, "step": 6140 }, { "completion_length": 324.6, "completions/clipped_ratio": 0.0, "completions/max_length": 324.6, "completions/max_terminated_length": 324.6, "completions/mean_length": 117.42890625, "completions/mean_terminated_length": 117.42890625, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0055612168250112675, "frac_reward_zero_std": 0.825, "grad_norm": 0.2315206229686737, "kl": 0.08566123475320638, "learning_rate": 4.388571428571428e-07, "loss": 0.0001, "num_tokens": 449441782.0, "reward": 0.2390625, "reward_std": 0.15628614723682405, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9638966917991638, "step": 6145 }, { "completion_length": 317.4, "completions/clipped_ratio": 0.0, "completions/max_length": 317.4, "completions/max_terminated_length": 317.4, "completions/mean_length": 126.81484375, "completions/mean_terminated_length": 126.81484375, "completions/min_length": 39.4, "completions/min_terminated_length": 39.4, "epoch": 0.005565741818359527, "frac_reward_zero_std": 0.75625, "grad_norm": 0.2657696604728699, "kl": 0.19550817222334443, "learning_rate": 4.392142857142857e-07, "loss": 0.0002, "num_tokens": 449807553.0, "reward": 0.259375, "reward_std": 0.21437252759933473, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9527201890945435, "step": 6150 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 119.66328125, "completions/mean_terminated_length": 119.66328125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.005570266811707787, "frac_reward_zero_std": 0.80625, "grad_norm": 0.13654395937919617, "kl": 0.08919853561092168, "learning_rate": 4.3957142857142855e-07, "loss": 0.0001, "num_tokens": 450160450.0, "reward": 0.1234375, "reward_std": 0.16455033123493196, "rewards/verify_chess_move/mean": 0.1234375, "rewards/verify_chess_move/std": 0.9715636491775512, "step": 6155 }, { "completion_length": 369.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 113.77578125, "completions/mean_terminated_length": 113.77578125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.005574791805056047, "frac_reward_zero_std": 0.80625, "grad_norm": 0.39960530400276184, "kl": 0.07726031399797648, "learning_rate": 4.399285714285714e-07, "loss": 0.0001, "num_tokens": 450504603.0, "reward": 0.309375, "reward_std": 0.17133471071720124, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.932224440574646, "step": 6160 }, { "completion_length": 384.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 116.0421875, "completions/mean_terminated_length": 116.0421875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.005579316798404306, "frac_reward_zero_std": 0.76875, "grad_norm": 0.17777912318706512, "kl": 0.05870841634459793, "learning_rate": 4.4028571428571427e-07, "loss": 0.0001, "num_tokens": 450849993.0, "reward": 0.321875, "reward_std": 0.2044273555278778, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9399835348129273, "step": 6165 }, { "completion_length": 345.4, "completions/clipped_ratio": 0.0, "completions/max_length": 345.4, "completions/max_terminated_length": 345.4, "completions/mean_length": 113.26953125, "completions/mean_terminated_length": 113.26953125, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005583841791752566, "frac_reward_zero_std": 0.7625, "grad_norm": 0.24927948415279388, "kl": 0.042021895345533265, "learning_rate": 4.4064285714285713e-07, "loss": 0.0, "num_tokens": 451192234.0, "reward": 0.246875, "reward_std": 0.1918571799993515, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9552194356918335, "step": 6170 }, { "completion_length": 317.8, "completions/clipped_ratio": 0.0, "completions/max_length": 317.8, "completions/max_terminated_length": 317.8, "completions/mean_length": 110.5921875, "completions/mean_terminated_length": 110.5921875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0055883667851008255, "frac_reward_zero_std": 0.85, "grad_norm": 0.1782568097114563, "kl": 0.048251398652791974, "learning_rate": 4.41e-07, "loss": 0.0, "num_tokens": 451532464.0, "reward": 0.3703125, "reward_std": 0.13335379362106323, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9265484690666199, "step": 6175 }, { "completion_length": 321.8, "completions/clipped_ratio": 0.0, "completions/max_length": 321.8, "completions/max_terminated_length": 321.8, "completions/mean_length": 118.8734375, "completions/mean_terminated_length": 118.8734375, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.005592891778449086, "frac_reward_zero_std": 0.79375, "grad_norm": 0.13753502070903778, "kl": 0.04650276113534346, "learning_rate": 4.413571428571428e-07, "loss": 0.0, "num_tokens": 451883910.0, "reward": 0.3421875, "reward_std": 0.17107761800289153, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9362339496612548, "step": 6180 }, { "completion_length": 345.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 117.38203125, "completions/mean_terminated_length": 117.38203125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005597416771797346, "frac_reward_zero_std": 0.7625, "grad_norm": 0.29554441571235657, "kl": 0.09148748457664624, "learning_rate": 4.417142857142857e-07, "loss": 0.0001, "num_tokens": 452231215.0, "reward": 0.2859375, "reward_std": 0.20821125209331512, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9493736147880554, "step": 6185 }, { "completion_length": 355.4, "completions/clipped_ratio": 0.0, "completions/max_length": 355.4, "completions/max_terminated_length": 355.4, "completions/mean_length": 111.9453125, "completions/mean_terminated_length": 111.9453125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.005601941765145605, "frac_reward_zero_std": 0.79375, "grad_norm": 0.76078200340271, "kl": 0.13859481570543722, "learning_rate": 4.420714285714285e-07, "loss": 0.0001, "num_tokens": 452571649.0, "reward": 0.440625, "reward_std": 0.17953802943229674, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.892671549320221, "step": 6190 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 120.44921875, "completions/mean_terminated_length": 120.44921875, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.005606466758493865, "frac_reward_zero_std": 0.7875, "grad_norm": 0.7190703749656677, "kl": 0.7966145955957472, "learning_rate": 4.4242857142857144e-07, "loss": 0.0008, "num_tokens": 452925168.0, "reward": 0.26875, "reward_std": 0.17959889769554138, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9570582628250122, "step": 6195 }, { "completion_length": 448.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 448.0, "completions/max_terminated_length": 365.6, "completions/mean_length": 115.225, "completions/mean_terminated_length": 114.71260528564453, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.005610991751842125, "frac_reward_zero_std": 0.7625, "grad_norm": 0.3849545419216156, "kl": 0.29401307767257095, "learning_rate": 4.4278571428571425e-07, "loss": 0.0003, "num_tokens": 453271848.0, "reward": 0.196875, "reward_std": 0.2063734635710716, "rewards/verify_chess_move/mean": 0.196875, "rewards/verify_chess_move/std": 0.9748721361160279, "step": 6200 }, { "completion_length": 363.6, "completions/clipped_ratio": 0.0, "completions/max_length": 363.6, "completions/max_terminated_length": 363.6, "completions/mean_length": 109.07734375, "completions/mean_terminated_length": 109.07734375, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.005615516745190384, "frac_reward_zero_std": 0.81875, "grad_norm": 0.4296342730522156, "kl": 0.14389986775349825, "learning_rate": 4.431428571428571e-07, "loss": 0.0001, "num_tokens": 453608507.0, "reward": 0.2953125, "reward_std": 0.15571150183677673, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9349283337593078, "step": 6205 }, { "completion_length": 353.4, "completions/clipped_ratio": 0.0, "completions/max_length": 353.4, "completions/max_terminated_length": 353.4, "completions/mean_length": 115.75390625, "completions/mean_terminated_length": 115.75390625, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.005620041738538644, "frac_reward_zero_std": 0.78125, "grad_norm": 0.29931554198265076, "kl": 0.0833508113399148, "learning_rate": 4.4349999999999997e-07, "loss": 0.0001, "num_tokens": 453954656.0, "reward": 0.384375, "reward_std": 0.1954238161444664, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9147024989128113, "step": 6210 }, { "completion_length": 333.4, "completions/clipped_ratio": 0.0, "completions/max_length": 333.4, "completions/max_terminated_length": 333.4, "completions/mean_length": 120.0859375, "completions/mean_terminated_length": 120.0859375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.0056245667318869045, "frac_reward_zero_std": 0.80625, "grad_norm": 0.157633438706398, "kl": 0.0465205890010111, "learning_rate": 4.4385714285714283e-07, "loss": 0.0, "num_tokens": 454307462.0, "reward": 0.3140625, "reward_std": 0.1738099694252014, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9456668496131897, "step": 6215 }, { "completion_length": 299.4, "completions/clipped_ratio": 0.0, "completions/max_length": 299.4, "completions/max_terminated_length": 299.4, "completions/mean_length": 113.184375, "completions/mean_terminated_length": 113.184375, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "epoch": 0.005629091725235164, "frac_reward_zero_std": 0.7875, "grad_norm": 0.269096314907074, "kl": 0.048922970599960536, "learning_rate": 4.442142857142857e-07, "loss": 0.0, "num_tokens": 454651010.0, "reward": 0.275, "reward_std": 0.18438275754451752, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9562541365623474, "step": 6220 }, { "completion_length": 342.6, "completions/clipped_ratio": 0.0, "completions/max_length": 342.6, "completions/max_terminated_length": 342.6, "completions/mean_length": 109.925, "completions/mean_terminated_length": 109.925, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.005633616718583424, "frac_reward_zero_std": 0.7625, "grad_norm": 0.19586452841758728, "kl": 0.08231429309817032, "learning_rate": 4.4457142857142855e-07, "loss": 0.0001, "num_tokens": 454987210.0, "reward": 0.246875, "reward_std": 0.2041103795170784, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9563554048538208, "step": 6225 }, { "completion_length": 350.6, "completions/clipped_ratio": 0.0, "completions/max_length": 350.6, "completions/max_terminated_length": 350.6, "completions/mean_length": 117.71328125, "completions/mean_terminated_length": 117.71328125, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005638141711931683, "frac_reward_zero_std": 0.7625, "grad_norm": 0.23688454926013947, "kl": 0.0840716349077411, "learning_rate": 4.449285714285714e-07, "loss": 0.0001, "num_tokens": 455337099.0, "reward": 0.3859375, "reward_std": 0.21004804968833923, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9213219523429871, "step": 6230 }, { "completion_length": 402.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 120.08671875, "completions/mean_terminated_length": 120.08671875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.005642666705279943, "frac_reward_zero_std": 0.7625, "grad_norm": 0.3626321256160736, "kl": 0.12302703745663165, "learning_rate": 4.452857142857143e-07, "loss": 0.0001, "num_tokens": 455691706.0, "reward": 0.240625, "reward_std": 0.20347486138343812, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9679353833198547, "step": 6235 }, { "completion_length": 338.8, "completions/clipped_ratio": 0.0, "completions/max_length": 338.8, "completions/max_terminated_length": 338.8, "completions/mean_length": 108.628125, "completions/mean_terminated_length": 108.628125, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.005647191698628203, "frac_reward_zero_std": 0.7875, "grad_norm": 0.7404000163078308, "kl": 0.27318180208094417, "learning_rate": 4.456428571428571e-07, "loss": 0.0003, "num_tokens": 456027334.0, "reward": 0.296875, "reward_std": 0.18211966156959533, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.93174649477005, "step": 6240 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 112.46875, "completions/mean_terminated_length": 112.46875, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0056517166919764624, "frac_reward_zero_std": 0.81875, "grad_norm": 0.41054168343544006, "kl": 0.28910297215916214, "learning_rate": 4.46e-07, "loss": 0.0003, "num_tokens": 456370158.0, "reward": 0.3515625, "reward_std": 0.15860814452171326, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.918462598323822, "step": 6245 }, { "completion_length": 320.6, "completions/clipped_ratio": 0.0, "completions/max_length": 320.6, "completions/max_terminated_length": 320.6, "completions/mean_length": 117.40390625, "completions/mean_terminated_length": 117.40390625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0056562416853247225, "frac_reward_zero_std": 0.7875, "grad_norm": 0.517532467842102, "kl": 0.13981365124927833, "learning_rate": 4.463571428571428e-07, "loss": 0.0001, "num_tokens": 456719955.0, "reward": 0.20625, "reward_std": 0.1839099794626236, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9673580884933471, "step": 6250 }, { "completion_length": 319.4, "completions/clipped_ratio": 0.0, "completions/max_length": 319.4, "completions/max_terminated_length": 319.4, "completions/mean_length": 119.3984375, "completions/mean_terminated_length": 119.3984375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.005660766678672983, "frac_reward_zero_std": 0.825, "grad_norm": 0.5129820108413696, "kl": 0.15123679457465186, "learning_rate": 4.467142857142857e-07, "loss": 0.0002, "num_tokens": 457074433.0, "reward": 0.3109375, "reward_std": 0.15444738417863846, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9426936984062195, "step": 6255 }, { "completion_length": 357.2, "completions/clipped_ratio": 0.0, "completions/max_length": 357.2, "completions/max_terminated_length": 357.2, "completions/mean_length": 118.66015625, "completions/mean_terminated_length": 118.66015625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005665291672021242, "frac_reward_zero_std": 0.8, "grad_norm": 0.49683305621147156, "kl": 0.0771644828724675, "learning_rate": 4.4707142857142853e-07, "loss": 0.0001, "num_tokens": 457427334.0, "reward": 0.1921875, "reward_std": 0.17964283227920533, "rewards/verify_chess_move/mean": 0.1921875, "rewards/verify_chess_move/std": 0.9740267515182495, "step": 6260 }, { "completion_length": 326.6, "completions/clipped_ratio": 0.0, "completions/max_length": 326.6, "completions/max_terminated_length": 326.6, "completions/mean_length": 100.33515625, "completions/mean_terminated_length": 100.33515625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.005669816665369502, "frac_reward_zero_std": 0.825, "grad_norm": 0.36012619733810425, "kl": 0.3131868388270959, "learning_rate": 4.4742857142857144e-07, "loss": 0.0003, "num_tokens": 457750459.0, "reward": 0.3609375, "reward_std": 0.14693156778812408, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9276511311531067, "step": 6265 }, { "completion_length": 317.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 116.3265625, "completions/mean_terminated_length": 116.3265625, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.005674341658717762, "frac_reward_zero_std": 0.81875, "grad_norm": 0.18997499346733093, "kl": 0.24600781660992652, "learning_rate": 4.4778571428571425e-07, "loss": 0.0002, "num_tokens": 458100877.0, "reward": 0.21875, "reward_std": 0.15019619315862656, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.97447429895401, "step": 6270 }, { "completion_length": 408.2, "completions/clipped_ratio": 0.0, "completions/max_length": 408.2, "completions/max_terminated_length": 408.2, "completions/mean_length": 114.1578125, "completions/mean_terminated_length": 114.1578125, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.005678866652066021, "frac_reward_zero_std": 0.80625, "grad_norm": 0.8622578382492065, "kl": 0.24245555829256774, "learning_rate": 4.481428571428571e-07, "loss": 0.0002, "num_tokens": 458445703.0, "reward": 0.2453125, "reward_std": 0.16881491541862487, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9608788132667542, "step": 6275 }, { "completion_length": 315.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 117.9640625, "completions/mean_terminated_length": 117.9640625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.005683391645414281, "frac_reward_zero_std": 0.80625, "grad_norm": 0.8804014325141907, "kl": 0.37300486946478484, "learning_rate": 4.4849999999999997e-07, "loss": 0.0004, "num_tokens": 458796817.0, "reward": 0.20625, "reward_std": 0.17586089074611663, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9674809098243713, "step": 6280 }, { "completion_length": 320.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 107.48125, "completions/mean_terminated_length": 107.48125, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0056879166387625415, "frac_reward_zero_std": 0.79375, "grad_norm": 0.3683713674545288, "kl": 0.24025859124958515, "learning_rate": 4.4885714285714283e-07, "loss": 0.0002, "num_tokens": 459132449.0, "reward": 0.275, "reward_std": 0.17428333163261414, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9541639447212219, "step": 6285 }, { "completion_length": 352.6, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/max_terminated_length": 352.6, "completions/mean_length": 112.67265625, "completions/mean_terminated_length": 112.67265625, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.005692441632110801, "frac_reward_zero_std": 0.8, "grad_norm": 1.572800636291504, "kl": 0.1899668992497027, "learning_rate": 4.492142857142857e-07, "loss": 0.0002, "num_tokens": 459474598.0, "reward": 0.3765625, "reward_std": 0.17622691690921782, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9220016837120056, "step": 6290 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 109.159375, "completions/mean_terminated_length": 109.159375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005696966625459061, "frac_reward_zero_std": 0.78125, "grad_norm": 0.3745439946651459, "kl": 0.09325032690539956, "learning_rate": 4.4957142857142856e-07, "loss": 0.0001, "num_tokens": 459810186.0, "reward": 0.309375, "reward_std": 0.18474974632263183, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9458228826522828, "step": 6295 }, { "completion_length": 344.6, "completions/clipped_ratio": 0.0, "completions/max_length": 344.6, "completions/max_terminated_length": 344.6, "completions/mean_length": 114.4828125, "completions/mean_terminated_length": 114.4828125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.00570149161880732, "frac_reward_zero_std": 0.775, "grad_norm": 0.47438302636146545, "kl": 0.10172413126565516, "learning_rate": 4.499285714285714e-07, "loss": 0.0001, "num_tokens": 460156044.0, "reward": 0.328125, "reward_std": 0.19211329221725465, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9287435054779053, "step": 6300 }, { "completion_length": 318.4, "completions/clipped_ratio": 0.0, "completions/max_length": 318.4, "completions/max_terminated_length": 318.4, "completions/mean_length": 122.69453125, "completions/mean_terminated_length": 122.69453125, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.00570601661215558, "frac_reward_zero_std": 0.825, "grad_norm": 0.012706879526376724, "kl": 0.12448350364575163, "learning_rate": 4.502857142857143e-07, "loss": 0.0001, "num_tokens": 460515917.0, "reward": 0.2125, "reward_std": 0.14850970804691316, "rewards/verify_chess_move/mean": 0.2125, "rewards/verify_chess_move/std": 0.9693731784820556, "step": 6305 }, { "completion_length": 307.4, "completions/clipped_ratio": 0.0, "completions/max_length": 307.4, "completions/max_terminated_length": 307.4, "completions/mean_length": 109.915625, "completions/mean_terminated_length": 109.915625, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.00571054160550384, "frac_reward_zero_std": 0.80625, "grad_norm": 0.900042712688446, "kl": 0.0974255918408744, "learning_rate": 4.506428571428571e-07, "loss": 0.0001, "num_tokens": 460853313.0, "reward": 0.3671875, "reward_std": 0.1720196634531021, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.918132209777832, "step": 6310 }, { "completion_length": 323.2, "completions/clipped_ratio": 0.0, "completions/max_length": 323.2, "completions/max_terminated_length": 323.2, "completions/mean_length": 113.34375, "completions/mean_terminated_length": 113.34375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.005715066598852099, "frac_reward_zero_std": 0.8, "grad_norm": 0.6187397241592407, "kl": 0.18975276788696646, "learning_rate": 4.51e-07, "loss": 0.0002, "num_tokens": 461198449.0, "reward": 0.259375, "reward_std": 0.16871010661125183, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9575673460960388, "step": 6315 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 112.47109375, "completions/mean_terminated_length": 112.47109375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.0057195915922003595, "frac_reward_zero_std": 0.79375, "grad_norm": 0.7381647229194641, "kl": 0.27239258112385867, "learning_rate": 4.513571428571428e-07, "loss": 0.0003, "num_tokens": 461541900.0, "reward": 0.1421875, "reward_std": 0.1854292184114456, "rewards/verify_chess_move/mean": 0.1421875, "rewards/verify_chess_move/std": 0.9818507075309754, "step": 6320 }, { "completion_length": 335.4, "completions/clipped_ratio": 0.0, "completions/max_length": 335.4, "completions/max_terminated_length": 335.4, "completions/mean_length": 111.40390625, "completions/mean_terminated_length": 111.40390625, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.00572411658554862, "frac_reward_zero_std": 0.825, "grad_norm": 0.8071832060813904, "kl": 1.0069227595813572, "learning_rate": 4.517142857142857e-07, "loss": 0.001, "num_tokens": 461882489.0, "reward": 0.2609375, "reward_std": 0.1517154335975647, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9650303840637207, "step": 6325 }, { "completion_length": 313.8, "completions/clipped_ratio": 0.0, "completions/max_length": 313.8, "completions/max_terminated_length": 313.8, "completions/mean_length": 108.05546875, "completions/mean_terminated_length": 108.05546875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.005728641578896879, "frac_reward_zero_std": 0.8, "grad_norm": 0.5055733919143677, "kl": 0.8249370935373008, "learning_rate": 4.5207142857142853e-07, "loss": 0.0008, "num_tokens": 462217888.0, "reward": 0.3859375, "reward_std": 0.1666591838002205, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.8943330764770507, "step": 6330 }, { "completion_length": 334.6, "completions/clipped_ratio": 0.0, "completions/max_length": 334.6, "completions/max_terminated_length": 334.6, "completions/mean_length": 121.89140625, "completions/mean_terminated_length": 121.89140625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005733166572245139, "frac_reward_zero_std": 0.8, "grad_norm": 0.8835436701774597, "kl": 0.2808745080139488, "learning_rate": 4.524285714285714e-07, "loss": 0.0003, "num_tokens": 462574757.0, "reward": 0.2734375, "reward_std": 0.1673431485891342, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9543909668922425, "step": 6335 }, { "completion_length": 304.4, "completions/clipped_ratio": 0.0, "completions/max_length": 304.4, "completions/max_terminated_length": 304.4, "completions/mean_length": 114.19296875, "completions/mean_terminated_length": 114.19296875, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.005737691565593399, "frac_reward_zero_std": 0.84375, "grad_norm": 0.3597356677055359, "kl": 0.15984016554430128, "learning_rate": 4.5278571428571426e-07, "loss": 0.0002, "num_tokens": 462922116.0, "reward": 0.2703125, "reward_std": 0.13824500739574433, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9601613402366638, "step": 6340 }, { "completion_length": 316.2, "completions/clipped_ratio": 0.0, "completions/max_length": 316.2, "completions/max_terminated_length": 316.2, "completions/mean_length": 106.56171875, "completions/mean_terminated_length": 106.56171875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.005742216558941658, "frac_reward_zero_std": 0.85625, "grad_norm": 0.3394058048725128, "kl": 0.0684131839661859, "learning_rate": 4.531428571428571e-07, "loss": 0.0001, "num_tokens": 463255731.0, "reward": 0.4328125, "reward_std": 0.1323512762784958, "rewards/verify_chess_move/mean": 0.4328125, "rewards/verify_chess_move/std": 0.8908271431922913, "step": 6345 }, { "completion_length": 312.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 106.76328125, "completions/mean_terminated_length": 106.76328125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.005746741552289918, "frac_reward_zero_std": 0.79375, "grad_norm": 1.198013186454773, "kl": 0.07545918107498437, "learning_rate": 4.535e-07, "loss": 0.0001, "num_tokens": 463590596.0, "reward": 0.275, "reward_std": 0.17838323414325713, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9523301720619202, "step": 6350 }, { "completion_length": 366.6, "completions/clipped_ratio": 0.0, "completions/max_length": 366.6, "completions/max_terminated_length": 366.6, "completions/mean_length": 112.690625, "completions/mean_terminated_length": 112.690625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.0057512665456381775, "frac_reward_zero_std": 0.81875, "grad_norm": 0.5983389616012573, "kl": 0.050997016858309506, "learning_rate": 4.5385714285714284e-07, "loss": 0.0001, "num_tokens": 463932736.0, "reward": 0.2828125, "reward_std": 0.1602337598800659, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9410215139389038, "step": 6355 }, { "completion_length": 333.8, "completions/clipped_ratio": 0.0, "completions/max_length": 333.8, "completions/max_terminated_length": 333.8, "completions/mean_length": 114.384375, "completions/mean_terminated_length": 114.384375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005755791538986438, "frac_reward_zero_std": 0.79375, "grad_norm": 0.24999627470970154, "kl": 0.06420010301517323, "learning_rate": 4.542142857142857e-07, "loss": 0.0001, "num_tokens": 464277988.0, "reward": 0.35, "reward_std": 0.179067200422287, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9357293128967286, "step": 6360 }, { "completion_length": 384.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 116.78984375, "completions/mean_terminated_length": 116.78984375, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005760316532334698, "frac_reward_zero_std": 0.81875, "grad_norm": 0.20940428972244263, "kl": 0.047227164707146586, "learning_rate": 4.5457142857142856e-07, "loss": 0.0, "num_tokens": 464627719.0, "reward": 0.3515625, "reward_std": 0.1570290207862854, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9356694340705871, "step": 6365 }, { "completion_length": 318.2, "completions/clipped_ratio": 0.0, "completions/max_length": 318.2, "completions/max_terminated_length": 318.2, "completions/mean_length": 106.08984375, "completions/mean_terminated_length": 106.08984375, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.005764841525682957, "frac_reward_zero_std": 0.80625, "grad_norm": 0.1588040292263031, "kl": 0.0413112664129585, "learning_rate": 4.5492857142857137e-07, "loss": 0.0, "num_tokens": 464961402.0, "reward": 0.2, "reward_std": 0.1666012540459633, "rewards/verify_chess_move/mean": 0.2, "rewards/verify_chess_move/std": 0.9686041355133057, "step": 6370 }, { "completion_length": 311.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 101.8453125, "completions/mean_terminated_length": 101.8453125, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005769366519031217, "frac_reward_zero_std": 0.8625, "grad_norm": 0.14092208445072174, "kl": 0.04642786156618968, "learning_rate": 4.552857142857143e-07, "loss": 0.0, "num_tokens": 465286732.0, "reward": 0.41875, "reward_std": 0.11994325816631317, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8748950362205505, "step": 6375 }, { "completion_length": 332.4, "completions/clipped_ratio": 0.0, "completions/max_length": 332.4, "completions/max_terminated_length": 332.4, "completions/mean_length": 96.2421875, "completions/mean_terminated_length": 96.2421875, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.005773891512379477, "frac_reward_zero_std": 0.75, "grad_norm": 0.33104562759399414, "kl": 0.09361393614672124, "learning_rate": 4.556428571428571e-07, "loss": 0.0001, "num_tokens": 465602874.0, "reward": 0.30625, "reward_std": 0.2195233851671219, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9497909545898438, "step": 6380 }, { "completion_length": 316.4, "completions/clipped_ratio": 0.0, "completions/max_length": 316.4, "completions/max_terminated_length": 316.4, "completions/mean_length": 107.20625, "completions/mean_terminated_length": 107.20625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.005778416505727736, "frac_reward_zero_std": 0.80625, "grad_norm": 1.3862662315368652, "kl": 0.41609750140924007, "learning_rate": 4.56e-07, "loss": 0.0004, "num_tokens": 465937362.0, "reward": 0.26875, "reward_std": 0.16134461164474487, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9529919385910034, "step": 6385 }, { "completion_length": 309.4, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/max_terminated_length": 309.4, "completions/mean_length": 108.00546875, "completions/mean_terminated_length": 108.00546875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0057829414990759965, "frac_reward_zero_std": 0.8625, "grad_norm": 1.1742604970932007, "kl": 0.24777134731411934, "learning_rate": 4.563571428571428e-07, "loss": 0.0002, "num_tokens": 466274937.0, "reward": 0.3078125, "reward_std": 0.11725681126117707, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9341504812240601, "step": 6390 }, { "completion_length": 349.2, "completions/clipped_ratio": 0.0, "completions/max_length": 349.2, "completions/max_terminated_length": 349.2, "completions/mean_length": 110.75, "completions/mean_terminated_length": 110.75, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.0057874664924242565, "frac_reward_zero_std": 0.725, "grad_norm": 0.6026498675346375, "kl": 0.2526411567581818, "learning_rate": 4.5671428571428573e-07, "loss": 0.0003, "num_tokens": 466616337.0, "reward": 0.2046875, "reward_std": 0.2383548617362976, "rewards/verify_chess_move/mean": 0.2046875, "rewards/verify_chess_move/std": 0.9600414991378784, "step": 6395 }, { "completion_length": 311.2, "completions/clipped_ratio": 0.0, "completions/max_length": 311.2, "completions/max_terminated_length": 311.2, "completions/mean_length": 106.778125, "completions/mean_terminated_length": 106.778125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.005791991485772516, "frac_reward_zero_std": 0.80625, "grad_norm": 0.7190770506858826, "kl": 0.37232308557722715, "learning_rate": 4.5707142857142854e-07, "loss": 0.0004, "num_tokens": 466950357.0, "reward": 0.296875, "reward_std": 0.16087378412485123, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9526082754135132, "step": 6400 }, { "completion_length": 320.6, "completions/clipped_ratio": 0.0, "completions/max_length": 320.6, "completions/max_terminated_length": 320.6, "completions/mean_length": 103.5640625, "completions/mean_terminated_length": 103.5640625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005796516479120776, "frac_reward_zero_std": 0.78125, "grad_norm": 0.37175634503364563, "kl": 0.4175604226533324, "learning_rate": 4.574285714285714e-07, "loss": 0.0004, "num_tokens": 467281151.0, "reward": 0.1953125, "reward_std": 0.19153413474559783, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9706009507179261, "step": 6405 }, { "completion_length": 355.4, "completions/clipped_ratio": 0.0, "completions/max_length": 355.4, "completions/max_terminated_length": 355.4, "completions/mean_length": 109.62734375, "completions/mean_terminated_length": 109.62734375, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.005801041472469035, "frac_reward_zero_std": 0.81875, "grad_norm": 1.2429362535476685, "kl": 0.37851718929596245, "learning_rate": 4.5778571428571426e-07, "loss": 0.0004, "num_tokens": 467619994.0, "reward": 0.3140625, "reward_std": 0.15592268407344817, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9480375289916992, "step": 6410 }, { "completion_length": 318.8, "completions/clipped_ratio": 0.0, "completions/max_length": 318.8, "completions/max_terminated_length": 318.8, "completions/mean_length": 111.39765625, "completions/mean_terminated_length": 111.39765625, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.005805566465817295, "frac_reward_zero_std": 0.8, "grad_norm": 1.8213390111923218, "kl": 1.960838482156396, "learning_rate": 4.581428571428571e-07, "loss": 0.002, "num_tokens": 467961807.0, "reward": 0.334375, "reward_std": 0.17648556232452392, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9331120848655701, "step": 6415 }, { "completion_length": 322.2, "completions/clipped_ratio": 0.0, "completions/max_length": 322.2, "completions/max_terminated_length": 322.2, "completions/mean_length": 104.53671875, "completions/mean_terminated_length": 104.53671875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005810091459165555, "frac_reward_zero_std": 0.8125, "grad_norm": 1.178388237953186, "kl": 2.120085969939828, "learning_rate": 4.585e-07, "loss": 0.0021, "num_tokens": 468291078.0, "reward": 0.3546875, "reward_std": 0.16166061013936997, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9317776799201966, "step": 6420 }, { "completion_length": 314.6, "completions/clipped_ratio": 0.0, "completions/max_length": 314.6, "completions/max_terminated_length": 314.6, "completions/mean_length": 108.40859375, "completions/mean_terminated_length": 108.40859375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.0058146164525138145, "frac_reward_zero_std": 0.81875, "grad_norm": 4.660996437072754, "kl": 1.8968099301680923, "learning_rate": 4.5885714285714284e-07, "loss": 0.0019, "num_tokens": 468627297.0, "reward": 0.415625, "reward_std": 0.16365066170692444, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.8704427123069763, "step": 6425 }, { "completion_length": 340.4, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/max_terminated_length": 340.4, "completions/mean_length": 107.21796875, "completions/mean_terminated_length": 107.21796875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005819141445862075, "frac_reward_zero_std": 0.8, "grad_norm": 0.9122756123542786, "kl": 0.8825910998973996, "learning_rate": 4.592142857142857e-07, "loss": 0.0009, "num_tokens": 468964384.0, "reward": 0.246875, "reward_std": 0.17191288471221924, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9533483624458313, "step": 6430 }, { "completion_length": 317.4, "completions/clipped_ratio": 0.0, "completions/max_length": 317.4, "completions/max_terminated_length": 317.4, "completions/mean_length": 111.13515625, "completions/mean_terminated_length": 111.13515625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.005823666439210335, "frac_reward_zero_std": 0.81875, "grad_norm": 0.8595311045646667, "kl": 0.16670511341653765, "learning_rate": 4.5957142857142857e-07, "loss": 0.0002, "num_tokens": 469306357.0, "reward": 0.3078125, "reward_std": 0.15886778235435486, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9469121694564819, "step": 6435 }, { "completion_length": 307.6, "completions/clipped_ratio": 0.0, "completions/max_length": 307.6, "completions/max_terminated_length": 307.6, "completions/mean_length": 110.26796875, "completions/mean_terminated_length": 110.26796875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005828191432558594, "frac_reward_zero_std": 0.8, "grad_norm": 2.1467463970184326, "kl": 0.6639680897351354, "learning_rate": 4.599285714285714e-07, "loss": 0.0007, "num_tokens": 469647084.0, "reward": 0.3203125, "reward_std": 0.1780656695365906, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9460046648979187, "step": 6440 }, { "completion_length": 304.8, "completions/clipped_ratio": 0.0, "completions/max_length": 304.8, "completions/max_terminated_length": 304.8, "completions/mean_length": 113.7359375, "completions/mean_terminated_length": 113.7359375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.005832716425906854, "frac_reward_zero_std": 0.83125, "grad_norm": 1.7316964864730835, "kl": 0.9222223362885416, "learning_rate": 4.602857142857143e-07, "loss": 0.0009, "num_tokens": 469994074.0, "reward": 0.240625, "reward_std": 0.1550714671611786, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9620809078216552, "step": 6445 }, { "completion_length": 292.2, "completions/clipped_ratio": 0.0, "completions/max_length": 292.2, "completions/max_terminated_length": 292.2, "completions/mean_length": 108.68828125, "completions/mean_terminated_length": 108.68828125, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005837241419255114, "frac_reward_zero_std": 0.81875, "grad_norm": 4.049631118774414, "kl": 2.1586183734238147, "learning_rate": 4.606428571428571e-07, "loss": 0.0022, "num_tokens": 470332443.0, "reward": 0.2453125, "reward_std": 0.15728866159915925, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9577132582664489, "step": 6450 }, { "completion_length": 307.8, "completions/clipped_ratio": 0.0, "completions/max_length": 307.8, "completions/max_terminated_length": 307.8, "completions/mean_length": 101.34453125, "completions/mean_terminated_length": 101.34453125, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.005841766412603373, "frac_reward_zero_std": 0.81875, "grad_norm": 1.8070669174194336, "kl": 2.939981071278453, "learning_rate": 4.61e-07, "loss": 0.0029, "num_tokens": 470657268.0, "reward": 0.378125, "reward_std": 0.1584444299340248, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.916647732257843, "step": 6455 }, { "completion_length": 311.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 110.75703125, "completions/mean_terminated_length": 110.75703125, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.005846291405951633, "frac_reward_zero_std": 0.8, "grad_norm": 3.2595162391662598, "kl": 1.133316872548312, "learning_rate": 4.613571428571428e-07, "loss": 0.0011, "num_tokens": 470997037.0, "reward": 0.375, "reward_std": 0.17281000316143036, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9215797066688538, "step": 6460 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 109.11015625, "completions/mean_terminated_length": 109.11015625, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.005850816399299893, "frac_reward_zero_std": 0.79375, "grad_norm": 0.9722123146057129, "kl": 0.34198707570321857, "learning_rate": 4.617142857142857e-07, "loss": 0.0003, "num_tokens": 471334938.0, "reward": 0.225, "reward_std": 0.17286890745162964, "rewards/verify_chess_move/mean": 0.225, "rewards/verify_chess_move/std": 0.9527810335159301, "step": 6465 }, { "completion_length": 288.8, "completions/clipped_ratio": 0.0, "completions/max_length": 288.8, "completions/max_terminated_length": 288.8, "completions/mean_length": 102.53046875, "completions/mean_terminated_length": 102.53046875, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.005855341392648153, "frac_reward_zero_std": 0.84375, "grad_norm": 28.87936019897461, "kl": 3.3312091626459734, "learning_rate": 4.6207142857142854e-07, "loss": 0.0033, "num_tokens": 471662769.0, "reward": 0.246875, "reward_std": 0.12731032744050025, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9659034967422485, "step": 6470 }, { "completion_length": 318.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 109.184375, "completions/mean_terminated_length": 109.184375, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.005859866385996413, "frac_reward_zero_std": 0.78125, "grad_norm": 2.2055044174194336, "kl": 0.3858732477994636, "learning_rate": 4.624285714285714e-07, "loss": 0.0004, "num_tokens": 472000621.0, "reward": 0.29375, "reward_std": 0.18354651033878328, "rewards/verify_chess_move/mean": 0.29375, "rewards/verify_chess_move/std": 0.9530279517173768, "step": 6475 }, { "completion_length": 367.2, "completions/clipped_ratio": 0.0, "completions/max_length": 367.2, "completions/max_terminated_length": 367.2, "completions/mean_length": 106.634375, "completions/mean_terminated_length": 106.634375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.005864391379344672, "frac_reward_zero_std": 0.81875, "grad_norm": 1.2262470722198486, "kl": 0.18059372780844568, "learning_rate": 4.6278571428571427e-07, "loss": 0.0002, "num_tokens": 472334697.0, "reward": 0.3703125, "reward_std": 0.15608737617731094, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9140125751495362, "step": 6480 }, { "completion_length": 308.4, "completions/clipped_ratio": 0.0, "completions/max_length": 308.4, "completions/max_terminated_length": 308.4, "completions/mean_length": 111.8921875, "completions/mean_terminated_length": 111.8921875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.005868916372692932, "frac_reward_zero_std": 0.8125, "grad_norm": 1.533917784690857, "kl": 0.2958272998803295, "learning_rate": 4.6314285714285713e-07, "loss": 0.0003, "num_tokens": 472676599.0, "reward": 0.3390625, "reward_std": 0.1634993702173233, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9360683798789978, "step": 6485 }, { "completion_length": 322.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 108.10546875, "completions/mean_terminated_length": 108.10546875, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.005873441366041192, "frac_reward_zero_std": 0.85625, "grad_norm": 1.4409852027893066, "kl": 0.1516541037824936, "learning_rate": 4.635e-07, "loss": 0.0002, "num_tokens": 473013142.0, "reward": 0.3125, "reward_std": 0.12052143663167954, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9452511548995972, "step": 6490 }, { "completion_length": 311.8, "completions/clipped_ratio": 0.0, "completions/max_length": 311.8, "completions/max_terminated_length": 311.8, "completions/mean_length": 107.7171875, "completions/mean_terminated_length": 107.7171875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.0058779663593894515, "frac_reward_zero_std": 0.85, "grad_norm": 1.5612444877624512, "kl": 0.33751588037703184, "learning_rate": 4.6385714285714285e-07, "loss": 0.0003, "num_tokens": 473349196.0, "reward": 0.2546875, "reward_std": 0.12899425998330116, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9642110586166381, "step": 6495 }, { "completion_length": 382.6, "completions/clipped_ratio": 0.0, "completions/max_length": 382.6, "completions/max_terminated_length": 382.6, "completions/mean_length": 110.56484375, "completions/mean_terminated_length": 110.56484375, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.0058824913527377115, "frac_reward_zero_std": 0.825, "grad_norm": 1.4389216899871826, "kl": 0.9230265039484948, "learning_rate": 4.6421428571428566e-07, "loss": 0.0009, "num_tokens": 473688975.0, "reward": 0.3578125, "reward_std": 0.15539099276065826, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9239814758300782, "step": 6500 }, { "completion_length": 375.8, "completions/clipped_ratio": 0.0, "completions/max_length": 375.8, "completions/max_terminated_length": 375.8, "completions/mean_length": 114.6296875, "completions/mean_terminated_length": 114.6296875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.005887016346085972, "frac_reward_zero_std": 0.825, "grad_norm": 1.45066499710083, "kl": 0.9263595268130302, "learning_rate": 4.6457142857142857e-07, "loss": 0.0009, "num_tokens": 474035797.0, "reward": 0.2546875, "reward_std": 0.14466846883296966, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9523215413093566, "step": 6505 }, { "completion_length": 296.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 98.9859375, "completions/mean_terminated_length": 98.9859375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005891541339434231, "frac_reward_zero_std": 0.83125, "grad_norm": 3.0237412452697754, "kl": 0.9993230825290084, "learning_rate": 4.649285714285714e-07, "loss": 0.001, "num_tokens": 474356395.0, "reward": 0.290625, "reward_std": 0.15118472576141356, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9480414509773254, "step": 6510 }, { "completion_length": 353.8, "completions/clipped_ratio": 0.0, "completions/max_length": 353.8, "completions/max_terminated_length": 353.8, "completions/mean_length": 108.54609375, "completions/mean_terminated_length": 108.54609375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.005896066332782491, "frac_reward_zero_std": 0.84375, "grad_norm": 0.7838315367698669, "kl": 0.7389762248843909, "learning_rate": 4.652857142857143e-07, "loss": 0.0007, "num_tokens": 474695702.0, "reward": 0.115625, "reward_std": 0.13251851946115495, "rewards/verify_chess_move/mean": 0.115625, "rewards/verify_chess_move/std": 0.9880701899528503, "step": 6515 }, { "completion_length": 330.4, "completions/clipped_ratio": 0.0, "completions/max_length": 330.4, "completions/max_terminated_length": 330.4, "completions/mean_length": 109.36484375, "completions/mean_terminated_length": 109.36484375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.00590059132613075, "frac_reward_zero_std": 0.8375, "grad_norm": 3.2090604305267334, "kl": 1.6476653423160315, "learning_rate": 4.656428571428571e-07, "loss": 0.0016, "num_tokens": 475033857.0, "reward": 0.371875, "reward_std": 0.1369854122400284, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9295262217521667, "step": 6520 }, { "completion_length": 340.2, "completions/clipped_ratio": 0.0, "completions/max_length": 340.2, "completions/max_terminated_length": 340.2, "completions/mean_length": 118.14296875, "completions/mean_terminated_length": 118.14296875, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.00590511631947901, "frac_reward_zero_std": 0.825, "grad_norm": 2.0204198360443115, "kl": 0.3347604683833197, "learning_rate": 4.66e-07, "loss": 0.0003, "num_tokens": 475384800.0, "reward": 0.2203125, "reward_std": 0.1467184193432331, "rewards/verify_chess_move/mean": 0.2203125, "rewards/verify_chess_move/std": 0.9717175841331482, "step": 6525 }, { "completion_length": 309.4, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/max_terminated_length": 309.4, "completions/mean_length": 106.10234375, "completions/mean_terminated_length": 106.10234375, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.00590964131282727, "frac_reward_zero_std": 0.7625, "grad_norm": 1.1225289106369019, "kl": 0.16426455706823617, "learning_rate": 4.663571428571428e-07, "loss": 0.0002, "num_tokens": 475717459.0, "reward": 0.2640625, "reward_std": 0.20363856852054596, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9574546456336975, "step": 6530 }, { "completion_length": 319.2, "completions/clipped_ratio": 0.0, "completions/max_length": 319.2, "completions/max_terminated_length": 319.2, "completions/mean_length": 118.290625, "completions/mean_terminated_length": 118.290625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.00591416630617553, "frac_reward_zero_std": 0.79375, "grad_norm": 2.152939796447754, "kl": 0.19880851097404956, "learning_rate": 4.667142857142857e-07, "loss": 0.0002, "num_tokens": 476072919.0, "reward": 0.171875, "reward_std": 0.17927839159965514, "rewards/verify_chess_move/mean": 0.171875, "rewards/verify_chess_move/std": 0.9791166067123414, "step": 6535 }, { "completion_length": 313.2, "completions/clipped_ratio": 0.0, "completions/max_length": 313.2, "completions/max_terminated_length": 313.2, "completions/mean_length": 109.37734375, "completions/mean_terminated_length": 109.37734375, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.00591869129952379, "frac_reward_zero_std": 0.80625, "grad_norm": 2.563262939453125, "kl": 0.8339549669180997, "learning_rate": 4.6707142857142855e-07, "loss": 0.0008, "num_tokens": 476412490.0, "reward": 0.178125, "reward_std": 0.1731734737753868, "rewards/verify_chess_move/mean": 0.178125, "rewards/verify_chess_move/std": 0.9811493277549743, "step": 6540 }, { "completion_length": 373.6, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.00592321629287205, "frac_reward_zero_std": 0.8, "grad_norm": 2.4927399158477783, "kl": 2.4624992035329343, "learning_rate": 4.674285714285714e-07, "loss": 0.0025, "num_tokens": 476753338.0, "reward": 0.315625, "reward_std": 0.16008598804473878, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.944866132736206, "step": 6545 }, { "completion_length": 328.4, "completions/clipped_ratio": 0.0, "completions/max_length": 328.4, "completions/max_terminated_length": 328.4, "completions/mean_length": 106.69921875, "completions/mean_terminated_length": 106.69921875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005927741286220309, "frac_reward_zero_std": 0.80625, "grad_norm": 4.530456066131592, "kl": 6.15538963675499, "learning_rate": 4.6778571428571427e-07, "loss": 0.0062, "num_tokens": 477086673.0, "reward": 0.375, "reward_std": 0.16681048572063445, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9244413614273072, "step": 6550 }, { "completion_length": 306.6, "completions/clipped_ratio": 0.0, "completions/max_length": 306.6, "completions/max_terminated_length": 306.6, "completions/mean_length": 106.99453125, "completions/mean_terminated_length": 106.99453125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.005932266279568569, "frac_reward_zero_std": 0.825, "grad_norm": 5.3511271476745605, "kl": 4.428081849217415, "learning_rate": 4.6814285714285713e-07, "loss": 0.0044, "num_tokens": 477422794.0, "reward": 0.2546875, "reward_std": 0.15197506844997405, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9633381128311157, "step": 6555 }, { "completion_length": 318.4, "completions/clipped_ratio": 0.0, "completions/max_length": 318.4, "completions/max_terminated_length": 318.4, "completions/mean_length": 107.50390625, "completions/mean_terminated_length": 107.50390625, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.005936791272916829, "frac_reward_zero_std": 0.8, "grad_norm": 2.1672580242156982, "kl": 3.2379998723976313, "learning_rate": 4.685e-07, "loss": 0.0032, "num_tokens": 477759351.0, "reward": 0.365625, "reward_std": 0.16807459592819213, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9241964817047119, "step": 6560 }, { "completion_length": 324.4, "completions/clipped_ratio": 0.0, "completions/max_length": 324.4, "completions/max_terminated_length": 324.4, "completions/mean_length": 106.07890625, "completions/mean_terminated_length": 106.07890625, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.005941316266265088, "frac_reward_zero_std": 0.825, "grad_norm": 3.742161273956299, "kl": 1.1095193377230317, "learning_rate": 4.6885714285714285e-07, "loss": 0.0011, "num_tokens": 478093068.0, "reward": 0.3453125, "reward_std": 0.1526570737361908, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9308823823928833, "step": 6565 }, { "completion_length": 379.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 379.4, "completions/max_terminated_length": 289.0, "completions/mean_length": 99.35390625, "completions/mean_terminated_length": 98.82877807617187, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0059458412596133485, "frac_reward_zero_std": 0.8375, "grad_norm": 2.2552638053894043, "kl": 0.3208473498641979, "learning_rate": 4.6921428571428566e-07, "loss": 0.0003, "num_tokens": 478414737.0, "reward": 0.2703125, "reward_std": 0.14193298816680908, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.960320794582367, "step": 6570 }, { "completion_length": 316.2, "completions/clipped_ratio": 0.0, "completions/max_length": 316.2, "completions/max_terminated_length": 316.2, "completions/mean_length": 101.6578125, "completions/mean_terminated_length": 101.6578125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.005950366252961608, "frac_reward_zero_std": 0.8125, "grad_norm": 1.0132445096969604, "kl": 0.32441530376672745, "learning_rate": 4.695714285714286e-07, "loss": 0.0003, "num_tokens": 478738707.0, "reward": 0.44375, "reward_std": 0.16217889934778212, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8851999640464783, "step": 6575 }, { "completion_length": 319.4, "completions/clipped_ratio": 0.0, "completions/max_length": 319.4, "completions/max_terminated_length": 319.4, "completions/mean_length": 110.13125, "completions/mean_terminated_length": 110.13125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.005954891246309868, "frac_reward_zero_std": 0.8, "grad_norm": 4.87206506729126, "kl": 1.1152601568610407, "learning_rate": 4.699285714285714e-07, "loss": 0.0011, "num_tokens": 479077387.0, "reward": 0.2125, "reward_std": 0.1741759791970253, "rewards/verify_chess_move/mean": 0.2125, "rewards/verify_chess_move/std": 0.9776837229728699, "step": 6580 }, { "completion_length": 297.8, "completions/clipped_ratio": 0.0, "completions/max_length": 297.8, "completions/max_terminated_length": 297.8, "completions/mean_length": 113.25, "completions/mean_terminated_length": 113.25, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.005959416239658128, "frac_reward_zero_std": 0.825, "grad_norm": 7.916528701782227, "kl": 0.7329142925795168, "learning_rate": 4.702857142857143e-07, "loss": 0.0007, "num_tokens": 479424067.0, "reward": 0.1140625, "reward_std": 0.1514557957649231, "rewards/verify_chess_move/mean": 0.1140625, "rewards/verify_chess_move/std": 0.9877749085426331, "step": 6585 }, { "completion_length": 291.6, "completions/clipped_ratio": 0.0, "completions/max_length": 291.6, "completions/max_terminated_length": 291.6, "completions/mean_length": 106.62734375, "completions/mean_terminated_length": 106.62734375, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.005963941233006387, "frac_reward_zero_std": 0.825, "grad_norm": 2.142200469970703, "kl": 1.2045604471350089, "learning_rate": 4.706428571428571e-07, "loss": 0.0012, "num_tokens": 479759870.0, "reward": 0.2796875, "reward_std": 0.14719120413064957, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9598093748092651, "step": 6590 }, { "completion_length": 316.6, "completions/clipped_ratio": 0.0, "completions/max_length": 316.6, "completions/max_terminated_length": 316.6, "completions/mean_length": 104.5640625, "completions/mean_terminated_length": 104.5640625, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.005968466226354647, "frac_reward_zero_std": 0.86875, "grad_norm": 1.0515679121017456, "kl": 0.12469248475972564, "learning_rate": 4.7099999999999997e-07, "loss": 0.0001, "num_tokens": 480093608.0, "reward": 0.209375, "reward_std": 0.11078744679689408, "rewards/verify_chess_move/mean": 0.209375, "rewards/verify_chess_move/std": 0.9696079730987549, "step": 6595 }, { "completion_length": 347.2, "completions/clipped_ratio": 0.0, "completions/max_length": 347.2, "completions/max_terminated_length": 347.2, "completions/mean_length": 99.5890625, "completions/mean_terminated_length": 99.5890625, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.005972991219702907, "frac_reward_zero_std": 0.84375, "grad_norm": 3.0074634552001953, "kl": 0.4894173661014065, "learning_rate": 4.7135714285714283e-07, "loss": 0.0005, "num_tokens": 480417370.0, "reward": 0.278125, "reward_std": 0.13346212804317475, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9398550748825073, "step": 6600 }, { "completion_length": 373.4, "completions/clipped_ratio": 0.0, "completions/max_length": 373.4, "completions/max_terminated_length": 373.4, "completions/mean_length": 108.7125, "completions/mean_terminated_length": 108.7125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.0059775162130511665, "frac_reward_zero_std": 0.80625, "grad_norm": 5.004578590393066, "kl": 0.05191578592639416, "learning_rate": 4.717142857142857e-07, "loss": 0.0001, "num_tokens": 480754306.0, "reward": 0.4421875, "reward_std": 0.1667649745941162, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8813915371894836, "step": 6605 }, { "completion_length": 314.6, "completions/clipped_ratio": 0.0, "completions/max_length": 314.6, "completions/max_terminated_length": 314.6, "completions/mean_length": 105.290625, "completions/mean_terminated_length": 105.290625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.005982041206399427, "frac_reward_zero_std": 0.8625, "grad_norm": 0.10673943907022476, "kl": 0.06928095065522939, "learning_rate": 4.7207142857142855e-07, "loss": 0.0001, "num_tokens": 481086182.0, "reward": 0.4234375, "reward_std": 0.12545856200158595, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8935447335243225, "step": 6610 }, { "completion_length": 313.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 99.55390625, "completions/mean_terminated_length": 99.55390625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.005986566199747687, "frac_reward_zero_std": 0.8375, "grad_norm": 0.21308882534503937, "kl": 0.04397921351483092, "learning_rate": 4.724285714285714e-07, "loss": 0.0, "num_tokens": 481409243.0, "reward": 0.346875, "reward_std": 0.13993051499128342, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9297659397125244, "step": 6615 }, { "completion_length": 393.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 110.16328125, "completions/mean_terminated_length": 110.16328125, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.005991091193095946, "frac_reward_zero_std": 0.83125, "grad_norm": 2.1146605014801025, "kl": 0.041752621735213324, "learning_rate": 4.727857142857143e-07, "loss": 0.0, "num_tokens": 481747908.0, "reward": 0.2625, "reward_std": 0.14939245209097862, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9633673667907715, "step": 6620 }, { "completion_length": 319.4, "completions/clipped_ratio": 0.0, "completions/max_length": 319.4, "completions/max_terminated_length": 319.4, "completions/mean_length": 101.621875, "completions/mean_terminated_length": 101.621875, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.005995616186444206, "frac_reward_zero_std": 0.75, "grad_norm": 0.784298300743103, "kl": 0.10671058879233897, "learning_rate": 4.7314285714285714e-07, "loss": 0.0001, "num_tokens": 482073032.0, "reward": 0.28125, "reward_std": 0.22409606128931045, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9377794981002807, "step": 6625 }, { "completion_length": 329.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 107.540625, "completions/mean_terminated_length": 107.540625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.006000141179792465, "frac_reward_zero_std": 0.8625, "grad_norm": 0.11154945939779282, "kl": 0.050689136050641535, "learning_rate": 4.7349999999999995e-07, "loss": 0.0001, "num_tokens": 482409748.0, "reward": 0.378125, "reward_std": 0.12340763360261917, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9009745836257934, "step": 6630 }, { "completion_length": 343.6, "completions/clipped_ratio": 0.0, "completions/max_length": 343.6, "completions/max_terminated_length": 343.6, "completions/mean_length": 113.975, "completions/mean_terminated_length": 113.975, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.006004666173140725, "frac_reward_zero_std": 0.775, "grad_norm": 1.3442765474319458, "kl": 0.04029020932503045, "learning_rate": 4.7385714285714286e-07, "loss": 0.0, "num_tokens": 482757108.0, "reward": 0.29375, "reward_std": 0.186858606338501, "rewards/verify_chess_move/mean": 0.29375, "rewards/verify_chess_move/std": 0.946133017539978, "step": 6635 }, { "completion_length": 306.4, "completions/clipped_ratio": 0.0, "completions/max_length": 306.4, "completions/max_terminated_length": 306.4, "completions/mean_length": 99.8265625, "completions/mean_terminated_length": 99.8265625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.0060091911664889855, "frac_reward_zero_std": 0.8625, "grad_norm": 0.7469204068183899, "kl": 0.05124911146704107, "learning_rate": 4.7421428571428567e-07, "loss": 0.0001, "num_tokens": 483080574.0, "reward": 0.5484375, "reward_std": 0.10495713129639625, "rewards/verify_chess_move/mean": 0.5484375, "rewards/verify_chess_move/std": 0.8313227891921997, "step": 6640 }, { "completion_length": 317.4, "completions/clipped_ratio": 0.0, "completions/max_length": 317.4, "completions/max_terminated_length": 317.4, "completions/mean_length": 108.28671875, "completions/mean_terminated_length": 108.28671875, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.006013716159837245, "frac_reward_zero_std": 0.86875, "grad_norm": 0.12649710476398468, "kl": 0.043570594151969996, "learning_rate": 4.745714285714286e-07, "loss": 0.0, "num_tokens": 483419693.0, "reward": 0.221875, "reward_std": 0.11505203992128372, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9637247681617737, "step": 6645 }, { "completion_length": 317.2, "completions/clipped_ratio": 0.0, "completions/max_length": 317.2, "completions/max_terminated_length": 317.2, "completions/mean_length": 104.22265625, "completions/mean_terminated_length": 104.22265625, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.006018241153185505, "frac_reward_zero_std": 0.85, "grad_norm": 0.09132395684719086, "kl": 0.04331835970515385, "learning_rate": 4.749285714285714e-07, "loss": 0.0, "num_tokens": 483748562.0, "reward": 0.2484375, "reward_std": 0.12861837893724443, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9675808668136596, "step": 6650 }, { "completion_length": 308.8, "completions/clipped_ratio": 0.0, "completions/max_length": 308.8, "completions/max_terminated_length": 308.8, "completions/mean_length": 109.35859375, "completions/mean_terminated_length": 109.35859375, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.006022766146533765, "frac_reward_zero_std": 0.75, "grad_norm": 0.27702462673187256, "kl": 0.04789708080934361, "learning_rate": 4.752857142857143e-07, "loss": 0.0, "num_tokens": 484086965.0, "reward": 0.31875, "reward_std": 0.21294921040534973, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9364002466201782, "step": 6655 }, { "completion_length": 326.6, "completions/clipped_ratio": 0.0, "completions/max_length": 326.6, "completions/max_terminated_length": 326.6, "completions/mean_length": 112.8484375, "completions/mean_terminated_length": 112.8484375, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.006027291139882024, "frac_reward_zero_std": 0.78125, "grad_norm": 1.9579088687896729, "kl": 0.044263141840929165, "learning_rate": 4.756428571428571e-07, "loss": 0.0, "num_tokens": 484431667.0, "reward": 0.3171875, "reward_std": 0.19584717154502868, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9349321842193603, "step": 6660 }, { "completion_length": 314.4, "completions/clipped_ratio": 0.0, "completions/max_length": 314.4, "completions/max_terminated_length": 314.4, "completions/mean_length": 114.86796875, "completions/mean_terminated_length": 114.86796875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006031816133230284, "frac_reward_zero_std": 0.84375, "grad_norm": 0.21251735091209412, "kl": 0.04721386830788106, "learning_rate": 4.76e-07, "loss": 0.0, "num_tokens": 484780978.0, "reward": 0.2421875, "reward_std": 0.1427692398428917, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9687859416007996, "step": 6665 }, { "completion_length": 287.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 105.734375, "completions/mean_terminated_length": 105.734375, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.006036341126578544, "frac_reward_zero_std": 0.825, "grad_norm": 0.1071147695183754, "kl": 0.0912553358823061, "learning_rate": 4.7635714285714284e-07, "loss": 0.0001, "num_tokens": 485116222.0, "reward": 0.2625, "reward_std": 0.1437742903828621, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9418704271316528, "step": 6670 }, { "completion_length": 380.2, "completions/clipped_ratio": 0.0, "completions/max_length": 380.2, "completions/max_terminated_length": 380.2, "completions/mean_length": 113.640625, "completions/mean_terminated_length": 113.640625, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.0060408661199268035, "frac_reward_zero_std": 0.875, "grad_norm": 0.06180679798126221, "kl": 0.05909676292794756, "learning_rate": 4.767142857142857e-07, "loss": 0.0001, "num_tokens": 485465106.0, "reward": 0.3, "reward_std": 0.10542540587484836, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9384746193885803, "step": 6675 }, { "completion_length": 303.2, "completions/clipped_ratio": 0.0, "completions/max_length": 303.2, "completions/max_terminated_length": 303.2, "completions/mean_length": 108.07734375, "completions/mean_terminated_length": 108.07734375, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.006045391113275064, "frac_reward_zero_std": 0.81875, "grad_norm": 0.1194891557097435, "kl": 0.046789288718719034, "learning_rate": 4.770714285714286e-07, "loss": 0.0, "num_tokens": 485802445.0, "reward": 0.2140625, "reward_std": 0.15408392250537872, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.9768516182899475, "step": 6680 }, { "completion_length": 314.8, "completions/clipped_ratio": 0.0, "completions/max_length": 314.8, "completions/max_terminated_length": 314.8, "completions/mean_length": 113.5359375, "completions/mean_terminated_length": 113.5359375, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.006049916106623324, "frac_reward_zero_std": 0.825, "grad_norm": 0.10479214787483215, "kl": 0.05740212476812303, "learning_rate": 4.774285714285714e-07, "loss": 0.0001, "num_tokens": 486147115.0, "reward": 0.2140625, "reward_std": 0.1608103707432747, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.963892662525177, "step": 6685 }, { "completion_length": 313.6, "completions/clipped_ratio": 0.0, "completions/max_length": 313.6, "completions/max_terminated_length": 313.6, "completions/mean_length": 114.20703125, "completions/mean_terminated_length": 114.20703125, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "epoch": 0.006054441099971583, "frac_reward_zero_std": 0.76875, "grad_norm": 0.15578901767730713, "kl": 0.052604539413005115, "learning_rate": 4.777857142857142e-07, "loss": 0.0001, "num_tokens": 486494500.0, "reward": 0.3125, "reward_std": 0.19580127000808717, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9459836840629577, "step": 6690 }, { "completion_length": 298.6, "completions/clipped_ratio": 0.0, "completions/max_length": 298.6, "completions/max_terminated_length": 298.6, "completions/mean_length": 105.39609375, "completions/mean_terminated_length": 105.39609375, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.006058966093319843, "frac_reward_zero_std": 0.81875, "grad_norm": 0.13778984546661377, "kl": 0.08668104589451105, "learning_rate": 4.781428571428571e-07, "loss": 0.0001, "num_tokens": 486827567.0, "reward": 0.3046875, "reward_std": 0.15455670654773712, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9462104916572571, "step": 6695 }, { "completion_length": 326.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 108.6015625, "completions/mean_terminated_length": 108.6015625, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.006063491086668102, "frac_reward_zero_std": 0.76875, "grad_norm": 0.14871086180210114, "kl": 0.04408787393476814, "learning_rate": 4.785e-07, "loss": 0.0, "num_tokens": 487165921.0, "reward": 0.2875, "reward_std": 0.20447384119033812, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9537473320960999, "step": 6700 }, { "completion_length": 311.2, "completions/clipped_ratio": 0.0, "completions/max_length": 311.2, "completions/max_terminated_length": 311.2, "completions/mean_length": 97.9421875, "completions/mean_terminated_length": 97.9421875, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006068016080016362, "frac_reward_zero_std": 0.7875, "grad_norm": 0.151204451918602, "kl": 0.046676095912698654, "learning_rate": 4.788571428571429e-07, "loss": 0.0, "num_tokens": 487486599.0, "reward": 0.378125, "reward_std": 0.18796337246894837, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.920851719379425, "step": 6705 }, { "completion_length": 306.6, "completions/clipped_ratio": 0.0, "completions/max_length": 306.6, "completions/max_terminated_length": 306.6, "completions/mean_length": 101.1796875, "completions/mean_terminated_length": 101.1796875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.006072541073364622, "frac_reward_zero_std": 0.85, "grad_norm": 0.17083479464054108, "kl": 0.05834276617970317, "learning_rate": 4.792142857142857e-07, "loss": 0.0001, "num_tokens": 487810837.0, "reward": 0.35, "reward_std": 0.12767575383186341, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9281953692436218, "step": 6710 }, { "completion_length": 302.8, "completions/clipped_ratio": 0.0, "completions/max_length": 302.8, "completions/max_terminated_length": 302.8, "completions/mean_length": 104.3734375, "completions/mean_terminated_length": 104.3734375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006077066066712882, "frac_reward_zero_std": 0.8375, "grad_norm": 0.125014066696167, "kl": 0.07403516673948615, "learning_rate": 4.795714285714286e-07, "loss": 0.0001, "num_tokens": 488141563.0, "reward": 0.440625, "reward_std": 0.12899680882692338, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8908135056495666, "step": 6715 }, { "completion_length": 310.2, "completions/clipped_ratio": 0.0, "completions/max_length": 310.2, "completions/max_terminated_length": 310.2, "completions/mean_length": 108.49296875, "completions/mean_terminated_length": 108.49296875, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.006081591060061142, "frac_reward_zero_std": 0.81875, "grad_norm": 0.1371513158082962, "kl": 0.04811094378819689, "learning_rate": 4.799285714285714e-07, "loss": 0.0, "num_tokens": 488479538.0, "reward": 0.346875, "reward_std": 0.15519124120473862, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9285010814666748, "step": 6720 }, { "completion_length": 329.6, "completions/clipped_ratio": 0.0, "completions/max_length": 329.6, "completions/max_terminated_length": 329.6, "completions/mean_length": 113.075, "completions/mean_terminated_length": 113.075, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.006086116053409402, "frac_reward_zero_std": 0.81875, "grad_norm": 1.5374782085418701, "kl": 0.20046173024456948, "learning_rate": 4.802857142857142e-07, "loss": 0.0002, "num_tokens": 488825706.0, "reward": 0.2109375, "reward_std": 0.15545186251401902, "rewards/verify_chess_move/mean": 0.2109375, "rewards/verify_chess_move/std": 0.96816086769104, "step": 6725 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 118.86796875, "completions/mean_terminated_length": 118.86796875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006090641046757661, "frac_reward_zero_std": 0.825, "grad_norm": 3.9581198692321777, "kl": 0.08898530079750344, "learning_rate": 4.806428571428571e-07, "loss": 0.0001, "num_tokens": 489181105.0, "reward": 0.240625, "reward_std": 0.1482985258102417, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9490713715553284, "step": 6730 }, { "completion_length": 396.6, "completions/clipped_ratio": 0.0, "completions/max_length": 396.6, "completions/max_terminated_length": 396.6, "completions/mean_length": 104.640625, "completions/mean_terminated_length": 104.640625, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.006095166040105921, "frac_reward_zero_std": 0.81875, "grad_norm": 0.6766212582588196, "kl": 8.53782332969131, "learning_rate": 4.809999999999999e-07, "loss": 0.0085, "num_tokens": 489513917.0, "reward": 0.2640625, "reward_std": 0.15933860838413239, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9594445586204529, "step": 6735 }, { "completion_length": 311.8, "completions/clipped_ratio": 0.0, "completions/max_length": 311.8, "completions/max_terminated_length": 311.8, "completions/mean_length": 106.8390625, "completions/mean_terminated_length": 106.8390625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.006099691033454181, "frac_reward_zero_std": 0.8375, "grad_norm": 5.881171703338623, "kl": 5.974071641743649, "learning_rate": 4.813571428571428e-07, "loss": 0.006, "num_tokens": 489849719.0, "reward": 0.25625, "reward_std": 0.1374097429215908, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9582487106323242, "step": 6740 }, { "completion_length": 310.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 111.12109375, "completions/mean_terminated_length": 111.12109375, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.0061042160268024405, "frac_reward_zero_std": 0.8375, "grad_norm": 0.559802770614624, "kl": 0.3630591748515144, "learning_rate": 4.817142857142856e-07, "loss": 0.0004, "num_tokens": 490191706.0, "reward": 0.3078125, "reward_std": 0.1446669042110443, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.947809386253357, "step": 6745 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 118.1765625, "completions/mean_terminated_length": 118.1765625, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.0061087410201507005, "frac_reward_zero_std": 0.79375, "grad_norm": 1.1437461376190186, "kl": 0.22999378308886662, "learning_rate": 4.820714285714286e-07, "loss": 0.0002, "num_tokens": 490546388.0, "reward": 0.3890625, "reward_std": 0.176545450091362, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9213243722915649, "step": 6750 }, { "completion_length": 301.4, "completions/clipped_ratio": 0.0, "completions/max_length": 301.4, "completions/max_terminated_length": 301.4, "completions/mean_length": 107.16953125, "completions/mean_terminated_length": 107.16953125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.00611326601349896, "frac_reward_zero_std": 0.8375, "grad_norm": 0.11772819608449936, "kl": 0.09830226802732796, "learning_rate": 4.824285714285714e-07, "loss": 0.0001, "num_tokens": 490883765.0, "reward": 0.16875, "reward_std": 0.14355958849191666, "rewards/verify_chess_move/mean": 0.16875, "rewards/verify_chess_move/std": 0.9667293548583984, "step": 6755 }, { "completion_length": 313.4, "completions/clipped_ratio": 0.0, "completions/max_length": 313.4, "completions/max_terminated_length": 313.4, "completions/mean_length": 97.828125, "completions/mean_terminated_length": 97.828125, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.00611779100684722, "frac_reward_zero_std": 0.80625, "grad_norm": 0.17230482399463654, "kl": 0.042880113259889184, "learning_rate": 4.827857142857143e-07, "loss": 0.0, "num_tokens": 491205969.0, "reward": 0.346875, "reward_std": 0.1636077031493187, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9252397537231445, "step": 6760 }, { "completion_length": 339.2, "completions/clipped_ratio": 0.0, "completions/max_length": 339.2, "completions/max_terminated_length": 339.2, "completions/mean_length": 110.478125, "completions/mean_terminated_length": 110.478125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.00612231600019548, "frac_reward_zero_std": 0.79375, "grad_norm": 0.23342154920101166, "kl": 0.06648691084701568, "learning_rate": 4.831428571428571e-07, "loss": 0.0001, "num_tokens": 491547349.0, "reward": 0.2015625, "reward_std": 0.1701844274997711, "rewards/verify_chess_move/mean": 0.2015625, "rewards/verify_chess_move/std": 0.980120837688446, "step": 6765 }, { "completion_length": 314.2, "completions/clipped_ratio": 0.0, "completions/max_length": 314.2, "completions/max_terminated_length": 314.2, "completions/mean_length": 117.7421875, "completions/mean_terminated_length": 117.7421875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.006126840993543739, "frac_reward_zero_std": 0.84375, "grad_norm": 0.1536736786365509, "kl": 0.3079793485114351, "learning_rate": 4.835e-07, "loss": 0.0003, "num_tokens": 491898979.0, "reward": 0.41875, "reward_std": 0.13098785392940043, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8931564569473267, "step": 6770 }, { "completion_length": 298.4, "completions/clipped_ratio": 0.0, "completions/max_length": 298.4, "completions/max_terminated_length": 298.4, "completions/mean_length": 101.5984375, "completions/mean_terminated_length": 101.5984375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.006131365986891999, "frac_reward_zero_std": 0.86875, "grad_norm": 0.5394254922866821, "kl": 0.09545745089417323, "learning_rate": 4.838571428571428e-07, "loss": 0.0001, "num_tokens": 492224217.0, "reward": 0.415625, "reward_std": 0.10668755397200584, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.867226243019104, "step": 6775 }, { "completion_length": 310.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 110.8421875, "completions/mean_terminated_length": 110.8421875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.006135890980240259, "frac_reward_zero_std": 0.78125, "grad_norm": 1.4109646081924438, "kl": 0.17833672516280785, "learning_rate": 4.842142857142857e-07, "loss": 0.0002, "num_tokens": 492566351.0, "reward": 0.23125, "reward_std": 0.18380614668130874, "rewards/verify_chess_move/mean": 0.23125, "rewards/verify_chess_move/std": 0.9675458550453186, "step": 6780 }, { "completion_length": 315.4, "completions/clipped_ratio": 0.0, "completions/max_length": 315.4, "completions/max_terminated_length": 315.4, "completions/mean_length": 111.396875, "completions/mean_terminated_length": 111.396875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.006140415973588519, "frac_reward_zero_std": 0.825, "grad_norm": 3.0622000694274902, "kl": 0.3758846786804497, "learning_rate": 4.845714285714285e-07, "loss": 0.0004, "num_tokens": 492908475.0, "reward": 0.25, "reward_std": 0.15355320572853087, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9613080620765686, "step": 6785 }, { "completion_length": 292.8, "completions/clipped_ratio": 0.0, "completions/max_length": 292.8, "completions/max_terminated_length": 292.8, "completions/mean_length": 107.1859375, "completions/mean_terminated_length": 107.1859375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.006144940966936779, "frac_reward_zero_std": 0.8375, "grad_norm": 3.358180284500122, "kl": 0.10165965075138957, "learning_rate": 4.849285714285715e-07, "loss": 0.0001, "num_tokens": 493244073.0, "reward": 0.2671875, "reward_std": 0.1442425712943077, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.9650529384613037, "step": 6790 }, { "completion_length": 311.4, "completions/clipped_ratio": 0.0, "completions/max_length": 311.4, "completions/max_terminated_length": 311.4, "completions/mean_length": 108.60390625, "completions/mean_terminated_length": 108.60390625, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006149465960285039, "frac_reward_zero_std": 0.83125, "grad_norm": 1.888641119003296, "kl": 0.21012805437203497, "learning_rate": 4.852857142857143e-07, "loss": 0.0002, "num_tokens": 493583230.0, "reward": 0.3328125, "reward_std": 0.1418291598558426, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9312483429908752, "step": 6795 }, { "completion_length": 322.6, "completions/clipped_ratio": 0.0, "completions/max_length": 322.6, "completions/max_terminated_length": 322.6, "completions/mean_length": 115.74453125, "completions/mean_terminated_length": 115.74453125, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.006153990953633298, "frac_reward_zero_std": 0.79375, "grad_norm": 2.0155446529388428, "kl": 0.29991715144133196, "learning_rate": 4.856428571428572e-07, "loss": 0.0003, "num_tokens": 493934655.0, "reward": 0.3109375, "reward_std": 0.1760746270418167, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9476861238479615, "step": 6800 }, { "completion_length": 289.2, "completions/clipped_ratio": 0.0, "completions/max_length": 289.2, "completions/max_terminated_length": 289.2, "completions/mean_length": 105.95390625, "completions/mean_terminated_length": 105.95390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.006158515946981558, "frac_reward_zero_std": 0.8125, "grad_norm": 2.1698477268218994, "kl": 0.4111902234260924, "learning_rate": 4.86e-07, "loss": 0.0004, "num_tokens": 494269644.0, "reward": 0.2203125, "reward_std": 0.16307503283023833, "rewards/verify_chess_move/mean": 0.2203125, "rewards/verify_chess_move/std": 0.9736318349838257, "step": 6805 }, { "completion_length": 309.6, "completions/clipped_ratio": 0.0, "completions/max_length": 309.6, "completions/max_terminated_length": 309.6, "completions/mean_length": 106.0828125, "completions/mean_terminated_length": 106.0828125, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.006163040940329817, "frac_reward_zero_std": 0.8, "grad_norm": 0.5628890991210938, "kl": 0.1667187025072053, "learning_rate": 4.863571428571429e-07, "loss": 0.0002, "num_tokens": 494603942.0, "reward": 0.2515625, "reward_std": 0.17528330385684968, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9653377294540405, "step": 6810 }, { "completion_length": 344.6, "completions/clipped_ratio": 0.0, "completions/max_length": 344.6, "completions/max_terminated_length": 344.6, "completions/mean_length": 97.8125, "completions/mean_terminated_length": 97.8125, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.006167565933678077, "frac_reward_zero_std": 0.81875, "grad_norm": 1.1851537227630615, "kl": 0.050806105934316295, "learning_rate": 4.867142857142857e-07, "loss": 0.0001, "num_tokens": 494926358.0, "reward": 0.2453125, "reward_std": 0.14908887147903443, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9671981334686279, "step": 6815 }, { "completion_length": 294.2, "completions/clipped_ratio": 0.0, "completions/max_length": 294.2, "completions/max_terminated_length": 294.2, "completions/mean_length": 105.809375, "completions/mean_terminated_length": 105.809375, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.0061720909270263375, "frac_reward_zero_std": 0.8625, "grad_norm": 1.1418408155441284, "kl": 0.051950054406188426, "learning_rate": 4.870714285714285e-07, "loss": 0.0001, "num_tokens": 495259386.0, "reward": 0.3265625, "reward_std": 0.11520686745643616, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9380110979080201, "step": 6820 }, { "completion_length": 343.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 112.7796875, "completions/mean_terminated_length": 112.7796875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.006176615920374597, "frac_reward_zero_std": 0.78125, "grad_norm": 3.754610300064087, "kl": 0.3457081829023082, "learning_rate": 4.874285714285714e-07, "loss": 0.0003, "num_tokens": 495603704.0, "reward": 0.2890625, "reward_std": 0.19311325401067733, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.950116777420044, "step": 6825 }, { "completion_length": 304.4, "completions/clipped_ratio": 0.0, "completions/max_length": 304.4, "completions/max_terminated_length": 304.4, "completions/mean_length": 109.78671875, "completions/mean_terminated_length": 109.78671875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006181140913722857, "frac_reward_zero_std": 0.825, "grad_norm": 1.236161470413208, "kl": 0.1435714080929756, "learning_rate": 4.877857142857142e-07, "loss": 0.0001, "num_tokens": 495943567.0, "reward": 0.3140625, "reward_std": 0.15854923725128173, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9388240814208985, "step": 6830 }, { "completion_length": 293.8, "completions/clipped_ratio": 0.0, "completions/max_length": 293.8, "completions/max_terminated_length": 293.8, "completions/mean_length": 106.46953125, "completions/mean_terminated_length": 106.46953125, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006185665907071117, "frac_reward_zero_std": 0.8375, "grad_norm": 0.6084195375442505, "kl": 0.12267181675415487, "learning_rate": 4.881428571428572e-07, "loss": 0.0001, "num_tokens": 496278952.0, "reward": 0.2859375, "reward_std": 0.13399283289909364, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9500122666358948, "step": 6835 }, { "completion_length": 395.4, "completions/clipped_ratio": 0.0, "completions/max_length": 395.4, "completions/max_terminated_length": 395.4, "completions/mean_length": 105.8859375, "completions/mean_terminated_length": 105.8859375, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.006190190900419376, "frac_reward_zero_std": 0.7875, "grad_norm": 3.6480023860931396, "kl": 1.9305944563646336, "learning_rate": 4.885e-07, "loss": 0.0019, "num_tokens": 496611638.0, "reward": 0.2484375, "reward_std": 0.18412213623523713, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9628338694572449, "step": 6840 }, { "completion_length": 350.4, "completions/clipped_ratio": 0.0, "completions/max_length": 350.4, "completions/max_terminated_length": 350.4, "completions/mean_length": 115.1546875, "completions/mean_terminated_length": 115.1546875, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.006194715893767636, "frac_reward_zero_std": 0.81875, "grad_norm": 3.7858645915985107, "kl": 4.723683811444789, "learning_rate": 4.888571428571429e-07, "loss": 0.0047, "num_tokens": 496961212.0, "reward": 0.2484375, "reward_std": 0.1579726293683052, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9599256992340088, "step": 6845 }, { "completion_length": 400.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 400.2, "completions/max_terminated_length": 321.8, "completions/mean_length": 108.271875, "completions/mean_terminated_length": 107.76039276123046, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.006199240887115896, "frac_reward_zero_std": 0.7875, "grad_norm": 1.5527390241622925, "kl": 5.420816264220048, "learning_rate": 4.892142857142857e-07, "loss": 0.0054, "num_tokens": 497298368.0, "reward": 0.3671875, "reward_std": 0.18958997428417207, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9252253651618958, "step": 6850 }, { "completion_length": 401.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 401.0, "completions/max_terminated_length": 305.2, "completions/mean_length": 107.83984375, "completions/mean_terminated_length": 107.32539978027344, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.0062037658804641556, "frac_reward_zero_std": 0.78125, "grad_norm": 12.672698974609375, "kl": 9.058334933314473, "learning_rate": 4.895714285714285e-07, "loss": 0.0091, "num_tokens": 497634539.0, "reward": 0.23125, "reward_std": 0.19763650000095367, "rewards/verify_chess_move/mean": 0.23125, "rewards/verify_chess_move/std": 0.9657063126564026, "step": 6855 }, { "completion_length": 301.6, "completions/clipped_ratio": 0.0, "completions/max_length": 301.6, "completions/max_terminated_length": 301.6, "completions/mean_length": 111.32734375, "completions/mean_terminated_length": 111.32734375, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.006208290873812416, "frac_reward_zero_std": 0.8375, "grad_norm": 2.424917459487915, "kl": 4.11208930342691, "learning_rate": 4.899285714285714e-07, "loss": 0.0041, "num_tokens": 497979366.0, "reward": 0.3171875, "reward_std": 0.1349364459514618, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9458714962005615, "step": 6860 }, { "completion_length": 337.6, "completions/clipped_ratio": 0.0, "completions/max_length": 337.6, "completions/max_terminated_length": 337.6, "completions/mean_length": 106.0859375, "completions/mean_terminated_length": 106.0859375, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.006212815867160675, "frac_reward_zero_std": 0.81875, "grad_norm": 0.9756121039390564, "kl": 0.7095092855626717, "learning_rate": 4.902857142857142e-07, "loss": 0.0007, "num_tokens": 498312324.0, "reward": 0.334375, "reward_std": 0.15955077409744262, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9348751544952393, "step": 6865 }, { "completion_length": 336.6, "completions/clipped_ratio": 0.0, "completions/max_length": 336.6, "completions/max_terminated_length": 336.6, "completions/mean_length": 104.89609375, "completions/mean_terminated_length": 104.89609375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006217340860508935, "frac_reward_zero_std": 0.8125, "grad_norm": 1.3879715204238892, "kl": 0.3315833772183396, "learning_rate": 4.906428571428571e-07, "loss": 0.0003, "num_tokens": 498645295.0, "reward": 0.396875, "reward_std": 0.16602111458778382, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9024998188018799, "step": 6870 }, { "completion_length": 350.2, "completions/clipped_ratio": 0.0, "completions/max_length": 350.2, "completions/max_terminated_length": 350.2, "completions/mean_length": 111.54140625, "completions/mean_terminated_length": 111.54140625, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.006221865853857195, "frac_reward_zero_std": 0.88125, "grad_norm": 2.987647294998169, "kl": 0.602602654392831, "learning_rate": 4.909999999999999e-07, "loss": 0.0006, "num_tokens": 498989916.0, "reward": 0.2890625, "reward_std": 0.1110445350408554, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9467371821403503, "step": 6875 }, { "completion_length": 306.8, "completions/clipped_ratio": 0.0, "completions/max_length": 306.8, "completions/max_terminated_length": 306.8, "completions/mean_length": 102.63046875, "completions/mean_terminated_length": 102.63046875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006226390847205454, "frac_reward_zero_std": 0.825, "grad_norm": 0.7228002548217773, "kl": 0.07092726565897464, "learning_rate": 4.913571428571429e-07, "loss": 0.0001, "num_tokens": 499319683.0, "reward": 0.2296875, "reward_std": 0.14556559175252914, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9569421768188476, "step": 6880 }, { "completion_length": 287.2, "completions/clipped_ratio": 0.0, "completions/max_length": 287.2, "completions/max_terminated_length": 287.2, "completions/mean_length": 100.83515625, "completions/mean_terminated_length": 100.83515625, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006230915840553714, "frac_reward_zero_std": 0.875, "grad_norm": 3.7548248767852783, "kl": 0.09798148836707696, "learning_rate": 4.917142857142857e-07, "loss": 0.0001, "num_tokens": 499646992.0, "reward": 0.31875, "reward_std": 0.10747534930706024, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9403470396995545, "step": 6885 }, { "completion_length": 293.4, "completions/clipped_ratio": 0.0, "completions/max_length": 293.4, "completions/max_terminated_length": 293.4, "completions/mean_length": 101.68203125, "completions/mean_terminated_length": 101.68203125, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.0062354408339019745, "frac_reward_zero_std": 0.84375, "grad_norm": 1.22377610206604, "kl": 0.2299921489902772, "learning_rate": 4.920714285714286e-07, "loss": 0.0002, "num_tokens": 499972265.0, "reward": 0.3859375, "reward_std": 0.13414511531591417, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9104853749275208, "step": 6890 }, { "completion_length": 318.8, "completions/clipped_ratio": 0.0, "completions/max_length": 318.8, "completions/max_terminated_length": 318.8, "completions/mean_length": 114.8375, "completions/mean_terminated_length": 114.8375, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.006239965827250234, "frac_reward_zero_std": 0.8, "grad_norm": 4.0614728927612305, "kl": 0.1512752604787238, "learning_rate": 4.924285714285714e-07, "loss": 0.0002, "num_tokens": 500322449.0, "reward": 0.4171875, "reward_std": 0.17759484946727752, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.9085629940032959, "step": 6895 }, { "completion_length": 323.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 105.28359375, "completions/mean_terminated_length": 105.28359375, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006244490820598494, "frac_reward_zero_std": 0.7875, "grad_norm": 2.800718069076538, "kl": 0.1505103460745886, "learning_rate": 4.927857142857143e-07, "loss": 0.0002, "num_tokens": 500653892.0, "reward": 0.2890625, "reward_std": 0.1893787831068039, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9521723031997681, "step": 6900 }, { "completion_length": 310.4, "completions/clipped_ratio": 0.0, "completions/max_length": 310.4, "completions/max_terminated_length": 310.4, "completions/mean_length": 109.2796875, "completions/mean_terminated_length": 109.2796875, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.006249015813946754, "frac_reward_zero_std": 0.8625, "grad_norm": 0.7653151750564575, "kl": 0.11574812558246776, "learning_rate": 4.931428571428571e-07, "loss": 0.0001, "num_tokens": 500994914.0, "reward": 0.1953125, "reward_std": 0.11405206993222236, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9678330183029175, "step": 6905 }, { "completion_length": 330.6, "completions/clipped_ratio": 0.0, "completions/max_length": 330.6, "completions/max_terminated_length": 330.6, "completions/mean_length": 107.3421875, "completions/mean_terminated_length": 107.3421875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.006253540807295013, "frac_reward_zero_std": 0.8375, "grad_norm": 0.17815393209457397, "kl": 0.07887828944949433, "learning_rate": 4.935e-07, "loss": 0.0001, "num_tokens": 501330288.0, "reward": 0.3609375, "reward_std": 0.1458216980099678, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9326493382453919, "step": 6910 }, { "completion_length": 343.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 104.32734375, "completions/mean_terminated_length": 104.32734375, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.006258065800643273, "frac_reward_zero_std": 0.80625, "grad_norm": 0.31467750668525696, "kl": 0.11606518967309967, "learning_rate": 4.938571428571428e-07, "loss": 0.0001, "num_tokens": 501658947.0, "reward": 0.3546875, "reward_std": 0.17223280668258667, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9022074699401855, "step": 6915 }, { "completion_length": 318.8, "completions/clipped_ratio": 0.0, "completions/max_length": 318.8, "completions/max_terminated_length": 318.8, "completions/mean_length": 103.9203125, "completions/mean_terminated_length": 103.9203125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.006262590793991532, "frac_reward_zero_std": 0.8125, "grad_norm": 0.11804680526256561, "kl": 0.7211192764923908, "learning_rate": 4.942142857142857e-07, "loss": 0.0007, "num_tokens": 501988149.0, "reward": 0.309375, "reward_std": 0.15782132595777512, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9463030695915222, "step": 6920 }, { "completion_length": 298.4, "completions/clipped_ratio": 0.0, "completions/max_length": 298.4, "completions/max_terminated_length": 298.4, "completions/mean_length": 101.4640625, "completions/mean_terminated_length": 101.4640625, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.0062671157873397925, "frac_reward_zero_std": 0.8375, "grad_norm": 0.7630783319473267, "kl": 0.30413171446416526, "learning_rate": 4.945714285714285e-07, "loss": 0.0003, "num_tokens": 502314527.0, "reward": 0.29375, "reward_std": 0.1353597931563854, "rewards/verify_chess_move/mean": 0.29375, "rewards/verify_chess_move/std": 0.9343263387680054, "step": 6925 }, { "completion_length": 329.4, "completions/clipped_ratio": 0.0, "completions/max_length": 329.4, "completions/max_terminated_length": 329.4, "completions/mean_length": 110.00078125, "completions/mean_terminated_length": 110.00078125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.006271640780688053, "frac_reward_zero_std": 0.7875, "grad_norm": 0.35943803191185, "kl": 0.4728227055631578, "learning_rate": 4.949285714285715e-07, "loss": 0.0005, "num_tokens": 502652080.0, "reward": 0.34375, "reward_std": 0.17644064426422118, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9361284732818603, "step": 6930 }, { "completion_length": 302.6, "completions/clipped_ratio": 0.0, "completions/max_length": 302.6, "completions/max_terminated_length": 302.6, "completions/mean_length": 107.09375, "completions/mean_terminated_length": 107.09375, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.006276165774036312, "frac_reward_zero_std": 0.8375, "grad_norm": 4.598122596740723, "kl": 0.38325658114627004, "learning_rate": 4.952857142857143e-07, "loss": 0.0004, "num_tokens": 502988360.0, "reward": 0.2359375, "reward_std": 0.1363024190068245, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9574798822402955, "step": 6935 }, { "completion_length": 318.6, "completions/clipped_ratio": 0.0, "completions/max_length": 318.6, "completions/max_terminated_length": 318.6, "completions/mean_length": 116.25390625, "completions/mean_terminated_length": 116.25390625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.006280690767384572, "frac_reward_zero_std": 0.8375, "grad_norm": 11.495390892028809, "kl": 0.7682729340391233, "learning_rate": 4.956428571428572e-07, "loss": 0.0008, "num_tokens": 503336805.0, "reward": 0.2796875, "reward_std": 0.14513773024082183, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9513777613639831, "step": 6940 }, { "completion_length": 321.2, "completions/clipped_ratio": 0.0, "completions/max_length": 321.2, "completions/max_terminated_length": 321.2, "completions/mean_length": 107.15234375, "completions/mean_terminated_length": 107.15234375, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.006285215760732832, "frac_reward_zero_std": 0.78125, "grad_norm": 3.9092044830322266, "kl": 0.2731106615974568, "learning_rate": 4.96e-07, "loss": 0.0003, "num_tokens": 503672680.0, "reward": 0.340625, "reward_std": 0.18675124049186706, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9346668601036072, "step": 6945 }, { "completion_length": 363.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 106.23515625, "completions/mean_terminated_length": 106.23515625, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.006289740754081091, "frac_reward_zero_std": 0.85, "grad_norm": 19.520341873168945, "kl": 13.223360242147464, "learning_rate": 4.963571428571428e-07, "loss": 0.0132, "num_tokens": 504008085.0, "reward": 0.2921875, "reward_std": 0.12972472012043, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9557689070701599, "step": 6950 }, { "completion_length": 395.4, "completions/clipped_ratio": 0.0, "completions/max_length": 395.4, "completions/max_terminated_length": 395.4, "completions/mean_length": 99.7390625, "completions/mean_terminated_length": 99.7390625, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.006294265747429351, "frac_reward_zero_std": 0.83125, "grad_norm": 1.7267425060272217, "kl": 15.432863646477927, "learning_rate": 4.967142857142857e-07, "loss": 0.0154, "num_tokens": 504331351.0, "reward": 0.3640625, "reward_std": 0.14797900021076202, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9328863978385925, "step": 6955 }, { "completion_length": 319.6, "completions/clipped_ratio": 0.0, "completions/max_length": 319.6, "completions/max_terminated_length": 319.6, "completions/mean_length": 111.7609375, "completions/mean_terminated_length": 111.7609375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.006298790740777611, "frac_reward_zero_std": 0.86875, "grad_norm": 2.202730417251587, "kl": 3.743309567484539, "learning_rate": 4.970714285714285e-07, "loss": 0.0037, "num_tokens": 504675333.0, "reward": 0.325, "reward_std": 0.12282945476472377, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9356478095054627, "step": 6960 }, { "completion_length": 312.2, "completions/clipped_ratio": 0.0, "completions/max_length": 312.2, "completions/max_terminated_length": 312.2, "completions/mean_length": 104.3671875, "completions/mean_terminated_length": 104.3671875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006303315734125871, "frac_reward_zero_std": 0.83125, "grad_norm": 1.2716313600540161, "kl": 1.6584004277945497, "learning_rate": 4.974285714285714e-07, "loss": 0.0017, "num_tokens": 505007275.0, "reward": 0.175, "reward_std": 0.14866199642419814, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9756915688514709, "step": 6965 }, { "completion_length": 311.6, "completions/clipped_ratio": 0.0, "completions/max_length": 311.6, "completions/max_terminated_length": 311.6, "completions/mean_length": 102.10625, "completions/mean_terminated_length": 102.10625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006307840727474131, "frac_reward_zero_std": 0.8125, "grad_norm": 2.1498348712921143, "kl": 1.2023856655694545, "learning_rate": 4.977857142857142e-07, "loss": 0.0012, "num_tokens": 505336571.0, "reward": 0.2046875, "reward_std": 0.15965911373496056, "rewards/verify_chess_move/mean": 0.2046875, "rewards/verify_chess_move/std": 0.9758771538734436, "step": 6970 }, { "completion_length": 314.6, "completions/clipped_ratio": 0.0, "completions/max_length": 314.6, "completions/max_terminated_length": 314.6, "completions/mean_length": 105.7984375, "completions/mean_terminated_length": 105.7984375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.00631236572082239, "frac_reward_zero_std": 0.84375, "grad_norm": 1.596323847770691, "kl": 1.6987303260946647, "learning_rate": 4.981428571428572e-07, "loss": 0.0017, "num_tokens": 505672641.0, "reward": 0.2578125, "reward_std": 0.13593542724847793, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9672170519828797, "step": 6975 }, { "completion_length": 297.8, "completions/clipped_ratio": 0.0, "completions/max_length": 297.8, "completions/max_terminated_length": 297.8, "completions/mean_length": 103.45625, "completions/mean_terminated_length": 103.45625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.00631689071417065, "frac_reward_zero_std": 0.8375, "grad_norm": 1.8987542390823364, "kl": 0.6271015538834035, "learning_rate": 4.985e-07, "loss": 0.0006, "num_tokens": 506004145.0, "reward": 0.278125, "reward_std": 0.13493546172976495, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.951035988330841, "step": 6980 }, { "completion_length": 343.4, "completions/clipped_ratio": 0.0, "completions/max_length": 343.4, "completions/max_terminated_length": 343.4, "completions/mean_length": 112.1109375, "completions/mean_terminated_length": 112.1109375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00632141570751891, "frac_reward_zero_std": 0.85, "grad_norm": 2.89762282371521, "kl": 0.6337057337164879, "learning_rate": 4.988571428571428e-07, "loss": 0.0006, "num_tokens": 506348031.0, "reward": 0.2953125, "reward_std": 0.1304086923599243, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9373429775238037, "step": 6985 }, { "completion_length": 325.6, "completions/clipped_ratio": 0.0, "completions/max_length": 325.6, "completions/max_terminated_length": 325.6, "completions/mean_length": 104.92578125, "completions/mean_terminated_length": 104.92578125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.006325940700867169, "frac_reward_zero_std": 0.8625, "grad_norm": 1.3370784521102905, "kl": 0.5565887286444194, "learning_rate": 4.992142857142857e-07, "loss": 0.0006, "num_tokens": 506680168.0, "reward": 0.278125, "reward_std": 0.1251979447901249, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9533739447593689, "step": 6990 }, { "completion_length": 301.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 102.17265625, "completions/mean_terminated_length": 102.17265625, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.0063304656942154295, "frac_reward_zero_std": 0.85625, "grad_norm": 3.4272918701171875, "kl": 2.4743456973461435, "learning_rate": 4.995714285714285e-07, "loss": 0.0025, "num_tokens": 507008997.0, "reward": 0.296875, "reward_std": 0.1282504007220268, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9496771574020386, "step": 6995 }, { "completion_length": 303.2, "completions/clipped_ratio": 0.0, "completions/max_length": 303.2, "completions/max_terminated_length": 303.2, "completions/mean_length": 100.4859375, "completions/mean_terminated_length": 100.4859375, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.0063349906875636896, "frac_reward_zero_std": 0.84375, "grad_norm": 6.622987747192383, "kl": 3.859601677255705, "learning_rate": 4.999285714285714e-07, "loss": 0.0039, "num_tokens": 507335531.0, "reward": 0.25625, "reward_std": 0.13272970914840698, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9665741086006164, "step": 7000 }, { "completion_length": 309.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 107.48984375, "completions/mean_terminated_length": 107.48984375, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006339515680911949, "frac_reward_zero_std": 0.7875, "grad_norm": 9.232975006103516, "kl": 5.066283745411783, "learning_rate": 4.99968253968254e-07, "loss": 0.0051, "num_tokens": 507672046.0, "reward": 0.29375, "reward_std": 0.18164883702993392, "rewards/verify_chess_move/mean": 0.29375, "rewards/verify_chess_move/std": 0.9512527704238891, "step": 7005 }, { "completion_length": 349.2, "completions/clipped_ratio": 0.0, "completions/max_length": 349.2, "completions/max_terminated_length": 349.2, "completions/mean_length": 112.9765625, "completions/mean_terminated_length": 112.9765625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006344040674260209, "frac_reward_zero_std": 0.8125, "grad_norm": 4.675754547119141, "kl": 2.838321114145219, "learning_rate": 4.999285714285714e-07, "loss": 0.0028, "num_tokens": 508016888.0, "reward": 0.259375, "reward_std": 0.15513390600681304, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9359601140022278, "step": 7010 }, { "completion_length": 298.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 100.978125, "completions/mean_terminated_length": 100.978125, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006348565667608469, "frac_reward_zero_std": 0.8125, "grad_norm": 1.456521987915039, "kl": 1.1393181291176007, "learning_rate": 4.998888888888889e-07, "loss": 0.0011, "num_tokens": 508342796.0, "reward": 0.3328125, "reward_std": 0.16055426597595215, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9371016383171081, "step": 7015 }, { "completion_length": 389.4, "completions/clipped_ratio": 0.0, "completions/max_length": 389.4, "completions/max_terminated_length": 389.4, "completions/mean_length": 101.1703125, "completions/mean_terminated_length": 101.1703125, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006353090660956728, "frac_reward_zero_std": 0.88125, "grad_norm": 2.3001577854156494, "kl": 1.0190223979996518, "learning_rate": 4.998492063492063e-07, "loss": 0.001, "num_tokens": 508668094.0, "reward": 0.4671875, "reward_std": 0.09763655364513397, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.8559951186180115, "step": 7020 }, { "completion_length": 315.8, "completions/clipped_ratio": 0.0, "completions/max_length": 315.8, "completions/max_terminated_length": 315.8, "completions/mean_length": 106.25859375, "completions/mean_terminated_length": 106.25859375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006357615654304988, "frac_reward_zero_std": 0.78125, "grad_norm": 2.4883248805999756, "kl": 2.1287131899502127, "learning_rate": 4.998095238095238e-07, "loss": 0.0021, "num_tokens": 509004353.0, "reward": 0.296875, "reward_std": 0.18628042340278625, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9490724802017212, "step": 7025 }, { "completion_length": 387.4, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/max_terminated_length": 387.4, "completions/mean_length": 105.49453125, "completions/mean_terminated_length": 105.49453125, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006362140647653248, "frac_reward_zero_std": 0.80625, "grad_norm": 5.376382350921631, "kl": 2.318166278698482, "learning_rate": 4.997698412698412e-07, "loss": 0.0023, "num_tokens": 509336514.0, "reward": 0.371875, "reward_std": 0.16292373836040497, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9145393610000611, "step": 7030 }, { "completion_length": 321.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 102.36640625, "completions/mean_terminated_length": 102.36640625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006366665641001508, "frac_reward_zero_std": 0.81875, "grad_norm": 2.4789013862609863, "kl": 2.019316158327274, "learning_rate": 4.997301587301587e-07, "loss": 0.002, "num_tokens": 509665935.0, "reward": 0.378125, "reward_std": 0.15135098546743392, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9178937792778015, "step": 7035 }, { "completion_length": 293.6, "completions/clipped_ratio": 0.0, "completions/max_length": 293.6, "completions/max_terminated_length": 293.6, "completions/mean_length": 103.80859375, "completions/mean_terminated_length": 103.80859375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006371190634349768, "frac_reward_zero_std": 0.84375, "grad_norm": 3.4257593154907227, "kl": 0.5941580243175849, "learning_rate": 4.996904761904761e-07, "loss": 0.0006, "num_tokens": 509996186.0, "reward": 0.3203125, "reward_std": 0.145290008187294, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9441798567771912, "step": 7040 }, { "completion_length": 305.6, "completions/clipped_ratio": 0.0, "completions/max_length": 305.6, "completions/max_terminated_length": 305.6, "completions/mean_length": 115.5484375, "completions/mean_terminated_length": 115.5484375, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006375715627698027, "frac_reward_zero_std": 0.83125, "grad_norm": 0.6343585848808289, "kl": 1.5803731233114378, "learning_rate": 4.996507936507936e-07, "loss": 0.0016, "num_tokens": 510348576.0, "reward": 0.171875, "reward_std": 0.14298297315835953, "rewards/verify_chess_move/mean": 0.171875, "rewards/verify_chess_move/std": 0.9834128499031067, "step": 7045 }, { "completion_length": 290.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 97.61328125, "completions/mean_terminated_length": 97.61328125, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.006380240621046287, "frac_reward_zero_std": 0.86875, "grad_norm": 0.13837282359600067, "kl": 0.2694411270786077, "learning_rate": 4.99611111111111e-07, "loss": 0.0003, "num_tokens": 510671145.0, "reward": 0.38125, "reward_std": 0.10989229381084442, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9173431277275086, "step": 7050 }, { "completion_length": 302.8, "completions/clipped_ratio": 0.0, "completions/max_length": 302.8, "completions/max_terminated_length": 302.8, "completions/mean_length": 113.4796875, "completions/mean_terminated_length": 113.4796875, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.006384765614394547, "frac_reward_zero_std": 0.80625, "grad_norm": 2.8027467727661133, "kl": 0.3637449090369046, "learning_rate": 4.995714285714285e-07, "loss": 0.0004, "num_tokens": 511017943.0, "reward": 0.290625, "reward_std": 0.1661284774541855, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.946258270740509, "step": 7055 }, { "completion_length": 496.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 496.2, "completions/max_terminated_length": 418.8, "completions/mean_length": 110.6, "completions/mean_terminated_length": 110.10180969238282, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.006389290607742806, "frac_reward_zero_std": 0.8125, "grad_norm": 4.779869079589844, "kl": 0.16229920246405527, "learning_rate": 4.995317460317461e-07, "loss": 0.0002, "num_tokens": 511359975.0, "reward": 0.125, "reward_std": 0.1635468363761902, "rewards/verify_chess_move/mean": 0.125, "rewards/verify_chess_move/std": 0.9849928855895996, "step": 7060 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 118.57265625, "completions/mean_terminated_length": 118.57265625, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006393815601091066, "frac_reward_zero_std": 0.84375, "grad_norm": 0.9562144875526428, "kl": 0.4075581874581985, "learning_rate": 4.994920634920635e-07, "loss": 0.0004, "num_tokens": 511715028.0, "reward": 0.2078125, "reward_std": 0.13456944674253463, "rewards/verify_chess_move/mean": 0.2078125, "rewards/verify_chess_move/std": 0.9693149566650391, "step": 7065 }, { "completion_length": 305.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 99.10390625, "completions/mean_terminated_length": 99.10390625, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.0063983405944393265, "frac_reward_zero_std": 0.8625, "grad_norm": 0.10638343542814255, "kl": 0.23870009067468345, "learning_rate": 4.994523809523809e-07, "loss": 0.0002, "num_tokens": 512038913.0, "reward": 0.246875, "reward_std": 0.1186237707734108, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.965054714679718, "step": 7070 }, { "completion_length": 385.4, "completions/clipped_ratio": 0.0, "completions/max_length": 385.4, "completions/max_terminated_length": 385.4, "completions/mean_length": 99.315625, "completions/mean_terminated_length": 99.315625, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.006402865587787586, "frac_reward_zero_std": 0.80625, "grad_norm": 1.5604839324951172, "kl": 0.08597014495753683, "learning_rate": 4.994126984126984e-07, "loss": 0.0001, "num_tokens": 512362877.0, "reward": 0.4109375, "reward_std": 0.15976647138595582, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.9080386996269226, "step": 7075 }, { "completion_length": 306.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 104.21796875, "completions/mean_terminated_length": 104.21796875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006407390581135846, "frac_reward_zero_std": 0.83125, "grad_norm": 0.9512548446655273, "kl": 0.09923234307207167, "learning_rate": 4.993730158730159e-07, "loss": 0.0001, "num_tokens": 512694492.0, "reward": 0.2359375, "reward_std": 0.1459290564060211, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9673336386680603, "step": 7080 }, { "completion_length": 318.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 100.9484375, "completions/mean_terminated_length": 100.9484375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.006411915574484106, "frac_reward_zero_std": 0.8, "grad_norm": 3.113117218017578, "kl": 0.27219096638727935, "learning_rate": 4.993333333333333e-07, "loss": 0.0003, "num_tokens": 513020682.0, "reward": 0.28125, "reward_std": 0.1691809356212616, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9506673336029052, "step": 7085 }, { "completion_length": 328.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 103.21171875, "completions/mean_terminated_length": 103.21171875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.006416440567832365, "frac_reward_zero_std": 0.83125, "grad_norm": 3.8142523765563965, "kl": 0.5509551731520332, "learning_rate": 4.992936507936508e-07, "loss": 0.0006, "num_tokens": 513352025.0, "reward": 0.240625, "reward_std": 0.1482376605272293, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9658220767974853, "step": 7090 }, { "completion_length": 364.8, "completions/clipped_ratio": 0.0, "completions/max_length": 364.8, "completions/max_terminated_length": 364.8, "completions/mean_length": 104.17890625, "completions/mean_terminated_length": 104.17890625, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.006420965561180625, "frac_reward_zero_std": 0.84375, "grad_norm": 2.5547232627868652, "kl": 0.8023448846302926, "learning_rate": 4.992539682539682e-07, "loss": 0.0008, "num_tokens": 513683542.0, "reward": 0.284375, "reward_std": 0.13687805235385894, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9597194910049438, "step": 7095 }, { "completion_length": 372.4, "completions/clipped_ratio": 0.0, "completions/max_length": 372.4, "completions/max_terminated_length": 372.4, "completions/mean_length": 102.45234375, "completions/mean_terminated_length": 102.45234375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.0064254905545288845, "frac_reward_zero_std": 0.8125, "grad_norm": 3.4493839740753174, "kl": 1.0521909717703237, "learning_rate": 4.992142857142857e-07, "loss": 0.0011, "num_tokens": 514011641.0, "reward": 0.4453125, "reward_std": 0.1637590043246746, "rewards/verify_chess_move/mean": 0.4453125, "rewards/verify_chess_move/std": 0.8882161855697632, "step": 7100 }, { "completion_length": 390.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 390.4, "completions/max_terminated_length": 338.2, "completions/mean_length": 107.55546875, "completions/mean_terminated_length": 107.03871154785156, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.0064300155478771446, "frac_reward_zero_std": 0.79375, "grad_norm": 3.3746724128723145, "kl": 0.7346757151070051, "learning_rate": 4.991746031746031e-07, "loss": 0.0007, "num_tokens": 514348464.0, "reward": 0.2171875, "reward_std": 0.180174520611763, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9728579163551331, "step": 7105 }, { "completion_length": 337.8, "completions/clipped_ratio": 0.0, "completions/max_length": 337.8, "completions/max_terminated_length": 337.8, "completions/mean_length": 105.434375, "completions/mean_terminated_length": 105.434375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006434540541225405, "frac_reward_zero_std": 0.85625, "grad_norm": 4.791040897369385, "kl": 0.17344124077353626, "learning_rate": 4.991349206349206e-07, "loss": 0.0002, "num_tokens": 514683764.0, "reward": 0.0875, "reward_std": 0.12167622894048691, "rewards/verify_chess_move/mean": 0.0875, "rewards/verify_chess_move/std": 0.9895066738128662, "step": 7110 }, { "completion_length": 343.8, "completions/clipped_ratio": 0.0, "completions/max_length": 343.8, "completions/max_terminated_length": 343.8, "completions/mean_length": 101.08828125, "completions/mean_terminated_length": 101.08828125, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006439065534573664, "frac_reward_zero_std": 0.8625, "grad_norm": 2.554008960723877, "kl": 0.10592323397286237, "learning_rate": 4.990952380952381e-07, "loss": 0.0001, "num_tokens": 515010965.0, "reward": 0.3390625, "reward_std": 0.11405207365751266, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9258761048316956, "step": 7115 }, { "completion_length": 368.6, "completions/clipped_ratio": 0.0, "completions/max_length": 368.6, "completions/max_terminated_length": 368.6, "completions/mean_length": 98.79375, "completions/mean_terminated_length": 98.79375, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006443590527921924, "frac_reward_zero_std": 0.83125, "grad_norm": 3.0966789722442627, "kl": 0.10595865326467901, "learning_rate": 4.990555555555555e-07, "loss": 0.0001, "num_tokens": 515334141.0, "reward": 0.3359375, "reward_std": 0.1465645730495453, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9210626602172851, "step": 7120 }, { "completion_length": 429.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 429.0, "completions/max_terminated_length": 356.2, "completions/mean_length": 97.2765625, "completions/mean_terminated_length": 96.7558349609375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006448115521270184, "frac_reward_zero_std": 0.80625, "grad_norm": 2.47906756401062, "kl": 0.12240518186008557, "learning_rate": 4.99015873015873e-07, "loss": 0.0001, "num_tokens": 515652735.0, "reward": 0.2796875, "reward_std": 0.17065171599388124, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9602855801582336, "step": 7125 }, { "completion_length": 330.4, "completions/clipped_ratio": 0.0, "completions/max_length": 330.4, "completions/max_terminated_length": 330.4, "completions/mean_length": 104.55625, "completions/mean_terminated_length": 104.55625, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.006452640514618443, "frac_reward_zero_std": 0.80625, "grad_norm": 2.366393804550171, "kl": 0.2792152233072557, "learning_rate": 4.989761904761904e-07, "loss": 0.0003, "num_tokens": 515986479.0, "reward": 0.11875, "reward_std": 0.15908347815275192, "rewards/verify_chess_move/mean": 0.11875, "rewards/verify_chess_move/std": 0.9815916776657104, "step": 7130 }, { "completion_length": 312.6, "completions/clipped_ratio": 0.0, "completions/max_length": 312.6, "completions/max_terminated_length": 312.6, "completions/mean_length": 101.6828125, "completions/mean_terminated_length": 101.6828125, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.006457165507966703, "frac_reward_zero_std": 0.85625, "grad_norm": 2.0962460041046143, "kl": 0.2164912852225825, "learning_rate": 4.98936507936508e-07, "loss": 0.0002, "num_tokens": 516313065.0, "reward": 0.415625, "reward_std": 0.11936664134263993, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9070230960845947, "step": 7135 }, { "completion_length": 302.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 99.5859375, "completions/mean_terminated_length": 99.5859375, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.0064616905013149635, "frac_reward_zero_std": 0.79375, "grad_norm": 4.241218566894531, "kl": 0.9041111696045846, "learning_rate": 4.988968253968253e-07, "loss": 0.0009, "num_tokens": 516636855.0, "reward": 0.2875, "reward_std": 0.17928035110235213, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9550061345100402, "step": 7140 }, { "completion_length": 297.4, "completions/clipped_ratio": 0.0, "completions/max_length": 297.4, "completions/max_terminated_length": 297.4, "completions/mean_length": 101.82421875, "completions/mean_terminated_length": 101.82421875, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.006466215494663223, "frac_reward_zero_std": 0.83125, "grad_norm": 6.565038681030273, "kl": 3.6220912027405574, "learning_rate": 4.988571428571428e-07, "loss": 0.0036, "num_tokens": 516965638.0, "reward": 0.290625, "reward_std": 0.14161502867937087, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.936252212524414, "step": 7145 }, { "completion_length": 340.4, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/max_terminated_length": 340.4, "completions/mean_length": 98.61875, "completions/mean_terminated_length": 98.61875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.006470740488011483, "frac_reward_zero_std": 0.775, "grad_norm": 4.679757595062256, "kl": 4.857551393890754, "learning_rate": 4.988174603174603e-07, "loss": 0.0049, "num_tokens": 517286022.0, "reward": 0.2890625, "reward_std": 0.20342385470867158, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9510304570198059, "step": 7150 }, { "completion_length": 364.4, "completions/clipped_ratio": 0.0, "completions/max_length": 364.4, "completions/max_terminated_length": 364.4, "completions/mean_length": 104.1578125, "completions/mean_terminated_length": 104.1578125, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006475265481359742, "frac_reward_zero_std": 0.8, "grad_norm": 8.658930778503418, "kl": 5.839710217760876, "learning_rate": 4.987777777777778e-07, "loss": 0.0058, "num_tokens": 517617760.0, "reward": 0.3078125, "reward_std": 0.1698639214038849, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9522243976593018, "step": 7155 }, { "completion_length": 320.2, "completions/clipped_ratio": 0.0, "completions/max_length": 320.2, "completions/max_terminated_length": 320.2, "completions/mean_length": 100.13671875, "completions/mean_terminated_length": 100.13671875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006479790474708002, "frac_reward_zero_std": 0.8375, "grad_norm": 2.2980971336364746, "kl": 9.0947186967358, "learning_rate": 4.987380952380952e-07, "loss": 0.0091, "num_tokens": 517943951.0, "reward": 0.33125, "reward_std": 0.14266247004270555, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9354048252105713, "step": 7160 }, { "completion_length": 307.4, "completions/clipped_ratio": 0.0, "completions/max_length": 307.4, "completions/max_terminated_length": 307.4, "completions/mean_length": 99.98828125, "completions/mean_terminated_length": 99.98828125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.006484315468056262, "frac_reward_zero_std": 0.79375, "grad_norm": 3.2963874340057373, "kl": 2.35925060424488, "learning_rate": 4.986984126984127e-07, "loss": 0.0024, "num_tokens": 518268264.0, "reward": 0.33125, "reward_std": 0.19610425233840942, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9378477454185485, "step": 7165 }, { "completion_length": 346.4, "completions/clipped_ratio": 0.0, "completions/max_length": 346.4, "completions/max_terminated_length": 346.4, "completions/mean_length": 109.40703125, "completions/mean_terminated_length": 109.40703125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006488840461404521, "frac_reward_zero_std": 0.84375, "grad_norm": 2.0905346870422363, "kl": 3.5331876184325663, "learning_rate": 4.986587301587302e-07, "loss": 0.0035, "num_tokens": 518608689.0, "reward": 0.1640625, "reward_std": 0.13072723150253296, "rewards/verify_chess_move/mean": 0.1640625, "rewards/verify_chess_move/std": 0.9858133792877197, "step": 7170 }, { "completion_length": 362.2, "completions/clipped_ratio": 0.0, "completions/max_length": 362.2, "completions/max_terminated_length": 362.2, "completions/mean_length": 109.22265625, "completions/mean_terminated_length": 109.22265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0064933654547527815, "frac_reward_zero_std": 0.85625, "grad_norm": 3.4786734580993652, "kl": 2.875570882903412, "learning_rate": 4.986190476190476e-07, "loss": 0.0029, "num_tokens": 518948878.0, "reward": 0.31875, "reward_std": 0.12188741564750671, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9343497395515442, "step": 7175 }, { "completion_length": 339.4, "completions/clipped_ratio": 0.0, "completions/max_length": 339.4, "completions/max_terminated_length": 339.4, "completions/mean_length": 100.025, "completions/mean_terminated_length": 100.025, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006497890448101042, "frac_reward_zero_std": 0.8375, "grad_norm": 7.383758068084717, "kl": 3.223347361246124, "learning_rate": 4.985793650793651e-07, "loss": 0.0032, "num_tokens": 519275238.0, "reward": 0.21875, "reward_std": 0.13809175044298172, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9615565061569213, "step": 7180 }, { "completion_length": 399.6, "completions/clipped_ratio": 0.0, "completions/max_length": 399.6, "completions/max_terminated_length": 399.6, "completions/mean_length": 109.47109375, "completions/mean_terminated_length": 109.47109375, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006502415441449301, "frac_reward_zero_std": 0.81875, "grad_norm": 1.4078302383422852, "kl": 1.9233807932119817, "learning_rate": 4.985396825396825e-07, "loss": 0.0019, "num_tokens": 519614961.0, "reward": 0.38125, "reward_std": 0.15771397352218627, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9184776663780212, "step": 7185 }, { "completion_length": 331.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 106.1921875, "completions/mean_terminated_length": 106.1921875, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006506940434797561, "frac_reward_zero_std": 0.85, "grad_norm": 2.4471116065979004, "kl": 0.6742120891110972, "learning_rate": 4.985e-07, "loss": 0.0007, "num_tokens": 519949239.0, "reward": 0.303125, "reward_std": 0.1283577635884285, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.945037305355072, "step": 7190 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 103.3734375, "completions/mean_terminated_length": 103.3734375, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006511465428145821, "frac_reward_zero_std": 0.83125, "grad_norm": 2.0196738243103027, "kl": 0.49522821984719484, "learning_rate": 4.984603174603174e-07, "loss": 0.0005, "num_tokens": 520279541.0, "reward": 0.2484375, "reward_std": 0.1479789972305298, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9622861623764039, "step": 7195 }, { "completion_length": 282.6, "completions/clipped_ratio": 0.0, "completions/max_length": 282.6, "completions/max_terminated_length": 282.6, "completions/mean_length": 103.3515625, "completions/mean_terminated_length": 103.3515625, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.00651599042149408, "frac_reward_zero_std": 0.83125, "grad_norm": 1.6124907732009888, "kl": 0.4684984530787915, "learning_rate": 4.984206349206349e-07, "loss": 0.0005, "num_tokens": 520611551.0, "reward": 0.3125, "reward_std": 0.15233950912952424, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9453368306159973, "step": 7200 }, { "completion_length": 345.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 101.1484375, "completions/mean_terminated_length": 101.1484375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00652051541484234, "frac_reward_zero_std": 0.81875, "grad_norm": 3.2429280281066895, "kl": 0.2739053512341343, "learning_rate": 4.983809523809523e-07, "loss": 0.0003, "num_tokens": 520938197.0, "reward": 0.3140625, "reward_std": 0.15587618947029114, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9382273554801941, "step": 7205 }, { "completion_length": 304.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 108.11875, "completions/mean_terminated_length": 108.11875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0065250404081905996, "frac_reward_zero_std": 0.775, "grad_norm": 2.6950020790100098, "kl": 0.27338710175827147, "learning_rate": 4.983412698412699e-07, "loss": 0.0003, "num_tokens": 521275213.0, "reward": 0.3375, "reward_std": 0.18596344888210298, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9379014849662781, "step": 7210 }, { "completion_length": 303.6, "completions/clipped_ratio": 0.0, "completions/max_length": 303.6, "completions/max_terminated_length": 303.6, "completions/mean_length": 109.74609375, "completions/mean_terminated_length": 109.74609375, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.00652956540153886, "frac_reward_zero_std": 0.85, "grad_norm": 1.891843318939209, "kl": 0.1853525476064533, "learning_rate": 4.983015873015872e-07, "loss": 0.0002, "num_tokens": 521618944.0, "reward": 0.253125, "reward_std": 0.12468219511210918, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9599563002586364, "step": 7215 }, { "completion_length": 320.8, "completions/clipped_ratio": 0.0, "completions/max_length": 320.8, "completions/max_terminated_length": 320.8, "completions/mean_length": 91.68203125, "completions/mean_terminated_length": 91.68203125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00653409039488712, "frac_reward_zero_std": 0.7875, "grad_norm": 4.881622791290283, "kl": 0.9883582528913394, "learning_rate": 4.982619047619047e-07, "loss": 0.001, "num_tokens": 521929457.0, "reward": 0.3125, "reward_std": 0.17455539405345916, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.948715889453888, "step": 7220 }, { "completion_length": 338.4, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 110.24140625, "completions/mean_terminated_length": 110.24140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006538615388235379, "frac_reward_zero_std": 0.79375, "grad_norm": 1.9932221174240112, "kl": 0.8948887466453016, "learning_rate": 4.982222222222223e-07, "loss": 0.0009, "num_tokens": 522272262.0, "reward": 0.2171875, "reward_std": 0.1786438524723053, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9550744652748108, "step": 7225 }, { "completion_length": 335.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 106.9546875, "completions/mean_terminated_length": 106.9546875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006543140381583639, "frac_reward_zero_std": 0.83125, "grad_norm": 4.667349815368652, "kl": 1.350460292142816, "learning_rate": 4.981825396825396e-07, "loss": 0.0014, "num_tokens": 522611108.0, "reward": 0.25625, "reward_std": 0.15570698529481888, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9405637264251709, "step": 7230 }, { "completion_length": 294.6, "completions/clipped_ratio": 0.0, "completions/max_length": 294.6, "completions/max_terminated_length": 294.6, "completions/mean_length": 97.32421875, "completions/mean_terminated_length": 97.32421875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006547665374931899, "frac_reward_zero_std": 0.8125, "grad_norm": 2.9065794944763184, "kl": 0.7884059228003025, "learning_rate": 4.981428571428572e-07, "loss": 0.0008, "num_tokens": 522932987.0, "reward": 0.265625, "reward_std": 0.15760818421840667, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9438043236732483, "step": 7235 }, { "completion_length": 318.4, "completions/clipped_ratio": 0.0, "completions/max_length": 318.4, "completions/max_terminated_length": 318.4, "completions/mean_length": 103.1609375, "completions/mean_terminated_length": 103.1609375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.006552190368280158, "frac_reward_zero_std": 0.8625, "grad_norm": 1.1325947046279907, "kl": 0.39651081557385626, "learning_rate": 4.981031746031746e-07, "loss": 0.0004, "num_tokens": 523264217.0, "reward": 0.44375, "reward_std": 0.11289826184511184, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8908072233200073, "step": 7240 }, { "completion_length": 312.6, "completions/clipped_ratio": 0.0, "completions/max_length": 312.6, "completions/max_terminated_length": 312.6, "completions/mean_length": 97.08203125, "completions/mean_terminated_length": 97.08203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0065567153616284185, "frac_reward_zero_std": 0.9, "grad_norm": 1.4788798093795776, "kl": 0.5014704035012982, "learning_rate": 4.980634920634921e-07, "loss": 0.0005, "num_tokens": 523584794.0, "reward": 0.4109375, "reward_std": 0.07844062373042107, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.8962489843368531, "step": 7245 }, { "completion_length": 337.2, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/max_terminated_length": 337.2, "completions/mean_length": 99.2671875, "completions/mean_terminated_length": 99.2671875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.0065612403549766786, "frac_reward_zero_std": 0.8625, "grad_norm": 3.7460203170776367, "kl": 0.9217489628819748, "learning_rate": 4.980238095238095e-07, "loss": 0.0009, "num_tokens": 523908544.0, "reward": 0.31875, "reward_std": 0.11631418466567993, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9302155137062073, "step": 7250 }, { "completion_length": 379.4, "completions/clipped_ratio": 0.0, "completions/max_length": 379.4, "completions/max_terminated_length": 379.4, "completions/mean_length": 104.1421875, "completions/mean_terminated_length": 104.1421875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.006565765348324938, "frac_reward_zero_std": 0.8125, "grad_norm": 6.232356548309326, "kl": 2.2911189689766616, "learning_rate": 4.97984126984127e-07, "loss": 0.0023, "num_tokens": 524241518.0, "reward": 0.2375, "reward_std": 0.16307601630687713, "rewards/verify_chess_move/mean": 0.2375, "rewards/verify_chess_move/std": 0.9697601556777954, "step": 7255 }, { "completion_length": 336.6, "completions/clipped_ratio": 0.0, "completions/max_length": 336.6, "completions/max_terminated_length": 336.6, "completions/mean_length": 98.3265625, "completions/mean_terminated_length": 98.3265625, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006570290341673198, "frac_reward_zero_std": 0.8375, "grad_norm": 4.9221272468566895, "kl": 3.6698202926665546, "learning_rate": 4.979444444444444e-07, "loss": 0.0037, "num_tokens": 524564792.0, "reward": 0.2453125, "reward_std": 0.13898788392543793, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9572962999343873, "step": 7260 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/max_terminated_length": 366.2, "completions/mean_length": 110.41875, "completions/mean_terminated_length": 110.41875, "completions/min_length": 35.2, "completions/min_terminated_length": 35.2, "epoch": 0.006574815335021457, "frac_reward_zero_std": 0.74375, "grad_norm": 4.167213439941406, "kl": 5.530417868122458, "learning_rate": 4.979047619047619e-07, "loss": 0.0055, "num_tokens": 524906576.0, "reward": 0.2125, "reward_std": 0.21873558759689332, "rewards/verify_chess_move/mean": 0.2125, "rewards/verify_chess_move/std": 0.9724388480186462, "step": 7265 }, { "completion_length": 337.2, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/max_terminated_length": 337.2, "completions/mean_length": 100.63671875, "completions/mean_terminated_length": 100.63671875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006579340328369717, "frac_reward_zero_std": 0.81875, "grad_norm": 1.9223573207855225, "kl": 0.8461253713816405, "learning_rate": 4.978650793650793e-07, "loss": 0.0008, "num_tokens": 525235911.0, "reward": 0.25, "reward_std": 0.15250577330589293, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9614883303642273, "step": 7270 }, { "completion_length": 414.8, "completions/clipped_ratio": 0.0, "completions/max_length": 414.8, "completions/max_terminated_length": 414.8, "completions/mean_length": 98.32109375, "completions/mean_terminated_length": 98.32109375, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006583865321717977, "frac_reward_zero_std": 0.84375, "grad_norm": 8.822308540344238, "kl": 1.8059889077441766, "learning_rate": 4.978253968253968e-07, "loss": 0.0018, "num_tokens": 525558650.0, "reward": 0.140625, "reward_std": 0.13824403434991836, "rewards/verify_chess_move/mean": 0.140625, "rewards/verify_chess_move/std": 0.9752379179000854, "step": 7275 }, { "completion_length": 445.4, "completions/clipped_ratio": 0.0, "completions/max_length": 445.4, "completions/max_terminated_length": 445.4, "completions/mean_length": 110.58125, "completions/mean_terminated_length": 110.58125, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.0065883903150662365, "frac_reward_zero_std": 0.8375, "grad_norm": 4.555810451507568, "kl": 3.4035840846598147, "learning_rate": 4.977857142857142e-07, "loss": 0.0034, "num_tokens": 525901818.0, "reward": 0.246875, "reward_std": 0.13719659745693208, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9549042701721191, "step": 7280 }, { "completion_length": 339.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 102.53203125, "completions/mean_terminated_length": 102.53203125, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006592915308414497, "frac_reward_zero_std": 0.88125, "grad_norm": 1.198801040649414, "kl": 9.358212010096759, "learning_rate": 4.977460317460317e-07, "loss": 0.0094, "num_tokens": 526231683.0, "reward": 0.3265625, "reward_std": 0.10331557095050811, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9198832154273987, "step": 7285 }, { "completion_length": 412.4, "completions/clipped_ratio": 0.0, "completions/max_length": 412.4, "completions/max_terminated_length": 412.4, "completions/mean_length": 99.46953125, "completions/mean_terminated_length": 99.46953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006597440301762757, "frac_reward_zero_std": 0.81875, "grad_norm": 1.798988699913025, "kl": 9.909670372004621, "learning_rate": 4.977063492063492e-07, "loss": 0.0099, "num_tokens": 526556604.0, "reward": 0.2359375, "reward_std": 0.15608737170696257, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9521278142929077, "step": 7290 }, { "completion_length": 442.8, "completions/clipped_ratio": 0.0, "completions/max_length": 442.8, "completions/max_terminated_length": 442.8, "completions/mean_length": 106.12890625, "completions/mean_terminated_length": 106.12890625, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006601965295111016, "frac_reward_zero_std": 0.83125, "grad_norm": 2.8674890995025635, "kl": 4.682574845198542, "learning_rate": 4.976666666666666e-07, "loss": 0.0047, "num_tokens": 526892905.0, "reward": 0.2546875, "reward_std": 0.149558125436306, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9677447080612183, "step": 7295 }, { "completion_length": 400.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 400.0, "completions/max_terminated_length": 337.8, "completions/mean_length": 100.609375, "completions/mean_terminated_length": 100.0841796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006606490288459276, "frac_reward_zero_std": 0.825, "grad_norm": 5.802185535430908, "kl": 3.2599750627530737, "learning_rate": 4.976269841269842e-07, "loss": 0.0033, "num_tokens": 527220021.0, "reward": 0.3125, "reward_std": 0.1501372829079628, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9345054626464844, "step": 7300 }, { "completion_length": 331.6, "completions/clipped_ratio": 0.0, "completions/max_length": 331.6, "completions/max_terminated_length": 331.6, "completions/mean_length": 102.80859375, "completions/mean_terminated_length": 102.80859375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006611015281807536, "frac_reward_zero_std": 0.8375, "grad_norm": 2.4091014862060547, "kl": 3.768258887366392, "learning_rate": 4.975873015873015e-07, "loss": 0.0038, "num_tokens": 527550752.0, "reward": 0.3578125, "reward_std": 0.13971834480762482, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9288970589637756, "step": 7305 }, { "completion_length": 339.4, "completions/clipped_ratio": 0.0, "completions/max_length": 339.4, "completions/max_terminated_length": 339.4, "completions/mean_length": 98.37421875, "completions/mean_terminated_length": 98.37421875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006615540275155795, "frac_reward_zero_std": 0.825, "grad_norm": 5.834237575531006, "kl": 4.20374986527022, "learning_rate": 4.975476190476191e-07, "loss": 0.0042, "num_tokens": 527872799.0, "reward": 0.296875, "reward_std": 0.14356115311384202, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9533657312393189, "step": 7310 }, { "completion_length": 305.8, "completions/clipped_ratio": 0.0, "completions/max_length": 305.8, "completions/max_terminated_length": 305.8, "completions/mean_length": 103.2109375, "completions/mean_terminated_length": 103.2109375, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006620065268504055, "frac_reward_zero_std": 0.8125, "grad_norm": 4.732917785644531, "kl": 1.907047825725749, "learning_rate": 4.975079365079365e-07, "loss": 0.0019, "num_tokens": 528205517.0, "reward": 0.1921875, "reward_std": 0.15645437091588973, "rewards/verify_chess_move/mean": 0.1921875, "rewards/verify_chess_move/std": 0.9807406306266785, "step": 7315 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 105.02734375, "completions/mean_terminated_length": 105.02734375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.006624590261852315, "frac_reward_zero_std": 0.88125, "grad_norm": 3.4410457611083984, "kl": 0.7569882600335405, "learning_rate": 4.974682539682539e-07, "loss": 0.0008, "num_tokens": 528541128.0, "reward": 0.3390625, "reward_std": 0.10536551773548126, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9367623090744018, "step": 7320 }, { "completion_length": 297.4, "completions/clipped_ratio": 0.0, "completions/max_length": 297.4, "completions/max_terminated_length": 297.4, "completions/mean_length": 97.60390625, "completions/mean_terminated_length": 97.60390625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.006629115255200575, "frac_reward_zero_std": 0.83125, "grad_norm": 3.8018798828125, "kl": 1.1641679033869878, "learning_rate": 4.974285714285714e-07, "loss": 0.0012, "num_tokens": 528864269.0, "reward": 0.2359375, "reward_std": 0.15071291327476502, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.970638656616211, "step": 7325 }, { "completion_length": 300.6, "completions/clipped_ratio": 0.0, "completions/max_length": 300.6, "completions/max_terminated_length": 300.6, "completions/mean_length": 97.31484375, "completions/mean_terminated_length": 97.31484375, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006633640248548835, "frac_reward_zero_std": 0.85, "grad_norm": 0.4775567352771759, "kl": 0.601995745813474, "learning_rate": 4.973888888888889e-07, "loss": 0.0006, "num_tokens": 529186120.0, "reward": 0.4671875, "reward_std": 0.12494085729122162, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.8729042410850525, "step": 7330 }, { "completion_length": 302.6, "completions/clipped_ratio": 0.0, "completions/max_length": 302.6, "completions/max_terminated_length": 302.6, "completions/mean_length": 100.88671875, "completions/mean_terminated_length": 100.88671875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006638165241897094, "frac_reward_zero_std": 0.8625, "grad_norm": 4.498754978179932, "kl": 0.5115657553775236, "learning_rate": 4.973492063492063e-07, "loss": 0.0005, "num_tokens": 529512831.0, "reward": 0.325, "reward_std": 0.11294671446084976, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.927000617980957, "step": 7335 }, { "completion_length": 359.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 102.67265625, "completions/mean_terminated_length": 102.67265625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006642690235245354, "frac_reward_zero_std": 0.83125, "grad_norm": 1.0749040842056274, "kl": 0.5962267081486061, "learning_rate": 4.973095238095238e-07, "loss": 0.0006, "num_tokens": 529844516.0, "reward": 0.3046875, "reward_std": 0.13909524008631707, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9476828336715698, "step": 7340 }, { "completion_length": 337.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 103.11484375, "completions/mean_terminated_length": 103.11484375, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006647215228593614, "frac_reward_zero_std": 0.8625, "grad_norm": 2.2907187938690186, "kl": 0.40504539598478007, "learning_rate": 4.972698412698413e-07, "loss": 0.0004, "num_tokens": 530176839.0, "reward": 0.2375, "reward_std": 0.11789331212639809, "rewards/verify_chess_move/mean": 0.2375, "rewards/verify_chess_move/std": 0.9594766974449158, "step": 7345 }, { "completion_length": 323.2, "completions/clipped_ratio": 0.0, "completions/max_length": 323.2, "completions/max_terminated_length": 323.2, "completions/mean_length": 101.1046875, "completions/mean_terminated_length": 101.1046875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.0066517402219418735, "frac_reward_zero_std": 0.8875, "grad_norm": 5.085572719573975, "kl": 1.6602121270960197, "learning_rate": 4.972301587301587e-07, "loss": 0.0017, "num_tokens": 530504837.0, "reward": 0.346875, "reward_std": 0.09406580030918121, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9177956700325012, "step": 7350 }, { "completion_length": 411.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 411.2, "completions/max_terminated_length": 309.4, "completions/mean_length": 96.66796875, "completions/mean_terminated_length": 96.1508804321289, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0066562652152901336, "frac_reward_zero_std": 0.85, "grad_norm": 5.58651065826416, "kl": 1.454245820059441, "learning_rate": 4.971904761904762e-07, "loss": 0.0015, "num_tokens": 530825892.0, "reward": 0.246875, "reward_std": 0.12767575085163116, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9680374383926391, "step": 7355 }, { "completion_length": 309.4, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/max_terminated_length": 309.4, "completions/mean_length": 101.484375, "completions/mean_terminated_length": 101.484375, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006660790208638394, "frac_reward_zero_std": 0.80625, "grad_norm": 5.6244425773620605, "kl": 6.787554133404046, "learning_rate": 4.971507936507936e-07, "loss": 0.0068, "num_tokens": 531154376.0, "reward": 0.3796875, "reward_std": 0.1638663589954376, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9173912644386292, "step": 7360 }, { "completion_length": 325.4, "completions/clipped_ratio": 0.0, "completions/max_length": 325.4, "completions/max_terminated_length": 325.4, "completions/mean_length": 100.1875, "completions/mean_terminated_length": 100.1875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006665315201986653, "frac_reward_zero_std": 0.86875, "grad_norm": 7.458865165710449, "kl": 2.5566801157779993, "learning_rate": 4.971111111111111e-07, "loss": 0.0026, "num_tokens": 531482400.0, "reward": 0.325, "reward_std": 0.11552482321858407, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9408334970474244, "step": 7365 }, { "completion_length": 311.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 99.36328125, "completions/mean_terminated_length": 99.36328125, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006669840195334913, "frac_reward_zero_std": 0.8625, "grad_norm": 3.496804714202881, "kl": 0.5345262158312835, "learning_rate": 4.970714285714285e-07, "loss": 0.0005, "num_tokens": 531808097.0, "reward": 0.396875, "reward_std": 0.11426423788070679, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9096805930137635, "step": 7370 }, { "completion_length": 392.8, "completions/clipped_ratio": 0.0, "completions/max_length": 392.8, "completions/max_terminated_length": 392.8, "completions/mean_length": 99.04296875, "completions/mean_terminated_length": 99.04296875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006674365188683173, "frac_reward_zero_std": 0.81875, "grad_norm": 4.461828708648682, "kl": 1.148245435860008, "learning_rate": 4.97031746031746e-07, "loss": 0.0011, "num_tokens": 532132120.0, "reward": 0.378125, "reward_std": 0.14546077847480773, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9250446200370789, "step": 7375 }, { "completion_length": 303.4, "completions/clipped_ratio": 0.0, "completions/max_length": 303.4, "completions/max_terminated_length": 303.4, "completions/mean_length": 97.02734375, "completions/mean_terminated_length": 97.02734375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006678890182031432, "frac_reward_zero_std": 0.8875, "grad_norm": 2.6516265869140625, "kl": 0.5482529629021883, "learning_rate": 4.969920634920634e-07, "loss": 0.0005, "num_tokens": 532454051.0, "reward": 0.4171875, "reward_std": 0.10179378539323806, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8890284061431885, "step": 7380 }, { "completion_length": 469.6, "completions/clipped_ratio": 0.0, "completions/max_length": 469.6, "completions/max_terminated_length": 469.6, "completions/mean_length": 97.09296875, "completions/mean_terminated_length": 97.09296875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006683415175379692, "frac_reward_zero_std": 0.84375, "grad_norm": 4.570672035217285, "kl": 2.290869842050597, "learning_rate": 4.96952380952381e-07, "loss": 0.0023, "num_tokens": 532774826.0, "reward": 0.35, "reward_std": 0.13713769018650054, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9043581366539002, "step": 7385 }, { "completion_length": 314.6, "completions/clipped_ratio": 0.0, "completions/max_length": 314.6, "completions/max_terminated_length": 314.6, "completions/mean_length": 112.56640625, "completions/mean_terminated_length": 112.56640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006687940168727952, "frac_reward_zero_std": 0.875, "grad_norm": 5.205310344696045, "kl": 4.491787459049374, "learning_rate": 4.969126984126984e-07, "loss": 0.0045, "num_tokens": 533122375.0, "reward": 0.328125, "reward_std": 0.10931411385536194, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.926538598537445, "step": 7390 }, { "completion_length": 292.4, "completions/clipped_ratio": 0.0, "completions/max_length": 292.4, "completions/max_terminated_length": 292.4, "completions/mean_length": 100.27109375, "completions/mean_terminated_length": 100.27109375, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.006692465162076212, "frac_reward_zero_std": 0.8125, "grad_norm": 11.630128860473633, "kl": 8.865504455333575, "learning_rate": 4.968730158730158e-07, "loss": 0.0089, "num_tokens": 533449938.0, "reward": 0.2875, "reward_std": 0.17127580642700196, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9403066873550415, "step": 7395 }, { "completion_length": 312.8, "completions/clipped_ratio": 0.0, "completions/max_length": 312.8, "completions/max_terminated_length": 312.8, "completions/mean_length": 103.23671875, "completions/mean_terminated_length": 103.23671875, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.006696990155424472, "frac_reward_zero_std": 0.88125, "grad_norm": 4.056309700012207, "kl": 2.4467245800653474, "learning_rate": 4.968333333333334e-07, "loss": 0.0024, "num_tokens": 533781729.0, "reward": 0.2015625, "reward_std": 0.10489469766616821, "rewards/verify_chess_move/mean": 0.2015625, "rewards/verify_chess_move/std": 0.9637324810028076, "step": 7400 }, { "completion_length": 277.8, "completions/clipped_ratio": 0.0, "completions/max_length": 277.8, "completions/max_terminated_length": 277.8, "completions/mean_length": 92.99296875, "completions/mean_terminated_length": 92.99296875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006701515148772731, "frac_reward_zero_std": 0.8625, "grad_norm": 2.8125197887420654, "kl": 2.202616970287636, "learning_rate": 4.967936507936508e-07, "loss": 0.0022, "num_tokens": 534098000.0, "reward": 0.334375, "reward_std": 0.1181529477238655, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9326109528541565, "step": 7405 }, { "completion_length": 385.4, "completions/clipped_ratio": 0.0, "completions/max_length": 385.4, "completions/max_terminated_length": 385.4, "completions/mean_length": 102.9515625, "completions/mean_terminated_length": 102.9515625, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006706040142120991, "frac_reward_zero_std": 0.85, "grad_norm": 2.52008056640625, "kl": 2.116652868723031, "learning_rate": 4.967539682539683e-07, "loss": 0.0021, "num_tokens": 534429506.0, "reward": 0.296875, "reward_std": 0.12720492631196975, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9507975816726685, "step": 7410 }, { "completion_length": 371.2, "completions/clipped_ratio": 0.0, "completions/max_length": 371.2, "completions/max_terminated_length": 371.2, "completions/mean_length": 99.43359375, "completions/mean_terminated_length": 99.43359375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006710565135469251, "frac_reward_zero_std": 0.83125, "grad_norm": 10.603574752807617, "kl": 2.262101187510416, "learning_rate": 4.967142857142857e-07, "loss": 0.0023, "num_tokens": 534751589.0, "reward": 0.3671875, "reward_std": 0.14618869125843048, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9035361647605896, "step": 7415 }, { "completion_length": 308.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 92.45625, "completions/mean_terminated_length": 92.45625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00671509012881751, "frac_reward_zero_std": 0.81875, "grad_norm": 2.764862060546875, "kl": 3.138798226555809, "learning_rate": 4.966746031746032e-07, "loss": 0.0031, "num_tokens": 535066581.0, "reward": 0.290625, "reward_std": 0.1531897470355034, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9511386275291442, "step": 7420 }, { "completion_length": 272.6, "completions/clipped_ratio": 0.0, "completions/max_length": 272.6, "completions/max_terminated_length": 272.6, "completions/mean_length": 98.61640625, "completions/mean_terminated_length": 98.61640625, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.0067196151221657705, "frac_reward_zero_std": 0.88125, "grad_norm": 3.084203004837036, "kl": 0.78930849886965, "learning_rate": 4.966349206349206e-07, "loss": 0.0008, "num_tokens": 535392042.0, "reward": 0.28125, "reward_std": 0.10194860994815827, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9357998847961426, "step": 7425 }, { "completion_length": 430.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 430.8, "completions/max_terminated_length": 354.8, "completions/mean_length": 96.81484375, "completions/mean_terminated_length": 96.2962875366211, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006724140115514031, "frac_reward_zero_std": 0.8375, "grad_norm": 8.012799263000488, "kl": 1.708023265050724, "learning_rate": 4.965952380952381e-07, "loss": 0.0017, "num_tokens": 535712317.0, "reward": 0.41875, "reward_std": 0.1389869049191475, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8954608559608459, "step": 7430 }, { "completion_length": 327.6, "completions/clipped_ratio": 0.0, "completions/max_length": 327.6, "completions/max_terminated_length": 327.6, "completions/mean_length": 99.58125, "completions/mean_terminated_length": 99.58125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00672866510886229, "frac_reward_zero_std": 0.85, "grad_norm": 6.446340560913086, "kl": 2.247995237680152, "learning_rate": 4.965555555555555e-07, "loss": 0.0022, "num_tokens": 536038541.0, "reward": 0.2890625, "reward_std": 0.12746358662843704, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9546541094779968, "step": 7435 }, { "completion_length": 379.2, "completions/clipped_ratio": 0.0, "completions/max_length": 379.2, "completions/max_terminated_length": 379.2, "completions/mean_length": 98.290625, "completions/mean_terminated_length": 98.290625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00673319010221055, "frac_reward_zero_std": 0.86875, "grad_norm": 2.3188083171844482, "kl": 3.053168307826854, "learning_rate": 4.96515873015873e-07, "loss": 0.0031, "num_tokens": 536361217.0, "reward": 0.334375, "reward_std": 0.11741007566452026, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9199291944503785, "step": 7440 }, { "completion_length": 384.6, "completions/clipped_ratio": 0.0, "completions/max_length": 384.6, "completions/max_terminated_length": 384.6, "completions/mean_length": 97.884375, "completions/mean_terminated_length": 97.884375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.006737715095558809, "frac_reward_zero_std": 0.8375, "grad_norm": 4.3584065437316895, "kl": 1.8461975555866956, "learning_rate": 4.964761904761904e-07, "loss": 0.0018, "num_tokens": 536684125.0, "reward": 0.328125, "reward_std": 0.1390353586524725, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9334516167640686, "step": 7445 }, { "completion_length": 389.8, "completions/clipped_ratio": 0.0, "completions/max_length": 389.8, "completions/max_terminated_length": 389.8, "completions/mean_length": 94.475, "completions/mean_terminated_length": 94.475, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.006742240088907069, "frac_reward_zero_std": 0.875, "grad_norm": 1.8493967056274414, "kl": 0.6237842846894637, "learning_rate": 4.964365079365079e-07, "loss": 0.0006, "num_tokens": 537002181.0, "reward": 0.35625, "reward_std": 0.10158514752984046, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9306472539901733, "step": 7450 }, { "completion_length": 296.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 102.359375, "completions/mean_terminated_length": 102.359375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006746765082255329, "frac_reward_zero_std": 0.8, "grad_norm": 2.476318120956421, "kl": 1.3659930949099361, "learning_rate": 4.963968253968253e-07, "loss": 0.0014, "num_tokens": 537332681.0, "reward": 0.2875, "reward_std": 0.17076005637645722, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9581852316856384, "step": 7455 }, { "completion_length": 428.4, "completions/clipped_ratio": 0.0, "completions/max_length": 428.4, "completions/max_terminated_length": 428.4, "completions/mean_length": 104.29296875, "completions/mean_terminated_length": 104.29296875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0067512900756035886, "frac_reward_zero_std": 0.83125, "grad_norm": 7.370750904083252, "kl": 1.7912461598869414, "learning_rate": 4.963571428571428e-07, "loss": 0.0018, "num_tokens": 537666464.0, "reward": 0.28125, "reward_std": 0.1400378704071045, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9534305572509766, "step": 7460 }, { "completion_length": 308.2, "completions/clipped_ratio": 0.0, "completions/max_length": 308.2, "completions/max_terminated_length": 308.2, "completions/mean_length": 101.91875, "completions/mean_terminated_length": 101.91875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006755815068951849, "frac_reward_zero_std": 0.85, "grad_norm": 2.4525740146636963, "kl": 3.3652695507742463, "learning_rate": 4.963174603174603e-07, "loss": 0.0034, "num_tokens": 537999272.0, "reward": 0.1890625, "reward_std": 0.12882956862449646, "rewards/verify_chess_move/mean": 0.1890625, "rewards/verify_chess_move/std": 0.9777634143829346, "step": 7465 }, { "completion_length": 401.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 401.2, "completions/max_terminated_length": 379.4, "completions/mean_length": 100.42890625, "completions/mean_terminated_length": 99.89801330566407, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006760340062300109, "frac_reward_zero_std": 0.83125, "grad_norm": 2.374614715576172, "kl": 3.705646094074473, "learning_rate": 4.962777777777777e-07, "loss": 0.0037, "num_tokens": 538324997.0, "reward": 0.271875, "reward_std": 0.15212636590003967, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9472884416580201, "step": 7470 }, { "completion_length": 324.6, "completions/clipped_ratio": 0.0, "completions/max_length": 324.6, "completions/max_terminated_length": 324.6, "completions/mean_length": 96.14140625, "completions/mean_terminated_length": 96.14140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.006764865055648368, "frac_reward_zero_std": 0.85, "grad_norm": 1.9894824028015137, "kl": 3.6938740202225744, "learning_rate": 4.962380952380953e-07, "loss": 0.0037, "num_tokens": 538643970.0, "reward": 0.3453125, "reward_std": 0.12404570132493972, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9249545216560364, "step": 7475 }, { "completion_length": 321.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 97.60625, "completions/mean_terminated_length": 97.60625, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006769390048996628, "frac_reward_zero_std": 0.86875, "grad_norm": 5.847820281982422, "kl": 4.345666888565757, "learning_rate": 4.961984126984127e-07, "loss": 0.0043, "num_tokens": 538966234.0, "reward": 0.4, "reward_std": 0.10784234553575515, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.9134204149246216, "step": 7480 }, { "completion_length": 392.6, "completions/clipped_ratio": 0.0, "completions/max_length": 392.6, "completions/max_terminated_length": 392.6, "completions/mean_length": 96.78515625, "completions/mean_terminated_length": 96.78515625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006773915042344888, "frac_reward_zero_std": 0.8375, "grad_norm": 6.037591457366943, "kl": 7.993527969275601, "learning_rate": 4.961587301587301e-07, "loss": 0.008, "num_tokens": 539287535.0, "reward": 0.325, "reward_std": 0.1445012331008911, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9468441605567932, "step": 7485 }, { "completion_length": 370.4, "completions/clipped_ratio": 0.0, "completions/max_length": 370.4, "completions/max_terminated_length": 370.4, "completions/mean_length": 98.78046875, "completions/mean_terminated_length": 98.78046875, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006778440035693147, "frac_reward_zero_std": 0.8625, "grad_norm": 7.413400173187256, "kl": 8.479765784088523, "learning_rate": 4.961190476190476e-07, "loss": 0.0085, "num_tokens": 539611542.0, "reward": 0.315625, "reward_std": 0.11720933765172958, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9469991564750672, "step": 7490 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 109.82265625, "completions/mean_terminated_length": 109.82265625, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.0067829650290414075, "frac_reward_zero_std": 0.8125, "grad_norm": 1.4848709106445312, "kl": 1.9745955982129089, "learning_rate": 4.960793650793651e-07, "loss": 0.002, "num_tokens": 539954603.0, "reward": 0.2125, "reward_std": 0.1652906596660614, "rewards/verify_chess_move/mean": 0.2125, "rewards/verify_chess_move/std": 0.967277729511261, "step": 7495 }, { "completion_length": 351.8, "completions/clipped_ratio": 0.0, "completions/max_length": 351.8, "completions/max_terminated_length": 351.8, "completions/mean_length": 97.75, "completions/mean_terminated_length": 97.75, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.006787490022389667, "frac_reward_zero_std": 0.8125, "grad_norm": 6.759518146514893, "kl": 1.875608288310468, "learning_rate": 4.960396825396825e-07, "loss": 0.0019, "num_tokens": 540278699.0, "reward": 0.375, "reward_std": 0.17033219635486602, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.925626015663147, "step": 7500 }, { "completion_length": 348.6, "completions/clipped_ratio": 0.0, "completions/max_length": 348.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 105.1203125, "completions/mean_terminated_length": 105.1203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006792015015737927, "frac_reward_zero_std": 0.83125, "grad_norm": 2.212960720062256, "kl": 2.40119041048456, "learning_rate": 4.96e-07, "loss": 0.0024, "num_tokens": 540612453.0, "reward": 0.4, "reward_std": 0.14324260652065277, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.9137136220932007, "step": 7505 }, { "completion_length": 310.8, "completions/clipped_ratio": 0.0, "completions/max_length": 310.8, "completions/max_terminated_length": 310.8, "completions/mean_length": 98.4984375, "completions/mean_terminated_length": 98.4984375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006796540009086187, "frac_reward_zero_std": 0.8625, "grad_norm": 5.748923301696777, "kl": 1.5781217711046338, "learning_rate": 4.959603174603174e-07, "loss": 0.0016, "num_tokens": 540937699.0, "reward": 0.2421875, "reward_std": 0.12293583452701569, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9541123509407043, "step": 7510 }, { "completion_length": 412.4, "completions/clipped_ratio": 0.0, "completions/max_length": 412.4, "completions/max_terminated_length": 412.4, "completions/mean_length": 103.6359375, "completions/mean_terminated_length": 103.6359375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.006801065002434446, "frac_reward_zero_std": 0.825, "grad_norm": 3.470116138458252, "kl": 0.7624136743484996, "learning_rate": 4.959206349206349e-07, "loss": 0.0008, "num_tokens": 541269433.0, "reward": 0.25, "reward_std": 0.15402403622865676, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9656516194343567, "step": 7515 }, { "completion_length": 290.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 93.421875, "completions/mean_terminated_length": 93.421875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.006805589995782706, "frac_reward_zero_std": 0.85625, "grad_norm": 2.4967823028564453, "kl": 1.267384647205472, "learning_rate": 4.958809523809524e-07, "loss": 0.0013, "num_tokens": 541586101.0, "reward": 0.2203125, "reward_std": 0.11894329637289047, "rewards/verify_chess_move/mean": 0.2203125, "rewards/verify_chess_move/std": 0.9728073596954345, "step": 7520 }, { "completion_length": 361.8, "completions/clipped_ratio": 0.0, "completions/max_length": 361.8, "completions/max_terminated_length": 361.8, "completions/mean_length": 104.9375, "completions/mean_terminated_length": 104.9375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.006810114989130966, "frac_reward_zero_std": 0.85, "grad_norm": 3.711787462234497, "kl": 1.2006006181472912, "learning_rate": 4.958412698412698e-07, "loss": 0.0012, "num_tokens": 541920701.0, "reward": 0.3296875, "reward_std": 0.13608770817518234, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9390077114105224, "step": 7525 }, { "completion_length": 362.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 95.3609375, "completions/mean_terminated_length": 95.3609375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0068146399824792255, "frac_reward_zero_std": 0.85, "grad_norm": 5.171976089477539, "kl": 3.9094969094148837, "learning_rate": 4.958015873015873e-07, "loss": 0.0039, "num_tokens": 542239723.0, "reward": 0.3890625, "reward_std": 0.12472967058420181, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.8747772574424744, "step": 7530 }, { "completion_length": 292.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 103.2890625, "completions/mean_terminated_length": 103.2890625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006819164975827486, "frac_reward_zero_std": 0.8375, "grad_norm": 7.60451602935791, "kl": 3.5682331847026942, "learning_rate": 4.957619047619047e-07, "loss": 0.0036, "num_tokens": 542573565.0, "reward": 0.25625, "reward_std": 0.14014169424772263, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9590557098388672, "step": 7535 }, { "completion_length": 311.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 91.7921875, "completions/mean_terminated_length": 91.7921875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.006823689969175746, "frac_reward_zero_std": 0.88125, "grad_norm": 4.163777828216553, "kl": 3.4439307479187846, "learning_rate": 4.957222222222222e-07, "loss": 0.0034, "num_tokens": 542886651.0, "reward": 0.3140625, "reward_std": 0.09832052364945412, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9352806329727172, "step": 7540 }, { "completion_length": 311.4, "completions/clipped_ratio": 0.0, "completions/max_length": 311.4, "completions/max_terminated_length": 311.4, "completions/mean_length": 93.82109375, "completions/mean_terminated_length": 93.82109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.006828214962524005, "frac_reward_zero_std": 0.8625, "grad_norm": 1.0506300926208496, "kl": 4.126661968370899, "learning_rate": 4.956825396825396e-07, "loss": 0.0041, "num_tokens": 543202214.0, "reward": 0.4078125, "reward_std": 0.11746995449066162, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.8985438585281372, "step": 7545 }, { "completion_length": 450.6, "completions/clipped_ratio": 0.0, "completions/max_length": 450.6, "completions/max_terminated_length": 450.6, "completions/mean_length": 101.58046875, "completions/mean_terminated_length": 101.58046875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.006832739955872265, "frac_reward_zero_std": 0.85, "grad_norm": 4.836194038391113, "kl": 4.270417873724364, "learning_rate": 4.956428571428572e-07, "loss": 0.0043, "num_tokens": 543530341.0, "reward": 0.3953125, "reward_std": 0.13087951242923737, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.8962432503700256, "step": 7550 }, { "completion_length": 462.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 97.65625, "completions/mean_terminated_length": 97.65625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006837264949220524, "frac_reward_zero_std": 0.775, "grad_norm": 9.653939247131348, "kl": 2.438899366790429, "learning_rate": 4.956031746031745e-07, "loss": 0.0024, "num_tokens": 543850989.0, "reward": 0.3921875, "reward_std": 0.1963788628578186, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9105241894721985, "step": 7555 }, { "completion_length": 419.4, "completions/clipped_ratio": 0.0, "completions/max_length": 419.4, "completions/max_terminated_length": 419.4, "completions/mean_length": 101.46875, "completions/mean_terminated_length": 101.46875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006841789942568784, "frac_reward_zero_std": 0.825, "grad_norm": 5.2237701416015625, "kl": 1.9336797703406774, "learning_rate": 4.95563492063492e-07, "loss": 0.0019, "num_tokens": 544178925.0, "reward": 0.215625, "reward_std": 0.15381284952163696, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9628349900245666, "step": 7560 }, { "completion_length": 416.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 416.4, "completions/max_terminated_length": 360.2, "completions/mean_length": 95.1546875, "completions/mean_terminated_length": 94.62487182617187, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006846314935917044, "frac_reward_zero_std": 0.85625, "grad_norm": 4.595648765563965, "kl": 1.0959914718579966, "learning_rate": 4.955238095238095e-07, "loss": 0.0011, "num_tokens": 544498691.0, "reward": 0.3171875, "reward_std": 0.12099324017763138, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9253735780715943, "step": 7565 }, { "completion_length": 375.8, "completions/clipped_ratio": 0.0, "completions/max_length": 375.8, "completions/max_terminated_length": 375.8, "completions/mean_length": 102.62109375, "completions/mean_terminated_length": 102.62109375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006850839929265304, "frac_reward_zero_std": 0.83125, "grad_norm": 1.1901671886444092, "kl": 1.0447707426152193, "learning_rate": 4.954841269841269e-07, "loss": 0.001, "num_tokens": 544829398.0, "reward": 0.3515625, "reward_std": 0.14887416064739228, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9366536259651184, "step": 7570 }, { "completion_length": 450.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 450.4, "completions/max_terminated_length": 376.4, "completions/mean_length": 98.5859375, "completions/mean_terminated_length": 98.05512390136718, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006855364922613564, "frac_reward_zero_std": 0.825, "grad_norm": 7.644351959228516, "kl": 1.678296957956627, "learning_rate": 4.954444444444445e-07, "loss": 0.0017, "num_tokens": 545151788.0, "reward": 0.26875, "reward_std": 0.14740337133407594, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9485334515571594, "step": 7575 }, { "completion_length": 393.8, "completions/clipped_ratio": 0.0, "completions/max_length": 393.8, "completions/max_terminated_length": 393.8, "completions/mean_length": 95.02109375, "completions/mean_terminated_length": 95.02109375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006859889915961824, "frac_reward_zero_std": 0.85625, "grad_norm": 4.207489490509033, "kl": 1.5007019393378869, "learning_rate": 4.954047619047619e-07, "loss": 0.0015, "num_tokens": 545468535.0, "reward": 0.3265625, "reward_std": 0.1300901487469673, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9465006113052368, "step": 7580 }, { "completion_length": 447.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.6, "completions/max_terminated_length": 414.8, "completions/mean_length": 95.06328125, "completions/mean_terminated_length": 94.5444564819336, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006864414909310083, "frac_reward_zero_std": 0.89375, "grad_norm": 1.6209057569503784, "kl": 1.581475772592239, "learning_rate": 4.953650793650794e-07, "loss": 0.0016, "num_tokens": 545786896.0, "reward": 0.253125, "reward_std": 0.0896938532590866, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9515854358673096, "step": 7585 }, { "completion_length": 280.2, "completions/clipped_ratio": 0.0, "completions/max_length": 280.2, "completions/max_terminated_length": 280.2, "completions/mean_length": 91.8875, "completions/mean_terminated_length": 91.8875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006868939902658343, "frac_reward_zero_std": 0.8125, "grad_norm": 5.057344436645508, "kl": 2.4397493130411023, "learning_rate": 4.953253968253968e-07, "loss": 0.0024, "num_tokens": 546100112.0, "reward": 0.2546875, "reward_std": 0.16029462665319444, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9611650109291077, "step": 7590 }, { "completion_length": 441.8, "completions/clipped_ratio": 0.0, "completions/max_length": 441.8, "completions/max_terminated_length": 441.8, "completions/mean_length": 103.6140625, "completions/mean_terminated_length": 103.6140625, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006873464896006603, "frac_reward_zero_std": 0.85, "grad_norm": 3.480853796005249, "kl": 2.811141932848841, "learning_rate": 4.952857142857143e-07, "loss": 0.0028, "num_tokens": 546432634.0, "reward": 0.2703125, "reward_std": 0.12835874557495117, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9516154766082764, "step": 7595 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 106.934375, "completions/mean_terminated_length": 106.934375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.0068779898893548625, "frac_reward_zero_std": 0.81875, "grad_norm": 4.9334917068481445, "kl": 6.518093497725204, "learning_rate": 4.952460317460317e-07, "loss": 0.0065, "num_tokens": 546770766.0, "reward": 0.2640625, "reward_std": 0.15976293832063676, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9657370448112488, "step": 7600 }, { "completion_length": 314.8, "completions/clipped_ratio": 0.0, "completions/max_length": 314.8, "completions/max_terminated_length": 314.8, "completions/mean_length": 89.99296875, "completions/mean_terminated_length": 89.99296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006882514882703123, "frac_reward_zero_std": 0.84375, "grad_norm": 6.9523749351501465, "kl": 4.190449021500536, "learning_rate": 4.952063492063492e-07, "loss": 0.0042, "num_tokens": 547081485.0, "reward": 0.353125, "reward_std": 0.1452890306711197, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9320608258247376, "step": 7605 }, { "completion_length": 340.6, "completions/clipped_ratio": 0.0, "completions/max_length": 340.6, "completions/max_terminated_length": 340.6, "completions/mean_length": 99.38515625, "completions/mean_terminated_length": 99.38515625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.006887039876051382, "frac_reward_zero_std": 0.9, "grad_norm": 3.287599563598633, "kl": 1.5553997531998902, "learning_rate": 4.951666666666666e-07, "loss": 0.0016, "num_tokens": 547406386.0, "reward": 0.44375, "reward_std": 0.07912361472845078, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8828248500823974, "step": 7610 }, { "completion_length": 311.4, "completions/clipped_ratio": 0.0, "completions/max_length": 311.4, "completions/max_terminated_length": 311.4, "completions/mean_length": 92.21796875, "completions/mean_terminated_length": 92.21796875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.006891564869399642, "frac_reward_zero_std": 0.8375, "grad_norm": 0.9012761116027832, "kl": 1.7996300710132345, "learning_rate": 4.951269841269841e-07, "loss": 0.0018, "num_tokens": 547719305.0, "reward": 0.3703125, "reward_std": 0.1363024242222309, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.918878173828125, "step": 7615 }, { "completion_length": 311.8, "completions/clipped_ratio": 0.0, "completions/max_length": 311.8, "completions/max_terminated_length": 311.8, "completions/mean_length": 98.4359375, "completions/mean_terminated_length": 98.4359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006896089862747902, "frac_reward_zero_std": 0.875, "grad_norm": 2.9583539962768555, "kl": 1.2081543962238357, "learning_rate": 4.950873015873015e-07, "loss": 0.0012, "num_tokens": 548042223.0, "reward": 0.3921875, "reward_std": 0.10841797888278962, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9116214036941528, "step": 7620 }, { "completion_length": 435.8, "completions/clipped_ratio": 0.0, "completions/max_length": 435.8, "completions/max_terminated_length": 435.8, "completions/mean_length": 99.2390625, "completions/mean_terminated_length": 99.2390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006900614856096161, "frac_reward_zero_std": 0.875, "grad_norm": 5.265837669372559, "kl": 2.026770946686156, "learning_rate": 4.95047619047619e-07, "loss": 0.002, "num_tokens": 548368569.0, "reward": 0.375, "reward_std": 0.10683983191847801, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9143747925758362, "step": 7625 }, { "completion_length": 309.4, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/max_terminated_length": 309.4, "completions/mean_length": 105.17578125, "completions/mean_terminated_length": 105.17578125, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.006905139849444421, "frac_reward_zero_std": 0.84375, "grad_norm": 3.6697959899902344, "kl": 0.6147522777318954, "learning_rate": 4.950079365079364e-07, "loss": 0.0006, "num_tokens": 548702362.0, "reward": 0.4625, "reward_std": 0.13277815729379655, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.8851272940635682, "step": 7630 }, { "completion_length": 316.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 98.7390625, "completions/mean_terminated_length": 98.7390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006909664842792681, "frac_reward_zero_std": 0.8375, "grad_norm": 3.027811050415039, "kl": 1.7526365864789113, "learning_rate": 4.949682539682539e-07, "loss": 0.0018, "num_tokens": 549027988.0, "reward": 0.290625, "reward_std": 0.14377076774835587, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9497276425361634, "step": 7635 }, { "completion_length": 439.4, "completions/clipped_ratio": 0.0, "completions/max_length": 439.4, "completions/max_terminated_length": 439.4, "completions/mean_length": 100.13359375, "completions/mean_terminated_length": 100.13359375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006914189836140941, "frac_reward_zero_std": 0.83125, "grad_norm": 2.749486207962036, "kl": 2.890477418783121, "learning_rate": 4.949285714285715e-07, "loss": 0.0029, "num_tokens": 549354735.0, "reward": 0.3515625, "reward_std": 0.14003688842058182, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9158775806427002, "step": 7640 }, { "completion_length": 371.2, "completions/clipped_ratio": 0.0, "completions/max_length": 371.2, "completions/max_terminated_length": 371.2, "completions/mean_length": 89.084375, "completions/mean_terminated_length": 89.084375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006918714829489201, "frac_reward_zero_std": 0.9, "grad_norm": 5.584614276885986, "kl": 3.5580172664718703, "learning_rate": 4.948888888888888e-07, "loss": 0.0036, "num_tokens": 549663907.0, "reward": 0.3328125, "reward_std": 0.08433083072304726, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.926667308807373, "step": 7645 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/max_terminated_length": 358.4, "completions/mean_length": 96.94765625, "completions/mean_terminated_length": 96.94765625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.006923239822837461, "frac_reward_zero_std": 0.825, "grad_norm": 6.5272111892700195, "kl": 3.0735366043052634, "learning_rate": 4.948492063492064e-07, "loss": 0.0031, "num_tokens": 549988184.0, "reward": 0.25625, "reward_std": 0.1497129514813423, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9604208469390869, "step": 7650 }, { "completion_length": 499.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 499.8, "completions/max_terminated_length": 423.8, "completions/mean_length": 116.35, "completions/mean_terminated_length": 115.84945678710938, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00692776481618572, "frac_reward_zero_std": 0.85, "grad_norm": 1.538333773612976, "kl": 1.4203155405935832, "learning_rate": 4.948095238095238e-07, "loss": 0.0014, "num_tokens": 550341160.0, "reward": 0.2125, "reward_std": 0.13194034174084662, "rewards/verify_chess_move/mean": 0.2125, "rewards/verify_chess_move/std": 0.955571460723877, "step": 7655 }, { "completion_length": 354.8, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/max_terminated_length": 354.8, "completions/mean_length": 101.646875, "completions/mean_terminated_length": 101.646875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00693228980953398, "frac_reward_zero_std": 0.85625, "grad_norm": 4.954350471496582, "kl": 2.3085006890352817, "learning_rate": 4.947698412698412e-07, "loss": 0.0023, "num_tokens": 550670668.0, "reward": 0.228125, "reward_std": 0.1209942176938057, "rewards/verify_chess_move/mean": 0.228125, "rewards/verify_chess_move/std": 0.9549825310707092, "step": 7660 }, { "completion_length": 302.4, "completions/clipped_ratio": 0.0, "completions/max_length": 302.4, "completions/max_terminated_length": 302.4, "completions/mean_length": 97.9765625, "completions/mean_terminated_length": 97.9765625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006936814802882239, "frac_reward_zero_std": 0.8375, "grad_norm": 4.390798091888428, "kl": 1.2251328689744696, "learning_rate": 4.947301587301587e-07, "loss": 0.0012, "num_tokens": 550994054.0, "reward": 0.3640625, "reward_std": 0.1474008232355118, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.92049320936203, "step": 7665 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 92.17734375, "completions/mean_terminated_length": 92.17734375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.006941339796230499, "frac_reward_zero_std": 0.825, "grad_norm": 5.378410339355469, "kl": 2.7089369647204875, "learning_rate": 4.946904761904762e-07, "loss": 0.0027, "num_tokens": 551306521.0, "reward": 0.290625, "reward_std": 0.14761455357074738, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9131340146064758, "step": 7670 }, { "completion_length": 331.4, "completions/clipped_ratio": 0.0, "completions/max_length": 331.4, "completions/max_terminated_length": 331.4, "completions/mean_length": 94.96875, "completions/mean_terminated_length": 94.96875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0069458647895787595, "frac_reward_zero_std": 0.84375, "grad_norm": 3.9131546020507812, "kl": 2.669170401478186, "learning_rate": 4.946507936507936e-07, "loss": 0.0027, "num_tokens": 551624993.0, "reward": 0.409375, "reward_std": 0.13819754272699356, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.9135010123252869, "step": 7675 }, { "completion_length": 396.2, "completions/clipped_ratio": 0.0, "completions/max_length": 396.2, "completions/max_terminated_length": 396.2, "completions/mean_length": 105.5921875, "completions/mean_terminated_length": 105.5921875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006950389782927019, "frac_reward_zero_std": 0.83125, "grad_norm": 3.8735713958740234, "kl": 1.3135967045091093, "learning_rate": 4.946111111111111e-07, "loss": 0.0013, "num_tokens": 551961271.0, "reward": 0.3171875, "reward_std": 0.14366792142391205, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9066415905952454, "step": 7680 }, { "completion_length": 404.8, "completions/clipped_ratio": 0.0, "completions/max_length": 404.8, "completions/max_terminated_length": 404.8, "completions/mean_length": 101.71328125, "completions/mean_terminated_length": 101.71328125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.006954914776275279, "frac_reward_zero_std": 0.8625, "grad_norm": 2.3169147968292236, "kl": 1.551855244464241, "learning_rate": 4.945714285714285e-07, "loss": 0.0016, "num_tokens": 552290752.0, "reward": 0.2984375, "reward_std": 0.11473604664206505, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9550481915473938, "step": 7685 }, { "completion_length": 390.4, "completions/clipped_ratio": 0.0, "completions/max_length": 390.4, "completions/max_terminated_length": 390.4, "completions/mean_length": 103.6828125, "completions/mean_terminated_length": 103.6828125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.006959439769623539, "frac_reward_zero_std": 0.85, "grad_norm": 3.5561180114746094, "kl": 1.2354202787391841, "learning_rate": 4.94531746031746e-07, "loss": 0.0012, "num_tokens": 552623586.0, "reward": 0.2734375, "reward_std": 0.13203626573085786, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9586915731430053, "step": 7690 }, { "completion_length": 469.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 97.8875, "completions/mean_terminated_length": 97.8875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.006963964762971798, "frac_reward_zero_std": 0.85, "grad_norm": 3.5968308448791504, "kl": 2.525790180149488, "learning_rate": 4.944920634920635e-07, "loss": 0.0025, "num_tokens": 552946954.0, "reward": 0.2328125, "reward_std": 0.13040868937969208, "rewards/verify_chess_move/mean": 0.2328125, "rewards/verify_chess_move/std": 0.956888222694397, "step": 7695 }, { "completion_length": 403.4, "completions/clipped_ratio": 0.0, "completions/max_length": 403.4, "completions/max_terminated_length": 403.4, "completions/mean_length": 97.4796875, "completions/mean_terminated_length": 97.4796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.006968489756320058, "frac_reward_zero_std": 0.8625, "grad_norm": 10.074264526367188, "kl": 3.9284178604371847, "learning_rate": 4.944523809523809e-07, "loss": 0.0039, "num_tokens": 553270488.0, "reward": 0.3140625, "reward_std": 0.11699913516640663, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9431507110595703, "step": 7700 }, { "completion_length": 336.4, "completions/clipped_ratio": 0.0, "completions/max_length": 336.4, "completions/max_terminated_length": 336.4, "completions/mean_length": 99.36875, "completions/mean_terminated_length": 99.36875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.006973014749668318, "frac_reward_zero_std": 0.76875, "grad_norm": 4.532241344451904, "kl": 5.3336022217758, "learning_rate": 4.944126984126984e-07, "loss": 0.0053, "num_tokens": 553597232.0, "reward": 0.2984375, "reward_std": 0.19106683731079102, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9418040990829468, "step": 7705 }, { "completion_length": 284.2, "completions/clipped_ratio": 0.0, "completions/max_length": 284.2, "completions/max_terminated_length": 284.2, "completions/mean_length": 94.56796875, "completions/mean_terminated_length": 94.56796875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.006977539743016578, "frac_reward_zero_std": 0.89375, "grad_norm": 4.576652526855469, "kl": 2.5344209607224912, "learning_rate": 4.943730158730158e-07, "loss": 0.0025, "num_tokens": 553914935.0, "reward": 0.3609375, "reward_std": 0.08832689598202706, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9323623657226563, "step": 7710 }, { "completion_length": 320.2, "completions/clipped_ratio": 0.0, "completions/max_length": 320.2, "completions/max_terminated_length": 320.2, "completions/mean_length": 97.73203125, "completions/mean_terminated_length": 97.73203125, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.006982064736364838, "frac_reward_zero_std": 0.85625, "grad_norm": 2.96553111076355, "kl": 2.024826434161514, "learning_rate": 4.943333333333333e-07, "loss": 0.002, "num_tokens": 554238056.0, "reward": 0.403125, "reward_std": 0.12120540738105774, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.8969631791114807, "step": 7715 }, { "completion_length": 424.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 102.6234375, "completions/mean_terminated_length": 102.6234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.006986589729713098, "frac_reward_zero_std": 0.88125, "grad_norm": 5.295753002166748, "kl": 3.9180555688450114, "learning_rate": 4.942936507936507e-07, "loss": 0.0039, "num_tokens": 554570038.0, "reward": 0.3484375, "reward_std": 0.10216077715158463, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9324350833892823, "step": 7720 }, { "completion_length": 381.4, "completions/clipped_ratio": 0.0, "completions/max_length": 381.4, "completions/max_terminated_length": 381.4, "completions/mean_length": 99.03515625, "completions/mean_terminated_length": 99.03515625, "completions/min_length": 33.2, "completions/min_terminated_length": 33.2, "epoch": 0.006991114723061357, "frac_reward_zero_std": 0.85, "grad_norm": 4.067509174346924, "kl": 3.9678340250626207, "learning_rate": 4.942539682539683e-07, "loss": 0.004, "num_tokens": 554894635.0, "reward": 0.3359375, "reward_std": 0.12609564810991286, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9384116888046264, "step": 7725 }, { "completion_length": 313.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 95.72734375, "completions/mean_terminated_length": 95.72734375, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.006995639716409617, "frac_reward_zero_std": 0.8125, "grad_norm": 6.455177307128906, "kl": 3.521128857997246, "learning_rate": 4.942142857142857e-07, "loss": 0.0035, "num_tokens": 555215686.0, "reward": 0.3203125, "reward_std": 0.1630750373005867, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9368307590484619, "step": 7730 }, { "completion_length": 391.4, "completions/clipped_ratio": 0.0, "completions/max_length": 391.4, "completions/max_terminated_length": 391.4, "completions/mean_length": 95.67109375, "completions/mean_terminated_length": 95.67109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007000164709757876, "frac_reward_zero_std": 0.9, "grad_norm": 4.084856986999512, "kl": 1.3285147417802363, "learning_rate": 4.941746031746031e-07, "loss": 0.0013, "num_tokens": 555537345.0, "reward": 0.334375, "reward_std": 0.0895864948630333, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.928240442276001, "step": 7735 }, { "completion_length": 338.6, "completions/clipped_ratio": 0.0, "completions/max_length": 338.6, "completions/max_terminated_length": 338.6, "completions/mean_length": 93.05234375, "completions/mean_terminated_length": 93.05234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007004689703106136, "frac_reward_zero_std": 0.86875, "grad_norm": 4.043066024780273, "kl": 1.1090173396049068, "learning_rate": 4.941349206349206e-07, "loss": 0.0011, "num_tokens": 555852924.0, "reward": 0.35625, "reward_std": 0.10989229232072831, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9166067004203796, "step": 7740 }, { "completion_length": 304.6, "completions/clipped_ratio": 0.0, "completions/max_length": 304.6, "completions/max_terminated_length": 304.6, "completions/mean_length": 88.484375, "completions/mean_terminated_length": 88.484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0070092146964543965, "frac_reward_zero_std": 0.825, "grad_norm": 3.8357386589050293, "kl": 1.19756141921971, "learning_rate": 4.940952380952381e-07, "loss": 0.0012, "num_tokens": 556161768.0, "reward": 0.4203125, "reward_std": 0.14582326561212539, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8999307870864868, "step": 7745 }, { "completion_length": 415.2, "completions/clipped_ratio": 0.0, "completions/max_length": 415.2, "completions/max_terminated_length": 415.2, "completions/mean_length": 97.571875, "completions/mean_terminated_length": 97.571875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007013739689802656, "frac_reward_zero_std": 0.88125, "grad_norm": 6.81549072265625, "kl": 1.63810720940819, "learning_rate": 4.940555555555556e-07, "loss": 0.0016, "num_tokens": 556484228.0, "reward": 0.4203125, "reward_std": 0.10216077864170074, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8989062070846557, "step": 7750 }, { "completion_length": 381.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 381.0, "completions/max_terminated_length": 303.8, "completions/mean_length": 91.29296875, "completions/mean_terminated_length": 90.7668243408203, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007018264683150916, "frac_reward_zero_std": 0.89375, "grad_norm": 2.659090518951416, "kl": 3.306662959326059, "learning_rate": 4.94015873015873e-07, "loss": 0.0033, "num_tokens": 556798899.0, "reward": 0.25, "reward_std": 0.08832591474056244, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.958596670627594, "step": 7755 }, { "completion_length": 375.2, "completions/clipped_ratio": 0.0, "completions/max_length": 375.2, "completions/max_terminated_length": 375.2, "completions/mean_length": 100.05078125, "completions/mean_terminated_length": 100.05078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007022789676499176, "frac_reward_zero_std": 0.83125, "grad_norm": 4.695194721221924, "kl": 2.506819928192999, "learning_rate": 4.939761904761905e-07, "loss": 0.0025, "num_tokens": 557124780.0, "reward": 0.40625, "reward_std": 0.14388008862733842, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9076500535011292, "step": 7760 }, { "completion_length": 379.2, "completions/clipped_ratio": 0.0, "completions/max_length": 379.2, "completions/max_terminated_length": 379.2, "completions/mean_length": 100.20234375, "completions/mean_terminated_length": 100.20234375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.007027314669847435, "frac_reward_zero_std": 0.85, "grad_norm": 4.2711076736450195, "kl": 2.279124827729538, "learning_rate": 4.939365079365079e-07, "loss": 0.0023, "num_tokens": 557453759.0, "reward": 0.25, "reward_std": 0.1385610058903694, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9657190442085266, "step": 7765 }, { "completion_length": 407.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 407.2, "completions/max_terminated_length": 326.0, "completions/mean_length": 103.77734375, "completions/mean_terminated_length": 103.2593765258789, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.007031839663195695, "frac_reward_zero_std": 0.80625, "grad_norm": 2.8772835731506348, "kl": 1.7865031262859703, "learning_rate": 4.938968253968254e-07, "loss": 0.0018, "num_tokens": 557786754.0, "reward": 0.1984375, "reward_std": 0.16744697988033294, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.978208041191101, "step": 7770 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 100.7515625, "completions/mean_terminated_length": 100.7515625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007036364656543955, "frac_reward_zero_std": 0.8125, "grad_norm": 5.677526950836182, "kl": 4.255134635162539, "learning_rate": 4.938571428571428e-07, "loss": 0.0043, "num_tokens": 558113708.0, "reward": 0.3953125, "reward_std": 0.17101714313030242, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9123362064361572, "step": 7775 }, { "completion_length": 328.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 101.76015625, "completions/mean_terminated_length": 101.76015625, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.0070408896498922145, "frac_reward_zero_std": 0.825, "grad_norm": 3.7880678176879883, "kl": 3.4768148028990256, "learning_rate": 4.938174603174603e-07, "loss": 0.0035, "num_tokens": 558443953.0, "reward": 0.2515625, "reward_std": 0.1573944494128227, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9650084018707276, "step": 7780 }, { "completion_length": 394.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 394.2, "completions/max_terminated_length": 322.4, "completions/mean_length": 94.03125, "completions/mean_terminated_length": 93.50066223144532, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007045414643240475, "frac_reward_zero_std": 0.825, "grad_norm": 5.220016002655029, "kl": 11.951720768492669, "learning_rate": 4.937777777777777e-07, "loss": 0.012, "num_tokens": 558762137.0, "reward": 0.228125, "reward_std": 0.15466151535511016, "rewards/verify_chess_move/mean": 0.228125, "rewards/verify_chess_move/std": 0.9692237138748169, "step": 7785 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 95.7703125, "completions/mean_terminated_length": 95.7703125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007049939636588734, "frac_reward_zero_std": 0.8875, "grad_norm": 3.833425521850586, "kl": 10.950811097351835, "learning_rate": 4.937380952380952e-07, "loss": 0.011, "num_tokens": 559083995.0, "reward": 0.4015625, "reward_std": 0.09795352667570115, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.8863123059272766, "step": 7790 }, { "completion_length": 300.8, "completions/clipped_ratio": 0.0, "completions/max_length": 300.8, "completions/max_terminated_length": 300.8, "completions/mean_length": 95.925, "completions/mean_terminated_length": 95.925, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.007054464629936994, "frac_reward_zero_std": 0.88125, "grad_norm": 3.25846266746521, "kl": 4.4723561983555555, "learning_rate": 4.936984126984126e-07, "loss": 0.0045, "num_tokens": 559405203.0, "reward": 0.3125, "reward_std": 0.10326809883117676, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9400419116020202, "step": 7795 }, { "completion_length": 455.6, "completions/clipped_ratio": 0.0, "completions/max_length": 455.6, "completions/max_terminated_length": 455.6, "completions/mean_length": 99.04375, "completions/mean_terminated_length": 99.04375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.007058989623285254, "frac_reward_zero_std": 0.825, "grad_norm": 4.054881572723389, "kl": 5.290298486873508, "learning_rate": 4.936587301587302e-07, "loss": 0.0053, "num_tokens": 559730227.0, "reward": 0.328125, "reward_std": 0.1503949612379074, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9355726480484009, "step": 7800 }, { "completion_length": 529.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 529.4, "completions/max_terminated_length": 461.0, "completions/mean_length": 105.32578125, "completions/mean_terminated_length": 104.30348205566406, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.007063514616633513, "frac_reward_zero_std": 0.84375, "grad_norm": 6.020151615142822, "kl": 1.6731410683132708, "learning_rate": 4.936190476190476e-07, "loss": 0.0017, "num_tokens": 560064612.0, "reward": 0.3484375, "reward_std": 0.14008181542158127, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9321022510528565, "step": 7805 }, { "completion_length": 414.4, "completions/clipped_ratio": 0.0, "completions/max_length": 414.4, "completions/max_terminated_length": 414.4, "completions/mean_length": 100.803125, "completions/mean_terminated_length": 100.803125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.007068039609981773, "frac_reward_zero_std": 0.8, "grad_norm": 6.150543212890625, "kl": 7.29236729030963, "learning_rate": 4.93579365079365e-07, "loss": 0.0073, "num_tokens": 560393440.0, "reward": 0.215625, "reward_std": 0.17259882390499115, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9741539120674133, "step": 7810 }, { "completion_length": 514.6, "completions/clipped_ratio": 0.0, "completions/max_length": 514.6, "completions/max_terminated_length": 514.6, "completions/mean_length": 103.15, "completions/mean_terminated_length": 103.15, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0070725646033300334, "frac_reward_zero_std": 0.8375, "grad_norm": 4.190971374511719, "kl": 5.576128258788958, "learning_rate": 4.935396825396826e-07, "loss": 0.0056, "num_tokens": 560726184.0, "reward": 0.3703125, "reward_std": 0.13809272944927214, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9261428117752075, "step": 7815 }, { "completion_length": 403.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 403.8, "completions/max_terminated_length": 402.2, "completions/mean_length": 101.83203125, "completions/mean_terminated_length": 100.80637969970704, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.007077089596678293, "frac_reward_zero_std": 0.85625, "grad_norm": 4.9171881675720215, "kl": 8.763841676944867, "learning_rate": 4.935e-07, "loss": 0.0088, "num_tokens": 561055097.0, "reward": 0.2984375, "reward_std": 0.12167720943689346, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9386447429656982, "step": 7820 }, { "completion_length": 398.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 398.6, "completions/max_terminated_length": 365.0, "completions/mean_length": 97.55859375, "completions/mean_terminated_length": 97.03825225830079, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.007081614590026553, "frac_reward_zero_std": 0.90625, "grad_norm": 2.999818801879883, "kl": 6.396128315827809, "learning_rate": 4.934603174603175e-07, "loss": 0.0064, "num_tokens": 561379164.0, "reward": 0.3296875, "reward_std": 0.07607017457485199, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.93913254737854, "step": 7825 }, { "completion_length": 331.6, "completions/clipped_ratio": 0.0, "completions/max_length": 331.6, "completions/max_terminated_length": 331.6, "completions/mean_length": 100.03203125, "completions/mean_terminated_length": 100.03203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007086139583374813, "frac_reward_zero_std": 0.81875, "grad_norm": 6.543536186218262, "kl": 5.228610874293372, "learning_rate": 4.934206349206349e-07, "loss": 0.0052, "num_tokens": 561706413.0, "reward": 0.2453125, "reward_std": 0.1579726293683052, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9678984880447388, "step": 7830 }, { "completion_length": 377.6, "completions/clipped_ratio": 0.0, "completions/max_length": 377.6, "completions/max_terminated_length": 377.6, "completions/mean_length": 102.71171875, "completions/mean_terminated_length": 102.71171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007090664576723072, "frac_reward_zero_std": 0.85625, "grad_norm": 10.20893383026123, "kl": 9.01080980086699, "learning_rate": 4.933809523809524e-07, "loss": 0.009, "num_tokens": 562039980.0, "reward": 0.2421875, "reward_std": 0.12620143443346024, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9640246391296386, "step": 7835 }, { "completion_length": 328.8, "completions/clipped_ratio": 0.0, "completions/max_length": 328.8, "completions/max_terminated_length": 328.8, "completions/mean_length": 95.54296875, "completions/mean_terminated_length": 95.54296875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.007095189570071332, "frac_reward_zero_std": 0.88125, "grad_norm": 1.7726686000823975, "kl": 4.71023625486996, "learning_rate": 4.933412698412698e-07, "loss": 0.0047, "num_tokens": 562362587.0, "reward": 0.2421875, "reward_std": 0.09900253042578697, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9492235064506531, "step": 7840 }, { "completion_length": 374.2, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 89.228125, "completions/mean_terminated_length": 89.228125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007099714563419591, "frac_reward_zero_std": 0.8375, "grad_norm": 5.037524223327637, "kl": 5.057125463057309, "learning_rate": 4.933015873015873e-07, "loss": 0.0051, "num_tokens": 562672727.0, "reward": 0.34375, "reward_std": 0.13877571672201156, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.8856350660324097, "step": 7845 }, { "completion_length": 377.8, "completions/clipped_ratio": 0.0, "completions/max_length": 377.8, "completions/max_terminated_length": 377.8, "completions/mean_length": 93.98203125, "completions/mean_terminated_length": 93.98203125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0071042395567678515, "frac_reward_zero_std": 0.825, "grad_norm": 2.7032246589660645, "kl": 3.0035402614856137, "learning_rate": 4.932619047619047e-07, "loss": 0.003, "num_tokens": 562988152.0, "reward": 0.3078125, "reward_std": 0.14693156480789185, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.943028700351715, "step": 7850 }, { "completion_length": 305.8, "completions/clipped_ratio": 0.0, "completions/max_length": 305.8, "completions/max_terminated_length": 305.8, "completions/mean_length": 99.33046875, "completions/mean_terminated_length": 99.33046875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.007108764550116112, "frac_reward_zero_std": 0.84375, "grad_norm": 4.7495317459106445, "kl": 2.674200374120846, "learning_rate": 4.932222222222222e-07, "loss": 0.0027, "num_tokens": 563314511.0, "reward": 0.359375, "reward_std": 0.136878053098917, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9051530718803406, "step": 7855 }, { "completion_length": 437.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 437.4, "completions/max_terminated_length": 426.6, "completions/mean_length": 104.40234375, "completions/mean_terminated_length": 103.89393463134766, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007113289543464371, "frac_reward_zero_std": 0.84375, "grad_norm": 3.615464687347412, "kl": 2.106079334160313, "learning_rate": 4.931825396825397e-07, "loss": 0.0021, "num_tokens": 563650458.0, "reward": 0.3109375, "reward_std": 0.13051605522632598, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9379332423210144, "step": 7860 }, { "completion_length": 312.6, "completions/clipped_ratio": 0.0, "completions/max_length": 312.6, "completions/max_terminated_length": 312.6, "completions/mean_length": 96.1328125, "completions/mean_terminated_length": 96.1328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007117814536812631, "frac_reward_zero_std": 0.825, "grad_norm": 3.5299220085144043, "kl": 3.5834742167033253, "learning_rate": 4.931428571428571e-07, "loss": 0.0036, "num_tokens": 563972364.0, "reward": 0.2609375, "reward_std": 0.1487703263759613, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9614291310310363, "step": 7865 }, { "completion_length": 572.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 572.6, "completions/max_terminated_length": 471.6, "completions/mean_length": 99.62890625, "completions/mean_terminated_length": 99.09676513671874, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007122339530160891, "frac_reward_zero_std": 0.825, "grad_norm": 4.293088912963867, "kl": 6.276749396906235, "learning_rate": 4.931031746031746e-07, "loss": 0.0063, "num_tokens": 564298089.0, "reward": 0.31875, "reward_std": 0.1510324403643608, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9025108456611634, "step": 7870 }, { "completion_length": 413.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 413.6, "completions/max_terminated_length": 360.2, "completions/mean_length": 99.39140625, "completions/mean_terminated_length": 98.87895355224609, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00712686452350915, "frac_reward_zero_std": 0.875, "grad_norm": 9.211078643798828, "kl": 3.366995882242918, "learning_rate": 4.93063492063492e-07, "loss": 0.0034, "num_tokens": 564626158.0, "reward": 0.196875, "reward_std": 0.10931411385536194, "rewards/verify_chess_move/mean": 0.196875, "rewards/verify_chess_move/std": 0.968174397945404, "step": 7875 }, { "completion_length": 405.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 405.0, "completions/max_terminated_length": 311.6, "completions/mean_length": 91.45078125, "completions/mean_terminated_length": 90.93852081298829, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.00713138951685741, "frac_reward_zero_std": 0.875, "grad_norm": 8.37803840637207, "kl": 2.5706282647559417, "learning_rate": 4.930238095238095e-07, "loss": 0.0026, "num_tokens": 564939703.0, "reward": 0.2828125, "reward_std": 0.10820679217576981, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9494367480278015, "step": 7880 }, { "completion_length": 311.6, "completions/clipped_ratio": 0.0, "completions/max_length": 311.6, "completions/max_terminated_length": 311.6, "completions/mean_length": 96.59921875, "completions/mean_terminated_length": 96.59921875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00713591451020567, "frac_reward_zero_std": 0.88125, "grad_norm": 4.967872619628906, "kl": 4.3462630394846204, "learning_rate": 4.929841269841269e-07, "loss": 0.0043, "num_tokens": 565264726.0, "reward": 0.175, "reward_std": 0.09511480033397675, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9748667001724243, "step": 7885 }, { "completion_length": 530.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 530.8, "completions/max_terminated_length": 370.8, "completions/mean_length": 99.31796875, "completions/mean_terminated_length": 98.28556060791016, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00714043950355393, "frac_reward_zero_std": 0.80625, "grad_norm": 8.451569557189941, "kl": 6.699438145570457, "learning_rate": 4.929444444444445e-07, "loss": 0.0067, "num_tokens": 565590197.0, "reward": 0.3328125, "reward_std": 0.16775506883859634, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.936805522441864, "step": 7890 }, { "completion_length": 313.4, "completions/clipped_ratio": 0.0, "completions/max_length": 313.4, "completions/max_terminated_length": 313.4, "completions/mean_length": 91.72578125, "completions/mean_terminated_length": 91.72578125, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.00714496449690219, "frac_reward_zero_std": 0.86875, "grad_norm": 3.7536768913269043, "kl": 10.932077649701387, "learning_rate": 4.929047619047618e-07, "loss": 0.0109, "num_tokens": 565902494.0, "reward": 0.3140625, "reward_std": 0.11078842543065548, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9403996944427491, "step": 7895 }, { "completion_length": 405.8, "completions/clipped_ratio": 0.0, "completions/max_length": 405.8, "completions/max_terminated_length": 405.8, "completions/mean_length": 94.94296875, "completions/mean_terminated_length": 94.94296875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007149489490250449, "frac_reward_zero_std": 0.84375, "grad_norm": 4.003143310546875, "kl": 7.266074979584664, "learning_rate": 4.928650793650793e-07, "loss": 0.0073, "num_tokens": 566220381.0, "reward": 0.3109375, "reward_std": 0.13614661544561385, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9216995239257812, "step": 7900 }, { "completion_length": 340.4, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/max_terminated_length": 340.4, "completions/mean_length": 99.3046875, "completions/mean_terminated_length": 99.3046875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007154014483598709, "frac_reward_zero_std": 0.8625, "grad_norm": 12.23213005065918, "kl": 3.7067407385911793, "learning_rate": 4.928253968253968e-07, "loss": 0.0037, "num_tokens": 566547083.0, "reward": 0.33125, "reward_std": 0.12204165756702423, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.914952528476715, "step": 7905 }, { "completion_length": 337.6, "completions/clipped_ratio": 0.0, "completions/max_length": 337.6, "completions/max_terminated_length": 337.6, "completions/mean_length": 91.32421875, "completions/mean_terminated_length": 91.32421875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007158539476946969, "frac_reward_zero_std": 0.89375, "grad_norm": 0.4036256968975067, "kl": 2.6845930027309803, "learning_rate": 4.927857142857143e-07, "loss": 0.0027, "num_tokens": 566861442.0, "reward": 0.3234375, "reward_std": 0.08401385098695754, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9458524107933044, "step": 7910 }, { "completion_length": 410.8, "completions/clipped_ratio": 0.0, "completions/max_length": 410.8, "completions/max_terminated_length": 410.8, "completions/mean_length": 97.17265625, "completions/mean_terminated_length": 97.17265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007163064470295228, "frac_reward_zero_std": 0.8875, "grad_norm": 4.40647554397583, "kl": 2.065584558225237, "learning_rate": 4.927460317460317e-07, "loss": 0.0021, "num_tokens": 567184911.0, "reward": 0.384375, "reward_std": 0.09727053716778755, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.913367509841919, "step": 7915 }, { "completion_length": 378.6, "completions/clipped_ratio": 0.0, "completions/max_length": 378.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 100.28671875, "completions/mean_terminated_length": 100.28671875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0071675894636434884, "frac_reward_zero_std": 0.84375, "grad_norm": 4.965664386749268, "kl": 1.0704225156689062, "learning_rate": 4.927063492063492e-07, "loss": 0.0011, "num_tokens": 567512206.0, "reward": 0.2453125, "reward_std": 0.13803185969591142, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9616766571998596, "step": 7920 }, { "completion_length": 444.8, "completions/clipped_ratio": 0.0, "completions/max_length": 444.8, "completions/max_terminated_length": 444.8, "completions/mean_length": 106.07109375, "completions/mean_terminated_length": 106.07109375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.0071721144569917485, "frac_reward_zero_std": 0.83125, "grad_norm": 3.2722325325012207, "kl": 1.065126151475124, "learning_rate": 4.926666666666667e-07, "loss": 0.0011, "num_tokens": 567849801.0, "reward": 0.221875, "reward_std": 0.14614122062921525, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.972769832611084, "step": 7925 }, { "completion_length": 435.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 435.0, "completions/max_terminated_length": 374.8, "completions/mean_length": 100.6625, "completions/mean_terminated_length": 100.15538177490234, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007176639450340008, "frac_reward_zero_std": 0.875, "grad_norm": 5.959702014923096, "kl": 0.6356983047211543, "learning_rate": 4.926269841269841e-07, "loss": 0.0006, "num_tokens": 568179257.0, "reward": 0.265625, "reward_std": 0.10773499161005021, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9436219334602356, "step": 7930 }, { "completion_length": 472.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 472.8, "completions/max_terminated_length": 415.4, "completions/mean_length": 98.49765625, "completions/mean_terminated_length": 97.97825469970704, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007181164443688268, "frac_reward_zero_std": 0.8375, "grad_norm": 4.986546516418457, "kl": 0.7827314203837886, "learning_rate": 4.925873015873016e-07, "loss": 0.0008, "num_tokens": 568504262.0, "reward": 0.340625, "reward_std": 0.14082566499710084, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9373409271240234, "step": 7935 }, { "completion_length": 362.8, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/max_terminated_length": 362.8, "completions/mean_length": 95.30234375, "completions/mean_terminated_length": 95.30234375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007185689437036528, "frac_reward_zero_std": 0.90625, "grad_norm": 8.640527725219727, "kl": 0.4243159523466602, "learning_rate": 4.92547619047619e-07, "loss": 0.0004, "num_tokens": 568823649.0, "reward": 0.3125, "reward_std": 0.08768883049488067, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9445729732513428, "step": 7940 }, { "completion_length": 401.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 401.6, "completions/max_terminated_length": 320.2, "completions/mean_length": 96.62109375, "completions/mean_terminated_length": 96.0927001953125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007190214430384787, "frac_reward_zero_std": 0.88125, "grad_norm": 2.086165189743042, "kl": 0.5649760551925282, "learning_rate": 4.925079365079365e-07, "loss": 0.0006, "num_tokens": 569146028.0, "reward": 0.3625, "reward_std": 0.10557768642902374, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.917721939086914, "step": 7945 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/max_terminated_length": 359.2, "completions/mean_length": 98.9015625, "completions/mean_terminated_length": 98.9015625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007194739423733047, "frac_reward_zero_std": 0.825, "grad_norm": 3.4078476428985596, "kl": 1.061433677142486, "learning_rate": 4.924682539682539e-07, "loss": 0.0011, "num_tokens": 569472934.0, "reward": 0.3671875, "reward_std": 0.1523974433541298, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9176661014556885, "step": 7950 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 100.859375, "completions/mean_terminated_length": 100.859375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0071992644170813065, "frac_reward_zero_std": 0.83125, "grad_norm": 1.752058982849121, "kl": 1.0655070906272157, "learning_rate": 4.924285714285714e-07, "loss": 0.0011, "num_tokens": 569801850.0, "reward": 0.3109375, "reward_std": 0.15165456235408784, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9458803534507751, "step": 7955 }, { "completion_length": 355.2, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 105.99765625, "completions/mean_terminated_length": 105.99765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007203789410429567, "frac_reward_zero_std": 0.85625, "grad_norm": 3.8600544929504395, "kl": 0.3959828916238621, "learning_rate": 4.923888888888888e-07, "loss": 0.0004, "num_tokens": 570140135.0, "reward": 0.2296875, "reward_std": 0.12351401001214982, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9610311985015869, "step": 7960 }, { "completion_length": 303.6, "completions/clipped_ratio": 0.0, "completions/max_length": 303.6, "completions/max_terminated_length": 303.6, "completions/mean_length": 95.5453125, "completions/mean_terminated_length": 95.5453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007208314403777827, "frac_reward_zero_std": 0.8375, "grad_norm": 3.736593246459961, "kl": 1.1558703125454486, "learning_rate": 4.923492063492064e-07, "loss": 0.0012, "num_tokens": 570459705.0, "reward": 0.3609375, "reward_std": 0.1438182383775711, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9182925701141358, "step": 7965 }, { "completion_length": 411.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 411.6, "completions/max_terminated_length": 347.2, "completions/mean_length": 94.61875, "completions/mean_terminated_length": 94.0902587890625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007212839397126086, "frac_reward_zero_std": 0.8375, "grad_norm": 4.107856750488281, "kl": 2.755809583258815, "learning_rate": 4.923095238095237e-07, "loss": 0.0028, "num_tokens": 570779313.0, "reward": 0.35, "reward_std": 0.14813030362129212, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.934749174118042, "step": 7970 }, { "completion_length": 405.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 96.42734375, "completions/mean_terminated_length": 96.42734375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007217364390474346, "frac_reward_zero_std": 0.8875, "grad_norm": 6.941780090332031, "kl": 1.5556505829561502, "learning_rate": 4.922698412698412e-07, "loss": 0.0016, "num_tokens": 571101244.0, "reward": 0.38125, "reward_std": 0.09500744380056858, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9210544705390931, "step": 7975 }, { "completion_length": 408.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 408.2, "completions/max_terminated_length": 357.8, "completions/mean_length": 96.7515625, "completions/mean_terminated_length": 96.22429962158203, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007221889383822606, "frac_reward_zero_std": 0.8625, "grad_norm": 3.2491908073425293, "kl": 1.128856884711422, "learning_rate": 4.922301587301588e-07, "loss": 0.0011, "num_tokens": 571424454.0, "reward": 0.3390625, "reward_std": 0.11999072879552841, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9358394980430603, "step": 7980 }, { "completion_length": 608.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 608.0, "completions/max_terminated_length": 462.4, "completions/mean_length": 101.5953125, "completions/mean_terminated_length": 100.55769195556641, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007226414377170865, "frac_reward_zero_std": 0.84375, "grad_norm": 7.3981499671936035, "kl": 0.9813657367834822, "learning_rate": 4.921904761904761e-07, "loss": 0.001, "num_tokens": 571755088.0, "reward": 0.4390625, "reward_std": 0.12641418874263763, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8963915944099426, "step": 7985 }, { "completion_length": 404.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 404.8, "completions/max_terminated_length": 358.6, "completions/mean_length": 99.7265625, "completions/mean_terminated_length": 99.19828796386719, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007230939370519125, "frac_reward_zero_std": 0.8125, "grad_norm": 6.3799543380737305, "kl": 1.6976450190646575, "learning_rate": 4.921507936507937e-07, "loss": 0.0017, "num_tokens": 572081850.0, "reward": 0.2828125, "reward_std": 0.1628173589706421, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9432917833328247, "step": 7990 }, { "completion_length": 417.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 417.6, "completions/max_terminated_length": 367.6, "completions/mean_length": 92.94765625, "completions/mean_terminated_length": 92.41618957519532, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.0072354643638673855, "frac_reward_zero_std": 0.9125, "grad_norm": 0.20580226182937622, "kl": 2.944536188908387, "learning_rate": 4.921111111111111e-07, "loss": 0.0029, "num_tokens": 572398415.0, "reward": 0.3203125, "reward_std": 0.07280653119087219, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9414592742919922, "step": 7995 }, { "completion_length": 398.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 398.4, "completions/max_terminated_length": 340.6, "completions/mean_length": 99.03203125, "completions/mean_terminated_length": 98.52323303222656, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007239989357215645, "frac_reward_zero_std": 0.83125, "grad_norm": 4.8839826583862305, "kl": 4.545533944491763, "learning_rate": 4.920714285714286e-07, "loss": 0.0045, "num_tokens": 572723760.0, "reward": 0.3015625, "reward_std": 0.14814369678497313, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9446298122406006, "step": 8000 }, { "completion_length": 378.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 378.4, "completions/max_terminated_length": 330.8, "completions/mean_length": 91.053125, "completions/mean_terminated_length": 90.5351547241211, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007244514350563905, "frac_reward_zero_std": 0.8625, "grad_norm": 5.016023635864258, "kl": 4.875479747576174, "learning_rate": 4.92031746031746e-07, "loss": 0.0049, "num_tokens": 573037500.0, "reward": 0.2171875, "reward_std": 0.1172568142414093, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.966640055179596, "step": 8005 }, { "completion_length": 375.2, "completions/clipped_ratio": 0.0, "completions/max_length": 375.2, "completions/max_terminated_length": 375.2, "completions/mean_length": 97.9546875, "completions/mean_terminated_length": 97.9546875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007249039343912164, "frac_reward_zero_std": 0.9, "grad_norm": 4.310235023498535, "kl": 3.3319541569100695, "learning_rate": 4.919920634920635e-07, "loss": 0.0033, "num_tokens": 573360482.0, "reward": 0.2515625, "reward_std": 0.08937433063983917, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9643662333488464, "step": 8010 }, { "completion_length": 351.2, "completions/clipped_ratio": 0.0, "completions/max_length": 351.2, "completions/max_terminated_length": 351.2, "completions/mean_length": 99.2078125, "completions/mean_terminated_length": 99.2078125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007253564337260424, "frac_reward_zero_std": 0.8375, "grad_norm": 8.278420448303223, "kl": 3.61589631983079, "learning_rate": 4.919523809523809e-07, "loss": 0.0036, "num_tokens": 573687948.0, "reward": 0.3125, "reward_std": 0.13446463644504547, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9435292959213257, "step": 8015 }, { "completion_length": 276.6, "completions/clipped_ratio": 0.0, "completions/max_length": 276.6, "completions/max_terminated_length": 276.6, "completions/mean_length": 91.1984375, "completions/mean_terminated_length": 91.1984375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.007258089330608684, "frac_reward_zero_std": 0.825, "grad_norm": 10.225729942321777, "kl": 4.749883654061705, "learning_rate": 4.919126984126984e-07, "loss": 0.0047, "num_tokens": 574000850.0, "reward": 0.2421875, "reward_std": 0.15402501523494722, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9490671396255493, "step": 8020 }, { "completion_length": 357.4, "completions/clipped_ratio": 0.0, "completions/max_length": 357.4, "completions/max_terminated_length": 357.4, "completions/mean_length": 92.9984375, "completions/mean_terminated_length": 92.9984375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0072626143239569434, "frac_reward_zero_std": 0.7625, "grad_norm": 9.824726104736328, "kl": 11.39118861425668, "learning_rate": 4.918730158730158e-07, "loss": 0.0114, "num_tokens": 574317920.0, "reward": 0.3296875, "reward_std": 0.20615934431552888, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9429591774940491, "step": 8025 }, { "completion_length": 341.6, "completions/clipped_ratio": 0.0, "completions/max_length": 341.6, "completions/max_terminated_length": 341.6, "completions/mean_length": 89.2125, "completions/mean_terminated_length": 89.2125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0072671393173052035, "frac_reward_zero_std": 0.88125, "grad_norm": 5.669991493225098, "kl": 3.8781093414057977, "learning_rate": 4.918333333333333e-07, "loss": 0.0039, "num_tokens": 574628952.0, "reward": 0.3953125, "reward_std": 0.10578984990715981, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.8959301829338073, "step": 8030 }, { "completion_length": 347.4, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/max_terminated_length": 347.4, "completions/mean_length": 91.18828125, "completions/mean_terminated_length": 91.18828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007271664310653464, "frac_reward_zero_std": 0.85625, "grad_norm": 5.814074993133545, "kl": 7.7091278966516255, "learning_rate": 4.917936507936508e-07, "loss": 0.0077, "num_tokens": 574943273.0, "reward": 0.309375, "reward_std": 0.11778752356767655, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9409745931625366, "step": 8035 }, { "completion_length": 473.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 473.8, "completions/max_terminated_length": 374.4, "completions/mean_length": 98.8828125, "completions/mean_terminated_length": 98.35545196533204, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007276189304001723, "frac_reward_zero_std": 0.85, "grad_norm": 6.477912902832031, "kl": 8.556781615735963, "learning_rate": 4.917539682539682e-07, "loss": 0.0086, "num_tokens": 575268035.0, "reward": 0.3859375, "reward_std": 0.12788791954517365, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9122705101966858, "step": 8040 }, { "completion_length": 448.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 448.6, "completions/max_terminated_length": 340.4, "completions/mean_length": 93.696875, "completions/mean_terminated_length": 93.15391387939454, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007280714297349983, "frac_reward_zero_std": 0.9, "grad_norm": 3.362488031387329, "kl": 4.399679938121698, "learning_rate": 4.917142857142857e-07, "loss": 0.0044, "num_tokens": 575584567.0, "reward": 0.3984375, "reward_std": 0.08254052028059959, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9094464421272278, "step": 8045 }, { "completion_length": 436.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.6, "completions/max_terminated_length": 350.8, "completions/mean_length": 97.9609375, "completions/mean_terminated_length": 97.43205413818359, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007285239290698243, "frac_reward_zero_std": 0.89375, "grad_norm": 2.3732924461364746, "kl": 0.9111297601135447, "learning_rate": 4.916746031746031e-07, "loss": 0.0009, "num_tokens": 575910429.0, "reward": 0.3546875, "reward_std": 0.08969287350773811, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9321704983711243, "step": 8050 }, { "completion_length": 440.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 440.4, "completions/max_terminated_length": 392.0, "completions/mean_length": 94.26328125, "completions/mean_terminated_length": 93.73814392089844, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007289764284046502, "frac_reward_zero_std": 0.85625, "grad_norm": 5.458593368530273, "kl": 1.0667332291603089, "learning_rate": 4.916349206349207e-07, "loss": 0.0011, "num_tokens": 576230062.0, "reward": 0.24375, "reward_std": 0.122360198199749, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.954447329044342, "step": 8055 }, { "completion_length": 444.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.8, "completions/max_terminated_length": 366.6, "completions/mean_length": 94.23359375, "completions/mean_terminated_length": 93.70848693847657, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007294289277394762, "frac_reward_zero_std": 0.85625, "grad_norm": 3.1349666118621826, "kl": 0.5329520375933499, "learning_rate": 4.91595238095238e-07, "loss": 0.0005, "num_tokens": 576547601.0, "reward": 0.2890625, "reward_std": 0.1244091659784317, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9544867515563965, "step": 8060 }, { "completion_length": 399.2, "completions/clipped_ratio": 0.0, "completions/max_length": 399.2, "completions/max_terminated_length": 399.2, "completions/mean_length": 87.88515625, "completions/mean_terminated_length": 87.88515625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007298814270743022, "frac_reward_zero_std": 0.825, "grad_norm": 5.8573126792907715, "kl": 0.8292004033457487, "learning_rate": 4.915555555555556e-07, "loss": 0.0008, "num_tokens": 576855022.0, "reward": 0.3921875, "reward_std": 0.1521862491965294, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9173379421234131, "step": 8065 }, { "completion_length": 425.4, "completions/clipped_ratio": 0.0, "completions/max_length": 425.4, "completions/max_terminated_length": 425.4, "completions/mean_length": 88.57109375, "completions/mean_terminated_length": 88.57109375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007303339264091282, "frac_reward_zero_std": 0.86875, "grad_norm": 4.882508277893066, "kl": 3.1928063293453306, "learning_rate": 4.91515873015873e-07, "loss": 0.0032, "num_tokens": 577163881.0, "reward": 0.3625, "reward_std": 0.11872759759426117, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9240248680114747, "step": 8070 }, { "completion_length": 337.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 104.321875, "completions/mean_terminated_length": 104.321875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007307864257439542, "frac_reward_zero_std": 0.85625, "grad_norm": 8.270350456237793, "kl": 5.612306234356947, "learning_rate": 4.914761904761904e-07, "loss": 0.0056, "num_tokens": 577499821.0, "reward": 0.3671875, "reward_std": 0.12236117720603942, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.926573383808136, "step": 8075 }, { "completion_length": 394.8, "completions/clipped_ratio": 0.0, "completions/max_length": 394.8, "completions/max_terminated_length": 394.8, "completions/mean_length": 98.3515625, "completions/mean_terminated_length": 98.3515625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007312389250787801, "frac_reward_zero_std": 0.8625, "grad_norm": 4.4140706062316895, "kl": 7.551870587549638, "learning_rate": 4.914365079365079e-07, "loss": 0.0076, "num_tokens": 577826743.0, "reward": 0.3234375, "reward_std": 0.11294573098421097, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.92725088596344, "step": 8080 }, { "completion_length": 331.8, "completions/clipped_ratio": 0.0, "completions/max_length": 331.8, "completions/max_terminated_length": 331.8, "completions/mean_length": 102.328125, "completions/mean_terminated_length": 102.328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007316914244136061, "frac_reward_zero_std": 0.8375, "grad_norm": 8.343179702758789, "kl": 8.836249518301338, "learning_rate": 4.913968253968254e-07, "loss": 0.0088, "num_tokens": 578157267.0, "reward": 0.24375, "reward_std": 0.14676432311534882, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.967870545387268, "step": 8085 }, { "completion_length": 508.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 508.4, "completions/max_terminated_length": 497.4, "completions/mean_length": 98.59375, "completions/mean_terminated_length": 98.08718872070312, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007321439237484321, "frac_reward_zero_std": 0.85, "grad_norm": 2.4724111557006836, "kl": 5.12794449464418, "learning_rate": 4.913571428571429e-07, "loss": 0.0051, "num_tokens": 578481859.0, "reward": 0.3546875, "reward_std": 0.12835874110460282, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9298746705055236, "step": 8090 }, { "completion_length": 464.2, "completions/clipped_ratio": 0.0, "completions/max_length": 464.2, "completions/max_terminated_length": 464.2, "completions/mean_length": 103.7, "completions/mean_terminated_length": 103.7, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.00732596423083258, "frac_reward_zero_std": 0.9, "grad_norm": 5.533411026000977, "kl": 1.1143114552949556, "learning_rate": 4.913174603174603e-07, "loss": 0.0011, "num_tokens": 578816667.0, "reward": 0.275, "reward_std": 0.08937334567308426, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9463531374931335, "step": 8095 }, { "completion_length": 301.2, "completions/clipped_ratio": 0.0, "completions/max_length": 301.2, "completions/max_terminated_length": 301.2, "completions/mean_length": 96.2875, "completions/mean_terminated_length": 96.2875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0073304892241808405, "frac_reward_zero_std": 0.84375, "grad_norm": 9.477151870727539, "kl": 2.580669442610815, "learning_rate": 4.912777777777778e-07, "loss": 0.0026, "num_tokens": 579138259.0, "reward": 0.4390625, "reward_std": 0.12936124950647354, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8978433609008789, "step": 8100 }, { "completion_length": 309.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 101.675, "completions/mean_terminated_length": 101.675, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007335014217529101, "frac_reward_zero_std": 0.91875, "grad_norm": 6.5333709716796875, "kl": 2.7250428545987235, "learning_rate": 4.912380952380952e-07, "loss": 0.0027, "num_tokens": 579470219.0, "reward": 0.2359375, "reward_std": 0.06928128637373447, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9688341021537781, "step": 8105 }, { "completion_length": 332.8, "completions/clipped_ratio": 0.0, "completions/max_length": 332.8, "completions/max_terminated_length": 332.8, "completions/mean_length": 90.96015625, "completions/mean_terminated_length": 90.96015625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.00733953921087736, "frac_reward_zero_std": 0.86875, "grad_norm": 7.029117107391357, "kl": 1.284134504524991, "learning_rate": 4.911984126984127e-07, "loss": 0.0013, "num_tokens": 579783184.0, "reward": 0.3296875, "reward_std": 0.11099960952997208, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9343770265579223, "step": 8110 }, { "completion_length": 499.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 499.0, "completions/max_terminated_length": 438.2, "completions/mean_length": 99.92421875, "completions/mean_terminated_length": 98.8997085571289, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00734406420422562, "frac_reward_zero_std": 0.88125, "grad_norm": 1.0579859018325806, "kl": 1.0988571691443212, "learning_rate": 4.911587301587301e-07, "loss": 0.0011, "num_tokens": 580109495.0, "reward": 0.3078125, "reward_std": 0.1037399034947157, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9340292096138001, "step": 8115 }, { "completion_length": 320.4, "completions/clipped_ratio": 0.0, "completions/max_length": 320.4, "completions/max_terminated_length": 320.4, "completions/mean_length": 94.87265625, "completions/mean_terminated_length": 94.87265625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00734858919757388, "frac_reward_zero_std": 0.8625, "grad_norm": 2.753093719482422, "kl": 1.5516268405597657, "learning_rate": 4.911190476190476e-07, "loss": 0.0016, "num_tokens": 580429452.0, "reward": 0.4265625, "reward_std": 0.1138408899307251, "rewards/verify_chess_move/mean": 0.4265625, "rewards/verify_chess_move/std": 0.8991757869720459, "step": 8120 }, { "completion_length": 401.6, "completions/clipped_ratio": 0.0, "completions/max_length": 401.6, "completions/max_terminated_length": 401.6, "completions/mean_length": 100.98046875, "completions/mean_terminated_length": 100.98046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007353114190922139, "frac_reward_zero_std": 0.85625, "grad_norm": 3.3043532371520996, "kl": 1.1231835112674162, "learning_rate": 4.91079365079365e-07, "loss": 0.0011, "num_tokens": 580758179.0, "reward": 0.2703125, "reward_std": 0.11894329637289047, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9548139929771423, "step": 8125 }, { "completion_length": 417.6, "completions/clipped_ratio": 0.0, "completions/max_length": 417.6, "completions/max_terminated_length": 417.6, "completions/mean_length": 97.2546875, "completions/mean_terminated_length": 97.2546875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007357639184270399, "frac_reward_zero_std": 0.84375, "grad_norm": 1.4496793746948242, "kl": 0.7967143634450622, "learning_rate": 4.910396825396825e-07, "loss": 0.0008, "num_tokens": 581081921.0, "reward": 0.4671875, "reward_std": 0.13162238746881486, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.8707680583000184, "step": 8130 }, { "completion_length": 493.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 493.4, "completions/max_terminated_length": 418.2, "completions/mean_length": 98.32265625, "completions/mean_terminated_length": 97.81203918457031, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0073621641776186585, "frac_reward_zero_std": 0.8625, "grad_norm": 4.876809597015381, "kl": 2.7559261198039167, "learning_rate": 4.909999999999999e-07, "loss": 0.0028, "num_tokens": 581406726.0, "reward": 0.2578125, "reward_std": 0.12272464632987976, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9564640522003174, "step": 8135 }, { "completion_length": 291.2, "completions/clipped_ratio": 0.0, "completions/max_length": 291.2, "completions/max_terminated_length": 291.2, "completions/mean_length": 104.08515625, "completions/mean_terminated_length": 104.08515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007366689170966919, "frac_reward_zero_std": 0.8625, "grad_norm": 1.5077391862869263, "kl": 1.877045363560319, "learning_rate": 4.909603174603175e-07, "loss": 0.0019, "num_tokens": 581741155.0, "reward": 0.3625, "reward_std": 0.12267717570066453, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9307141304016113, "step": 8140 }, { "completion_length": 378.4, "completions/clipped_ratio": 0.0, "completions/max_length": 378.4, "completions/max_terminated_length": 378.4, "completions/mean_length": 93.86171875, "completions/mean_terminated_length": 93.86171875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007371214164315179, "frac_reward_zero_std": 0.88125, "grad_norm": 4.253643989562988, "kl": 2.6723102106014265, "learning_rate": 4.90920634920635e-07, "loss": 0.0027, "num_tokens": 582060162.0, "reward": 0.4046875, "reward_std": 0.10715582817792893, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9015268087387085, "step": 8145 }, { "completion_length": 424.8, "completions/clipped_ratio": 0.0, "completions/max_length": 424.8, "completions/max_terminated_length": 424.8, "completions/mean_length": 93.4703125, "completions/mean_terminated_length": 93.4703125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007375739157663438, "frac_reward_zero_std": 0.91875, "grad_norm": 2.3122971057891846, "kl": 3.09243380012922, "learning_rate": 4.908809523809523e-07, "loss": 0.0031, "num_tokens": 582378172.0, "reward": 0.421875, "reward_std": 0.06723231971263885, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8778862714767456, "step": 8150 }, { "completion_length": 332.6, "completions/clipped_ratio": 0.0, "completions/max_length": 332.6, "completions/max_terminated_length": 332.6, "completions/mean_length": 98.95234375, "completions/mean_terminated_length": 98.95234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007380264151011698, "frac_reward_zero_std": 0.90625, "grad_norm": 4.711779594421387, "kl": 6.498256889521144, "learning_rate": 4.908412698412699e-07, "loss": 0.0065, "num_tokens": 582705367.0, "reward": 0.2828125, "reward_std": 0.08085403814911843, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9585397481918335, "step": 8155 }, { "completion_length": 367.2, "completions/clipped_ratio": 0.0, "completions/max_length": 367.2, "completions/max_terminated_length": 367.2, "completions/mean_length": 95.27109375, "completions/mean_terminated_length": 95.27109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007384789144359958, "frac_reward_zero_std": 0.8625, "grad_norm": 13.314502716064453, "kl": 6.600140659720637, "learning_rate": 4.908015873015873e-07, "loss": 0.0066, "num_tokens": 583027194.0, "reward": 0.2046875, "reward_std": 0.11452485732734204, "rewards/verify_chess_move/mean": 0.2046875, "rewards/verify_chess_move/std": 0.9741634368896485, "step": 8160 }, { "completion_length": 353.6, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/max_terminated_length": 353.6, "completions/mean_length": 100.9234375, "completions/mean_terminated_length": 100.9234375, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.007389314137708217, "frac_reward_zero_std": 0.8875, "grad_norm": 4.0161895751953125, "kl": 3.5314988491823898, "learning_rate": 4.907619047619048e-07, "loss": 0.0035, "num_tokens": 583356784.0, "reward": 0.378125, "reward_std": 0.09411228969693183, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9240192174911499, "step": 8165 }, { "completion_length": 410.6, "completions/clipped_ratio": 0.0, "completions/max_length": 410.6, "completions/max_terminated_length": 410.6, "completions/mean_length": 96.56796875, "completions/mean_terminated_length": 96.56796875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0073938391310564774, "frac_reward_zero_std": 0.85625, "grad_norm": 6.472489356994629, "kl": 2.759795464714989, "learning_rate": 4.907222222222222e-07, "loss": 0.0028, "num_tokens": 583679895.0, "reward": 0.2296875, "reward_std": 0.12525782734155655, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.972437584400177, "step": 8170 }, { "completion_length": 277.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 102.76328125, "completions/mean_terminated_length": 102.76328125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0073983641244047375, "frac_reward_zero_std": 0.8375, "grad_norm": 5.278749465942383, "kl": 3.2242049063323064, "learning_rate": 4.906825396825397e-07, "loss": 0.0032, "num_tokens": 584013224.0, "reward": 0.3125, "reward_std": 0.14240479469299316, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9174018859863281, "step": 8175 }, { "completion_length": 409.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 409.8, "completions/max_terminated_length": 381.0, "completions/mean_length": 97.3171875, "completions/mean_terminated_length": 96.80101776123047, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007402889117752997, "frac_reward_zero_std": 0.875, "grad_norm": 5.327332973480225, "kl": 2.5582301155431195, "learning_rate": 4.906428571428571e-07, "loss": 0.0026, "num_tokens": 584336478.0, "reward": 0.33125, "reward_std": 0.1154174655675888, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9439589023590088, "step": 8180 }, { "completion_length": 390.6, "completions/clipped_ratio": 0.0, "completions/max_length": 390.6, "completions/max_terminated_length": 390.6, "completions/mean_length": 94.8296875, "completions/mean_terminated_length": 94.8296875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.007407414111101257, "frac_reward_zero_std": 0.89375, "grad_norm": 5.781979084014893, "kl": 3.162936974968761, "learning_rate": 4.906031746031746e-07, "loss": 0.0032, "num_tokens": 584654796.0, "reward": 0.3546875, "reward_std": 0.09106081500649452, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9298100948333741, "step": 8185 }, { "completion_length": 512.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 512.8, "completions/max_terminated_length": 425.4, "completions/mean_length": 95.4234375, "completions/mean_terminated_length": 94.88362579345703, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007411939104449516, "frac_reward_zero_std": 0.83125, "grad_norm": 7.400716304779053, "kl": 3.0213755729608236, "learning_rate": 4.90563492063492e-07, "loss": 0.003, "num_tokens": 584973882.0, "reward": 0.34375, "reward_std": 0.14661204367876052, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9247931003570556, "step": 8190 }, { "completion_length": 287.8, "completions/clipped_ratio": 0.0, "completions/max_length": 287.8, "completions/max_terminated_length": 287.8, "completions/mean_length": 92.70859375, "completions/mean_terminated_length": 92.70859375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007416464097797776, "frac_reward_zero_std": 0.86875, "grad_norm": 2.420549154281616, "kl": 4.760975038656033, "learning_rate": 4.905238095238095e-07, "loss": 0.0048, "num_tokens": 585290221.0, "reward": 0.1921875, "reward_std": 0.10579337775707245, "rewards/verify_chess_move/mean": 0.1921875, "rewards/verify_chess_move/std": 0.9782310962677002, "step": 8195 }, { "completion_length": 398.8, "completions/clipped_ratio": 0.0, "completions/max_length": 398.8, "completions/max_terminated_length": 398.8, "completions/mean_length": 100.74765625, "completions/mean_terminated_length": 100.74765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007420989091146036, "frac_reward_zero_std": 0.86875, "grad_norm": 2.9284660816192627, "kl": 7.109391976939515, "learning_rate": 4.904841269841269e-07, "loss": 0.0071, "num_tokens": 585618906.0, "reward": 0.34375, "reward_std": 0.11236657463014126, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.916899847984314, "step": 8200 }, { "completion_length": 389.2, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/max_terminated_length": 389.2, "completions/mean_length": 89.18515625, "completions/mean_terminated_length": 89.18515625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.0074255140844942955, "frac_reward_zero_std": 0.93125, "grad_norm": 4.817849159240723, "kl": 11.77905496447347, "learning_rate": 4.904444444444444e-07, "loss": 0.0118, "num_tokens": 585928951.0, "reward": 0.271875, "reward_std": 0.06454332619905472, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9431951999664306, "step": 8205 }, { "completion_length": 518.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 93.99765625, "completions/mean_terminated_length": 93.99765625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007430039077842556, "frac_reward_zero_std": 0.84375, "grad_norm": 5.913076877593994, "kl": 10.744770910125226, "learning_rate": 4.904047619047619e-07, "loss": 0.0107, "num_tokens": 586245924.0, "reward": 0.3125, "reward_std": 0.1355101138353348, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9335696339607239, "step": 8210 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 94.03046875, "completions/mean_terminated_length": 94.03046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007434564071190816, "frac_reward_zero_std": 0.8375, "grad_norm": 9.45832347869873, "kl": 5.929964424972423, "learning_rate": 4.903650793650794e-07, "loss": 0.0059, "num_tokens": 586565299.0, "reward": 0.2890625, "reward_std": 0.1385635554790497, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9420580387115478, "step": 8215 }, { "completion_length": 320.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 102.23203125, "completions/mean_terminated_length": 102.23203125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007439089064539075, "frac_reward_zero_std": 0.86875, "grad_norm": 5.451197624206543, "kl": 4.200002080923878, "learning_rate": 4.903253968253968e-07, "loss": 0.0042, "num_tokens": 586897284.0, "reward": 0.3359375, "reward_std": 0.11236755549907684, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9216020822525024, "step": 8220 }, { "completion_length": 355.8, "completions/clipped_ratio": 0.0, "completions/max_length": 355.8, "completions/max_terminated_length": 355.8, "completions/mean_length": 98.15, "completions/mean_terminated_length": 98.15, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007443614057887335, "frac_reward_zero_std": 0.89375, "grad_norm": 7.273653507232666, "kl": 2.039460163493641, "learning_rate": 4.902857142857142e-07, "loss": 0.002, "num_tokens": 587221156.0, "reward": 0.3671875, "reward_std": 0.09673983007669448, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9021528244018555, "step": 8225 }, { "completion_length": 304.8, "completions/clipped_ratio": 0.0, "completions/max_length": 304.8, "completions/max_terminated_length": 304.8, "completions/mean_length": 93.1734375, "completions/mean_terminated_length": 93.1734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007448139051235595, "frac_reward_zero_std": 0.9, "grad_norm": 5.3938398361206055, "kl": 3.239004549244419, "learning_rate": 4.902460317460318e-07, "loss": 0.0032, "num_tokens": 587537978.0, "reward": 0.3828125, "reward_std": 0.0914242785423994, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9190673232078552, "step": 8230 }, { "completion_length": 441.4, "completions/clipped_ratio": 0.0, "completions/max_length": 441.4, "completions/max_terminated_length": 441.4, "completions/mean_length": 105.44609375, "completions/mean_terminated_length": 105.44609375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.007452664044583854, "frac_reward_zero_std": 0.80625, "grad_norm": 4.165229320526123, "kl": 5.130737772793509, "learning_rate": 4.902063492063492e-07, "loss": 0.0051, "num_tokens": 587873653.0, "reward": 0.31875, "reward_std": 0.16817842423915863, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9390981912612915, "step": 8235 }, { "completion_length": 383.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 100.3890625, "completions/mean_terminated_length": 100.3890625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007457189037932114, "frac_reward_zero_std": 0.84375, "grad_norm": 4.918099880218506, "kl": 10.473267304571346, "learning_rate": 4.901666666666666e-07, "loss": 0.0105, "num_tokens": 588202975.0, "reward": 0.303125, "reward_std": 0.1336733117699623, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.94788498878479, "step": 8240 }, { "completion_length": 436.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 93.4828125, "completions/mean_terminated_length": 93.4828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007461714031280374, "frac_reward_zero_std": 0.8875, "grad_norm": 4.377603530883789, "kl": 21.481619264231995, "learning_rate": 4.901269841269841e-07, "loss": 0.0215, "num_tokens": 588518705.0, "reward": 0.2578125, "reward_std": 0.09616321586072445, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9621014952659607, "step": 8245 }, { "completion_length": 425.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 425.6, "completions/max_terminated_length": 363.2, "completions/mean_length": 104.23671875, "completions/mean_terminated_length": 103.73142547607422, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007466239024628634, "frac_reward_zero_std": 0.85625, "grad_norm": 14.52041244506836, "kl": 16.857488980470226, "learning_rate": 4.900873015873016e-07, "loss": 0.0169, "num_tokens": 588853016.0, "reward": 0.2421875, "reward_std": 0.12688540518283845, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9521922588348388, "step": 8250 }, { "completion_length": 285.2, "completions/clipped_ratio": 0.0, "completions/max_length": 285.2, "completions/max_terminated_length": 285.2, "completions/mean_length": 87.0484375, "completions/mean_terminated_length": 87.0484375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007470764017976894, "frac_reward_zero_std": 0.9, "grad_norm": 8.153831481933594, "kl": 3.6675958189414812, "learning_rate": 4.90047619047619e-07, "loss": 0.0037, "num_tokens": 589160198.0, "reward": 0.3296875, "reward_std": 0.09415623247623443, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9237871408462525, "step": 8255 }, { "completion_length": 345.4, "completions/clipped_ratio": 0.0, "completions/max_length": 345.4, "completions/max_terminated_length": 345.4, "completions/mean_length": 97.06953125, "completions/mean_terminated_length": 97.06953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007475289011325153, "frac_reward_zero_std": 0.88125, "grad_norm": 9.24974536895752, "kl": 3.364601411158219, "learning_rate": 4.900079365079365e-07, "loss": 0.0034, "num_tokens": 589484103.0, "reward": 0.3109375, "reward_std": 0.09985119625926017, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9411930203437805, "step": 8260 }, { "completion_length": 352.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 94.47578125, "completions/mean_terminated_length": 94.47578125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.007479814004673413, "frac_reward_zero_std": 0.86875, "grad_norm": 8.824498176574707, "kl": 2.708737133257091, "learning_rate": 4.89968253968254e-07, "loss": 0.0027, "num_tokens": 589803208.0, "reward": 0.3484375, "reward_std": 0.11941254734992982, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9191913723945617, "step": 8265 }, { "completion_length": 406.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 406.0, "completions/max_terminated_length": 391.8, "completions/mean_length": 97.33515625, "completions/mean_terminated_length": 96.82972869873046, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007484338998021673, "frac_reward_zero_std": 0.88125, "grad_norm": 5.226490020751953, "kl": 3.3422765333089046, "learning_rate": 4.899285714285714e-07, "loss": 0.0033, "num_tokens": 590127453.0, "reward": 0.3328125, "reward_std": 0.09648175984621048, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.925046443939209, "step": 8270 }, { "completion_length": 409.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 409.0, "completions/max_terminated_length": 362.4, "completions/mean_length": 96.95234375, "completions/mean_terminated_length": 96.41910400390626, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0074888639913699324, "frac_reward_zero_std": 0.84375, "grad_norm": 3.779703140258789, "kl": 3.9069367513642645, "learning_rate": 4.898888888888889e-07, "loss": 0.0039, "num_tokens": 590449144.0, "reward": 0.315625, "reward_std": 0.13645372539758682, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9333836197853088, "step": 8275 }, { "completion_length": 349.8, "completions/clipped_ratio": 0.0, "completions/max_length": 349.8, "completions/max_terminated_length": 349.8, "completions/mean_length": 91.06015625, "completions/mean_terminated_length": 91.06015625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.0074933889847181925, "frac_reward_zero_std": 0.88125, "grad_norm": 3.3990392684936523, "kl": 3.6607666441239415, "learning_rate": 4.898492063492063e-07, "loss": 0.0037, "num_tokens": 590761845.0, "reward": 0.434375, "reward_std": 0.09191006422042847, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.8987089395523071, "step": 8280 }, { "completion_length": 484.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 484.6, "completions/max_terminated_length": 393.0, "completions/mean_length": 97.821875, "completions/mean_terminated_length": 97.30014190673828, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007497913978066453, "frac_reward_zero_std": 0.875, "grad_norm": 6.878841876983643, "kl": 5.178662210726179, "learning_rate": 4.898095238095238e-07, "loss": 0.0052, "num_tokens": 591086217.0, "reward": 0.28125, "reward_std": 0.10747534930706024, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.956804621219635, "step": 8285 }, { "completion_length": 334.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 100.9046875, "completions/mean_terminated_length": 100.9046875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007502438971414712, "frac_reward_zero_std": 0.84375, "grad_norm": 11.618953704833984, "kl": 11.019875545310788, "learning_rate": 4.897698412698412e-07, "loss": 0.011, "num_tokens": 591416703.0, "reward": 0.18125, "reward_std": 0.13734887689352035, "rewards/verify_chess_move/mean": 0.18125, "rewards/verify_chess_move/std": 0.9674649238586426, "step": 8290 }, { "completion_length": 400.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 400.0, "completions/max_terminated_length": 316.8, "completions/mean_length": 98.47734375, "completions/mean_terminated_length": 97.94760284423828, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007506963964762972, "frac_reward_zero_std": 0.8875, "grad_norm": 4.815293312072754, "kl": 8.510341546498239, "learning_rate": 4.897301587301587e-07, "loss": 0.0085, "num_tokens": 591741866.0, "reward": 0.175, "reward_std": 0.09548022970557213, "rewards/verify_chess_move/mean": 0.175, "rewards/verify_chess_move/std": 0.9746398448944091, "step": 8295 }, { "completion_length": 335.2, "completions/clipped_ratio": 0.0, "completions/max_length": 335.2, "completions/max_terminated_length": 335.2, "completions/mean_length": 94.0703125, "completions/mean_terminated_length": 94.0703125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.007511488958111231, "frac_reward_zero_std": 0.90625, "grad_norm": 2.047128677368164, "kl": 6.432193621364422, "learning_rate": 4.896904761904761e-07, "loss": 0.0064, "num_tokens": 592060156.0, "reward": 0.4390625, "reward_std": 0.0853782631456852, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8851793527603149, "step": 8300 }, { "completion_length": 589.4, "completions/clipped_ratio": 0.0, "completions/max_length": 589.4, "completions/max_terminated_length": 589.4, "completions/mean_length": 98.9015625, "completions/mean_terminated_length": 98.9015625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007516013951459491, "frac_reward_zero_std": 0.9125, "grad_norm": 0.9893801808357239, "kl": 5.310853211837821, "learning_rate": 4.896507936507937e-07, "loss": 0.0053, "num_tokens": 592386190.0, "reward": 0.20625, "reward_std": 0.07848456613719464, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9736638545989991, "step": 8305 }, { "completion_length": 450.4, "completions/clipped_ratio": 0.0, "completions/max_length": 450.4, "completions/max_terminated_length": 450.4, "completions/mean_length": 110.81484375, "completions/mean_terminated_length": 110.81484375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007520538944807751, "frac_reward_zero_std": 0.8875, "grad_norm": 7.965221405029297, "kl": 3.1772438431857153, "learning_rate": 4.89611111111111e-07, "loss": 0.0032, "num_tokens": 592731857.0, "reward": 0.3546875, "reward_std": 0.08954059258103371, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.932373309135437, "step": 8310 }, { "completion_length": 312.8, "completions/clipped_ratio": 0.0, "completions/max_length": 312.8, "completions/max_terminated_length": 312.8, "completions/mean_length": 96.23828125, "completions/mean_terminated_length": 96.23828125, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.007525063938156011, "frac_reward_zero_std": 0.9, "grad_norm": 2.2430834770202637, "kl": 0.8224575529806316, "learning_rate": 4.895714285714285e-07, "loss": 0.0008, "num_tokens": 593054986.0, "reward": 0.3828125, "reward_std": 0.08869035989046097, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9204298734664917, "step": 8315 }, { "completion_length": 382.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 94.49609375, "completions/mean_terminated_length": 94.49609375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007529588931504271, "frac_reward_zero_std": 0.8875, "grad_norm": 3.649442195892334, "kl": 0.7700705331284553, "learning_rate": 4.89531746031746e-07, "loss": 0.0008, "num_tokens": 593373445.0, "reward": 0.2078125, "reward_std": 0.09842631220817566, "rewards/verify_chess_move/mean": 0.2078125, "rewards/verify_chess_move/std": 0.9301092982292175, "step": 8320 }, { "completion_length": 397.2, "completions/clipped_ratio": 0.0, "completions/max_length": 397.2, "completions/max_terminated_length": 397.2, "completions/mean_length": 98.95078125, "completions/mean_terminated_length": 98.95078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007534113924852531, "frac_reward_zero_std": 0.8, "grad_norm": 7.463603973388672, "kl": 0.8923068234231323, "learning_rate": 4.894920634920635e-07, "loss": 0.0009, "num_tokens": 593700262.0, "reward": 0.2453125, "reward_std": 0.1739638179540634, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9517606973648072, "step": 8325 }, { "completion_length": 511.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 97.20390625, "completions/mean_terminated_length": 97.20390625, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.00753863891820079, "frac_reward_zero_std": 0.8875, "grad_norm": 2.829442262649536, "kl": 0.6604566519963555, "learning_rate": 4.89452380952381e-07, "loss": 0.0007, "num_tokens": 594024403.0, "reward": 0.325, "reward_std": 0.10026213005185128, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9294812321662903, "step": 8330 }, { "completion_length": 327.8, "completions/clipped_ratio": 0.0, "completions/max_length": 327.8, "completions/max_terminated_length": 327.8, "completions/mean_length": 100.29140625, "completions/mean_terminated_length": 100.29140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00754316391154905, "frac_reward_zero_std": 0.8875, "grad_norm": 2.250842571258545, "kl": 0.49836997970705854, "learning_rate": 4.894126984126984e-07, "loss": 0.0005, "num_tokens": 594353328.0, "reward": 0.2546875, "reward_std": 0.09295847937464714, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9679578065872192, "step": 8335 }, { "completion_length": 381.4, "completions/clipped_ratio": 0.0, "completions/max_length": 381.4, "completions/max_terminated_length": 381.4, "completions/mean_length": 96.76796875, "completions/mean_terminated_length": 96.76796875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.00754768890489731, "frac_reward_zero_std": 0.8875, "grad_norm": 5.761184215545654, "kl": 0.9690577087923884, "learning_rate": 4.893730158730159e-07, "loss": 0.001, "num_tokens": 594675823.0, "reward": 0.3828125, "reward_std": 0.09385363459587097, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9116337895393372, "step": 8340 }, { "completion_length": 467.4, "completions/clipped_ratio": 0.0, "completions/max_length": 467.4, "completions/max_terminated_length": 467.4, "completions/mean_length": 100.9921875, "completions/mean_terminated_length": 100.9921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007552213898245569, "frac_reward_zero_std": 0.88125, "grad_norm": 9.337632179260254, "kl": 1.5013538601342589, "learning_rate": 4.893333333333333e-07, "loss": 0.0015, "num_tokens": 595004381.0, "reward": 0.509375, "reward_std": 0.10079381987452507, "rewards/verify_chess_move/mean": 0.509375, "rewards/verify_chess_move/std": 0.8380970597267151, "step": 8345 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/max_terminated_length": 358.4, "completions/mean_length": 96.90625, "completions/mean_terminated_length": 96.90625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0075567388915938295, "frac_reward_zero_std": 0.875, "grad_norm": 6.031070232391357, "kl": 0.8418086087796837, "learning_rate": 4.892936507936508e-07, "loss": 0.0008, "num_tokens": 595325605.0, "reward": 0.4078125, "reward_std": 0.1136261761188507, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.8965169429779053, "step": 8350 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.0, "completions/max_length": 363.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 95.67890625, "completions/mean_terminated_length": 95.67890625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.007561263884942089, "frac_reward_zero_std": 0.8625, "grad_norm": 7.833123683929443, "kl": 2.024086994596291, "learning_rate": 4.892539682539682e-07, "loss": 0.002, "num_tokens": 595646130.0, "reward": 0.325, "reward_std": 0.11699815541505813, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9404578447341919, "step": 8355 }, { "completion_length": 335.2, "completions/clipped_ratio": 0.0, "completions/max_length": 335.2, "completions/max_terminated_length": 335.2, "completions/mean_length": 109.19140625, "completions/mean_terminated_length": 109.19140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007565788878290349, "frac_reward_zero_std": 0.875, "grad_norm": 1.7006651163101196, "kl": 1.0099091710639185, "learning_rate": 4.892142857142857e-07, "loss": 0.001, "num_tokens": 595990463.0, "reward": 0.2234375, "reward_std": 0.10837149098515511, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9569435358047486, "step": 8360 }, { "completion_length": 334.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 96.07734375, "completions/mean_terminated_length": 96.07734375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.007570313871638609, "frac_reward_zero_std": 0.85, "grad_norm": 7.55556058883667, "kl": 1.4032609950518236, "learning_rate": 4.891746031746031e-07, "loss": 0.0014, "num_tokens": 596312202.0, "reward": 0.4296875, "reward_std": 0.13450858145952224, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8865676045417785, "step": 8365 }, { "completion_length": 353.6, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/max_terminated_length": 353.6, "completions/mean_length": 95.978125, "completions/mean_terminated_length": 95.978125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007574838864986868, "frac_reward_zero_std": 0.90625, "grad_norm": 2.2285211086273193, "kl": 0.9124928802717477, "learning_rate": 4.891349206349206e-07, "loss": 0.0009, "num_tokens": 596634062.0, "reward": 0.4390625, "reward_std": 0.08311713188886642, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8952887296676636, "step": 8370 }, { "completion_length": 405.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 405.8, "completions/max_terminated_length": 383.0, "completions/mean_length": 91.63203125, "completions/mean_terminated_length": 91.11832275390626, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007579363858335128, "frac_reward_zero_std": 0.86875, "grad_norm": 16.0842227935791, "kl": 3.2918910210137255, "learning_rate": 4.89095238095238e-07, "loss": 0.0033, "num_tokens": 596948847.0, "reward": 0.278125, "reward_std": 0.11326172798871995, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9478248834609986, "step": 8375 }, { "completion_length": 507.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 507.6, "completions/max_terminated_length": 472.0, "completions/mean_length": 93.8453125, "completions/mean_terminated_length": 93.32030792236328, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007583888851683388, "frac_reward_zero_std": 0.875, "grad_norm": 0.10752498358488083, "kl": 2.7185707354568875, "learning_rate": 4.890555555555556e-07, "loss": 0.0027, "num_tokens": 597266097.0, "reward": 0.4234375, "reward_std": 0.10773597061634063, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8888904213905334, "step": 8380 }, { "completion_length": 468.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 99.36484375, "completions/mean_terminated_length": 99.36484375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0075884138450316475, "frac_reward_zero_std": 0.8125, "grad_norm": 2.865081310272217, "kl": 5.692617722798604, "learning_rate": 4.89015873015873e-07, "loss": 0.0057, "num_tokens": 597592516.0, "reward": 0.1484375, "reward_std": 0.1639721542596817, "rewards/verify_chess_move/mean": 0.1484375, "rewards/verify_chess_move/std": 0.9787144660949707, "step": 8385 }, { "completion_length": 476.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 96.86953125, "completions/mean_terminated_length": 96.86953125, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.007592938838379908, "frac_reward_zero_std": 0.89375, "grad_norm": 5.2195916175842285, "kl": 6.028730973636266, "learning_rate": 4.889761904761904e-07, "loss": 0.006, "num_tokens": 597914909.0, "reward": 0.2515625, "reward_std": 0.09263797551393509, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9596995830535888, "step": 8390 }, { "completion_length": 455.8, "completions/clipped_ratio": 0.0, "completions/max_length": 455.8, "completions/max_terminated_length": 455.8, "completions/mean_length": 93.54453125, "completions/mean_terminated_length": 93.54453125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007597463831728168, "frac_reward_zero_std": 0.9, "grad_norm": 0.7520449757575989, "kl": 3.1131711929338053, "learning_rate": 4.88936507936508e-07, "loss": 0.0031, "num_tokens": 598235502.0, "reward": 0.253125, "reward_std": 0.08296387158334255, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9395210027694703, "step": 8395 }, { "completion_length": 469.8, "completions/clipped_ratio": 0.0, "completions/max_length": 469.8, "completions/max_terminated_length": 469.8, "completions/mean_length": 93.215625, "completions/mean_terminated_length": 93.215625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007601988825076427, "frac_reward_zero_std": 0.84375, "grad_norm": 3.612039804458618, "kl": 5.270073672884609, "learning_rate": 4.888968253968253e-07, "loss": 0.0053, "num_tokens": 598551322.0, "reward": 0.346875, "reward_std": 0.13866836428642274, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9348504543304443, "step": 8400 }, { "completion_length": 311.6, "completions/clipped_ratio": 0.0, "completions/max_length": 311.6, "completions/max_terminated_length": 311.6, "completions/mean_length": 82.5484375, "completions/mean_terminated_length": 82.5484375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.007606513818424687, "frac_reward_zero_std": 0.86875, "grad_norm": 8.773452758789062, "kl": 5.727097554085776, "learning_rate": 4.888571428571429e-07, "loss": 0.0057, "num_tokens": 598850288.0, "reward": 0.384375, "reward_std": 0.11236657351255416, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9179305791854858, "step": 8405 }, { "completion_length": 480.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 89.74765625, "completions/mean_terminated_length": 89.74765625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007611038811772946, "frac_reward_zero_std": 0.88125, "grad_norm": 4.7122883796691895, "kl": 11.692651861952617, "learning_rate": 4.888174603174603e-07, "loss": 0.0117, "num_tokens": 599160821.0, "reward": 0.378125, "reward_std": 0.09989866614341736, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.917649781703949, "step": 8410 }, { "completion_length": 548.8, "completions/clipped_ratio": 0.0, "completions/max_length": 548.8, "completions/max_terminated_length": 548.8, "completions/mean_length": 101.3421875, "completions/mean_terminated_length": 101.3421875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007615563805121206, "frac_reward_zero_std": 0.875, "grad_norm": 5.430506229400635, "kl": 9.09526594968047, "learning_rate": 4.887777777777778e-07, "loss": 0.0091, "num_tokens": 599490291.0, "reward": 0.24375, "reward_std": 0.1067933440208435, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9615952849388123, "step": 8415 }, { "completion_length": 469.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 469.8, "completions/max_terminated_length": 445.8, "completions/mean_length": 92.49375, "completions/mean_terminated_length": 91.9714569091797, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0076200887984694664, "frac_reward_zero_std": 0.875, "grad_norm": 5.6380295753479, "kl": 7.297555401502177, "learning_rate": 4.887380952380952e-07, "loss": 0.0073, "num_tokens": 599806467.0, "reward": 0.3328125, "reward_std": 0.11046792566776276, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9275568723678589, "step": 8420 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 99.00625, "completions/mean_terminated_length": 99.00625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007624613791817726, "frac_reward_zero_std": 0.875, "grad_norm": 5.7569804191589355, "kl": 4.8330771724693475, "learning_rate": 4.886984126984127e-07, "loss": 0.0048, "num_tokens": 600134147.0, "reward": 0.3546875, "reward_std": 0.10363411158323288, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9246592998504639, "step": 8425 }, { "completion_length": 383.8, "completions/clipped_ratio": 0.0, "completions/max_length": 383.8, "completions/max_terminated_length": 383.8, "completions/mean_length": 98.98671875, "completions/mean_terminated_length": 98.98671875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007629138785165986, "frac_reward_zero_std": 0.89375, "grad_norm": 9.145360946655273, "kl": 3.4487791415303946, "learning_rate": 4.886587301587301e-07, "loss": 0.0034, "num_tokens": 600461042.0, "reward": 0.390625, "reward_std": 0.08695993833243847, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9101139664649963, "step": 8430 }, { "completion_length": 550.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 550.2, "completions/max_terminated_length": 509.0, "completions/mean_length": 94.53125, "completions/mean_terminated_length": 94.00584716796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007633663778514246, "frac_reward_zero_std": 0.81875, "grad_norm": 7.918054580688477, "kl": 2.7601708616828544, "learning_rate": 4.886190476190476e-07, "loss": 0.0028, "num_tokens": 600778426.0, "reward": 0.3515625, "reward_std": 0.15681783556938172, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9341193914413453, "step": 8435 }, { "completion_length": 394.4, "completions/clipped_ratio": 0.0, "completions/max_length": 394.4, "completions/max_terminated_length": 394.4, "completions/mean_length": 89.0234375, "completions/mean_terminated_length": 89.0234375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007638188771862505, "frac_reward_zero_std": 0.875, "grad_norm": 9.103140830993652, "kl": 2.0419946708949284, "learning_rate": 4.88579365079365e-07, "loss": 0.002, "num_tokens": 601088672.0, "reward": 0.2546875, "reward_std": 0.1015841692686081, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9610798597335816, "step": 8440 }, { "completion_length": 302.8, "completions/clipped_ratio": 0.0, "completions/max_length": 302.8, "completions/max_terminated_length": 302.8, "completions/mean_length": 100.26328125, "completions/mean_terminated_length": 100.26328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007642713765210765, "frac_reward_zero_std": 0.8375, "grad_norm": 3.0551280975341797, "kl": 2.9518771580420435, "learning_rate": 4.885396825396825e-07, "loss": 0.003, "num_tokens": 601418121.0, "reward": 0.2625, "reward_std": 0.1390353575348854, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9390004754066468, "step": 8445 }, { "completion_length": 456.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 456.8, "completions/max_terminated_length": 422.4, "completions/mean_length": 93.2578125, "completions/mean_terminated_length": 92.7240478515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007647238758559025, "frac_reward_zero_std": 0.825, "grad_norm": 5.455498695373535, "kl": 6.841867764655035, "learning_rate": 4.885e-07, "loss": 0.0068, "num_tokens": 601735323.0, "reward": 0.325, "reward_std": 0.15260960757732392, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.937627625465393, "step": 8450 }, { "completion_length": 476.8, "completions/clipped_ratio": 0.0, "completions/max_length": 476.8, "completions/max_terminated_length": 476.8, "completions/mean_length": 97.909375, "completions/mean_terminated_length": 97.909375, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.0076517637519072845, "frac_reward_zero_std": 0.89375, "grad_norm": 11.546466827392578, "kl": 4.864143802085891, "learning_rate": 4.884603174603174e-07, "loss": 0.0049, "num_tokens": 602060327.0, "reward": 0.365625, "reward_std": 0.09195498637855053, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.922312343120575, "step": 8455 }, { "completion_length": 479.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 479.2, "completions/max_terminated_length": 471.2, "completions/mean_length": 87.534375, "completions/mean_terminated_length": 87.0125015258789, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007656288745255545, "frac_reward_zero_std": 0.85, "grad_norm": 3.004910945892334, "kl": 6.873865656519774, "learning_rate": 4.884206349206349e-07, "loss": 0.0069, "num_tokens": 602366875.0, "reward": 0.3125, "reward_std": 0.13087852708995343, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9487385034561158, "step": 8460 }, { "completion_length": 358.8, "completions/clipped_ratio": 0.0, "completions/max_length": 358.8, "completions/max_terminated_length": 358.8, "completions/mean_length": 95.9515625, "completions/mean_terminated_length": 95.9515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007660813738603805, "frac_reward_zero_std": 0.875, "grad_norm": 4.764888286590576, "kl": 3.544925513258204, "learning_rate": 4.883809523809523e-07, "loss": 0.0035, "num_tokens": 602689717.0, "reward": 0.2859375, "reward_std": 0.10384725779294968, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9341745257377625, "step": 8465 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/max_terminated_length": 366.2, "completions/mean_length": 94.5671875, "completions/mean_terminated_length": 94.5671875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007665338731952064, "frac_reward_zero_std": 0.86875, "grad_norm": 5.179573059082031, "kl": 3.3442386431153865, "learning_rate": 4.883412698412699e-07, "loss": 0.0033, "num_tokens": 603009739.0, "reward": 0.2421875, "reward_std": 0.11326270997524261, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.966647458076477, "step": 8470 }, { "completion_length": 381.6, "completions/clipped_ratio": 0.0, "completions/max_length": 381.6, "completions/max_terminated_length": 381.6, "completions/mean_length": 88.9, "completions/mean_terminated_length": 88.9, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007669863725300324, "frac_reward_zero_std": 0.9, "grad_norm": 3.3616344928741455, "kl": 4.809325044136495, "learning_rate": 4.883015873015872e-07, "loss": 0.0048, "num_tokens": 603320091.0, "reward": 0.4046875, "reward_std": 0.08706474751234054, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9129061698913574, "step": 8475 }, { "completion_length": 345.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 86.596875, "completions/mean_terminated_length": 86.596875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007674388718648583, "frac_reward_zero_std": 0.9, "grad_norm": 1.5881868600845337, "kl": 1.02100725955097, "learning_rate": 4.882619047619048e-07, "loss": 0.001, "num_tokens": 603626031.0, "reward": 0.3296875, "reward_std": 0.08616959154605866, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9385069966316223, "step": 8480 }, { "completion_length": 523.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 523.0, "completions/max_terminated_length": 370.6, "completions/mean_length": 106.17890625, "completions/mean_terminated_length": 104.60836486816406, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007678913711996843, "frac_reward_zero_std": 0.9, "grad_norm": 8.550226211547852, "kl": 6.611481144756544, "learning_rate": 4.882222222222222e-07, "loss": 0.0066, "num_tokens": 603963972.0, "reward": 0.159375, "reward_std": 0.09027046784758568, "rewards/verify_chess_move/mean": 0.159375, "rewards/verify_chess_move/std": 0.9674651622772217, "step": 8485 }, { "completion_length": 423.8, "completions/clipped_ratio": 0.0, "completions/max_length": 423.8, "completions/max_terminated_length": 423.8, "completions/mean_length": 102.121875, "completions/mean_terminated_length": 102.121875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007683438705345103, "frac_reward_zero_std": 0.8375, "grad_norm": 3.6908161640167236, "kl": 4.487144684384111, "learning_rate": 4.881825396825396e-07, "loss": 0.0045, "num_tokens": 604295808.0, "reward": 0.38125, "reward_std": 0.14966097176074983, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9021745324134827, "step": 8490 }, { "completion_length": 444.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.4, "completions/max_terminated_length": 443.6, "completions/mean_length": 98.31484375, "completions/mean_terminated_length": 97.78831634521484, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007687963698693363, "frac_reward_zero_std": 0.89375, "grad_norm": 4.937948703765869, "kl": 1.5532581841922366, "learning_rate": 4.881428571428572e-07, "loss": 0.0016, "num_tokens": 604620219.0, "reward": 0.38125, "reward_std": 0.09169535040855407, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9191742897033691, "step": 8495 }, { "completion_length": 463.8, "completions/clipped_ratio": 0.0, "completions/max_length": 463.8, "completions/max_terminated_length": 463.8, "completions/mean_length": 100.53515625, "completions/mean_terminated_length": 100.53515625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007692488692041623, "frac_reward_zero_std": 0.81875, "grad_norm": 0.9822603464126587, "kl": 3.5274785342626274, "learning_rate": 4.881031746031746e-07, "loss": 0.0035, "num_tokens": 604949112.0, "reward": 0.365625, "reward_std": 0.15997510477900506, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9224877238273621, "step": 8500 }, { "completion_length": 418.2, "completions/clipped_ratio": 0.0, "completions/max_length": 418.2, "completions/max_terminated_length": 418.2, "completions/mean_length": 100.3515625, "completions/mean_terminated_length": 100.3515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007697013685389883, "frac_reward_zero_std": 0.9125, "grad_norm": 9.319600105285645, "kl": 2.880686319898814, "learning_rate": 4.880634920634921e-07, "loss": 0.0029, "num_tokens": 605279162.0, "reward": 0.2984375, "reward_std": 0.07575163245201111, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9298632502555847, "step": 8505 }, { "completion_length": 375.8, "completions/clipped_ratio": 0.0, "completions/max_length": 375.8, "completions/max_terminated_length": 375.8, "completions/mean_length": 98.4, "completions/mean_terminated_length": 98.4, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007701538678738142, "frac_reward_zero_std": 0.91875, "grad_norm": 1.8855671882629395, "kl": 0.8829825452063232, "learning_rate": 4.880238095238095e-07, "loss": 0.0009, "num_tokens": 605605946.0, "reward": 0.23125, "reward_std": 0.0726981908082962, "rewards/verify_chess_move/mean": 0.23125, "rewards/verify_chess_move/std": 0.9696839809417724, "step": 8510 }, { "completion_length": 545.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 545.2, "completions/max_terminated_length": 408.8, "completions/mean_length": 101.6734375, "completions/mean_terminated_length": 100.63163909912109, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007706063672086402, "frac_reward_zero_std": 0.8375, "grad_norm": 2.6468405723571777, "kl": 3.705132782831788, "learning_rate": 4.87984126984127e-07, "loss": 0.0037, "num_tokens": 605935360.0, "reward": 0.271875, "reward_std": 0.13651458621025087, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9640618443489075, "step": 8515 }, { "completion_length": 485.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.4, "completions/max_terminated_length": 397.0, "completions/mean_length": 99.4796875, "completions/mean_terminated_length": 98.95454406738281, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.007710588665434662, "frac_reward_zero_std": 0.8875, "grad_norm": 17.612808227539062, "kl": 2.728200136742089, "learning_rate": 4.879444444444444e-07, "loss": 0.0027, "num_tokens": 606262262.0, "reward": 0.384375, "reward_std": 0.09479625970125198, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.900545620918274, "step": 8520 }, { "completion_length": 423.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 423.2, "completions/max_terminated_length": 376.2, "completions/mean_length": 88.434375, "completions/mean_terminated_length": 87.90254974365234, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.0077151136587829214, "frac_reward_zero_std": 0.8625, "grad_norm": 5.412956237792969, "kl": 7.731213724589907, "learning_rate": 4.879047619047619e-07, "loss": 0.0077, "num_tokens": 606570810.0, "reward": 0.3609375, "reward_std": 0.11499567702412605, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9319163799285889, "step": 8525 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 99.365625, "completions/mean_terminated_length": 99.365625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0077196386521311815, "frac_reward_zero_std": 0.85625, "grad_norm": 3.304701328277588, "kl": 3.2968382858438416, "learning_rate": 4.878650793650793e-07, "loss": 0.0033, "num_tokens": 606897358.0, "reward": 0.421875, "reward_std": 0.11941509842872619, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8947810173034668, "step": 8530 }, { "completion_length": 471.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 471.6, "completions/max_terminated_length": 393.6, "completions/mean_length": 93.34609375, "completions/mean_terminated_length": 92.81638641357422, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007724163645479441, "frac_reward_zero_std": 0.8625, "grad_norm": 3.7449138164520264, "kl": 5.890331896790303, "learning_rate": 4.878253968253968e-07, "loss": 0.0059, "num_tokens": 607215017.0, "reward": 0.2203125, "reward_std": 0.12046155333518982, "rewards/verify_chess_move/mean": 0.2203125, "rewards/verify_chess_move/std": 0.9718823909759522, "step": 8535 }, { "completion_length": 496.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 496.8, "completions/max_terminated_length": 489.8, "completions/mean_length": 103.26171875, "completions/mean_terminated_length": 102.24596405029297, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007728688638827701, "frac_reward_zero_std": 0.8625, "grad_norm": 1.106241226196289, "kl": 6.561024374794215, "learning_rate": 4.877857142857142e-07, "loss": 0.0066, "num_tokens": 607547496.0, "reward": 0.246875, "reward_std": 0.12929783761501312, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9653758764266968, "step": 8540 }, { "completion_length": 545.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 545.0, "completions/max_terminated_length": 485.2, "completions/mean_length": 99.56953125, "completions/mean_terminated_length": 99.05965576171874, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007733213632175961, "frac_reward_zero_std": 0.825, "grad_norm": 14.801399230957031, "kl": 4.758585080713965, "learning_rate": 4.877460317460317e-07, "loss": 0.0048, "num_tokens": 607873609.0, "reward": 0.2921875, "reward_std": 0.15287022441625595, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9548068881034851, "step": 8545 }, { "completion_length": 364.4, "completions/clipped_ratio": 0.0, "completions/max_length": 364.4, "completions/max_terminated_length": 364.4, "completions/mean_length": 97.73203125, "completions/mean_terminated_length": 97.73203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00773773862552422, "frac_reward_zero_std": 0.925, "grad_norm": 2.9214229583740234, "kl": 2.557047544722445, "learning_rate": 4.877063492063491e-07, "loss": 0.0026, "num_tokens": 608198810.0, "reward": 0.35, "reward_std": 0.05939599648118019, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.8936649441719056, "step": 8550 }, { "completion_length": 300.6, "completions/clipped_ratio": 0.0, "completions/max_length": 300.6, "completions/max_terminated_length": 300.6, "completions/mean_length": 88.20546875, "completions/mean_terminated_length": 88.20546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00774226361887248, "frac_reward_zero_std": 0.88125, "grad_norm": 6.507388591766357, "kl": 1.7670225797453896, "learning_rate": 4.876666666666667e-07, "loss": 0.0018, "num_tokens": 608508113.0, "reward": 0.2671875, "reward_std": 0.10053516291081906, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.9592254638671875, "step": 8555 }, { "completion_length": 312.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 99.51015625, "completions/mean_terminated_length": 99.51015625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.00774678861222074, "frac_reward_zero_std": 0.8375, "grad_norm": 1.794310450553894, "kl": 2.2648991907248273, "learning_rate": 4.876269841269842e-07, "loss": 0.0023, "num_tokens": 608835382.0, "reward": 0.3375, "reward_std": 0.14035484492778777, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.937199330329895, "step": 8560 }, { "completion_length": 429.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 429.4, "completions/max_terminated_length": 374.8, "completions/mean_length": 94.09140625, "completions/mean_terminated_length": 93.56502838134766, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007751313605569, "frac_reward_zero_std": 0.86875, "grad_norm": 2.704864025115967, "kl": 2.3434882164699955, "learning_rate": 4.875873015873015e-07, "loss": 0.0023, "num_tokens": 609153803.0, "reward": 0.1921875, "reward_std": 0.1121544063091278, "rewards/verify_chess_move/mean": 0.1921875, "rewards/verify_chess_move/std": 0.9649882078170776, "step": 8565 }, { "completion_length": 355.4, "completions/clipped_ratio": 0.0, "completions/max_length": 355.4, "completions/max_terminated_length": 355.4, "completions/mean_length": 94.18671875, "completions/mean_terminated_length": 94.18671875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00775583859891726, "frac_reward_zero_std": 0.9125, "grad_norm": 5.044476509094238, "kl": 0.8499517571413889, "learning_rate": 4.875476190476191e-07, "loss": 0.0008, "num_tokens": 609473898.0, "reward": 0.2078125, "reward_std": 0.08143064975738526, "rewards/verify_chess_move/mean": 0.2078125, "rewards/verify_chess_move/std": 0.966156268119812, "step": 8570 }, { "completion_length": 522.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 522.6, "completions/max_terminated_length": 431.4, "completions/mean_length": 103.92421875, "completions/mean_terminated_length": 102.35821380615235, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.00776036359226552, "frac_reward_zero_std": 0.88125, "grad_norm": 3.4390289783477783, "kl": 2.02594384070253, "learning_rate": 4.875079365079365e-07, "loss": 0.002, "num_tokens": 609807825.0, "reward": 0.240625, "reward_std": 0.09306485466659069, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9617573499679566, "step": 8575 }, { "completion_length": 624.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 624.8, "completions/max_terminated_length": 579.8, "completions/mean_length": 94.00859375, "completions/mean_terminated_length": 93.47864074707032, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007764888585613779, "frac_reward_zero_std": 0.8375, "grad_norm": 6.425970554351807, "kl": 1.293913332815282, "learning_rate": 4.87468253968254e-07, "loss": 0.0013, "num_tokens": 610125292.0, "reward": 0.3640625, "reward_std": 0.13651361018419267, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9260408639907837, "step": 8580 }, { "completion_length": 573.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 573.6, "completions/max_terminated_length": 555.8, "completions/mean_length": 103.6859375, "completions/mean_terminated_length": 102.64688110351562, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007769413578962039, "frac_reward_zero_std": 0.89375, "grad_norm": 4.739856719970703, "kl": 2.183811178535689, "learning_rate": 4.874285714285714e-07, "loss": 0.0022, "num_tokens": 610458914.0, "reward": 0.3234375, "reward_std": 0.08922205120325089, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9403866529464722, "step": 8585 }, { "completion_length": 502.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 502.6, "completions/max_terminated_length": 456.6, "completions/mean_length": 99.653125, "completions/mean_terminated_length": 98.60923919677734, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007773938572310298, "frac_reward_zero_std": 0.85625, "grad_norm": 4.04232931137085, "kl": 2.8636109450249934, "learning_rate": 4.873888888888889e-07, "loss": 0.0029, "num_tokens": 610783926.0, "reward": 0.3953125, "reward_std": 0.12577710896730424, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9115620851516724, "step": 8590 }, { "completion_length": 355.6, "completions/clipped_ratio": 0.0, "completions/max_length": 355.6, "completions/max_terminated_length": 355.6, "completions/mean_length": 91.61484375, "completions/mean_terminated_length": 91.61484375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007778463565658558, "frac_reward_zero_std": 0.91875, "grad_norm": 3.6781880855560303, "kl": 2.6723358906107024, "learning_rate": 4.873492063492063e-07, "loss": 0.0027, "num_tokens": 611098633.0, "reward": 0.384375, "reward_std": 0.0692822676151991, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.914109981060028, "step": 8595 }, { "completion_length": 362.4, "completions/clipped_ratio": 0.0, "completions/max_length": 362.4, "completions/max_terminated_length": 362.4, "completions/mean_length": 96.56484375, "completions/mean_terminated_length": 96.56484375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0077829885590068185, "frac_reward_zero_std": 0.8625, "grad_norm": 6.056847095489502, "kl": 1.1479164691874757, "learning_rate": 4.873095238095238e-07, "loss": 0.0011, "num_tokens": 611421868.0, "reward": 0.2828125, "reward_std": 0.11862475126981735, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9415167093276977, "step": 8600 }, { "completion_length": 310.2, "completions/clipped_ratio": 0.0, "completions/max_length": 310.2, "completions/max_terminated_length": 310.2, "completions/mean_length": 90.82109375, "completions/mean_terminated_length": 90.82109375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007787513552355078, "frac_reward_zero_std": 0.85625, "grad_norm": 3.4682626724243164, "kl": 2.497907260386273, "learning_rate": 4.872698412698412e-07, "loss": 0.0025, "num_tokens": 611734831.0, "reward": 0.31875, "reward_std": 0.12415051385760308, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9442265391349792, "step": 8605 }, { "completion_length": 383.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 383.8, "completions/max_terminated_length": 368.4, "completions/mean_length": 95.89140625, "completions/mean_terminated_length": 95.36539001464844, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007792038545703338, "frac_reward_zero_std": 0.8875, "grad_norm": 9.598947525024414, "kl": 3.0321648251963778, "learning_rate": 4.872301587301587e-07, "loss": 0.003, "num_tokens": 612056316.0, "reward": 0.3578125, "reward_std": 0.09726955890655517, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9146871089935302, "step": 8610 }, { "completion_length": 405.2, "completions/clipped_ratio": 0.0, "completions/max_length": 405.2, "completions/max_terminated_length": 405.2, "completions/mean_length": 93.521875, "completions/mean_terminated_length": 93.521875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007796563539051598, "frac_reward_zero_std": 0.84375, "grad_norm": 13.40889835357666, "kl": 12.272692384431139, "learning_rate": 4.871904761904762e-07, "loss": 0.0123, "num_tokens": 612373184.0, "reward": 0.290625, "reward_std": 0.12983305752277374, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9492692112922668, "step": 8615 }, { "completion_length": 510.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 510.0, "completions/max_terminated_length": 439.6, "completions/mean_length": 107.52890625, "completions/mean_terminated_length": 105.47394714355468, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007801088532399857, "frac_reward_zero_std": 0.875, "grad_norm": 11.193965911865234, "kl": 6.004231202206574, "learning_rate": 4.871507936507936e-07, "loss": 0.006, "num_tokens": 612713469.0, "reward": 0.30625, "reward_std": 0.1068398341536522, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9276444673538208, "step": 8620 }, { "completion_length": 365.2, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 88.24921875, "completions/mean_terminated_length": 88.24921875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007805613525748117, "frac_reward_zero_std": 0.86875, "grad_norm": 10.175934791564941, "kl": 11.870214434037916, "learning_rate": 4.871111111111111e-07, "loss": 0.0119, "num_tokens": 613023820.0, "reward": 0.2859375, "reward_std": 0.11194322109222413, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9509716749191284, "step": 8625 }, { "completion_length": 591.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 104.90078125, "completions/mean_terminated_length": 104.90078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007810138519096377, "frac_reward_zero_std": 0.8625, "grad_norm": 8.969583511352539, "kl": 9.369871558155864, "learning_rate": 4.870714285714285e-07, "loss": 0.0094, "num_tokens": 613358557.0, "reward": 0.215625, "reward_std": 0.12156887054443359, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9741050481796265, "step": 8630 }, { "completion_length": 557.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 557.4, "completions/max_terminated_length": 459.4, "completions/mean_length": 101.80546875, "completions/mean_terminated_length": 101.28547973632813, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007814663512444637, "frac_reward_zero_std": 0.85, "grad_norm": 5.1369948387146, "kl": 7.013548281462863, "learning_rate": 4.87031746031746e-07, "loss": 0.007, "num_tokens": 613688324.0, "reward": 0.425, "reward_std": 0.130667345225811, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.9045149326324463, "step": 8635 }, { "completion_length": 545.2, "completions/clipped_ratio": 0.0, "completions/max_length": 545.2, "completions/max_terminated_length": 545.2, "completions/mean_length": 92.93046875, "completions/mean_terminated_length": 92.93046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007819188505792897, "frac_reward_zero_std": 0.88125, "grad_norm": 4.425873279571533, "kl": 3.3259371704189107, "learning_rate": 4.869920634920634e-07, "loss": 0.0033, "num_tokens": 614005235.0, "reward": 0.3046875, "reward_std": 0.10126562267541886, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9354231595993042, "step": 8640 }, { "completion_length": 532.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 532.2, "completions/max_terminated_length": 516.2, "completions/mean_length": 100.10546875, "completions/mean_terminated_length": 99.07367553710938, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007823713499141157, "frac_reward_zero_std": 0.8375, "grad_norm": 7.940789222717285, "kl": 2.4059096682351084, "learning_rate": 4.86952380952381e-07, "loss": 0.0024, "num_tokens": 614331098.0, "reward": 0.2734375, "reward_std": 0.13898788690567015, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9331490159034729, "step": 8645 }, { "completion_length": 430.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 95.03046875, "completions/mean_terminated_length": 95.03046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007828238492489417, "frac_reward_zero_std": 0.8375, "grad_norm": 3.5122079849243164, "kl": 2.8535391815705227, "learning_rate": 4.869126984126984e-07, "loss": 0.0029, "num_tokens": 614652729.0, "reward": 0.2625, "reward_std": 0.14808380752801895, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9434329867362976, "step": 8650 }, { "completion_length": 585.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 585.0, "completions/max_terminated_length": 522.6, "completions/mean_length": 93.55078125, "completions/mean_terminated_length": 93.02975006103516, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007832763485837675, "frac_reward_zero_std": 0.85625, "grad_norm": 6.586514472961426, "kl": 1.9988438291824422, "learning_rate": 4.868730158730158e-07, "loss": 0.002, "num_tokens": 614970026.0, "reward": 0.26875, "reward_std": 0.12167622894048691, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9420904278755188, "step": 8655 }, { "completion_length": 427.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 427.0, "completions/max_terminated_length": 346.2, "completions/mean_length": 91.63515625, "completions/mean_terminated_length": 91.10537719726562, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007837288479185935, "frac_reward_zero_std": 0.88125, "grad_norm": 10.254497528076172, "kl": 2.572603229340166, "learning_rate": 4.868333333333333e-07, "loss": 0.0026, "num_tokens": 615284807.0, "reward": 0.3390625, "reward_std": 0.09968649968504906, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9071920871734619, "step": 8660 }, { "completion_length": 581.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 581.0, "completions/max_terminated_length": 449.4, "completions/mean_length": 100.04609375, "completions/mean_terminated_length": 97.39931640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007841813472534195, "frac_reward_zero_std": 0.88125, "grad_norm": 5.220984935760498, "kl": 1.417251772689633, "learning_rate": 4.867936507936508e-07, "loss": 0.0014, "num_tokens": 615612330.0, "reward": 0.340625, "reward_std": 0.10168897211551667, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9315513014793396, "step": 8665 }, { "completion_length": 347.4, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/max_terminated_length": 347.4, "completions/mean_length": 96.98203125, "completions/mean_terminated_length": 96.98203125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007846338465882455, "frac_reward_zero_std": 0.85625, "grad_norm": 4.845677852630615, "kl": 8.329160021292045, "learning_rate": 4.867539682539683e-07, "loss": 0.0083, "num_tokens": 615936179.0, "reward": 0.4046875, "reward_std": 0.11799968779087067, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.8983264684677124, "step": 8670 }, { "completion_length": 495.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 495.2, "completions/max_terminated_length": 471.6, "completions/mean_length": 93.32421875, "completions/mean_terminated_length": 92.81088409423828, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.007850863459230716, "frac_reward_zero_std": 0.85, "grad_norm": 7.467643737792969, "kl": 7.943078915821388, "learning_rate": 4.867142857142857e-07, "loss": 0.0079, "num_tokens": 616253466.0, "reward": 0.296875, "reward_std": 0.13677069395780564, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.947526478767395, "step": 8675 }, { "completion_length": 363.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 88.3203125, "completions/mean_terminated_length": 88.3203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007855388452578974, "frac_reward_zero_std": 0.88125, "grad_norm": 8.958955764770508, "kl": 5.588584469177294, "learning_rate": 4.866746031746032e-07, "loss": 0.0056, "num_tokens": 616562540.0, "reward": 0.2484375, "reward_std": 0.09989964663982391, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9634719371795655, "step": 8680 }, { "completion_length": 399.8, "completions/clipped_ratio": 0.0, "completions/max_length": 399.8, "completions/max_terminated_length": 399.8, "completions/mean_length": 97.00625, "completions/mean_terminated_length": 97.00625, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.007859913445927234, "frac_reward_zero_std": 0.91875, "grad_norm": 11.00268840789795, "kl": 11.940764829982072, "learning_rate": 4.866349206349206e-07, "loss": 0.0119, "num_tokens": 616885908.0, "reward": 0.259375, "reward_std": 0.06996623575687408, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9568232536315918, "step": 8685 }, { "completion_length": 421.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 421.2, "completions/max_terminated_length": 321.8, "completions/mean_length": 88.48359375, "completions/mean_terminated_length": 87.94913635253906, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.007864438439275494, "frac_reward_zero_std": 0.875, "grad_norm": 7.857244968414307, "kl": 6.633129787503276, "learning_rate": 4.865952380952381e-07, "loss": 0.0066, "num_tokens": 617195023.0, "reward": 0.3015625, "reward_std": 0.11046792566776276, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9445738911628723, "step": 8690 }, { "completion_length": 471.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 471.2, "completions/max_terminated_length": 372.8, "completions/mean_length": 88.83203125, "completions/mean_terminated_length": 87.78167114257812, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007868963432623754, "frac_reward_zero_std": 0.88125, "grad_norm": 3.1112008094787598, "kl": 5.2677785496925935, "learning_rate": 4.865555555555555e-07, "loss": 0.0053, "num_tokens": 617504440.0, "reward": 0.30625, "reward_std": 0.10378541201353073, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9514794707298279, "step": 8695 }, { "completion_length": 304.2, "completions/clipped_ratio": 0.0, "completions/max_length": 304.2, "completions/max_terminated_length": 304.2, "completions/mean_length": 89.38515625, "completions/mean_terminated_length": 89.38515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007873488425972014, "frac_reward_zero_std": 0.91875, "grad_norm": 15.348243713378906, "kl": 7.99159518815577, "learning_rate": 4.86515873015873e-07, "loss": 0.008, "num_tokens": 617816277.0, "reward": 0.2515625, "reward_std": 0.07517345175147057, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9602109313011169, "step": 8700 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 86.9765625, "completions/mean_terminated_length": 86.9765625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007878013419320274, "frac_reward_zero_std": 0.90625, "grad_norm": 4.796351909637451, "kl": 4.20470466343686, "learning_rate": 4.864761904761904e-07, "loss": 0.0042, "num_tokens": 618123687.0, "reward": 0.3875, "reward_std": 0.08469527177512645, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9153894543647766, "step": 8705 }, { "completion_length": 639.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 639.6, "completions/max_terminated_length": 563.0, "completions/mean_length": 99.60234375, "completions/mean_terminated_length": 98.03547821044921, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007882538412668533, "frac_reward_zero_std": 0.88125, "grad_norm": 6.289323329925537, "kl": 3.8515156102133914, "learning_rate": 4.864365079365079e-07, "loss": 0.0039, "num_tokens": 618449778.0, "reward": 0.3796875, "reward_std": 0.09716573059558868, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.8896807432174683, "step": 8710 }, { "completion_length": 472.6, "completions/clipped_ratio": 0.0, "completions/max_length": 472.6, "completions/max_terminated_length": 472.6, "completions/mean_length": 91.27265625, "completions/mean_terminated_length": 91.27265625, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.007887063406016793, "frac_reward_zero_std": 0.91875, "grad_norm": 4.034690856933594, "kl": 1.3575158448540605, "learning_rate": 4.863968253968253e-07, "loss": 0.0014, "num_tokens": 618763231.0, "reward": 0.290625, "reward_std": 0.0697530884295702, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9531975626945496, "step": 8715 }, { "completion_length": 328.8, "completions/clipped_ratio": 0.0, "completions/max_length": 328.8, "completions/max_terminated_length": 328.8, "completions/mean_length": 89.6703125, "completions/mean_terminated_length": 89.6703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007891588399365053, "frac_reward_zero_std": 0.875, "grad_norm": 6.518763065338135, "kl": 2.7155947786290198, "learning_rate": 4.863571428571429e-07, "loss": 0.0027, "num_tokens": 619074937.0, "reward": 0.3875, "reward_std": 0.11183684319257736, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9161792039871216, "step": 8720 }, { "completion_length": 412.8, "completions/clipped_ratio": 0.0, "completions/max_length": 412.8, "completions/max_terminated_length": 412.8, "completions/mean_length": 93.95, "completions/mean_terminated_length": 93.95, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007896113392713313, "frac_reward_zero_std": 0.85625, "grad_norm": 3.5637614727020264, "kl": 4.317516416893341, "learning_rate": 4.863174603174602e-07, "loss": 0.0043, "num_tokens": 619394081.0, "reward": 0.4046875, "reward_std": 0.12099324241280555, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9118113279342651, "step": 8725 }, { "completion_length": 541.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 541.4, "completions/max_terminated_length": 448.2, "completions/mean_length": 101.7359375, "completions/mean_terminated_length": 100.70219116210937, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007900638386061573, "frac_reward_zero_std": 0.84375, "grad_norm": 4.863167762756348, "kl": 5.117738248256501, "learning_rate": 4.862777777777777e-07, "loss": 0.0051, "num_tokens": 619725127.0, "reward": 0.165625, "reward_std": 0.13619604259729384, "rewards/verify_chess_move/mean": 0.165625, "rewards/verify_chess_move/std": 0.9819092512130737, "step": 8730 }, { "completion_length": 550.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 550.8, "completions/max_terminated_length": 452.4, "completions/mean_length": 99.6578125, "completions/mean_terminated_length": 99.12508087158203, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007905163379409831, "frac_reward_zero_std": 0.90625, "grad_norm": 4.786472320556641, "kl": 4.803280654188711, "learning_rate": 4.862380952380953e-07, "loss": 0.0048, "num_tokens": 620050753.0, "reward": 0.3046875, "reward_std": 0.08243316188454627, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9414427399635314, "step": 8735 }, { "completion_length": 572.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 572.6, "completions/max_terminated_length": 547.4, "completions/mean_length": 94.2640625, "completions/mean_terminated_length": 93.20303802490234, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.007909688372758092, "frac_reward_zero_std": 0.90625, "grad_norm": 3.43668532371521, "kl": 2.222203631827142, "learning_rate": 4.861984126984127e-07, "loss": 0.0022, "num_tokens": 620367467.0, "reward": 0.35, "reward_std": 0.07922744303941727, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9046311140060425, "step": 8740 }, { "completion_length": 421.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 421.6, "completions/max_terminated_length": 397.4, "completions/mean_length": 92.109375, "completions/mean_terminated_length": 91.5824981689453, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.007914213366106352, "frac_reward_zero_std": 0.86875, "grad_norm": 0.00945303961634636, "kl": 1.3192525361897425, "learning_rate": 4.861587301587302e-07, "loss": 0.0013, "num_tokens": 620683119.0, "reward": 0.3390625, "reward_std": 0.11783342361450196, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9241957664489746, "step": 8745 }, { "completion_length": 386.4, "completions/clipped_ratio": 0.0, "completions/max_length": 386.4, "completions/max_terminated_length": 386.4, "completions/mean_length": 89.709375, "completions/mean_terminated_length": 89.709375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007918738359454612, "frac_reward_zero_std": 0.8625, "grad_norm": 2.142289161682129, "kl": 2.4478986767935567, "learning_rate": 4.861190476190476e-07, "loss": 0.0024, "num_tokens": 620994379.0, "reward": 0.2734375, "reward_std": 0.11725681535899639, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9480217695236206, "step": 8750 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 97.796875, "completions/mean_terminated_length": 97.796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007923263352802872, "frac_reward_zero_std": 0.9125, "grad_norm": 3.0197484493255615, "kl": 1.0062870937399566, "learning_rate": 4.860793650793651e-07, "loss": 0.001, "num_tokens": 621319471.0, "reward": 0.4140625, "reward_std": 0.07938070371747016, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.9022453904151917, "step": 8755 }, { "completion_length": 440.8, "completions/clipped_ratio": 0.0, "completions/max_length": 440.8, "completions/max_terminated_length": 440.8, "completions/mean_length": 97.56953125, "completions/mean_terminated_length": 97.56953125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007927788346151132, "frac_reward_zero_std": 0.9125, "grad_norm": 2.1759884357452393, "kl": 1.457377974758856, "learning_rate": 4.860396825396825e-07, "loss": 0.0015, "num_tokens": 621643752.0, "reward": 0.2625, "reward_std": 0.07370070293545723, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9398944020271301, "step": 8760 }, { "completion_length": 376.2, "completions/clipped_ratio": 0.0, "completions/max_length": 376.2, "completions/max_terminated_length": 376.2, "completions/mean_length": 98.6671875, "completions/mean_terminated_length": 98.6671875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00793231333949939, "frac_reward_zero_std": 0.90625, "grad_norm": 3.4617257118225098, "kl": 2.2373488067416476, "learning_rate": 4.86e-07, "loss": 0.0022, "num_tokens": 621969894.0, "reward": 0.321875, "reward_std": 0.08085501939058304, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9392126798629761, "step": 8765 }, { "completion_length": 365.2, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 86.5875, "completions/mean_terminated_length": 86.5875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.00793683833284765, "frac_reward_zero_std": 0.825, "grad_norm": 2.009317636489868, "kl": 6.237602634017821, "learning_rate": 4.859603174603174e-07, "loss": 0.0062, "num_tokens": 622275094.0, "reward": 0.3515625, "reward_std": 0.15513135641813278, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9326996803283691, "step": 8770 }, { "completion_length": 367.4, "completions/clipped_ratio": 0.0, "completions/max_length": 367.4, "completions/max_terminated_length": 367.4, "completions/mean_length": 98.0703125, "completions/mean_terminated_length": 98.0703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00794136332619591, "frac_reward_zero_std": 0.91875, "grad_norm": 2.3161849975585938, "kl": 2.089880254934542, "learning_rate": 4.859206349206349e-07, "loss": 0.0021, "num_tokens": 622600048.0, "reward": 0.1890625, "reward_std": 0.06565221510827542, "rewards/verify_chess_move/mean": 0.1890625, "rewards/verify_chess_move/std": 0.9703198552131653, "step": 8775 }, { "completion_length": 625.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 625.6, "completions/max_terminated_length": 602.2, "completions/mean_length": 102.54296875, "completions/mean_terminated_length": 102.03384094238281, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.00794588831954417, "frac_reward_zero_std": 0.86875, "grad_norm": 7.151559829711914, "kl": 4.555212804721668, "learning_rate": 4.858809523809523e-07, "loss": 0.0046, "num_tokens": 622932855.0, "reward": 0.2328125, "reward_std": 0.11667863577604294, "rewards/verify_chess_move/mean": 0.2328125, "rewards/verify_chess_move/std": 0.9539709091186523, "step": 8780 }, { "completion_length": 351.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 93.35234375, "completions/mean_terminated_length": 93.35234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00795041331289243, "frac_reward_zero_std": 0.89375, "grad_norm": 3.4367737770080566, "kl": 6.495064575923607, "learning_rate": 4.858412698412698e-07, "loss": 0.0065, "num_tokens": 623251106.0, "reward": 0.30625, "reward_std": 0.0905890092253685, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9415507674217224, "step": 8785 }, { "completion_length": 567.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 567.6, "completions/max_terminated_length": 433.4, "completions/mean_length": 97.109375, "completions/mean_terminated_length": 95.56050415039063, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.007954938306240689, "frac_reward_zero_std": 0.86875, "grad_norm": 8.37619400024414, "kl": 5.830672214156948, "learning_rate": 4.858015873015873e-07, "loss": 0.0058, "num_tokens": 623573670.0, "reward": 0.3671875, "reward_std": 0.11945904195308685, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9130936026573181, "step": 8790 }, { "completion_length": 546.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 546.4, "completions/max_terminated_length": 423.0, "completions/mean_length": 98.103125, "completions/mean_terminated_length": 97.05599822998047, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007959463299588949, "frac_reward_zero_std": 0.8625, "grad_norm": 4.251889705657959, "kl": 2.4668468299903905, "learning_rate": 4.857619047619048e-07, "loss": 0.0025, "num_tokens": 623898090.0, "reward": 0.4703125, "reward_std": 0.12135866582393647, "rewards/verify_chess_move/mean": 0.4703125, "rewards/verify_chess_move/std": 0.8749162077903747, "step": 8795 }, { "completion_length": 435.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 435.2, "completions/max_terminated_length": 330.4, "completions/mean_length": 89.73671875, "completions/mean_terminated_length": 89.1980194091797, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.00796398829293721, "frac_reward_zero_std": 0.89375, "grad_norm": 8.030709266662598, "kl": 2.0354542116983794, "learning_rate": 4.857222222222222e-07, "loss": 0.002, "num_tokens": 624209777.0, "reward": 0.3171875, "reward_std": 0.08969287574291229, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9045338034629822, "step": 8800 }, { "completion_length": 470.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 470.2, "completions/max_terminated_length": 416.8, "completions/mean_length": 99.66484375, "completions/mean_terminated_length": 99.14206237792969, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00796851328628547, "frac_reward_zero_std": 0.85, "grad_norm": 3.8866968154907227, "kl": 3.4186417424469253, "learning_rate": 4.856825396825396e-07, "loss": 0.0034, "num_tokens": 624538516.0, "reward": 0.1515625, "reward_std": 0.12814559265971184, "rewards/verify_chess_move/mean": 0.1515625, "rewards/verify_chess_move/std": 0.9805220603942871, "step": 8805 }, { "completion_length": 301.8, "completions/clipped_ratio": 0.0, "completions/max_length": 301.8, "completions/max_terminated_length": 301.8, "completions/mean_length": 84.60703125, "completions/mean_terminated_length": 84.60703125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.00797303827963373, "frac_reward_zero_std": 0.875, "grad_norm": 3.9672799110412598, "kl": 1.4515375770628451, "learning_rate": 4.856428571428572e-07, "loss": 0.0015, "num_tokens": 624841061.0, "reward": 0.4078125, "reward_std": 0.11094070672988891, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.8940327525138855, "step": 8810 }, { "completion_length": 362.8, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/max_terminated_length": 362.8, "completions/mean_length": 89.303125, "completions/mean_terminated_length": 89.303125, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.00797756327298199, "frac_reward_zero_std": 0.90625, "grad_norm": 5.055197715759277, "kl": 3.751526407885831, "learning_rate": 4.856031746031745e-07, "loss": 0.0038, "num_tokens": 625152721.0, "reward": 0.2390625, "reward_std": 0.08127836883068085, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9561690926551819, "step": 8815 }, { "completion_length": 323.2, "completions/clipped_ratio": 0.0, "completions/max_length": 323.2, "completions/max_terminated_length": 323.2, "completions/mean_length": 99.13984375, "completions/mean_terminated_length": 99.13984375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.007982088266330248, "frac_reward_zero_std": 0.8625, "grad_norm": 7.3472466468811035, "kl": 4.3646492343628776, "learning_rate": 4.855634920634921e-07, "loss": 0.0044, "num_tokens": 625480964.0, "reward": 0.2921875, "reward_std": 0.10816186964511872, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9468054652214051, "step": 8820 }, { "completion_length": 394.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 394.8, "completions/max_terminated_length": 297.4, "completions/mean_length": 94.54609375, "completions/mean_terminated_length": 94.02022552490234, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.007986613259678508, "frac_reward_zero_std": 0.9, "grad_norm": 14.356813430786133, "kl": 5.769630808697547, "learning_rate": 4.855238095238095e-07, "loss": 0.0058, "num_tokens": 625802551.0, "reward": 0.3203125, "reward_std": 0.09026948586106301, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9028042316436767, "step": 8825 }, { "completion_length": 492.8, "completions/clipped_ratio": 0.0, "completions/max_length": 492.8, "completions/max_terminated_length": 492.8, "completions/mean_length": 90.275, "completions/mean_terminated_length": 90.275, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.007991138253026768, "frac_reward_zero_std": 0.85625, "grad_norm": 8.655688285827637, "kl": 7.885810497310013, "learning_rate": 4.85484126984127e-07, "loss": 0.0079, "num_tokens": 626116751.0, "reward": 0.2421875, "reward_std": 0.12372715473175049, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.963832676410675, "step": 8830 }, { "completion_length": 522.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 522.6, "completions/max_terminated_length": 466.8, "completions/mean_length": 92.26171875, "completions/mean_terminated_length": 91.73020629882812, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.007995663246375028, "frac_reward_zero_std": 0.8375, "grad_norm": 14.38180160522461, "kl": 6.459272650640924, "learning_rate": 4.854444444444444e-07, "loss": 0.0065, "num_tokens": 626430446.0, "reward": 0.33125, "reward_std": 0.1398840218782425, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9410496354103088, "step": 8835 }, { "completion_length": 471.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 471.8, "completions/max_terminated_length": 376.0, "completions/mean_length": 92.2546875, "completions/mean_terminated_length": 91.72633666992188, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008000188239723288, "frac_reward_zero_std": 0.88125, "grad_norm": 10.157695770263672, "kl": 4.707141774962656, "learning_rate": 4.854047619047619e-07, "loss": 0.0047, "num_tokens": 626745468.0, "reward": 0.2953125, "reward_std": 0.10079480260610581, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.954012668132782, "step": 8840 }, { "completion_length": 458.6, "completions/clipped_ratio": 0.0, "completions/max_length": 458.6, "completions/max_terminated_length": 458.6, "completions/mean_length": 98.1078125, "completions/mean_terminated_length": 98.1078125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.008004713233071547, "frac_reward_zero_std": 0.90625, "grad_norm": 11.196435928344727, "kl": 6.171192428912036, "learning_rate": 4.853650793650794e-07, "loss": 0.0062, "num_tokens": 627070982.0, "reward": 0.2484375, "reward_std": 0.08337677121162415, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9672159552574158, "step": 8845 }, { "completion_length": 374.6, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 91.0984375, "completions/mean_terminated_length": 91.0984375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008009238226419807, "frac_reward_zero_std": 0.8375, "grad_norm": 5.344041347503662, "kl": 2.488602834532503, "learning_rate": 4.853253968253968e-07, "loss": 0.0025, "num_tokens": 627383716.0, "reward": 0.3625, "reward_std": 0.14129845201969146, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9111292481422424, "step": 8850 }, { "completion_length": 335.8, "completions/clipped_ratio": 0.0, "completions/max_length": 335.8, "completions/max_terminated_length": 335.8, "completions/mean_length": 89.121875, "completions/mean_terminated_length": 89.121875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008013763219768067, "frac_reward_zero_std": 0.86875, "grad_norm": 4.193659782409668, "kl": 1.5194542051292956, "learning_rate": 4.852857142857143e-07, "loss": 0.0015, "num_tokens": 627695112.0, "reward": 0.2640625, "reward_std": 0.10668853372335434, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9651538968086243, "step": 8855 }, { "completion_length": 448.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 448.6, "completions/max_terminated_length": 391.8, "completions/mean_length": 93.26953125, "completions/mean_terminated_length": 92.74736938476562, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008018288213116327, "frac_reward_zero_std": 0.9, "grad_norm": 5.329113960266113, "kl": 0.6879237769520842, "learning_rate": 4.852460317460317e-07, "loss": 0.0007, "num_tokens": 628012505.0, "reward": 0.403125, "reward_std": 0.080278405174613, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9151992917060852, "step": 8860 }, { "completion_length": 596.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 596.0, "completions/max_terminated_length": 555.2, "completions/mean_length": 98.34609375, "completions/mean_terminated_length": 97.81279754638672, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008022813206464587, "frac_reward_zero_std": 0.8625, "grad_norm": 10.380696296691895, "kl": 1.4131192834349349, "learning_rate": 4.852063492063492e-07, "loss": 0.0014, "num_tokens": 628337268.0, "reward": 0.3453125, "reward_std": 0.11704563200473786, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.930638313293457, "step": 8865 }, { "completion_length": 332.4, "completions/clipped_ratio": 0.0, "completions/max_length": 332.4, "completions/max_terminated_length": 332.4, "completions/mean_length": 93.07890625, "completions/mean_terminated_length": 93.07890625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008027338199812847, "frac_reward_zero_std": 0.90625, "grad_norm": 7.255545139312744, "kl": 2.6286566203809345, "learning_rate": 4.851666666666666e-07, "loss": 0.0026, "num_tokens": 628654201.0, "reward": 0.321875, "reward_std": 0.08017104938626289, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9395619988441467, "step": 8870 }, { "completion_length": 338.8, "completions/clipped_ratio": 0.0, "completions/max_length": 338.8, "completions/max_terminated_length": 338.8, "completions/mean_length": 92.58515625, "completions/mean_terminated_length": 92.58515625, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.008031863193161105, "frac_reward_zero_std": 0.925, "grad_norm": 4.900525093078613, "kl": 1.810041614016518, "learning_rate": 4.851269841269841e-07, "loss": 0.0018, "num_tokens": 628970766.0, "reward": 0.45625, "reward_std": 0.06554583944380284, "rewards/verify_chess_move/mean": 0.45625, "rewards/verify_chess_move/std": 0.8880501985549927, "step": 8875 }, { "completion_length": 538.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 538.2, "completions/max_terminated_length": 515.6, "completions/mean_length": 92.54921875, "completions/mean_terminated_length": 92.03816680908203, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008036388186509365, "frac_reward_zero_std": 0.89375, "grad_norm": 1.8256585597991943, "kl": 5.369536711287219, "learning_rate": 4.850873015873015e-07, "loss": 0.0054, "num_tokens": 629286125.0, "reward": 0.3984375, "reward_std": 0.09195596724748611, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.8973047494888305, "step": 8880 }, { "completion_length": 467.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 467.8, "completions/max_terminated_length": 455.2, "completions/mean_length": 97.4515625, "completions/mean_terminated_length": 96.9412612915039, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008040913179857626, "frac_reward_zero_std": 0.91875, "grad_norm": 1.6572167873382568, "kl": 3.9575965510797686, "learning_rate": 4.850476190476191e-07, "loss": 0.004, "num_tokens": 629611599.0, "reward": 0.25625, "reward_std": 0.0690226323902607, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9631182909011841, "step": 8885 }, { "completion_length": 324.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 94.13828125, "completions/mean_terminated_length": 94.13828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008045438173205886, "frac_reward_zero_std": 0.89375, "grad_norm": 6.72662878036499, "kl": 6.321480709454045, "learning_rate": 4.850079365079364e-07, "loss": 0.0063, "num_tokens": 629931152.0, "reward": 0.33125, "reward_std": 0.0890098825097084, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9388330578804016, "step": 8890 }, { "completion_length": 485.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.8, "completions/max_terminated_length": 419.6, "completions/mean_length": 93.12734375, "completions/mean_terminated_length": 92.59770050048829, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008049963166554146, "frac_reward_zero_std": 0.85, "grad_norm": 8.487457275390625, "kl": 2.8657619695179166, "learning_rate": 4.84968253968254e-07, "loss": 0.0029, "num_tokens": 630249587.0, "reward": 0.2828125, "reward_std": 0.13262333124876022, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9480340003967285, "step": 8895 }, { "completion_length": 590.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 590.6, "completions/max_terminated_length": 465.4, "completions/mean_length": 93.415625, "completions/mean_terminated_length": 92.35338134765625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008054488159902404, "frac_reward_zero_std": 0.88125, "grad_norm": 4.1449503898620605, "kl": 4.281942109554075, "learning_rate": 4.849285714285715e-07, "loss": 0.0043, "num_tokens": 630568311.0, "reward": 0.2609375, "reward_std": 0.10399953685700894, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9608607172966004, "step": 8900 }, { "completion_length": 493.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 493.8, "completions/max_terminated_length": 465.6, "completions/mean_length": 90.05546875, "completions/mean_terminated_length": 89.53528594970703, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008059013153250664, "frac_reward_zero_std": 0.8375, "grad_norm": 14.910636901855469, "kl": 3.440926088229753, "learning_rate": 4.848888888888888e-07, "loss": 0.0034, "num_tokens": 630880302.0, "reward": 0.3046875, "reward_std": 0.1349344804883003, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9471265435218811, "step": 8905 }, { "completion_length": 385.6, "completions/clipped_ratio": 0.0, "completions/max_length": 385.6, "completions/max_terminated_length": 385.6, "completions/mean_length": 92.99296875, "completions/mean_terminated_length": 92.99296875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008063538146598924, "frac_reward_zero_std": 0.8625, "grad_norm": 6.163343906402588, "kl": 2.1107883221120574, "learning_rate": 4.848492063492064e-07, "loss": 0.0021, "num_tokens": 631198469.0, "reward": 0.2703125, "reward_std": 0.11678598821163177, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9512464165687561, "step": 8910 }, { "completion_length": 538.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 538.2, "completions/max_terminated_length": 425.8, "completions/mean_length": 93.28515625, "completions/mean_terminated_length": 92.24035186767578, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008068063139947184, "frac_reward_zero_std": 0.9, "grad_norm": 3.9667248725891113, "kl": 2.074079142115079, "learning_rate": 4.848095238095238e-07, "loss": 0.0021, "num_tokens": 631515474.0, "reward": 0.3046875, "reward_std": 0.0850147970020771, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9272230744361878, "step": 8915 }, { "completion_length": 300.6, "completions/clipped_ratio": 0.0, "completions/max_length": 300.6, "completions/max_terminated_length": 300.6, "completions/mean_length": 95.56171875, "completions/mean_terminated_length": 95.56171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008072588133295444, "frac_reward_zero_std": 0.9125, "grad_norm": 13.110008239746094, "kl": 4.754859817237593, "learning_rate": 4.847698412698413e-07, "loss": 0.0048, "num_tokens": 631838705.0, "reward": 0.4171875, "reward_std": 0.07485647946596145, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.9016521334648132, "step": 8920 }, { "completion_length": 381.6, "completions/clipped_ratio": 0.0, "completions/max_length": 381.6, "completions/max_terminated_length": 381.6, "completions/mean_length": 97.92421875, "completions/mean_terminated_length": 97.92421875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008077113126643705, "frac_reward_zero_std": 0.88125, "grad_norm": 7.138134479522705, "kl": 7.129568471782841, "learning_rate": 4.847301587301587e-07, "loss": 0.0071, "num_tokens": 632165096.0, "reward": 0.35625, "reward_std": 0.09989866465330124, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9213068366050721, "step": 8925 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 98.8484375, "completions/mean_terminated_length": 98.8484375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008081638119991963, "frac_reward_zero_std": 0.90625, "grad_norm": 4.2833991050720215, "kl": 4.582490111060906, "learning_rate": 4.846904761904762e-07, "loss": 0.0046, "num_tokens": 632492894.0, "reward": 0.3515625, "reward_std": 0.07538816630840302, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9251283764839172, "step": 8930 }, { "completion_length": 292.4, "completions/clipped_ratio": 0.0, "completions/max_length": 292.4, "completions/max_terminated_length": 292.4, "completions/mean_length": 88.1828125, "completions/mean_terminated_length": 88.1828125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008086163113340223, "frac_reward_zero_std": 0.91875, "grad_norm": 3.6932671070098877, "kl": 0.938346036686562, "learning_rate": 4.846507936507936e-07, "loss": 0.0009, "num_tokens": 632802304.0, "reward": 0.4609375, "reward_std": 0.0677041221410036, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8846444249153137, "step": 8935 }, { "completion_length": 271.2, "completions/clipped_ratio": 0.0, "completions/max_length": 271.2, "completions/max_terminated_length": 271.2, "completions/mean_length": 81.0921875, "completions/mean_terminated_length": 81.0921875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008090688106688483, "frac_reward_zero_std": 0.875, "grad_norm": 0.17323832213878632, "kl": 0.5260903672548011, "learning_rate": 4.846111111111111e-07, "loss": 0.0005, "num_tokens": 633099286.0, "reward": 0.4015625, "reward_std": 0.11046988517045975, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9017754554748535, "step": 8940 }, { "completion_length": 275.4, "completions/clipped_ratio": 0.0, "completions/max_length": 275.4, "completions/max_terminated_length": 275.4, "completions/mean_length": 86.75078125, "completions/mean_terminated_length": 86.75078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008095213100036743, "frac_reward_zero_std": 0.95, "grad_norm": 2.6198956966400146, "kl": 0.5989629429066554, "learning_rate": 4.845714285714285e-07, "loss": 0.0006, "num_tokens": 633408775.0, "reward": 0.309375, "reward_std": 0.04287311993539333, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9459647178649903, "step": 8945 }, { "completion_length": 385.2, "completions/clipped_ratio": 0.0, "completions/max_length": 385.2, "completions/max_terminated_length": 385.2, "completions/mean_length": 93.91484375, "completions/mean_terminated_length": 93.91484375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.008099738093385003, "frac_reward_zero_std": 0.93125, "grad_norm": 4.227682590484619, "kl": 1.7588565098587423, "learning_rate": 4.84531746031746e-07, "loss": 0.0018, "num_tokens": 633726250.0, "reward": 0.3890625, "reward_std": 0.05544740185141563, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9139377236366272, "step": 8950 }, { "completion_length": 429.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 429.6, "completions/max_terminated_length": 404.4, "completions/mean_length": 98.59765625, "completions/mean_terminated_length": 98.08567810058594, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008104263086733262, "frac_reward_zero_std": 0.8875, "grad_norm": 8.923138618469238, "kl": 1.0292985982378013, "learning_rate": 4.844920634920634e-07, "loss": 0.001, "num_tokens": 634050823.0, "reward": 0.459375, "reward_std": 0.09595105499029159, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8754785418510437, "step": 8955 }, { "completion_length": 445.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 445.0, "completions/max_terminated_length": 382.4, "completions/mean_length": 89.86328125, "completions/mean_terminated_length": 89.32868347167968, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.008108788080081522, "frac_reward_zero_std": 0.9, "grad_norm": 6.4379730224609375, "kl": 0.9664512960938737, "learning_rate": 4.844523809523809e-07, "loss": 0.001, "num_tokens": 634365120.0, "reward": 0.2765625, "reward_std": 0.0886903628706932, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.960322916507721, "step": 8960 }, { "completion_length": 516.4, "completions/clipped_ratio": 0.0, "completions/max_length": 516.4, "completions/max_terminated_length": 516.4, "completions/mean_length": 93.5828125, "completions/mean_terminated_length": 93.5828125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.008113313073429782, "frac_reward_zero_std": 0.89375, "grad_norm": 3.4722986221313477, "kl": 2.6906894765328615, "learning_rate": 4.844126984126984e-07, "loss": 0.0027, "num_tokens": 634685074.0, "reward": 0.3015625, "reward_std": 0.08948168903589249, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9337586522102356, "step": 8965 }, { "completion_length": 496.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 496.4, "completions/max_terminated_length": 435.2, "completions/mean_length": 96.83515625, "completions/mean_terminated_length": 95.26793365478515, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008117838066778042, "frac_reward_zero_std": 0.875, "grad_norm": 6.910220146179199, "kl": 1.9407521131564862, "learning_rate": 4.843730158730159e-07, "loss": 0.0019, "num_tokens": 635007255.0, "reward": 0.4171875, "reward_std": 0.11204705014824867, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8964609861373901, "step": 8970 }, { "completion_length": 392.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 392.8, "completions/max_terminated_length": 317.8, "completions/mean_length": 98.015625, "completions/mean_terminated_length": 97.49580078125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008122363060126302, "frac_reward_zero_std": 0.89375, "grad_norm": 7.148338317871094, "kl": 1.3721247928682714, "learning_rate": 4.843333333333334e-07, "loss": 0.0014, "num_tokens": 635333155.0, "reward": 0.2671875, "reward_std": 0.08764292597770691, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.9519866824150085, "step": 8975 }, { "completion_length": 525.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 525.4, "completions/max_terminated_length": 464.0, "completions/mean_length": 96.9140625, "completions/mean_terminated_length": 96.37923431396484, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008126888053474562, "frac_reward_zero_std": 0.88125, "grad_norm": 5.174553394317627, "kl": 1.547529922053218, "learning_rate": 4.842936507936507e-07, "loss": 0.0015, "num_tokens": 635656405.0, "reward": 0.4203125, "reward_std": 0.10399954319000244, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.9008468270301819, "step": 8980 }, { "completion_length": 312.6, "completions/clipped_ratio": 0.0, "completions/max_length": 312.6, "completions/max_terminated_length": 312.6, "completions/mean_length": 92.09921875, "completions/mean_terminated_length": 92.09921875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00813141304682282, "frac_reward_zero_std": 0.88125, "grad_norm": 10.045607566833496, "kl": 7.929374867887236, "learning_rate": 4.842539682539683e-07, "loss": 0.0079, "num_tokens": 635974020.0, "reward": 0.3671875, "reward_std": 0.10783979706466199, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9149846076965332, "step": 8985 }, { "completion_length": 422.8, "completions/clipped_ratio": 0.0, "completions/max_length": 422.8, "completions/max_terminated_length": 422.8, "completions/mean_length": 98.6734375, "completions/mean_terminated_length": 98.6734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00813593804017108, "frac_reward_zero_std": 0.90625, "grad_norm": 7.507442474365234, "kl": 5.338310986082069, "learning_rate": 4.842142857142857e-07, "loss": 0.0053, "num_tokens": 636300706.0, "reward": 0.3796875, "reward_std": 0.07859290689229965, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9122280716896057, "step": 8990 }, { "completion_length": 516.8, "completions/clipped_ratio": 0.003125, "completions/max_length": 516.8, "completions/max_terminated_length": 435.4, "completions/mean_length": 95.89296875, "completions/mean_terminated_length": 93.81805419921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00814046303351934, "frac_reward_zero_std": 0.84375, "grad_norm": 7.556035995483398, "kl": 16.530241881171243, "learning_rate": 4.841746031746032e-07, "loss": 0.0165, "num_tokens": 636622337.0, "reward": 0.253125, "reward_std": 0.12846511602401733, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9630784153938293, "step": 8995 }, { "completion_length": 530.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 530.6, "completions/max_terminated_length": 431.6, "completions/mean_length": 95.378125, "completions/mean_terminated_length": 94.85236358642578, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0081449880268676, "frac_reward_zero_std": 0.9125, "grad_norm": 11.658663749694824, "kl": 10.273809864651412, "learning_rate": 4.841349206349206e-07, "loss": 0.0103, "num_tokens": 636942925.0, "reward": 0.284375, "reward_std": 0.07775411009788513, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9550165176391602, "step": 9000 }, { "completion_length": 485.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.0, "completions/max_terminated_length": 382.2, "completions/mean_length": 88.1109375, "completions/mean_terminated_length": 87.57393493652344, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00814951302021586, "frac_reward_zero_std": 0.90625, "grad_norm": 5.2993245124816895, "kl": 3.8554688570555298, "learning_rate": 4.840952380952381e-07, "loss": 0.0039, "num_tokens": 637252595.0, "reward": 0.2890625, "reward_std": 0.07880409136414528, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9526849627494812, "step": 9005 }, { "completion_length": 506.6, "completions/clipped_ratio": 0.0, "completions/max_length": 506.6, "completions/max_terminated_length": 506.6, "completions/mean_length": 99.41875, "completions/mean_terminated_length": 99.41875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008154038013564121, "frac_reward_zero_std": 0.8875, "grad_norm": 5.189474582672119, "kl": 1.6390624446677975, "learning_rate": 4.840555555555555e-07, "loss": 0.0016, "num_tokens": 637578659.0, "reward": 0.271875, "reward_std": 0.09821218326687813, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9516297340393066, "step": 9010 }, { "completion_length": 386.6, "completions/clipped_ratio": 0.0, "completions/max_length": 386.6, "completions/max_terminated_length": 386.6, "completions/mean_length": 97.703125, "completions/mean_terminated_length": 97.703125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00815856300691238, "frac_reward_zero_std": 0.86875, "grad_norm": 7.134868621826172, "kl": 0.716794378682971, "learning_rate": 4.84015873015873e-07, "loss": 0.0007, "num_tokens": 637902999.0, "reward": 0.3671875, "reward_std": 0.11420435458421707, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9201547026634216, "step": 9015 }, { "completion_length": 423.4, "completions/clipped_ratio": 0.0, "completions/max_length": 423.4, "completions/max_terminated_length": 423.4, "completions/mean_length": 88.084375, "completions/mean_terminated_length": 88.084375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00816308800026064, "frac_reward_zero_std": 0.875, "grad_norm": 7.256825923919678, "kl": 0.611728913791012, "learning_rate": 4.839761904761905e-07, "loss": 0.0006, "num_tokens": 638212323.0, "reward": 0.5015625, "reward_std": 0.1074763298034668, "rewards/verify_chess_move/mean": 0.5015625, "rewards/verify_chess_move/std": 0.8577853679656983, "step": 9020 }, { "completion_length": 389.2, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/max_terminated_length": 389.2, "completions/mean_length": 90.3859375, "completions/mean_terminated_length": 90.3859375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0081676129936089, "frac_reward_zero_std": 0.9, "grad_norm": 4.155815124511719, "kl": 1.2684561565052719, "learning_rate": 4.839365079365079e-07, "loss": 0.0013, "num_tokens": 638525425.0, "reward": 0.3828125, "reward_std": 0.08595644198358059, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.8944911241531373, "step": 9025 }, { "completion_length": 472.8, "completions/clipped_ratio": 0.0, "completions/max_length": 472.8, "completions/max_terminated_length": 472.8, "completions/mean_length": 92.903125, "completions/mean_terminated_length": 92.903125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00817213798695716, "frac_reward_zero_std": 0.86875, "grad_norm": 0.541620671749115, "kl": 2.840111128578428, "learning_rate": 4.838968253968254e-07, "loss": 0.0028, "num_tokens": 638841061.0, "reward": 0.3515625, "reward_std": 0.11099961474537849, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9236289381980896, "step": 9030 }, { "completion_length": 498.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 498.6, "completions/max_terminated_length": 440.0, "completions/mean_length": 96.44765625, "completions/mean_terminated_length": 95.39695434570312, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.00817666298030542, "frac_reward_zero_std": 0.85, "grad_norm": 9.801959991455078, "kl": 1.5556137876817957, "learning_rate": 4.838571428571428e-07, "loss": 0.0016, "num_tokens": 639162066.0, "reward": 0.3421875, "reward_std": 0.11905064731836319, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9238003611564636, "step": 9035 }, { "completion_length": 445.2, "completions/clipped_ratio": 0.0, "completions/max_length": 445.2, "completions/max_terminated_length": 445.2, "completions/mean_length": 96.0921875, "completions/mean_terminated_length": 96.0921875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008181187973653678, "frac_reward_zero_std": 0.85625, "grad_norm": 5.826748847961426, "kl": 1.1596411949838512, "learning_rate": 4.838174603174603e-07, "loss": 0.0012, "num_tokens": 639485488.0, "reward": 0.3015625, "reward_std": 0.12073360532522201, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9392646908760071, "step": 9040 }, { "completion_length": 361.2, "completions/clipped_ratio": 0.0, "completions/max_length": 361.2, "completions/max_terminated_length": 361.2, "completions/mean_length": 96.1296875, "completions/mean_terminated_length": 96.1296875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.008185712967001938, "frac_reward_zero_std": 0.89375, "grad_norm": 4.211523056030273, "kl": 2.773918324778788, "learning_rate": 4.837777777777777e-07, "loss": 0.0028, "num_tokens": 639807646.0, "reward": 0.3875, "reward_std": 0.09148416221141815, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9143547534942627, "step": 9045 }, { "completion_length": 530.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 530.2, "completions/max_terminated_length": 420.0, "completions/mean_length": 96.77109375, "completions/mean_terminated_length": 95.73064270019532, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008190237960350198, "frac_reward_zero_std": 0.85625, "grad_norm": 5.7601141929626465, "kl": 2.8703435001312756, "learning_rate": 4.837380952380952e-07, "loss": 0.0029, "num_tokens": 640129129.0, "reward": 0.2578125, "reward_std": 0.12188839614391327, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9631905913352966, "step": 9050 }, { "completion_length": 488.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 488.2, "completions/max_terminated_length": 460.2, "completions/mean_length": 88.55546875, "completions/mean_terminated_length": 88.0267318725586, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008194762953698458, "frac_reward_zero_std": 0.88125, "grad_norm": 6.863766193389893, "kl": 0.7430203390365933, "learning_rate": 4.836984126984126e-07, "loss": 0.0007, "num_tokens": 640439392.0, "reward": 0.25, "reward_std": 0.09784871563315392, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9646670937538147, "step": 9055 }, { "completion_length": 376.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 98.23828125, "completions/mean_terminated_length": 98.23828125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008199287947046718, "frac_reward_zero_std": 0.875, "grad_norm": 4.252388000488281, "kl": 0.7277475517010317, "learning_rate": 4.836587301587302e-07, "loss": 0.0007, "num_tokens": 640765297.0, "reward": 0.321875, "reward_std": 0.10521421656012535, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9365295529365539, "step": 9060 }, { "completion_length": 327.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 89.25703125, "completions/mean_terminated_length": 89.25703125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008203812940394978, "frac_reward_zero_std": 0.9375, "grad_norm": 0.9846910238265991, "kl": 1.032262299139984, "learning_rate": 4.836190476190475e-07, "loss": 0.001, "num_tokens": 641075722.0, "reward": 0.35, "reward_std": 0.05192314051091671, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9137408375740051, "step": 9065 }, { "completion_length": 391.2, "completions/clipped_ratio": 0.0, "completions/max_length": 391.2, "completions/max_terminated_length": 391.2, "completions/mean_length": 93.43359375, "completions/mean_terminated_length": 93.43359375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008208337933743237, "frac_reward_zero_std": 0.875, "grad_norm": 2.1718742847442627, "kl": 0.7898605000111274, "learning_rate": 4.83579365079365e-07, "loss": 0.0008, "num_tokens": 641394605.0, "reward": 0.34375, "reward_std": 0.10273994021117687, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9323637127876282, "step": 9070 }, { "completion_length": 389.4, "completions/clipped_ratio": 0.0, "completions/max_length": 389.4, "completions/max_terminated_length": 389.4, "completions/mean_length": 97.2671875, "completions/mean_terminated_length": 97.2671875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008212862927091497, "frac_reward_zero_std": 0.86875, "grad_norm": 1.9552597999572754, "kl": 0.5845108592649921, "learning_rate": 4.835396825396826e-07, "loss": 0.0006, "num_tokens": 641719163.0, "reward": 0.3640625, "reward_std": 0.11373353153467178, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.93118314743042, "step": 9075 }, { "completion_length": 525.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 525.6, "completions/max_terminated_length": 462.0, "completions/mean_length": 93.38984375, "completions/mean_terminated_length": 92.8722412109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008217387920439757, "frac_reward_zero_std": 0.9, "grad_norm": 1.0607982873916626, "kl": 0.9007439368404448, "learning_rate": 4.835e-07, "loss": 0.0009, "num_tokens": 642036470.0, "reward": 0.35, "reward_std": 0.08390747904777526, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9266488075256347, "step": 9080 }, { "completion_length": 267.8, "completions/clipped_ratio": 0.0, "completions/max_length": 267.8, "completions/max_terminated_length": 267.8, "completions/mean_length": 90.68984375, "completions/mean_terminated_length": 90.68984375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008221912913788017, "frac_reward_zero_std": 0.93125, "grad_norm": 0.5751284956932068, "kl": 0.37860121272969993, "learning_rate": 4.834603174603175e-07, "loss": 0.0004, "num_tokens": 642350585.0, "reward": 0.14375, "reward_std": 0.06133858636021614, "rewards/verify_chess_move/mean": 0.14375, "rewards/verify_chess_move/std": 0.9684825778007508, "step": 9085 }, { "completion_length": 362.8, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/max_terminated_length": 362.8, "completions/mean_length": 86.1859375, "completions/mean_terminated_length": 86.1859375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008226437907136277, "frac_reward_zero_std": 0.89375, "grad_norm": 1.7592909336090088, "kl": 0.813540513755288, "learning_rate": 4.834206349206349e-07, "loss": 0.0008, "num_tokens": 642656399.0, "reward": 0.2234375, "reward_std": 0.09127199947834015, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9663491487503052, "step": 9090 }, { "completion_length": 601.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 601.2, "completions/max_terminated_length": 564.4, "completions/mean_length": 103.0484375, "completions/mean_terminated_length": 101.51020965576171, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008230962900484536, "frac_reward_zero_std": 0.8875, "grad_norm": 2.320631504058838, "kl": 1.4580773205670994, "learning_rate": 4.833809523809524e-07, "loss": 0.0015, "num_tokens": 642987885.0, "reward": 0.4015625, "reward_std": 0.09590358287096024, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9111773252487183, "step": 9095 }, { "completion_length": 397.4, "completions/clipped_ratio": 0.0, "completions/max_length": 397.4, "completions/max_terminated_length": 397.4, "completions/mean_length": 96.5671875, "completions/mean_terminated_length": 96.5671875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008235487893832796, "frac_reward_zero_std": 0.875, "grad_norm": 14.41466236114502, "kl": 1.699760045472067, "learning_rate": 4.833412698412698e-07, "loss": 0.0017, "num_tokens": 643310187.0, "reward": 0.3796875, "reward_std": 0.10410689376294613, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.8983843564987183, "step": 9100 }, { "completion_length": 515.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 515.4, "completions/max_terminated_length": 483.4, "completions/mean_length": 94.24296875, "completions/mean_terminated_length": 93.72236938476563, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008240012887181056, "frac_reward_zero_std": 0.89375, "grad_norm": 11.169137001037598, "kl": 3.1358754141721876, "learning_rate": 4.833015873015873e-07, "loss": 0.0031, "num_tokens": 643628034.0, "reward": 0.3796875, "reward_std": 0.08922205045819283, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9181496143341065, "step": 9105 }, { "completion_length": 351.2, "completions/clipped_ratio": 0.0, "completions/max_length": 351.2, "completions/max_terminated_length": 351.2, "completions/mean_length": 90.55, "completions/mean_terminated_length": 90.55, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008244537880529316, "frac_reward_zero_std": 0.90625, "grad_norm": 16.337345123291016, "kl": 2.196833919931669, "learning_rate": 4.832619047619047e-07, "loss": 0.0022, "num_tokens": 643941146.0, "reward": 0.30625, "reward_std": 0.08606125265359879, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9519921541213989, "step": 9110 }, { "completion_length": 286.4, "completions/clipped_ratio": 0.0, "completions/max_length": 286.4, "completions/max_terminated_length": 286.4, "completions/mean_length": 91.225, "completions/mean_terminated_length": 91.225, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008249062873877576, "frac_reward_zero_std": 0.9125, "grad_norm": 5.046541690826416, "kl": 3.235363294207491, "learning_rate": 4.832222222222222e-07, "loss": 0.0032, "num_tokens": 644254778.0, "reward": 0.4578125, "reward_std": 0.07301771640777588, "rewards/verify_chess_move/mean": 0.4578125, "rewards/verify_chess_move/std": 0.86339852809906, "step": 9115 }, { "completion_length": 458.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 458.2, "completions/max_terminated_length": 415.6, "completions/mean_length": 94.7765625, "completions/mean_terminated_length": 94.25378723144532, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008253587867225836, "frac_reward_zero_std": 0.86875, "grad_norm": 5.866464614868164, "kl": 4.198427031585015, "learning_rate": 4.831825396825396e-07, "loss": 0.0042, "num_tokens": 644573772.0, "reward": 0.2546875, "reward_std": 0.1098932757973671, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9575514078140259, "step": 9120 }, { "completion_length": 383.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 383.8, "completions/max_terminated_length": 366.8, "completions/mean_length": 100.65, "completions/mean_terminated_length": 100.13250732421875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.008258112860574094, "frac_reward_zero_std": 0.9375, "grad_norm": 6.8973307609558105, "kl": 1.7855287000886164, "learning_rate": 4.831428571428571e-07, "loss": 0.0018, "num_tokens": 644905060.0, "reward": 0.2796875, "reward_std": 0.053290097415447234, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9537341475486756, "step": 9125 }, { "completion_length": 528.2, "completions/clipped_ratio": 0.0, "completions/max_length": 528.2, "completions/max_terminated_length": 528.2, "completions/mean_length": 88.9625, "completions/mean_terminated_length": 88.9625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008262637853922354, "frac_reward_zero_std": 0.8625, "grad_norm": 7.265219211578369, "kl": 5.425211182446219, "learning_rate": 4.831031746031745e-07, "loss": 0.0054, "num_tokens": 645214276.0, "reward": 0.365625, "reward_std": 0.11452387571334839, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.8966774582862854, "step": 9130 }, { "completion_length": 473.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 473.0, "completions/max_terminated_length": 426.4, "completions/mean_length": 91.6578125, "completions/mean_terminated_length": 90.6093505859375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008267162847270615, "frac_reward_zero_std": 0.925, "grad_norm": 3.145340919494629, "kl": 4.1563259434886275, "learning_rate": 4.830634920634921e-07, "loss": 0.0042, "num_tokens": 645528670.0, "reward": 0.4015625, "reward_std": 0.06370805725455284, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.8727728366851807, "step": 9135 }, { "completion_length": 288.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 89.14140625, "completions/mean_terminated_length": 89.14140625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008271687840618875, "frac_reward_zero_std": 0.93125, "grad_norm": 5.8304643630981445, "kl": 1.6460168116725982, "learning_rate": 4.830238095238095e-07, "loss": 0.0016, "num_tokens": 645839739.0, "reward": 0.2609375, "reward_std": 0.06202157661318779, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9637048006057739, "step": 9140 }, { "completion_length": 459.6, "completions/clipped_ratio": 0.0, "completions/max_length": 459.6, "completions/max_terminated_length": 459.6, "completions/mean_length": 99.15, "completions/mean_terminated_length": 99.15, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.008276212833967135, "frac_reward_zero_std": 0.89375, "grad_norm": 10.08247184753418, "kl": 4.322144574741833, "learning_rate": 4.829841269841269e-07, "loss": 0.0043, "num_tokens": 646168211.0, "reward": 0.303125, "reward_std": 0.09605487883090973, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.941137170791626, "step": 9145 }, { "completion_length": 645.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 645.4, "completions/max_terminated_length": 628.8, "completions/mean_length": 102.6265625, "completions/mean_terminated_length": 101.08488464355469, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008280737827315393, "frac_reward_zero_std": 0.90625, "grad_norm": 1.4671533107757568, "kl": 3.8826578353648076, "learning_rate": 4.829444444444445e-07, "loss": 0.0039, "num_tokens": 646500925.0, "reward": 0.1, "reward_std": 0.07927589379251003, "rewards/verify_chess_move/mean": 0.1, "rewards/verify_chess_move/std": 0.9891760110855102, "step": 9150 }, { "completion_length": 476.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 476.0, "completions/max_terminated_length": 378.4, "completions/mean_length": 100.9546875, "completions/mean_terminated_length": 100.43722534179688, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008285262820663653, "frac_reward_zero_std": 0.94375, "grad_norm": 4.848820686340332, "kl": 1.2638983400072903, "learning_rate": 4.829047619047618e-07, "loss": 0.0013, "num_tokens": 646830643.0, "reward": 0.3859375, "reward_std": 0.04660856798291206, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9041926383972168, "step": 9155 }, { "completion_length": 443.4, "completions/clipped_ratio": 0.0, "completions/max_length": 443.4, "completions/max_terminated_length": 443.4, "completions/mean_length": 96.1625, "completions/mean_terminated_length": 96.1625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008289787814011913, "frac_reward_zero_std": 0.85625, "grad_norm": 4.2427077293396, "kl": 3.4293274926021695, "learning_rate": 4.828650793650794e-07, "loss": 0.0034, "num_tokens": 647150643.0, "reward": 0.365625, "reward_std": 0.13051545917987822, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9255046963691711, "step": 9160 }, { "completion_length": 321.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 94.975, "completions/mean_terminated_length": 94.975, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008294312807360173, "frac_reward_zero_std": 0.9125, "grad_norm": 3.7516298294067383, "kl": 1.3100891776848584, "learning_rate": 4.828253968253968e-07, "loss": 0.0013, "num_tokens": 647472035.0, "reward": 0.3734375, "reward_std": 0.07780157923698425, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9225630521774292, "step": 9165 }, { "completion_length": 508.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 508.8, "completions/max_terminated_length": 443.2, "completions/mean_length": 95.8453125, "completions/mean_terminated_length": 95.31008605957031, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008298837800708433, "frac_reward_zero_std": 0.90625, "grad_norm": 5.95839262008667, "kl": 2.619569813495036, "learning_rate": 4.827857142857143e-07, "loss": 0.0026, "num_tokens": 647792749.0, "reward": 0.353125, "reward_std": 0.08264533057808876, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9284402370452881, "step": 9170 }, { "completion_length": 419.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 419.4, "completions/max_terminated_length": 404.2, "completions/mean_length": 101.59296875, "completions/mean_terminated_length": 101.06954193115234, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008303362794056694, "frac_reward_zero_std": 0.90625, "grad_norm": 0.073554128408432, "kl": 1.449213214800693, "learning_rate": 4.827460317460317e-07, "loss": 0.0014, "num_tokens": 648125020.0, "reward": 0.3796875, "reward_std": 0.07585898637771607, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9250163078308106, "step": 9175 }, { "completion_length": 468.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 468.2, "completions/max_terminated_length": 450.2, "completions/mean_length": 103.4984375, "completions/mean_terminated_length": 102.99561767578125, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008307887787404952, "frac_reward_zero_std": 0.88125, "grad_norm": 11.49138069152832, "kl": 4.722079240891617, "learning_rate": 4.827063492063492e-07, "loss": 0.0047, "num_tokens": 648457626.0, "reward": 0.2546875, "reward_std": 0.10373990386724471, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9264665484428406, "step": 9180 }, { "completion_length": 528.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 528.2, "completions/max_terminated_length": 437.0, "completions/mean_length": 91.478125, "completions/mean_terminated_length": 90.41501159667969, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008312412780753212, "frac_reward_zero_std": 0.85, "grad_norm": 15.238113403320312, "kl": 4.861331216699909, "learning_rate": 4.826666666666666e-07, "loss": 0.0049, "num_tokens": 648771406.0, "reward": 0.33125, "reward_std": 0.13104518949985505, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9360065937042237, "step": 9185 }, { "completion_length": 455.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 455.0, "completions/max_terminated_length": 440.6, "completions/mean_length": 91.04921875, "completions/mean_terminated_length": 90.53334045410156, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008316937774101472, "frac_reward_zero_std": 0.9375, "grad_norm": 4.4182000160217285, "kl": 3.758204090700019, "learning_rate": 4.826269841269841e-07, "loss": 0.0038, "num_tokens": 649083445.0, "reward": 0.346875, "reward_std": 0.048978038132190704, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9314282774925232, "step": 9190 }, { "completion_length": 443.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 95.2640625, "completions/mean_terminated_length": 95.2640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008321462767449732, "frac_reward_zero_std": 0.90625, "grad_norm": 2.190150737762451, "kl": 3.1769485416589305, "learning_rate": 4.825873015873016e-07, "loss": 0.0032, "num_tokens": 649402591.0, "reward": 0.4453125, "reward_std": 0.07738966271281242, "rewards/verify_chess_move/mean": 0.4453125, "rewards/verify_chess_move/std": 0.8894773483276367, "step": 9195 }, { "completion_length": 367.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 92.4078125, "completions/mean_terminated_length": 92.4078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008325987760797992, "frac_reward_zero_std": 0.89375, "grad_norm": 6.000455856323242, "kl": 2.0683640317409298, "learning_rate": 4.82547619047619e-07, "loss": 0.0021, "num_tokens": 649719857.0, "reward": 0.3234375, "reward_std": 0.09080117493867874, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9205793023109436, "step": 9200 }, { "completion_length": 409.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 409.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 94.41015625, "completions/mean_terminated_length": 93.89844207763672, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00833051275414625, "frac_reward_zero_std": 0.8625, "grad_norm": 7.832972526550293, "kl": 4.995625975634903, "learning_rate": 4.825079365079365e-07, "loss": 0.005, "num_tokens": 650038366.0, "reward": 0.2703125, "reward_std": 0.12156985253095627, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.953728461265564, "step": 9205 }, { "completion_length": 429.8, "completions/clipped_ratio": 0.0, "completions/max_length": 429.8, "completions/max_terminated_length": 429.8, "completions/mean_length": 98.91640625, "completions/mean_terminated_length": 98.91640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00833503774749451, "frac_reward_zero_std": 0.8375, "grad_norm": 10.020511627197266, "kl": 2.823691729956772, "learning_rate": 4.824682539682539e-07, "loss": 0.0028, "num_tokens": 650363707.0, "reward": 0.2609375, "reward_std": 0.1453973650932312, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9527931213378906, "step": 9210 }, { "completion_length": 431.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 431.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 97.89609375, "completions/mean_terminated_length": 97.38493957519532, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00833956274084277, "frac_reward_zero_std": 0.875, "grad_norm": 7.111569881439209, "kl": 4.912901219306514, "learning_rate": 4.824285714285714e-07, "loss": 0.0049, "num_tokens": 650689070.0, "reward": 0.2109375, "reward_std": 0.11473251283168792, "rewards/verify_chess_move/mean": 0.2109375, "rewards/verify_chess_move/std": 0.942557692527771, "step": 9215 }, { "completion_length": 410.8, "completions/clipped_ratio": 0.0, "completions/max_length": 410.8, "completions/max_terminated_length": 410.8, "completions/mean_length": 94.65078125, "completions/mean_terminated_length": 94.65078125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008344087734191031, "frac_reward_zero_std": 0.925, "grad_norm": 1.1234108209609985, "kl": 2.662231726443861, "learning_rate": 4.823888888888888e-07, "loss": 0.0027, "num_tokens": 651009743.0, "reward": 0.1375, "reward_std": 0.06507501490414143, "rewards/verify_chess_move/mean": 0.1375, "rewards/verify_chess_move/std": 0.9841734409332276, "step": 9220 }, { "completion_length": 519.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 519.2, "completions/max_terminated_length": 333.0, "completions/mean_length": 98.58671875, "completions/mean_terminated_length": 97.5473403930664, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008348612727539291, "frac_reward_zero_std": 0.875, "grad_norm": 10.5188570022583, "kl": 2.7788907599751838, "learning_rate": 4.823492063492064e-07, "loss": 0.0028, "num_tokens": 651336030.0, "reward": 0.4375, "reward_std": 0.10957375168800354, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8830983400344848, "step": 9225 }, { "completion_length": 544.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 544.0, "completions/max_terminated_length": 402.4, "completions/mean_length": 101.55078125, "completions/mean_terminated_length": 100.50919036865234, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.008353137720887551, "frac_reward_zero_std": 0.9125, "grad_norm": 22.215505599975586, "kl": 3.9331819901824927, "learning_rate": 4.823095238095237e-07, "loss": 0.0039, "num_tokens": 651667911.0, "reward": 0.1734375, "reward_std": 0.07711760886013508, "rewards/verify_chess_move/mean": 0.1734375, "rewards/verify_chess_move/std": 0.981086790561676, "step": 9230 }, { "completion_length": 426.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.0, "completions/max_terminated_length": 363.6, "completions/mean_length": 93.74375, "completions/mean_terminated_length": 93.2205795288086, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00835766271423581, "frac_reward_zero_std": 0.925, "grad_norm": 3.8429341316223145, "kl": 7.586868399684318, "learning_rate": 4.822698412698413e-07, "loss": 0.0076, "num_tokens": 651986751.0, "reward": 0.259375, "reward_std": 0.06392022259533406, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9623661518096924, "step": 9235 }, { "completion_length": 433.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 433.8, "completions/max_terminated_length": 345.2, "completions/mean_length": 94.3125, "completions/mean_terminated_length": 93.78464813232422, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00836218770758407, "frac_reward_zero_std": 0.9125, "grad_norm": 12.12434196472168, "kl": 8.686114123126027, "learning_rate": 4.822301587301587e-07, "loss": 0.0087, "num_tokens": 652306311.0, "reward": 0.2796875, "reward_std": 0.06891782097518444, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9438716769218445, "step": 9240 }, { "completion_length": 466.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 466.8, "completions/max_terminated_length": 447.8, "completions/mean_length": 110.00625, "completions/mean_terminated_length": 109.49817810058593, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.00836671270093233, "frac_reward_zero_std": 0.88125, "grad_norm": 1.634485125541687, "kl": 12.570825243066064, "learning_rate": 4.821904761904761e-07, "loss": 0.0126, "num_tokens": 652653615.0, "reward": 0.2453125, "reward_std": 0.10194959193468094, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9487319588661194, "step": 9245 }, { "completion_length": 459.2, "completions/clipped_ratio": 0.0, "completions/max_length": 459.2, "completions/max_terminated_length": 459.2, "completions/mean_length": 97.03828125, "completions/mean_terminated_length": 97.03828125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00837123769428059, "frac_reward_zero_std": 0.9, "grad_norm": 8.267529487609863, "kl": 13.392210802366026, "learning_rate": 4.821507936507937e-07, "loss": 0.0134, "num_tokens": 652975080.0, "reward": 0.3359375, "reward_std": 0.08343567252159119, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9283053517341614, "step": 9250 }, { "completion_length": 432.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 432.0, "completions/max_terminated_length": 415.6, "completions/mean_length": 96.4453125, "completions/mean_terminated_length": 95.92718200683593, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00837576268762885, "frac_reward_zero_std": 0.89375, "grad_norm": 5.439977169036865, "kl": 5.5000243206624875, "learning_rate": 4.821111111111111e-07, "loss": 0.0055, "num_tokens": 653297826.0, "reward": 0.2734375, "reward_std": 0.0860637977719307, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9591793417930603, "step": 9255 }, { "completion_length": 496.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 96.66015625, "completions/mean_terminated_length": 96.66015625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008380287680977108, "frac_reward_zero_std": 0.8625, "grad_norm": 18.015853881835938, "kl": 3.88117395847803, "learning_rate": 4.820714285714286e-07, "loss": 0.0039, "num_tokens": 653620727.0, "reward": 0.340625, "reward_std": 0.12357232570648194, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9301450490951538, "step": 9260 }, { "completion_length": 521.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 521.8, "completions/max_terminated_length": 349.8, "completions/mean_length": 88.35546875, "completions/mean_terminated_length": 87.29272003173828, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.008384812674325368, "frac_reward_zero_std": 0.89375, "grad_norm": 8.589582443237305, "kl": 3.13949258858338, "learning_rate": 4.82031746031746e-07, "loss": 0.0031, "num_tokens": 653928950.0, "reward": 0.396875, "reward_std": 0.0931097786873579, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9132237195968628, "step": 9265 }, { "completion_length": 460.6, "completions/clipped_ratio": 0.0, "completions/max_length": 460.6, "completions/max_terminated_length": 460.6, "completions/mean_length": 95.159375, "completions/mean_terminated_length": 95.159375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008389337667673628, "frac_reward_zero_std": 0.8875, "grad_norm": 12.13362979888916, "kl": 4.999783187487628, "learning_rate": 4.819920634920635e-07, "loss": 0.005, "num_tokens": 654250338.0, "reward": 0.2546875, "reward_std": 0.10184223651885986, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9658098578453064, "step": 9270 }, { "completion_length": 335.8, "completions/clipped_ratio": 0.0, "completions/max_length": 335.8, "completions/max_terminated_length": 335.8, "completions/mean_length": 94.08125, "completions/mean_terminated_length": 94.08125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008393862661021888, "frac_reward_zero_std": 0.875, "grad_norm": 5.978786945343018, "kl": 5.932095100567676, "learning_rate": 4.819523809523809e-07, "loss": 0.0059, "num_tokens": 654569762.0, "reward": 0.45625, "reward_std": 0.1097849354147911, "rewards/verify_chess_move/mean": 0.45625, "rewards/verify_chess_move/std": 0.8798062682151795, "step": 9275 }, { "completion_length": 307.8, "completions/clipped_ratio": 0.0, "completions/max_length": 307.8, "completions/max_terminated_length": 307.8, "completions/mean_length": 87.7515625, "completions/mean_terminated_length": 87.7515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008398387654370149, "frac_reward_zero_std": 0.875, "grad_norm": 5.6174235343933105, "kl": 4.802860180754214, "learning_rate": 4.819126984126984e-07, "loss": 0.0048, "num_tokens": 654878956.0, "reward": 0.265625, "reward_std": 0.11477998420596122, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.956396484375, "step": 9280 }, { "completion_length": 405.2, "completions/clipped_ratio": 0.0, "completions/max_length": 405.2, "completions/max_terminated_length": 405.2, "completions/mean_length": 94.00546875, "completions/mean_terminated_length": 94.00546875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008402912647718409, "frac_reward_zero_std": 0.93125, "grad_norm": 4.950190544128418, "kl": 1.7866251742467285, "learning_rate": 4.818730158730158e-07, "loss": 0.0018, "num_tokens": 655197259.0, "reward": 0.484375, "reward_std": 0.05476441346108914, "rewards/verify_chess_move/mean": 0.484375, "rewards/verify_chess_move/std": 0.8651567101478577, "step": 9285 }, { "completion_length": 444.6, "completions/clipped_ratio": 0.0, "completions/max_length": 444.6, "completions/max_terminated_length": 444.6, "completions/mean_length": 96.06640625, "completions/mean_terminated_length": 96.06640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008407437641066667, "frac_reward_zero_std": 0.8875, "grad_norm": 10.043524742126465, "kl": 1.993996952008456, "learning_rate": 4.818333333333333e-07, "loss": 0.002, "num_tokens": 655518952.0, "reward": 0.3484375, "reward_std": 0.09974384009838104, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9262296199798584, "step": 9290 }, { "completion_length": 404.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 91.3484375, "completions/mean_terminated_length": 91.3484375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008411962634414927, "frac_reward_zero_std": 0.95625, "grad_norm": 6.213983058929443, "kl": 2.9183759381761774, "learning_rate": 4.817936507936507e-07, "loss": 0.0029, "num_tokens": 655834070.0, "reward": 0.365625, "reward_std": 0.03956102356314659, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.892115318775177, "step": 9295 }, { "completion_length": 406.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 406.2, "completions/max_terminated_length": 377.8, "completions/mean_length": 92.13125, "completions/mean_terminated_length": 91.06124572753906, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008416487627763187, "frac_reward_zero_std": 0.90625, "grad_norm": 16.142513275146484, "kl": 8.029375168890692, "learning_rate": 4.817539682539682e-07, "loss": 0.008, "num_tokens": 656150230.0, "reward": 0.30625, "reward_std": 0.07901625595986843, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.94354727268219, "step": 9300 }, { "completion_length": 521.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 521.2, "completions/max_terminated_length": 463.8, "completions/mean_length": 89.88359375, "completions/mean_terminated_length": 88.85391235351562, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008421012621111447, "frac_reward_zero_std": 0.93125, "grad_norm": 15.796943664550781, "kl": 2.5279961396125143, "learning_rate": 4.817142857142856e-07, "loss": 0.0025, "num_tokens": 656460481.0, "reward": 0.440625, "reward_std": 0.058393485471606256, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8951797008514404, "step": 9305 }, { "completion_length": 537.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 537.0, "completions/max_terminated_length": 516.6, "completions/mean_length": 96.8578125, "completions/mean_terminated_length": 95.31273345947265, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008425537614459707, "frac_reward_zero_std": 0.8875, "grad_norm": 7.130377769470215, "kl": 1.306802392366808, "learning_rate": 4.816746031746032e-07, "loss": 0.0013, "num_tokens": 656783011.0, "reward": 0.3625, "reward_std": 0.10389316380023957, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9156832218170166, "step": 9310 }, { "completion_length": 485.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.4, "completions/max_terminated_length": 431.0, "completions/mean_length": 91.6265625, "completions/mean_terminated_length": 91.09722595214843, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008430062607807966, "frac_reward_zero_std": 0.88125, "grad_norm": 1.3025108575820923, "kl": 2.1549777258653195, "learning_rate": 4.816349206349207e-07, "loss": 0.0022, "num_tokens": 657099053.0, "reward": 0.30625, "reward_std": 0.09921665713191033, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9524715065956115, "step": 9315 }, { "completion_length": 429.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 429.0, "completions/max_terminated_length": 401.2, "completions/mean_length": 96.0125, "completions/mean_terminated_length": 94.9797119140625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008434587601156226, "frac_reward_zero_std": 0.925, "grad_norm": 8.022799491882324, "kl": 0.9862012909492478, "learning_rate": 4.81595238095238e-07, "loss": 0.001, "num_tokens": 657422893.0, "reward": 0.284375, "reward_std": 0.06827975511550903, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9473429679870605, "step": 9320 }, { "completion_length": 440.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 440.6, "completions/max_terminated_length": 372.4, "completions/mean_length": 94.15078125, "completions/mean_terminated_length": 93.62604522705078, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008439112594504486, "frac_reward_zero_std": 0.93125, "grad_norm": 0.05392042547464371, "kl": 0.8515264898422175, "learning_rate": 4.815555555555556e-07, "loss": 0.0009, "num_tokens": 657740862.0, "reward": 0.4296875, "reward_std": 0.05544740222394466, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.9001101493835449, "step": 9325 }, { "completion_length": 371.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 371.2, "completions/max_terminated_length": 337.0, "completions/mean_length": 91.17578125, "completions/mean_terminated_length": 90.64615020751953, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008443637587852746, "frac_reward_zero_std": 0.9, "grad_norm": 4.118818283081055, "kl": 0.6777193958172575, "learning_rate": 4.81515873015873e-07, "loss": 0.0007, "num_tokens": 658053791.0, "reward": 0.4140625, "reward_std": 0.08711123764514923, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.9036361813545227, "step": 9330 }, { "completion_length": 523.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 523.2, "completions/max_terminated_length": 426.8, "completions/mean_length": 102.2640625, "completions/mean_terminated_length": 101.20763397216797, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008448162581201006, "frac_reward_zero_std": 0.89375, "grad_norm": 9.035965919494629, "kl": 1.3410420844098554, "learning_rate": 4.814761904761905e-07, "loss": 0.0013, "num_tokens": 658387553.0, "reward": 0.3140625, "reward_std": 0.08853808268904687, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9206356167793274, "step": 9335 }, { "completion_length": 631.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 631.2, "completions/max_terminated_length": 516.0, "completions/mean_length": 97.30078125, "completions/mean_terminated_length": 95.72604522705078, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008452687574549266, "frac_reward_zero_std": 0.9, "grad_norm": 7.384397983551025, "kl": 1.4456582708400674, "learning_rate": 4.814365079365079e-07, "loss": 0.0014, "num_tokens": 658711090.0, "reward": 0.315625, "reward_std": 0.08595742695033551, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9478882074356079, "step": 9340 }, { "completion_length": 395.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 395.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 89.2765625, "completions/mean_terminated_length": 88.19776763916016, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008457212567897525, "frac_reward_zero_std": 0.89375, "grad_norm": 4.887362480163574, "kl": 1.18741773001384, "learning_rate": 4.813968253968254e-07, "loss": 0.0012, "num_tokens": 659021708.0, "reward": 0.48125, "reward_std": 0.08900988399982453, "rewards/verify_chess_move/mean": 0.48125, "rewards/verify_chess_move/std": 0.8741413354873657, "step": 9345 }, { "completion_length": 375.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 375.6, "completions/max_terminated_length": 344.6, "completions/mean_length": 94.84453125, "completions/mean_terminated_length": 94.33135528564453, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008461737561245785, "frac_reward_zero_std": 0.91875, "grad_norm": 6.053299903869629, "kl": 2.105912211886607, "learning_rate": 4.813571428571428e-07, "loss": 0.0021, "num_tokens": 659341565.0, "reward": 0.284375, "reward_std": 0.06770314127206803, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9529846787452698, "step": 9350 }, { "completion_length": 393.6, "completions/clipped_ratio": 0.0, "completions/max_length": 393.6, "completions/max_terminated_length": 393.6, "completions/mean_length": 93.9359375, "completions/mean_terminated_length": 93.9359375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008466262554594045, "frac_reward_zero_std": 0.91875, "grad_norm": 0.989996075630188, "kl": 1.929892251896672, "learning_rate": 4.813174603174603e-07, "loss": 0.0019, "num_tokens": 659660147.0, "reward": 0.3984375, "reward_std": 0.06975407004356385, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9097895145416259, "step": 9355 }, { "completion_length": 376.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 376.6, "completions/max_terminated_length": 289.2, "completions/mean_length": 87.65078125, "completions/mean_terminated_length": 87.12798461914062, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008470787547942305, "frac_reward_zero_std": 0.925, "grad_norm": 3.3254215717315674, "kl": 0.7781939041567967, "learning_rate": 4.812777777777777e-07, "loss": 0.0008, "num_tokens": 659969244.0, "reward": 0.2953125, "reward_std": 0.06691279634833336, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9321548223495484, "step": 9360 }, { "completion_length": 300.2, "completions/clipped_ratio": 0.0, "completions/max_length": 300.2, "completions/max_terminated_length": 300.2, "completions/mean_length": 93.95625, "completions/mean_terminated_length": 93.95625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008475312541290565, "frac_reward_zero_std": 0.90625, "grad_norm": 4.661725044250488, "kl": 0.8427740906598047, "learning_rate": 4.812380952380952e-07, "loss": 0.0008, "num_tokens": 660290132.0, "reward": 0.509375, "reward_std": 0.08017104864120483, "rewards/verify_chess_move/mean": 0.509375, "rewards/verify_chess_move/std": 0.8431695461273193, "step": 9365 }, { "completion_length": 519.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 519.2, "completions/max_terminated_length": 356.4, "completions/mean_length": 89.41953125, "completions/mean_terminated_length": 88.36356506347656, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.008479837534638823, "frac_reward_zero_std": 0.875, "grad_norm": 6.362865924835205, "kl": 1.6135411004535853, "learning_rate": 4.811984126984127e-07, "loss": 0.0016, "num_tokens": 660602597.0, "reward": 0.359375, "reward_std": 0.10090118125081063, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.929101312160492, "step": 9370 }, { "completion_length": 538.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 538.6, "completions/max_terminated_length": 443.6, "completions/mean_length": 100.0390625, "completions/mean_terminated_length": 98.99223175048829, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008484362527987083, "frac_reward_zero_std": 0.9, "grad_norm": 6.075517177581787, "kl": 2.3937015240313486, "learning_rate": 4.811587301587301e-07, "loss": 0.0024, "num_tokens": 660932799.0, "reward": 0.2578125, "reward_std": 0.08800639025866985, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9529729843139648, "step": 9375 }, { "completion_length": 451.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 451.6, "completions/max_terminated_length": 425.8, "completions/mean_length": 100.6234375, "completions/mean_terminated_length": 100.10865020751953, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008488887521335343, "frac_reward_zero_std": 0.8875, "grad_norm": 11.156784057617188, "kl": 2.357189691439271, "learning_rate": 4.811190476190476e-07, "loss": 0.0024, "num_tokens": 661263989.0, "reward": 0.2484375, "reward_std": 0.10367903560400009, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9594184517860412, "step": 9380 }, { "completion_length": 425.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 425.4, "completions/max_terminated_length": 359.0, "completions/mean_length": 98.5140625, "completions/mean_terminated_length": 97.99105529785156, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008493412514683604, "frac_reward_zero_std": 0.85, "grad_norm": 3.510103702545166, "kl": 3.773330853320658, "learning_rate": 4.810793650793651e-07, "loss": 0.0038, "num_tokens": 661589991.0, "reward": 0.2296875, "reward_std": 0.1313038393855095, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9664552450180054, "step": 9385 }, { "completion_length": 507.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 507.0, "completions/max_terminated_length": 462.4, "completions/mean_length": 96.5875, "completions/mean_terminated_length": 95.02609405517578, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008497937508031864, "frac_reward_zero_std": 0.86875, "grad_norm": 0.7587674260139465, "kl": 7.944739974732511, "learning_rate": 4.810396825396825e-07, "loss": 0.0079, "num_tokens": 661912287.0, "reward": 0.3421875, "reward_std": 0.11236559301614761, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.932428526878357, "step": 9390 }, { "completion_length": 396.4, "completions/clipped_ratio": 0.0, "completions/max_length": 396.4, "completions/max_terminated_length": 396.4, "completions/mean_length": 92.28203125, "completions/mean_terminated_length": 92.28203125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.008502462501380124, "frac_reward_zero_std": 0.9125, "grad_norm": 9.928235054016113, "kl": 7.094835991412401, "learning_rate": 4.809999999999999e-07, "loss": 0.0071, "num_tokens": 662229016.0, "reward": 0.3640625, "reward_std": 0.07622245326638222, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9307472944259644, "step": 9395 }, { "completion_length": 506.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 506.6, "completions/max_terminated_length": 501.2, "completions/mean_length": 97.728125, "completions/mean_terminated_length": 97.21535949707031, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008506987494728382, "frac_reward_zero_std": 0.9125, "grad_norm": 4.834281921386719, "kl": 7.017386039812118, "learning_rate": 4.809603174603175e-07, "loss": 0.007, "num_tokens": 662552860.0, "reward": 0.38125, "reward_std": 0.07758941352367402, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.881076967716217, "step": 9400 }, { "completion_length": 628.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 628.2, "completions/max_terminated_length": 570.4, "completions/mean_length": 96.90625, "completions/mean_terminated_length": 95.33530883789062, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008511512488076642, "frac_reward_zero_std": 0.89375, "grad_norm": 19.22907066345215, "kl": 7.066499789908994, "learning_rate": 4.809206349206349e-07, "loss": 0.0071, "num_tokens": 662874260.0, "reward": 0.209375, "reward_std": 0.09490008763968945, "rewards/verify_chess_move/mean": 0.209375, "rewards/verify_chess_move/std": 0.9651721239089965, "step": 9405 }, { "completion_length": 390.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 88.49921875, "completions/mean_terminated_length": 88.49921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008516037481424902, "frac_reward_zero_std": 0.9, "grad_norm": 9.704736709594727, "kl": 3.607699949736707, "learning_rate": 4.808809523809523e-07, "loss": 0.0036, "num_tokens": 663184515.0, "reward": 0.2828125, "reward_std": 0.09205979257822036, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9424848794937134, "step": 9410 }, { "completion_length": 460.8, "completions/clipped_ratio": 0.0, "completions/max_length": 460.8, "completions/max_terminated_length": 460.8, "completions/mean_length": 90.628125, "completions/mean_terminated_length": 90.628125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008520562474773162, "frac_reward_zero_std": 0.875, "grad_norm": 9.537622451782227, "kl": 1.6910462499363348, "learning_rate": 4.808412698412698e-07, "loss": 0.0017, "num_tokens": 663497687.0, "reward": 0.48125, "reward_std": 0.1042706087231636, "rewards/verify_chess_move/mean": 0.48125, "rewards/verify_chess_move/std": 0.8749299049377441, "step": 9415 }, { "completion_length": 446.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 85.44609375, "completions/mean_terminated_length": 85.44609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008525087468121422, "frac_reward_zero_std": 0.90625, "grad_norm": 11.885358810424805, "kl": 3.0794105197885075, "learning_rate": 4.808015873015873e-07, "loss": 0.0031, "num_tokens": 663801874.0, "reward": 0.2765625, "reward_std": 0.07585898861289024, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9520098686218261, "step": 9420 }, { "completion_length": 471.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 471.8, "completions/max_terminated_length": 421.0, "completions/mean_length": 99.7640625, "completions/mean_terminated_length": 99.2453598022461, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00852961246146968, "frac_reward_zero_std": 0.88125, "grad_norm": 8.821602821350098, "kl": 5.971421890379861, "learning_rate": 4.807619047619048e-07, "loss": 0.006, "num_tokens": 664131500.0, "reward": 0.28125, "reward_std": 0.10100696980953217, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9432902455329895, "step": 9425 }, { "completion_length": 697.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 697.6, "completions/max_terminated_length": 499.4, "completions/mean_length": 103.421875, "completions/mean_terminated_length": 101.85750579833984, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008534137454817941, "frac_reward_zero_std": 0.90625, "grad_norm": 8.378866195678711, "kl": 3.7561866507050583, "learning_rate": 4.807222222222222e-07, "loss": 0.0038, "num_tokens": 664466432.0, "reward": 0.2921875, "reward_std": 0.07517501935362816, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9494606614112854, "step": 9430 }, { "completion_length": 504.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 504.6, "completions/max_terminated_length": 372.4, "completions/mean_length": 96.44140625, "completions/mean_terminated_length": 95.40069274902343, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008538662448166201, "frac_reward_zero_std": 0.925, "grad_norm": 15.036600112915039, "kl": 2.5349742006743328, "learning_rate": 4.806825396825397e-07, "loss": 0.0025, "num_tokens": 664790581.0, "reward": 0.2921875, "reward_std": 0.06302408799529076, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9449209332466125, "step": 9435 }, { "completion_length": 485.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.8, "completions/max_terminated_length": 465.6, "completions/mean_length": 99.27890625, "completions/mean_terminated_length": 98.76197662353516, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008543187441514461, "frac_reward_zero_std": 0.88125, "grad_norm": 2.993717908859253, "kl": 3.670282408397179, "learning_rate": 4.806428571428571e-07, "loss": 0.0037, "num_tokens": 665118226.0, "reward": 0.3921875, "reward_std": 0.09942686259746551, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9156835198402404, "step": 9440 }, { "completion_length": 386.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 386.4, "completions/max_terminated_length": 281.8, "completions/mean_length": 87.98203125, "completions/mean_terminated_length": 87.45307769775391, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008547712434862721, "frac_reward_zero_std": 0.91875, "grad_norm": 1.8968063592910767, "kl": 4.654348272876814, "learning_rate": 4.806031746031746e-07, "loss": 0.0047, "num_tokens": 665427323.0, "reward": 0.3296875, "reward_std": 0.06881046295166016, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9337169170379639, "step": 9445 }, { "completion_length": 366.8, "completions/clipped_ratio": 0.0, "completions/max_length": 366.8, "completions/max_terminated_length": 366.8, "completions/mean_length": 91.22578125, "completions/mean_terminated_length": 91.22578125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008552237428210981, "frac_reward_zero_std": 0.95, "grad_norm": 0.2830674350261688, "kl": 1.5155992464511656, "learning_rate": 4.80563492063492e-07, "loss": 0.0015, "num_tokens": 665742404.0, "reward": 0.3375, "reward_std": 0.04192951247096062, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9174351096153259, "step": 9450 }, { "completion_length": 511.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 93.31640625, "completions/mean_terminated_length": 93.31640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00855676242155924, "frac_reward_zero_std": 0.875, "grad_norm": 8.410032272338867, "kl": 3.9721338028321043, "learning_rate": 4.805238095238095e-07, "loss": 0.004, "num_tokens": 666059385.0, "reward": 0.265625, "reward_std": 0.09795607626438141, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9569802641868591, "step": 9455 }, { "completion_length": 636.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 636.2, "completions/max_terminated_length": 488.0, "completions/mean_length": 96.59765625, "completions/mean_terminated_length": 95.54465026855469, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0085612874149075, "frac_reward_zero_std": 0.925, "grad_norm": 8.399733543395996, "kl": 4.504603658244013, "learning_rate": 4.804841269841269e-07, "loss": 0.0045, "num_tokens": 666382702.0, "reward": 0.3953125, "reward_std": 0.0655448567122221, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.8942236542701721, "step": 9460 }, { "completion_length": 579.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 579.6, "completions/max_terminated_length": 569.6, "completions/mean_length": 92.046875, "completions/mean_terminated_length": 91.52108612060547, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00856581240825576, "frac_reward_zero_std": 0.925, "grad_norm": 1.95552659034729, "kl": 3.9106383091420867, "learning_rate": 4.804444444444444e-07, "loss": 0.0039, "num_tokens": 666697178.0, "reward": 0.359375, "reward_std": 0.06323625519871712, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9293490529060364, "step": 9465 }, { "completion_length": 374.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 374.2, "completions/max_terminated_length": 288.2, "completions/mean_length": 93.86953125, "completions/mean_terminated_length": 93.35269470214844, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00857033740160402, "frac_reward_zero_std": 0.86875, "grad_norm": 13.607959747314453, "kl": 9.472131396341137, "learning_rate": 4.804047619047618e-07, "loss": 0.0095, "num_tokens": 667016955.0, "reward": 0.340625, "reward_std": 0.11057626456022263, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9375919222831726, "step": 9470 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 87.20234375, "completions/mean_terminated_length": 87.20234375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00857486239495228, "frac_reward_zero_std": 0.93125, "grad_norm": 3.4164721965789795, "kl": 5.021293166466057, "learning_rate": 4.803650793650794e-07, "loss": 0.005, "num_tokens": 667324094.0, "reward": 0.3234375, "reward_std": 0.054976581037044524, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9200543761253357, "step": 9475 }, { "completion_length": 467.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 467.6, "completions/max_terminated_length": 466.6, "completions/mean_length": 96.45546875, "completions/mean_terminated_length": 95.43054809570313, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008579387388300538, "frac_reward_zero_std": 0.86875, "grad_norm": 7.5771965980529785, "kl": 6.60972410554532, "learning_rate": 4.803253968253967e-07, "loss": 0.0066, "num_tokens": 667647469.0, "reward": 0.3171875, "reward_std": 0.11415786594152451, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9291247487068176, "step": 9480 }, { "completion_length": 319.4, "completions/clipped_ratio": 0.0, "completions/max_length": 319.4, "completions/max_terminated_length": 319.4, "completions/mean_length": 91.98515625, "completions/mean_terminated_length": 91.98515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008583912381648798, "frac_reward_zero_std": 0.94375, "grad_norm": 7.207892417907715, "kl": 1.6915363835403696, "learning_rate": 4.802857142857142e-07, "loss": 0.0017, "num_tokens": 667965338.0, "reward": 0.246875, "reward_std": 0.04319166131317616, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9678603053092957, "step": 9485 }, { "completion_length": 336.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 97.23125, "completions/mean_terminated_length": 97.23125, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008588437374997059, "frac_reward_zero_std": 0.91875, "grad_norm": 3.95080828666687, "kl": 1.5732722133863717, "learning_rate": 4.802460317460318e-07, "loss": 0.0016, "num_tokens": 668291202.0, "reward": 0.35625, "reward_std": 0.0636032484471798, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9301980495452881, "step": 9490 }, { "completion_length": 599.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 599.0, "completions/max_terminated_length": 476.4, "completions/mean_length": 100.5734375, "completions/mean_terminated_length": 99.51941375732422, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008592962368345319, "frac_reward_zero_std": 0.85625, "grad_norm": 14.419404029846191, "kl": 5.017970304144546, "learning_rate": 4.802063492063492e-07, "loss": 0.005, "num_tokens": 668619768.0, "reward": 0.2046875, "reward_std": 0.12441112771630287, "rewards/verify_chess_move/mean": 0.2046875, "rewards/verify_chess_move/std": 0.9503390431404114, "step": 9495 }, { "completion_length": 495.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 495.0, "completions/max_terminated_length": 482.2, "completions/mean_length": 91.028125, "completions/mean_terminated_length": 90.50838317871094, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008597487361693579, "frac_reward_zero_std": 0.88125, "grad_norm": 5.37344217300415, "kl": 3.484147153387312, "learning_rate": 4.801666666666667e-07, "loss": 0.0035, "num_tokens": 668932132.0, "reward": 0.378125, "reward_std": 0.10310340523719788, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9200665593147278, "step": 9500 }, { "completion_length": 576.4, "completions/clipped_ratio": 0.003125, "completions/max_length": 576.4, "completions/max_terminated_length": 481.0, "completions/mean_length": 98.24453125, "completions/mean_terminated_length": 96.1554183959961, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008602012355041839, "frac_reward_zero_std": 0.8625, "grad_norm": 10.62663745880127, "kl": 2.3896158997667953, "learning_rate": 4.801269841269841e-07, "loss": 0.0024, "num_tokens": 669258205.0, "reward": 0.321875, "reward_std": 0.10995315611362458, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9291588544845581, "step": 9505 }, { "completion_length": 416.4, "completions/clipped_ratio": 0.0, "completions/max_length": 416.4, "completions/max_terminated_length": 416.4, "completions/mean_length": 87.0171875, "completions/mean_terminated_length": 87.0171875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.008606537348390097, "frac_reward_zero_std": 0.85625, "grad_norm": 8.156719207763672, "kl": 7.766403196414467, "learning_rate": 4.800873015873016e-07, "loss": 0.0078, "num_tokens": 669565779.0, "reward": 0.2671875, "reward_std": 0.12683695703744888, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.952777373790741, "step": 9510 }, { "completion_length": 420.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 420.4, "completions/max_terminated_length": 415.2, "completions/mean_length": 95.84296875, "completions/mean_terminated_length": 95.33289337158203, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.008611062341738357, "frac_reward_zero_std": 0.90625, "grad_norm": 6.3317389488220215, "kl": 5.667967664333991, "learning_rate": 4.80047619047619e-07, "loss": 0.0057, "num_tokens": 669888362.0, "reward": 0.20625, "reward_std": 0.08358893394470215, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.972002136707306, "step": 9515 }, { "completion_length": 365.6, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 86.06328125, "completions/mean_terminated_length": 86.06328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008615587335086617, "frac_reward_zero_std": 0.90625, "grad_norm": 6.57358455657959, "kl": 6.152847277699038, "learning_rate": 4.800079365079365e-07, "loss": 0.0062, "num_tokens": 670195899.0, "reward": 0.396875, "reward_std": 0.07675316333770751, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9164797782897949, "step": 9520 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 97.95234375, "completions/mean_terminated_length": 97.95234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008620112328434877, "frac_reward_zero_std": 0.9, "grad_norm": 8.260920524597168, "kl": 23.50279444780899, "learning_rate": 4.799682539682539e-07, "loss": 0.0235, "num_tokens": 670522526.0, "reward": 0.334375, "reward_std": 0.08117356263101101, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9306373000144958, "step": 9525 }, { "completion_length": 348.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 87.7328125, "completions/mean_terminated_length": 87.7328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008624637321783138, "frac_reward_zero_std": 0.89375, "grad_norm": 6.719478607177734, "kl": 9.034803182259202, "learning_rate": 4.799285714285714e-07, "loss": 0.009, "num_tokens": 670830784.0, "reward": 0.3390625, "reward_std": 0.08696091771125794, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9374397039413452, "step": 9530 }, { "completion_length": 311.6, "completions/clipped_ratio": 0.0, "completions/max_length": 311.6, "completions/max_terminated_length": 311.6, "completions/mean_length": 88.734375, "completions/mean_terminated_length": 88.734375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008629162315131396, "frac_reward_zero_std": 0.9125, "grad_norm": 5.194381237030029, "kl": 2.391180353448726, "learning_rate": 4.798888888888888e-07, "loss": 0.0024, "num_tokens": 671142988.0, "reward": 0.25, "reward_std": 0.07301869541406632, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9597088694572449, "step": 9535 }, { "completion_length": 375.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 375.0, "completions/max_terminated_length": 352.2, "completions/mean_length": 94.3359375, "completions/mean_terminated_length": 93.31414184570312, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008633687308479656, "frac_reward_zero_std": 0.86875, "grad_norm": 2.2614359855651855, "kl": 5.7564782335422935, "learning_rate": 4.798492063492063e-07, "loss": 0.0058, "num_tokens": 671460778.0, "reward": 0.3828125, "reward_std": 0.11026915609836578, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9185662865638733, "step": 9540 }, { "completion_length": 537.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 537.8, "completions/max_terminated_length": 507.6, "completions/mean_length": 96.38359375, "completions/mean_terminated_length": 95.34857940673828, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008638212301827916, "frac_reward_zero_std": 0.875, "grad_norm": 7.055457592010498, "kl": 3.9355725359288045, "learning_rate": 4.798095238095238e-07, "loss": 0.0039, "num_tokens": 671782733.0, "reward": 0.3703125, "reward_std": 0.09863906539976597, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9035208821296692, "step": 9545 }, { "completion_length": 449.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 449.4, "completions/max_terminated_length": 422.8, "completions/mean_length": 95.7265625, "completions/mean_terminated_length": 95.21870269775391, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008642737295176176, "frac_reward_zero_std": 0.88125, "grad_norm": 10.980551719665527, "kl": 5.299850494880229, "learning_rate": 4.797698412698413e-07, "loss": 0.0053, "num_tokens": 672105303.0, "reward": 0.3921875, "reward_std": 0.10510587766766548, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9124803423881531, "step": 9550 }, { "completion_length": 505.6, "completions/clipped_ratio": 0.0, "completions/max_length": 505.6, "completions/max_terminated_length": 505.6, "completions/mean_length": 104.57265625, "completions/mean_terminated_length": 104.57265625, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.008647262288524436, "frac_reward_zero_std": 0.9125, "grad_norm": 18.595361709594727, "kl": 10.840629787230863, "learning_rate": 4.797301587301587e-07, "loss": 0.0108, "num_tokens": 672442420.0, "reward": 0.271875, "reward_std": 0.07985250651836395, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9415788173675537, "step": 9555 }, { "completion_length": 431.8, "completions/clipped_ratio": 0.0, "completions/max_length": 431.8, "completions/max_terminated_length": 431.8, "completions/mean_length": 94.80859375, "completions/mean_terminated_length": 94.80859375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008651787281872696, "frac_reward_zero_std": 0.93125, "grad_norm": 15.308127403259277, "kl": 5.019555233023129, "learning_rate": 4.796904761904761e-07, "loss": 0.005, "num_tokens": 672763447.0, "reward": 0.2265625, "reward_std": 0.05792168229818344, "rewards/verify_chess_move/mean": 0.2265625, "rewards/verify_chess_move/std": 0.9523613214492798, "step": 9560 }, { "completion_length": 399.6, "completions/clipped_ratio": 0.0, "completions/max_length": 399.6, "completions/max_terminated_length": 399.6, "completions/mean_length": 89.446875, "completions/mean_terminated_length": 89.446875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008656312275220955, "frac_reward_zero_std": 0.8625, "grad_norm": 2.452457904815674, "kl": 3.1214723169920036, "learning_rate": 4.796507936507937e-07, "loss": 0.0031, "num_tokens": 673075403.0, "reward": 0.290625, "reward_std": 0.11200310662388802, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9541359305381775, "step": 9565 }, { "completion_length": 389.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 389.0, "completions/max_terminated_length": 300.4, "completions/mean_length": 91.8359375, "completions/mean_terminated_length": 91.3071273803711, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008660837268569215, "frac_reward_zero_std": 0.9, "grad_norm": 15.880790710449219, "kl": 5.035256352100987, "learning_rate": 4.79611111111111e-07, "loss": 0.005, "num_tokens": 673391745.0, "reward": 0.3328125, "reward_std": 0.09095345288515091, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9329121351242066, "step": 9570 }, { "completion_length": 493.6, "completions/clipped_ratio": 0.0, "completions/max_length": 493.6, "completions/max_terminated_length": 493.6, "completions/mean_length": 92.83203125, "completions/mean_terminated_length": 92.83203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008665362261917475, "frac_reward_zero_std": 0.875, "grad_norm": 5.856180667877197, "kl": 3.5400113084586335, "learning_rate": 4.795714285714286e-07, "loss": 0.0035, "num_tokens": 673706890.0, "reward": 0.3140625, "reward_std": 0.10747633241117001, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9384072899818421, "step": 9575 }, { "completion_length": 547.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 547.8, "completions/max_terminated_length": 525.0, "completions/mean_length": 101.34296875, "completions/mean_terminated_length": 100.30380706787109, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.008669887255265735, "frac_reward_zero_std": 0.91875, "grad_norm": 6.977641582489014, "kl": 2.0086690534022638, "learning_rate": 4.79531746031746e-07, "loss": 0.002, "num_tokens": 674037249.0, "reward": 0.3015625, "reward_std": 0.06586536057293416, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.921810245513916, "step": 9580 }, { "completion_length": 423.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 423.2, "completions/max_terminated_length": 379.4, "completions/mean_length": 99.01875, "completions/mean_terminated_length": 98.48923797607422, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.008674412248613995, "frac_reward_zero_std": 0.9, "grad_norm": 14.716761589050293, "kl": 4.2136162294074895, "learning_rate": 4.794920634920635e-07, "loss": 0.0042, "num_tokens": 674364385.0, "reward": 0.3234375, "reward_std": 0.08343567252159119, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9300818562507629, "step": 9585 }, { "completion_length": 312.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 90.07421875, "completions/mean_terminated_length": 90.07421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008678937241962253, "frac_reward_zero_std": 0.9125, "grad_norm": 6.842789173126221, "kl": 5.765669700689614, "learning_rate": 4.79452380952381e-07, "loss": 0.0058, "num_tokens": 674677048.0, "reward": 0.3359375, "reward_std": 0.07165173590183258, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9331530094146728, "step": 9590 }, { "completion_length": 449.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 449.2, "completions/max_terminated_length": 373.2, "completions/mean_length": 100.76328125, "completions/mean_terminated_length": 100.23135528564453, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008683462235310514, "frac_reward_zero_std": 0.925, "grad_norm": 5.879355430603027, "kl": 3.4975459696375766, "learning_rate": 4.794126984126984e-07, "loss": 0.0035, "num_tokens": 675007481.0, "reward": 0.346875, "reward_std": 0.0700700655579567, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9069144368171692, "step": 9595 }, { "completion_length": 444.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 101.76171875, "completions/mean_terminated_length": 101.76171875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.008687987228658774, "frac_reward_zero_std": 0.91875, "grad_norm": 14.177155494689941, "kl": 4.708134577283635, "learning_rate": 4.793730158730159e-07, "loss": 0.0047, "num_tokens": 675339232.0, "reward": 0.253125, "reward_std": 0.0692822676151991, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9647563934326172, "step": 9600 }, { "completion_length": 463.4, "completions/clipped_ratio": 0.0, "completions/max_length": 463.4, "completions/max_terminated_length": 463.4, "completions/mean_length": 86.75703125, "completions/mean_terminated_length": 86.75703125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008692512222007034, "frac_reward_zero_std": 0.88125, "grad_norm": 12.999529838562012, "kl": 3.0431520323385484, "learning_rate": 4.793333333333333e-07, "loss": 0.003, "num_tokens": 675645841.0, "reward": 0.3546875, "reward_std": 0.10305593609809875, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9214271903038025, "step": 9605 }, { "completion_length": 284.6, "completions/clipped_ratio": 0.0, "completions/max_length": 284.6, "completions/max_terminated_length": 284.6, "completions/mean_length": 88.88671875, "completions/mean_terminated_length": 88.88671875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008697037215355294, "frac_reward_zero_std": 0.90625, "grad_norm": 0.3831053674221039, "kl": 5.708150686603039, "learning_rate": 4.792936507936508e-07, "loss": 0.0057, "num_tokens": 675958360.0, "reward": 0.2859375, "reward_std": 0.08269279897212982, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9554690241813659, "step": 9610 }, { "completion_length": 445.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 445.4, "completions/max_terminated_length": 431.2, "completions/mean_length": 97.040625, "completions/mean_terminated_length": 96.53152770996094, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008701562208703554, "frac_reward_zero_std": 0.9125, "grad_norm": 5.951900959014893, "kl": 0.8035690163378604, "learning_rate": 4.792539682539682e-07, "loss": 0.0008, "num_tokens": 676282236.0, "reward": 0.4140625, "reward_std": 0.0716517373919487, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.8970888972282409, "step": 9615 }, { "completion_length": 396.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 396.8, "completions/max_terminated_length": 359.2, "completions/mean_length": 100.48984375, "completions/mean_terminated_length": 99.97099151611329, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.008706087202051812, "frac_reward_zero_std": 0.90625, "grad_norm": 14.312301635742188, "kl": 0.5088006898295134, "learning_rate": 4.792142857142857e-07, "loss": 0.0005, "num_tokens": 676610199.0, "reward": 0.3890625, "reward_std": 0.0772249672561884, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9079118728637695, "step": 9620 }, { "completion_length": 477.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 477.0, "completions/max_terminated_length": 385.6, "completions/mean_length": 91.778125, "completions/mean_terminated_length": 91.24500427246093, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008710612195400072, "frac_reward_zero_std": 0.89375, "grad_norm": 1.3834041357040405, "kl": 1.000613542390056, "learning_rate": 4.791746031746031e-07, "loss": 0.001, "num_tokens": 676926243.0, "reward": 0.4296875, "reward_std": 0.0944282814860344, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8853272318840026, "step": 9625 }, { "completion_length": 539.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 539.2, "completions/max_terminated_length": 451.2, "completions/mean_length": 102.28984375, "completions/mean_terminated_length": 101.75960235595703, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.008715137188748332, "frac_reward_zero_std": 0.9125, "grad_norm": 6.221235752105713, "kl": 3.484966933366377, "learning_rate": 4.791349206349206e-07, "loss": 0.0035, "num_tokens": 677258726.0, "reward": 0.184375, "reward_std": 0.07117993161082267, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.9765715718269348, "step": 9630 }, { "completion_length": 420.8, "completions/clipped_ratio": 0.00234375, "completions/max_length": 420.8, "completions/max_terminated_length": 410.2, "completions/mean_length": 97.67265625, "completions/mean_terminated_length": 96.09437713623046, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.008719662182096593, "frac_reward_zero_std": 0.875, "grad_norm": 6.29162073135376, "kl": 2.295292377693113, "learning_rate": 4.79095238095238e-07, "loss": 0.0023, "num_tokens": 677583347.0, "reward": 0.3875, "reward_std": 0.10363509431481362, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.8889354944229126, "step": 9635 }, { "completion_length": 449.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 449.2, "completions/max_terminated_length": 427.4, "completions/mean_length": 93.18984375, "completions/mean_terminated_length": 92.66965789794922, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008724187175444853, "frac_reward_zero_std": 0.9125, "grad_norm": 6.561915874481201, "kl": 4.143278445862234, "learning_rate": 4.790555555555556e-07, "loss": 0.0041, "num_tokens": 677899606.0, "reward": 0.340625, "reward_std": 0.06733967810869217, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9290011286735534, "step": 9640 }, { "completion_length": 363.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 363.4, "completions/max_terminated_length": 329.6, "completions/mean_length": 94.99375, "completions/mean_terminated_length": 94.48578643798828, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008728712168793111, "frac_reward_zero_std": 0.93125, "grad_norm": 2.0836737155914307, "kl": 4.287140669289511, "learning_rate": 4.790158730158729e-07, "loss": 0.0043, "num_tokens": 678220398.0, "reward": 0.340625, "reward_std": 0.0633885346353054, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9304535031318665, "step": 9645 }, { "completion_length": 293.4, "completions/clipped_ratio": 0.0, "completions/max_length": 293.4, "completions/max_terminated_length": 293.4, "completions/mean_length": 90.7515625, "completions/mean_terminated_length": 90.7515625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008733237162141371, "frac_reward_zero_std": 0.9, "grad_norm": 0.6732814311981201, "kl": 4.325736506469548, "learning_rate": 4.789761904761905e-07, "loss": 0.0043, "num_tokens": 678535432.0, "reward": 0.34375, "reward_std": 0.08664139322936534, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9384198307991027, "step": 9650 }, { "completion_length": 370.8, "completions/clipped_ratio": 0.0, "completions/max_length": 370.8, "completions/max_terminated_length": 370.8, "completions/mean_length": 91.790625, "completions/mean_terminated_length": 91.790625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008737762155489631, "frac_reward_zero_std": 0.90625, "grad_norm": 8.137828826904297, "kl": 4.804053831263445, "learning_rate": 4.78936507936508e-07, "loss": 0.0048, "num_tokens": 678851708.0, "reward": 0.36875, "reward_std": 0.0790162593126297, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9253928542137146, "step": 9655 }, { "completion_length": 437.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 437.2, "completions/max_terminated_length": 412.2, "completions/mean_length": 91.61171875, "completions/mean_terminated_length": 91.09498901367188, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008742287148837891, "frac_reward_zero_std": 0.85625, "grad_norm": 5.509382724761963, "kl": 13.230741750344169, "learning_rate": 4.788968253968253e-07, "loss": 0.0132, "num_tokens": 679166347.0, "reward": 0.315625, "reward_std": 0.12735525369644166, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.940985906124115, "step": 9660 }, { "completion_length": 310.6, "completions/clipped_ratio": 0.0, "completions/max_length": 310.6, "completions/max_terminated_length": 310.6, "completions/mean_length": 87.70703125, "completions/mean_terminated_length": 87.70703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008746812142186151, "frac_reward_zero_std": 0.9, "grad_norm": 8.319538116455078, "kl": 8.910435961093754, "learning_rate": 4.788571428571429e-07, "loss": 0.0089, "num_tokens": 679474596.0, "reward": 0.3640625, "reward_std": 0.07933578193187714, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9312276363372802, "step": 9665 }, { "completion_length": 317.4, "completions/clipped_ratio": 0.0, "completions/max_length": 317.4, "completions/max_terminated_length": 317.4, "completions/mean_length": 84.8640625, "completions/mean_terminated_length": 84.8640625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008751337135534411, "frac_reward_zero_std": 0.89375, "grad_norm": 15.380363464355469, "kl": 7.596119851735421, "learning_rate": 4.788174603174603e-07, "loss": 0.0076, "num_tokens": 679778798.0, "reward": 0.4296875, "reward_std": 0.09127199612557887, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.9011649131774903, "step": 9670 }, { "completion_length": 328.4, "completions/clipped_ratio": 0.0, "completions/max_length": 328.4, "completions/max_terminated_length": 328.4, "completions/mean_length": 92.13203125, "completions/mean_terminated_length": 92.13203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00875586212888267, "frac_reward_zero_std": 0.89375, "grad_norm": 7.814168453216553, "kl": 3.830007670260966, "learning_rate": 4.787777777777778e-07, "loss": 0.0038, "num_tokens": 680094823.0, "reward": 0.440625, "reward_std": 0.09174380265176296, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8830958366394043, "step": 9675 }, { "completion_length": 522.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 522.2, "completions/max_terminated_length": 512.4, "completions/mean_length": 90.44453125, "completions/mean_terminated_length": 89.39843139648437, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.00876038712223093, "frac_reward_zero_std": 0.86875, "grad_norm": 9.127521514892578, "kl": 6.596897880127654, "learning_rate": 4.787380952380952e-07, "loss": 0.0066, "num_tokens": 680407040.0, "reward": 0.4125, "reward_std": 0.11915193125605583, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.883148980140686, "step": 9680 }, { "completion_length": 467.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 467.6, "completions/max_terminated_length": 386.6, "completions/mean_length": 96.16484375, "completions/mean_terminated_length": 95.63781127929687, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00876491211557919, "frac_reward_zero_std": 0.8875, "grad_norm": 4.00584077835083, "kl": 2.8800820249831305, "learning_rate": 4.786984126984127e-07, "loss": 0.0029, "num_tokens": 680731443.0, "reward": 0.2875, "reward_std": 0.09116718918085098, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9506202578544617, "step": 9685 }, { "completion_length": 473.8, "completions/clipped_ratio": 0.00234375, "completions/max_length": 473.8, "completions/max_terminated_length": 362.4, "completions/mean_length": 91.08984375, "completions/mean_terminated_length": 89.51756744384765, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00876943710892745, "frac_reward_zero_std": 0.925, "grad_norm": 7.366847991943359, "kl": 3.623896387394052, "learning_rate": 4.786587301587301e-07, "loss": 0.0036, "num_tokens": 681046254.0, "reward": 0.478125, "reward_std": 0.060550790280103683, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8575678706169129, "step": 9690 }, { "completion_length": 486.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 486.4, "completions/max_terminated_length": 441.8, "completions/mean_length": 97.88515625, "completions/mean_terminated_length": 96.82801971435546, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00877396210227571, "frac_reward_zero_std": 0.91875, "grad_norm": 7.468482494354248, "kl": 3.229136444651522, "learning_rate": 4.786190476190476e-07, "loss": 0.0032, "num_tokens": 681370115.0, "reward": 0.2359375, "reward_std": 0.07086041383445263, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9676371335983276, "step": 9695 }, { "completion_length": 464.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 464.8, "completions/max_terminated_length": 377.2, "completions/mean_length": 97.253125, "completions/mean_terminated_length": 96.72801208496094, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.00877848709562397, "frac_reward_zero_std": 0.88125, "grad_norm": 1.5685882568359375, "kl": 2.2079482393688523, "learning_rate": 4.78579365079365e-07, "loss": 0.0022, "num_tokens": 681692431.0, "reward": 0.43125, "reward_std": 0.09626959413290023, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.8857666730880738, "step": 9700 }, { "completion_length": 463.8, "completions/clipped_ratio": 0.0, "completions/max_length": 463.8, "completions/max_terminated_length": 463.8, "completions/mean_length": 93.421875, "completions/mean_terminated_length": 93.421875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008783012088972229, "frac_reward_zero_std": 0.9, "grad_norm": 1.2692772150039673, "kl": 3.6636146426782945, "learning_rate": 4.785396825396825e-07, "loss": 0.0037, "num_tokens": 682008059.0, "reward": 0.4140625, "reward_std": 0.08711123578250408, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.9002267956733704, "step": 9705 }, { "completion_length": 353.6, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/max_terminated_length": 353.6, "completions/mean_length": 94.9234375, "completions/mean_terminated_length": 94.9234375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008787537082320489, "frac_reward_zero_std": 0.90625, "grad_norm": 5.566640853881836, "kl": 1.9084497350500897, "learning_rate": 4.785e-07, "loss": 0.0019, "num_tokens": 682330409.0, "reward": 0.2984375, "reward_std": 0.07790893763303756, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9423265099525452, "step": 9710 }, { "completion_length": 333.4, "completions/clipped_ratio": 0.0, "completions/max_length": 333.4, "completions/max_terminated_length": 333.4, "completions/mean_length": 89.3, "completions/mean_terminated_length": 89.3, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008792062075668749, "frac_reward_zero_std": 0.925, "grad_norm": 15.245482444763184, "kl": 3.043309740140103, "learning_rate": 4.784603174603174e-07, "loss": 0.003, "num_tokens": 682642969.0, "reward": 0.3625, "reward_std": 0.06622980795800686, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9143133401870728, "step": 9715 }, { "completion_length": 554.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 554.8, "completions/max_terminated_length": 477.8, "completions/mean_length": 95.55546875, "completions/mean_terminated_length": 94.51060333251954, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008796587069017009, "frac_reward_zero_std": 0.89375, "grad_norm": 4.5903215408325195, "kl": 4.586058042652439, "learning_rate": 4.784206349206349e-07, "loss": 0.0046, "num_tokens": 682962240.0, "reward": 0.3359375, "reward_std": 0.08785607293248177, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9415082454681396, "step": 9720 }, { "completion_length": 414.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 414.8, "completions/max_terminated_length": 386.8, "completions/mean_length": 91.32265625, "completions/mean_terminated_length": 90.26773529052734, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008801112062365269, "frac_reward_zero_std": 0.9125, "grad_norm": 15.59915542602539, "kl": 4.811772614205256, "learning_rate": 4.783809523809524e-07, "loss": 0.0048, "num_tokens": 683275029.0, "reward": 0.3796875, "reward_std": 0.07617596425116062, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.921493411064148, "step": 9725 }, { "completion_length": 357.6, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 93.7625, "completions/mean_terminated_length": 93.7625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008805637055713527, "frac_reward_zero_std": 0.89375, "grad_norm": 9.978455543518066, "kl": 9.402923540957271, "learning_rate": 4.783412698412699e-07, "loss": 0.0094, "num_tokens": 683594629.0, "reward": 0.3453125, "reward_std": 0.08538179099559784, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9182831048965454, "step": 9730 }, { "completion_length": 483.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 483.4, "completions/max_terminated_length": 359.2, "completions/mean_length": 102.88203125, "completions/mean_terminated_length": 101.84684600830079, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008810162049061787, "frac_reward_zero_std": 0.89375, "grad_norm": 30.551223754882812, "kl": 5.662640482583083, "learning_rate": 4.783015873015872e-07, "loss": 0.0057, "num_tokens": 683929598.0, "reward": 0.26875, "reward_std": 0.09242580756545067, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9286041617393493, "step": 9735 }, { "completion_length": 353.2, "completions/clipped_ratio": 0.0, "completions/max_length": 353.2, "completions/max_terminated_length": 353.2, "completions/mean_length": 89.134375, "completions/mean_terminated_length": 89.134375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008814687042410048, "frac_reward_zero_std": 0.925, "grad_norm": 7.555324554443359, "kl": 3.095998314593453, "learning_rate": 4.782619047619048e-07, "loss": 0.0031, "num_tokens": 684241610.0, "reward": 0.4, "reward_std": 0.058712026476860045, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.9084567308425904, "step": 9740 }, { "completion_length": 464.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 464.4, "completions/max_terminated_length": 403.6, "completions/mean_length": 96.034375, "completions/mean_terminated_length": 95.50558013916016, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008819212035758308, "frac_reward_zero_std": 0.9125, "grad_norm": 1.7464081048965454, "kl": 3.1121627928689124, "learning_rate": 4.782222222222222e-07, "loss": 0.0031, "num_tokens": 684563686.0, "reward": 0.3671875, "reward_std": 0.0745968397706747, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9278653264045715, "step": 9745 }, { "completion_length": 483.4, "completions/clipped_ratio": 0.0, "completions/max_length": 483.4, "completions/max_terminated_length": 483.4, "completions/mean_length": 99.5203125, "completions/mean_terminated_length": 99.5203125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008823737029106568, "frac_reward_zero_std": 0.91875, "grad_norm": 4.018813610076904, "kl": 12.29994241570821, "learning_rate": 4.781825396825397e-07, "loss": 0.0123, "num_tokens": 684891544.0, "reward": 0.3890625, "reward_std": 0.06996525600552558, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9153563141822815, "step": 9750 }, { "completion_length": 359.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 94.53359375, "completions/mean_terminated_length": 94.53359375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008828262022454828, "frac_reward_zero_std": 0.91875, "grad_norm": 9.664360046386719, "kl": 6.0940340120578185, "learning_rate": 4.781428571428571e-07, "loss": 0.0061, "num_tokens": 685212379.0, "reward": 0.3609375, "reward_std": 0.06770412176847458, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9156178712844849, "step": 9755 }, { "completion_length": 411.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 411.4, "completions/max_terminated_length": 337.4, "completions/mean_length": 96.63046875, "completions/mean_terminated_length": 96.1199737548828, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008832787015803086, "frac_reward_zero_std": 0.88125, "grad_norm": 4.7454142570495605, "kl": 2.454343806009274, "learning_rate": 4.781031746031746e-07, "loss": 0.0025, "num_tokens": 685533826.0, "reward": 0.4328125, "reward_std": 0.10673149675130844, "rewards/verify_chess_move/mean": 0.4328125, "rewards/verify_chess_move/std": 0.8950870990753174, "step": 9760 }, { "completion_length": 487.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 487.2, "completions/max_terminated_length": 439.4, "completions/mean_length": 96.69296875, "completions/mean_terminated_length": 96.16303405761718, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008837312009151346, "frac_reward_zero_std": 0.9125, "grad_norm": 3.5005645751953125, "kl": 1.230960489215795, "learning_rate": 4.78063492063492e-07, "loss": 0.0012, "num_tokens": 685858433.0, "reward": 0.221875, "reward_std": 0.07643462046980858, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9698829770088195, "step": 9765 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/max_terminated_length": 366.2, "completions/mean_length": 92.9421875, "completions/mean_terminated_length": 92.9421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008841837002499606, "frac_reward_zero_std": 0.94375, "grad_norm": 2.5735883712768555, "kl": 1.8958448325865902, "learning_rate": 4.780238095238095e-07, "loss": 0.0019, "num_tokens": 686176519.0, "reward": 0.4390625, "reward_std": 0.046608568727970125, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8908003687858581, "step": 9770 }, { "completion_length": 383.6, "completions/clipped_ratio": 0.0, "completions/max_length": 383.6, "completions/max_terminated_length": 383.6, "completions/mean_length": 91.13984375, "completions/mean_terminated_length": 91.13984375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.008846361995847866, "frac_reward_zero_std": 0.91875, "grad_norm": 4.134727478027344, "kl": 4.50181645831326, "learning_rate": 4.77984126984127e-07, "loss": 0.0045, "num_tokens": 686490714.0, "reward": 0.3328125, "reward_std": 0.06928128749132156, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9340779066085816, "step": 9775 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 367.8, "completions/max_terminated_length": 297.6, "completions/mean_length": 88.909375, "completions/mean_terminated_length": 88.38164367675782, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008850886989196127, "frac_reward_zero_std": 0.91875, "grad_norm": 2.184080123901367, "kl": 1.7772866256069393, "learning_rate": 4.779444444444444e-07, "loss": 0.0018, "num_tokens": 686801118.0, "reward": 0.403125, "reward_std": 0.06928226724267006, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9074552655220032, "step": 9780 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 90.53203125, "completions/mean_terminated_length": 90.53203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008855411982544385, "frac_reward_zero_std": 0.925, "grad_norm": 9.147085189819336, "kl": 0.7589227919350379, "learning_rate": 4.779047619047619e-07, "loss": 0.0008, "num_tokens": 687114959.0, "reward": 0.4125, "reward_std": 0.05576692558825016, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.8969456553459167, "step": 9785 }, { "completion_length": 488.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 488.0, "completions/max_terminated_length": 394.8, "completions/mean_length": 89.08203125, "completions/mean_terminated_length": 88.54589538574218, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008859936975892645, "frac_reward_zero_std": 0.91875, "grad_norm": 11.927814483642578, "kl": 2.899207979394123, "learning_rate": 4.778650793650793e-07, "loss": 0.0029, "num_tokens": 687426360.0, "reward": 0.371875, "reward_std": 0.06812747716903686, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9206125378608704, "step": 9790 }, { "completion_length": 565.8, "completions/clipped_ratio": 0.0, "completions/max_length": 565.8, "completions/max_terminated_length": 565.8, "completions/mean_length": 92.109375, "completions/mean_terminated_length": 92.109375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008864461969240905, "frac_reward_zero_std": 0.91875, "grad_norm": 7.267467498779297, "kl": 0.8531300443341024, "learning_rate": 4.778253968253968e-07, "loss": 0.0009, "num_tokens": 687741612.0, "reward": 0.3671875, "reward_std": 0.0715443804860115, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9300157308578492, "step": 9795 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 93.5921875, "completions/mean_terminated_length": 93.5921875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.008868986962589165, "frac_reward_zero_std": 0.91875, "grad_norm": 2.71541690826416, "kl": 0.6080330243799835, "learning_rate": 4.777857142857142e-07, "loss": 0.0006, "num_tokens": 688059818.0, "reward": 0.19375, "reward_std": 0.08016752153635025, "rewards/verify_chess_move/mean": 0.19375, "rewards/verify_chess_move/std": 0.9766299247741699, "step": 9800 }, { "completion_length": 321.6, "completions/clipped_ratio": 0.0, "completions/max_length": 321.6, "completions/max_terminated_length": 321.6, "completions/mean_length": 96.0625, "completions/mean_terminated_length": 96.0625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.008873511955937425, "frac_reward_zero_std": 0.93125, "grad_norm": 0.33969637751579285, "kl": 0.8347013516584412, "learning_rate": 4.777460317460317e-07, "loss": 0.0008, "num_tokens": 688383042.0, "reward": 0.25625, "reward_std": 0.060654615983366966, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9589741706848145, "step": 9805 }, { "completion_length": 472.2, "completions/clipped_ratio": 0.0, "completions/max_length": 472.2, "completions/max_terminated_length": 472.2, "completions/mean_length": 92.89453125, "completions/mean_terminated_length": 92.89453125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008878036949285685, "frac_reward_zero_std": 0.9, "grad_norm": 1.4538413286209106, "kl": 2.7020827949396335, "learning_rate": 4.777063492063491e-07, "loss": 0.0027, "num_tokens": 688700507.0, "reward": 0.3046875, "reward_std": 0.08574525862932206, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9332626461982727, "step": 9810 }, { "completion_length": 447.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.4, "completions/max_terminated_length": 381.4, "completions/mean_length": 92.49765625, "completions/mean_terminated_length": 91.96436920166016, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008882561942633944, "frac_reward_zero_std": 0.9125, "grad_norm": 8.86149787902832, "kl": 3.3301759365014734, "learning_rate": 4.776666666666667e-07, "loss": 0.0033, "num_tokens": 689016856.0, "reward": 0.3859375, "reward_std": 0.07186292298138142, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9101315498352051, "step": 9815 }, { "completion_length": 447.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.0, "completions/max_terminated_length": 344.8, "completions/mean_length": 91.10859375, "completions/mean_terminated_length": 90.57073974609375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008887086935982204, "frac_reward_zero_std": 0.93125, "grad_norm": 1.713375449180603, "kl": 9.221679860679433, "learning_rate": 4.776269841269841e-07, "loss": 0.0092, "num_tokens": 689330659.0, "reward": 0.3796875, "reward_std": 0.05613137185573578, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9140773892402649, "step": 9820 }, { "completion_length": 392.6, "completions/clipped_ratio": 0.0, "completions/max_length": 392.6, "completions/max_terminated_length": 392.6, "completions/mean_length": 91.6828125, "completions/mean_terminated_length": 91.6828125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008891611929330464, "frac_reward_zero_std": 0.95625, "grad_norm": 4.861435413360596, "kl": 2.4562860748847015, "learning_rate": 4.775873015873015e-07, "loss": 0.0025, "num_tokens": 689645949.0, "reward": 0.4, "reward_std": 0.03866586796939373, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.9158938765525818, "step": 9825 }, { "completion_length": 368.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 368.4, "completions/max_terminated_length": 321.4, "completions/mean_length": 93.56171875, "completions/mean_terminated_length": 93.04436645507812, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008896136922678724, "frac_reward_zero_std": 0.925, "grad_norm": 3.7728214263916016, "kl": 5.186479913815856, "learning_rate": 4.775476190476191e-07, "loss": 0.0052, "num_tokens": 689965260.0, "reward": 0.428125, "reward_std": 0.06123279482126236, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.9038519978523254, "step": 9830 }, { "completion_length": 483.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 483.4, "completions/max_terminated_length": 339.8, "completions/mean_length": 89.63359375, "completions/mean_terminated_length": 88.57260284423828, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008900661916026984, "frac_reward_zero_std": 0.9125, "grad_norm": 7.355961322784424, "kl": 10.39686912210891, "learning_rate": 4.775079365079365e-07, "loss": 0.0104, "num_tokens": 690278895.0, "reward": 0.2859375, "reward_std": 0.06960179172456264, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9544739365577698, "step": 9835 }, { "completion_length": 327.4, "completions/clipped_ratio": 0.0, "completions/max_length": 327.4, "completions/max_terminated_length": 327.4, "completions/mean_length": 91.6140625, "completions/mean_terminated_length": 91.6140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008905186909375242, "frac_reward_zero_std": 0.91875, "grad_norm": 28.035625457763672, "kl": 8.76359769788105, "learning_rate": 4.77468253968254e-07, "loss": 0.0088, "num_tokens": 690595385.0, "reward": 0.30625, "reward_std": 0.07406612932682037, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9387952566146851, "step": 9840 }, { "completion_length": 351.6, "completions/clipped_ratio": 0.0, "completions/max_length": 351.6, "completions/max_terminated_length": 351.6, "completions/mean_length": 89.39453125, "completions/mean_terminated_length": 89.39453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008909711902723503, "frac_reward_zero_std": 0.90625, "grad_norm": 2.110130548477173, "kl": 4.345807607914321, "learning_rate": 4.774285714285714e-07, "loss": 0.0043, "num_tokens": 690909562.0, "reward": 0.3390625, "reward_std": 0.08243316225707531, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9328808546066284, "step": 9845 }, { "completion_length": 376.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 376.2, "completions/max_terminated_length": 360.0, "completions/mean_length": 103.01015625, "completions/mean_terminated_length": 101.99427337646485, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008914236896071763, "frac_reward_zero_std": 0.89375, "grad_norm": 6.5668182373046875, "kl": 4.640076650981792, "learning_rate": 4.773888888888889e-07, "loss": 0.0046, "num_tokens": 691243143.0, "reward": 0.134375, "reward_std": 0.09647921249270439, "rewards/verify_chess_move/mean": 0.134375, "rewards/verify_chess_move/std": 0.9886019468307495, "step": 9850 }, { "completion_length": 407.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 407.2, "completions/max_terminated_length": 326.2, "completions/mean_length": 91.33359375, "completions/mean_terminated_length": 90.80069885253906, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008918761889420023, "frac_reward_zero_std": 0.9375, "grad_norm": 8.488740921020508, "kl": 2.57443065517582, "learning_rate": 4.773492063492063e-07, "loss": 0.0026, "num_tokens": 691560210.0, "reward": 0.3828125, "reward_std": 0.05534004494547844, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9118899703025818, "step": 9855 }, { "completion_length": 435.4, "completions/clipped_ratio": 0.0, "completions/max_length": 435.4, "completions/max_terminated_length": 435.4, "completions/mean_length": 98.86015625, "completions/mean_terminated_length": 98.86015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.008923286882768283, "frac_reward_zero_std": 0.9125, "grad_norm": 1.6732287406921387, "kl": 1.6397736871847883, "learning_rate": 4.773095238095238e-07, "loss": 0.0016, "num_tokens": 691888303.0, "reward": 0.3328125, "reward_std": 0.07664678692817688, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9387051582336425, "step": 9860 }, { "completion_length": 505.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 505.2, "completions/max_terminated_length": 421.4, "completions/mean_length": 84.7734375, "completions/mean_terminated_length": 84.22830047607422, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008927811876116543, "frac_reward_zero_std": 0.93125, "grad_norm": 12.553326606750488, "kl": 4.247442325053271, "learning_rate": 4.772698412698412e-07, "loss": 0.0042, "num_tokens": 692192957.0, "reward": 0.3421875, "reward_std": 0.058605652302503586, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9363154172897339, "step": 9865 }, { "completion_length": 546.8, "completions/clipped_ratio": 0.00234375, "completions/max_length": 546.8, "completions/max_terminated_length": 394.0, "completions/mean_length": 96.490625, "completions/mean_terminated_length": 94.93480224609375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.008932336869464801, "frac_reward_zero_std": 0.90625, "grad_norm": 1.8112133741378784, "kl": 7.242727247648872, "learning_rate": 4.772301587301587e-07, "loss": 0.0072, "num_tokens": 692515185.0, "reward": 0.4828125, "reward_std": 0.08064285293221474, "rewards/verify_chess_move/mean": 0.4828125, "rewards/verify_chess_move/std": 0.8664014935493469, "step": 9870 }, { "completion_length": 500.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 500.8, "completions/max_terminated_length": 474.6, "completions/mean_length": 100.20859375, "completions/mean_terminated_length": 99.67992095947265, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.008936861862813061, "frac_reward_zero_std": 0.8875, "grad_norm": 6.96403169631958, "kl": 6.1704795936355366, "learning_rate": 4.771904761904761e-07, "loss": 0.0062, "num_tokens": 692844172.0, "reward": 0.3421875, "reward_std": 0.09479527920484543, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9322427153587342, "step": 9875 }, { "completion_length": 285.6, "completions/clipped_ratio": 0.0, "completions/max_length": 285.6, "completions/max_terminated_length": 285.6, "completions/mean_length": 91.65234375, "completions/mean_terminated_length": 91.65234375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008941386856161321, "frac_reward_zero_std": 0.925, "grad_norm": 10.692619323730469, "kl": 4.142087786342017, "learning_rate": 4.771507936507936e-07, "loss": 0.0041, "num_tokens": 693160399.0, "reward": 0.26875, "reward_std": 0.06055079102516174, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9538537263870239, "step": 9880 }, { "completion_length": 315.2, "completions/clipped_ratio": 0.0, "completions/max_length": 315.2, "completions/max_terminated_length": 315.2, "completions/mean_length": 86.79453125, "completions/mean_terminated_length": 86.79453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008945911849509582, "frac_reward_zero_std": 0.9, "grad_norm": 14.773744583129883, "kl": 8.183495165593921, "learning_rate": 4.77111111111111e-07, "loss": 0.0082, "num_tokens": 693467424.0, "reward": 0.4046875, "reward_std": 0.08506128713488578, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9118790626525879, "step": 9885 }, { "completion_length": 300.6, "completions/clipped_ratio": 0.0, "completions/max_length": 300.6, "completions/max_terminated_length": 300.6, "completions/mean_length": 92.12890625, "completions/mean_terminated_length": 92.12890625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.008950436842857842, "frac_reward_zero_std": 0.93125, "grad_norm": 0.013185261748731136, "kl": 3.759419954102486, "learning_rate": 4.770714285714286e-07, "loss": 0.0038, "num_tokens": 693784237.0, "reward": 0.31875, "reward_std": 0.06065461896359921, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9402838706970215, "step": 9890 }, { "completion_length": 579.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 579.4, "completions/max_terminated_length": 481.4, "completions/mean_length": 87.72109375, "completions/mean_terminated_length": 87.19239044189453, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0089549618362061, "frac_reward_zero_std": 0.89375, "grad_norm": 7.0507941246032715, "kl": 1.2856596783385612, "learning_rate": 4.77031746031746e-07, "loss": 0.0013, "num_tokens": 694093144.0, "reward": 0.259375, "reward_std": 0.0853808145970106, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9491345882415771, "step": 9895 }, { "completion_length": 439.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 439.2, "completions/max_terminated_length": 363.6, "completions/mean_length": 98.19453125, "completions/mean_terminated_length": 97.66685485839844, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00895948682955436, "frac_reward_zero_std": 0.8875, "grad_norm": 1.8561787605285645, "kl": 1.5714264258858748, "learning_rate": 4.769920634920634e-07, "loss": 0.0016, "num_tokens": 694418561.0, "reward": 0.4, "reward_std": 0.10026213377714158, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.8951164722442627, "step": 9900 }, { "completion_length": 300.2, "completions/clipped_ratio": 0.0, "completions/max_length": 300.2, "completions/max_terminated_length": 300.2, "completions/mean_length": 98.0421875, "completions/mean_terminated_length": 98.0421875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00896401182290262, "frac_reward_zero_std": 0.93125, "grad_norm": 0.1184166893362999, "kl": 1.731654483312741, "learning_rate": 4.76952380952381e-07, "loss": 0.0017, "num_tokens": 694745255.0, "reward": 0.25625, "reward_std": 0.06044343113899231, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9436988949775695, "step": 9905 }, { "completion_length": 419.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 419.6, "completions/max_terminated_length": 396.6, "completions/mean_length": 99.54375, "completions/mean_terminated_length": 99.01338653564453, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00896853681625088, "frac_reward_zero_std": 0.925, "grad_norm": 2.2937283515930176, "kl": 0.775325606460683, "learning_rate": 4.769126984126984e-07, "loss": 0.0008, "num_tokens": 695073791.0, "reward": 0.33125, "reward_std": 0.062341098114848136, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9316437363624572, "step": 9910 }, { "completion_length": 378.6, "completions/clipped_ratio": 0.0, "completions/max_length": 378.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 90.34296875, "completions/mean_terminated_length": 90.34296875, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.00897306180959914, "frac_reward_zero_std": 0.9, "grad_norm": 7.244588375091553, "kl": 1.2574396462528967, "learning_rate": 4.768730158730159e-07, "loss": 0.0013, "num_tokens": 695387646.0, "reward": 0.2140625, "reward_std": 0.08232737258076668, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.9659684896469116, "step": 9915 }, { "completion_length": 472.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 472.6, "completions/max_terminated_length": 379.2, "completions/mean_length": 90.953125, "completions/mean_terminated_length": 89.36665802001953, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0089775868029474, "frac_reward_zero_std": 0.9375, "grad_norm": 5.062647819519043, "kl": 1.0081662221229635, "learning_rate": 4.7683333333333333e-07, "loss": 0.001, "num_tokens": 695700642.0, "reward": 0.21875, "reward_std": 0.05055716261267662, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9668655395507812, "step": 9920 }, { "completion_length": 439.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 439.8, "completions/max_terminated_length": 362.0, "completions/mean_length": 93.56328125, "completions/mean_terminated_length": 93.04561462402344, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.008982111796295659, "frac_reward_zero_std": 0.9125, "grad_norm": 5.879607200622559, "kl": 0.9656744153122417, "learning_rate": 4.7679365079365074e-07, "loss": 0.001, "num_tokens": 696018795.0, "reward": 0.29375, "reward_std": 0.07685895040631294, "rewards/verify_chess_move/mean": 0.29375, "rewards/verify_chess_move/std": 0.9453753590583801, "step": 9925 }, { "completion_length": 426.4, "completions/clipped_ratio": 0.0, "completions/max_length": 426.4, "completions/max_terminated_length": 426.4, "completions/mean_length": 92.09296875, "completions/mean_terminated_length": 92.09296875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008986636789643919, "frac_reward_zero_std": 0.925, "grad_norm": 3.8550667762756348, "kl": 1.3458349392050877, "learning_rate": 4.7675396825396824e-07, "loss": 0.0013, "num_tokens": 696335042.0, "reward": 0.24375, "reward_std": 0.06349589191377163, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9590502977371216, "step": 9930 }, { "completion_length": 455.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 455.6, "completions/max_terminated_length": 433.4, "completions/mean_length": 92.54921875, "completions/mean_terminated_length": 92.01705932617188, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.008991161782992179, "frac_reward_zero_std": 0.93125, "grad_norm": 0.5544496178627014, "kl": 0.5087756956578232, "learning_rate": 4.767142857142857e-07, "loss": 0.0005, "num_tokens": 696653009.0, "reward": 0.209375, "reward_std": 0.05818033888936043, "rewards/verify_chess_move/mean": 0.209375, "rewards/verify_chess_move/std": 0.9777791261672973, "step": 9935 }, { "completion_length": 468.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 468.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 90.74453125, "completions/mean_terminated_length": 90.21757507324219, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.008995686776340439, "frac_reward_zero_std": 0.89375, "grad_norm": 8.58692741394043, "kl": 1.567999097739812, "learning_rate": 4.7667460317460315e-07, "loss": 0.0016, "num_tokens": 696964682.0, "reward": 0.2875, "reward_std": 0.09216813109815121, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9471360445022583, "step": 9940 }, { "completion_length": 382.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 382.0, "completions/max_terminated_length": 286.6, "completions/mean_length": 88.7703125, "completions/mean_terminated_length": 88.23729553222657, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0090002117696887, "frac_reward_zero_std": 0.9125, "grad_norm": 7.576611042022705, "kl": 0.8788741619791836, "learning_rate": 4.766349206349206e-07, "loss": 0.0009, "num_tokens": 697277788.0, "reward": 0.246875, "reward_std": 0.07117993496358395, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9382674336433411, "step": 9945 }, { "completion_length": 380.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 380.8, "completions/max_terminated_length": 286.4, "completions/mean_length": 89.9296875, "completions/mean_terminated_length": 89.4094741821289, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009004736763036958, "frac_reward_zero_std": 0.975, "grad_norm": 0.7048569321632385, "kl": 1.1653314502676948, "learning_rate": 4.7659523809523806e-07, "loss": 0.0012, "num_tokens": 697591754.0, "reward": 0.34375, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9148019909858703, "step": 9950 }, { "completion_length": 416.4, "completions/clipped_ratio": 0.0, "completions/max_length": 416.4, "completions/max_terminated_length": 416.4, "completions/mean_length": 91.68515625, "completions/mean_terminated_length": 91.68515625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009009261756385218, "frac_reward_zero_std": 0.90625, "grad_norm": 1.7564618587493896, "kl": 1.2714933250215836, "learning_rate": 4.7655555555555557e-07, "loss": 0.0013, "num_tokens": 697908231.0, "reward": 0.3640625, "reward_std": 0.08290398679673672, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9193482995033264, "step": 9955 }, { "completion_length": 464.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 464.4, "completions/max_terminated_length": 430.2, "completions/mean_length": 94.6265625, "completions/mean_terminated_length": 94.10353393554688, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.009013786749733478, "frac_reward_zero_std": 0.925, "grad_norm": 6.403578281402588, "kl": 0.9503267546650023, "learning_rate": 4.7651587301587297e-07, "loss": 0.001, "num_tokens": 698228489.0, "reward": 0.3359375, "reward_std": 0.06370805725455284, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9369353175163269, "step": 9960 }, { "completion_length": 443.2, "completions/clipped_ratio": 0.0, "completions/max_length": 443.2, "completions/max_terminated_length": 443.2, "completions/mean_length": 88.42421875, "completions/mean_terminated_length": 88.42421875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009018311743081738, "frac_reward_zero_std": 0.925, "grad_norm": 10.916136741638184, "kl": 3.8345400413265454, "learning_rate": 4.764761904761905e-07, "loss": 0.0038, "num_tokens": 698536800.0, "reward": 0.40625, "reward_std": 0.06601666137576104, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.8968204498291016, "step": 9965 }, { "completion_length": 400.8, "completions/clipped_ratio": 0.0, "completions/max_length": 400.8, "completions/max_terminated_length": 400.8, "completions/mean_length": 96.1359375, "completions/mean_terminated_length": 96.1359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009022836736429998, "frac_reward_zero_std": 0.89375, "grad_norm": 8.951791763305664, "kl": 4.96770355803892, "learning_rate": 4.7643650793650793e-07, "loss": 0.005, "num_tokens": 698860214.0, "reward": 0.275, "reward_std": 0.08333086632192135, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9538038611412049, "step": 9970 }, { "completion_length": 435.6, "completions/clipped_ratio": 0.0, "completions/max_length": 435.6, "completions/max_terminated_length": 435.6, "completions/mean_length": 91.1390625, "completions/mean_terminated_length": 91.1390625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009027361729778258, "frac_reward_zero_std": 0.89375, "grad_norm": 2.212655544281006, "kl": 5.621552012208849, "learning_rate": 4.7639682539682533e-07, "loss": 0.0056, "num_tokens": 699174224.0, "reward": 0.2703125, "reward_std": 0.08469782173633575, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9597963213920593, "step": 9975 }, { "completion_length": 517.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 517.6, "completions/max_terminated_length": 488.2, "completions/mean_length": 97.0296875, "completions/mean_terminated_length": 96.51243438720704, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009031886723126516, "frac_reward_zero_std": 0.925, "grad_norm": 3.5587518215179443, "kl": 8.646811371005606, "learning_rate": 4.7635714285714284e-07, "loss": 0.0086, "num_tokens": 699498038.0, "reward": 0.4703125, "reward_std": 0.061917747184634206, "rewards/verify_chess_move/mean": 0.4703125, "rewards/verify_chess_move/std": 0.8805994868278504, "step": 9980 }, { "completion_length": 334.2, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/max_terminated_length": 334.2, "completions/mean_length": 91.06171875, "completions/mean_terminated_length": 91.06171875, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.009036411716474776, "frac_reward_zero_std": 0.89375, "grad_norm": 5.90040922164917, "kl": 10.784283775300718, "learning_rate": 4.763174603174603e-07, "loss": 0.0108, "num_tokens": 699812725.0, "reward": 0.3375, "reward_std": 0.09673885181546212, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9301929593086242, "step": 9985 }, { "completion_length": 423.6, "completions/clipped_ratio": 0.0, "completions/max_length": 423.6, "completions/max_terminated_length": 423.6, "completions/mean_length": 99.5421875, "completions/mean_terminated_length": 99.5421875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009040936709823037, "frac_reward_zero_std": 0.8875, "grad_norm": 8.572100639343262, "kl": 8.142116361204534, "learning_rate": 4.762777777777778e-07, "loss": 0.0081, "num_tokens": 700142979.0, "reward": 0.2421875, "reward_std": 0.09500842541456223, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9649441838264465, "step": 9990 }, { "completion_length": 543.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 543.6, "completions/max_terminated_length": 429.2, "completions/mean_length": 92.3171875, "completions/mean_terminated_length": 91.26010284423828, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009045461703171297, "frac_reward_zero_std": 0.91875, "grad_norm": 10.421746253967285, "kl": 3.200280082062818, "learning_rate": 4.762380952380952e-07, "loss": 0.0032, "num_tokens": 700460001.0, "reward": 0.24375, "reward_std": 0.07474813833832741, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.964605712890625, "step": 9995 }, { "completion_length": 447.2, "completions/clipped_ratio": 0.0, "completions/max_length": 447.2, "completions/max_terminated_length": 447.2, "completions/mean_length": 90.99765625, "completions/mean_terminated_length": 90.99765625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009049986696519557, "frac_reward_zero_std": 0.925, "grad_norm": 8.41126823425293, "kl": 3.0412751358468086, "learning_rate": 4.7619841269841265e-07, "loss": 0.003, "num_tokens": 700775022.0, "reward": 0.309375, "reward_std": 0.06396671570837498, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9464547991752624, "step": 10000 }, { "completion_length": 336.8, "completions/clipped_ratio": 0.0, "completions/max_length": 336.8, "completions/max_terminated_length": 336.8, "completions/mean_length": 92.53203125, "completions/mean_terminated_length": 92.53203125, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.009054511689867815, "frac_reward_zero_std": 0.88125, "grad_norm": 7.933701515197754, "kl": 2.1115209761541336, "learning_rate": 4.7615873015873016e-07, "loss": 0.0021, "num_tokens": 701092311.0, "reward": 0.2546875, "reward_std": 0.107839797437191, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9486854314804077, "step": 10005 }, { "completion_length": 341.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 92.17109375, "completions/mean_terminated_length": 92.17109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009059036683216075, "frac_reward_zero_std": 0.91875, "grad_norm": 5.152210712432861, "kl": 1.5432183883152901, "learning_rate": 4.761190476190476e-07, "loss": 0.0015, "num_tokens": 701410010.0, "reward": 0.40625, "reward_std": 0.06812747418880463, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9010132789611817, "step": 10010 }, { "completion_length": 588.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 588.4, "completions/max_terminated_length": 520.6, "completions/mean_length": 104.6578125, "completions/mean_terminated_length": 103.61187438964843, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.009063561676564335, "frac_reward_zero_std": 0.90625, "grad_norm": 8.451881408691406, "kl": 1.9918762168264947, "learning_rate": 4.7607936507936507e-07, "loss": 0.002, "num_tokens": 701746196.0, "reward": 0.2421875, "reward_std": 0.07790893912315369, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9695228219032288, "step": 10015 }, { "completion_length": 346.8, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/max_terminated_length": 346.8, "completions/mean_length": 95.66953125, "completions/mean_terminated_length": 95.66953125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009068086669912595, "frac_reward_zero_std": 0.925, "grad_norm": 5.0223774909973145, "kl": 2.139304868062027, "learning_rate": 4.760396825396825e-07, "loss": 0.0021, "num_tokens": 702068597.0, "reward": 0.3375, "reward_std": 0.06622980684041976, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9369133114814758, "step": 10020 }, { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 92.36875, "completions/mean_terminated_length": 92.36875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009072611663260855, "frac_reward_zero_std": 0.91875, "grad_norm": 5.152660369873047, "kl": 3.489989219023846, "learning_rate": 4.76e-07, "loss": 0.0035, "num_tokens": 702386493.0, "reward": 0.275, "reward_std": 0.07064824402332306, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9529595732688904, "step": 10025 }, { "completion_length": 370.8, "completions/clipped_ratio": 0.0, "completions/max_length": 370.8, "completions/max_terminated_length": 370.8, "completions/mean_length": 94.19140625, "completions/mean_terminated_length": 94.19140625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009077136656609116, "frac_reward_zero_std": 0.96875, "grad_norm": 4.322567462921143, "kl": 1.935604493203573, "learning_rate": 4.7596031746031743e-07, "loss": 0.0019, "num_tokens": 702704754.0, "reward": 0.2296875, "reward_std": 0.024831003323197366, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9485692143440246, "step": 10030 }, { "completion_length": 524.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 524.2, "completions/max_terminated_length": 447.2, "completions/mean_length": 93.42578125, "completions/mean_terminated_length": 92.38141632080078, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009081661649957374, "frac_reward_zero_std": 0.86875, "grad_norm": 8.548762321472168, "kl": 2.5597924740752207, "learning_rate": 4.759206349206349e-07, "loss": 0.0026, "num_tokens": 703021987.0, "reward": 0.2953125, "reward_std": 0.11057528257369995, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9422236800193786, "step": 10035 }, { "completion_length": 365.2, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 87.39453125, "completions/mean_terminated_length": 87.39453125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.009086186643305634, "frac_reward_zero_std": 0.9, "grad_norm": 8.489656448364258, "kl": 4.531834207777865, "learning_rate": 4.758809523809524e-07, "loss": 0.0045, "num_tokens": 703330844.0, "reward": 0.3421875, "reward_std": 0.09116464368999004, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9295509576797485, "step": 10040 }, { "completion_length": 435.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 435.2, "completions/max_terminated_length": 413.0, "completions/mean_length": 95.8515625, "completions/mean_terminated_length": 95.33549194335937, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009090711636653894, "frac_reward_zero_std": 0.9125, "grad_norm": 10.778449058532715, "kl": 5.174744588788599, "learning_rate": 4.7584126984126984e-07, "loss": 0.0052, "num_tokens": 703654054.0, "reward": 0.3515625, "reward_std": 0.0762224555015564, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9244210124015808, "step": 10045 }, { "completion_length": 462.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 462.4, "completions/max_terminated_length": 405.8, "completions/mean_length": 93.21640625, "completions/mean_terminated_length": 92.69476776123047, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009095236630002154, "frac_reward_zero_std": 0.88125, "grad_norm": 8.080483436584473, "kl": 6.079757587728091, "learning_rate": 4.7580158730158724e-07, "loss": 0.0061, "num_tokens": 703972611.0, "reward": 0.265625, "reward_std": 0.10625969618558884, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9490764021873475, "step": 10050 }, { "completion_length": 399.4, "completions/clipped_ratio": 0.0, "completions/max_length": 399.4, "completions/max_terminated_length": 399.4, "completions/mean_length": 86.28125, "completions/mean_terminated_length": 86.28125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009099761623350414, "frac_reward_zero_std": 0.9125, "grad_norm": 11.878567695617676, "kl": 6.443790620006621, "learning_rate": 4.7576190476190475e-07, "loss": 0.0064, "num_tokens": 704278819.0, "reward": 0.2890625, "reward_std": 0.07459683939814568, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9434435725212097, "step": 10055 }, { "completion_length": 314.2, "completions/clipped_ratio": 0.0, "completions/max_length": 314.2, "completions/max_terminated_length": 314.2, "completions/mean_length": 87.11171875, "completions/mean_terminated_length": 87.11171875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009104286616698673, "frac_reward_zero_std": 0.95, "grad_norm": 8.983916282653809, "kl": 3.7229024946456777, "learning_rate": 4.757222222222222e-07, "loss": 0.0037, "num_tokens": 704586578.0, "reward": 0.28125, "reward_std": 0.045818221569061277, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9471548080444336, "step": 10060 }, { "completion_length": 386.2, "completions/clipped_ratio": 0.0, "completions/max_length": 386.2, "completions/max_terminated_length": 386.2, "completions/mean_length": 90.96171875, "completions/mean_terminated_length": 90.96171875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009108811610046933, "frac_reward_zero_std": 0.925, "grad_norm": 2.455435276031494, "kl": 3.0837291155476123, "learning_rate": 4.7568253968253966e-07, "loss": 0.0031, "num_tokens": 704900857.0, "reward": 0.31875, "reward_std": 0.05871202573180199, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9054256558418274, "step": 10065 }, { "completion_length": 455.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 455.6, "completions/max_terminated_length": 444.0, "completions/mean_length": 104.4140625, "completions/mean_terminated_length": 103.90271911621093, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009113336603395193, "frac_reward_zero_std": 0.86875, "grad_norm": 7.0324482917785645, "kl": 3.89431369304657, "learning_rate": 4.756428571428571e-07, "loss": 0.0039, "num_tokens": 705237187.0, "reward": 0.278125, "reward_std": 0.10579240173101426, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9463323593139649, "step": 10070 }, { "completion_length": 302.2, "completions/clipped_ratio": 0.0, "completions/max_length": 302.2, "completions/max_terminated_length": 302.2, "completions/mean_length": 97.48046875, "completions/mean_terminated_length": 97.48046875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009117861596743453, "frac_reward_zero_std": 0.925, "grad_norm": 9.997578620910645, "kl": 1.001971826585941, "learning_rate": 4.7560317460317457e-07, "loss": 0.001, "num_tokens": 705565522.0, "reward": 0.375, "reward_std": 0.06507501602172852, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9107392311096192, "step": 10075 }, { "completion_length": 595.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 595.6, "completions/max_terminated_length": 431.4, "completions/mean_length": 100.15703125, "completions/mean_terminated_length": 99.1062728881836, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009122386590091713, "frac_reward_zero_std": 0.925, "grad_norm": 3.4524662494659424, "kl": 1.5857077340246177, "learning_rate": 4.755634920634921e-07, "loss": 0.0016, "num_tokens": 705896115.0, "reward": 0.3921875, "reward_std": 0.06806758940219879, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9118257641792298, "step": 10080 }, { "completion_length": 339.8, "completions/clipped_ratio": 0.0, "completions/max_length": 339.8, "completions/max_terminated_length": 339.8, "completions/mean_length": 85.275, "completions/mean_terminated_length": 85.275, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009126911583439973, "frac_reward_zero_std": 0.9375, "grad_norm": 11.214969635009766, "kl": 2.023954936116934, "learning_rate": 4.755238095238095e-07, "loss": 0.002, "num_tokens": 706202291.0, "reward": 0.2828125, "reward_std": 0.05034499727189541, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9545452237129212, "step": 10085 }, { "completion_length": 540.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 85.85703125, "completions/mean_terminated_length": 85.85703125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.009131436576788231, "frac_reward_zero_std": 0.90625, "grad_norm": 9.94713306427002, "kl": 6.19636181598762, "learning_rate": 4.75484126984127e-07, "loss": 0.0062, "num_tokens": 706508076.0, "reward": 0.3234375, "reward_std": 0.07628332041203975, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9214846730232239, "step": 10090 }, { "completion_length": 444.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 94.4828125, "completions/mean_terminated_length": 94.4828125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009135961570136492, "frac_reward_zero_std": 0.95, "grad_norm": 0.0250370092689991, "kl": 2.181263346830383, "learning_rate": 4.7544444444444444e-07, "loss": 0.0022, "num_tokens": 706827886.0, "reward": 0.478125, "reward_std": 0.03740528486669063, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8609733343124389, "step": 10095 }, { "completion_length": 462.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 462.4, "completions/max_terminated_length": 373.6, "completions/mean_length": 92.34609375, "completions/mean_terminated_length": 91.81936950683594, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009140486563484752, "frac_reward_zero_std": 0.875, "grad_norm": 5.290961742401123, "kl": 2.191595726914238, "learning_rate": 4.754047619047619e-07, "loss": 0.0022, "num_tokens": 707145225.0, "reward": 0.2484375, "reward_std": 0.105684057995677, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9476941347122192, "step": 10100 }, { "completion_length": 286.6, "completions/clipped_ratio": 0.0, "completions/max_length": 286.6, "completions/max_terminated_length": 286.6, "completions/mean_length": 85.59375, "completions/mean_terminated_length": 85.59375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009145011556833012, "frac_reward_zero_std": 0.94375, "grad_norm": 8.205629348754883, "kl": 1.8385408216621726, "learning_rate": 4.7536507936507934e-07, "loss": 0.0018, "num_tokens": 707451273.0, "reward": 0.303125, "reward_std": 0.04839985817670822, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9509167909622193, "step": 10105 }, { "completion_length": 513.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 513.0, "completions/max_terminated_length": 465.2, "completions/mean_length": 99.42265625, "completions/mean_terminated_length": 98.90677490234376, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009149536550181272, "frac_reward_zero_std": 0.8875, "grad_norm": 12.540733337402344, "kl": 2.4486783452448435, "learning_rate": 4.753253968253968e-07, "loss": 0.0024, "num_tokens": 707778622.0, "reward": 0.30625, "reward_std": 0.10005094334483147, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9222849845886231, "step": 10110 }, { "completion_length": 369.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 90.14609375, "completions/mean_terminated_length": 90.14609375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00915406154352953, "frac_reward_zero_std": 0.9625, "grad_norm": 2.0769095420837402, "kl": 2.0628512591822075, "learning_rate": 4.752857142857143e-07, "loss": 0.0021, "num_tokens": 708092849.0, "reward": 0.3671875, "reward_std": 0.034457636252045634, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9299895286560058, "step": 10115 }, { "completion_length": 394.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 94.11875, "completions/mean_terminated_length": 94.11875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00915858653687779, "frac_reward_zero_std": 0.9125, "grad_norm": 5.530482769012451, "kl": 0.924814380495809, "learning_rate": 4.752460317460317e-07, "loss": 0.0009, "num_tokens": 708412385.0, "reward": 0.1875, "reward_std": 0.07506864070892334, "rewards/verify_chess_move/mean": 0.1875, "rewards/verify_chess_move/std": 0.9750031232833862, "step": 10120 }, { "completion_length": 417.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 96.43359375, "completions/mean_terminated_length": 96.43359375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00916311153022605, "frac_reward_zero_std": 0.86875, "grad_norm": 7.767888069152832, "kl": 0.869346099277027, "learning_rate": 4.7520634920634916e-07, "loss": 0.0009, "num_tokens": 708737084.0, "reward": 0.38125, "reward_std": 0.11078940927982331, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9134015917778016, "step": 10125 }, { "completion_length": 501.4, "completions/clipped_ratio": 0.0, "completions/max_length": 501.4, "completions/max_terminated_length": 501.4, "completions/mean_length": 97.321875, "completions/mean_terminated_length": 97.321875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00916763652357431, "frac_reward_zero_std": 0.91875, "grad_norm": 8.39892292022705, "kl": 1.039790374657605, "learning_rate": 4.7516666666666667e-07, "loss": 0.001, "num_tokens": 709061736.0, "reward": 0.3125, "reward_std": 0.07338216304779052, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9229264616966247, "step": 10130 }, { "completion_length": 337.4, "completions/clipped_ratio": 0.0, "completions/max_length": 337.4, "completions/max_terminated_length": 337.4, "completions/mean_length": 88.3625, "completions/mean_terminated_length": 88.3625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00917216151692257, "frac_reward_zero_std": 0.925, "grad_norm": 0.21272246539592743, "kl": 1.9477925488958134, "learning_rate": 4.751269841269841e-07, "loss": 0.0019, "num_tokens": 709373008.0, "reward": 0.3421875, "reward_std": 0.06302408576011657, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9322981476783753, "step": 10135 }, { "completion_length": 453.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 453.0, "completions/max_terminated_length": 444.2, "completions/mean_length": 99.42421875, "completions/mean_terminated_length": 97.87576904296876, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00917668651027083, "frac_reward_zero_std": 0.9, "grad_norm": 3.2059738636016846, "kl": 2.004028322501108, "learning_rate": 4.750873015873016e-07, "loss": 0.002, "num_tokens": 709700639.0, "reward": 0.225, "reward_std": 0.089162165671587, "rewards/verify_chess_move/mean": 0.225, "rewards/verify_chess_move/std": 0.9711775302886962, "step": 10140 }, { "completion_length": 513.8, "completions/clipped_ratio": 0.0, "completions/max_length": 513.8, "completions/max_terminated_length": 513.8, "completions/mean_length": 91.7015625, "completions/mean_terminated_length": 91.7015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009181211503619089, "frac_reward_zero_std": 0.875, "grad_norm": 7.442864894866943, "kl": 1.840908694884274, "learning_rate": 4.7504761904761903e-07, "loss": 0.0018, "num_tokens": 710015857.0, "reward": 0.3734375, "reward_std": 0.11273102313280106, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.8946632146835327, "step": 10145 }, { "completion_length": 315.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 93.2671875, "completions/mean_terminated_length": 93.2671875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009185736496967349, "frac_reward_zero_std": 0.91875, "grad_norm": 0.18538880348205566, "kl": 0.5693395256064833, "learning_rate": 4.750079365079365e-07, "loss": 0.0006, "num_tokens": 710335287.0, "reward": 0.2578125, "reward_std": 0.07243953794240951, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.96305410861969, "step": 10150 }, { "completion_length": 469.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 469.0, "completions/max_terminated_length": 416.2, "completions/mean_length": 90.40625, "completions/mean_terminated_length": 89.87652435302735, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00919026149031561, "frac_reward_zero_std": 0.95, "grad_norm": 0.25668463110923767, "kl": 1.6253412946825847, "learning_rate": 4.7496825396825394e-07, "loss": 0.0016, "num_tokens": 710649847.0, "reward": 0.221875, "reward_std": 0.04218915030360222, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9702972292900085, "step": 10155 }, { "completion_length": 335.2, "completions/clipped_ratio": 0.0, "completions/max_length": 335.2, "completions/max_terminated_length": 335.2, "completions/mean_length": 95.00546875, "completions/mean_terminated_length": 95.00546875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00919478648366387, "frac_reward_zero_std": 0.9, "grad_norm": 15.24605655670166, "kl": 1.9630754354409874, "learning_rate": 4.749285714285714e-07, "loss": 0.002, "num_tokens": 710971054.0, "reward": 0.28125, "reward_std": 0.08711221925914288, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9404386520385742, "step": 10160 }, { "completion_length": 323.8, "completions/clipped_ratio": 0.0, "completions/max_length": 323.8, "completions/max_terminated_length": 323.8, "completions/mean_length": 92.32265625, "completions/mean_terminated_length": 92.32265625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00919931147701213, "frac_reward_zero_std": 0.91875, "grad_norm": 7.5070600509643555, "kl": 0.965087210573256, "learning_rate": 4.748888888888889e-07, "loss": 0.001, "num_tokens": 711288491.0, "reward": 0.3671875, "reward_std": 0.07175556570291519, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9094961404800415, "step": 10165 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 366.2, "completions/max_terminated_length": 306.4, "completions/mean_length": 90.4359375, "completions/mean_terminated_length": 89.91969146728516, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009203836470360388, "frac_reward_zero_std": 0.93125, "grad_norm": 9.998703002929688, "kl": 1.4953872148878873, "learning_rate": 4.7484920634920635e-07, "loss": 0.0015, "num_tokens": 711602569.0, "reward": 0.5015625, "reward_std": 0.0620215754956007, "rewards/verify_chess_move/mean": 0.5015625, "rewards/verify_chess_move/std": 0.8625111937522888, "step": 10170 }, { "completion_length": 362.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 362.2, "completions/max_terminated_length": 309.6, "completions/mean_length": 86.184375, "completions/mean_terminated_length": 85.65308380126953, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.009208361463708648, "frac_reward_zero_std": 0.90625, "grad_norm": 0.27664515376091003, "kl": 0.7651426190743222, "learning_rate": 4.7480952380952375e-07, "loss": 0.0008, "num_tokens": 711909797.0, "reward": 0.3671875, "reward_std": 0.07312507182359695, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.918875002861023, "step": 10175 }, { "completion_length": 280.4, "completions/clipped_ratio": 0.0, "completions/max_length": 280.4, "completions/max_terminated_length": 280.4, "completions/mean_length": 84.55, "completions/mean_terminated_length": 84.55, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009212886457056908, "frac_reward_zero_std": 0.93125, "grad_norm": 2.8852756023406982, "kl": 1.2220369983930142, "learning_rate": 4.7476984126984126e-07, "loss": 0.0012, "num_tokens": 712214413.0, "reward": 0.315625, "reward_std": 0.05360962077975273, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.937385356426239, "step": 10180 }, { "completion_length": 360.2, "completions/clipped_ratio": 0.0, "completions/max_length": 360.2, "completions/max_terminated_length": 360.2, "completions/mean_length": 88.71953125, "completions/mean_terminated_length": 88.71953125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009217411450405168, "frac_reward_zero_std": 0.875, "grad_norm": 4.86956787109375, "kl": 1.4444307595840655, "learning_rate": 4.747301587301587e-07, "loss": 0.0014, "num_tokens": 712525822.0, "reward": 0.3109375, "reward_std": 0.10884230881929398, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9356228590011597, "step": 10185 }, { "completion_length": 559.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 559.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 87.82421875, "completions/mean_terminated_length": 86.75696258544922, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009221936443753428, "frac_reward_zero_std": 0.93125, "grad_norm": 8.035621643066406, "kl": 1.0328820234630256, "learning_rate": 4.746904761904762e-07, "loss": 0.001, "num_tokens": 712835725.0, "reward": 0.3453125, "reward_std": 0.0561313733458519, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.930327033996582, "step": 10190 }, { "completion_length": 506.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 506.4, "completions/max_terminated_length": 409.6, "completions/mean_length": 96.41796875, "completions/mean_terminated_length": 95.8941436767578, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009226461437101688, "frac_reward_zero_std": 0.9, "grad_norm": 2.945305109024048, "kl": 1.79225639346987, "learning_rate": 4.746507936507936e-07, "loss": 0.0018, "num_tokens": 713160500.0, "reward": 0.3328125, "reward_std": 0.0886903628706932, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9374682784080506, "step": 10195 }, { "completion_length": 438.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 438.8, "completions/max_terminated_length": 420.2, "completions/mean_length": 98.95, "completions/mean_terminated_length": 97.91280822753906, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009230986430449947, "frac_reward_zero_std": 0.8875, "grad_norm": 4.5605926513671875, "kl": 1.5989701534505003, "learning_rate": 4.746111111111111e-07, "loss": 0.0016, "num_tokens": 713488644.0, "reward": 0.309375, "reward_std": 0.09705739319324494, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9474847555160523, "step": 10200 }, { "completion_length": 432.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 432.4, "completions/max_terminated_length": 380.0, "completions/mean_length": 86.7796875, "completions/mean_terminated_length": 86.24195861816406, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009235511423798207, "frac_reward_zero_std": 0.8875, "grad_norm": 6.195837497711182, "kl": 2.006912069860846, "learning_rate": 4.745714285714286e-07, "loss": 0.002, "num_tokens": 713797114.0, "reward": 0.31875, "reward_std": 0.09863651469349861, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9481064558029175, "step": 10205 }, { "completion_length": 299.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 88.60234375, "completions/mean_terminated_length": 88.60234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009240036417146467, "frac_reward_zero_std": 0.90625, "grad_norm": 8.104266166687012, "kl": 4.323299724888057, "learning_rate": 4.74531746031746e-07, "loss": 0.0043, "num_tokens": 714108573.0, "reward": 0.3203125, "reward_std": 0.08448311053216458, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9473485946655273, "step": 10210 }, { "completion_length": 363.6, "completions/clipped_ratio": 0.0, "completions/max_length": 363.6, "completions/max_terminated_length": 363.6, "completions/mean_length": 85.63671875, "completions/mean_terminated_length": 85.63671875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009244561410494727, "frac_reward_zero_std": 0.925, "grad_norm": 7.9808549880981445, "kl": 5.564567623811309, "learning_rate": 4.7449206349206344e-07, "loss": 0.0056, "num_tokens": 714415868.0, "reward": 0.36875, "reward_std": 0.06554583832621574, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9170651078224182, "step": 10215 }, { "completion_length": 405.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 405.4, "completions/max_terminated_length": 363.0, "completions/mean_length": 85.90859375, "completions/mean_terminated_length": 85.37588958740234, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009249086403842987, "frac_reward_zero_std": 0.90625, "grad_norm": 5.017146110534668, "kl": 4.813770758244209, "learning_rate": 4.7445238095238094e-07, "loss": 0.0048, "num_tokens": 714722311.0, "reward": 0.3265625, "reward_std": 0.07790893577039242, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9369113445281982, "step": 10220 }, { "completion_length": 302.4, "completions/clipped_ratio": 0.0, "completions/max_length": 302.4, "completions/max_terminated_length": 302.4, "completions/mean_length": 84.3515625, "completions/mean_terminated_length": 84.3515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009253611397191245, "frac_reward_zero_std": 0.925, "grad_norm": 0.317903608083725, "kl": 2.953438011580147, "learning_rate": 4.744126984126984e-07, "loss": 0.003, "num_tokens": 715026121.0, "reward": 0.3203125, "reward_std": 0.057345069199800494, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9468991160392761, "step": 10225 }, { "completion_length": 472.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 472.8, "completions/max_terminated_length": 435.8, "completions/mean_length": 87.71796875, "completions/mean_terminated_length": 87.1880096435547, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009258136390539505, "frac_reward_zero_std": 0.925, "grad_norm": 1.6777629852294922, "kl": 1.8410733289201744, "learning_rate": 4.7437301587301585e-07, "loss": 0.0018, "num_tokens": 715336048.0, "reward": 0.31875, "reward_std": 0.061445944011211395, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.938794469833374, "step": 10230 }, { "completion_length": 389.2, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/max_terminated_length": 389.2, "completions/mean_length": 93.8140625, "completions/mean_terminated_length": 93.8140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009262661383887765, "frac_reward_zero_std": 0.95, "grad_norm": 4.789824485778809, "kl": 1.98985956821125, "learning_rate": 4.743333333333333e-07, "loss": 0.002, "num_tokens": 715656074.0, "reward": 0.265625, "reward_std": 0.04581822231411934, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9496737718582153, "step": 10235 }, { "completion_length": 444.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.2, "completions/max_terminated_length": 416.4, "completions/mean_length": 89.5640625, "completions/mean_terminated_length": 89.04300079345703, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009267186377236026, "frac_reward_zero_std": 0.93125, "grad_norm": 0.2969939112663269, "kl": 2.5289384280331433, "learning_rate": 4.7429365079365076e-07, "loss": 0.0025, "num_tokens": 715968340.0, "reward": 0.284375, "reward_std": 0.062233741581439975, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9556935906410218, "step": 10240 }, { "completion_length": 338.6, "completions/clipped_ratio": 0.0, "completions/max_length": 338.6, "completions/max_terminated_length": 338.6, "completions/mean_length": 95.76796875, "completions/mean_terminated_length": 95.76796875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009271711370584286, "frac_reward_zero_std": 0.93125, "grad_norm": 4.223752498626709, "kl": 1.3972509878920392, "learning_rate": 4.742539682539682e-07, "loss": 0.0014, "num_tokens": 716292739.0, "reward": 0.3125, "reward_std": 0.05839348472654819, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9376553773880005, "step": 10245 }, { "completion_length": 300.6, "completions/clipped_ratio": 0.0, "completions/max_length": 300.6, "completions/max_terminated_length": 300.6, "completions/mean_length": 90.9375, "completions/mean_terminated_length": 90.9375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009276236363932546, "frac_reward_zero_std": 0.9125, "grad_norm": 6.593856334686279, "kl": 1.2398439800832421, "learning_rate": 4.7421428571428567e-07, "loss": 0.0012, "num_tokens": 716607611.0, "reward": 0.2609375, "reward_std": 0.07506766319274902, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9609160184860229, "step": 10250 }, { "completion_length": 341.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 95.92109375, "completions/mean_terminated_length": 95.92109375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009280761357280804, "frac_reward_zero_std": 0.9, "grad_norm": 7.0958757400512695, "kl": 0.707668192894198, "learning_rate": 4.741746031746032e-07, "loss": 0.0007, "num_tokens": 716932582.0, "reward": 0.2828125, "reward_std": 0.0804905716329813, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9398904919624329, "step": 10255 }, { "completion_length": 473.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 473.2, "completions/max_terminated_length": 444.8, "completions/mean_length": 98.3796875, "completions/mean_terminated_length": 97.85464935302734, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009285286350629064, "frac_reward_zero_std": 0.9375, "grad_norm": 2.1039650440216064, "kl": 1.051263802824542, "learning_rate": 4.7413492063492063e-07, "loss": 0.0011, "num_tokens": 717260124.0, "reward": 0.28125, "reward_std": 0.04782324507832527, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9359249472618103, "step": 10260 }, { "completion_length": 475.2, "completions/clipped_ratio": 0.0, "completions/max_length": 475.2, "completions/max_terminated_length": 475.2, "completions/mean_length": 91.28046875, "completions/mean_terminated_length": 91.28046875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009289811343977324, "frac_reward_zero_std": 0.90625, "grad_norm": 15.37409782409668, "kl": 0.9973685489967465, "learning_rate": 4.7409523809523803e-07, "loss": 0.001, "num_tokens": 717575259.0, "reward": 0.3828125, "reward_std": 0.07922842241823673, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.892720627784729, "step": 10265 }, { "completion_length": 381.8, "completions/clipped_ratio": 0.0, "completions/max_length": 381.8, "completions/max_terminated_length": 381.8, "completions/mean_length": 91.9421875, "completions/mean_terminated_length": 91.9421875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009294336337325584, "frac_reward_zero_std": 0.94375, "grad_norm": 3.118433952331543, "kl": 0.42405977203743533, "learning_rate": 4.7405555555555554e-07, "loss": 0.0004, "num_tokens": 717892905.0, "reward": 0.3015625, "reward_std": 0.04524259120225906, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9543743014335633, "step": 10270 }, { "completion_length": 415.8, "completions/clipped_ratio": 0.0, "completions/max_length": 415.8, "completions/max_terminated_length": 415.8, "completions/mean_length": 90.33046875, "completions/mean_terminated_length": 90.33046875, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.009298861330673844, "frac_reward_zero_std": 0.93125, "grad_norm": 1.5493098497390747, "kl": 0.3309237003326416, "learning_rate": 4.74015873015873e-07, "loss": 0.0003, "num_tokens": 718207600.0, "reward": 0.2234375, "reward_std": 0.05954729653894901, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9670165061950684, "step": 10275 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 90.0796875, "completions/mean_terminated_length": 90.0796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009303386324022103, "frac_reward_zero_std": 0.925, "grad_norm": 0.2779816687107086, "kl": 0.27171468558954076, "learning_rate": 4.739761904761905e-07, "loss": 0.0003, "num_tokens": 718520918.0, "reward": 0.4078125, "reward_std": 0.06780795156955718, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.8904654264450074, "step": 10280 }, { "completion_length": 326.4, "completions/clipped_ratio": 0.0, "completions/max_length": 326.4, "completions/max_terminated_length": 326.4, "completions/mean_length": 90.04609375, "completions/mean_terminated_length": 90.04609375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009307911317370363, "frac_reward_zero_std": 0.925, "grad_norm": 0.14904126524925232, "kl": 0.3559709379100241, "learning_rate": 4.739365079365079e-07, "loss": 0.0004, "num_tokens": 718834249.0, "reward": 0.303125, "reward_std": 0.06439104601740837, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.943052864074707, "step": 10285 }, { "completion_length": 449.2, "completions/clipped_ratio": 0.0, "completions/max_length": 449.2, "completions/max_terminated_length": 449.2, "completions/mean_length": 97.940625, "completions/mean_terminated_length": 97.940625, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.009312436310718623, "frac_reward_zero_std": 0.91875, "grad_norm": 10.29080867767334, "kl": 0.7003090938553214, "learning_rate": 4.7389682539682535e-07, "loss": 0.0007, "num_tokens": 719159205.0, "reward": 0.3984375, "reward_std": 0.06428623721003532, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9078172445297241, "step": 10290 }, { "completion_length": 452.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 452.4, "completions/max_terminated_length": 363.8, "completions/mean_length": 93.896875, "completions/mean_terminated_length": 93.37892608642578, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.009316961304066883, "frac_reward_zero_std": 0.86875, "grad_norm": 4.548583984375, "kl": 1.4652122267405503, "learning_rate": 4.7385714285714286e-07, "loss": 0.0015, "num_tokens": 719478241.0, "reward": 0.2703125, "reward_std": 0.11010446101427078, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9235382676124573, "step": 10295 }, { "completion_length": 513.8, "completions/clipped_ratio": 0.0, "completions/max_length": 513.8, "completions/max_terminated_length": 513.8, "completions/mean_length": 96.509375, "completions/mean_terminated_length": 96.509375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009321486297415143, "frac_reward_zero_std": 0.925, "grad_norm": 9.250497817993164, "kl": 1.1336351500009187, "learning_rate": 4.7381746031746026e-07, "loss": 0.0011, "num_tokens": 719801221.0, "reward": 0.234375, "reward_std": 0.07169567830860615, "rewards/verify_chess_move/mean": 0.234375, "rewards/verify_chess_move/std": 0.9484705209732056, "step": 10300 }, { "completion_length": 490.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 490.2, "completions/max_terminated_length": 467.8, "completions/mean_length": 96.51875, "completions/mean_terminated_length": 96.01428527832032, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009326011290763403, "frac_reward_zero_std": 0.86875, "grad_norm": 3.8070809841156006, "kl": 3.4165154539747165, "learning_rate": 4.7377777777777777e-07, "loss": 0.0034, "num_tokens": 720124237.0, "reward": 0.2390625, "reward_std": 0.115572290122509, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9643476366996765, "step": 10305 }, { "completion_length": 496.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 496.6, "completions/max_terminated_length": 399.8, "completions/mean_length": 94.728125, "completions/mean_terminated_length": 94.19496459960938, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009330536284111662, "frac_reward_zero_std": 0.88125, "grad_norm": 3.3717782497406006, "kl": 2.187177231349051, "learning_rate": 4.737380952380952e-07, "loss": 0.0022, "num_tokens": 720446673.0, "reward": 0.25, "reward_std": 0.10326809883117676, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9689777255058288, "step": 10310 }, { "completion_length": 421.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 421.6, "completions/max_terminated_length": 326.4, "completions/mean_length": 94.44296875, "completions/mean_terminated_length": 93.91629028320312, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009335061277459922, "frac_reward_zero_std": 0.925, "grad_norm": 16.935827255249023, "kl": 3.4510309497243723, "learning_rate": 4.736984126984127e-07, "loss": 0.0035, "num_tokens": 720767568.0, "reward": 0.284375, "reward_std": 0.062341098487377164, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9513047337532043, "step": 10315 }, { "completion_length": 400.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 400.8, "completions/max_terminated_length": 371.2, "completions/mean_length": 91.79765625, "completions/mean_terminated_length": 91.27965545654297, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009339586270808182, "frac_reward_zero_std": 0.9125, "grad_norm": 7.627761363983154, "kl": 1.6248158938833512, "learning_rate": 4.7365873015873013e-07, "loss": 0.0016, "num_tokens": 721083525.0, "reward": 0.315625, "reward_std": 0.07280554883182049, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9446502923965454, "step": 10320 }, { "completion_length": 425.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 425.0, "completions/max_terminated_length": 414.2, "completions/mean_length": 96.3171875, "completions/mean_terminated_length": 95.28844757080078, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009344111264156442, "frac_reward_zero_std": 0.925, "grad_norm": 6.534244537353516, "kl": 3.0787430482800118, "learning_rate": 4.736190476190476e-07, "loss": 0.0031, "num_tokens": 721407179.0, "reward": 0.2703125, "reward_std": 0.05713388435542584, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9615579843521118, "step": 10325 }, { "completion_length": 490.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 490.2, "completions/max_terminated_length": 458.6, "completions/mean_length": 93.890625, "completions/mean_terminated_length": 93.3633316040039, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009348636257504702, "frac_reward_zero_std": 0.925, "grad_norm": 0.08714985847473145, "kl": 3.722892415197566, "learning_rate": 4.735793650793651e-07, "loss": 0.0037, "num_tokens": 721728223.0, "reward": 0.234375, "reward_std": 0.06328470669686795, "rewards/verify_chess_move/mean": 0.234375, "rewards/verify_chess_move/std": 0.9507203102111816, "step": 10330 }, { "completion_length": 355.6, "completions/clipped_ratio": 0.0, "completions/max_length": 355.6, "completions/max_terminated_length": 355.6, "completions/mean_length": 91.10859375, "completions/mean_terminated_length": 91.10859375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00935316125085296, "frac_reward_zero_std": 0.91875, "grad_norm": 7.4468994140625, "kl": 2.1839092018315567, "learning_rate": 4.735396825396825e-07, "loss": 0.0022, "num_tokens": 722043810.0, "reward": 0.3703125, "reward_std": 0.06928324699401855, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9272046089172363, "step": 10335 }, { "completion_length": 417.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 417.8, "completions/max_terminated_length": 325.8, "completions/mean_length": 89.2171875, "completions/mean_terminated_length": 88.69551849365234, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00935768624420122, "frac_reward_zero_std": 0.9125, "grad_norm": 10.608460426330566, "kl": 6.050816402491182, "learning_rate": 4.7349999999999995e-07, "loss": 0.0061, "num_tokens": 722355448.0, "reward": 0.2984375, "reward_std": 0.07827240228652954, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9532014846801757, "step": 10340 }, { "completion_length": 353.6, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/max_terminated_length": 353.6, "completions/mean_length": 90.66484375, "completions/mean_terminated_length": 90.66484375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00936221123754948, "frac_reward_zero_std": 0.89375, "grad_norm": 13.280211448669434, "kl": 2.1306744728935882, "learning_rate": 4.7346031746031745e-07, "loss": 0.0021, "num_tokens": 722670243.0, "reward": 0.3625, "reward_std": 0.09669235944747925, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9184323906898498, "step": 10345 }, { "completion_length": 268.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 89.26640625, "completions/mean_terminated_length": 89.26640625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00936673623089774, "frac_reward_zero_std": 0.91875, "grad_norm": 8.58230209350586, "kl": 1.108777254424058, "learning_rate": 4.734206349206349e-07, "loss": 0.0011, "num_tokens": 722983704.0, "reward": 0.4078125, "reward_std": 0.06975407265126705, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.8951959609985352, "step": 10350 }, { "completion_length": 412.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 412.6, "completions/max_terminated_length": 389.8, "completions/mean_length": 95.5015625, "completions/mean_terminated_length": 94.97967376708985, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009371261224246, "frac_reward_zero_std": 0.90625, "grad_norm": 2.9287657737731934, "kl": 3.834450024855323, "learning_rate": 4.7338095238095236e-07, "loss": 0.0038, "num_tokens": 723305522.0, "reward": 0.2921875, "reward_std": 0.08174919188022614, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9499660968780518, "step": 10355 }, { "completion_length": 351.8, "completions/clipped_ratio": 0.0, "completions/max_length": 351.8, "completions/max_terminated_length": 351.8, "completions/mean_length": 92.028125, "completions/mean_terminated_length": 92.028125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.00937578621759426, "frac_reward_zero_std": 0.925, "grad_norm": 2.139206647872925, "kl": 6.555676445807331, "learning_rate": 4.733412698412698e-07, "loss": 0.0066, "num_tokens": 723620942.0, "reward": 0.3546875, "reward_std": 0.06328372620046138, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9272693753242492, "step": 10360 }, { "completion_length": 538.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 538.6, "completions/max_terminated_length": 363.8, "completions/mean_length": 93.3125, "completions/mean_terminated_length": 92.26453704833985, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00938031121094252, "frac_reward_zero_std": 0.89375, "grad_norm": 8.352267265319824, "kl": 3.6986525888089092, "learning_rate": 4.7330158730158727e-07, "loss": 0.0037, "num_tokens": 723939262.0, "reward": 0.284375, "reward_std": 0.08401483371853828, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9561300635337829, "step": 10365 }, { "completion_length": 457.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 457.6, "completions/max_terminated_length": 434.4, "completions/mean_length": 94.07578125, "completions/mean_terminated_length": 93.56043701171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00938483620429078, "frac_reward_zero_std": 0.94375, "grad_norm": 6.527642726898193, "kl": 0.98227819444146, "learning_rate": 4.732619047619048e-07, "loss": 0.001, "num_tokens": 724258871.0, "reward": 0.2859375, "reward_std": 0.05070846192538738, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9518675684928894, "step": 10370 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 365.0, "completions/max_terminated_length": 341.4, "completions/mean_length": 89.503125, "completions/mean_terminated_length": 88.98290252685547, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.00938936119763904, "frac_reward_zero_std": 0.9375, "grad_norm": 7.349051475524902, "kl": 1.2708187495009042, "learning_rate": 4.732222222222222e-07, "loss": 0.0013, "num_tokens": 724571619.0, "reward": 0.1984375, "reward_std": 0.053290098533034326, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.967404580116272, "step": 10375 }, { "completion_length": 322.8, "completions/clipped_ratio": 0.0, "completions/max_length": 322.8, "completions/max_terminated_length": 322.8, "completions/mean_length": 87.58359375, "completions/mean_terminated_length": 87.58359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0093938861909873, "frac_reward_zero_std": 0.95625, "grad_norm": 7.112320899963379, "kl": 3.9678507725475356, "learning_rate": 4.731825396825397e-07, "loss": 0.004, "num_tokens": 724880542.0, "reward": 0.2875, "reward_std": 0.037981899455189705, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9517965197563172, "step": 10380 }, { "completion_length": 338.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 93.56953125, "completions/mean_terminated_length": 93.56953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00939841118433556, "frac_reward_zero_std": 0.9375, "grad_norm": 6.924177169799805, "kl": 4.548538099462166, "learning_rate": 4.7314285714285714e-07, "loss": 0.0045, "num_tokens": 725201415.0, "reward": 0.31875, "reward_std": 0.05581184923648834, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.945279061794281, "step": 10385 }, { "completion_length": 414.2, "completions/clipped_ratio": 0.0, "completions/max_length": 414.2, "completions/max_terminated_length": 414.2, "completions/mean_length": 93.290625, "completions/mean_terminated_length": 93.290625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009402936177683818, "frac_reward_zero_std": 0.91875, "grad_norm": 4.933322906494141, "kl": 4.994514098367654, "learning_rate": 4.7310317460317454e-07, "loss": 0.005, "num_tokens": 725518499.0, "reward": 0.35625, "reward_std": 0.06244845576584339, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.930048131942749, "step": 10390 }, { "completion_length": 494.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 494.4, "completions/max_terminated_length": 421.4, "completions/mean_length": 89.54140625, "completions/mean_terminated_length": 89.01357879638672, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.009407461171032078, "frac_reward_zero_std": 0.90625, "grad_norm": 21.01656723022461, "kl": 3.116763290169183, "learning_rate": 4.7306349206349204e-07, "loss": 0.0031, "num_tokens": 725832488.0, "reward": 0.278125, "reward_std": 0.08311614841222763, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9595937132835388, "step": 10395 }, { "completion_length": 558.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 558.4, "completions/max_terminated_length": 391.8, "completions/mean_length": 94.15, "completions/mean_terminated_length": 93.0897933959961, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009411986164380338, "frac_reward_zero_std": 0.9, "grad_norm": 5.149504661560059, "kl": 4.263876418443397, "learning_rate": 4.730238095238095e-07, "loss": 0.0043, "num_tokens": 726150504.0, "reward": 0.371875, "reward_std": 0.0841186597943306, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9259088277816773, "step": 10400 }, { "completion_length": 370.8, "completions/clipped_ratio": 0.0, "completions/max_length": 370.8, "completions/max_terminated_length": 370.8, "completions/mean_length": 91.59140625, "completions/mean_terminated_length": 91.59140625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009416511157728598, "frac_reward_zero_std": 0.89375, "grad_norm": 5.958953857421875, "kl": 2.0034264071146026, "learning_rate": 4.72984126984127e-07, "loss": 0.002, "num_tokens": 726465501.0, "reward": 0.384375, "reward_std": 0.09105983376502991, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.916258704662323, "step": 10405 }, { "completion_length": 539.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 539.4, "completions/max_terminated_length": 513.6, "completions/mean_length": 94.1703125, "completions/mean_terminated_length": 93.6421401977539, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009421036151076858, "frac_reward_zero_std": 0.91875, "grad_norm": 9.909989356994629, "kl": 0.9094180234242231, "learning_rate": 4.729444444444444e-07, "loss": 0.0009, "num_tokens": 726786199.0, "reward": 0.3328125, "reward_std": 0.0685992807149887, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9254244089126586, "step": 10410 }, { "completion_length": 422.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 422.8, "completions/max_terminated_length": 404.8, "completions/mean_length": 99.56640625, "completions/mean_terminated_length": 98.53868713378907, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.009425561144425118, "frac_reward_zero_std": 0.9125, "grad_norm": 34.480960845947266, "kl": 1.076445641834289, "learning_rate": 4.7290476190476186e-07, "loss": 0.0011, "num_tokens": 727115268.0, "reward": 0.3609375, "reward_std": 0.07733075767755508, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9310239315032959, "step": 10415 }, { "completion_length": 468.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 468.6, "completions/max_terminated_length": 465.2, "completions/mean_length": 90.84609375, "completions/mean_terminated_length": 90.3157943725586, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009430086137773377, "frac_reward_zero_std": 0.9, "grad_norm": 7.540437698364258, "kl": 0.9594834520714357, "learning_rate": 4.7286507936507937e-07, "loss": 0.001, "num_tokens": 727430583.0, "reward": 0.1765625, "reward_std": 0.08343567438423634, "rewards/verify_chess_move/mean": 0.1765625, "rewards/verify_chess_move/std": 0.977728545665741, "step": 10420 }, { "completion_length": 361.4, "completions/clipped_ratio": 0.0, "completions/max_length": 361.4, "completions/max_terminated_length": 361.4, "completions/mean_length": 94.76015625, "completions/mean_terminated_length": 94.76015625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009434611131121637, "frac_reward_zero_std": 0.925, "grad_norm": 0.912330687046051, "kl": 0.6151436958462, "learning_rate": 4.7282539682539677e-07, "loss": 0.0006, "num_tokens": 727751188.0, "reward": 0.403125, "reward_std": 0.06191676929593086, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9057339429855347, "step": 10425 }, { "completion_length": 402.2, "completions/clipped_ratio": 0.0, "completions/max_length": 402.2, "completions/max_terminated_length": 402.2, "completions/mean_length": 85.67578125, "completions/mean_terminated_length": 85.67578125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009439136124469897, "frac_reward_zero_std": 0.93125, "grad_norm": 2.147022008895874, "kl": 1.099107583751902, "learning_rate": 4.727857142857143e-07, "loss": 0.0011, "num_tokens": 728056949.0, "reward": 0.39375, "reward_std": 0.05723869316279888, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.8942021489143371, "step": 10430 }, { "completion_length": 369.8, "completions/clipped_ratio": 0.0, "completions/max_length": 369.8, "completions/max_terminated_length": 369.8, "completions/mean_length": 91.14140625, "completions/mean_terminated_length": 91.14140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009443661117818157, "frac_reward_zero_std": 0.9, "grad_norm": 7.426667213439941, "kl": 1.5037240860052408, "learning_rate": 4.7274603174603173e-07, "loss": 0.0015, "num_tokens": 728371154.0, "reward": 0.371875, "reward_std": 0.08322350792586804, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.919649863243103, "step": 10435 }, { "completion_length": 377.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 89.7359375, "completions/mean_terminated_length": 89.7359375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009448186111166417, "frac_reward_zero_std": 0.89375, "grad_norm": 3.387232542037964, "kl": 0.5085314672905952, "learning_rate": 4.727063492063492e-07, "loss": 0.0005, "num_tokens": 728683056.0, "reward": 0.434375, "reward_std": 0.09237932115793228, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.8934550642967224, "step": 10440 }, { "completion_length": 487.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 487.8, "completions/max_terminated_length": 445.8, "completions/mean_length": 94.128125, "completions/mean_terminated_length": 93.59846954345703, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009452711104514677, "frac_reward_zero_std": 0.9, "grad_norm": 0.888968288898468, "kl": 3.509975152974948, "learning_rate": 4.7266666666666664e-07, "loss": 0.0035, "num_tokens": 729002500.0, "reward": 0.31875, "reward_std": 0.08595742397010327, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9485979318618775, "step": 10445 }, { "completion_length": 319.8, "completions/clipped_ratio": 0.0, "completions/max_length": 319.8, "completions/max_terminated_length": 319.8, "completions/mean_length": 93.4078125, "completions/mean_terminated_length": 93.4078125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009457236097862936, "frac_reward_zero_std": 0.93125, "grad_norm": 3.5660903453826904, "kl": 1.4772644772659986, "learning_rate": 4.726269841269841e-07, "loss": 0.0015, "num_tokens": 729322702.0, "reward": 0.3859375, "reward_std": 0.06159724444150925, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9223122358322143, "step": 10450 }, { "completion_length": 548.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 548.4, "completions/max_terminated_length": 545.2, "completions/mean_length": 94.92890625, "completions/mean_terminated_length": 94.41330108642578, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009461761091211196, "frac_reward_zero_std": 0.86875, "grad_norm": 15.101473808288574, "kl": 5.828131073783152, "learning_rate": 4.725873015873016e-07, "loss": 0.0058, "num_tokens": 729641899.0, "reward": 0.2875, "reward_std": 0.10784234702587128, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9524088859558105, "step": 10455 }, { "completion_length": 310.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 89.61796875, "completions/mean_terminated_length": 89.61796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009466286084559456, "frac_reward_zero_std": 0.91875, "grad_norm": 3.5600807666778564, "kl": 2.1165300950407984, "learning_rate": 4.7254761904761905e-07, "loss": 0.0021, "num_tokens": 729955810.0, "reward": 0.3734375, "reward_std": 0.07086041122674942, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9138652443885803, "step": 10460 }, { "completion_length": 353.6, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/max_terminated_length": 353.6, "completions/mean_length": 81.12421875, "completions/mean_terminated_length": 81.12421875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009470811077907716, "frac_reward_zero_std": 0.9375, "grad_norm": 6.6729841232299805, "kl": 1.944022750435397, "learning_rate": 4.7250793650793645e-07, "loss": 0.0019, "num_tokens": 730254161.0, "reward": 0.3734375, "reward_std": 0.052135304734110835, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.909659993648529, "step": 10465 }, { "completion_length": 522.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 522.2, "completions/max_terminated_length": 403.6, "completions/mean_length": 93.4203125, "completions/mean_terminated_length": 92.38450927734375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009475336071255976, "frac_reward_zero_std": 0.93125, "grad_norm": 9.808263778686523, "kl": 1.7842438308929558, "learning_rate": 4.7246825396825396e-07, "loss": 0.0018, "num_tokens": 730572915.0, "reward": 0.296875, "reward_std": 0.05702554509043693, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9425210952758789, "step": 10470 }, { "completion_length": 364.8, "completions/clipped_ratio": 0.0, "completions/max_length": 364.8, "completions/max_terminated_length": 364.8, "completions/mean_length": 91.715625, "completions/mean_terminated_length": 91.715625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009479861064604234, "frac_reward_zero_std": 0.95, "grad_norm": 1.476691484451294, "kl": 1.089280199375935, "learning_rate": 4.724285714285714e-07, "loss": 0.0011, "num_tokens": 730889151.0, "reward": 0.2765625, "reward_std": 0.04424007833003998, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9599586963653565, "step": 10475 }, { "completion_length": 463.4, "completions/clipped_ratio": 0.0, "completions/max_length": 463.4, "completions/max_terminated_length": 463.4, "completions/mean_length": 95.41328125, "completions/mean_terminated_length": 95.41328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009484386057952494, "frac_reward_zero_std": 0.91875, "grad_norm": 6.188694000244141, "kl": 1.2310988399083727, "learning_rate": 4.7238888888888887e-07, "loss": 0.0012, "num_tokens": 731211240.0, "reward": 0.3453125, "reward_std": 0.0708604134619236, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9343530893325805, "step": 10480 }, { "completion_length": 361.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 361.2, "completions/max_terminated_length": 273.2, "completions/mean_length": 88.471875, "completions/mean_terminated_length": 87.9491195678711, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009488911051300754, "frac_reward_zero_std": 0.9375, "grad_norm": 7.1217803955078125, "kl": 1.328840662469156, "learning_rate": 4.723492063492063e-07, "loss": 0.0013, "num_tokens": 731522860.0, "reward": 0.4421875, "reward_std": 0.048295048624277116, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8955011129379272, "step": 10485 }, { "completion_length": 352.4, "completions/clipped_ratio": 0.0, "completions/max_length": 352.4, "completions/max_terminated_length": 352.4, "completions/mean_length": 92.74453125, "completions/mean_terminated_length": 92.74453125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009493436044649015, "frac_reward_zero_std": 0.9375, "grad_norm": 0.13675475120544434, "kl": 1.0077475850470363, "learning_rate": 4.723095238095238e-07, "loss": 0.001, "num_tokens": 731841301.0, "reward": 0.2828125, "reward_std": 0.049661026895046236, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9565699577331543, "step": 10490 }, { "completion_length": 400.8, "completions/clipped_ratio": 0.0, "completions/max_length": 400.8, "completions/max_terminated_length": 400.8, "completions/mean_length": 89.93125, "completions/mean_terminated_length": 89.93125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009497961037997275, "frac_reward_zero_std": 0.9375, "grad_norm": 7.683217525482178, "kl": 0.5725033360300585, "learning_rate": 4.722698412698413e-07, "loss": 0.0006, "num_tokens": 732155205.0, "reward": 0.3515625, "reward_std": 0.05034499615430832, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9346807718276977, "step": 10495 }, { "completion_length": 271.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 87.44765625, "completions/mean_terminated_length": 87.44765625, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.009502486031345535, "frac_reward_zero_std": 0.94375, "grad_norm": 10.238961219787598, "kl": 2.265333698550239, "learning_rate": 4.722301587301587e-07, "loss": 0.0023, "num_tokens": 732465354.0, "reward": 0.3875, "reward_std": 0.04955465085804463, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9079314708709717, "step": 10500 }, { "completion_length": 504.2, "completions/clipped_ratio": 0.0, "completions/max_length": 504.2, "completions/max_terminated_length": 504.2, "completions/mean_length": 92.88359375, "completions/mean_terminated_length": 92.88359375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009507011024693793, "frac_reward_zero_std": 0.93125, "grad_norm": 15.258447647094727, "kl": 1.8091089787427337, "learning_rate": 4.721904761904762e-07, "loss": 0.0018, "num_tokens": 732783133.0, "reward": 0.4765625, "reward_std": 0.06433116048574447, "rewards/verify_chess_move/mean": 0.4765625, "rewards/verify_chess_move/std": 0.8658855319023132, "step": 10505 }, { "completion_length": 369.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 369.8, "completions/max_terminated_length": 344.8, "completions/mean_length": 90.70703125, "completions/mean_terminated_length": 90.17919158935547, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009511536018042053, "frac_reward_zero_std": 0.925, "grad_norm": 14.292963027954102, "kl": 2.4340159742860124, "learning_rate": 4.7215079365079365e-07, "loss": 0.0024, "num_tokens": 733098766.0, "reward": 0.2578125, "reward_std": 0.06349491141736507, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.963598108291626, "step": 10510 }, { "completion_length": 422.6, "completions/clipped_ratio": 0.0, "completions/max_length": 422.6, "completions/max_terminated_length": 422.6, "completions/mean_length": 89.85625, "completions/mean_terminated_length": 89.85625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009516061011390313, "frac_reward_zero_std": 0.975, "grad_norm": 1.2726664543151855, "kl": 1.0779051195830107, "learning_rate": 4.7211111111111105e-07, "loss": 0.0011, "num_tokens": 733411766.0, "reward": 0.2578125, "reward_std": 0.0247236467897892, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9615427732467652, "step": 10515 }, { "completion_length": 571.8, "completions/clipped_ratio": 0.0, "completions/max_length": 571.8, "completions/max_terminated_length": 571.8, "completions/mean_length": 92.95, "completions/mean_terminated_length": 92.95, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009520586004738573, "frac_reward_zero_std": 0.91875, "grad_norm": 0.8313135504722595, "kl": 1.860772559850011, "learning_rate": 4.7207142857142855e-07, "loss": 0.0019, "num_tokens": 733728550.0, "reward": 0.3265625, "reward_std": 0.07243953607976436, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9385124325752259, "step": 10520 }, { "completion_length": 356.6, "completions/clipped_ratio": 0.0, "completions/max_length": 356.6, "completions/max_terminated_length": 356.6, "completions/mean_length": 88.84765625, "completions/mean_terminated_length": 88.84765625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009525110998086833, "frac_reward_zero_std": 0.9, "grad_norm": 9.457674026489258, "kl": 1.7539798269281164, "learning_rate": 4.72031746031746e-07, "loss": 0.0018, "num_tokens": 734039707.0, "reward": 0.33125, "reward_std": 0.08979768231511116, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9401751160621643, "step": 10525 }, { "completion_length": 478.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 478.8, "completions/max_terminated_length": 424.6, "completions/mean_length": 96.05546875, "completions/mean_terminated_length": 95.0262664794922, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009529635991435092, "frac_reward_zero_std": 0.925, "grad_norm": 10.77667236328125, "kl": 3.736101878504269, "learning_rate": 4.719920634920635e-07, "loss": 0.0037, "num_tokens": 734363882.0, "reward": 0.2328125, "reward_std": 0.06891625449061393, "rewards/verify_chess_move/mean": 0.2328125, "rewards/verify_chess_move/std": 0.9651005625724792, "step": 10530 }, { "completion_length": 411.8, "completions/clipped_ratio": 0.0, "completions/max_length": 411.8, "completions/max_terminated_length": 411.8, "completions/mean_length": 86.78671875, "completions/mean_terminated_length": 86.78671875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009534160984783352, "frac_reward_zero_std": 0.88125, "grad_norm": 8.915491104125977, "kl": 2.048071196873207, "learning_rate": 4.719523809523809e-07, "loss": 0.002, "num_tokens": 734671841.0, "reward": 0.284375, "reward_std": 0.10194861441850663, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9588756084442138, "step": 10535 }, { "completion_length": 507.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 507.0, "completions/max_terminated_length": 460.6, "completions/mean_length": 96.9078125, "completions/mean_terminated_length": 95.3479507446289, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009538685978131612, "frac_reward_zero_std": 0.90625, "grad_norm": 6.427971363067627, "kl": 1.5609215404372663, "learning_rate": 4.7191269841269837e-07, "loss": 0.0016, "num_tokens": 734995555.0, "reward": 0.3734375, "reward_std": 0.08448310755193233, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9240176796913147, "step": 10540 }, { "completion_length": 441.2, "completions/clipped_ratio": 0.0, "completions/max_length": 441.2, "completions/max_terminated_length": 441.2, "completions/mean_length": 94.278125, "completions/mean_terminated_length": 94.278125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009543210971479872, "frac_reward_zero_std": 0.88125, "grad_norm": 1.4974218606948853, "kl": 2.5221331500913946, "learning_rate": 4.718730158730159e-07, "loss": 0.0025, "num_tokens": 735316551.0, "reward": 0.296875, "reward_std": 0.10126660317182541, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9349820375442505, "step": 10545 }, { "completion_length": 385.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 88.11796875, "completions/mean_terminated_length": 88.11796875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009547735964828132, "frac_reward_zero_std": 0.94375, "grad_norm": 3.5037388801574707, "kl": 1.3773691721376964, "learning_rate": 4.7183333333333333e-07, "loss": 0.0014, "num_tokens": 735628206.0, "reward": 0.365625, "reward_std": 0.04887068159878254, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9067054033279419, "step": 10550 }, { "completion_length": 429.8, "completions/clipped_ratio": 0.0, "completions/max_length": 429.8, "completions/max_terminated_length": 429.8, "completions/mean_length": 88.5453125, "completions/mean_terminated_length": 88.5453125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009552260958176392, "frac_reward_zero_std": 0.9125, "grad_norm": 9.803321838378906, "kl": 2.51144334343262, "learning_rate": 4.717936507936508e-07, "loss": 0.0025, "num_tokens": 735939272.0, "reward": 0.2828125, "reward_std": 0.07575163505971431, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9581810235977173, "step": 10555 }, { "completion_length": 386.4, "completions/clipped_ratio": 0.0, "completions/max_length": 386.4, "completions/max_terminated_length": 386.4, "completions/mean_length": 94.49921875, "completions/mean_terminated_length": 94.49921875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00955678595152465, "frac_reward_zero_std": 0.93125, "grad_norm": 12.258227348327637, "kl": 2.627368549676612, "learning_rate": 4.7175396825396824e-07, "loss": 0.0026, "num_tokens": 736260951.0, "reward": 0.3328125, "reward_std": 0.06112641990184784, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9356040954589844, "step": 10560 }, { "completion_length": 339.8, "completions/clipped_ratio": 0.0, "completions/max_length": 339.8, "completions/max_terminated_length": 339.8, "completions/mean_length": 87.15703125, "completions/mean_terminated_length": 87.15703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00956131094487291, "frac_reward_zero_std": 0.9625, "grad_norm": 15.840847969055176, "kl": 0.8935407684417441, "learning_rate": 4.717142857142857e-07, "loss": 0.0009, "num_tokens": 736570216.0, "reward": 0.3640625, "reward_std": 0.03287851139903068, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.925860333442688, "step": 10565 }, { "completion_length": 291.2, "completions/clipped_ratio": 0.0, "completions/max_length": 291.2, "completions/max_terminated_length": 291.2, "completions/mean_length": 82.81171875, "completions/mean_terminated_length": 82.81171875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.00956583593822117, "frac_reward_zero_std": 0.9125, "grad_norm": 4.358828544616699, "kl": 2.9162721105618403, "learning_rate": 4.7167460317460315e-07, "loss": 0.0029, "num_tokens": 736873535.0, "reward": 0.2390625, "reward_std": 0.06891782097518444, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9529770612716675, "step": 10570 }, { "completion_length": 470.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 470.8, "completions/max_terminated_length": 418.2, "completions/mean_length": 90.44609375, "completions/mean_terminated_length": 89.9143295288086, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009570360931569431, "frac_reward_zero_std": 0.9125, "grad_norm": 6.047464370727539, "kl": 4.607542378385551, "learning_rate": 4.716349206349206e-07, "loss": 0.0046, "num_tokens": 737186058.0, "reward": 0.3515625, "reward_std": 0.08117101229727268, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9330167651176453, "step": 10575 }, { "completion_length": 479.6, "completions/clipped_ratio": 0.0, "completions/max_length": 479.6, "completions/max_terminated_length": 479.6, "completions/mean_length": 93.27734375, "completions/mean_terminated_length": 93.27734375, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.009574885924917691, "frac_reward_zero_std": 0.925, "grad_norm": 3.213575839996338, "kl": 2.162857530871406, "learning_rate": 4.715952380952381e-07, "loss": 0.0022, "num_tokens": 737505821.0, "reward": 0.3046875, "reward_std": 0.06691279634833336, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9440172672271728, "step": 10580 }, { "completion_length": 365.2, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/max_terminated_length": 365.2, "completions/mean_length": 92.2609375, "completions/mean_terminated_length": 92.2609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00957941091826595, "frac_reward_zero_std": 0.9375, "grad_norm": 0.11267489194869995, "kl": 2.1488574577495454, "learning_rate": 4.7155555555555556e-07, "loss": 0.0021, "num_tokens": 737822067.0, "reward": 0.36875, "reward_std": 0.05465705506503582, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9088731646537781, "step": 10585 }, { "completion_length": 432.4, "completions/clipped_ratio": 0.0, "completions/max_length": 432.4, "completions/max_terminated_length": 432.4, "completions/mean_length": 91.3828125, "completions/mean_terminated_length": 91.3828125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00958393591161421, "frac_reward_zero_std": 0.94375, "grad_norm": 3.0230653285980225, "kl": 2.5596261804224922, "learning_rate": 4.7151587301587296e-07, "loss": 0.0026, "num_tokens": 738138901.0, "reward": 0.3625, "reward_std": 0.050920627638697626, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9230699658393859, "step": 10590 }, { "completion_length": 378.6, "completions/clipped_ratio": 0.0, "completions/max_length": 378.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 91.81484375, "completions/mean_terminated_length": 91.81484375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.00958846090496247, "frac_reward_zero_std": 0.91875, "grad_norm": 3.6471543312072754, "kl": 1.8989103768020867, "learning_rate": 4.7147619047619047e-07, "loss": 0.0019, "num_tokens": 738456416.0, "reward": 0.2859375, "reward_std": 0.07632824331521988, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9541390061378479, "step": 10595 }, { "completion_length": 523.6, "completions/clipped_ratio": 0.0, "completions/max_length": 523.6, "completions/max_terminated_length": 523.6, "completions/mean_length": 92.90234375, "completions/mean_terminated_length": 92.90234375, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "epoch": 0.00959298589831073, "frac_reward_zero_std": 0.91875, "grad_norm": 13.223648071289062, "kl": 11.951647142355796, "learning_rate": 4.714365079365079e-07, "loss": 0.012, "num_tokens": 738775531.0, "reward": 0.3140625, "reward_std": 0.06586536094546318, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9281349897384643, "step": 10600 }, { "completion_length": 291.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 91.56484375, "completions/mean_terminated_length": 91.56484375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00959751089165899, "frac_reward_zero_std": 0.94375, "grad_norm": 0.26838675141334534, "kl": 2.4930703808553516, "learning_rate": 4.713968253968253e-07, "loss": 0.0025, "num_tokens": 739092622.0, "reward": 0.2765625, "reward_std": 0.04818769171833992, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9591499447822571, "step": 10605 }, { "completion_length": 441.6, "completions/clipped_ratio": 0.0, "completions/max_length": 441.6, "completions/max_terminated_length": 441.6, "completions/mean_length": 89.80390625, "completions/mean_terminated_length": 89.80390625, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.00960203588500725, "frac_reward_zero_std": 0.95, "grad_norm": 9.066498756408691, "kl": 5.365427164756693, "learning_rate": 4.7135714285714283e-07, "loss": 0.0054, "num_tokens": 739405587.0, "reward": 0.434375, "reward_std": 0.04171832874417305, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.8640825271606445, "step": 10610 }, { "completion_length": 354.2, "completions/clipped_ratio": 0.0, "completions/max_length": 354.2, "completions/max_terminated_length": 354.2, "completions/mean_length": 90.46484375, "completions/mean_terminated_length": 90.46484375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009606560878355508, "frac_reward_zero_std": 0.90625, "grad_norm": 7.3203630447387695, "kl": 9.140153201273643, "learning_rate": 4.713174603174603e-07, "loss": 0.0091, "num_tokens": 739721158.0, "reward": 0.4390625, "reward_std": 0.08448311015963554, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8797330617904663, "step": 10615 }, { "completion_length": 370.2, "completions/clipped_ratio": 0.0, "completions/max_length": 370.2, "completions/max_terminated_length": 370.2, "completions/mean_length": 88.1203125, "completions/mean_terminated_length": 88.1203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009611085871703768, "frac_reward_zero_std": 0.9125, "grad_norm": 7.379371643066406, "kl": 3.9515474789077416, "learning_rate": 4.712777777777778e-07, "loss": 0.004, "num_tokens": 740033320.0, "reward": 0.36875, "reward_std": 0.08574271202087402, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.8999161124229431, "step": 10620 }, { "completion_length": 509.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 509.8, "completions/max_terminated_length": 473.8, "completions/mean_length": 102.5640625, "completions/mean_terminated_length": 102.05289916992187, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009615610865052028, "frac_reward_zero_std": 0.8875, "grad_norm": 8.318771362304688, "kl": 6.890549506130628, "learning_rate": 4.712380952380952e-07, "loss": 0.0069, "num_tokens": 740367362.0, "reward": 0.3390625, "reward_std": 0.09979229122400284, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9024008512496948, "step": 10625 }, { "completion_length": 347.2, "completions/clipped_ratio": 0.0, "completions/max_length": 347.2, "completions/max_terminated_length": 347.2, "completions/mean_length": 94.24375, "completions/mean_terminated_length": 94.24375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009620135858400288, "frac_reward_zero_std": 0.93125, "grad_norm": 13.743973731994629, "kl": 4.988717193366028, "learning_rate": 4.7119841269841265e-07, "loss": 0.005, "num_tokens": 740688594.0, "reward": 0.2265625, "reward_std": 0.057710496708750725, "rewards/verify_chess_move/mean": 0.2265625, "rewards/verify_chess_move/std": 0.9553578138351441, "step": 10630 }, { "completion_length": 394.6, "completions/clipped_ratio": 0.0, "completions/max_length": 394.6, "completions/max_terminated_length": 394.6, "completions/mean_length": 88.07265625, "completions/mean_terminated_length": 88.07265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009624660851748549, "frac_reward_zero_std": 0.91875, "grad_norm": 2.6328861713409424, "kl": 3.577403227984905, "learning_rate": 4.7115873015873015e-07, "loss": 0.0036, "num_tokens": 740999367.0, "reward": 0.3984375, "reward_std": 0.07312350533902645, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9164190888404846, "step": 10635 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 86.803125, "completions/mean_terminated_length": 86.803125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009629185845096807, "frac_reward_zero_std": 0.95, "grad_norm": 6.307389259338379, "kl": 2.7120654010446743, "learning_rate": 4.711190476190476e-07, "loss": 0.0027, "num_tokens": 741307883.0, "reward": 0.2046875, "reward_std": 0.0387722447514534, "rewards/verify_chess_move/mean": 0.2046875, "rewards/verify_chess_move/std": 0.9726601839065552, "step": 10640 }, { "completion_length": 558.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 558.8, "completions/max_terminated_length": 400.4, "completions/mean_length": 82.66953125, "completions/mean_terminated_length": 81.6038833618164, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009633710838445067, "frac_reward_zero_std": 0.88125, "grad_norm": 17.073495864868164, "kl": 4.044311333750374, "learning_rate": 4.7107936507936506e-07, "loss": 0.004, "num_tokens": 741609284.0, "reward": 0.309375, "reward_std": 0.10032299906015396, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9435824155807495, "step": 10645 }, { "completion_length": 430.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 97.6296875, "completions/mean_terminated_length": 97.6296875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009638235831793327, "frac_reward_zero_std": 0.925, "grad_norm": 7.121683120727539, "kl": 1.217629813158419, "learning_rate": 4.710396825396825e-07, "loss": 0.0012, "num_tokens": 741937666.0, "reward": 0.1640625, "reward_std": 0.06849192045629024, "rewards/verify_chess_move/mean": 0.1640625, "rewards/verify_chess_move/std": 0.9818051934242249, "step": 10650 }, { "completion_length": 315.2, "completions/clipped_ratio": 0.0, "completions/max_length": 315.2, "completions/max_terminated_length": 315.2, "completions/mean_length": 91.04375, "completions/mean_terminated_length": 91.04375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.009642760825141587, "frac_reward_zero_std": 0.9125, "grad_norm": 4.585793495178223, "kl": 0.9931852858746424, "learning_rate": 4.7099999999999997e-07, "loss": 0.001, "num_tokens": 742254458.0, "reward": 0.3109375, "reward_std": 0.07869673371315003, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9286901950836182, "step": 10655 }, { "completion_length": 507.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 507.6, "completions/max_terminated_length": 435.6, "completions/mean_length": 86.84453125, "completions/mean_terminated_length": 86.31315002441406, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009647285818489847, "frac_reward_zero_std": 0.91875, "grad_norm": 1.4530442953109741, "kl": 0.5275932323769666, "learning_rate": 4.709603174603174e-07, "loss": 0.0005, "num_tokens": 742560939.0, "reward": 0.48125, "reward_std": 0.07222736924886704, "rewards/verify_chess_move/mean": 0.48125, "rewards/verify_chess_move/std": 0.87367844581604, "step": 10660 }, { "completion_length": 307.6, "completions/clipped_ratio": 0.0, "completions/max_length": 307.6, "completions/max_terminated_length": 307.6, "completions/mean_length": 87.334375, "completions/mean_terminated_length": 87.334375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009651810811838107, "frac_reward_zero_std": 0.9625, "grad_norm": 3.0670063495635986, "kl": 0.3987317699706182, "learning_rate": 4.709206349206349e-07, "loss": 0.0004, "num_tokens": 742868391.0, "reward": 0.4453125, "reward_std": 0.030617379397153855, "rewards/verify_chess_move/mean": 0.4453125, "rewards/verify_chess_move/std": 0.8656489968299865, "step": 10665 }, { "completion_length": 513.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 513.6, "completions/max_terminated_length": 411.2, "completions/mean_length": 95.9609375, "completions/mean_terminated_length": 95.43199005126954, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009656335805186366, "frac_reward_zero_std": 0.95, "grad_norm": 3.227088451385498, "kl": 0.6814665497629904, "learning_rate": 4.708809523809524e-07, "loss": 0.0007, "num_tokens": 743192597.0, "reward": 0.3953125, "reward_std": 0.04287213981151581, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9191532492637634, "step": 10670 }, { "completion_length": 546.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 546.4, "completions/max_terminated_length": 529.2, "completions/mean_length": 92.49453125, "completions/mean_terminated_length": 91.96383666992188, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009660860798534626, "frac_reward_zero_std": 0.9375, "grad_norm": 7.866879463195801, "kl": 0.8195810538483783, "learning_rate": 4.7084126984126984e-07, "loss": 0.0008, "num_tokens": 743510486.0, "reward": 0.2046875, "reward_std": 0.0569191712886095, "rewards/verify_chess_move/mean": 0.2046875, "rewards/verify_chess_move/std": 0.9709741353988648, "step": 10675 }, { "completion_length": 485.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.4, "completions/max_terminated_length": 419.2, "completions/mean_length": 92.84296875, "completions/mean_terminated_length": 92.31090545654297, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009665385791882886, "frac_reward_zero_std": 0.8875, "grad_norm": 8.568583488464355, "kl": 2.8162332961917853, "learning_rate": 4.7080158730158724e-07, "loss": 0.0028, "num_tokens": 743827917.0, "reward": 0.2234375, "reward_std": 0.09479527845978737, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9699979782104492, "step": 10680 }, { "completion_length": 371.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 371.0, "completions/max_terminated_length": 302.8, "completions/mean_length": 90.0171875, "completions/mean_terminated_length": 89.47644958496093, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009669910785231146, "frac_reward_zero_std": 0.93125, "grad_norm": 5.148714065551758, "kl": 2.610021020693239, "learning_rate": 4.7076190476190475e-07, "loss": 0.0026, "num_tokens": 744142315.0, "reward": 0.39375, "reward_std": 0.06364817172288895, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.9037176012992859, "step": 10685 }, { "completion_length": 411.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 411.2, "completions/max_terminated_length": 396.4, "completions/mean_length": 91.72265625, "completions/mean_terminated_length": 90.65913543701171, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009674435778579406, "frac_reward_zero_std": 0.9375, "grad_norm": 6.578335285186768, "kl": 1.170845265709795, "learning_rate": 4.707222222222222e-07, "loss": 0.0012, "num_tokens": 744460192.0, "reward": 0.390625, "reward_std": 0.0605937510728836, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9183056235313416, "step": 10690 }, { "completion_length": 390.2, "completions/clipped_ratio": 0.0, "completions/max_length": 390.2, "completions/max_terminated_length": 390.2, "completions/mean_length": 87.55859375, "completions/mean_terminated_length": 87.55859375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009678960771927664, "frac_reward_zero_std": 0.89375, "grad_norm": 2.3112266063690186, "kl": 0.8650856414111331, "learning_rate": 4.706825396825397e-07, "loss": 0.0009, "num_tokens": 744769219.0, "reward": 0.38125, "reward_std": 0.0887502446770668, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9219038724899292, "step": 10695 }, { "completion_length": 311.2, "completions/clipped_ratio": 0.0, "completions/max_length": 311.2, "completions/max_terminated_length": 311.2, "completions/mean_length": 84.95234375, "completions/mean_terminated_length": 84.95234375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009683485765275925, "frac_reward_zero_std": 0.9375, "grad_norm": 13.589829444885254, "kl": 0.8151183856418356, "learning_rate": 4.706428571428571e-07, "loss": 0.0008, "num_tokens": 745073510.0, "reward": 0.403125, "reward_std": 0.052607111260294916, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9055041313171387, "step": 10700 }, { "completion_length": 427.2, "completions/clipped_ratio": 0.0, "completions/max_length": 427.2, "completions/max_terminated_length": 427.2, "completions/mean_length": 91.48046875, "completions/mean_terminated_length": 91.48046875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009688010758624185, "frac_reward_zero_std": 0.9375, "grad_norm": 3.3518762588500977, "kl": 0.6470237508183345, "learning_rate": 4.7060317460317456e-07, "loss": 0.0006, "num_tokens": 745388797.0, "reward": 0.3578125, "reward_std": 0.05418525226414204, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9186300992965698, "step": 10705 }, { "completion_length": 382.4, "completions/clipped_ratio": 0.0, "completions/max_length": 382.4, "completions/max_terminated_length": 382.4, "completions/mean_length": 89.2984375, "completions/mean_terminated_length": 89.2984375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009692535751972445, "frac_reward_zero_std": 0.925, "grad_norm": 3.713643789291382, "kl": 0.264885422331281, "learning_rate": 4.7056349206349207e-07, "loss": 0.0003, "num_tokens": 745700571.0, "reward": 0.44375, "reward_std": 0.06622980833053589, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8923515319824219, "step": 10710 }, { "completion_length": 339.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 89.22421875, "completions/mean_terminated_length": 89.22421875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009697060745320705, "frac_reward_zero_std": 0.93125, "grad_norm": 7.79880428314209, "kl": 0.359818573971279, "learning_rate": 4.7052380952380947e-07, "loss": 0.0004, "num_tokens": 746013410.0, "reward": 0.3484375, "reward_std": 0.06044245213270187, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9190622091293335, "step": 10715 }, { "completion_length": 442.4, "completions/clipped_ratio": 0.0, "completions/max_length": 442.4, "completions/max_terminated_length": 442.4, "completions/mean_length": 94.68515625, "completions/mean_terminated_length": 94.68515625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009701585738668965, "frac_reward_zero_std": 0.94375, "grad_norm": 0.006058593280613422, "kl": 0.2734208663343452, "learning_rate": 4.70484126984127e-07, "loss": 0.0003, "num_tokens": 746333111.0, "reward": 0.2453125, "reward_std": 0.05181676521897316, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9623651504516602, "step": 10720 }, { "completion_length": 517.8, "completions/clipped_ratio": 0.0, "completions/max_length": 517.8, "completions/max_terminated_length": 517.8, "completions/mean_length": 101.621875, "completions/mean_terminated_length": 101.621875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009706110732017223, "frac_reward_zero_std": 0.89375, "grad_norm": 6.41536283493042, "kl": 0.32373012211173774, "learning_rate": 4.7044444444444443e-07, "loss": 0.0003, "num_tokens": 746664875.0, "reward": 0.3296875, "reward_std": 0.08990601897239685, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9183055877685546, "step": 10725 }, { "completion_length": 375.8, "completions/clipped_ratio": 0.0, "completions/max_length": 375.8, "completions/max_terminated_length": 375.8, "completions/mean_length": 84.31015625, "completions/mean_terminated_length": 84.31015625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009710635725365483, "frac_reward_zero_std": 0.93125, "grad_norm": 0.0972447320818901, "kl": 0.43270215567899867, "learning_rate": 4.704047619047619e-07, "loss": 0.0004, "num_tokens": 746968848.0, "reward": 0.2859375, "reward_std": 0.05634255781769752, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9378196716308593, "step": 10730 }, { "completion_length": 453.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 453.4, "completions/max_terminated_length": 356.4, "completions/mean_length": 95.75078125, "completions/mean_terminated_length": 95.23748321533203, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009715160718713743, "frac_reward_zero_std": 0.91875, "grad_norm": 5.920107364654541, "kl": 0.246792512643151, "learning_rate": 4.7036507936507934e-07, "loss": 0.0002, "num_tokens": 747292121.0, "reward": 0.3578125, "reward_std": 0.06907010227441787, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9311139225959778, "step": 10735 }, { "completion_length": 488.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 488.8, "completions/max_terminated_length": 432.2, "completions/mean_length": 94.35625, "completions/mean_terminated_length": 93.83089904785156, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009719685712062004, "frac_reward_zero_std": 0.91875, "grad_norm": 6.478116989135742, "kl": 0.7886725695570931, "learning_rate": 4.703253968253968e-07, "loss": 0.0008, "num_tokens": 747612377.0, "reward": 0.2765625, "reward_std": 0.07359432689845562, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.952283239364624, "step": 10740 }, { "completion_length": 291.4, "completions/clipped_ratio": 0.0, "completions/max_length": 291.4, "completions/max_terminated_length": 291.4, "completions/mean_length": 82.13671875, "completions/mean_terminated_length": 82.13671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009724210705410264, "frac_reward_zero_std": 0.94375, "grad_norm": 0.5084100365638733, "kl": 1.2049409435363487, "learning_rate": 4.702857142857143e-07, "loss": 0.0012, "num_tokens": 747912248.0, "reward": 0.5390625, "reward_std": 0.05070846229791641, "rewards/verify_chess_move/mean": 0.5390625, "rewards/verify_chess_move/std": 0.8292115926742554, "step": 10745 }, { "completion_length": 434.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 93.50234375, "completions/mean_terminated_length": 93.50234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009728735698758522, "frac_reward_zero_std": 0.91875, "grad_norm": 3.8792922496795654, "kl": 1.1406287449761294, "learning_rate": 4.702460317460317e-07, "loss": 0.0011, "num_tokens": 748230115.0, "reward": 0.44375, "reward_std": 0.07112102918326854, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8966200828552247, "step": 10750 }, { "completion_length": 383.8, "completions/clipped_ratio": 0.0, "completions/max_length": 383.8, "completions/max_terminated_length": 383.8, "completions/mean_length": 94.32734375, "completions/mean_terminated_length": 94.32734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009733260692106782, "frac_reward_zero_std": 0.90625, "grad_norm": 7.940812110900879, "kl": 1.6360866630682722, "learning_rate": 4.7020634920634915e-07, "loss": 0.0016, "num_tokens": 748552030.0, "reward": 0.2359375, "reward_std": 0.08017203062772751, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9648883461952209, "step": 10755 }, { "completion_length": 284.8, "completions/clipped_ratio": 0.0, "completions/max_length": 284.8, "completions/max_terminated_length": 284.8, "completions/mean_length": 87.7078125, "completions/mean_terminated_length": 87.7078125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009737785685455042, "frac_reward_zero_std": 0.95, "grad_norm": 7.229598045349121, "kl": 2.3015170769300313, "learning_rate": 4.7016666666666666e-07, "loss": 0.0023, "num_tokens": 748860968.0, "reward": 0.4171875, "reward_std": 0.04718517921864986, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.9048888206481933, "step": 10760 }, { "completion_length": 616.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 616.2, "completions/max_terminated_length": 588.8, "completions/mean_length": 97.09609375, "completions/mean_terminated_length": 96.56385803222656, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009742310678803302, "frac_reward_zero_std": 0.89375, "grad_norm": 3.924137830734253, "kl": 4.337501402455382, "learning_rate": 4.701269841269841e-07, "loss": 0.0043, "num_tokens": 749185875.0, "reward": 0.2765625, "reward_std": 0.0885380782186985, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9611238598823547, "step": 10765 }, { "completion_length": 467.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 467.4, "completions/max_terminated_length": 328.4, "completions/mean_length": 88.1984375, "completions/mean_terminated_length": 87.12432250976562, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009746835672151562, "frac_reward_zero_std": 0.925, "grad_norm": 3.3909382820129395, "kl": 3.9230623151524924, "learning_rate": 4.7008730158730157e-07, "loss": 0.0039, "num_tokens": 749496681.0, "reward": 0.3609375, "reward_std": 0.06281290203332901, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.914433753490448, "step": 10770 }, { "completion_length": 514.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 514.0, "completions/max_terminated_length": 397.2, "completions/mean_length": 91.4, "completions/mean_terminated_length": 90.3417236328125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.009751360665499822, "frac_reward_zero_std": 0.91875, "grad_norm": 1.5572922229766846, "kl": 3.3008852586033752, "learning_rate": 4.70047619047619e-07, "loss": 0.0033, "num_tokens": 749810921.0, "reward": 0.478125, "reward_std": 0.06633716486394406, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8712308645248413, "step": 10775 }, { "completion_length": 423.4, "completions/clipped_ratio": 0.0, "completions/max_length": 423.4, "completions/max_terminated_length": 423.4, "completions/mean_length": 90.6859375, "completions/mean_terminated_length": 90.6859375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.00975588565884808, "frac_reward_zero_std": 0.925, "grad_norm": 4.674034595489502, "kl": 2.3417275680112652, "learning_rate": 4.700079365079365e-07, "loss": 0.0023, "num_tokens": 750124583.0, "reward": 0.40625, "reward_std": 0.06260073482990265, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9123579382896423, "step": 10780 }, { "completion_length": 347.4, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/max_terminated_length": 347.4, "completions/mean_length": 94.846875, "completions/mean_terminated_length": 94.846875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009760410652196341, "frac_reward_zero_std": 0.8875, "grad_norm": 2.2945775985717773, "kl": 2.4986191820818933, "learning_rate": 4.69968253968254e-07, "loss": 0.0025, "num_tokens": 750446715.0, "reward": 0.31875, "reward_std": 0.09816569313406945, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9268819689750671, "step": 10785 }, { "completion_length": 485.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.0, "completions/max_terminated_length": 391.8, "completions/mean_length": 81.52890625, "completions/mean_terminated_length": 80.98881225585937, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009764935645544601, "frac_reward_zero_std": 0.9375, "grad_norm": 11.770450592041016, "kl": 1.6357515058130958, "learning_rate": 4.699285714285714e-07, "loss": 0.0016, "num_tokens": 750746056.0, "reward": 0.284375, "reward_std": 0.0592277716845274, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9569362878799439, "step": 10790 }, { "completion_length": 416.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 416.2, "completions/max_terminated_length": 322.0, "completions/mean_length": 89.94921875, "completions/mean_terminated_length": 89.41929626464844, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009769460638892861, "frac_reward_zero_std": 0.8875, "grad_norm": 4.693594932556152, "kl": 5.291409432562068, "learning_rate": 4.698888888888889e-07, "loss": 0.0053, "num_tokens": 751060719.0, "reward": 0.3984375, "reward_std": 0.0939001239836216, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9046837449073791, "step": 10795 }, { "completion_length": 390.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 92.90390625, "completions/mean_terminated_length": 92.90390625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009773985632241121, "frac_reward_zero_std": 0.9375, "grad_norm": 1.154194712638855, "kl": 2.4651283577783034, "learning_rate": 4.6984920634920635e-07, "loss": 0.0025, "num_tokens": 751379020.0, "reward": 0.390625, "reward_std": 0.052607108280062674, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9210763335227966, "step": 10800 }, { "completion_length": 407.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 91.74921875, "completions/mean_terminated_length": 91.74921875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00977851062558938, "frac_reward_zero_std": 0.93125, "grad_norm": 5.844335556030273, "kl": 4.697429664549418, "learning_rate": 4.6980952380952375e-07, "loss": 0.0047, "num_tokens": 751695163.0, "reward": 0.321875, "reward_std": 0.06018379479646683, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.936281931400299, "step": 10805 }, { "completion_length": 316.2, "completions/clipped_ratio": 0.0, "completions/max_length": 316.2, "completions/max_terminated_length": 316.2, "completions/mean_length": 85.31796875, "completions/mean_terminated_length": 85.31796875, "completions/min_length": 28.6, "completions/min_terminated_length": 28.6, "epoch": 0.00978303561893764, "frac_reward_zero_std": 0.90625, "grad_norm": 8.425512313842773, "kl": 6.902593838027679, "learning_rate": 4.6976984126984125e-07, "loss": 0.0069, "num_tokens": 752000802.0, "reward": 0.4046875, "reward_std": 0.08243316113948822, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.8938406348228455, "step": 10810 }, { "completion_length": 373.6, "completions/clipped_ratio": 0.0, "completions/max_length": 373.6, "completions/max_terminated_length": 373.6, "completions/mean_length": 81.27578125, "completions/mean_terminated_length": 81.27578125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0097875606122859, "frac_reward_zero_std": 0.925, "grad_norm": 1.2969062328338623, "kl": 4.179346149018966, "learning_rate": 4.697301587301587e-07, "loss": 0.0042, "num_tokens": 752299307.0, "reward": 0.3328125, "reward_std": 0.06191774867475033, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9405660033226013, "step": 10815 }, { "completion_length": 494.2, "completions/clipped_ratio": 0.0, "completions/max_length": 494.2, "completions/max_terminated_length": 494.2, "completions/mean_length": 92.77265625, "completions/mean_terminated_length": 92.77265625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00979208560563416, "frac_reward_zero_std": 0.9, "grad_norm": 5.085630416870117, "kl": 3.7454230630537495, "learning_rate": 4.696904761904762e-07, "loss": 0.0037, "num_tokens": 752616424.0, "reward": 0.20625, "reward_std": 0.08659293949604034, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9591294050216674, "step": 10820 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 94.928125, "completions/mean_terminated_length": 94.928125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00979661059898242, "frac_reward_zero_std": 0.925, "grad_norm": 8.652195930480957, "kl": 4.188598714955151, "learning_rate": 4.696507936507936e-07, "loss": 0.0042, "num_tokens": 752939156.0, "reward": 0.2265625, "reward_std": 0.06212893426418305, "rewards/verify_chess_move/mean": 0.2265625, "rewards/verify_chess_move/std": 0.9702528357505799, "step": 10825 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 89.125, "completions/mean_terminated_length": 89.125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.00980113559233068, "frac_reward_zero_std": 0.9125, "grad_norm": 3.600166082382202, "kl": 9.76799497581087, "learning_rate": 4.6961111111111107e-07, "loss": 0.0098, "num_tokens": 753251572.0, "reward": 0.2828125, "reward_std": 0.07433720231056214, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9423753023147583, "step": 10830 }, { "completion_length": 321.6, "completions/clipped_ratio": 0.0, "completions/max_length": 321.6, "completions/max_terminated_length": 321.6, "completions/mean_length": 86.784375, "completions/mean_terminated_length": 86.784375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009805660585678938, "frac_reward_zero_std": 0.9375, "grad_norm": 6.639320373535156, "kl": 3.0161149953259154, "learning_rate": 4.695714285714286e-07, "loss": 0.003, "num_tokens": 753561664.0, "reward": 0.396875, "reward_std": 0.05307793281972408, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9111817955970765, "step": 10835 }, { "completion_length": 385.2, "completions/clipped_ratio": 0.0, "completions/max_length": 385.2, "completions/max_terminated_length": 385.2, "completions/mean_length": 86.3609375, "completions/mean_terminated_length": 86.3609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009810185579027198, "frac_reward_zero_std": 0.93125, "grad_norm": 3.3764288425445557, "kl": 1.8751381419715472, "learning_rate": 4.69531746031746e-07, "loss": 0.0019, "num_tokens": 753870390.0, "reward": 0.3265625, "reward_std": 0.06112642176449299, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.937002158164978, "step": 10840 }, { "completion_length": 441.2, "completions/clipped_ratio": 0.0, "completions/max_length": 441.2, "completions/max_terminated_length": 441.2, "completions/mean_length": 89.484375, "completions/mean_terminated_length": 89.484375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009814710572375459, "frac_reward_zero_std": 0.925, "grad_norm": 9.470501899719238, "kl": 2.496487027546391, "learning_rate": 4.694920634920635e-07, "loss": 0.0025, "num_tokens": 754182154.0, "reward": 0.409375, "reward_std": 0.06076197549700737, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.9053419232368469, "step": 10845 }, { "completion_length": 357.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 95.1375, "completions/mean_terminated_length": 95.1375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009819235565723719, "frac_reward_zero_std": 0.90625, "grad_norm": 2.1592864990234375, "kl": 3.6996039418736473, "learning_rate": 4.6945238095238094e-07, "loss": 0.0037, "num_tokens": 754504458.0, "reward": 0.321875, "reward_std": 0.07927589416503907, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9443147540092468, "step": 10850 }, { "completion_length": 294.4, "completions/clipped_ratio": 0.0, "completions/max_length": 294.4, "completions/max_terminated_length": 294.4, "completions/mean_length": 97.2671875, "completions/mean_terminated_length": 97.2671875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009823760559071979, "frac_reward_zero_std": 0.90625, "grad_norm": 7.610211372375488, "kl": 12.182101308158598, "learning_rate": 4.694126984126984e-07, "loss": 0.0122, "num_tokens": 754829232.0, "reward": 0.2859375, "reward_std": 0.08332831710577011, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9396867871284484, "step": 10855 }, { "completion_length": 388.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 91.97578125, "completions/mean_terminated_length": 91.97578125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009828285552420237, "frac_reward_zero_std": 0.9, "grad_norm": 7.2284088134765625, "kl": 9.912818103726021, "learning_rate": 4.6937301587301585e-07, "loss": 0.0099, "num_tokens": 755146433.0, "reward": 0.315625, "reward_std": 0.08185753338038922, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9358961582183838, "step": 10860 }, { "completion_length": 372.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 91.16875, "completions/mean_terminated_length": 91.16875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009832810545768497, "frac_reward_zero_std": 0.94375, "grad_norm": 2.5707225799560547, "kl": 3.473154224676546, "learning_rate": 4.693333333333333e-07, "loss": 0.0035, "num_tokens": 755461833.0, "reward": 0.3984375, "reward_std": 0.05228758528828621, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.8955963253974915, "step": 10865 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 97.09296875, "completions/mean_terminated_length": 97.09296875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.009837335539116757, "frac_reward_zero_std": 0.8875, "grad_norm": 3.0609779357910156, "kl": 2.8842968739336357, "learning_rate": 4.692936507936508e-07, "loss": 0.0029, "num_tokens": 755787496.0, "reward": 0.34375, "reward_std": 0.10047527775168419, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9398372173309326, "step": 10870 }, { "completion_length": 487.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 487.0, "completions/max_terminated_length": 398.4, "completions/mean_length": 91.0359375, "completions/mean_terminated_length": 89.98077697753907, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009841860532465017, "frac_reward_zero_std": 0.9, "grad_norm": 3.970794439315796, "kl": 0.7866445383056998, "learning_rate": 4.6925396825396826e-07, "loss": 0.0008, "num_tokens": 756102950.0, "reward": 0.4, "reward_std": 0.08390747755765915, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.9085947275161743, "step": 10875 }, { "completion_length": 352.4, "completions/clipped_ratio": 0.0, "completions/max_length": 352.4, "completions/max_terminated_length": 352.4, "completions/mean_length": 89.84609375, "completions/mean_terminated_length": 89.84609375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009846385525813278, "frac_reward_zero_std": 0.90625, "grad_norm": 16.41352653503418, "kl": 1.1842648497549817, "learning_rate": 4.6921428571428566e-07, "loss": 0.0012, "num_tokens": 756417201.0, "reward": 0.2875, "reward_std": 0.08017105124890804, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9402374982833862, "step": 10880 }, { "completion_length": 481.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 481.6, "completions/max_terminated_length": 367.2, "completions/mean_length": 89.0484375, "completions/mean_terminated_length": 87.99110717773438, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009850910519161538, "frac_reward_zero_std": 0.94375, "grad_norm": 1.26871657371521, "kl": 1.21379923815839, "learning_rate": 4.6917460317460317e-07, "loss": 0.0012, "num_tokens": 756728039.0, "reward": 0.4046875, "reward_std": 0.05070846229791641, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9042811751365661, "step": 10885 }, { "completion_length": 443.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 443.2, "completions/max_terminated_length": 432.4, "completions/mean_length": 89.78671875, "completions/mean_terminated_length": 89.26207885742187, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.009855435512509796, "frac_reward_zero_std": 0.91875, "grad_norm": 8.017333030700684, "kl": 1.264521039579995, "learning_rate": 4.691349206349206e-07, "loss": 0.0013, "num_tokens": 757042894.0, "reward": 0.184375, "reward_std": 0.07244051732122898, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.9748876690864563, "step": 10890 }, { "completion_length": 426.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.4, "completions/max_terminated_length": 332.0, "completions/mean_length": 86.3828125, "completions/mean_terminated_length": 85.85416717529297, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009859960505858056, "frac_reward_zero_std": 0.9, "grad_norm": 8.169329643249512, "kl": 0.7716115476796404, "learning_rate": 4.690952380952381e-07, "loss": 0.0008, "num_tokens": 757348976.0, "reward": 0.3890625, "reward_std": 0.09363892041146755, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9133238792419434, "step": 10895 }, { "completion_length": 378.8, "completions/clipped_ratio": 0.0, "completions/max_length": 378.8, "completions/max_terminated_length": 378.8, "completions/mean_length": 85.1828125, "completions/mean_terminated_length": 85.1828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009864485499206316, "frac_reward_zero_std": 0.9125, "grad_norm": 4.498534679412842, "kl": 1.4266837097355165, "learning_rate": 4.6905555555555553e-07, "loss": 0.0014, "num_tokens": 757654226.0, "reward": 0.2828125, "reward_std": 0.07370168678462505, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9513337969779968, "step": 10900 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 93.89609375, "completions/mean_terminated_length": 93.89609375, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.009869010492554576, "frac_reward_zero_std": 0.9, "grad_norm": 6.761676788330078, "kl": 1.3852534753503278, "learning_rate": 4.69015873015873e-07, "loss": 0.0014, "num_tokens": 757972781.0, "reward": 0.36875, "reward_std": 0.08322350680828094, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.925947391986847, "step": 10905 }, { "completion_length": 504.8, "completions/clipped_ratio": 0.0, "completions/max_length": 504.8, "completions/max_terminated_length": 504.8, "completions/mean_length": 98.096875, "completions/mean_terminated_length": 98.096875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009873535485902836, "frac_reward_zero_std": 0.95, "grad_norm": 0.8824782371520996, "kl": 2.2649581983452665, "learning_rate": 4.689761904761905e-07, "loss": 0.0023, "num_tokens": 758297489.0, "reward": 0.409375, "reward_std": 0.040139202028512955, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.9112985491752624, "step": 10910 }, { "completion_length": 537.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 537.4, "completions/max_terminated_length": 412.4, "completions/mean_length": 92.10625, "completions/mean_terminated_length": 90.51659393310547, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009878060479251095, "frac_reward_zero_std": 0.89375, "grad_norm": 4.36221170425415, "kl": 4.0800003843731245, "learning_rate": 4.689365079365079e-07, "loss": 0.0041, "num_tokens": 758613457.0, "reward": 0.2625, "reward_std": 0.09289858937263488, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9615713238716126, "step": 10915 }, { "completion_length": 511.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 511.8, "completions/max_terminated_length": 511.2, "completions/mean_length": 99.6484375, "completions/mean_terminated_length": 98.60482940673828, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.009882585472599355, "frac_reward_zero_std": 0.925, "grad_norm": 11.54903507232666, "kl": 5.611316360591445, "learning_rate": 4.688968253968254e-07, "loss": 0.0056, "num_tokens": 758943191.0, "reward": 0.35, "reward_std": 0.06554583832621574, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9307449102401734, "step": 10920 }, { "completion_length": 360.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 360.0, "completions/max_terminated_length": 268.4, "completions/mean_length": 91.42109375, "completions/mean_terminated_length": 90.88898620605468, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009887110465947615, "frac_reward_zero_std": 0.95, "grad_norm": 0.5738135576248169, "kl": 6.001202279247809, "learning_rate": 4.6885714285714285e-07, "loss": 0.006, "num_tokens": 759260914.0, "reward": 0.378125, "reward_std": 0.04833899140357971, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9197925448417663, "step": 10925 }, { "completion_length": 330.4, "completions/clipped_ratio": 0.0, "completions/max_length": 330.4, "completions/max_terminated_length": 330.4, "completions/mean_length": 83.32890625, "completions/mean_terminated_length": 83.32890625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009891635459295875, "frac_reward_zero_std": 0.96875, "grad_norm": 4.532364368438721, "kl": 1.0306155892089008, "learning_rate": 4.6881746031746026e-07, "loss": 0.001, "num_tokens": 759563687.0, "reward": 0.3765625, "reward_std": 0.02777610570192337, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9102837681770325, "step": 10930 }, { "completion_length": 496.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 496.0, "completions/max_terminated_length": 482.2, "completions/mean_length": 103.34140625, "completions/mean_terminated_length": 102.82105560302735, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009896160452644135, "frac_reward_zero_std": 0.9625, "grad_norm": 0.7121487259864807, "kl": 2.654495055950247, "learning_rate": 4.6877777777777776e-07, "loss": 0.0027, "num_tokens": 759898900.0, "reward": 0.365625, "reward_std": 0.036719749495387075, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9278798341751099, "step": 10935 }, { "completion_length": 487.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 487.6, "completions/max_terminated_length": 439.8, "completions/mean_length": 97.36328125, "completions/mean_terminated_length": 96.84441223144532, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.009900685445992395, "frac_reward_zero_std": 0.9375, "grad_norm": 0.17708157002925873, "kl": 3.3077700331341475, "learning_rate": 4.687380952380952e-07, "loss": 0.0033, "num_tokens": 760223469.0, "reward": 0.265625, "reward_std": 0.056707005202770236, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9613527774810791, "step": 10940 }, { "completion_length": 423.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 90.728125, "completions/mean_terminated_length": 90.728125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009905210439340653, "frac_reward_zero_std": 0.9125, "grad_norm": 25.339879989624023, "kl": 4.020757351606153, "learning_rate": 4.686984126984127e-07, "loss": 0.004, "num_tokens": 760538273.0, "reward": 0.365625, "reward_std": 0.07596379891037941, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9231261253356934, "step": 10945 }, { "completion_length": 395.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 395.8, "completions/max_terminated_length": 384.2, "completions/mean_length": 88.93515625, "completions/mean_terminated_length": 88.4151107788086, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009909735432688914, "frac_reward_zero_std": 0.95, "grad_norm": 19.025043487548828, "kl": 3.0516076418338343, "learning_rate": 4.686587301587301e-07, "loss": 0.0031, "num_tokens": 760848862.0, "reward": 0.3296875, "reward_std": 0.04287213943898678, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9364412784576416, "step": 10950 }, { "completion_length": 478.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 478.6, "completions/max_terminated_length": 425.6, "completions/mean_length": 92.3046875, "completions/mean_terminated_length": 91.77249755859376, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009914260426037174, "frac_reward_zero_std": 0.8875, "grad_norm": 13.045248031616211, "kl": 2.376388489000965, "learning_rate": 4.686190476190476e-07, "loss": 0.0024, "num_tokens": 761164236.0, "reward": 0.340625, "reward_std": 0.10184125155210495, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.927798330783844, "step": 10955 }, { "completion_length": 538.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 538.6, "completions/max_terminated_length": 430.8, "completions/mean_length": 91.84375, "completions/mean_terminated_length": 90.76780853271484, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009918785419385434, "frac_reward_zero_std": 0.9375, "grad_norm": 6.315328598022461, "kl": 1.7711212954949587, "learning_rate": 4.685793650793651e-07, "loss": 0.0018, "num_tokens": 761479540.0, "reward": 0.3875, "reward_std": 0.058756951987743375, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9187337636947632, "step": 10960 }, { "completion_length": 376.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 91.87421875, "completions/mean_terminated_length": 91.87421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009923310412733694, "frac_reward_zero_std": 0.9375, "grad_norm": 3.3468286991119385, "kl": 0.5793785822112113, "learning_rate": 4.685396825396825e-07, "loss": 0.0006, "num_tokens": 761794467.0, "reward": 0.46875, "reward_std": 0.05717782825231552, "rewards/verify_chess_move/mean": 0.46875, "rewards/verify_chess_move/std": 0.8661344051361084, "step": 10965 }, { "completion_length": 327.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 83.85546875, "completions/mean_terminated_length": 83.85546875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009927835406081952, "frac_reward_zero_std": 0.9375, "grad_norm": 5.764570713043213, "kl": 1.1412750154850073, "learning_rate": 4.685e-07, "loss": 0.0011, "num_tokens": 762098834.0, "reward": 0.4921875, "reward_std": 0.05124015100300312, "rewards/verify_chess_move/mean": 0.4921875, "rewards/verify_chess_move/std": 0.8705976009368896, "step": 10970 }, { "completion_length": 397.8, "completions/clipped_ratio": 0.0, "completions/max_length": 397.8, "completions/max_terminated_length": 397.8, "completions/mean_length": 85.64765625, "completions/mean_terminated_length": 85.64765625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009932360399430212, "frac_reward_zero_std": 0.95, "grad_norm": 5.974778652191162, "kl": 2.631466212403029, "learning_rate": 4.6846031746031745e-07, "loss": 0.0026, "num_tokens": 762405647.0, "reward": 0.3484375, "reward_std": 0.04240131601691246, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9358396291732788, "step": 10975 }, { "completion_length": 557.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 557.6, "completions/max_terminated_length": 542.2, "completions/mean_length": 93.671875, "completions/mean_terminated_length": 93.1452621459961, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.009936885392778472, "frac_reward_zero_std": 0.90625, "grad_norm": 10.87387466430664, "kl": 6.099801714567002, "learning_rate": 4.684206349206349e-07, "loss": 0.0061, "num_tokens": 762724027.0, "reward": 0.275, "reward_std": 0.07333723679184914, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.945634377002716, "step": 10980 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 87.54765625, "completions/mean_terminated_length": 87.54765625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009941410386126733, "frac_reward_zero_std": 0.93125, "grad_norm": 3.3469784259796143, "kl": 2.69699752246961, "learning_rate": 4.6838095238095236e-07, "loss": 0.0027, "num_tokens": 763033456.0, "reward": 0.25, "reward_std": 0.06023028716444969, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9696001648902893, "step": 10985 }, { "completion_length": 399.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 84.55390625, "completions/mean_terminated_length": 84.55390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009945935379474993, "frac_reward_zero_std": 0.9, "grad_norm": 2.1258299350738525, "kl": 2.5754654041375034, "learning_rate": 4.683412698412698e-07, "loss": 0.0026, "num_tokens": 763336621.0, "reward": 0.371875, "reward_std": 0.08711221478879452, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.925722062587738, "step": 10990 }, { "completion_length": 370.6, "completions/clipped_ratio": 0.0, "completions/max_length": 370.6, "completions/max_terminated_length": 370.6, "completions/mean_length": 83.00234375, "completions/mean_terminated_length": 83.00234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009950460372823253, "frac_reward_zero_std": 0.9625, "grad_norm": 6.792393684387207, "kl": 2.4008411968126895, "learning_rate": 4.6830158730158726e-07, "loss": 0.0024, "num_tokens": 763639448.0, "reward": 0.334375, "reward_std": 0.0328794926404953, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9397103905677795, "step": 10995 }, { "completion_length": 505.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 505.2, "completions/max_terminated_length": 428.0, "completions/mean_length": 90.16875, "completions/mean_terminated_length": 89.6404052734375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.009954985366171511, "frac_reward_zero_std": 0.89375, "grad_norm": 12.75457763671875, "kl": 3.21460655820556, "learning_rate": 4.6826190476190477e-07, "loss": 0.0032, "num_tokens": 763952744.0, "reward": 0.23125, "reward_std": 0.09421808049082755, "rewards/verify_chess_move/mean": 0.23125, "rewards/verify_chess_move/std": 0.9729697942733765, "step": 11000 }, { "completion_length": 346.8, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/max_terminated_length": 346.8, "completions/mean_length": 83.5578125, "completions/mean_terminated_length": 83.5578125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009959510359519771, "frac_reward_zero_std": 0.95, "grad_norm": 7.588754177093506, "kl": 1.61197236754233, "learning_rate": 4.6822222222222217e-07, "loss": 0.0016, "num_tokens": 764255466.0, "reward": 0.4890625, "reward_std": 0.04287213832139969, "rewards/verify_chess_move/mean": 0.4890625, "rewards/verify_chess_move/std": 0.8362247824668885, "step": 11005 }, { "completion_length": 412.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 88.9, "completions/mean_terminated_length": 88.9, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.009964035352868031, "frac_reward_zero_std": 0.925, "grad_norm": 5.531378746032715, "kl": 2.8084205321152695, "learning_rate": 4.681825396825397e-07, "loss": 0.0028, "num_tokens": 764567002.0, "reward": 0.2796875, "reward_std": 0.06622882708907127, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9420093894004822, "step": 11010 }, { "completion_length": 423.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 423.0, "completions/max_terminated_length": 407.4, "completions/mean_length": 95.42734375, "completions/mean_terminated_length": 94.91067504882812, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.009968560346216291, "frac_reward_zero_std": 0.8875, "grad_norm": 10.583465576171875, "kl": 1.7423567898920738, "learning_rate": 4.6814285714285713e-07, "loss": 0.0017, "num_tokens": 764891085.0, "reward": 0.290625, "reward_std": 0.09958012253046036, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9415196776390076, "step": 11015 }, { "completion_length": 385.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 385.4, "completions/max_terminated_length": 332.0, "completions/mean_length": 88.09296875, "completions/mean_terminated_length": 87.57617492675782, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.009973085339564551, "frac_reward_zero_std": 0.94375, "grad_norm": 5.491556167602539, "kl": 1.011636257765349, "learning_rate": 4.6810317460317453e-07, "loss": 0.001, "num_tokens": 765200612.0, "reward": 0.4078125, "reward_std": 0.049342484399676326, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.8927706480026245, "step": 11020 }, { "completion_length": 361.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 87.84921875, "completions/mean_terminated_length": 87.84921875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.00997761033291281, "frac_reward_zero_std": 0.91875, "grad_norm": 4.7321014404296875, "kl": 0.46300831994740294, "learning_rate": 4.6806349206349204e-07, "loss": 0.0005, "num_tokens": 765512739.0, "reward": 0.309375, "reward_std": 0.07380649521946907, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9417755126953125, "step": 11025 }, { "completion_length": 337.8, "completions/clipped_ratio": 0.0, "completions/max_length": 337.8, "completions/max_terminated_length": 337.8, "completions/mean_length": 99.2390625, "completions/mean_terminated_length": 99.2390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.00998213532626107, "frac_reward_zero_std": 0.94375, "grad_norm": 5.659430980682373, "kl": 0.5266751165268942, "learning_rate": 4.680238095238095e-07, "loss": 0.0005, "num_tokens": 765841365.0, "reward": 0.3453125, "reward_std": 0.048658515140414235, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9275905847549438, "step": 11030 }, { "completion_length": 291.4, "completions/clipped_ratio": 0.0, "completions/max_length": 291.4, "completions/max_terminated_length": 291.4, "completions/mean_length": 98.32890625, "completions/mean_terminated_length": 98.32890625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.00998666031960933, "frac_reward_zero_std": 0.9625, "grad_norm": 3.634702205657959, "kl": 0.5282144270604476, "learning_rate": 4.67984126984127e-07, "loss": 0.0005, "num_tokens": 766168658.0, "reward": 0.325, "reward_std": 0.030145575851202012, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9396728873252869, "step": 11035 }, { "completion_length": 352.2, "completions/clipped_ratio": 0.0, "completions/max_length": 352.2, "completions/max_terminated_length": 352.2, "completions/mean_length": 96.64765625, "completions/mean_terminated_length": 96.64765625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.00999118531295759, "frac_reward_zero_std": 0.9625, "grad_norm": 9.769566535949707, "kl": 0.4707014066632837, "learning_rate": 4.679444444444444e-07, "loss": 0.0005, "num_tokens": 766494863.0, "reward": 0.3, "reward_std": 0.02925042100250721, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9326477289199829, "step": 11040 }, { "completion_length": 437.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 437.6, "completions/max_terminated_length": 393.4, "completions/mean_length": 91.50703125, "completions/mean_terminated_length": 90.99427795410156, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.00999571030630585, "frac_reward_zero_std": 0.89375, "grad_norm": 8.575857162475586, "kl": 1.3980051019927486, "learning_rate": 4.6790476190476186e-07, "loss": 0.0014, "num_tokens": 766810848.0, "reward": 0.3546875, "reward_std": 0.09221364259719848, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9134878277778625, "step": 11045 }, { "completion_length": 319.6, "completions/clipped_ratio": 0.0, "completions/max_length": 319.6, "completions/max_terminated_length": 319.6, "completions/mean_length": 88.340625, "completions/mean_terminated_length": 88.340625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01000023529965411, "frac_reward_zero_std": 0.9375, "grad_norm": 5.760633945465088, "kl": 1.16420750268735, "learning_rate": 4.6786507936507936e-07, "loss": 0.0012, "num_tokens": 767121828.0, "reward": 0.4140625, "reward_std": 0.05534004643559456, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.8980114817619324, "step": 11050 }, { "completion_length": 499.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 499.2, "completions/max_terminated_length": 480.4, "completions/mean_length": 92.0296875, "completions/mean_terminated_length": 91.51193084716797, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010004760293002369, "frac_reward_zero_std": 0.90625, "grad_norm": 0.011710826307535172, "kl": 0.3758811534498818, "learning_rate": 4.6782539682539676e-07, "loss": 0.0004, "num_tokens": 767437226.0, "reward": 0.334375, "reward_std": 0.08175017535686493, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9432387948036194, "step": 11055 }, { "completion_length": 444.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 89.3453125, "completions/mean_terminated_length": 88.81815643310547, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010009285286350629, "frac_reward_zero_std": 0.925, "grad_norm": 6.147959232330322, "kl": 1.4365044105332345, "learning_rate": 4.6778571428571427e-07, "loss": 0.0014, "num_tokens": 767748812.0, "reward": 0.365625, "reward_std": 0.06029115132987499, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9266093492507934, "step": 11060 }, { "completion_length": 482.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 482.0, "completions/max_terminated_length": 424.6, "completions/mean_length": 93.46015625, "completions/mean_terminated_length": 92.94475860595703, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010013810279698889, "frac_reward_zero_std": 0.9, "grad_norm": 5.8105244636535645, "kl": 2.293773726525251, "learning_rate": 4.677460317460317e-07, "loss": 0.0023, "num_tokens": 768066865.0, "reward": 0.3828125, "reward_std": 0.09321458637714386, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9103870153427124, "step": 11065 }, { "completion_length": 331.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 89.46484375, "completions/mean_terminated_length": 89.46484375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010018335273047149, "frac_reward_zero_std": 0.9125, "grad_norm": 9.368833541870117, "kl": 1.7388273203745483, "learning_rate": 4.677063492063492e-07, "loss": 0.0017, "num_tokens": 768379956.0, "reward": 0.4046875, "reward_std": 0.07733075693249702, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.8941281318664551, "step": 11070 }, { "completion_length": 274.4, "completions/clipped_ratio": 0.0, "completions/max_length": 274.4, "completions/max_terminated_length": 274.4, "completions/mean_length": 83.2125, "completions/mean_terminated_length": 83.2125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010022860266395409, "frac_reward_zero_std": 0.94375, "grad_norm": 7.755799770355225, "kl": 1.2789697492262349, "learning_rate": 4.6766666666666663e-07, "loss": 0.0013, "num_tokens": 768682492.0, "reward": 0.3515625, "reward_std": 0.05160361640155316, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9197754502296448, "step": 11075 }, { "completion_length": 306.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 87.42265625, "completions/mean_terminated_length": 87.42265625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010027385259743667, "frac_reward_zero_std": 0.95, "grad_norm": 1.5706572532653809, "kl": 2.62875185564626, "learning_rate": 4.676269841269841e-07, "loss": 0.0026, "num_tokens": 768994041.0, "reward": 0.296875, "reward_std": 0.04492306672036648, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9455667853355407, "step": 11080 }, { "completion_length": 342.8, "completions/clipped_ratio": 0.0, "completions/max_length": 342.8, "completions/max_terminated_length": 342.8, "completions/mean_length": 89.33984375, "completions/mean_terminated_length": 89.33984375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010031910253091927, "frac_reward_zero_std": 0.91875, "grad_norm": 5.008044242858887, "kl": 3.323950621834956, "learning_rate": 4.675873015873016e-07, "loss": 0.0033, "num_tokens": 769306844.0, "reward": 0.353125, "reward_std": 0.07470165193080902, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9339828729629517, "step": 11085 }, { "completion_length": 366.4, "completions/clipped_ratio": 0.0, "completions/max_length": 366.4, "completions/max_terminated_length": 366.4, "completions/mean_length": 90.49375, "completions/mean_terminated_length": 90.49375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010036435246440188, "frac_reward_zero_std": 0.875, "grad_norm": 11.497824668884277, "kl": 2.8925204547587784, "learning_rate": 4.6754761904761905e-07, "loss": 0.0029, "num_tokens": 769621668.0, "reward": 0.2296875, "reward_std": 0.11294416859745979, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9632068634033203, "step": 11090 }, { "completion_length": 339.4, "completions/clipped_ratio": 0.0, "completions/max_length": 339.4, "completions/max_terminated_length": 339.4, "completions/mean_length": 91.221875, "completions/mean_terminated_length": 91.221875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010040960239788448, "frac_reward_zero_std": 0.925, "grad_norm": 10.222405433654785, "kl": 1.298576145619154, "learning_rate": 4.6750793650793645e-07, "loss": 0.0013, "num_tokens": 769937496.0, "reward": 0.184375, "reward_std": 0.07011655569076539, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.9653967380523681, "step": 11095 }, { "completion_length": 406.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 406.0, "completions/max_terminated_length": 386.2, "completions/mean_length": 93.53671875, "completions/mean_terminated_length": 93.02305297851562, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010045485233136708, "frac_reward_zero_std": 0.9125, "grad_norm": 13.4749755859375, "kl": 3.5739545407472177, "learning_rate": 4.6746825396825396e-07, "loss": 0.0036, "num_tokens": 770256671.0, "reward": 0.303125, "reward_std": 0.07553946301341057, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9399431943893433, "step": 11100 }, { "completion_length": 421.8, "completions/clipped_ratio": 0.0, "completions/max_length": 421.8, "completions/max_terminated_length": 421.8, "completions/mean_length": 92.1625, "completions/mean_terminated_length": 92.1625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010050010226484968, "frac_reward_zero_std": 0.95, "grad_norm": 0.09941360354423523, "kl": 1.2718219982925802, "learning_rate": 4.674285714285714e-07, "loss": 0.0013, "num_tokens": 770572847.0, "reward": 0.3609375, "reward_std": 0.04376729354262352, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.926519763469696, "step": 11105 }, { "completion_length": 449.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 449.6, "completions/max_terminated_length": 432.6, "completions/mean_length": 91.6109375, "completions/mean_terminated_length": 91.09987030029296, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010054535219833226, "frac_reward_zero_std": 0.9375, "grad_norm": 10.191574096679688, "kl": 1.718328687106259, "learning_rate": 4.6738888888888886e-07, "loss": 0.0017, "num_tokens": 770888413.0, "reward": 0.41875, "reward_std": 0.057602159306406976, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.9007122159004212, "step": 11110 }, { "completion_length": 433.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 90.109375, "completions/mean_terminated_length": 90.109375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010059060213181486, "frac_reward_zero_std": 0.95625, "grad_norm": 4.1282639503479, "kl": 1.3276788022601977, "learning_rate": 4.673492063492063e-07, "loss": 0.0013, "num_tokens": 771201241.0, "reward": 0.3828125, "reward_std": 0.04139880537986755, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9207679748535156, "step": 11115 }, { "completion_length": 283.2, "completions/clipped_ratio": 0.0, "completions/max_length": 283.2, "completions/max_terminated_length": 283.2, "completions/mean_length": 78.2578125, "completions/mean_terminated_length": 78.2578125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010063585206529746, "frac_reward_zero_std": 0.95625, "grad_norm": 21.859128952026367, "kl": 1.7637681778520347, "learning_rate": 4.6730952380952377e-07, "loss": 0.0018, "num_tokens": 771495531.0, "reward": 0.4171875, "reward_std": 0.037769732624292375, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.9032643556594848, "step": 11120 }, { "completion_length": 523.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 523.0, "completions/max_terminated_length": 501.8, "completions/mean_length": 100.76796875, "completions/mean_terminated_length": 99.7509048461914, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010068110199878006, "frac_reward_zero_std": 0.94375, "grad_norm": 8.355389595031738, "kl": 2.5369242649758235, "learning_rate": 4.672698412698413e-07, "loss": 0.0025, "num_tokens": 771826730.0, "reward": 0.3421875, "reward_std": 0.04545377567410469, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9188978433609009, "step": 11125 }, { "completion_length": 303.2, "completions/clipped_ratio": 0.0, "completions/max_length": 303.2, "completions/max_terminated_length": 303.2, "completions/mean_length": 79.29921875, "completions/mean_terminated_length": 79.29921875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010072635193226267, "frac_reward_zero_std": 0.94375, "grad_norm": 6.2095770835876465, "kl": 0.8966592704644427, "learning_rate": 4.672301587301587e-07, "loss": 0.0009, "num_tokens": 772122649.0, "reward": 0.3546875, "reward_std": 0.05044882521033287, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9247088074684143, "step": 11130 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 86.28984375, "completions/mean_terminated_length": 86.28984375, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.010077160186574527, "frac_reward_zero_std": 0.9375, "grad_norm": 5.962512493133545, "kl": 1.7711689857533202, "learning_rate": 4.671904761904762e-07, "loss": 0.0018, "num_tokens": 772430316.0, "reward": 0.375, "reward_std": 0.053077931702136996, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9110910177230835, "step": 11135 }, { "completion_length": 526.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 526.8, "completions/max_terminated_length": 488.4, "completions/mean_length": 95.57890625, "completions/mean_terminated_length": 94.52353973388672, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.010081685179922785, "frac_reward_zero_std": 0.9375, "grad_norm": 15.802103042602539, "kl": 6.711256408772897, "learning_rate": 4.6715079365079364e-07, "loss": 0.0067, "num_tokens": 772751633.0, "reward": 0.3015625, "reward_std": 0.05534004643559456, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9505339145660401, "step": 11140 }, { "completion_length": 340.6, "completions/clipped_ratio": 0.0, "completions/max_length": 340.6, "completions/max_terminated_length": 340.6, "completions/mean_length": 92.821875, "completions/mean_terminated_length": 92.821875, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.010086210173271045, "frac_reward_zero_std": 0.95625, "grad_norm": 1.6622192859649658, "kl": 5.462756033497863, "learning_rate": 4.6711111111111104e-07, "loss": 0.0055, "num_tokens": 773071085.0, "reward": 0.2546875, "reward_std": 0.041187618672847745, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9454879403114319, "step": 11145 }, { "completion_length": 378.6, "completions/clipped_ratio": 0.0, "completions/max_length": 378.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 91.690625, "completions/mean_terminated_length": 91.690625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010090735166619305, "frac_reward_zero_std": 0.925, "grad_norm": 8.535513877868652, "kl": 11.045128029282205, "learning_rate": 4.6707142857142855e-07, "loss": 0.011, "num_tokens": 773386809.0, "reward": 0.3265625, "reward_std": 0.06486285105347633, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9411854863166809, "step": 11150 }, { "completion_length": 387.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 387.4, "completions/max_terminated_length": 367.8, "completions/mean_length": 92.94375, "completions/mean_terminated_length": 92.42898406982422, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010095260159967565, "frac_reward_zero_std": 0.91875, "grad_norm": 6.749245643615723, "kl": 5.011945792287588, "learning_rate": 4.67031746031746e-07, "loss": 0.005, "num_tokens": 773704409.0, "reward": 0.2625, "reward_std": 0.07743556424975395, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9566381096839904, "step": 11155 }, { "completion_length": 437.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 437.4, "completions/max_terminated_length": 361.2, "completions/mean_length": 90.41171875, "completions/mean_terminated_length": 89.88105163574218, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010099785153315825, "frac_reward_zero_std": 0.9125, "grad_norm": 1.460110068321228, "kl": 3.351361623487901, "learning_rate": 4.669920634920635e-07, "loss": 0.0034, "num_tokens": 774019488.0, "reward": 0.35, "reward_std": 0.07527982965111732, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9297977566719056, "step": 11160 }, { "completion_length": 382.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 382.4, "completions/max_terminated_length": 284.8, "completions/mean_length": 89.15078125, "completions/mean_terminated_length": 88.62335357666015, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010104310146664084, "frac_reward_zero_std": 0.93125, "grad_norm": 7.942669868469238, "kl": 0.7663475596462377, "learning_rate": 4.669523809523809e-07, "loss": 0.0008, "num_tokens": 774331689.0, "reward": 0.346875, "reward_std": 0.06044343337416649, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9285794615745544, "step": 11165 }, { "completion_length": 298.4, "completions/clipped_ratio": 0.0, "completions/max_length": 298.4, "completions/max_terminated_length": 298.4, "completions/mean_length": 86.32109375, "completions/mean_terminated_length": 86.32109375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010108835140012344, "frac_reward_zero_std": 0.975, "grad_norm": 0.06826333701610565, "kl": 0.8809495341032744, "learning_rate": 4.6691269841269836e-07, "loss": 0.0009, "num_tokens": 774639396.0, "reward": 0.390625, "reward_std": 0.020411586761474608, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.8939835548400878, "step": 11170 }, { "completion_length": 472.2, "completions/clipped_ratio": 0.0, "completions/max_length": 472.2, "completions/max_terminated_length": 472.2, "completions/mean_length": 92.6296875, "completions/mean_terminated_length": 92.6296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010113360133360604, "frac_reward_zero_std": 0.9, "grad_norm": 7.406998157501221, "kl": 3.6201321120606735, "learning_rate": 4.6687301587301587e-07, "loss": 0.0036, "num_tokens": 774954746.0, "reward": 0.321875, "reward_std": 0.08343665525317193, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9380430698394775, "step": 11175 }, { "completion_length": 347.6, "completions/clipped_ratio": 0.0, "completions/max_length": 347.6, "completions/max_terminated_length": 347.6, "completions/mean_length": 94.2171875, "completions/mean_terminated_length": 94.2171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010117885126708864, "frac_reward_zero_std": 0.94375, "grad_norm": 8.645270347595215, "kl": 2.7046814631205054, "learning_rate": 4.668333333333333e-07, "loss": 0.0027, "num_tokens": 775275496.0, "reward": 0.453125, "reward_std": 0.05249975249171257, "rewards/verify_chess_move/mean": 0.453125, "rewards/verify_chess_move/std": 0.8883729457855225, "step": 11180 }, { "completion_length": 334.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 88.01640625, "completions/mean_terminated_length": 88.01640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010122410120057124, "frac_reward_zero_std": 0.94375, "grad_norm": 0.1419835388660431, "kl": 2.496629535476677, "learning_rate": 4.667936507936508e-07, "loss": 0.0025, "num_tokens": 775586477.0, "reward": 0.303125, "reward_std": 0.051815783977508544, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9452757477760315, "step": 11185 }, { "completion_length": 394.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 394.6, "completions/max_terminated_length": 340.2, "completions/mean_length": 94.5625, "completions/mean_terminated_length": 94.0353012084961, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010126935113405384, "frac_reward_zero_std": 0.93125, "grad_norm": 0.9544463753700256, "kl": 4.732428813190199, "learning_rate": 4.6675396825396823e-07, "loss": 0.0047, "num_tokens": 775909421.0, "reward": 0.290625, "reward_std": 0.061809411644935607, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9537694931030274, "step": 11190 }, { "completion_length": 534.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 534.2, "completions/max_terminated_length": 472.6, "completions/mean_length": 91.121875, "completions/mean_terminated_length": 90.07015228271484, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010131460106753643, "frac_reward_zero_std": 0.9125, "grad_norm": 4.497587203979492, "kl": 10.953495870321058, "learning_rate": 4.667142857142857e-07, "loss": 0.011, "num_tokens": 776224001.0, "reward": 0.325, "reward_std": 0.08048802465200425, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9407461881637573, "step": 11195 }, { "completion_length": 391.2, "completions/clipped_ratio": 0.0, "completions/max_length": 391.2, "completions/max_terminated_length": 391.2, "completions/mean_length": 99.521875, "completions/mean_terminated_length": 99.521875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010135985100101903, "frac_reward_zero_std": 0.9125, "grad_norm": 14.48811149597168, "kl": 6.479470160789788, "learning_rate": 4.6667460317460314e-07, "loss": 0.0065, "num_tokens": 776552501.0, "reward": 0.2984375, "reward_std": 0.07711760997772217, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9513463854789734, "step": 11200 }, { "completion_length": 306.4, "completions/clipped_ratio": 0.0, "completions/max_length": 306.4, "completions/max_terminated_length": 306.4, "completions/mean_length": 91.25390625, "completions/mean_terminated_length": 91.25390625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010140510093450163, "frac_reward_zero_std": 0.91875, "grad_norm": 1.8156017065048218, "kl": 1.9835730107966811, "learning_rate": 4.666349206349206e-07, "loss": 0.002, "num_tokens": 776868906.0, "reward": 0.2765625, "reward_std": 0.060186341777443884, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9447980999946595, "step": 11205 }, { "completion_length": 412.6, "completions/clipped_ratio": 0.003125, "completions/max_length": 412.6, "completions/max_terminated_length": 366.4, "completions/mean_length": 94.41640625, "completions/mean_terminated_length": 92.32068634033203, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010145035086798423, "frac_reward_zero_std": 0.89375, "grad_norm": 1.0119349956512451, "kl": 2.621613764204085, "learning_rate": 4.665952380952381e-07, "loss": 0.0026, "num_tokens": 777186967.0, "reward": 0.3234375, "reward_std": 0.09311075657606124, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9365873694419861, "step": 11210 }, { "completion_length": 370.8, "completions/clipped_ratio": 0.0, "completions/max_length": 370.8, "completions/max_terminated_length": 370.8, "completions/mean_length": 93.96015625, "completions/mean_terminated_length": 93.96015625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010149560080146683, "frac_reward_zero_std": 0.90625, "grad_norm": 0.028100961819291115, "kl": 1.3422074816655367, "learning_rate": 4.6655555555555556e-07, "loss": 0.0013, "num_tokens": 777506340.0, "reward": 0.2515625, "reward_std": 0.07995888143777848, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9622568964958191, "step": 11215 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.0, "completions/max_length": 365.4, "completions/max_terminated_length": 365.4, "completions/mean_length": 94.75, "completions/mean_terminated_length": 94.75, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010154085073494941, "frac_reward_zero_std": 0.93125, "grad_norm": 4.657567024230957, "kl": 1.8676267760805785, "learning_rate": 4.6651587301587296e-07, "loss": 0.0019, "num_tokens": 777827860.0, "reward": 0.328125, "reward_std": 0.06338853314518929, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9298630118370056, "step": 11220 }, { "completion_length": 450.2, "completions/clipped_ratio": 0.0, "completions/max_length": 450.2, "completions/max_terminated_length": 450.2, "completions/mean_length": 88.4640625, "completions/mean_terminated_length": 88.4640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010158610066843201, "frac_reward_zero_std": 0.94375, "grad_norm": 9.717899322509766, "kl": 3.0836133421515113, "learning_rate": 4.6647619047619046e-07, "loss": 0.0031, "num_tokens": 778138710.0, "reward": 0.425, "reward_std": 0.05070944242179394, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.8967779397964477, "step": 11225 }, { "completion_length": 364.2, "completions/clipped_ratio": 0.0, "completions/max_length": 364.2, "completions/max_terminated_length": 364.2, "completions/mean_length": 86.91796875, "completions/mean_terminated_length": 86.91796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010163135060191461, "frac_reward_zero_std": 0.925, "grad_norm": 11.856583595275879, "kl": 2.6655675982590763, "learning_rate": 4.664365079365079e-07, "loss": 0.0027, "num_tokens": 778446885.0, "reward": 0.3671875, "reward_std": 0.06533367224037648, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9091467142105103, "step": 11230 }, { "completion_length": 436.4, "completions/clipped_ratio": 0.0, "completions/max_length": 436.4, "completions/max_terminated_length": 436.4, "completions/mean_length": 96.9171875, "completions/mean_terminated_length": 96.9171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010167660053539722, "frac_reward_zero_std": 0.93125, "grad_norm": 8.700281143188477, "kl": 2.9746157778427005, "learning_rate": 4.663968253968254e-07, "loss": 0.003, "num_tokens": 778769699.0, "reward": 0.41875, "reward_std": 0.05886430852115154, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8989851951599122, "step": 11235 }, { "completion_length": 428.8, "completions/clipped_ratio": 0.0, "completions/max_length": 428.8, "completions/max_terminated_length": 428.8, "completions/mean_length": 98.14765625, "completions/mean_terminated_length": 98.14765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010172185046887982, "frac_reward_zero_std": 0.9125, "grad_norm": 7.015154838562012, "kl": 2.7199484185432086, "learning_rate": 4.663571428571428e-07, "loss": 0.0027, "num_tokens": 779095496.0, "reward": 0.4578125, "reward_std": 0.07506766021251679, "rewards/verify_chess_move/mean": 0.4578125, "rewards/verify_chess_move/std": 0.889218807220459, "step": 11240 }, { "completion_length": 434.6, "completions/clipped_ratio": 0.0, "completions/max_length": 434.6, "completions/max_terminated_length": 434.6, "completions/mean_length": 91.6328125, "completions/mean_terminated_length": 91.6328125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010176710040236242, "frac_reward_zero_std": 0.86875, "grad_norm": 7.899017810821533, "kl": 2.342146999912802, "learning_rate": 4.663174603174603e-07, "loss": 0.0023, "num_tokens": 779411522.0, "reward": 0.365625, "reward_std": 0.1060035839676857, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9275438904762268, "step": 11245 }, { "completion_length": 302.6, "completions/clipped_ratio": 0.0, "completions/max_length": 302.6, "completions/max_terminated_length": 302.6, "completions/mean_length": 88.640625, "completions/mean_terminated_length": 88.640625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0101812350335845, "frac_reward_zero_std": 0.9125, "grad_norm": 9.633819580078125, "kl": 2.5361903541954236, "learning_rate": 4.662777777777778e-07, "loss": 0.0025, "num_tokens": 779724038.0, "reward": 0.328125, "reward_std": 0.07348951995372772, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9439234614372254, "step": 11250 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 82.61796875, "completions/mean_terminated_length": 82.61796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01018576002693276, "frac_reward_zero_std": 0.925, "grad_norm": 9.209723472595215, "kl": 1.7691961230244488, "learning_rate": 4.662380952380952e-07, "loss": 0.0018, "num_tokens": 780025045.0, "reward": 0.3328125, "reward_std": 0.061233779042959215, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9402491569519043, "step": 11255 }, { "completion_length": 336.4, "completions/clipped_ratio": 0.0, "completions/max_length": 336.4, "completions/max_terminated_length": 336.4, "completions/mean_length": 92.3671875, "completions/mean_terminated_length": 92.3671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01019028502028102, "frac_reward_zero_std": 0.925, "grad_norm": 9.065438270568848, "kl": 1.2519669339060784, "learning_rate": 4.661984126984127e-07, "loss": 0.0013, "num_tokens": 780343059.0, "reward": 0.2828125, "reward_std": 0.06486284993588924, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.957424008846283, "step": 11260 }, { "completion_length": 428.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 428.0, "completions/max_terminated_length": 338.6, "completions/mean_length": 86.02421875, "completions/mean_terminated_length": 84.96796264648438, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01019481001362928, "frac_reward_zero_std": 0.95, "grad_norm": 2.527299404144287, "kl": 1.9189233797602356, "learning_rate": 4.6615873015873015e-07, "loss": 0.0019, "num_tokens": 780650706.0, "reward": 0.45625, "reward_std": 0.044239097461104396, "rewards/verify_chess_move/mean": 0.45625, "rewards/verify_chess_move/std": 0.8853490948677063, "step": 11265 }, { "completion_length": 514.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 514.2, "completions/max_terminated_length": 409.4, "completions/mean_length": 91.4015625, "completions/mean_terminated_length": 90.32819213867188, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01019933500697754, "frac_reward_zero_std": 0.925, "grad_norm": 2.5672576427459717, "kl": 1.7279395082383417, "learning_rate": 4.661190476190476e-07, "loss": 0.0017, "num_tokens": 780965476.0, "reward": 0.3359375, "reward_std": 0.06870310679078102, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9334576725959778, "step": 11270 }, { "completion_length": 375.2, "completions/clipped_ratio": 0.0, "completions/max_length": 375.2, "completions/max_terminated_length": 375.2, "completions/mean_length": 87.25, "completions/mean_terminated_length": 87.25, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010203860000325799, "frac_reward_zero_std": 0.91875, "grad_norm": 0.3353332579135895, "kl": 1.355108916014433, "learning_rate": 4.6607936507936506e-07, "loss": 0.0014, "num_tokens": 781275580.0, "reward": 0.34375, "reward_std": 0.06654835119843483, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9342964887619019, "step": 11275 }, { "completion_length": 398.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 398.0, "completions/max_terminated_length": 353.6, "completions/mean_length": 94.36484375, "completions/mean_terminated_length": 93.83793640136719, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010208384993674059, "frac_reward_zero_std": 0.89375, "grad_norm": 1.7300549745559692, "kl": 4.524044478742871, "learning_rate": 4.660396825396825e-07, "loss": 0.0045, "num_tokens": 781597495.0, "reward": 0.2484375, "reward_std": 0.09289761334657669, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9590695619583129, "step": 11280 }, { "completion_length": 417.8, "completions/clipped_ratio": 0.0, "completions/max_length": 417.8, "completions/max_terminated_length": 417.8, "completions/mean_length": 81.23359375, "completions/mean_terminated_length": 81.23359375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010212909987022319, "frac_reward_zero_std": 0.90625, "grad_norm": 15.65539836883545, "kl": 3.2254240961861798, "learning_rate": 4.66e-07, "loss": 0.0032, "num_tokens": 781896818.0, "reward": 0.3671875, "reward_std": 0.07948806062340737, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9193824291229248, "step": 11285 }, { "completion_length": 392.6, "completions/clipped_ratio": 0.0, "completions/max_length": 392.6, "completions/max_terminated_length": 392.6, "completions/mean_length": 91.9890625, "completions/mean_terminated_length": 91.9890625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010217434980370579, "frac_reward_zero_std": 0.9125, "grad_norm": 16.73480224609375, "kl": 4.728418962401338, "learning_rate": 4.659603174603174e-07, "loss": 0.0047, "num_tokens": 782214292.0, "reward": 0.3, "reward_std": 0.07075560204684735, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9469204425811768, "step": 11290 }, { "completion_length": 351.6, "completions/clipped_ratio": 0.0, "completions/max_length": 351.6, "completions/max_terminated_length": 351.6, "completions/mean_length": 87.93984375, "completions/mean_terminated_length": 87.93984375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01022195997371884, "frac_reward_zero_std": 0.925, "grad_norm": 0.2323811948299408, "kl": 3.2338439265964554, "learning_rate": 4.6592063492063487e-07, "loss": 0.0032, "num_tokens": 782524007.0, "reward": 0.378125, "reward_std": 0.06938609592616558, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.920292592048645, "step": 11295 }, { "completion_length": 372.6, "completions/clipped_ratio": 0.0, "completions/max_length": 372.6, "completions/max_terminated_length": 372.6, "completions/mean_length": 91.4625, "completions/mean_terminated_length": 91.4625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0102264849670671, "frac_reward_zero_std": 0.93125, "grad_norm": 8.35643482208252, "kl": 2.313459054753184, "learning_rate": 4.658809523809524e-07, "loss": 0.0023, "num_tokens": 782840015.0, "reward": 0.353125, "reward_std": 0.062493379414081576, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9120193839073181, "step": 11300 }, { "completion_length": 615.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 615.0, "completions/max_terminated_length": 545.2, "completions/mean_length": 91.47734375, "completions/mean_terminated_length": 90.95171661376953, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010231009960415358, "frac_reward_zero_std": 0.95625, "grad_norm": 5.6559672355651855, "kl": 0.5397306470316835, "learning_rate": 4.6584126984126983e-07, "loss": 0.0005, "num_tokens": 783155778.0, "reward": 0.334375, "reward_std": 0.0372979287058115, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9060857653617859, "step": 11305 }, { "completion_length": 321.2, "completions/clipped_ratio": 0.0, "completions/max_length": 321.2, "completions/max_terminated_length": 321.2, "completions/mean_length": 81.01875, "completions/mean_terminated_length": 81.01875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010235534953763618, "frac_reward_zero_std": 0.91875, "grad_norm": 6.410536289215088, "kl": 0.616528932552319, "learning_rate": 4.658015873015873e-07, "loss": 0.0006, "num_tokens": 783454434.0, "reward": 0.325, "reward_std": 0.0722273699939251, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9396993160247803, "step": 11310 }, { "completion_length": 255.8, "completions/clipped_ratio": 0.0, "completions/max_length": 255.8, "completions/max_terminated_length": 255.8, "completions/mean_length": 82.859375, "completions/mean_terminated_length": 82.859375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010240059947111878, "frac_reward_zero_std": 0.95, "grad_norm": 4.1645917892456055, "kl": 0.9488032811786979, "learning_rate": 4.6576190476190474e-07, "loss": 0.0009, "num_tokens": 783757606.0, "reward": 0.43125, "reward_std": 0.03898441009223461, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.8920152306556701, "step": 11315 }, { "completion_length": 574.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 574.4, "completions/max_terminated_length": 519.2, "completions/mean_length": 92.946875, "completions/mean_terminated_length": 91.89953155517578, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010244584940460138, "frac_reward_zero_std": 0.90625, "grad_norm": 11.953104972839355, "kl": 0.8019107029540464, "learning_rate": 4.657222222222222e-07, "loss": 0.0008, "num_tokens": 784074482.0, "reward": 0.44375, "reward_std": 0.0765419777482748, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8874007105827332, "step": 11320 }, { "completion_length": 434.2, "completions/clipped_ratio": 0.0, "completions/max_length": 434.2, "completions/max_terminated_length": 434.2, "completions/mean_length": 84.45078125, "completions/mean_terminated_length": 84.45078125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010249109933808398, "frac_reward_zero_std": 0.93125, "grad_norm": 5.0877251625061035, "kl": 1.7564957770053298, "learning_rate": 4.656825396825397e-07, "loss": 0.0018, "num_tokens": 784379771.0, "reward": 0.3578125, "reward_std": 0.057921682670712474, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9170611023902893, "step": 11325 }, { "completion_length": 390.4, "completions/clipped_ratio": 0.0, "completions/max_length": 390.4, "completions/max_terminated_length": 390.4, "completions/mean_length": 92.73671875, "completions/mean_terminated_length": 92.73671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010253634927156656, "frac_reward_zero_std": 0.9375, "grad_norm": 7.153120040893555, "kl": 2.235324705892708, "learning_rate": 4.656428571428571e-07, "loss": 0.0022, "num_tokens": 784697434.0, "reward": 0.3671875, "reward_std": 0.054185254871845244, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9197898387908936, "step": 11330 }, { "completion_length": 571.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 571.6, "completions/max_terminated_length": 535.0, "completions/mean_length": 98.253125, "completions/mean_terminated_length": 97.21197052001953, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010258159920504916, "frac_reward_zero_std": 0.9375, "grad_norm": 8.086668968200684, "kl": 5.101424312382005, "learning_rate": 4.656031746031746e-07, "loss": 0.0051, "num_tokens": 785023758.0, "reward": 0.296875, "reward_std": 0.05192313939332962, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9532975196838379, "step": 11335 }, { "completion_length": 400.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 400.2, "completions/max_terminated_length": 307.4, "completions/mean_length": 93.25, "completions/mean_terminated_length": 92.72352142333985, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010262684913853177, "frac_reward_zero_std": 0.91875, "grad_norm": 4.761451244354248, "kl": 5.913908894313499, "learning_rate": 4.6556349206349206e-07, "loss": 0.0059, "num_tokens": 785343478.0, "reward": 0.3421875, "reward_std": 0.07154437899589539, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9226303577423096, "step": 11340 }, { "completion_length": 552.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 552.0, "completions/max_terminated_length": 495.8, "completions/mean_length": 90.0140625, "completions/mean_terminated_length": 89.48965454101562, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010267209907201437, "frac_reward_zero_std": 0.93125, "grad_norm": 11.275293350219727, "kl": 5.2333597060991455, "learning_rate": 4.6552380952380946e-07, "loss": 0.0052, "num_tokens": 785657608.0, "reward": 0.440625, "reward_std": 0.05813384801149368, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8884105801582336, "step": 11345 }, { "completion_length": 503.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 503.6, "completions/max_terminated_length": 444.0, "completions/mean_length": 95.371875, "completions/mean_terminated_length": 94.85627899169921, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010271734900549697, "frac_reward_zero_std": 0.9125, "grad_norm": 2.0732526779174805, "kl": 5.14778872433817, "learning_rate": 4.6548412698412697e-07, "loss": 0.0051, "num_tokens": 785979044.0, "reward": 0.284375, "reward_std": 0.07348951995372772, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9482470989227295, "step": 11350 }, { "completion_length": 472.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 472.0, "completions/max_terminated_length": 421.8, "completions/mean_length": 97.61484375, "completions/mean_terminated_length": 96.5618667602539, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010276259893897957, "frac_reward_zero_std": 0.9375, "grad_norm": 6.684890270233154, "kl": 3.974521297297906, "learning_rate": 4.654444444444444e-07, "loss": 0.004, "num_tokens": 786305887.0, "reward": 0.384375, "reward_std": 0.055341027677059174, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9067739129066468, "step": 11355 }, { "completion_length": 458.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 458.2, "completions/max_terminated_length": 397.2, "completions/mean_length": 89.121875, "completions/mean_terminated_length": 88.58818054199219, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010280784887246215, "frac_reward_zero_std": 0.93125, "grad_norm": 5.288444519042969, "kl": 2.1183806551038287, "learning_rate": 4.6540476190476193e-07, "loss": 0.0021, "num_tokens": 786617819.0, "reward": 0.378125, "reward_std": 0.06338853389024734, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9138996005058289, "step": 11360 }, { "completion_length": 338.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 96.06171875, "completions/mean_terminated_length": 96.06171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010285309880594475, "frac_reward_zero_std": 0.9625, "grad_norm": 1.2940903902053833, "kl": 0.9354218511492945, "learning_rate": 4.6536507936507933e-07, "loss": 0.0009, "num_tokens": 786942730.0, "reward": 0.39375, "reward_std": 0.03219552300870419, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.908365273475647, "step": 11365 }, { "completion_length": 456.4, "completions/clipped_ratio": 0.0, "completions/max_length": 456.4, "completions/max_terminated_length": 456.4, "completions/mean_length": 84.0484375, "completions/mean_terminated_length": 84.0484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010289834873942735, "frac_reward_zero_std": 0.9125, "grad_norm": 2.312570810317993, "kl": 0.9656471428228542, "learning_rate": 4.653253968253968e-07, "loss": 0.001, "num_tokens": 787245152.0, "reward": 0.321875, "reward_std": 0.07827338352799415, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9360557675361634, "step": 11370 }, { "completion_length": 410.4, "completions/clipped_ratio": 0.0, "completions/max_length": 410.4, "completions/max_terminated_length": 410.4, "completions/mean_length": 90.70859375, "completions/mean_terminated_length": 90.70859375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010294359867290995, "frac_reward_zero_std": 0.9125, "grad_norm": 10.266326904296875, "kl": 2.0235925601795315, "learning_rate": 4.652857142857143e-07, "loss": 0.002, "num_tokens": 787559891.0, "reward": 0.3828125, "reward_std": 0.07391286939382553, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9174596071243286, "step": 11375 }, { "completion_length": 375.2, "completions/clipped_ratio": 0.0, "completions/max_length": 375.2, "completions/max_terminated_length": 375.2, "completions/mean_length": 88.0765625, "completions/mean_terminated_length": 88.0765625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010298884860639256, "frac_reward_zero_std": 0.9375, "grad_norm": 14.557384490966797, "kl": 1.7861022420227528, "learning_rate": 4.652460317460317e-07, "loss": 0.0018, "num_tokens": 787871421.0, "reward": 0.1984375, "reward_std": 0.0532900981605053, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.9704809188842773, "step": 11380 }, { "completion_length": 328.8, "completions/clipped_ratio": 0.0, "completions/max_length": 328.8, "completions/max_terminated_length": 328.8, "completions/mean_length": 90.38671875, "completions/mean_terminated_length": 90.38671875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010303409853987514, "frac_reward_zero_std": 0.94375, "grad_norm": 5.292203426361084, "kl": 3.2775265853852034, "learning_rate": 4.6520634920634915e-07, "loss": 0.0033, "num_tokens": 788186260.0, "reward": 0.3921875, "reward_std": 0.04660856761038303, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9107851266860962, "step": 11385 }, { "completion_length": 335.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 85.9375, "completions/mean_terminated_length": 85.9375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010307934847335774, "frac_reward_zero_std": 0.88125, "grad_norm": 14.173686981201172, "kl": 5.480843528499827, "learning_rate": 4.6516666666666666e-07, "loss": 0.0055, "num_tokens": 788493892.0, "reward": 0.2609375, "reward_std": 0.10037047266960145, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9473122119903564, "step": 11390 }, { "completion_length": 439.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 439.4, "completions/max_terminated_length": 425.6, "completions/mean_length": 95.65703125, "completions/mean_terminated_length": 95.11903228759766, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010312459840684034, "frac_reward_zero_std": 0.95625, "grad_norm": 2.8517746925354004, "kl": 1.9153701910632663, "learning_rate": 4.651269841269841e-07, "loss": 0.0019, "num_tokens": 788816253.0, "reward": 0.3515625, "reward_std": 0.03298586942255497, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9160271286964417, "step": 11395 }, { "completion_length": 483.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 483.4, "completions/max_terminated_length": 447.0, "completions/mean_length": 93.059375, "completions/mean_terminated_length": 92.54083099365235, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010316984834032294, "frac_reward_zero_std": 0.93125, "grad_norm": 3.6068506240844727, "kl": 3.0012746378197335, "learning_rate": 4.6508730158730156e-07, "loss": 0.003, "num_tokens": 789132281.0, "reward": 0.3734375, "reward_std": 0.05339745357632637, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9181925773620605, "step": 11400 }, { "completion_length": 394.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 394.6, "completions/max_terminated_length": 376.4, "completions/mean_length": 79.2109375, "completions/mean_terminated_length": 78.66869506835937, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010321509827380554, "frac_reward_zero_std": 0.9625, "grad_norm": 1.2224459648132324, "kl": 1.2875732966233044, "learning_rate": 4.65047619047619e-07, "loss": 0.0013, "num_tokens": 789426927.0, "reward": 0.440625, "reward_std": 0.0319843377918005, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8750808358192443, "step": 11405 }, { "completion_length": 408.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 93.35078125, "completions/mean_terminated_length": 93.35078125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010326034820728814, "frac_reward_zero_std": 0.9375, "grad_norm": 2.7502379417419434, "kl": 0.9250271119177341, "learning_rate": 4.6500793650793647e-07, "loss": 0.0009, "num_tokens": 789745712.0, "reward": 0.3515625, "reward_std": 0.05329009778797626, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9160438656806946, "step": 11410 }, { "completion_length": 324.4, "completions/clipped_ratio": 0.0, "completions/max_length": 324.4, "completions/max_terminated_length": 324.4, "completions/mean_length": 86.09765625, "completions/mean_terminated_length": 86.09765625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.010330559814077073, "frac_reward_zero_std": 0.8875, "grad_norm": 2.62914776802063, "kl": 3.257369570550509, "learning_rate": 4.64968253968254e-07, "loss": 0.0033, "num_tokens": 790053269.0, "reward": 0.259375, "reward_std": 0.09548023045063019, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9671732544898987, "step": 11415 }, { "completion_length": 295.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 88.07421875, "completions/mean_terminated_length": 88.07421875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010335084807425333, "frac_reward_zero_std": 0.95625, "grad_norm": 13.3726224899292, "kl": 3.6565620628418403, "learning_rate": 4.649285714285714e-07, "loss": 0.0037, "num_tokens": 790364060.0, "reward": 0.2984375, "reward_std": 0.035035816580057146, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9471090078353882, "step": 11420 }, { "completion_length": 436.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.2, "completions/max_terminated_length": 420.6, "completions/mean_length": 89.9234375, "completions/mean_terminated_length": 89.39844207763672, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010339609800773593, "frac_reward_zero_std": 0.93125, "grad_norm": 21.193838119506836, "kl": 1.2467667642398736, "learning_rate": 4.648888888888889e-07, "loss": 0.0012, "num_tokens": 790674538.0, "reward": 0.2453125, "reward_std": 0.05839250646531582, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9499157905578614, "step": 11425 }, { "completion_length": 378.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 378.4, "completions/max_terminated_length": 285.4, "completions/mean_length": 92.31875, "completions/mean_terminated_length": 91.79652709960938, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010344134794121853, "frac_reward_zero_std": 0.9, "grad_norm": 5.775417327880859, "kl": 1.4163180771865882, "learning_rate": 4.6484920634920634e-07, "loss": 0.0014, "num_tokens": 790992258.0, "reward": 0.3453125, "reward_std": 0.07686150297522545, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.8846402645111084, "step": 11430 }, { "completion_length": 575.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 575.6, "completions/max_terminated_length": 364.4, "completions/mean_length": 83.57109375, "completions/mean_terminated_length": 82.49127655029297, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010348659787470113, "frac_reward_zero_std": 0.94375, "grad_norm": 5.343926906585693, "kl": 1.2250399856828154, "learning_rate": 4.6480952380952374e-07, "loss": 0.0012, "num_tokens": 791293933.0, "reward": 0.3453125, "reward_std": 0.053866710513830185, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9322997093200683, "step": 11435 }, { "completion_length": 379.4, "completions/clipped_ratio": 0.0, "completions/max_length": 379.4, "completions/max_terminated_length": 379.4, "completions/mean_length": 91.1796875, "completions/mean_terminated_length": 91.1796875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010353184780818371, "frac_reward_zero_std": 0.95625, "grad_norm": 7.463128566741943, "kl": 1.026465772325173, "learning_rate": 4.6476984126984125e-07, "loss": 0.001, "num_tokens": 791609955.0, "reward": 0.4, "reward_std": 0.035247981920838355, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.8891234278678894, "step": 11440 }, { "completion_length": 277.4, "completions/clipped_ratio": 0.0, "completions/max_length": 277.4, "completions/max_terminated_length": 277.4, "completions/mean_length": 86.33046875, "completions/mean_terminated_length": 86.33046875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010357709774166632, "frac_reward_zero_std": 0.93125, "grad_norm": 10.898134231567383, "kl": 5.6235141946701335, "learning_rate": 4.647301587301587e-07, "loss": 0.0056, "num_tokens": 791918450.0, "reward": 0.2359375, "reward_std": 0.06112642176449299, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9641900897026062, "step": 11445 }, { "completion_length": 494.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 494.0, "completions/max_terminated_length": 451.6, "completions/mean_length": 92.70546875, "completions/mean_terminated_length": 91.64803771972656, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010362234767514892, "frac_reward_zero_std": 0.95, "grad_norm": 3.135101318359375, "kl": 4.46786453591194, "learning_rate": 4.646904761904762e-07, "loss": 0.0045, "num_tokens": 792235873.0, "reward": 0.375, "reward_std": 0.04218915030360222, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.8974393129348754, "step": 11450 }, { "completion_length": 314.2, "completions/clipped_ratio": 0.0, "completions/max_length": 314.2, "completions/max_terminated_length": 314.2, "completions/mean_length": 91.87578125, "completions/mean_terminated_length": 91.87578125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010366759760863152, "frac_reward_zero_std": 0.95, "grad_norm": 15.590027809143066, "kl": 8.893491226341576, "learning_rate": 4.646507936507936e-07, "loss": 0.0089, "num_tokens": 792552794.0, "reward": 0.4296875, "reward_std": 0.04035136960446835, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8998390197753906, "step": 11455 }, { "completion_length": 386.6, "completions/clipped_ratio": 0.0, "completions/max_length": 386.6, "completions/max_terminated_length": 386.6, "completions/mean_length": 92.53828125, "completions/mean_terminated_length": 92.53828125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010371284754211412, "frac_reward_zero_std": 0.9375, "grad_norm": 0.3469141721725464, "kl": 7.712203212780878, "learning_rate": 4.6461111111111106e-07, "loss": 0.0077, "num_tokens": 792870867.0, "reward": 0.3546875, "reward_std": 0.0523949459195137, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9284203290939331, "step": 11460 }, { "completion_length": 367.2, "completions/clipped_ratio": 0.0, "completions/max_length": 367.2, "completions/max_terminated_length": 367.2, "completions/mean_length": 96.51171875, "completions/mean_terminated_length": 96.51171875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.010375809747559672, "frac_reward_zero_std": 0.925, "grad_norm": 7.654244899749756, "kl": 3.5583616249030454, "learning_rate": 4.6457142857142857e-07, "loss": 0.0036, "num_tokens": 793195314.0, "reward": 0.43125, "reward_std": 0.05939599797129631, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.895192289352417, "step": 11465 }, { "completion_length": 399.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 399.2, "completions/max_terminated_length": 331.2, "completions/mean_length": 93.52421875, "completions/mean_terminated_length": 93.00043640136718, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01038033474090793, "frac_reward_zero_std": 0.925, "grad_norm": 7.989089488983154, "kl": 2.3143845376675016, "learning_rate": 4.6453174603174597e-07, "loss": 0.0023, "num_tokens": 793514417.0, "reward": 0.4359375, "reward_std": 0.062128932774066926, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.886716651916504, "step": 11470 }, { "completion_length": 515.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 515.4, "completions/max_terminated_length": 428.4, "completions/mean_length": 96.040625, "completions/mean_terminated_length": 94.99913177490234, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01038485973425619, "frac_reward_zero_std": 0.9, "grad_norm": 8.667787551879883, "kl": 3.5334758378448896, "learning_rate": 4.644920634920635e-07, "loss": 0.0035, "num_tokens": 793837061.0, "reward": 0.2828125, "reward_std": 0.08869035840034485, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9427711367607117, "step": 11475 }, { "completion_length": 422.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 422.6, "completions/max_terminated_length": 324.6, "completions/mean_length": 96.07421875, "completions/mean_terminated_length": 95.00045471191406, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01038938472760445, "frac_reward_zero_std": 0.95, "grad_norm": 0.03351263701915741, "kl": 3.2605190109345132, "learning_rate": 4.6445238095238093e-07, "loss": 0.0033, "num_tokens": 794160260.0, "reward": 0.4671875, "reward_std": 0.043556107208132744, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.873958420753479, "step": 11480 }, { "completion_length": 424.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 424.4, "completions/max_terminated_length": 421.8, "completions/mean_length": 100.575, "completions/mean_terminated_length": 100.06457214355468, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01039390972095271, "frac_reward_zero_std": 0.90625, "grad_norm": 15.376875877380371, "kl": 5.461459418106824, "learning_rate": 4.644126984126984e-07, "loss": 0.0055, "num_tokens": 794490364.0, "reward": 0.296875, "reward_std": 0.08017104715108872, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.940253472328186, "step": 11485 }, { "completion_length": 353.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 92.5703125, "completions/mean_terminated_length": 92.5703125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01039843471430097, "frac_reward_zero_std": 0.93125, "grad_norm": 4.58311128616333, "kl": 3.627679049759172, "learning_rate": 4.6437301587301584e-07, "loss": 0.0036, "num_tokens": 794809462.0, "reward": 0.2984375, "reward_std": 0.05907647348940372, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9505215764045716, "step": 11490 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 89.95234375, "completions/mean_terminated_length": 89.95234375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010402959707649229, "frac_reward_zero_std": 0.94375, "grad_norm": 11.999092102050781, "kl": 6.257911040796898, "learning_rate": 4.643333333333333e-07, "loss": 0.0063, "num_tokens": 795123769.0, "reward": 0.259375, "reward_std": 0.04408681578934193, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9519876003265381, "step": 11495 }, { "completion_length": 464.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 464.8, "completions/max_terminated_length": 406.2, "completions/mean_length": 94.903125, "completions/mean_terminated_length": 93.84955444335938, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010407484700997489, "frac_reward_zero_std": 0.89375, "grad_norm": 8.271516799926758, "kl": 4.111417953320779, "learning_rate": 4.642936507936508e-07, "loss": 0.0041, "num_tokens": 795445981.0, "reward": 0.1953125, "reward_std": 0.09285112284123898, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9774738430976868, "step": 11500 }, { "completion_length": 518.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 518.2, "completions/max_terminated_length": 465.6, "completions/mean_length": 93.62265625, "completions/mean_terminated_length": 92.57236175537109, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01041200969434575, "frac_reward_zero_std": 0.90625, "grad_norm": 15.331501960754395, "kl": 6.506922411255073, "learning_rate": 4.6425396825396826e-07, "loss": 0.0065, "num_tokens": 795765338.0, "reward": 0.325, "reward_std": 0.08311615251004696, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9396474242210389, "step": 11505 }, { "completion_length": 315.6, "completions/clipped_ratio": 0.0, "completions/max_length": 315.6, "completions/max_terminated_length": 315.6, "completions/mean_length": 90.5953125, "completions/mean_terminated_length": 90.5953125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01041653468769401, "frac_reward_zero_std": 0.925, "grad_norm": 13.710043907165527, "kl": 4.013428645534441, "learning_rate": 4.6421428571428566e-07, "loss": 0.004, "num_tokens": 796079916.0, "reward": 0.35, "reward_std": 0.06917490884661674, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9210214972496032, "step": 11510 }, { "completion_length": 414.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 91.425, "completions/mean_terminated_length": 91.425, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01042105968104227, "frac_reward_zero_std": 0.925, "grad_norm": 10.781082153320312, "kl": 6.6385640283348035, "learning_rate": 4.6417460317460316e-07, "loss": 0.0066, "num_tokens": 796395468.0, "reward": 0.4140625, "reward_std": 0.0675967674702406, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.9030631422996521, "step": 11515 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/max_terminated_length": 359.2, "completions/mean_length": 93.78125, "completions/mean_terminated_length": 93.78125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01042558467439053, "frac_reward_zero_std": 0.9, "grad_norm": 13.960601806640625, "kl": 7.027175072184764, "learning_rate": 4.641349206349206e-07, "loss": 0.007, "num_tokens": 796715524.0, "reward": 0.4125, "reward_std": 0.08800737112760544, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.8966872572898865, "step": 11520 }, { "completion_length": 389.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 389.0, "completions/max_terminated_length": 336.4, "completions/mean_length": 93.10703125, "completions/mean_terminated_length": 92.064013671875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010430109667738788, "frac_reward_zero_std": 0.95625, "grad_norm": 8.86798095703125, "kl": 5.211517348606139, "learning_rate": 4.6409523809523807e-07, "loss": 0.0052, "num_tokens": 797032277.0, "reward": 0.415625, "reward_std": 0.0370867446064949, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.8852375626564026, "step": 11525 }, { "completion_length": 287.6, "completions/clipped_ratio": 0.0, "completions/max_length": 287.6, "completions/max_terminated_length": 287.6, "completions/mean_length": 86.31875, "completions/mean_terminated_length": 86.31875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010434634661087048, "frac_reward_zero_std": 0.925, "grad_norm": 10.953014373779297, "kl": 2.208602358191274, "learning_rate": 4.6405555555555553e-07, "loss": 0.0022, "num_tokens": 797341301.0, "reward": 0.3390625, "reward_std": 0.06460321247577668, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9248692750930786, "step": 11530 }, { "completion_length": 271.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 87.153125, "completions/mean_terminated_length": 87.153125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010439159654435308, "frac_reward_zero_std": 0.9375, "grad_norm": 0.14243514835834503, "kl": 1.5170025649713352, "learning_rate": 4.64015873015873e-07, "loss": 0.0015, "num_tokens": 797649793.0, "reward": 0.3484375, "reward_std": 0.05124015174806118, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9304957747459411, "step": 11535 }, { "completion_length": 387.2, "completions/clipped_ratio": 0.0, "completions/max_length": 387.2, "completions/max_terminated_length": 387.2, "completions/mean_length": 88.4109375, "completions/mean_terminated_length": 88.4109375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010443684647783568, "frac_reward_zero_std": 0.90625, "grad_norm": 11.07612419128418, "kl": 4.471938525908627, "learning_rate": 4.639761904761905e-07, "loss": 0.0045, "num_tokens": 797960759.0, "reward": 0.4734375, "reward_std": 0.0724411003291607, "rewards/verify_chess_move/mean": 0.4734375, "rewards/verify_chess_move/std": 0.8774821400642395, "step": 11540 }, { "completion_length": 371.8, "completions/clipped_ratio": 0.0, "completions/max_length": 371.8, "completions/max_terminated_length": 371.8, "completions/mean_length": 87.25078125, "completions/mean_terminated_length": 87.25078125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010448209641131828, "frac_reward_zero_std": 0.9625, "grad_norm": 6.923981189727783, "kl": 4.896994135668502, "learning_rate": 4.639365079365079e-07, "loss": 0.0049, "num_tokens": 798269792.0, "reward": 0.271875, "reward_std": 0.03582459464669228, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9379767656326294, "step": 11545 }, { "completion_length": 324.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 92.52578125, "completions/mean_terminated_length": 92.52578125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010452734634480087, "frac_reward_zero_std": 0.9125, "grad_norm": 10.711873054504395, "kl": 6.317943787621334, "learning_rate": 4.638968253968254e-07, "loss": 0.0063, "num_tokens": 798588065.0, "reward": 0.415625, "reward_std": 0.0773297768086195, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9071672797203064, "step": 11550 }, { "completion_length": 335.8, "completions/clipped_ratio": 0.0, "completions/max_length": 335.8, "completions/max_terminated_length": 335.8, "completions/mean_length": 79.57890625, "completions/mean_terminated_length": 79.57890625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010457259627828347, "frac_reward_zero_std": 0.9375, "grad_norm": 6.930018901824951, "kl": 3.1331517609767614, "learning_rate": 4.6385714285714285e-07, "loss": 0.0031, "num_tokens": 798884662.0, "reward": 0.346875, "reward_std": 0.056236181780695914, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9142839431762695, "step": 11555 }, { "completion_length": 434.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 434.2, "completions/max_terminated_length": 411.4, "completions/mean_length": 102.55, "completions/mean_terminated_length": 102.03720092773438, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010461784621176607, "frac_reward_zero_std": 0.925, "grad_norm": 11.175651550292969, "kl": 3.093607999186497, "learning_rate": 4.6381746031746025e-07, "loss": 0.0031, "num_tokens": 799219190.0, "reward": 0.315625, "reward_std": 0.06413140818476677, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.943393099308014, "step": 11560 }, { "completion_length": 419.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 419.6, "completions/max_terminated_length": 385.0, "completions/mean_length": 89.79609375, "completions/mean_terminated_length": 89.27510223388671, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010466309614524867, "frac_reward_zero_std": 0.9375, "grad_norm": 2.982090473175049, "kl": 2.1383072797209026, "learning_rate": 4.6377777777777776e-07, "loss": 0.0021, "num_tokens": 799530321.0, "reward": 0.4015625, "reward_std": 0.05329010002315045, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9142991304397583, "step": 11565 }, { "completion_length": 517.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 517.8, "completions/max_terminated_length": 456.6, "completions/mean_length": 91.6, "completions/mean_terminated_length": 91.07202911376953, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010470834607873127, "frac_reward_zero_std": 0.95625, "grad_norm": 7.769857883453369, "kl": 2.0424556128215046, "learning_rate": 4.637380952380952e-07, "loss": 0.002, "num_tokens": 799846441.0, "reward": 0.159375, "reward_std": 0.036827107146382335, "rewards/verify_chess_move/mean": 0.159375, "rewards/verify_chess_move/std": 0.9802672147750855, "step": 11570 }, { "completion_length": 382.6, "completions/clipped_ratio": 0.0, "completions/max_length": 382.6, "completions/max_terminated_length": 382.6, "completions/mean_length": 96.28046875, "completions/mean_terminated_length": 96.28046875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010475359601221387, "frac_reward_zero_std": 0.96875, "grad_norm": 0.037647418677806854, "kl": 2.347576040425338, "learning_rate": 4.636984126984127e-07, "loss": 0.0023, "num_tokens": 800170208.0, "reward": 0.2640625, "reward_std": 0.028930897638201714, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9611958384513855, "step": 11575 }, { "completion_length": 307.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 83.57421875, "completions/mean_terminated_length": 83.57421875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010479884594569645, "frac_reward_zero_std": 0.9625, "grad_norm": 0.17419709265232086, "kl": 1.4256000716239214, "learning_rate": 4.636587301587301e-07, "loss": 0.0014, "num_tokens": 800473527.0, "reward": 0.3375, "reward_std": 0.03377464823424816, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9285631775856018, "step": 11580 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 84.54296875, "completions/mean_terminated_length": 84.54296875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010484409587917905, "frac_reward_zero_std": 0.89375, "grad_norm": 2.3596739768981934, "kl": 3.2041874835500495, "learning_rate": 4.6361904761904757e-07, "loss": 0.0032, "num_tokens": 800776390.0, "reward": 0.3625, "reward_std": 0.09353410974144935, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.923979663848877, "step": 11585 }, { "completion_length": 421.8, "completions/clipped_ratio": 0.0, "completions/max_length": 421.8, "completions/max_terminated_length": 421.8, "completions/mean_length": 97.8296875, "completions/mean_terminated_length": 97.8296875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010488934581266166, "frac_reward_zero_std": 0.90625, "grad_norm": 13.88459300994873, "kl": 2.5091606735251846, "learning_rate": 4.635793650793651e-07, "loss": 0.0025, "num_tokens": 801101988.0, "reward": 0.2375, "reward_std": 0.08222099840641021, "rewards/verify_chess_move/mean": 0.2375, "rewards/verify_chess_move/std": 0.9678288459777832, "step": 11590 }, { "completion_length": 290.2, "completions/clipped_ratio": 0.0, "completions/max_length": 290.2, "completions/max_terminated_length": 290.2, "completions/mean_length": 92.94453125, "completions/mean_terminated_length": 92.94453125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010493459574614426, "frac_reward_zero_std": 0.93125, "grad_norm": 5.79805326461792, "kl": 2.641557881818153, "learning_rate": 4.6353968253968253e-07, "loss": 0.0026, "num_tokens": 801421629.0, "reward": 0.3421875, "reward_std": 0.05950080454349518, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.930249035358429, "step": 11595 }, { "completion_length": 380.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 380.6, "completions/max_terminated_length": 285.0, "completions/mean_length": 88.6578125, "completions/mean_terminated_length": 88.14109802246094, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010497984567962686, "frac_reward_zero_std": 0.94375, "grad_norm": 27.703283309936523, "kl": 3.1576402325183155, "learning_rate": 4.635e-07, "loss": 0.0032, "num_tokens": 801733359.0, "reward": 0.4078125, "reward_std": 0.04545377530157566, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.902732789516449, "step": 11600 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 88.340625, "completions/mean_terminated_length": 88.340625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010502509561310944, "frac_reward_zero_std": 0.94375, "grad_norm": 0.5954638123512268, "kl": 3.68754471935099, "learning_rate": 4.6346031746031744e-07, "loss": 0.0037, "num_tokens": 802045275.0, "reward": 0.421875, "reward_std": 0.04797552525997162, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.9004475235939026, "step": 11605 }, { "completion_length": 396.2, "completions/clipped_ratio": 0.0, "completions/max_length": 396.2, "completions/max_terminated_length": 396.2, "completions/mean_length": 93.471875, "completions/mean_terminated_length": 93.471875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.010507034554659204, "frac_reward_zero_std": 0.95625, "grad_norm": 22.21535873413086, "kl": 4.0124359912704675, "learning_rate": 4.634206349206349e-07, "loss": 0.004, "num_tokens": 802365983.0, "reward": 0.28125, "reward_std": 0.034352827817201614, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9477265834808349, "step": 11610 }, { "completion_length": 440.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 89.13125, "completions/mean_terminated_length": 89.13125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010511559548007464, "frac_reward_zero_std": 0.95, "grad_norm": 7.557254314422607, "kl": 3.373003961937502, "learning_rate": 4.6338095238095235e-07, "loss": 0.0034, "num_tokens": 802678583.0, "reward": 0.3734375, "reward_std": 0.04376729428768158, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9247393608093262, "step": 11615 }, { "completion_length": 277.6, "completions/clipped_ratio": 0.0, "completions/max_length": 277.6, "completions/max_terminated_length": 277.6, "completions/mean_length": 85.134375, "completions/mean_terminated_length": 85.134375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010516084541355724, "frac_reward_zero_std": 0.94375, "grad_norm": 5.007608413696289, "kl": 1.7377289654454215, "learning_rate": 4.633412698412698e-07, "loss": 0.0017, "num_tokens": 802986179.0, "reward": 0.3640625, "reward_std": 0.04660856761038303, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.918877899646759, "step": 11620 }, { "completion_length": 306.2, "completions/clipped_ratio": 0.0, "completions/max_length": 306.2, "completions/max_terminated_length": 306.2, "completions/mean_length": 94.29765625, "completions/mean_terminated_length": 94.29765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010520609534703984, "frac_reward_zero_std": 0.94375, "grad_norm": 0.46270957589149475, "kl": 1.347532957047224, "learning_rate": 4.633015873015873e-07, "loss": 0.0013, "num_tokens": 803309472.0, "reward": 0.40625, "reward_std": 0.052075419947504996, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9104458808898925, "step": 11625 }, { "completion_length": 644.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 644.6, "completions/max_terminated_length": 628.0, "completions/mean_length": 91.64765625, "completions/mean_terminated_length": 90.58480529785156, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010525134528052245, "frac_reward_zero_std": 0.9125, "grad_norm": 16.191051483154297, "kl": 7.435912717168685, "learning_rate": 4.6326190476190476e-07, "loss": 0.0074, "num_tokens": 803623661.0, "reward": 0.4421875, "reward_std": 0.07391287013888359, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8854249358177185, "step": 11630 }, { "completion_length": 388.6, "completions/clipped_ratio": 0.0, "completions/max_length": 388.6, "completions/max_terminated_length": 388.6, "completions/mean_length": 83.28125, "completions/mean_terminated_length": 83.28125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010529659521400503, "frac_reward_zero_std": 0.95625, "grad_norm": 2.2151873111724854, "kl": 3.805040143080987, "learning_rate": 4.6322222222222217e-07, "loss": 0.0038, "num_tokens": 803926077.0, "reward": 0.40625, "reward_std": 0.037981899082660676, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9031651735305786, "step": 11635 }, { "completion_length": 331.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 83.08671875, "completions/mean_terminated_length": 83.08671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010534184514748763, "frac_reward_zero_std": 0.95, "grad_norm": 4.264224052429199, "kl": 3.0734626164776273, "learning_rate": 4.6318253968253967e-07, "loss": 0.0031, "num_tokens": 804228492.0, "reward": 0.36875, "reward_std": 0.04192951284348965, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9274573802947998, "step": 11640 }, { "completion_length": 477.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 477.8, "completions/max_terminated_length": 476.4, "completions/mean_length": 91.059375, "completions/mean_terminated_length": 90.52908477783203, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010538709508097023, "frac_reward_zero_std": 0.94375, "grad_norm": 7.827868938446045, "kl": 2.9419407783891076, "learning_rate": 4.6314285714285713e-07, "loss": 0.0029, "num_tokens": 804544144.0, "reward": 0.20625, "reward_std": 0.05023665800690651, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9685636758804321, "step": 11645 }, { "completion_length": 486.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 486.6, "completions/max_terminated_length": 391.0, "completions/mean_length": 92.02421875, "completions/mean_terminated_length": 91.49524993896485, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010543234501445283, "frac_reward_zero_std": 0.91875, "grad_norm": 14.67703628540039, "kl": 3.325747394876089, "learning_rate": 4.631031746031746e-07, "loss": 0.0033, "num_tokens": 804861087.0, "reward": 0.315625, "reward_std": 0.07543210834264755, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9382569551467895, "step": 11650 }, { "completion_length": 370.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 88.709375, "completions/mean_terminated_length": 88.709375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010547759494793543, "frac_reward_zero_std": 0.91875, "grad_norm": 6.02962064743042, "kl": 3.3600993001600727, "learning_rate": 4.6306349206349203e-07, "loss": 0.0034, "num_tokens": 805171851.0, "reward": 0.409375, "reward_std": 0.06654835119843483, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.909915292263031, "step": 11655 }, { "completion_length": 422.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 96.2921875, "completions/mean_terminated_length": 96.2921875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.010552284488141802, "frac_reward_zero_std": 0.95625, "grad_norm": 16.501413345336914, "kl": 1.0445401727687567, "learning_rate": 4.630238095238095e-07, "loss": 0.001, "num_tokens": 805496777.0, "reward": 0.353125, "reward_std": 0.03524798266589642, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9318266987800599, "step": 11660 }, { "completion_length": 470.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 470.4, "completions/max_terminated_length": 445.4, "completions/mean_length": 88.70234375, "completions/mean_terminated_length": 88.17195892333984, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.010556809481490062, "frac_reward_zero_std": 0.95625, "grad_norm": 9.048562049865723, "kl": 0.3124992823461071, "learning_rate": 4.62984126984127e-07, "loss": 0.0003, "num_tokens": 805808740.0, "reward": 0.2859375, "reward_std": 0.033669838309288026, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9584157824516296, "step": 11665 }, { "completion_length": 374.6, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/max_terminated_length": 374.6, "completions/mean_length": 93.6796875, "completions/mean_terminated_length": 93.6796875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010561334474838322, "frac_reward_zero_std": 0.925, "grad_norm": 6.620248317718506, "kl": 1.161953494604677, "learning_rate": 4.629444444444444e-07, "loss": 0.0012, "num_tokens": 806127154.0, "reward": 0.190625, "reward_std": 0.0671249631792307, "rewards/verify_chess_move/mean": 0.190625, "rewards/verify_chess_move/std": 0.970536744594574, "step": 11670 }, { "completion_length": 414.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 97.7125, "completions/mean_terminated_length": 97.7125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010565859468186582, "frac_reward_zero_std": 0.9, "grad_norm": 8.838102340698242, "kl": 0.8470650630770251, "learning_rate": 4.629047619047619e-07, "loss": 0.0008, "num_tokens": 806454978.0, "reward": 0.33125, "reward_std": 0.08164438083767891, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9306567549705506, "step": 11675 }, { "completion_length": 292.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 86.5484375, "completions/mean_terminated_length": 86.5484375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010570384461534842, "frac_reward_zero_std": 0.93125, "grad_norm": 5.4487810134887695, "kl": 1.3295611050212757, "learning_rate": 4.6286507936507936e-07, "loss": 0.0013, "num_tokens": 806764336.0, "reward": 0.346875, "reward_std": 0.054764414206147194, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9276525259017945, "step": 11680 }, { "completion_length": 552.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 552.8, "completions/max_terminated_length": 493.4, "completions/mean_length": 96.9875, "completions/mean_terminated_length": 96.4556167602539, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010574909454883102, "frac_reward_zero_std": 0.875, "grad_norm": 26.755773544311523, "kl": 4.571128001314355, "learning_rate": 4.628253968253968e-07, "loss": 0.0046, "num_tokens": 807089136.0, "reward": 0.31875, "reward_std": 0.110209272056818, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9441000699996949, "step": 11685 }, { "completion_length": 277.8, "completions/clipped_ratio": 0.0, "completions/max_length": 277.8, "completions/max_terminated_length": 277.8, "completions/mean_length": 83.40625, "completions/mean_terminated_length": 83.40625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01057943444823136, "frac_reward_zero_std": 0.96875, "grad_norm": 0.9146697521209717, "kl": 1.4612806035671384, "learning_rate": 4.6278571428571427e-07, "loss": 0.0015, "num_tokens": 807392344.0, "reward": 0.384375, "reward_std": 0.027563939243555068, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9166916370391845, "step": 11690 }, { "completion_length": 551.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 551.4, "completions/max_terminated_length": 548.8, "completions/mean_length": 99.2625, "completions/mean_terminated_length": 98.19711456298828, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01058395944157962, "frac_reward_zero_std": 0.90625, "grad_norm": 0.3977125585079193, "kl": 10.947328920650762, "learning_rate": 4.627460317460317e-07, "loss": 0.0109, "num_tokens": 807717872.0, "reward": 0.415625, "reward_std": 0.08448409140110016, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.905742859840393, "step": 11695 }, { "completion_length": 486.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 486.8, "completions/max_terminated_length": 413.4, "completions/mean_length": 88.178125, "completions/mean_terminated_length": 87.6519287109375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01058848443492788, "frac_reward_zero_std": 0.875, "grad_norm": 7.471502304077148, "kl": 3.12159693967551, "learning_rate": 4.627063492063492e-07, "loss": 0.0031, "num_tokens": 808027812.0, "reward": 0.2953125, "reward_std": 0.1056860253214836, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9538237690925598, "step": 11700 }, { "completion_length": 282.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 83.57890625, "completions/mean_terminated_length": 83.57890625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01059300942827614, "frac_reward_zero_std": 0.93125, "grad_norm": 6.811197280883789, "kl": 3.3788171432446688, "learning_rate": 4.6266666666666663e-07, "loss": 0.0034, "num_tokens": 808331465.0, "reward": 0.4046875, "reward_std": 0.06386033743619919, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.905887770652771, "step": 11705 }, { "completion_length": 330.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 93.23515625, "completions/mean_terminated_length": 93.23515625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0105975344216244, "frac_reward_zero_std": 0.93125, "grad_norm": 3.5133979320526123, "kl": 3.9308405028190463, "learning_rate": 4.626269841269841e-07, "loss": 0.0039, "num_tokens": 808650838.0, "reward": 0.3078125, "reward_std": 0.0615507535636425, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9511208295822143, "step": 11710 }, { "completion_length": 421.8, "completions/clipped_ratio": 0.0, "completions/max_length": 421.8, "completions/max_terminated_length": 421.8, "completions/mean_length": 94.40625, "completions/mean_terminated_length": 94.40625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01060205941497266, "frac_reward_zero_std": 0.93125, "grad_norm": 8.751608848571777, "kl": 2.8748340775608083, "learning_rate": 4.625873015873016e-07, "loss": 0.0029, "num_tokens": 808969790.0, "reward": 0.3171875, "reward_std": 0.057497349381446836, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9450085997581482, "step": 11715 }, { "completion_length": 498.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 498.0, "completions/max_terminated_length": 396.2, "completions/mean_length": 89.52578125, "completions/mean_terminated_length": 88.9932357788086, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01060658440832092, "frac_reward_zero_std": 0.94375, "grad_norm": 4.448764324188232, "kl": 1.1002209499594755, "learning_rate": 4.6254761904761904e-07, "loss": 0.0011, "num_tokens": 809284215.0, "reward": 0.328125, "reward_std": 0.04750470370054245, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9308826327323914, "step": 11720 }, { "completion_length": 346.8, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/max_terminated_length": 346.8, "completions/mean_length": 90.99140625, "completions/mean_terminated_length": 90.99140625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01061110940166918, "frac_reward_zero_std": 0.90625, "grad_norm": 11.97076416015625, "kl": 2.434481023391709, "learning_rate": 4.625079365079365e-07, "loss": 0.0024, "num_tokens": 809597564.0, "reward": 0.4484375, "reward_std": 0.08153800964355469, "rewards/verify_chess_move/mean": 0.4484375, "rewards/verify_chess_move/std": 0.8829939484596252, "step": 11725 }, { "completion_length": 474.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 474.6, "completions/max_terminated_length": 392.2, "completions/mean_length": 91.696875, "completions/mean_terminated_length": 91.17064819335937, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.01061563439501744, "frac_reward_zero_std": 0.875, "grad_norm": 15.783365249633789, "kl": 5.384862202801742, "learning_rate": 4.6246825396825395e-07, "loss": 0.0054, "num_tokens": 809913680.0, "reward": 0.321875, "reward_std": 0.10841895639896393, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9450072407722473, "step": 11730 }, { "completion_length": 437.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 437.8, "completions/max_terminated_length": 383.6, "completions/mean_length": 87.6796875, "completions/mean_terminated_length": 87.14662475585938, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.0106201593883657, "frac_reward_zero_std": 0.9375, "grad_norm": 4.786325931549072, "kl": 3.266298176499549, "learning_rate": 4.624285714285714e-07, "loss": 0.0033, "num_tokens": 810223326.0, "reward": 0.2515625, "reward_std": 0.04829504787921905, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9604734420776367, "step": 11735 }, { "completion_length": 339.6, "completions/clipped_ratio": 0.0, "completions/max_length": 339.6, "completions/max_terminated_length": 339.6, "completions/mean_length": 87.88046875, "completions/mean_terminated_length": 87.88046875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01062468438171396, "frac_reward_zero_std": 0.925, "grad_norm": 3.512718439102173, "kl": 4.301662080944515, "learning_rate": 4.6238888888888886e-07, "loss": 0.0043, "num_tokens": 810532397.0, "reward": 0.4140625, "reward_std": 0.06554681956768035, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.8943340897560119, "step": 11740 }, { "completion_length": 384.8, "completions/clipped_ratio": 0.0, "completions/max_length": 384.8, "completions/max_terminated_length": 384.8, "completions/mean_length": 94.99609375, "completions/mean_terminated_length": 94.99609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010629209375062218, "frac_reward_zero_std": 0.925, "grad_norm": 3.5378267765045166, "kl": 5.3330501021118835, "learning_rate": 4.623492063492063e-07, "loss": 0.0053, "num_tokens": 810853744.0, "reward": 0.359375, "reward_std": 0.06465068459510803, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9223859786987305, "step": 11745 }, { "completion_length": 387.2, "completions/clipped_ratio": 0.0, "completions/max_length": 387.2, "completions/max_terminated_length": 387.2, "completions/mean_length": 91.94375, "completions/mean_terminated_length": 91.94375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010633734368410478, "frac_reward_zero_std": 0.9, "grad_norm": 12.512605667114258, "kl": 5.188854506448843, "learning_rate": 4.623095238095238e-07, "loss": 0.0052, "num_tokens": 811170920.0, "reward": 0.278125, "reward_std": 0.08868938237428665, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9560907483100891, "step": 11750 }, { "completion_length": 336.6, "completions/clipped_ratio": 0.0, "completions/max_length": 336.6, "completions/max_terminated_length": 336.6, "completions/mean_length": 88.5890625, "completions/mean_terminated_length": 88.5890625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010638259361758738, "frac_reward_zero_std": 0.94375, "grad_norm": 3.5464389324188232, "kl": 5.176676669344306, "learning_rate": 4.6226984126984127e-07, "loss": 0.0052, "num_tokens": 811482522.0, "reward": 0.3265625, "reward_std": 0.04887166172266007, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9290046930313111, "step": 11755 }, { "completion_length": 481.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 481.0, "completions/max_terminated_length": 373.8, "completions/mean_length": 89.3234375, "completions/mean_terminated_length": 88.26166229248047, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010642784355106998, "frac_reward_zero_std": 0.94375, "grad_norm": 1.5829625129699707, "kl": 4.710937956080306, "learning_rate": 4.622301587301587e-07, "loss": 0.0047, "num_tokens": 811793936.0, "reward": 0.30625, "reward_std": 0.04976583532989025, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9383267402648926, "step": 11760 }, { "completion_length": 522.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 522.4, "completions/max_terminated_length": 427.8, "completions/mean_length": 94.284375, "completions/mean_terminated_length": 93.24026794433594, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010647309348455258, "frac_reward_zero_std": 0.9375, "grad_norm": 5.510026454925537, "kl": 7.578284629969858, "learning_rate": 4.621904761904762e-07, "loss": 0.0076, "num_tokens": 812114788.0, "reward": 0.3, "reward_std": 0.052607111632823944, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9450281500816345, "step": 11765 }, { "completion_length": 314.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 89.92890625, "completions/mean_terminated_length": 89.92890625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010651834341803517, "frac_reward_zero_std": 0.95625, "grad_norm": 9.769007682800293, "kl": 1.9343668540008365, "learning_rate": 4.6215079365079363e-07, "loss": 0.0019, "num_tokens": 812429105.0, "reward": 0.5, "reward_std": 0.038452721387147906, "rewards/verify_chess_move/mean": 0.5, "rewards/verify_chess_move/std": 0.8556517124176025, "step": 11770 }, { "completion_length": 392.6, "completions/clipped_ratio": 0.0, "completions/max_length": 392.6, "completions/max_terminated_length": 392.6, "completions/mean_length": 85.6234375, "completions/mean_terminated_length": 85.6234375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010656359335151777, "frac_reward_zero_std": 0.93125, "grad_norm": 13.846053123474121, "kl": 2.6968841017922385, "learning_rate": 4.621111111111111e-07, "loss": 0.0027, "num_tokens": 812736231.0, "reward": 0.3015625, "reward_std": 0.06112642250955105, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9459588527679443, "step": 11775 }, { "completion_length": 315.8, "completions/clipped_ratio": 0.0, "completions/max_length": 315.8, "completions/max_terminated_length": 315.8, "completions/mean_length": 95.25390625, "completions/mean_terminated_length": 95.25390625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010660884328500037, "frac_reward_zero_std": 0.94375, "grad_norm": 5.25714111328125, "kl": 1.5628062686650082, "learning_rate": 4.6207142857142854e-07, "loss": 0.0016, "num_tokens": 813060004.0, "reward": 0.3046875, "reward_std": 0.04545377679169178, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9486651301383973, "step": 11780 }, { "completion_length": 479.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 479.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 95.80859375, "completions/mean_terminated_length": 95.27781677246094, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010665409321848297, "frac_reward_zero_std": 0.925, "grad_norm": 6.626271724700928, "kl": 0.4048528954735957, "learning_rate": 4.62031746031746e-07, "loss": 0.0004, "num_tokens": 813382439.0, "reward": 0.3671875, "reward_std": 0.06097414195537567, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9089988112449646, "step": 11785 }, { "completion_length": 306.6, "completions/clipped_ratio": 0.0, "completions/max_length": 306.6, "completions/max_terminated_length": 306.6, "completions/mean_length": 87.77734375, "completions/mean_terminated_length": 87.77734375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010669934315196557, "frac_reward_zero_std": 0.94375, "grad_norm": 13.142852783203125, "kl": 0.8080620212247596, "learning_rate": 4.619920634920635e-07, "loss": 0.0008, "num_tokens": 813691122.0, "reward": 0.515625, "reward_std": 0.04660954885184765, "rewards/verify_chess_move/mean": 0.515625, "rewards/verify_chess_move/std": 0.8440919399261475, "step": 11790 }, { "completion_length": 372.2, "completions/clipped_ratio": 0.0, "completions/max_length": 372.2, "completions/max_terminated_length": 372.2, "completions/mean_length": 86.68203125, "completions/mean_terminated_length": 86.68203125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.010674459308544817, "frac_reward_zero_std": 0.94375, "grad_norm": 4.10551643371582, "kl": 0.4630474646342918, "learning_rate": 4.619523809523809e-07, "loss": 0.0005, "num_tokens": 814000363.0, "reward": 0.4390625, "reward_std": 0.04455862008035183, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8907040238380433, "step": 11795 }, { "completion_length": 368.8, "completions/clipped_ratio": 0.0, "completions/max_length": 368.8, "completions/max_terminated_length": 368.8, "completions/mean_length": 94.68984375, "completions/mean_terminated_length": 94.68984375, "completions/min_length": 28.6, "completions/min_terminated_length": 28.6, "epoch": 0.010678984301893076, "frac_reward_zero_std": 0.9, "grad_norm": 9.252066612243652, "kl": 1.0135390212060884, "learning_rate": 4.6191269841269836e-07, "loss": 0.001, "num_tokens": 814321542.0, "reward": 0.40625, "reward_std": 0.08096237927675247, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9057838082313537, "step": 11800 }, { "completion_length": 444.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.8, "completions/max_terminated_length": 394.6, "completions/mean_length": 89.696875, "completions/mean_terminated_length": 89.17003479003907, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010683509295241336, "frac_reward_zero_std": 0.9375, "grad_norm": 11.711370468139648, "kl": 0.6922327516367659, "learning_rate": 4.6187301587301587e-07, "loss": 0.0007, "num_tokens": 814634290.0, "reward": 0.396875, "reward_std": 0.05644736513495445, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9011197805404663, "step": 11805 }, { "completion_length": 436.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.0, "completions/max_terminated_length": 428.2, "completions/mean_length": 91.80390625, "completions/mean_terminated_length": 91.28538970947265, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010688034288589596, "frac_reward_zero_std": 0.94375, "grad_norm": 0.22319427132606506, "kl": 0.8523493702639826, "learning_rate": 4.618333333333333e-07, "loss": 0.0009, "num_tokens": 814950599.0, "reward": 0.2546875, "reward_std": 0.05365356355905533, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9565677762031555, "step": 11810 }, { "completion_length": 293.6, "completions/clipped_ratio": 0.0, "completions/max_length": 293.6, "completions/max_terminated_length": 293.6, "completions/mean_length": 89.97265625, "completions/mean_terminated_length": 89.97265625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010692559281937856, "frac_reward_zero_std": 0.96875, "grad_norm": 8.118123054504395, "kl": 1.258090288634412, "learning_rate": 4.617936507936508e-07, "loss": 0.0013, "num_tokens": 815264084.0, "reward": 0.4546875, "reward_std": 0.03098084479570389, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8844232439994812, "step": 11815 }, { "completion_length": 298.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 94.375, "completions/mean_terminated_length": 94.375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010697084275286116, "frac_reward_zero_std": 0.9625, "grad_norm": 2.0238802433013916, "kl": 0.8982577743474394, "learning_rate": 4.6175396825396823e-07, "loss": 0.0009, "num_tokens": 815586404.0, "reward": 0.290625, "reward_std": 0.03130036853253841, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9351450681686402, "step": 11820 }, { "completion_length": 429.8, "completions/clipped_ratio": 0.0, "completions/max_length": 429.8, "completions/max_terminated_length": 429.8, "completions/mean_length": 85.67578125, "completions/mean_terminated_length": 85.67578125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010701609268634374, "frac_reward_zero_std": 0.96875, "grad_norm": 2.7472662925720215, "kl": 0.42389041421702134, "learning_rate": 4.617142857142857e-07, "loss": 0.0004, "num_tokens": 815893253.0, "reward": 0.2546875, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9624292731285096, "step": 11825 }, { "completion_length": 347.4, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/max_terminated_length": 347.4, "completions/mean_length": 87.2796875, "completions/mean_terminated_length": 87.2796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010706134261982634, "frac_reward_zero_std": 0.95625, "grad_norm": 3.842296838760376, "kl": 0.5693551817908883, "learning_rate": 4.6167460317460314e-07, "loss": 0.0006, "num_tokens": 816201763.0, "reward": 0.2921875, "reward_std": 0.03661493994295597, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.950889503955841, "step": 11830 }, { "completion_length": 341.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 89.01640625, "completions/mean_terminated_length": 89.01640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010710659255330894, "frac_reward_zero_std": 0.9375, "grad_norm": 3.4249088764190674, "kl": 1.0218407230451703, "learning_rate": 4.616349206349206e-07, "loss": 0.001, "num_tokens": 816513040.0, "reward": 0.31875, "reward_std": 0.05465705692768097, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9392361044883728, "step": 11835 }, { "completion_length": 375.4, "completions/clipped_ratio": 0.0, "completions/max_length": 375.4, "completions/max_terminated_length": 375.4, "completions/mean_length": 91.23203125, "completions/mean_terminated_length": 91.23203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010715184248679155, "frac_reward_zero_std": 0.94375, "grad_norm": 10.902640342712402, "kl": 2.1796782307093965, "learning_rate": 4.615952380952381e-07, "loss": 0.0022, "num_tokens": 816829209.0, "reward": 0.4421875, "reward_std": 0.04729253761470318, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8680336833000183, "step": 11840 }, { "completion_length": 314.4, "completions/clipped_ratio": 0.0, "completions/max_length": 314.4, "completions/max_terminated_length": 314.4, "completions/mean_length": 87.615625, "completions/mean_terminated_length": 87.615625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010719709242027415, "frac_reward_zero_std": 0.925, "grad_norm": 7.952181816101074, "kl": 0.8902148372959345, "learning_rate": 4.6155555555555555e-07, "loss": 0.0009, "num_tokens": 817139933.0, "reward": 0.3859375, "reward_std": 0.05918383039534092, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9070437908172607, "step": 11845 }, { "completion_length": 567.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 567.4, "completions/max_terminated_length": 522.6, "completions/mean_length": 91.11171875, "completions/mean_terminated_length": 90.58723754882813, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010724234235375675, "frac_reward_zero_std": 0.925, "grad_norm": 11.061532020568848, "kl": 1.947953497839626, "learning_rate": 4.6151587301587295e-07, "loss": 0.0019, "num_tokens": 817454308.0, "reward": 0.3125, "reward_std": 0.0634958915412426, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9231663584709168, "step": 11850 }, { "completion_length": 577.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 577.2, "completions/max_terminated_length": 570.4, "completions/mean_length": 89.70390625, "completions/mean_terminated_length": 89.17547607421875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010728759228723933, "frac_reward_zero_std": 0.925, "grad_norm": 14.870312690734863, "kl": 2.025913864397444, "learning_rate": 4.6147619047619046e-07, "loss": 0.002, "num_tokens": 817764329.0, "reward": 0.3453125, "reward_std": 0.06896274462342263, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9361253023147583, "step": 11855 }, { "completion_length": 492.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 492.8, "completions/max_terminated_length": 418.6, "completions/mean_length": 92.10625, "completions/mean_terminated_length": 91.57896575927734, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010733284222072193, "frac_reward_zero_std": 0.91875, "grad_norm": 14.003740310668945, "kl": 2.733605802350212, "learning_rate": 4.614365079365079e-07, "loss": 0.0027, "num_tokens": 818080097.0, "reward": 0.1875, "reward_std": 0.072227368876338, "rewards/verify_chess_move/mean": 0.1875, "rewards/verify_chess_move/std": 0.9552165985107421, "step": 11860 }, { "completion_length": 374.8, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 94.61015625, "completions/mean_terminated_length": 94.61015625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010737809215420453, "frac_reward_zero_std": 0.93125, "grad_norm": 3.611865997314453, "kl": 2.683657191100065, "learning_rate": 4.613968253968254e-07, "loss": 0.0027, "num_tokens": 818400206.0, "reward": 0.396875, "reward_std": 0.06296420283615589, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9080606698989868, "step": 11865 }, { "completion_length": 494.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 494.4, "completions/max_terminated_length": 474.8, "completions/mean_length": 87.64375, "completions/mean_terminated_length": 87.12569122314453, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010742334208768713, "frac_reward_zero_std": 0.91875, "grad_norm": 0.02461315132677555, "kl": 2.381865377177019, "learning_rate": 4.613571428571428e-07, "loss": 0.0024, "num_tokens": 818709022.0, "reward": 0.428125, "reward_std": 0.06975308954715728, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.890409529209137, "step": 11870 }, { "completion_length": 413.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 413.2, "completions/max_terminated_length": 323.0, "completions/mean_length": 91.925, "completions/mean_terminated_length": 91.40394744873046, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010746859202116973, "frac_reward_zero_std": 0.9625, "grad_norm": 1.139678955078125, "kl": 2.934419459779747, "learning_rate": 4.613174603174603e-07, "loss": 0.0029, "num_tokens": 819026502.0, "reward": 0.4203125, "reward_std": 0.029933410137891768, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.9031791925430298, "step": 11875 }, { "completion_length": 499.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 499.6, "completions/max_terminated_length": 438.4, "completions/mean_length": 99.28203125, "completions/mean_terminated_length": 98.75811767578125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010751384195465234, "frac_reward_zero_std": 0.95, "grad_norm": 1.6238110065460205, "kl": 5.1502539096865805, "learning_rate": 4.612777777777778e-07, "loss": 0.0052, "num_tokens": 819356239.0, "reward": 0.371875, "reward_std": 0.04308430477976799, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9205781817436218, "step": 11880 }, { "completion_length": 442.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 442.6, "completions/max_terminated_length": 401.0, "completions/mean_length": 91.084375, "completions/mean_terminated_length": 90.0559295654297, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010755909188813492, "frac_reward_zero_std": 0.93125, "grad_norm": 15.892696380615234, "kl": 4.196557525335811, "learning_rate": 4.612380952380952e-07, "loss": 0.0042, "num_tokens": 819670675.0, "reward": 0.3859375, "reward_std": 0.06428467184305191, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9079543590545655, "step": 11885 }, { "completion_length": 377.8, "completions/clipped_ratio": 0.0, "completions/max_length": 377.8, "completions/max_terminated_length": 377.8, "completions/mean_length": 88.24921875, "completions/mean_terminated_length": 88.24921875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010760434182161752, "frac_reward_zero_std": 0.91875, "grad_norm": 9.312211990356445, "kl": 1.6005501684965566, "learning_rate": 4.611984126984127e-07, "loss": 0.0016, "num_tokens": 819981378.0, "reward": 0.303125, "reward_std": 0.06902263052761555, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.939993965625763, "step": 11890 }, { "completion_length": 519.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 519.2, "completions/max_terminated_length": 495.4, "completions/mean_length": 88.84453125, "completions/mean_terminated_length": 88.31125946044922, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010764959175510012, "frac_reward_zero_std": 0.9625, "grad_norm": 1.614241361618042, "kl": 1.245491117867641, "learning_rate": 4.6115873015873014e-07, "loss": 0.0012, "num_tokens": 820293219.0, "reward": 0.215625, "reward_std": 0.03130036853253841, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9715643286705017, "step": 11895 }, { "completion_length": 340.2, "completions/clipped_ratio": 0.0, "completions/max_length": 340.2, "completions/max_terminated_length": 340.2, "completions/mean_length": 85.30546875, "completions/mean_terminated_length": 85.30546875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.010769484168858272, "frac_reward_zero_std": 0.94375, "grad_norm": 10.37004566192627, "kl": 2.326218460360542, "learning_rate": 4.611190476190476e-07, "loss": 0.0023, "num_tokens": 820600706.0, "reward": 0.3765625, "reward_std": 0.05023763924837112, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9237682580947876, "step": 11900 }, { "completion_length": 519.8, "completions/clipped_ratio": 0.0, "completions/max_length": 519.8, "completions/max_terminated_length": 519.8, "completions/mean_length": 90.70859375, "completions/mean_terminated_length": 90.70859375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010774009162206532, "frac_reward_zero_std": 0.90625, "grad_norm": 3.1652324199676514, "kl": 4.1044620407396, "learning_rate": 4.6107936507936505e-07, "loss": 0.0041, "num_tokens": 820916021.0, "reward": 0.3296875, "reward_std": 0.08332831710577011, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9293485403060913, "step": 11905 }, { "completion_length": 403.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 403.4, "completions/max_terminated_length": 306.4, "completions/mean_length": 91.2625, "completions/mean_terminated_length": 90.72897491455078, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01077853415555479, "frac_reward_zero_std": 0.925, "grad_norm": 4.842031478881836, "kl": 4.3596744373324325, "learning_rate": 4.610396825396825e-07, "loss": 0.0044, "num_tokens": 821233077.0, "reward": 0.3953125, "reward_std": 0.060078985244035724, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.8950742721557617, "step": 11910 }, { "completion_length": 414.4, "completions/clipped_ratio": 0.0, "completions/max_length": 414.4, "completions/max_terminated_length": 414.4, "completions/mean_length": 89.31328125, "completions/mean_terminated_length": 89.31328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01078305914890305, "frac_reward_zero_std": 0.9375, "grad_norm": 14.224220275878906, "kl": 3.9516674279933794, "learning_rate": 4.61e-07, "loss": 0.004, "num_tokens": 821544542.0, "reward": 0.403125, "reward_std": 0.0591812826693058, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9005958795547485, "step": 11915 }, { "completion_length": 297.4, "completions/clipped_ratio": 0.0, "completions/max_length": 297.4, "completions/max_terminated_length": 297.4, "completions/mean_length": 96.14609375, "completions/mean_terminated_length": 96.14609375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01078758414225131, "frac_reward_zero_std": 0.89375, "grad_norm": 6.934328079223633, "kl": 4.835371030773968, "learning_rate": 4.609603174603174e-07, "loss": 0.0048, "num_tokens": 821870145.0, "reward": 0.359375, "reward_std": 0.08811473064124584, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9183117628097535, "step": 11920 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/max_terminated_length": 358.4, "completions/mean_length": 93.0390625, "completions/mean_terminated_length": 93.0390625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010792109135599571, "frac_reward_zero_std": 0.9375, "grad_norm": 8.751813888549805, "kl": 7.0191331268288195, "learning_rate": 4.6092063492063487e-07, "loss": 0.007, "num_tokens": 822190035.0, "reward": 0.334375, "reward_std": 0.05439741872251034, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9319504618644714, "step": 11925 }, { "completion_length": 462.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 462.6, "completions/max_terminated_length": 450.6, "completions/mean_length": 94.38984375, "completions/mean_terminated_length": 93.35655364990234, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010796634128947831, "frac_reward_zero_std": 0.93125, "grad_norm": 10.669107437133789, "kl": 6.548276722640731, "learning_rate": 4.608809523809524e-07, "loss": 0.0065, "num_tokens": 822510926.0, "reward": 0.2921875, "reward_std": 0.06155075393617153, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9267610073089599, "step": 11930 }, { "completion_length": 429.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 91.49453125, "completions/mean_terminated_length": 91.49453125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010801159122296091, "frac_reward_zero_std": 0.9125, "grad_norm": 27.54694366455078, "kl": 9.208079742547124, "learning_rate": 4.6084126984126983e-07, "loss": 0.0092, "num_tokens": 822828407.0, "reward": 0.2109375, "reward_std": 0.07780157849192619, "rewards/verify_chess_move/mean": 0.2109375, "rewards/verify_chess_move/std": 0.9589392900466919, "step": 11935 }, { "completion_length": 405.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 405.2, "completions/max_terminated_length": 387.4, "completions/mean_length": 90.4453125, "completions/mean_terminated_length": 89.92811431884766, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01080568411564435, "frac_reward_zero_std": 0.95, "grad_norm": 9.9916353225708, "kl": 2.6851357342442497, "learning_rate": 4.608015873015873e-07, "loss": 0.0027, "num_tokens": 823144545.0, "reward": 0.2484375, "reward_std": 0.043556108698248865, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9643076896667481, "step": 11940 }, { "completion_length": 395.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 395.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 93.1609375, "completions/mean_terminated_length": 92.11675720214843, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01081020910899261, "frac_reward_zero_std": 0.9375, "grad_norm": 7.384859561920166, "kl": 2.6075843154918403, "learning_rate": 4.6076190476190474e-07, "loss": 0.0026, "num_tokens": 823462911.0, "reward": 0.3671875, "reward_std": 0.05034499540925026, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9170311689376831, "step": 11945 }, { "completion_length": 285.4, "completions/clipped_ratio": 0.0, "completions/max_length": 285.4, "completions/max_terminated_length": 285.4, "completions/mean_length": 87.0140625, "completions/mean_terminated_length": 87.0140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01081473410234087, "frac_reward_zero_std": 0.94375, "grad_norm": 6.743404388427734, "kl": 3.3007373836357146, "learning_rate": 4.607222222222222e-07, "loss": 0.0033, "num_tokens": 823772249.0, "reward": 0.3015625, "reward_std": 0.04865851625800133, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9471118092536926, "step": 11950 }, { "completion_length": 287.6, "completions/clipped_ratio": 0.0, "completions/max_length": 287.6, "completions/max_terminated_length": 287.6, "completions/mean_length": 84.08515625, "completions/mean_terminated_length": 84.08515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01081925909568913, "frac_reward_zero_std": 0.94375, "grad_norm": 2.020857095718384, "kl": 4.4419283552560955, "learning_rate": 4.606825396825397e-07, "loss": 0.0044, "num_tokens": 824076318.0, "reward": 0.2625, "reward_std": 0.04818671084940433, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.957681167125702, "step": 11955 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 89.09765625, "completions/mean_terminated_length": 89.09765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01082378408903739, "frac_reward_zero_std": 0.93125, "grad_norm": 13.992627143859863, "kl": 4.176438922435045, "learning_rate": 4.606428571428571e-07, "loss": 0.0042, "num_tokens": 824390179.0, "reward": 0.3390625, "reward_std": 0.0583925049751997, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.937843382358551, "step": 11960 }, { "completion_length": 505.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 505.4, "completions/max_terminated_length": 391.4, "completions/mean_length": 94.72265625, "completions/mean_terminated_length": 93.66460723876953, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010828309082385648, "frac_reward_zero_std": 0.91875, "grad_norm": 2.474519968032837, "kl": 1.602323014580179, "learning_rate": 4.606031746031746e-07, "loss": 0.0016, "num_tokens": 824711848.0, "reward": 0.3484375, "reward_std": 0.06723133847117424, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9125507712364197, "step": 11965 }, { "completion_length": 338.6, "completions/clipped_ratio": 0.0, "completions/max_length": 338.6, "completions/max_terminated_length": 338.6, "completions/mean_length": 79.96640625, "completions/mean_terminated_length": 79.96640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010832834075733908, "frac_reward_zero_std": 0.9125, "grad_norm": 12.37169361114502, "kl": 2.5178275439655406, "learning_rate": 4.6056349206349206e-07, "loss": 0.0025, "num_tokens": 825009941.0, "reward": 0.4171875, "reward_std": 0.07280653044581413, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.9016038060188294, "step": 11970 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.0, "completions/max_length": 457.4, "completions/max_terminated_length": 457.4, "completions/mean_length": 89.79296875, "completions/mean_terminated_length": 89.79296875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010837359069082168, "frac_reward_zero_std": 0.91875, "grad_norm": 8.235418319702148, "kl": 2.974383027141448, "learning_rate": 4.6052380952380946e-07, "loss": 0.003, "num_tokens": 825323244.0, "reward": 0.3578125, "reward_std": 0.07201520577073098, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9091623425483704, "step": 11975 }, { "completion_length": 412.8, "completions/clipped_ratio": 0.0, "completions/max_length": 412.8, "completions/max_terminated_length": 412.8, "completions/mean_length": 94.621875, "completions/mean_terminated_length": 94.621875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010841884062430428, "frac_reward_zero_std": 0.93125, "grad_norm": 3.4667422771453857, "kl": 5.652045072265901, "learning_rate": 4.6048412698412697e-07, "loss": 0.0057, "num_tokens": 825644936.0, "reward": 0.3125, "reward_std": 0.05681436061859131, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9473131537437439, "step": 11980 }, { "completion_length": 422.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 422.0, "completions/max_terminated_length": 369.6, "completions/mean_length": 93.9671875, "completions/mean_terminated_length": 93.46160736083985, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010846409055778689, "frac_reward_zero_std": 0.875, "grad_norm": 13.904390335083008, "kl": 10.661212759651244, "learning_rate": 4.604444444444444e-07, "loss": 0.0107, "num_tokens": 825963950.0, "reward": 0.353125, "reward_std": 0.1058497380465269, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9313737511634826, "step": 11985 }, { "completion_length": 368.8, "completions/clipped_ratio": 0.0, "completions/max_length": 368.8, "completions/max_terminated_length": 368.8, "completions/mean_length": 91.88671875, "completions/mean_terminated_length": 91.88671875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010850934049126949, "frac_reward_zero_std": 0.9375, "grad_norm": 13.834128379821777, "kl": 8.551499547227285, "learning_rate": 4.6040476190476193e-07, "loss": 0.0086, "num_tokens": 826280549.0, "reward": 0.3546875, "reward_std": 0.05124015063047409, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9361808061599731, "step": 11990 }, { "completion_length": 593.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 593.2, "completions/max_terminated_length": 491.8, "completions/mean_length": 93.1734375, "completions/mean_terminated_length": 92.12042541503907, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010855459042475207, "frac_reward_zero_std": 0.93125, "grad_norm": 14.458710670471191, "kl": 7.591237494116649, "learning_rate": 4.6036507936507933e-07, "loss": 0.0076, "num_tokens": 826599539.0, "reward": 0.409375, "reward_std": 0.05975946336984635, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.9009290337562561, "step": 11995 }, { "completion_length": 401.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 401.0, "completions/max_terminated_length": 365.8, "completions/mean_length": 90.25234375, "completions/mean_terminated_length": 89.72188873291016, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010859984035823467, "frac_reward_zero_std": 0.9375, "grad_norm": 2.218754291534424, "kl": 10.66672386142891, "learning_rate": 4.603253968253968e-07, "loss": 0.0107, "num_tokens": 826913286.0, "reward": 0.4453125, "reward_std": 0.05124015212059021, "rewards/verify_chess_move/mean": 0.4453125, "rewards/verify_chess_move/std": 0.8733574390411377, "step": 12000 }, { "completion_length": 476.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 476.4, "completions/max_terminated_length": 452.6, "completions/mean_length": 90.865625, "completions/mean_terminated_length": 90.34723205566407, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010864509029171727, "frac_reward_zero_std": 0.93125, "grad_norm": 6.079963684082031, "kl": 2.1749968749471007, "learning_rate": 4.602857142857143e-07, "loss": 0.0022, "num_tokens": 827228746.0, "reward": 0.3359375, "reward_std": 0.0668054386973381, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9342699527740479, "step": 12005 }, { "completion_length": 316.6, "completions/clipped_ratio": 0.0, "completions/max_length": 316.6, "completions/max_terminated_length": 316.6, "completions/mean_length": 84.98046875, "completions/mean_terminated_length": 84.98046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010869034022519987, "frac_reward_zero_std": 0.90625, "grad_norm": 11.291769981384277, "kl": 1.2212380962911993, "learning_rate": 4.602460317460317e-07, "loss": 0.0012, "num_tokens": 827535505.0, "reward": 0.3640625, "reward_std": 0.08332832008600236, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9267820715904236, "step": 12010 }, { "completion_length": 417.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 417.2, "completions/max_terminated_length": 389.2, "completions/mean_length": 91.25703125, "completions/mean_terminated_length": 90.73225250244141, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.010873559015868247, "frac_reward_zero_std": 0.9375, "grad_norm": 10.758814811706543, "kl": 1.2023266211268493, "learning_rate": 4.602063492063492e-07, "loss": 0.0012, "num_tokens": 827852138.0, "reward": 0.3765625, "reward_std": 0.05691916979849339, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9144224762916565, "step": 12015 }, { "completion_length": 465.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 465.0, "completions/max_terminated_length": 370.6, "completions/mean_length": 93.02890625, "completions/mean_terminated_length": 92.494189453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010878084009216506, "frac_reward_zero_std": 0.93125, "grad_norm": 7.310199737548828, "kl": 3.7090838962350974, "learning_rate": 4.6016666666666665e-07, "loss": 0.0037, "num_tokens": 828168687.0, "reward": 0.4390625, "reward_std": 0.0597604438662529, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8940436124801636, "step": 12020 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 85.00859375, "completions/mean_terminated_length": 85.00859375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010882609002564766, "frac_reward_zero_std": 0.96875, "grad_norm": 10.043466567993164, "kl": 4.51620757218916, "learning_rate": 4.601269841269841e-07, "loss": 0.0045, "num_tokens": 828474594.0, "reward": 0.3, "reward_std": 0.029613886773586274, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9454727411270142, "step": 12025 }, { "completion_length": 390.8, "completions/clipped_ratio": 0.0, "completions/max_length": 390.8, "completions/max_terminated_length": 390.8, "completions/mean_length": 94.065625, "completions/mean_terminated_length": 94.065625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010887133995913026, "frac_reward_zero_std": 0.91875, "grad_norm": 10.307860374450684, "kl": 2.0471503764856607, "learning_rate": 4.6008730158730156e-07, "loss": 0.002, "num_tokens": 828796494.0, "reward": 0.2484375, "reward_std": 0.07427633777260781, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9537069916725158, "step": 12030 }, { "completion_length": 326.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 84.71015625, "completions/mean_terminated_length": 84.71015625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.010891658989261286, "frac_reward_zero_std": 0.9375, "grad_norm": 1.7603468894958496, "kl": 2.8535488358698786, "learning_rate": 4.60047619047619e-07, "loss": 0.0029, "num_tokens": 829101379.0, "reward": 0.409375, "reward_std": 0.052347471192479136, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.8928169727325439, "step": 12035 }, { "completion_length": 353.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 86.90234375, "completions/mean_terminated_length": 86.90234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010896183982609546, "frac_reward_zero_std": 0.925, "grad_norm": 10.231335639953613, "kl": 2.0370388337876646, "learning_rate": 4.600079365079365e-07, "loss": 0.002, "num_tokens": 829410558.0, "reward": 0.3125, "reward_std": 0.06439104452729225, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9489224672317504, "step": 12040 }, { "completion_length": 392.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 392.4, "completions/max_terminated_length": 334.0, "completions/mean_length": 83.9203125, "completions/mean_terminated_length": 82.86300506591797, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "epoch": 0.010900708975957806, "frac_reward_zero_std": 0.95, "grad_norm": 10.426712989807129, "kl": 5.1429322537500415, "learning_rate": 4.59968253968254e-07, "loss": 0.0051, "num_tokens": 829713240.0, "reward": 0.3546875, "reward_std": 0.03945621512830257, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9352646470069885, "step": 12045 }, { "completion_length": 489.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 489.0, "completions/max_terminated_length": 455.8, "completions/mean_length": 87.225, "completions/mean_terminated_length": 86.68822937011718, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010905233969306065, "frac_reward_zero_std": 0.9375, "grad_norm": 4.517693519592285, "kl": 3.8529460980324073, "learning_rate": 4.599285714285714e-07, "loss": 0.0039, "num_tokens": 830021160.0, "reward": 0.3671875, "reward_std": 0.05034499578177929, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9161499977111817, "step": 12050 }, { "completion_length": 446.2, "completions/clipped_ratio": 0.0, "completions/max_length": 446.2, "completions/max_terminated_length": 446.2, "completions/mean_length": 96.2046875, "completions/mean_terminated_length": 96.2046875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010909758962654325, "frac_reward_zero_std": 0.95, "grad_norm": 4.094583511352539, "kl": 3.519296206906438, "learning_rate": 4.598888888888889e-07, "loss": 0.0035, "num_tokens": 830344950.0, "reward": 0.2640625, "reward_std": 0.04355610907077789, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9564998030662537, "step": 12055 }, { "completion_length": 484.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 484.2, "completions/max_terminated_length": 433.8, "completions/mean_length": 84.221875, "completions/mean_terminated_length": 83.68263244628906, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010914283956002585, "frac_reward_zero_std": 0.925, "grad_norm": 9.163246154785156, "kl": 1.9197959176730364, "learning_rate": 4.5984920634920634e-07, "loss": 0.0019, "num_tokens": 830645834.0, "reward": 0.4421875, "reward_std": 0.0600789837539196, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.896334993839264, "step": 12060 }, { "completion_length": 300.4, "completions/clipped_ratio": 0.0, "completions/max_length": 300.4, "completions/max_terminated_length": 300.4, "completions/mean_length": 81.2484375, "completions/mean_terminated_length": 81.2484375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010918808949350845, "frac_reward_zero_std": 0.93125, "grad_norm": 6.486876964569092, "kl": 0.4690717148594558, "learning_rate": 4.598095238095238e-07, "loss": 0.0005, "num_tokens": 830945272.0, "reward": 0.5078125, "reward_std": 0.0595472976565361, "rewards/verify_chess_move/mean": 0.5078125, "rewards/verify_chess_move/std": 0.8608319640159607, "step": 12065 }, { "completion_length": 317.2, "completions/clipped_ratio": 0.0, "completions/max_length": 317.2, "completions/max_terminated_length": 317.2, "completions/mean_length": 86.16328125, "completions/mean_terminated_length": 86.16328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010923333942699105, "frac_reward_zero_std": 0.94375, "grad_norm": 38.81160354614258, "kl": 0.9780986024066806, "learning_rate": 4.5976984126984124e-07, "loss": 0.001, "num_tokens": 831254337.0, "reward": 0.3, "reward_std": 0.05160459727048874, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9540255427360534, "step": 12070 }, { "completion_length": 477.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 477.0, "completions/max_terminated_length": 384.4, "completions/mean_length": 86.09765625, "completions/mean_terminated_length": 85.56932983398437, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010927858936047363, "frac_reward_zero_std": 0.95, "grad_norm": 0.028491010889410973, "kl": 0.7398362484527752, "learning_rate": 4.597301587301587e-07, "loss": 0.0007, "num_tokens": 831562982.0, "reward": 0.3484375, "reward_std": 0.0465012114495039, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9281362771987915, "step": 12075 }, { "completion_length": 552.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 552.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 88.52109375, "completions/mean_terminated_length": 87.46533660888672, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010932383929395623, "frac_reward_zero_std": 0.925, "grad_norm": 16.177227020263672, "kl": 0.6530969693674706, "learning_rate": 4.596904761904762e-07, "loss": 0.0007, "num_tokens": 831872393.0, "reward": 0.465625, "reward_std": 0.0695992436259985, "rewards/verify_chess_move/mean": 0.465625, "rewards/verify_chess_move/std": 0.8743690848350525, "step": 12080 }, { "completion_length": 271.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 85.33125, "completions/mean_terminated_length": 85.33125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.010936908922743883, "frac_reward_zero_std": 0.94375, "grad_norm": 0.1999325156211853, "kl": 0.32279559345915915, "learning_rate": 4.596507936507936e-07, "loss": 0.0003, "num_tokens": 832179801.0, "reward": 0.4609375, "reward_std": 0.047503722831606865, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8780578732490539, "step": 12085 }, { "completion_length": 400.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 400.2, "completions/max_terminated_length": 345.6, "completions/mean_length": 92.21875, "completions/mean_terminated_length": 91.69823608398437, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010941433916092144, "frac_reward_zero_std": 0.9625, "grad_norm": 5.990819454193115, "kl": 0.25602838282939044, "learning_rate": 4.596111111111111e-07, "loss": 0.0003, "num_tokens": 832496137.0, "reward": 0.3375, "reward_std": 0.02925042100250721, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9321074962615967, "step": 12090 }, { "completion_length": 474.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 474.2, "completions/max_terminated_length": 406.2, "completions/mean_length": 97.89375, "completions/mean_terminated_length": 97.37408752441407, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.010945958909440404, "frac_reward_zero_std": 0.95, "grad_norm": 4.889877796173096, "kl": 0.21811532212886958, "learning_rate": 4.5957142857142857e-07, "loss": 0.0002, "num_tokens": 832823001.0, "reward": 0.26875, "reward_std": 0.041034357994794844, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9423585414886475, "step": 12095 }, { "completion_length": 531.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 531.2, "completions/max_terminated_length": 467.2, "completions/mean_length": 94.21640625, "completions/mean_terminated_length": 93.16898803710937, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.010950483902788664, "frac_reward_zero_std": 0.9625, "grad_norm": 0.9651572704315186, "kl": 0.26283714475575837, "learning_rate": 4.5953174603174597e-07, "loss": 0.0003, "num_tokens": 833143142.0, "reward": 0.3328125, "reward_std": 0.03198335617780686, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9300069332122802, "step": 12100 }, { "completion_length": 462.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 462.6, "completions/max_terminated_length": 371.2, "completions/mean_length": 83.5546875, "completions/mean_terminated_length": 83.02488403320312, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.010955008896136922, "frac_reward_zero_std": 0.94375, "grad_norm": 8.965737342834473, "kl": 0.24404631983488798, "learning_rate": 4.594920634920635e-07, "loss": 0.0002, "num_tokens": 833446844.0, "reward": 0.328125, "reward_std": 0.04729155562818051, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9259266138076783, "step": 12105 }, { "completion_length": 410.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 410.8, "completions/max_terminated_length": 366.2, "completions/mean_length": 88.3765625, "completions/mean_terminated_length": 87.85721282958984, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010959533889485182, "frac_reward_zero_std": 0.95, "grad_norm": 1.8111451864242554, "kl": 0.3016087502008304, "learning_rate": 4.5945238095238093e-07, "loss": 0.0003, "num_tokens": 833758046.0, "reward": 0.2390625, "reward_std": 0.03877224437892437, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.959657883644104, "step": 12110 }, { "completion_length": 503.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 503.6, "completions/max_terminated_length": 499.8, "completions/mean_length": 94.909375, "completions/mean_terminated_length": 94.38453216552735, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.010964058882833442, "frac_reward_zero_std": 0.925, "grad_norm": 1.2973710298538208, "kl": 0.25058740059612317, "learning_rate": 4.5941269841269844e-07, "loss": 0.0003, "num_tokens": 834078994.0, "reward": 0.4078125, "reward_std": 0.059183831140398976, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.899210250377655, "step": 12115 }, { "completion_length": 321.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 84.84296875, "completions/mean_terminated_length": 84.84296875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010968583876181702, "frac_reward_zero_std": 0.93125, "grad_norm": 11.33619213104248, "kl": 0.7965311161708086, "learning_rate": 4.5937301587301584e-07, "loss": 0.0008, "num_tokens": 834385177.0, "reward": 0.36875, "reward_std": 0.05860466919839382, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9168227910995483, "step": 12120 }, { "completion_length": 393.4, "completions/clipped_ratio": 0.0, "completions/max_length": 393.4, "completions/max_terminated_length": 393.4, "completions/mean_length": 90.25078125, "completions/mean_terminated_length": 90.25078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010973108869529962, "frac_reward_zero_std": 0.9375, "grad_norm": 23.208993911743164, "kl": 0.5018989344127476, "learning_rate": 4.593333333333333e-07, "loss": 0.0005, "num_tokens": 834698890.0, "reward": 0.40625, "reward_std": 0.048978038132190704, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9049517512321472, "step": 12125 }, { "completion_length": 442.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 442.2, "completions/max_terminated_length": 346.0, "completions/mean_length": 89.0859375, "completions/mean_terminated_length": 88.55811004638672, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01097763386287822, "frac_reward_zero_std": 0.9375, "grad_norm": 0.41313737630844116, "kl": 2.132979965372942, "learning_rate": 4.592936507936508e-07, "loss": 0.0021, "num_tokens": 835012912.0, "reward": 0.415625, "reward_std": 0.05102798528969288, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9029873490333558, "step": 12130 }, { "completion_length": 552.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 552.2, "completions/max_terminated_length": 495.2, "completions/mean_length": 86.43828125, "completions/mean_terminated_length": 85.9018539428711, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010982158856226481, "frac_reward_zero_std": 0.93125, "grad_norm": 3.316160202026367, "kl": 3.316644970525522, "learning_rate": 4.5925396825396825e-07, "loss": 0.0033, "num_tokens": 835318769.0, "reward": 0.40625, "reward_std": 0.05975946336984635, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.8976459622383117, "step": 12135 }, { "completion_length": 300.8, "completions/clipped_ratio": 0.0, "completions/max_length": 300.8, "completions/max_terminated_length": 300.8, "completions/mean_length": 86.9921875, "completions/mean_terminated_length": 86.9921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.010986683849574741, "frac_reward_zero_std": 0.9375, "grad_norm": 3.437265634536743, "kl": 2.24241564439144, "learning_rate": 4.592142857142857e-07, "loss": 0.0022, "num_tokens": 835631175.0, "reward": 0.1484375, "reward_std": 0.05329009667038918, "rewards/verify_chess_move/mean": 0.1484375, "rewards/verify_chess_move/std": 0.980302095413208, "step": 12140 }, { "completion_length": 412.8, "completions/clipped_ratio": 0.0, "completions/max_length": 412.8, "completions/max_terminated_length": 412.8, "completions/mean_length": 91.69453125, "completions/mean_terminated_length": 91.69453125, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.010991208842923001, "frac_reward_zero_std": 0.9375, "grad_norm": 3.8214237689971924, "kl": 1.9734874224988743, "learning_rate": 4.5917460317460316e-07, "loss": 0.002, "num_tokens": 835947704.0, "reward": 0.3140625, "reward_std": 0.05034499540925026, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9487357258796691, "step": 12145 }, { "completion_length": 475.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 475.2, "completions/max_terminated_length": 462.0, "completions/mean_length": 87.16875, "completions/mean_terminated_length": 86.64271240234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.010995733836271261, "frac_reward_zero_std": 0.95, "grad_norm": 0.43490394949913025, "kl": 3.9097157271578906, "learning_rate": 4.591349206349206e-07, "loss": 0.0039, "num_tokens": 836256144.0, "reward": 0.3296875, "reward_std": 0.039456214755773544, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9295507788658142, "step": 12150 }, { "completion_length": 482.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 482.6, "completions/max_terminated_length": 351.2, "completions/mean_length": 87.71875, "completions/mean_terminated_length": 86.6567367553711, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011000258829619521, "frac_reward_zero_std": 0.93125, "grad_norm": 0.4675155282020569, "kl": 0.5304542304715142, "learning_rate": 4.5909523809523807e-07, "loss": 0.0005, "num_tokens": 836566496.0, "reward": 0.40625, "reward_std": 0.06249338015913963, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9064981341362, "step": 12155 }, { "completion_length": 437.8, "completions/clipped_ratio": 0.0, "completions/max_length": 437.8, "completions/max_terminated_length": 437.8, "completions/mean_length": 81.08046875, "completions/mean_terminated_length": 81.08046875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01100478382296778, "frac_reward_zero_std": 0.93125, "grad_norm": 7.143225193023682, "kl": 0.6031408680020831, "learning_rate": 4.590555555555555e-07, "loss": 0.0006, "num_tokens": 836867087.0, "reward": 0.271875, "reward_std": 0.057498329877853395, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9510789752006531, "step": 12160 }, { "completion_length": 367.4, "completions/clipped_ratio": 0.0, "completions/max_length": 367.4, "completions/max_terminated_length": 367.4, "completions/mean_length": 82.73828125, "completions/mean_terminated_length": 82.73828125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01100930881631604, "frac_reward_zero_std": 0.95625, "grad_norm": 8.878653526306152, "kl": 1.5056927104480564, "learning_rate": 4.59015873015873e-07, "loss": 0.0015, "num_tokens": 837166640.0, "reward": 0.35625, "reward_std": 0.034352827444672586, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9302765965461731, "step": 12165 }, { "completion_length": 307.2, "completions/clipped_ratio": 0.0, "completions/max_length": 307.2, "completions/max_terminated_length": 307.2, "completions/mean_length": 85.15703125, "completions/mean_terminated_length": 85.15703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0110138338096643, "frac_reward_zero_std": 0.94375, "grad_norm": 4.208813667297363, "kl": 0.905707441479899, "learning_rate": 4.589761904761905e-07, "loss": 0.0009, "num_tokens": 837471577.0, "reward": 0.3515625, "reward_std": 0.05407789796590805, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9193930268287659, "step": 12170 }, { "completion_length": 485.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.6, "completions/max_terminated_length": 381.8, "completions/mean_length": 90.02109375, "completions/mean_terminated_length": 89.48617095947266, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01101835880301256, "frac_reward_zero_std": 0.9375, "grad_norm": 5.819984436035156, "kl": 0.9639018362096976, "learning_rate": 4.589365079365079e-07, "loss": 0.001, "num_tokens": 837788564.0, "reward": 0.403125, "reward_std": 0.051711955666542055, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.8983567953109741, "step": 12175 }, { "completion_length": 404.4, "completions/clipped_ratio": 0.0, "completions/max_length": 404.4, "completions/max_terminated_length": 404.4, "completions/mean_length": 89.3046875, "completions/mean_terminated_length": 89.3046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01102288379636082, "frac_reward_zero_std": 0.93125, "grad_norm": 2.0502424240112305, "kl": 5.007443803292699, "learning_rate": 4.588968253968254e-07, "loss": 0.005, "num_tokens": 838102650.0, "reward": 0.3125, "reward_std": 0.05907549224793911, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9373291254043579, "step": 12180 }, { "completion_length": 403.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 403.4, "completions/max_terminated_length": 312.2, "completions/mean_length": 86.34765625, "completions/mean_terminated_length": 85.81040496826172, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011027408789709078, "frac_reward_zero_std": 0.9, "grad_norm": 15.139842987060547, "kl": 9.599770520231687, "learning_rate": 4.5885714285714284e-07, "loss": 0.0096, "num_tokens": 838410039.0, "reward": 0.3046875, "reward_std": 0.08390649780631065, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9424605250358582, "step": 12185 }, { "completion_length": 323.2, "completions/clipped_ratio": 0.0, "completions/max_length": 323.2, "completions/max_terminated_length": 323.2, "completions/mean_length": 87.66015625, "completions/mean_terminated_length": 87.66015625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011031933783057338, "frac_reward_zero_std": 0.91875, "grad_norm": 5.242936611175537, "kl": 4.032986899255775, "learning_rate": 4.5881746031746025e-07, "loss": 0.004, "num_tokens": 838721524.0, "reward": 0.3140625, "reward_std": 0.0692812867462635, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9423979640007019, "step": 12190 }, { "completion_length": 413.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 95.3859375, "completions/mean_terminated_length": 95.3859375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011036458776405599, "frac_reward_zero_std": 0.95, "grad_norm": 25.069252014160156, "kl": 2.052964866464026, "learning_rate": 4.5877777777777775e-07, "loss": 0.0021, "num_tokens": 839045218.0, "reward": 0.3125, "reward_std": 0.041034359112381937, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9444594502449035, "step": 12195 }, { "completion_length": 345.4, "completions/clipped_ratio": 0.0, "completions/max_length": 345.4, "completions/max_terminated_length": 345.4, "completions/mean_length": 84.76796875, "completions/mean_terminated_length": 84.76796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011040983769753859, "frac_reward_zero_std": 0.95, "grad_norm": 14.248255729675293, "kl": 1.390160491107963, "learning_rate": 4.587380952380952e-07, "loss": 0.0014, "num_tokens": 839350345.0, "reward": 0.4296875, "reward_std": 0.044451262801885605, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8986464619636536, "step": 12200 }, { "completion_length": 335.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 91.3953125, "completions/mean_terminated_length": 91.3953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011045508763102119, "frac_reward_zero_std": 0.9375, "grad_norm": 4.51034688949585, "kl": 3.7464658407028764, "learning_rate": 4.586984126984127e-07, "loss": 0.0037, "num_tokens": 839666395.0, "reward": 0.5625, "reward_std": 0.05260710902512074, "rewards/verify_chess_move/mean": 0.5625, "rewards/verify_chess_move/std": 0.8142319321632385, "step": 12205 }, { "completion_length": 344.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 84.66953125, "completions/mean_terminated_length": 84.66953125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011050033756450379, "frac_reward_zero_std": 0.9375, "grad_norm": 0.017241988331079483, "kl": 4.577146648580674, "learning_rate": 4.586587301587301e-07, "loss": 0.0046, "num_tokens": 839971412.0, "reward": 0.384375, "reward_std": 0.05102798230946064, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9152701139450073, "step": 12210 }, { "completion_length": 339.8, "completions/clipped_ratio": 0.0, "completions/max_length": 339.8, "completions/max_terminated_length": 339.8, "completions/mean_length": 81.225, "completions/mean_terminated_length": 81.225, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011054558749798637, "frac_reward_zero_std": 0.94375, "grad_norm": 2.8121323585510254, "kl": 1.0279196608811616, "learning_rate": 4.5861904761904757e-07, "loss": 0.001, "num_tokens": 840271900.0, "reward": 0.21875, "reward_std": 0.04613676406443119, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9611406087875366, "step": 12215 }, { "completion_length": 310.2, "completions/clipped_ratio": 0.0, "completions/max_length": 310.2, "completions/max_terminated_length": 310.2, "completions/mean_length": 87.9296875, "completions/mean_terminated_length": 87.9296875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011059083743146897, "frac_reward_zero_std": 0.96875, "grad_norm": 0.05531417950987816, "kl": 1.120316391158849, "learning_rate": 4.585793650793651e-07, "loss": 0.0011, "num_tokens": 840582106.0, "reward": 0.2953125, "reward_std": 0.02414703294634819, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.953486168384552, "step": 12220 }, { "completion_length": 352.6, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/max_terminated_length": 352.6, "completions/mean_length": 92.915625, "completions/mean_terminated_length": 92.915625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011063608736495157, "frac_reward_zero_std": 0.9625, "grad_norm": 0.30315661430358887, "kl": 0.7106308893999085, "learning_rate": 4.5853968253968253e-07, "loss": 0.0007, "num_tokens": 840901038.0, "reward": 0.3640625, "reward_std": 0.027883461490273476, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9241158246994019, "step": 12225 }, { "completion_length": 507.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 89.88828125, "completions/mean_terminated_length": 89.88828125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011068133729843417, "frac_reward_zero_std": 0.94375, "grad_norm": 0.009581410326063633, "kl": 1.0835883746040054, "learning_rate": 4.585e-07, "loss": 0.0011, "num_tokens": 841215863.0, "reward": 0.328125, "reward_std": 0.05070944279432297, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9390902400016785, "step": 12230 }, { "completion_length": 393.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 95.12890625, "completions/mean_terminated_length": 95.12890625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011072658723191678, "frac_reward_zero_std": 0.9625, "grad_norm": 1.2341859340667725, "kl": 0.4679191967006773, "learning_rate": 4.5846031746031744e-07, "loss": 0.0005, "num_tokens": 841539948.0, "reward": 0.3671875, "reward_std": 0.03266732692718506, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.8611937165260315, "step": 12235 }, { "completion_length": 575.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 575.0, "completions/max_terminated_length": 402.8, "completions/mean_length": 94.4203125, "completions/mean_terminated_length": 93.37808074951172, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011077183716539936, "frac_reward_zero_std": 0.93125, "grad_norm": 1.3270760774612427, "kl": 1.2823035783600063, "learning_rate": 4.584206349206349e-07, "loss": 0.0013, "num_tokens": 841860278.0, "reward": 0.36875, "reward_std": 0.06112740263342857, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9045325517654419, "step": 12240 }, { "completion_length": 429.8, "completions/clipped_ratio": 0.0, "completions/max_length": 429.8, "completions/max_terminated_length": 429.8, "completions/mean_length": 90.89296875, "completions/mean_terminated_length": 90.89296875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011081708709888196, "frac_reward_zero_std": 0.94375, "grad_norm": 5.265957832336426, "kl": 2.004608332668431, "learning_rate": 4.5838095238095234e-07, "loss": 0.002, "num_tokens": 842176477.0, "reward": 0.35, "reward_std": 0.04797552675008774, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9209959030151367, "step": 12245 }, { "completion_length": 412.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 412.4, "completions/max_terminated_length": 323.2, "completions/mean_length": 94.5875, "completions/mean_terminated_length": 94.05394744873047, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011086233703236456, "frac_reward_zero_std": 0.91875, "grad_norm": 12.123735427856445, "kl": 6.324612929217983, "learning_rate": 4.583412698412698e-07, "loss": 0.0063, "num_tokens": 842497213.0, "reward": 0.3390625, "reward_std": 0.07064922563731671, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9369014978408814, "step": 12250 }, { "completion_length": 378.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 378.8, "completions/max_terminated_length": 311.8, "completions/mean_length": 88.6171875, "completions/mean_terminated_length": 88.0851333618164, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "epoch": 0.011090758696584716, "frac_reward_zero_std": 0.9625, "grad_norm": 1.3596386909484863, "kl": 6.873889875912573, "learning_rate": 4.583015873015873e-07, "loss": 0.0069, "num_tokens": 842810267.0, "reward": 0.2984375, "reward_std": 0.031983356922864914, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9464283108711242, "step": 12255 }, { "completion_length": 596.2, "completions/clipped_ratio": 0.003125, "completions/max_length": 596.2, "completions/max_terminated_length": 400.0, "completions/mean_length": 90.66875, "completions/mean_terminated_length": 88.53909912109376, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011095283689932976, "frac_reward_zero_std": 0.93125, "grad_norm": 3.655881643295288, "kl": 2.899080211878754, "learning_rate": 4.5826190476190476e-07, "loss": 0.0029, "num_tokens": 843125651.0, "reward": 0.3859375, "reward_std": 0.054763433337211606, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9122056365013123, "step": 12260 }, { "completion_length": 385.8, "completions/clipped_ratio": 0.0, "completions/max_length": 385.8, "completions/max_terminated_length": 385.8, "completions/mean_length": 93.4921875, "completions/mean_terminated_length": 93.4921875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011099808683281236, "frac_reward_zero_std": 0.975, "grad_norm": 1.0637526512145996, "kl": 2.464614102197811, "learning_rate": 4.5822222222222216e-07, "loss": 0.0025, "num_tokens": 843447905.0, "reward": 0.2375, "reward_std": 0.02041158601641655, "rewards/verify_chess_move/mean": 0.2375, "rewards/verify_chess_move/std": 0.9607269883155822, "step": 12265 }, { "completion_length": 435.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 435.4, "completions/max_terminated_length": 414.0, "completions/mean_length": 93.7796875, "completions/mean_terminated_length": 93.25105438232421, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011104333676629495, "frac_reward_zero_std": 0.95, "grad_norm": 3.2271029949188232, "kl": 1.8921943568973802, "learning_rate": 4.5818253968253967e-07, "loss": 0.0019, "num_tokens": 843769031.0, "reward": 0.2140625, "reward_std": 0.03877224512398243, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.9599435687065124, "step": 12270 }, { "completion_length": 376.6, "completions/clipped_ratio": 0.0, "completions/max_length": 376.6, "completions/max_terminated_length": 376.6, "completions/mean_length": 90.49375, "completions/mean_terminated_length": 90.49375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.011108858669977755, "frac_reward_zero_std": 0.9375, "grad_norm": 4.496208667755127, "kl": 1.274907778808847, "learning_rate": 4.581428571428571e-07, "loss": 0.0013, "num_tokens": 844083527.0, "reward": 0.4078125, "reward_std": 0.056919168680906296, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.9074596762657166, "step": 12275 }, { "completion_length": 306.6, "completions/clipped_ratio": 0.0, "completions/max_length": 306.6, "completions/max_terminated_length": 306.6, "completions/mean_length": 83.50546875, "completions/mean_terminated_length": 83.50546875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011113383663326015, "frac_reward_zero_std": 0.975, "grad_norm": 1.4103069305419922, "kl": 0.153512775991112, "learning_rate": 4.5810317460317463e-07, "loss": 0.0002, "num_tokens": 844386294.0, "reward": 0.20625, "reward_std": 0.019727616384625436, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.9778976798057556, "step": 12280 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 93.1671875, "completions/mean_terminated_length": 93.1671875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011117908656674275, "frac_reward_zero_std": 0.93125, "grad_norm": 11.142801284790039, "kl": 0.3529378810664639, "learning_rate": 4.5806349206349203e-07, "loss": 0.0004, "num_tokens": 844705420.0, "reward": 0.2734375, "reward_std": 0.05749734900891781, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9603789448738098, "step": 12285 }, { "completion_length": 448.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 92.19296875, "completions/mean_terminated_length": 92.19296875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.011122433650022535, "frac_reward_zero_std": 0.95625, "grad_norm": 0.029757685959339142, "kl": 0.17727526070084423, "learning_rate": 4.580238095238095e-07, "loss": 0.0002, "num_tokens": 845021611.0, "reward": 0.4015625, "reward_std": 0.03298586905002594, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9100170612335206, "step": 12290 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/max_terminated_length": 358.4, "completions/mean_length": 94.83515625, "completions/mean_terminated_length": 94.83515625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011126958643370793, "frac_reward_zero_std": 0.91875, "grad_norm": 0.24345150589942932, "kl": 0.4499307798221707, "learning_rate": 4.57984126984127e-07, "loss": 0.0004, "num_tokens": 845345712.0, "reward": 0.10625, "reward_std": 0.07470164857804776, "rewards/verify_chess_move/mean": 0.10625, "rewards/verify_chess_move/std": 0.9831378102302551, "step": 12295 }, { "completion_length": 360.4, "completions/clipped_ratio": 0.0, "completions/max_length": 360.4, "completions/max_terminated_length": 360.4, "completions/mean_length": 95.41484375, "completions/mean_terminated_length": 95.41484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011131483636719054, "frac_reward_zero_std": 0.925, "grad_norm": 4.1675567626953125, "kl": 0.7888100357726217, "learning_rate": 4.579444444444444e-07, "loss": 0.0008, "num_tokens": 845668667.0, "reward": 0.284375, "reward_std": 0.05713290125131607, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9248729228973389, "step": 12300 }, { "completion_length": 500.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 500.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 91.2390625, "completions/mean_terminated_length": 90.19033660888672, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011136008630067314, "frac_reward_zero_std": 0.93125, "grad_norm": 9.000247955322266, "kl": 2.2771897914120927, "learning_rate": 4.579047619047619e-07, "loss": 0.0023, "num_tokens": 845983693.0, "reward": 0.3328125, "reward_std": 0.05749734900891781, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9357598900794983, "step": 12305 }, { "completion_length": 469.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 469.8, "completions/max_terminated_length": 414.6, "completions/mean_length": 98.05546875, "completions/mean_terminated_length": 97.52896881103516, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011140533623415574, "frac_reward_zero_std": 0.9625, "grad_norm": 3.2489805221557617, "kl": 0.6236286410712637, "learning_rate": 4.5786507936507935e-07, "loss": 0.0006, "num_tokens": 846310364.0, "reward": 0.4015625, "reward_std": 0.03445763699710369, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9019386529922485, "step": 12310 }, { "completion_length": 282.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 86.92578125, "completions/mean_terminated_length": 86.92578125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011145058616763834, "frac_reward_zero_std": 0.925, "grad_norm": 11.037848472595215, "kl": 4.049372841091826, "learning_rate": 4.578253968253968e-07, "loss": 0.004, "num_tokens": 846620901.0, "reward": 0.2171875, "reward_std": 0.0621289324015379, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9656719326972961, "step": 12315 }, { "completion_length": 369.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 369.4, "completions/max_terminated_length": 304.4, "completions/mean_length": 79.63515625, "completions/mean_terminated_length": 79.09960021972657, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011149583610112094, "frac_reward_zero_std": 0.95625, "grad_norm": 14.048995971679688, "kl": 2.0263480230933055, "learning_rate": 4.5778571428571426e-07, "loss": 0.002, "num_tokens": 846917826.0, "reward": 0.275, "reward_std": 0.037086743861436844, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9581957936286927, "step": 12320 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 96.4140625, "completions/mean_terminated_length": 96.4140625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011154108603460352, "frac_reward_zero_std": 0.95625, "grad_norm": 0.8869137763977051, "kl": 3.603921816428192, "learning_rate": 4.577460317460317e-07, "loss": 0.0036, "num_tokens": 847243380.0, "reward": 0.296875, "reward_std": 0.044602562487125394, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9539791345596313, "step": 12325 }, { "completion_length": 308.8, "completions/clipped_ratio": 0.0, "completions/max_length": 308.8, "completions/max_terminated_length": 308.8, "completions/mean_length": 89.16484375, "completions/mean_terminated_length": 89.16484375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011158633596808612, "frac_reward_zero_std": 0.9375, "grad_norm": 5.948394775390625, "kl": 4.059071559412405, "learning_rate": 4.577063492063492e-07, "loss": 0.0041, "num_tokens": 847556047.0, "reward": 0.3671875, "reward_std": 0.05691916793584824, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9289471507072449, "step": 12330 }, { "completion_length": 523.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 523.4, "completions/max_terminated_length": 429.0, "completions/mean_length": 90.59765625, "completions/mean_terminated_length": 90.06526489257813, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011163158590156872, "frac_reward_zero_std": 0.91875, "grad_norm": 12.266880989074707, "kl": 5.508571368455887, "learning_rate": 4.576666666666666e-07, "loss": 0.0055, "num_tokens": 847870204.0, "reward": 0.34375, "reward_std": 0.07427731528878212, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.931068217754364, "step": 12335 }, { "completion_length": 412.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 412.2, "completions/max_terminated_length": 350.6, "completions/mean_length": 86.0828125, "completions/mean_terminated_length": 85.55960540771484, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011167683583505133, "frac_reward_zero_std": 0.94375, "grad_norm": 15.190770149230957, "kl": 3.0917327585513705, "learning_rate": 4.576269841269841e-07, "loss": 0.0031, "num_tokens": 848179374.0, "reward": 0.3421875, "reward_std": 0.05023763999342919, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9393400430679322, "step": 12340 }, { "completion_length": 577.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 577.4, "completions/max_terminated_length": 418.6, "completions/mean_length": 94.54609375, "completions/mean_terminated_length": 93.4926025390625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011172208576853393, "frac_reward_zero_std": 0.88125, "grad_norm": 22.859434127807617, "kl": 1.8343982244608923, "learning_rate": 4.575873015873016e-07, "loss": 0.0018, "num_tokens": 848499761.0, "reward": 0.3671875, "reward_std": 0.09327702075242997, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9262471675872803, "step": 12345 }, { "completion_length": 272.8, "completions/clipped_ratio": 0.0, "completions/max_length": 272.8, "completions/max_terminated_length": 272.8, "completions/mean_length": 93.27265625, "completions/mean_terminated_length": 93.27265625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011176733570201651, "frac_reward_zero_std": 0.9, "grad_norm": 11.647893905639648, "kl": 0.5259926883038133, "learning_rate": 4.5754761904761904e-07, "loss": 0.0005, "num_tokens": 848820270.0, "reward": 0.3375, "reward_std": 0.08027840480208397, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9267609119415283, "step": 12350 }, { "completion_length": 362.4, "completions/clipped_ratio": 0.0, "completions/max_length": 362.4, "completions/max_terminated_length": 362.4, "completions/mean_length": 93.13671875, "completions/mean_terminated_length": 93.13671875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011181258563549911, "frac_reward_zero_std": 0.9375, "grad_norm": 3.7858545780181885, "kl": 1.6320024273358285, "learning_rate": 4.575079365079365e-07, "loss": 0.0016, "num_tokens": 849139853.0, "reward": 0.41875, "reward_std": 0.05760215893387795, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8919111967086792, "step": 12355 }, { "completion_length": 331.6, "completions/clipped_ratio": 0.0, "completions/max_length": 331.6, "completions/max_terminated_length": 331.6, "completions/mean_length": 90.11875, "completions/mean_terminated_length": 90.11875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011185783556898171, "frac_reward_zero_std": 0.91875, "grad_norm": 23.665363311767578, "kl": 0.36470159083837644, "learning_rate": 4.5746825396825395e-07, "loss": 0.0004, "num_tokens": 849454413.0, "reward": 0.428125, "reward_std": 0.06812747344374656, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8818472027778625, "step": 12360 }, { "completion_length": 360.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 89.58203125, "completions/mean_terminated_length": 89.58203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011190308550246431, "frac_reward_zero_std": 0.95625, "grad_norm": 4.90873908996582, "kl": 0.3342311532702297, "learning_rate": 4.574285714285714e-07, "loss": 0.0003, "num_tokens": 849768486.0, "reward": 0.3203125, "reward_std": 0.03298586793243885, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.936632764339447, "step": 12365 }, { "completion_length": 502.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 502.4, "completions/max_terminated_length": 413.0, "completions/mean_length": 86.81015625, "completions/mean_terminated_length": 86.27684783935547, "completions/min_length": 28.8, "completions/min_terminated_length": 28.8, "epoch": 0.011194833543594691, "frac_reward_zero_std": 0.9625, "grad_norm": 0.004997181706130505, "kl": 0.3827997911372222, "learning_rate": 4.573888888888889e-07, "loss": 0.0004, "num_tokens": 850076315.0, "reward": 0.3390625, "reward_std": 0.03287851177155972, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9274988889694213, "step": 12370 }, { "completion_length": 380.6, "completions/clipped_ratio": 0.0, "completions/max_length": 380.6, "completions/max_terminated_length": 380.6, "completions/mean_length": 80.684375, "completions/mean_terminated_length": 80.684375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011199358536942951, "frac_reward_zero_std": 0.925, "grad_norm": 37.79359817504883, "kl": 0.6225233690114692, "learning_rate": 4.573492063492063e-07, "loss": 0.0006, "num_tokens": 850375199.0, "reward": 0.3203125, "reward_std": 0.06806758940219879, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9128700375556946, "step": 12375 }, { "completion_length": 356.8, "completions/clipped_ratio": 0.0, "completions/max_length": 356.8, "completions/max_terminated_length": 356.8, "completions/mean_length": 89.125, "completions/mean_terminated_length": 89.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01120388353029121, "frac_reward_zero_std": 0.91875, "grad_norm": 5.26190185546875, "kl": 0.23344741153996437, "learning_rate": 4.573095238095238e-07, "loss": 0.0002, "num_tokens": 850687239.0, "reward": 0.3578125, "reward_std": 0.07359432876110077, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9248865485191345, "step": 12380 }, { "completion_length": 362.4, "completions/clipped_ratio": 0.0, "completions/max_length": 362.4, "completions/max_terminated_length": 362.4, "completions/mean_length": 89.83046875, "completions/mean_terminated_length": 89.83046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01120840852363947, "frac_reward_zero_std": 0.89375, "grad_norm": 36.084197998046875, "kl": 0.3264555081957951, "learning_rate": 4.5726984126984127e-07, "loss": 0.0003, "num_tokens": 850999318.0, "reward": 0.46875, "reward_std": 0.09148416593670845, "rewards/verify_chess_move/mean": 0.46875, "rewards/verify_chess_move/std": 0.8801679491996766, "step": 12385 }, { "completion_length": 423.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 95.17109375, "completions/mean_terminated_length": 95.17109375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01121293351698773, "frac_reward_zero_std": 0.925, "grad_norm": 9.575860023498535, "kl": 0.9568638420663774, "learning_rate": 4.5723015873015867e-07, "loss": 0.001, "num_tokens": 851320473.0, "reward": 0.2453125, "reward_std": 0.06575800627470016, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9540632247924805, "step": 12390 }, { "completion_length": 336.8, "completions/clipped_ratio": 0.0, "completions/max_length": 336.8, "completions/max_terminated_length": 336.8, "completions/mean_length": 84.19765625, "completions/mean_terminated_length": 84.19765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01121745851033599, "frac_reward_zero_std": 0.95625, "grad_norm": 4.573204040527344, "kl": 0.25378000452183186, "learning_rate": 4.571904761904762e-07, "loss": 0.0003, "num_tokens": 851623182.0, "reward": 0.4859375, "reward_std": 0.039560042321681976, "rewards/verify_chess_move/mean": 0.4859375, "rewards/verify_chess_move/std": 0.8682936310768128, "step": 12395 }, { "completion_length": 305.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 85.9828125, "completions/mean_terminated_length": 85.9828125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01122198350368425, "frac_reward_zero_std": 0.95625, "grad_norm": 11.708888053894043, "kl": 0.2891963049536571, "learning_rate": 4.5715079365079363e-07, "loss": 0.0003, "num_tokens": 851931168.0, "reward": 0.4015625, "reward_std": 0.033669838681817055, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.8915926337242126, "step": 12400 }, { "completion_length": 342.4, "completions/clipped_ratio": 0.0, "completions/max_length": 342.4, "completions/max_terminated_length": 342.4, "completions/mean_length": 89.24453125, "completions/mean_terminated_length": 89.24453125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011226508497032509, "frac_reward_zero_std": 0.94375, "grad_norm": 5.394712924957275, "kl": 0.9104967981693335, "learning_rate": 4.5711111111111114e-07, "loss": 0.0009, "num_tokens": 852244689.0, "reward": 0.425, "reward_std": 0.04797552675008774, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.9009334564208984, "step": 12405 }, { "completion_length": 420.4, "completions/clipped_ratio": 0.0, "completions/max_length": 420.4, "completions/max_terminated_length": 420.4, "completions/mean_length": 96.39609375, "completions/mean_terminated_length": 96.39609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011231033490380769, "frac_reward_zero_std": 0.9375, "grad_norm": 3.817203998565674, "kl": 1.8981486419215798, "learning_rate": 4.5707142857142854e-07, "loss": 0.0019, "num_tokens": 852568324.0, "reward": 0.3125, "reward_std": 0.05102798454463482, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9465835332870484, "step": 12410 }, { "completion_length": 444.4, "completions/clipped_ratio": 0.0, "completions/max_length": 444.4, "completions/max_terminated_length": 444.4, "completions/mean_length": 89.875, "completions/mean_terminated_length": 89.875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011235558483729029, "frac_reward_zero_std": 0.9375, "grad_norm": 15.210139274597168, "kl": 2.233633906778414, "learning_rate": 4.57031746031746e-07, "loss": 0.0022, "num_tokens": 852881236.0, "reward": 0.284375, "reward_std": 0.05739097334444523, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9459868311882019, "step": 12415 }, { "completion_length": 416.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 416.2, "completions/max_terminated_length": 337.0, "completions/mean_length": 91.62734375, "completions/mean_terminated_length": 90.58148956298828, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011240083477077289, "frac_reward_zero_std": 0.93125, "grad_norm": 9.682707786560059, "kl": 7.092699271289166, "learning_rate": 4.569920634920635e-07, "loss": 0.0071, "num_tokens": 853197543.0, "reward": 0.415625, "reward_std": 0.05476441346108914, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9002684116363525, "step": 12420 }, { "completion_length": 388.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 388.8, "completions/max_terminated_length": 365.6, "completions/mean_length": 97.9875, "completions/mean_terminated_length": 97.47179718017578, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011244608470425549, "frac_reward_zero_std": 0.94375, "grad_norm": 5.255337715148926, "kl": 2.9107230748981237, "learning_rate": 4.569523809523809e-07, "loss": 0.0029, "num_tokens": 853523999.0, "reward": 0.3109375, "reward_std": 0.0513924315571785, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9398425459861756, "step": 12425 }, { "completion_length": 267.2, "completions/clipped_ratio": 0.0, "completions/max_length": 267.2, "completions/max_terminated_length": 267.2, "completions/mean_length": 85.32421875, "completions/mean_terminated_length": 85.32421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011249133463773809, "frac_reward_zero_std": 0.9625, "grad_norm": 10.898957252502441, "kl": 1.664860667847097, "learning_rate": 4.569126984126984e-07, "loss": 0.0017, "num_tokens": 853830470.0, "reward": 0.3421875, "reward_std": 0.030617379397153855, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9309602618217468, "step": 12430 }, { "completion_length": 412.4, "completions/clipped_ratio": 0.0, "completions/max_length": 412.4, "completions/max_terminated_length": 412.4, "completions/mean_length": 95.1359375, "completions/mean_terminated_length": 95.1359375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011253658457122067, "frac_reward_zero_std": 0.925, "grad_norm": 15.791041374206543, "kl": 1.4933153915568256, "learning_rate": 4.5687301587301586e-07, "loss": 0.0015, "num_tokens": 854153924.0, "reward": 0.3921875, "reward_std": 0.06144496239721775, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.920660400390625, "step": 12435 }, { "completion_length": 319.2, "completions/clipped_ratio": 0.0, "completions/max_length": 319.2, "completions/max_terminated_length": 319.2, "completions/mean_length": 87.85390625, "completions/mean_terminated_length": 87.85390625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011258183450470327, "frac_reward_zero_std": 0.925, "grad_norm": 8.53834056854248, "kl": 2.058853797405027, "learning_rate": 4.568333333333333e-07, "loss": 0.0021, "num_tokens": 854464513.0, "reward": 0.4171875, "reward_std": 0.0637080579996109, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8916038632392883, "step": 12440 }, { "completion_length": 354.8, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/max_terminated_length": 354.8, "completions/mean_length": 82.68046875, "completions/mean_terminated_length": 82.68046875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011262708443818588, "frac_reward_zero_std": 0.95625, "grad_norm": 1.023571252822876, "kl": 0.9389315848471597, "learning_rate": 4.5679365079365077e-07, "loss": 0.0009, "num_tokens": 854766288.0, "reward": 0.359375, "reward_std": 0.035247981548309326, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9309756398200989, "step": 12445 }, { "completion_length": 414.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 414.0, "completions/max_terminated_length": 363.8, "completions/mean_length": 91.88828125, "completions/mean_terminated_length": 91.36880950927734, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011267233437166848, "frac_reward_zero_std": 0.9, "grad_norm": 19.394424438476562, "kl": 4.091942484979517, "learning_rate": 4.567539682539682e-07, "loss": 0.0041, "num_tokens": 855082345.0, "reward": 0.2546875, "reward_std": 0.08501479998230935, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9667436599731445, "step": 12450 }, { "completion_length": 270.6, "completions/clipped_ratio": 0.0, "completions/max_length": 270.6, "completions/max_terminated_length": 270.6, "completions/mean_length": 86.59140625, "completions/mean_terminated_length": 86.59140625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011271758430515108, "frac_reward_zero_std": 0.95, "grad_norm": 6.5900492668151855, "kl": 3.1961285140365363, "learning_rate": 4.5671428571428573e-07, "loss": 0.0032, "num_tokens": 855390414.0, "reward": 0.3890625, "reward_std": 0.04650121033191681, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9133020877838135, "step": 12455 }, { "completion_length": 330.6, "completions/clipped_ratio": 0.0, "completions/max_length": 330.6, "completions/max_terminated_length": 330.6, "completions/mean_length": 86.4734375, "completions/mean_terminated_length": 86.4734375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011276283423863366, "frac_reward_zero_std": 0.90625, "grad_norm": 21.773365020751953, "kl": 3.8816404677694663, "learning_rate": 4.5667460317460313e-07, "loss": 0.0039, "num_tokens": 855698252.0, "reward": 0.4734375, "reward_std": 0.08085403889417649, "rewards/verify_chess_move/mean": 0.4734375, "rewards/verify_chess_move/std": 0.8725157737731933, "step": 12460 }, { "completion_length": 390.6, "completions/clipped_ratio": 0.0, "completions/max_length": 390.6, "completions/max_terminated_length": 390.6, "completions/mean_length": 84.75859375, "completions/mean_terminated_length": 84.75859375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011280808417211626, "frac_reward_zero_std": 0.975, "grad_norm": 5.803497314453125, "kl": 3.6420241463813, "learning_rate": 4.566349206349206e-07, "loss": 0.0036, "num_tokens": 856002231.0, "reward": 0.53125, "reward_std": 0.023827511072158813, "rewards/verify_chess_move/mean": 0.53125, "rewards/verify_chess_move/std": 0.8379926443099975, "step": 12465 }, { "completion_length": 282.2, "completions/clipped_ratio": 0.0, "completions/max_length": 282.2, "completions/max_terminated_length": 282.2, "completions/mean_length": 91.15625, "completions/mean_terminated_length": 91.15625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011285333410559886, "frac_reward_zero_std": 0.93125, "grad_norm": 5.6116557121276855, "kl": 3.2890361715108156, "learning_rate": 4.565952380952381e-07, "loss": 0.0033, "num_tokens": 856319775.0, "reward": 0.2640625, "reward_std": 0.050663537532091144, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9562499523162842, "step": 12470 }, { "completion_length": 451.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 451.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 87.22578125, "completions/mean_terminated_length": 86.68658752441407, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011289858403908146, "frac_reward_zero_std": 0.91875, "grad_norm": 12.756050109863281, "kl": 2.829300629440695, "learning_rate": 4.5655555555555555e-07, "loss": 0.0028, "num_tokens": 856629608.0, "reward": 0.396875, "reward_std": 0.0745369553565979, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9156683444976806, "step": 12475 }, { "completion_length": 448.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 448.8, "completions/max_terminated_length": 386.8, "completions/mean_length": 86.9421875, "completions/mean_terminated_length": 86.42104644775391, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011294383397256406, "frac_reward_zero_std": 0.95, "grad_norm": 5.775125026702881, "kl": 3.6879413351183756, "learning_rate": 4.56515873015873e-07, "loss": 0.0037, "num_tokens": 856938126.0, "reward": 0.503125, "reward_std": 0.0374052856117487, "rewards/verify_chess_move/mean": 0.503125, "rewards/verify_chess_move/std": 0.854828667640686, "step": 12480 }, { "completion_length": 492.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 87.340625, "completions/mean_terminated_length": 87.340625, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.011298908390604667, "frac_reward_zero_std": 0.95, "grad_norm": 5.64084529876709, "kl": 0.6312473181518726, "learning_rate": 4.5647619047619045e-07, "loss": 0.0006, "num_tokens": 857246202.0, "reward": 0.315625, "reward_std": 0.04308430440723896, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9486071825027466, "step": 12485 }, { "completion_length": 392.4, "completions/clipped_ratio": 0.0, "completions/max_length": 392.4, "completions/max_terminated_length": 392.4, "completions/mean_length": 93.89609375, "completions/mean_terminated_length": 93.89609375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011303433383952925, "frac_reward_zero_std": 0.94375, "grad_norm": 1.7310500144958496, "kl": 1.420553237956483, "learning_rate": 4.564365079365079e-07, "loss": 0.0014, "num_tokens": 857565869.0, "reward": 0.346875, "reward_std": 0.04524161070585251, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9285802721977234, "step": 12490 }, { "completion_length": 316.2, "completions/clipped_ratio": 0.0, "completions/max_length": 316.2, "completions/max_terminated_length": 316.2, "completions/mean_length": 86.1796875, "completions/mean_terminated_length": 86.1796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011307958377301185, "frac_reward_zero_std": 0.95, "grad_norm": 10.184063911437988, "kl": 0.7143266498111188, "learning_rate": 4.563968253968254e-07, "loss": 0.0007, "num_tokens": 857874323.0, "reward": 0.328125, "reward_std": 0.046502193063497545, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9390044212341309, "step": 12495 }, { "completion_length": 516.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 516.0, "completions/max_terminated_length": 444.6, "completions/mean_length": 89.4453125, "completions/mean_terminated_length": 88.90561370849609, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011312483370649445, "frac_reward_zero_std": 0.9375, "grad_norm": 7.1113152503967285, "kl": 0.49039616832742466, "learning_rate": 4.563571428571428e-07, "loss": 0.0005, "num_tokens": 858188045.0, "reward": 0.2796875, "reward_std": 0.05124015025794506, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9401378870010376, "step": 12500 }, { "completion_length": 493.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 493.2, "completions/max_terminated_length": 484.6, "completions/mean_length": 92.0984375, "completions/mean_terminated_length": 91.0346893310547, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011317008363997705, "frac_reward_zero_std": 0.91875, "grad_norm": 8.497904777526855, "kl": 0.2309459627373144, "learning_rate": 4.563174603174603e-07, "loss": 0.0002, "num_tokens": 858504107.0, "reward": 0.3234375, "reward_std": 0.06907010078430176, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9085479974746704, "step": 12505 }, { "completion_length": 388.6, "completions/clipped_ratio": 0.0, "completions/max_length": 388.6, "completions/max_terminated_length": 388.6, "completions/mean_length": 93.67734375, "completions/mean_terminated_length": 93.67734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011321533357345965, "frac_reward_zero_std": 0.9125, "grad_norm": 3.655825614929199, "kl": 0.9027132852585055, "learning_rate": 4.562777777777778e-07, "loss": 0.0009, "num_tokens": 858823886.0, "reward": 0.2515625, "reward_std": 0.07596477940678596, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9595314025878906, "step": 12510 }, { "completion_length": 287.2, "completions/clipped_ratio": 0.0, "completions/max_length": 287.2, "completions/max_terminated_length": 287.2, "completions/mean_length": 86.2875, "completions/mean_terminated_length": 86.2875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011326058350694224, "frac_reward_zero_std": 0.94375, "grad_norm": 0.9990772604942322, "kl": 0.3991483114194125, "learning_rate": 4.562380952380952e-07, "loss": 0.0004, "num_tokens": 859132830.0, "reward": 0.4390625, "reward_std": 0.04750372394919396, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8805689096450806, "step": 12515 }, { "completion_length": 382.2, "completions/clipped_ratio": 0.0, "completions/max_length": 382.2, "completions/max_terminated_length": 382.2, "completions/mean_length": 87.79140625, "completions/mean_terminated_length": 87.79140625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011330583344042484, "frac_reward_zero_std": 0.93125, "grad_norm": 17.755599975585938, "kl": 1.1845109428977594, "learning_rate": 4.561984126984127e-07, "loss": 0.0012, "num_tokens": 859441419.0, "reward": 0.3453125, "reward_std": 0.059971627220511435, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9344810485839844, "step": 12520 }, { "completion_length": 369.6, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/max_terminated_length": 369.6, "completions/mean_length": 91.315625, "completions/mean_terminated_length": 91.315625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011335108337390744, "frac_reward_zero_std": 0.95, "grad_norm": 0.0009463595342822373, "kl": 0.7585922695230692, "learning_rate": 4.5615873015873014e-07, "loss": 0.0008, "num_tokens": 859757463.0, "reward": 0.4875, "reward_std": 0.04082317315042019, "rewards/verify_chess_move/mean": 0.4875, "rewards/verify_chess_move/std": 0.8106208622455597, "step": 12525 }, { "completion_length": 397.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 397.0, "completions/max_terminated_length": 327.8, "completions/mean_length": 91.66640625, "completions/mean_terminated_length": 91.1423843383789, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011339633330739004, "frac_reward_zero_std": 0.93125, "grad_norm": 2.26082444190979, "kl": 2.8404704570770263, "learning_rate": 4.5611904761904764e-07, "loss": 0.0028, "num_tokens": 860074380.0, "reward": 0.346875, "reward_std": 0.05818033739924431, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9342099905014039, "step": 12530 }, { "completion_length": 392.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 392.6, "completions/max_terminated_length": 294.0, "completions/mean_length": 90.41328125, "completions/mean_terminated_length": 89.86403198242188, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011344158324087264, "frac_reward_zero_std": 0.94375, "grad_norm": 1.2455531358718872, "kl": 2.2952425666386262, "learning_rate": 4.5607936507936505e-07, "loss": 0.0023, "num_tokens": 860389253.0, "reward": 0.2609375, "reward_std": 0.046608566865324975, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9601973414421081, "step": 12535 }, { "completion_length": 276.6, "completions/clipped_ratio": 0.0, "completions/max_length": 276.6, "completions/max_terminated_length": 276.6, "completions/mean_length": 86.78515625, "completions/mean_terminated_length": 86.78515625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011348683317435524, "frac_reward_zero_std": 0.95625, "grad_norm": 0.04629750922322273, "kl": 0.48258750066161155, "learning_rate": 4.560396825396825e-07, "loss": 0.0005, "num_tokens": 860697978.0, "reward": 0.3953125, "reward_std": 0.03934885673224926, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9188748717308044, "step": 12540 }, { "completion_length": 284.4, "completions/clipped_ratio": 0.0, "completions/max_length": 284.4, "completions/max_terminated_length": 284.4, "completions/mean_length": 92.459375, "completions/mean_terminated_length": 92.459375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011353208310783782, "frac_reward_zero_std": 0.9625, "grad_norm": 0.31886622309684753, "kl": 1.0909123613499105, "learning_rate": 4.56e-07, "loss": 0.0011, "num_tokens": 861016646.0, "reward": 0.3546875, "reward_std": 0.03356248252093792, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9215829372406006, "step": 12545 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 366.2, "completions/max_terminated_length": 302.2, "completions/mean_length": 88.00390625, "completions/mean_terminated_length": 87.48596649169922, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011357733304132043, "frac_reward_zero_std": 0.9375, "grad_norm": 14.126205444335938, "kl": 1.418535756971687, "learning_rate": 4.559603174603174e-07, "loss": 0.0014, "num_tokens": 861325147.0, "reward": 0.4296875, "reward_std": 0.057389992475509646, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8923425674438477, "step": 12550 }, { "completion_length": 296.8, "completions/clipped_ratio": 0.0, "completions/max_length": 296.8, "completions/max_terminated_length": 296.8, "completions/mean_length": 87.89296875, "completions/mean_terminated_length": 87.89296875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011362258297480303, "frac_reward_zero_std": 0.9625, "grad_norm": 7.081122398376465, "kl": 0.7626691489014774, "learning_rate": 4.5592063492063486e-07, "loss": 0.0008, "num_tokens": 861635874.0, "reward": 0.453125, "reward_std": 0.03424547053873539, "rewards/verify_chess_move/mean": 0.453125, "rewards/verify_chess_move/std": 0.8742341876029969, "step": 12555 }, { "completion_length": 345.4, "completions/clipped_ratio": 0.0, "completions/max_length": 345.4, "completions/max_terminated_length": 345.4, "completions/mean_length": 89.896875, "completions/mean_terminated_length": 89.896875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011366783290828563, "frac_reward_zero_std": 0.9375, "grad_norm": 13.789678573608398, "kl": 2.1967006211401894, "learning_rate": 4.5588095238095237e-07, "loss": 0.0022, "num_tokens": 861950014.0, "reward": 0.334375, "reward_std": 0.05465705655515194, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9394620180130004, "step": 12560 }, { "completion_length": 453.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 453.4, "completions/max_terminated_length": 433.2, "completions/mean_length": 95.2234375, "completions/mean_terminated_length": 94.70154266357422, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011371308284176823, "frac_reward_zero_std": 0.9375, "grad_norm": 12.687609672546387, "kl": 1.0186440605903044, "learning_rate": 4.558412698412698e-07, "loss": 0.001, "num_tokens": 862272044.0, "reward": 0.346875, "reward_std": 0.0537619024515152, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9151248574256897, "step": 12565 }, { "completion_length": 491.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 491.2, "completions/max_terminated_length": 432.8, "completions/mean_length": 102.0296875, "completions/mean_terminated_length": 100.44876861572266, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011375833277525083, "frac_reward_zero_std": 0.8875, "grad_norm": 0.11912932246923447, "kl": 1.8244403422111646, "learning_rate": 4.558015873015873e-07, "loss": 0.0018, "num_tokens": 862604514.0, "reward": 0.471875, "reward_std": 0.09889615178108216, "rewards/verify_chess_move/mean": 0.471875, "rewards/verify_chess_move/std": 0.8793991804122925, "step": 12570 }, { "completion_length": 355.2, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 94.60859375, "completions/mean_terminated_length": 94.60859375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011380358270873341, "frac_reward_zero_std": 0.9625, "grad_norm": 2.632317543029785, "kl": 1.1379636428668163, "learning_rate": 4.5576190476190473e-07, "loss": 0.0011, "num_tokens": 862925117.0, "reward": 0.459375, "reward_std": 0.03540026247501373, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8699515223503113, "step": 12575 }, { "completion_length": 429.2, "completions/clipped_ratio": 0.0, "completions/max_length": 429.2, "completions/max_terminated_length": 429.2, "completions/mean_length": 97.915625, "completions/mean_terminated_length": 97.915625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011384883264221601, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0028058572206646204, "kl": 0.9426652021007612, "learning_rate": 4.557222222222222e-07, "loss": 0.0009, "num_tokens": 863252849.0, "reward": 0.315625, "reward_std": 0.03866586871445179, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9474978685379029, "step": 12580 }, { "completion_length": 421.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 421.4, "completions/max_terminated_length": 323.0, "completions/mean_length": 97.37109375, "completions/mean_terminated_length": 96.85313568115234, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011389408257569861, "frac_reward_zero_std": 0.91875, "grad_norm": 0.24867013096809387, "kl": 0.9271587657509371, "learning_rate": 4.556825396825397e-07, "loss": 0.0009, "num_tokens": 863579116.0, "reward": 0.3078125, "reward_std": 0.06702015250921249, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9457099318504334, "step": 12585 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 88.0953125, "completions/mean_terminated_length": 88.0953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011393933250918122, "frac_reward_zero_std": 0.95, "grad_norm": 6.821982383728027, "kl": 0.8347311092540621, "learning_rate": 4.556428571428571e-07, "loss": 0.0008, "num_tokens": 863890406.0, "reward": 0.3375, "reward_std": 0.04671337679028511, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9315119385719299, "step": 12590 }, { "completion_length": 440.2, "completions/clipped_ratio": 0.0, "completions/max_length": 440.2, "completions/max_terminated_length": 440.2, "completions/mean_length": 88.459375, "completions/mean_terminated_length": 88.459375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011398458244266382, "frac_reward_zero_std": 0.9375, "grad_norm": 31.98075294494629, "kl": 1.1591539888177067, "learning_rate": 4.556031746031746e-07, "loss": 0.0012, "num_tokens": 864201858.0, "reward": 0.421875, "reward_std": 0.05234747231006622, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8954668402671814, "step": 12595 }, { "completion_length": 323.4, "completions/clipped_ratio": 0.0, "completions/max_length": 323.4, "completions/max_terminated_length": 323.4, "completions/mean_length": 89.61953125, "completions/mean_terminated_length": 89.61953125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01140298323761464, "frac_reward_zero_std": 0.93125, "grad_norm": 12.216195106506348, "kl": 1.5566577205434442, "learning_rate": 4.5556349206349205e-07, "loss": 0.0016, "num_tokens": 864514291.0, "reward": 0.4421875, "reward_std": 0.05339745618402958, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8679111123085022, "step": 12600 }, { "completion_length": 345.2, "completions/clipped_ratio": 0.0, "completions/max_length": 345.2, "completions/max_terminated_length": 345.2, "completions/mean_length": 89.7125, "completions/mean_terminated_length": 89.7125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0114075082309629, "frac_reward_zero_std": 0.95, "grad_norm": 0.008917411789298058, "kl": 1.402724428009242, "learning_rate": 4.5552380952380945e-07, "loss": 0.0014, "num_tokens": 864829363.0, "reward": 0.3078125, "reward_std": 0.043556108698248865, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9489368319511413, "step": 12605 }, { "completion_length": 351.4, "completions/clipped_ratio": 0.0, "completions/max_length": 351.4, "completions/max_terminated_length": 351.4, "completions/mean_length": 91.48515625, "completions/mean_terminated_length": 91.48515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01141203322431116, "frac_reward_zero_std": 0.95, "grad_norm": 9.800779342651367, "kl": 1.9315836191410198, "learning_rate": 4.5548412698412696e-07, "loss": 0.0019, "num_tokens": 865143848.0, "reward": 0.3671875, "reward_std": 0.04445126317441463, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9240276336669921, "step": 12610 }, { "completion_length": 491.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 491.4, "completions/max_terminated_length": 393.2, "completions/mean_length": 87.66484375, "completions/mean_terminated_length": 87.12904205322266, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01141655821765942, "frac_reward_zero_std": 0.94375, "grad_norm": 14.160648345947266, "kl": 2.194118094164878, "learning_rate": 4.554444444444444e-07, "loss": 0.0022, "num_tokens": 865452507.0, "reward": 0.3640625, "reward_std": 0.049342484399676326, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9280421376228333, "step": 12615 }, { "completion_length": 355.2, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 84.04453125, "completions/mean_terminated_length": 84.04453125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01142108321100768, "frac_reward_zero_std": 0.94375, "grad_norm": 9.083357810974121, "kl": 1.535612259944901, "learning_rate": 4.554047619047619e-07, "loss": 0.0015, "num_tokens": 865756220.0, "reward": 0.2671875, "reward_std": 0.04750372357666492, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.9380276083946228, "step": 12620 }, { "completion_length": 523.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 523.2, "completions/max_terminated_length": 498.4, "completions/mean_length": 98.546875, "completions/mean_terminated_length": 98.03376617431641, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01142560820435594, "frac_reward_zero_std": 0.94375, "grad_norm": 8.96264934539795, "kl": 1.2352261729771272, "learning_rate": 4.553650793650793e-07, "loss": 0.0012, "num_tokens": 866081672.0, "reward": 0.3234375, "reward_std": 0.04592459686100483, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9408758282661438, "step": 12625 }, { "completion_length": 315.6, "completions/clipped_ratio": 0.0, "completions/max_length": 315.6, "completions/max_terminated_length": 315.6, "completions/mean_length": 87.58125, "completions/mean_terminated_length": 87.58125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011430133197704199, "frac_reward_zero_std": 0.9625, "grad_norm": 2.387840986251831, "kl": 0.750459850858897, "learning_rate": 4.553253968253968e-07, "loss": 0.0008, "num_tokens": 866392496.0, "reward": 0.3890625, "reward_std": 0.028778617456555366, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.919851279258728, "step": 12630 }, { "completion_length": 423.2, "completions/clipped_ratio": 0.0, "completions/max_length": 423.2, "completions/max_terminated_length": 423.2, "completions/mean_length": 95.98984375, "completions/mean_terminated_length": 95.98984375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011434658191052459, "frac_reward_zero_std": 0.9625, "grad_norm": 8.11136245727539, "kl": 0.7296312436228618, "learning_rate": 4.552857142857143e-07, "loss": 0.0007, "num_tokens": 866716331.0, "reward": 0.3125, "reward_std": 0.03424547091126442, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9445823907852173, "step": 12635 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 387.6, "completions/max_terminated_length": 386.8, "completions/mean_length": 91.94453125, "completions/mean_terminated_length": 91.41916198730469, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011439183184400719, "frac_reward_zero_std": 0.9, "grad_norm": 19.53377914428711, "kl": 0.6886359137250111, "learning_rate": 4.552460317460317e-07, "loss": 0.0007, "num_tokens": 867032428.0, "reward": 0.3015625, "reward_std": 0.08664041459560394, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9522030830383301, "step": 12640 }, { "completion_length": 348.4, "completions/clipped_ratio": 0.0, "completions/max_length": 348.4, "completions/max_terminated_length": 348.4, "completions/mean_length": 96.35625, "completions/mean_terminated_length": 96.35625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011443708177748979, "frac_reward_zero_std": 0.91875, "grad_norm": 15.37309455871582, "kl": 0.9553338607307523, "learning_rate": 4.552063492063492e-07, "loss": 0.001, "num_tokens": 867356276.0, "reward": 0.215625, "reward_std": 0.06975308954715728, "rewards/verify_chess_move/mean": 0.215625, "rewards/verify_chess_move/std": 0.9737971901893616, "step": 12645 }, { "completion_length": 435.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 435.6, "completions/max_terminated_length": 375.2, "completions/mean_length": 90.821875, "completions/mean_terminated_length": 90.2986068725586, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01144823317109724, "frac_reward_zero_std": 0.93125, "grad_norm": 0.0021075706463307142, "kl": 0.4012082805624232, "learning_rate": 4.5516666666666665e-07, "loss": 0.0004, "num_tokens": 867670520.0, "reward": 0.425, "reward_std": 0.05997261106967926, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.9022023200988769, "step": 12650 }, { "completion_length": 312.4, "completions/clipped_ratio": 0.0, "completions/max_length": 312.4, "completions/max_terminated_length": 312.4, "completions/mean_length": 83.61484375, "completions/mean_terminated_length": 83.61484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011452758164445498, "frac_reward_zero_std": 0.96875, "grad_norm": 6.521162509918213, "kl": 0.390823873039335, "learning_rate": 4.551269841269841e-07, "loss": 0.0004, "num_tokens": 867973835.0, "reward": 0.3109375, "reward_std": 0.029826053231954575, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9451753973960877, "step": 12655 }, { "completion_length": 563.2, "completions/clipped_ratio": 0.00390625, "completions/max_length": 563.2, "completions/max_terminated_length": 460.2, "completions/mean_length": 91.91796875, "completions/mean_terminated_length": 89.336181640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011457283157793758, "frac_reward_zero_std": 0.91875, "grad_norm": 15.522074699401855, "kl": 1.8520678949309513, "learning_rate": 4.5508730158730155e-07, "loss": 0.0019, "num_tokens": 868288202.0, "reward": 0.4984375, "reward_std": 0.06907010078430176, "rewards/verify_chess_move/mean": 0.4984375, "rewards/verify_chess_move/std": 0.8578610420227051, "step": 12660 }, { "completion_length": 448.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 448.8, "completions/max_terminated_length": 433.0, "completions/mean_length": 86.13125, "completions/mean_terminated_length": 85.07059478759766, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011461808151142018, "frac_reward_zero_std": 0.91875, "grad_norm": 0.438909113407135, "kl": 0.8644809682737105, "learning_rate": 4.55047619047619e-07, "loss": 0.0009, "num_tokens": 868596426.0, "reward": 0.2625, "reward_std": 0.06812747493386269, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9467707872390747, "step": 12665 }, { "completion_length": 298.6, "completions/clipped_ratio": 0.0, "completions/max_length": 298.6, "completions/max_terminated_length": 298.6, "completions/mean_length": 88.1546875, "completions/mean_terminated_length": 88.1546875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011466333144490278, "frac_reward_zero_std": 0.925, "grad_norm": 8.005627632141113, "kl": 0.8454791102092714, "learning_rate": 4.550079365079365e-07, "loss": 0.0008, "num_tokens": 868907392.0, "reward": 0.359375, "reward_std": 0.06943454891443253, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9200557231903076, "step": 12670 }, { "completion_length": 332.2, "completions/clipped_ratio": 0.0, "completions/max_length": 332.2, "completions/max_terminated_length": 332.2, "completions/mean_length": 82.44140625, "completions/mean_terminated_length": 82.44140625, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "epoch": 0.011470858137838538, "frac_reward_zero_std": 0.96875, "grad_norm": 11.560561180114746, "kl": 1.434578682249412, "learning_rate": 4.5496825396825397e-07, "loss": 0.0014, "num_tokens": 869208269.0, "reward": 0.5359375, "reward_std": 0.0268809512257576, "rewards/verify_chess_move/mean": 0.5359375, "rewards/verify_chess_move/std": 0.8415142059326172, "step": 12675 }, { "completion_length": 368.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 368.8, "completions/max_terminated_length": 350.8, "completions/mean_length": 89.84375, "completions/mean_terminated_length": 89.32042694091797, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011475383131186798, "frac_reward_zero_std": 0.94375, "grad_norm": 9.655946731567383, "kl": 0.835150255379267, "learning_rate": 4.5492857142857137e-07, "loss": 0.0008, "num_tokens": 869522685.0, "reward": 0.3203125, "reward_std": 0.0511327937245369, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9457594275474548, "step": 12680 }, { "completion_length": 372.6, "completions/clipped_ratio": 0.0, "completions/max_length": 372.6, "completions/max_terminated_length": 372.6, "completions/mean_length": 87.378125, "completions/mean_terminated_length": 87.378125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011479908124535056, "frac_reward_zero_std": 0.94375, "grad_norm": 2.2795486450195312, "kl": 0.7640276505611837, "learning_rate": 4.548888888888889e-07, "loss": 0.0008, "num_tokens": 869831913.0, "reward": 0.49375, "reward_std": 0.05160459838807583, "rewards/verify_chess_move/mean": 0.49375, "rewards/verify_chess_move/std": 0.8653579711914062, "step": 12685 }, { "completion_length": 424.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 424.0, "completions/max_terminated_length": 384.6, "completions/mean_length": 92.37421875, "completions/mean_terminated_length": 91.84783477783203, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011484433117883316, "frac_reward_zero_std": 0.95, "grad_norm": 4.336388111114502, "kl": 1.223329971334897, "learning_rate": 4.5484920634920633e-07, "loss": 0.0012, "num_tokens": 870148912.0, "reward": 0.34375, "reward_std": 0.042873119562864305, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9379880785942077, "step": 12690 }, { "completion_length": 322.8, "completions/clipped_ratio": 0.0, "completions/max_length": 322.8, "completions/max_terminated_length": 322.8, "completions/mean_length": 93.65625, "completions/mean_terminated_length": 93.65625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011488958111231577, "frac_reward_zero_std": 0.925, "grad_norm": 6.0525689125061035, "kl": 1.186637774296105, "learning_rate": 4.548095238095238e-07, "loss": 0.0012, "num_tokens": 870468240.0, "reward": 0.303125, "reward_std": 0.06938609592616558, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9502152323722839, "step": 12695 }, { "completion_length": 362.6, "completions/clipped_ratio": 0.0, "completions/max_length": 362.6, "completions/max_terminated_length": 362.6, "completions/mean_length": 94.03515625, "completions/mean_terminated_length": 94.03515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011493483104579837, "frac_reward_zero_std": 0.95625, "grad_norm": 8.239413261413574, "kl": 5.951454191887751, "learning_rate": 4.5476984126984124e-07, "loss": 0.006, "num_tokens": 870788045.0, "reward": 0.4875, "reward_std": 0.03640277311205864, "rewards/verify_chess_move/mean": 0.4875, "rewards/verify_chess_move/std": 0.839310085773468, "step": 12700 }, { "completion_length": 394.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 89.8484375, "completions/mean_terminated_length": 89.8484375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011498008097928097, "frac_reward_zero_std": 0.9375, "grad_norm": 11.493680000305176, "kl": 2.726875028957147, "learning_rate": 4.547301587301587e-07, "loss": 0.0027, "num_tokens": 871102931.0, "reward": 0.3390625, "reward_std": 0.05239494368433952, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9382880449295044, "step": 12705 }, { "completion_length": 400.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 88.646875, "completions/mean_terminated_length": 88.646875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011502533091276355, "frac_reward_zero_std": 0.95, "grad_norm": 7.371463298797607, "kl": 3.836793415877037, "learning_rate": 4.546904761904762e-07, "loss": 0.0038, "num_tokens": 871415127.0, "reward": 0.30625, "reward_std": 0.04376827478408814, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9467539429664612, "step": 12710 }, { "completion_length": 448.4, "completions/clipped_ratio": 0.0, "completions/max_length": 448.4, "completions/max_terminated_length": 448.4, "completions/mean_length": 88.7046875, "completions/mean_terminated_length": 88.7046875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011507058084624615, "frac_reward_zero_std": 0.91875, "grad_norm": 6.589635372161865, "kl": 2.4643347366480155, "learning_rate": 4.546507936507936e-07, "loss": 0.0025, "num_tokens": 871727421.0, "reward": 0.36875, "reward_std": 0.06949345320463181, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.920977509021759, "step": 12715 }, { "completion_length": 335.8, "completions/clipped_ratio": 0.0, "completions/max_length": 335.8, "completions/max_terminated_length": 335.8, "completions/mean_length": 87.2125, "completions/mean_terminated_length": 87.2125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011511583077972875, "frac_reward_zero_std": 0.95625, "grad_norm": 0.8655860424041748, "kl": 2.2013039944227786, "learning_rate": 4.546111111111111e-07, "loss": 0.0022, "num_tokens": 872036685.0, "reward": 0.4921875, "reward_std": 0.03503581546247005, "rewards/verify_chess_move/mean": 0.4921875, "rewards/verify_chess_move/std": 0.8535487174987793, "step": 12720 }, { "completion_length": 468.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 468.8, "completions/max_terminated_length": 335.8, "completions/mean_length": 91.2609375, "completions/mean_terminated_length": 90.20901489257812, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.011516108071321135, "frac_reward_zero_std": 0.91875, "grad_norm": 24.97184944152832, "kl": 4.184839822782669, "learning_rate": 4.5457142857142856e-07, "loss": 0.0042, "num_tokens": 872352395.0, "reward": 0.3546875, "reward_std": 0.07359432876110077, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9198927998542785, "step": 12725 }, { "completion_length": 378.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 378.4, "completions/max_terminated_length": 282.0, "completions/mean_length": 91.2421875, "completions/mean_terminated_length": 90.72357025146485, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011520633064669395, "frac_reward_zero_std": 0.93125, "grad_norm": 7.24605655670166, "kl": 1.912107908213511, "learning_rate": 4.5453174603174596e-07, "loss": 0.0019, "num_tokens": 872669073.0, "reward": 0.290625, "reward_std": 0.05997261069715023, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9364944815635681, "step": 12730 }, { "completion_length": 530.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 530.2, "completions/max_terminated_length": 472.8, "completions/mean_length": 92.27734375, "completions/mean_terminated_length": 91.74918060302734, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011525158058017656, "frac_reward_zero_std": 0.9375, "grad_norm": 0.6204748153686523, "kl": 2.711256845726166, "learning_rate": 4.5449206349206347e-07, "loss": 0.0027, "num_tokens": 872987012.0, "reward": 0.328125, "reward_std": 0.058970099315047264, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9154635787010192, "step": 12735 }, { "completion_length": 445.6, "completions/clipped_ratio": 0.0, "completions/max_length": 445.6, "completions/max_terminated_length": 445.6, "completions/mean_length": 91.9453125, "completions/mean_terminated_length": 91.9453125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011529683051365914, "frac_reward_zero_std": 0.94375, "grad_norm": 19.67056655883789, "kl": 3.1523205669946037, "learning_rate": 4.544523809523809e-07, "loss": 0.0032, "num_tokens": 873303758.0, "reward": 0.4078125, "reward_std": 0.05228758715093136, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.9069579482078552, "step": 12740 }, { "completion_length": 618.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 618.6, "completions/max_terminated_length": 333.2, "completions/mean_length": 89.8265625, "completions/mean_terminated_length": 88.22635040283203, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011534208044714174, "frac_reward_zero_std": 0.9125, "grad_norm": 13.80642318725586, "kl": 1.0087562968139536, "learning_rate": 4.5441269841269843e-07, "loss": 0.001, "num_tokens": 873617760.0, "reward": 0.31875, "reward_std": 0.07143957167863846, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9105932950973511, "step": 12745 }, { "completion_length": 288.8, "completions/clipped_ratio": 0.0, "completions/max_length": 288.8, "completions/max_terminated_length": 288.8, "completions/mean_length": 87.97734375, "completions/mean_terminated_length": 87.97734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011538733038062434, "frac_reward_zero_std": 0.95625, "grad_norm": 0.10611043870449066, "kl": 0.8325828508241102, "learning_rate": 4.5437301587301583e-07, "loss": 0.0008, "num_tokens": 873928067.0, "reward": 0.4875, "reward_std": 0.03798189871013165, "rewards/verify_chess_move/mean": 0.4875, "rewards/verify_chess_move/std": 0.8694269299507141, "step": 12750 }, { "completion_length": 305.6, "completions/clipped_ratio": 0.0, "completions/max_length": 305.6, "completions/max_terminated_length": 305.6, "completions/mean_length": 81.36484375, "completions/mean_terminated_length": 81.36484375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011543258031410694, "frac_reward_zero_std": 0.95625, "grad_norm": 11.737669944763184, "kl": 0.9625482875155285, "learning_rate": 4.543333333333333e-07, "loss": 0.001, "num_tokens": 874228166.0, "reward": 0.296875, "reward_std": 0.039347875490784646, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9469344615936279, "step": 12755 }, { "completion_length": 572.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 572.8, "completions/max_terminated_length": 481.0, "completions/mean_length": 103.06953125, "completions/mean_terminated_length": 102.55327911376953, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011547783024758954, "frac_reward_zero_std": 0.91875, "grad_norm": 2.6277785301208496, "kl": 2.6456523941014893, "learning_rate": 4.542936507936508e-07, "loss": 0.0026, "num_tokens": 874563055.0, "reward": 0.23125, "reward_std": 0.07222737036645413, "rewards/verify_chess_move/mean": 0.23125, "rewards/verify_chess_move/std": 0.9675172090530395, "step": 12760 }, { "completion_length": 420.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 420.2, "completions/max_terminated_length": 394.8, "completions/mean_length": 90.984375, "completions/mean_terminated_length": 90.43989410400391, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011552308018107213, "frac_reward_zero_std": 0.91875, "grad_norm": 2.5025038719177246, "kl": 0.8329353958601132, "learning_rate": 4.5425396825396825e-07, "loss": 0.0008, "num_tokens": 874877819.0, "reward": 0.4796875, "reward_std": 0.0683880940079689, "rewards/verify_chess_move/mean": 0.4796875, "rewards/verify_chess_move/std": 0.8549906373023987, "step": 12765 }, { "completion_length": 384.2, "completions/clipped_ratio": 0.0, "completions/max_length": 384.2, "completions/max_terminated_length": 384.2, "completions/mean_length": 89.63828125, "completions/mean_terminated_length": 89.63828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011556833011455473, "frac_reward_zero_std": 0.9125, "grad_norm": 20.005338668823242, "kl": 1.0449306856375187, "learning_rate": 4.542142857142857e-07, "loss": 0.001, "num_tokens": 875191572.0, "reward": 0.3328125, "reward_std": 0.06802266538143158, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9392531633377075, "step": 12770 }, { "completion_length": 310.8, "completions/clipped_ratio": 0.0, "completions/max_length": 310.8, "completions/max_terminated_length": 310.8, "completions/mean_length": 90.47265625, "completions/mean_terminated_length": 90.47265625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011561358004803733, "frac_reward_zero_std": 0.95625, "grad_norm": 14.799702644348145, "kl": 1.9773017633939163, "learning_rate": 4.5417460317460315e-07, "loss": 0.002, "num_tokens": 875508945.0, "reward": 0.3109375, "reward_std": 0.03729890994727612, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.929349684715271, "step": 12775 }, { "completion_length": 351.6, "completions/clipped_ratio": 0.0, "completions/max_length": 351.6, "completions/max_terminated_length": 351.6, "completions/mean_length": 88.284375, "completions/mean_terminated_length": 88.284375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011565882998151993, "frac_reward_zero_std": 0.95625, "grad_norm": 10.234296798706055, "kl": 1.5428930695168674, "learning_rate": 4.541349206349206e-07, "loss": 0.0015, "num_tokens": 875820317.0, "reward": 0.35, "reward_std": 0.0350367970764637, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9330574631690979, "step": 12780 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 87.46640625, "completions/mean_terminated_length": 87.46640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011570407991500253, "frac_reward_zero_std": 0.95625, "grad_norm": 0.5409303903579712, "kl": 0.7600930637214333, "learning_rate": 4.5409523809523806e-07, "loss": 0.0008, "num_tokens": 876130314.0, "reward": 0.3796875, "reward_std": 0.030935921147465704, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9183763861656189, "step": 12785 }, { "completion_length": 382.2, "completions/clipped_ratio": 0.0, "completions/max_length": 382.2, "completions/max_terminated_length": 382.2, "completions/mean_length": 81.70625, "completions/mean_terminated_length": 81.70625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011574932984848513, "frac_reward_zero_std": 0.94375, "grad_norm": 8.249019622802734, "kl": 0.6197396165225655, "learning_rate": 4.540555555555555e-07, "loss": 0.0006, "num_tokens": 876430338.0, "reward": 0.4125, "reward_std": 0.04887068048119545, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.9030860543251038, "step": 12790 }, { "completion_length": 393.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 393.2, "completions/max_terminated_length": 370.4, "completions/mean_length": 89.171875, "completions/mean_terminated_length": 88.64993438720703, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011579457978196771, "frac_reward_zero_std": 0.95, "grad_norm": 2.138305902481079, "kl": 0.35128207948291673, "learning_rate": 4.54015873015873e-07, "loss": 0.0004, "num_tokens": 876743038.0, "reward": 0.3046875, "reward_std": 0.04376729317009449, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9256335258483886, "step": 12795 }, { "completion_length": 326.2, "completions/clipped_ratio": 0.0, "completions/max_length": 326.2, "completions/max_terminated_length": 326.2, "completions/mean_length": 84.81875, "completions/mean_terminated_length": 84.81875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011583982971545032, "frac_reward_zero_std": 0.94375, "grad_norm": 3.598069667816162, "kl": 0.23223684155382215, "learning_rate": 4.539761904761905e-07, "loss": 0.0002, "num_tokens": 877048894.0, "reward": 0.36875, "reward_std": 0.0447707861661911, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9292406916618348, "step": 12800 }, { "completion_length": 364.6, "completions/clipped_ratio": 0.0, "completions/max_length": 364.6, "completions/max_terminated_length": 364.6, "completions/mean_length": 88.0828125, "completions/mean_terminated_length": 88.0828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011588507964893292, "frac_reward_zero_std": 0.925, "grad_norm": 15.69361686706543, "kl": 0.13512620078399779, "learning_rate": 4.539365079365079e-07, "loss": 0.0001, "num_tokens": 877358144.0, "reward": 0.353125, "reward_std": 0.05986681953072548, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9180132031440735, "step": 12805 }, { "completion_length": 277.8, "completions/clipped_ratio": 0.0, "completions/max_length": 277.8, "completions/max_terminated_length": 277.8, "completions/mean_length": 92.57421875, "completions/mean_terminated_length": 92.57421875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011593032958241552, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0015837210230529308, "kl": 0.3015630315756425, "learning_rate": 4.538968253968254e-07, "loss": 0.0003, "num_tokens": 877677319.0, "reward": 0.3890625, "reward_std": 0.07675414457917214, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9150351166725159, "step": 12810 }, { "completion_length": 393.8, "completions/clipped_ratio": 0.0, "completions/max_length": 393.8, "completions/max_terminated_length": 393.8, "completions/mean_length": 88.3421875, "completions/mean_terminated_length": 88.3421875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011597557951589812, "frac_reward_zero_std": 0.95625, "grad_norm": 0.027279037982225418, "kl": 0.4429670905927196, "learning_rate": 4.5385714285714284e-07, "loss": 0.0004, "num_tokens": 877990141.0, "reward": 0.43125, "reward_std": 0.03593195080757141, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.8932056784629822, "step": 12815 }, { "completion_length": 423.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 84.465625, "completions/mean_terminated_length": 84.465625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01160208294493807, "frac_reward_zero_std": 0.93125, "grad_norm": 5.367681980133057, "kl": 1.0821084642317147, "learning_rate": 4.5381746031746035e-07, "loss": 0.0011, "num_tokens": 878293817.0, "reward": 0.3875, "reward_std": 0.06223374232649803, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9080802202224731, "step": 12820 }, { "completion_length": 527.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 527.8, "completions/max_terminated_length": 524.4, "completions/mean_length": 90.95, "completions/mean_terminated_length": 90.42442169189454, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01160660793828633, "frac_reward_zero_std": 0.9125, "grad_norm": 3.9345617294311523, "kl": 5.496683460031636, "learning_rate": 4.5377777777777775e-07, "loss": 0.0055, "num_tokens": 878608625.0, "reward": 0.26875, "reward_std": 0.07959286868572235, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9470136404037476, "step": 12825 }, { "completion_length": 367.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 88.66015625, "completions/mean_terminated_length": 88.66015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01161113293163459, "frac_reward_zero_std": 0.95625, "grad_norm": 3.4771292209625244, "kl": 0.6783936139661819, "learning_rate": 4.537380952380952e-07, "loss": 0.0007, "num_tokens": 878920494.0, "reward": 0.29375, "reward_std": 0.035036797448992726, "rewards/verify_chess_move/mean": 0.29375, "rewards/verify_chess_move/std": 0.9445637822151184, "step": 12830 }, { "completion_length": 373.2, "completions/clipped_ratio": 0.0, "completions/max_length": 373.2, "completions/max_terminated_length": 373.2, "completions/mean_length": 88.89140625, "completions/mean_terminated_length": 88.89140625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01161565792498285, "frac_reward_zero_std": 0.95625, "grad_norm": 0.10362754762172699, "kl": 1.3610696875257418, "learning_rate": 4.536984126984127e-07, "loss": 0.0014, "num_tokens": 879233259.0, "reward": 0.421875, "reward_std": 0.03956102393567562, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8967779278755188, "step": 12835 }, { "completion_length": 519.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 519.4, "completions/max_terminated_length": 451.8, "completions/mean_length": 86.1515625, "completions/mean_terminated_length": 85.08669891357422, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01162018291833111, "frac_reward_zero_std": 0.91875, "grad_norm": 13.301894187927246, "kl": 1.3206473971833474, "learning_rate": 4.536587301587301e-07, "loss": 0.0013, "num_tokens": 879541613.0, "reward": 0.4390625, "reward_std": 0.06702015250921249, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8976165890693665, "step": 12840 }, { "completion_length": 297.4, "completions/clipped_ratio": 0.0, "completions/max_length": 297.4, "completions/max_terminated_length": 297.4, "completions/mean_length": 93.29296875, "completions/mean_terminated_length": 93.29296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01162470791167937, "frac_reward_zero_std": 0.95625, "grad_norm": 1.5979478359222412, "kl": 0.7568237686995417, "learning_rate": 4.536190476190476e-07, "loss": 0.0008, "num_tokens": 879862212.0, "reward": 0.446875, "reward_std": 0.041610969603061675, "rewards/verify_chess_move/mean": 0.446875, "rewards/verify_chess_move/std": 0.8874142527580261, "step": 12845 }, { "completion_length": 428.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 428.6, "completions/max_terminated_length": 381.4, "completions/mean_length": 95.9125, "completions/mean_terminated_length": 94.87882080078126, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011629232905027629, "frac_reward_zero_std": 0.95, "grad_norm": 4.731963634490967, "kl": 2.4015414013294505, "learning_rate": 4.5357936507936507e-07, "loss": 0.0024, "num_tokens": 880184132.0, "reward": 0.296875, "reward_std": 0.047397346794605257, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9478764414787293, "step": 12850 }, { "completion_length": 424.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 424.2, "completions/max_terminated_length": 405.8, "completions/mean_length": 99.43046875, "completions/mean_terminated_length": 98.91134948730469, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011633757898375889, "frac_reward_zero_std": 0.93125, "grad_norm": 3.2634894847869873, "kl": 1.4317276098299772, "learning_rate": 4.535396825396825e-07, "loss": 0.0014, "num_tokens": 880513019.0, "reward": 0.40625, "reward_std": 0.062493379414081576, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9121092438697815, "step": 12855 }, { "completion_length": 377.2, "completions/clipped_ratio": 0.0, "completions/max_length": 377.2, "completions/max_terminated_length": 377.2, "completions/mean_length": 85.71171875, "completions/mean_terminated_length": 85.71171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01163828289172415, "frac_reward_zero_std": 0.925, "grad_norm": 17.07968521118164, "kl": 2.2245815833681264, "learning_rate": 4.535e-07, "loss": 0.0022, "num_tokens": 880820370.0, "reward": 0.3484375, "reward_std": 0.06465166658163071, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9344390630722046, "step": 12860 }, { "completion_length": 372.8, "completions/clipped_ratio": 0.0, "completions/max_length": 372.8, "completions/max_terminated_length": 372.8, "completions/mean_length": 94.02265625, "completions/mean_terminated_length": 94.02265625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.01164280788507241, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0847669467329979, "kl": 0.5828894936246798, "learning_rate": 4.5346031746031743e-07, "loss": 0.0006, "num_tokens": 881141311.0, "reward": 0.4296875, "reward_std": 0.03240768946707249, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8895210981369018, "step": 12865 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.0, "completions/max_length": 457.4, "completions/max_terminated_length": 457.4, "completions/mean_length": 88.1328125, "completions/mean_terminated_length": 88.1328125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01164733287842067, "frac_reward_zero_std": 0.94375, "grad_norm": 9.84853458404541, "kl": 0.671781281195581, "learning_rate": 4.5342063492063494e-07, "loss": 0.0007, "num_tokens": 881451009.0, "reward": 0.4078125, "reward_std": 0.04955366849899292, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.9104501843452454, "step": 12870 }, { "completion_length": 483.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 483.2, "completions/max_terminated_length": 363.4, "completions/mean_length": 89.096875, "completions/mean_terminated_length": 87.51674957275391, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011651857871768928, "frac_reward_zero_std": 0.95, "grad_norm": 0.4119516611099243, "kl": 1.1915850383462385, "learning_rate": 4.5338095238095234e-07, "loss": 0.0012, "num_tokens": 881762357.0, "reward": 0.3734375, "reward_std": 0.040822191163897514, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9165700674057007, "step": 12875 }, { "completion_length": 641.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 641.4, "completions/max_terminated_length": 526.2, "completions/mean_length": 96.30859375, "completions/mean_terminated_length": 94.73756561279296, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011656382865117188, "frac_reward_zero_std": 0.9, "grad_norm": 5.454131126403809, "kl": 1.6059057852602563, "learning_rate": 4.533412698412698e-07, "loss": 0.0016, "num_tokens": 882085032.0, "reward": 0.3359375, "reward_std": 0.08343567326664925, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9343371748924255, "step": 12880 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 91.57734375, "completions/mean_terminated_length": 91.57734375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011660907858465448, "frac_reward_zero_std": 0.96875, "grad_norm": 9.006953239440918, "kl": 3.0077266307314856, "learning_rate": 4.533015873015873e-07, "loss": 0.003, "num_tokens": 882403475.0, "reward": 0.2640625, "reward_std": 0.02777610570192337, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9627984166145325, "step": 12885 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 83.25859375, "completions/mean_terminated_length": 83.25859375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011665432851813708, "frac_reward_zero_std": 0.9375, "grad_norm": 7.522796154022217, "kl": 2.451173542952165, "learning_rate": 4.5326190476190475e-07, "loss": 0.0025, "num_tokens": 882706918.0, "reward": 0.490625, "reward_std": 0.05623618252575398, "rewards/verify_chess_move/mean": 0.490625, "rewards/verify_chess_move/std": 0.8593785524368286, "step": 12890 }, { "completion_length": 311.4, "completions/clipped_ratio": 0.0, "completions/max_length": 311.4, "completions/max_terminated_length": 311.4, "completions/mean_length": 88.340625, "completions/mean_terminated_length": 88.340625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011669957845161968, "frac_reward_zero_std": 0.95625, "grad_norm": 0.18593889474868774, "kl": 0.5925943393725902, "learning_rate": 4.532222222222222e-07, "loss": 0.0006, "num_tokens": 883019506.0, "reward": 0.325, "reward_std": 0.0350367970764637, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9403777599334717, "step": 12895 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 89.89140625, "completions/mean_terminated_length": 89.89140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011674482838510228, "frac_reward_zero_std": 0.95, "grad_norm": 3.623518705368042, "kl": 1.0494229091797025, "learning_rate": 4.5318253968253966e-07, "loss": 0.001, "num_tokens": 883334327.0, "reward": 0.265625, "reward_std": 0.04013920314610005, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9560040712356568, "step": 12900 }, { "completion_length": 357.4, "completions/clipped_ratio": 0.0, "completions/max_length": 357.4, "completions/max_terminated_length": 357.4, "completions/mean_length": 90.98515625, "completions/mean_terminated_length": 90.98515625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011679007831858487, "frac_reward_zero_std": 0.95, "grad_norm": 0.027894487604498863, "kl": 0.3060282764723524, "learning_rate": 4.531428571428571e-07, "loss": 0.0003, "num_tokens": 883650548.0, "reward": 0.459375, "reward_std": 0.04013920277357101, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8766536593437195, "step": 12905 }, { "completion_length": 341.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 89.08671875, "completions/mean_terminated_length": 89.08671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011683532825206747, "frac_reward_zero_std": 0.93125, "grad_norm": 5.95627498626709, "kl": 1.7500299316830934, "learning_rate": 4.531031746031746e-07, "loss": 0.0018, "num_tokens": 883962283.0, "reward": 0.4265625, "reward_std": 0.05818131938576698, "rewards/verify_chess_move/mean": 0.4265625, "rewards/verify_chess_move/std": 0.8980637431144715, "step": 12910 }, { "completion_length": 452.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 452.6, "completions/max_terminated_length": 399.2, "completions/mean_length": 93.06796875, "completions/mean_terminated_length": 92.54188232421875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011688057818555007, "frac_reward_zero_std": 0.95, "grad_norm": 0.0018578157760202885, "kl": 0.5979834944475442, "learning_rate": 4.53063492063492e-07, "loss": 0.0006, "num_tokens": 884280122.0, "reward": 0.4046875, "reward_std": 0.044451264292001726, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9115007042884826, "step": 12915 }, { "completion_length": 372.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 372.4, "completions/max_terminated_length": 314.6, "completions/mean_length": 94.246875, "completions/mean_terminated_length": 93.71200103759766, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011692582811903267, "frac_reward_zero_std": 0.94375, "grad_norm": 4.387025356292725, "kl": 0.5437677660025656, "learning_rate": 4.5302380952380953e-07, "loss": 0.0005, "num_tokens": 884600878.0, "reward": 0.3109375, "reward_std": 0.04955366961658001, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9140185713768005, "step": 12920 }, { "completion_length": 374.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 374.4, "completions/max_terminated_length": 364.2, "completions/mean_length": 90.56796875, "completions/mean_terminated_length": 89.51369934082031, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011697107805251527, "frac_reward_zero_std": 0.95, "grad_norm": 5.914226531982422, "kl": 1.186194795067422, "learning_rate": 4.52984126984127e-07, "loss": 0.0012, "num_tokens": 884915285.0, "reward": 0.353125, "reward_std": 0.04602940678596497, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.924402940273285, "step": 12925 }, { "completion_length": 339.8, "completions/clipped_ratio": 0.0, "completions/max_length": 339.8, "completions/max_terminated_length": 339.8, "completions/mean_length": 92.859375, "completions/mean_terminated_length": 92.859375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011701632798599785, "frac_reward_zero_std": 0.9, "grad_norm": 13.231800079345703, "kl": 3.461474592075683, "learning_rate": 4.529444444444444e-07, "loss": 0.0035, "num_tokens": 885234473.0, "reward": 0.4578125, "reward_std": 0.08890350759029389, "rewards/verify_chess_move/mean": 0.4578125, "rewards/verify_chess_move/std": 0.88302583694458, "step": 12930 }, { "completion_length": 338.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 87.8015625, "completions/mean_terminated_length": 87.8015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011706157791948045, "frac_reward_zero_std": 0.9375, "grad_norm": 12.17165756225586, "kl": 1.5602633686969056, "learning_rate": 4.529047619047619e-07, "loss": 0.0016, "num_tokens": 885545427.0, "reward": 0.35, "reward_std": 0.05649385750293732, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9257179975509644, "step": 12935 }, { "completion_length": 509.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 509.2, "completions/max_terminated_length": 341.8, "completions/mean_length": 89.9203125, "completions/mean_terminated_length": 88.85977325439453, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011710682785296305, "frac_reward_zero_std": 0.9375, "grad_norm": 2.075599431991577, "kl": 5.091000346525107, "learning_rate": 4.5286507936507935e-07, "loss": 0.0051, "num_tokens": 885858549.0, "reward": 0.3265625, "reward_std": 0.0573899906128645, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9312273263931274, "step": 12940 }, { "completion_length": 476.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 476.0, "completions/max_terminated_length": 460.4, "completions/mean_length": 92.7203125, "completions/mean_terminated_length": 91.66527404785157, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011715207778644566, "frac_reward_zero_std": 0.91875, "grad_norm": 13.64140510559082, "kl": 6.263299883133731, "learning_rate": 4.5282539682539685e-07, "loss": 0.0063, "num_tokens": 886173951.0, "reward": 0.3609375, "reward_std": 0.07038958743214607, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.932340943813324, "step": 12945 }, { "completion_length": 304.2, "completions/clipped_ratio": 0.0, "completions/max_length": 304.2, "completions/max_terminated_length": 304.2, "completions/mean_length": 90.521875, "completions/mean_terminated_length": 90.521875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011719732771992826, "frac_reward_zero_std": 0.90625, "grad_norm": 19.442996978759766, "kl": 6.403872767090798, "learning_rate": 4.5278571428571426e-07, "loss": 0.0064, "num_tokens": 886489435.0, "reward": 0.1390625, "reward_std": 0.07402022629976272, "rewards/verify_chess_move/mean": 0.1390625, "rewards/verify_chess_move/std": 0.9737486124038697, "step": 12950 }, { "completion_length": 447.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.2, "completions/max_terminated_length": 348.6, "completions/mean_length": 85.36171875, "completions/mean_terminated_length": 84.82298431396484, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011724257765341086, "frac_reward_zero_std": 0.94375, "grad_norm": 7.236507415771484, "kl": 2.5619752521743067, "learning_rate": 4.527460317460317e-07, "loss": 0.0026, "num_tokens": 886794466.0, "reward": 0.46875, "reward_std": 0.04955465197563171, "rewards/verify_chess_move/mean": 0.46875, "rewards/verify_chess_move/std": 0.8822445154190064, "step": 12955 }, { "completion_length": 416.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 416.4, "completions/max_terminated_length": 348.8, "completions/mean_length": 86.23984375, "completions/mean_terminated_length": 85.69711456298828, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.011728782758689344, "frac_reward_zero_std": 0.94375, "grad_norm": 14.011670112609863, "kl": 2.5790829280158505, "learning_rate": 4.527063492063492e-07, "loss": 0.0026, "num_tokens": 887101133.0, "reward": 0.4875, "reward_std": 0.04771588854491711, "rewards/verify_chess_move/mean": 0.4875, "rewards/verify_chess_move/std": 0.8229683399200439, "step": 12960 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 95.5140625, "completions/mean_terminated_length": 95.5140625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011733307752037604, "frac_reward_zero_std": 0.975, "grad_norm": 9.471213340759277, "kl": 0.6841076008044183, "learning_rate": 4.526666666666666e-07, "loss": 0.0007, "num_tokens": 887424911.0, "reward": 0.3625, "reward_std": 0.02041158638894558, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9127665758132935, "step": 12965 }, { "completion_length": 415.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 415.6, "completions/max_terminated_length": 328.6, "completions/mean_length": 87.490625, "completions/mean_terminated_length": 86.9590087890625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011737832745385864, "frac_reward_zero_std": 0.95, "grad_norm": 7.985079765319824, "kl": 1.012656865583267, "learning_rate": 4.5262698412698407e-07, "loss": 0.001, "num_tokens": 887735315.0, "reward": 0.2546875, "reward_std": 0.04240131713449955, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9570955514907837, "step": 12970 }, { "completion_length": 402.6, "completions/clipped_ratio": 0.0, "completions/max_length": 402.6, "completions/max_terminated_length": 402.6, "completions/mean_length": 84.83125, "completions/mean_terminated_length": 84.83125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011742357738734124, "frac_reward_zero_std": 0.94375, "grad_norm": 2.2365405559539795, "kl": 1.3848412819672375, "learning_rate": 4.525873015873016e-07, "loss": 0.0014, "num_tokens": 888040499.0, "reward": 0.4765625, "reward_std": 0.04455861933529377, "rewards/verify_chess_move/mean": 0.4765625, "rewards/verify_chess_move/std": 0.8781918287277222, "step": 12975 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 90.0703125, "completions/mean_terminated_length": 90.0703125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011746882732082384, "frac_reward_zero_std": 0.93125, "grad_norm": 3.143110752105713, "kl": 2.3100122647127135, "learning_rate": 4.5254761904761903e-07, "loss": 0.0023, "num_tokens": 888354349.0, "reward": 0.44375, "reward_std": 0.06044343262910843, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8798882007598877, "step": 12980 }, { "completion_length": 362.4, "completions/clipped_ratio": 0.0, "completions/max_length": 362.4, "completions/max_terminated_length": 362.4, "completions/mean_length": 85.28046875, "completions/mean_terminated_length": 85.28046875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011751407725430643, "frac_reward_zero_std": 0.975, "grad_norm": 0.8279138803482056, "kl": 3.3825091542559678, "learning_rate": 4.525079365079365e-07, "loss": 0.0034, "num_tokens": 888661804.0, "reward": 0.24375, "reward_std": 0.0245114803314209, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9498223423957824, "step": 12985 }, { "completion_length": 363.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 85.85078125, "completions/mean_terminated_length": 85.85078125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011755932718778903, "frac_reward_zero_std": 0.95, "grad_norm": 10.5081148147583, "kl": 1.8912835754919797, "learning_rate": 4.5246825396825394e-07, "loss": 0.0019, "num_tokens": 888968637.0, "reward": 0.4171875, "reward_std": 0.04219013154506683, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8911986231803894, "step": 12990 }, { "completion_length": 454.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 454.6, "completions/max_terminated_length": 405.6, "completions/mean_length": 97.0640625, "completions/mean_terminated_length": 96.54360656738281, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011760457712127163, "frac_reward_zero_std": 0.95, "grad_norm": 1.6782180070877075, "kl": 2.9589081602403895, "learning_rate": 4.524285714285714e-07, "loss": 0.003, "num_tokens": 889294871.0, "reward": 0.3453125, "reward_std": 0.04082219228148461, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9291912198066712, "step": 12995 }, { "completion_length": 335.2, "completions/clipped_ratio": 0.0, "completions/max_length": 335.2, "completions/max_terminated_length": 335.2, "completions/mean_length": 92.3390625, "completions/mean_terminated_length": 92.3390625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011764982705475423, "frac_reward_zero_std": 0.96875, "grad_norm": 0.011460628360509872, "kl": 2.373396507720463, "learning_rate": 4.523888888888889e-07, "loss": 0.0024, "num_tokens": 889613009.0, "reward": 0.290625, "reward_std": 0.026409148424863815, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9494798183441162, "step": 13000 }, { "completion_length": 290.2, "completions/clipped_ratio": 0.0, "completions/max_length": 290.2, "completions/max_terminated_length": 290.2, "completions/mean_length": 86.26875, "completions/mean_terminated_length": 86.26875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011769507698823683, "frac_reward_zero_std": 0.95625, "grad_norm": 10.57164192199707, "kl": 2.561834739940241, "learning_rate": 4.523492063492063e-07, "loss": 0.0026, "num_tokens": 889921241.0, "reward": 0.4109375, "reward_std": 0.033669838681817055, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.8915752053260804, "step": 13005 }, { "completion_length": 485.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 485.0, "completions/max_terminated_length": 355.6, "completions/mean_length": 98.50234375, "completions/mean_terminated_length": 96.9509521484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011774032692171943, "frac_reward_zero_std": 0.9625, "grad_norm": 9.030359268188477, "kl": 3.5332135916571135, "learning_rate": 4.523095238095238e-07, "loss": 0.0035, "num_tokens": 890249660.0, "reward": 0.3015625, "reward_std": 0.02993340976536274, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9421547412872314, "step": 13010 }, { "completion_length": 345.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 86.3515625, "completions/mean_terminated_length": 86.3515625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011778557685520202, "frac_reward_zero_std": 0.9375, "grad_norm": 13.779500961303711, "kl": 1.1904766453895719, "learning_rate": 4.5226984126984126e-07, "loss": 0.0012, "num_tokens": 890557286.0, "reward": 0.4234375, "reward_std": 0.05512886010110378, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8640401721000671, "step": 13015 }, { "completion_length": 460.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 460.0, "completions/max_terminated_length": 421.8, "completions/mean_length": 86.971875, "completions/mean_terminated_length": 86.44391326904297, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011783082678868462, "frac_reward_zero_std": 0.94375, "grad_norm": 10.47144603729248, "kl": 0.7405194718157873, "learning_rate": 4.5223015873015866e-07, "loss": 0.0007, "num_tokens": 890865786.0, "reward": 0.325, "reward_std": 0.05023665837943554, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9178032875061035, "step": 13020 }, { "completion_length": 447.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.6, "completions/max_terminated_length": 415.4, "completions/mean_length": 94.034375, "completions/mean_terminated_length": 93.50456848144532, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.011787607672216722, "frac_reward_zero_std": 0.925, "grad_norm": 3.2054593563079834, "kl": 1.6870183408609591, "learning_rate": 4.5219047619047617e-07, "loss": 0.0017, "num_tokens": 891186878.0, "reward": 0.415625, "reward_std": 0.0659701719880104, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.8970438003540039, "step": 13025 }, { "completion_length": 425.6, "completions/clipped_ratio": 0.0, "completions/max_length": 425.6, "completions/max_terminated_length": 425.6, "completions/mean_length": 84.2484375, "completions/mean_terminated_length": 84.2484375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011792132665564982, "frac_reward_zero_std": 0.94375, "grad_norm": 3.525700092315674, "kl": 0.6426302171428688, "learning_rate": 4.521507936507936e-07, "loss": 0.0006, "num_tokens": 891491148.0, "reward": 0.4421875, "reward_std": 0.05070846229791641, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8658108234405517, "step": 13030 }, { "completion_length": 622.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 622.2, "completions/max_terminated_length": 565.6, "completions/mean_length": 90.42890625, "completions/mean_terminated_length": 89.36580200195313, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011796657658913242, "frac_reward_zero_std": 0.9375, "grad_norm": 0.5350587368011475, "kl": 1.1697725902544334, "learning_rate": 4.5211111111111113e-07, "loss": 0.0012, "num_tokens": 891804793.0, "reward": 0.2859375, "reward_std": 0.05486922301352024, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9563382148742676, "step": 13035 }, { "completion_length": 278.4, "completions/clipped_ratio": 0.0, "completions/max_length": 278.4, "completions/max_terminated_length": 278.4, "completions/mean_length": 83.50859375, "completions/mean_terminated_length": 83.50859375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0118011826522615, "frac_reward_zero_std": 0.93125, "grad_norm": 0.39543581008911133, "kl": 0.8146956059383228, "learning_rate": 4.5207142857142853e-07, "loss": 0.0008, "num_tokens": 892106260.0, "reward": 0.496875, "reward_std": 0.05613039135932922, "rewards/verify_chess_move/mean": 0.496875, "rewards/verify_chess_move/std": 0.8632640480995178, "step": 13040 }, { "completion_length": 399.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 399.6, "completions/max_terminated_length": 338.2, "completions/mean_length": 91.12109375, "completions/mean_terminated_length": 90.6028076171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01180570764560976, "frac_reward_zero_std": 0.9125, "grad_norm": 4.349839687347412, "kl": 1.3735708628082648, "learning_rate": 4.52031746031746e-07, "loss": 0.0014, "num_tokens": 892420591.0, "reward": 0.30625, "reward_std": 0.07075560316443444, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9404862403869629, "step": 13045 }, { "completion_length": 418.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 418.0, "completions/max_terminated_length": 349.8, "completions/mean_length": 89.30234375, "completions/mean_terminated_length": 88.766796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01181023263895802, "frac_reward_zero_std": 0.99375, "grad_norm": 1.7123992443084717, "kl": 0.6263388331164605, "learning_rate": 4.519920634920635e-07, "loss": 0.0006, "num_tokens": 892733194.0, "reward": 0.4109375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.8917336344718934, "step": 13050 }, { "completion_length": 528.6, "completions/clipped_ratio": 0.0, "completions/max_length": 528.6, "completions/max_terminated_length": 528.6, "completions/mean_length": 83.890625, "completions/mean_terminated_length": 83.890625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01181475763230628, "frac_reward_zero_std": 0.90625, "grad_norm": 25.252498626708984, "kl": 1.9252798499073833, "learning_rate": 4.519523809523809e-07, "loss": 0.0019, "num_tokens": 893035902.0, "reward": 0.465625, "reward_std": 0.07039213590323926, "rewards/verify_chess_move/mean": 0.465625, "rewards/verify_chess_move/std": 0.8484958171844482, "step": 13055 }, { "completion_length": 608.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 608.6, "completions/max_terminated_length": 397.0, "completions/mean_length": 95.7859375, "completions/mean_terminated_length": 94.20977935791015, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01181928262565454, "frac_reward_zero_std": 0.9375, "grad_norm": 6.1506500244140625, "kl": 2.55238132532686, "learning_rate": 4.519126984126984e-07, "loss": 0.0026, "num_tokens": 893357356.0, "reward": 0.5, "reward_std": 0.048978038132190704, "rewards/verify_chess_move/mean": 0.5, "rewards/verify_chess_move/std": 0.8456491231918335, "step": 13060 }, { "completion_length": 346.4, "completions/clipped_ratio": 0.0, "completions/max_length": 346.4, "completions/max_terminated_length": 346.4, "completions/mean_length": 89.64765625, "completions/mean_terminated_length": 89.64765625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.0118238076190028, "frac_reward_zero_std": 0.91875, "grad_norm": 14.590874671936035, "kl": 2.535027127014473, "learning_rate": 4.5187301587301586e-07, "loss": 0.0025, "num_tokens": 893672993.0, "reward": 0.3609375, "reward_std": 0.06907010152935981, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9182738065719604, "step": 13065 }, { "completion_length": 383.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 383.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 84.26484375, "completions/mean_terminated_length": 83.73926849365235, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01182833261235106, "frac_reward_zero_std": 0.9125, "grad_norm": 10.70964527130127, "kl": 3.5101724749081766, "learning_rate": 4.518333333333333e-07, "loss": 0.0035, "num_tokens": 893976068.0, "reward": 0.3328125, "reward_std": 0.0764356017112732, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9391157507896424, "step": 13070 }, { "completion_length": 367.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 367.8, "completions/max_terminated_length": 290.8, "completions/mean_length": 87.07109375, "completions/mean_terminated_length": 86.0024917602539, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01183285760569932, "frac_reward_zero_std": 0.95625, "grad_norm": 12.761114120483398, "kl": 7.847163484606426, "learning_rate": 4.5179365079365076e-07, "loss": 0.0078, "num_tokens": 894285879.0, "reward": 0.271875, "reward_std": 0.037981899082660676, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9578185915946961, "step": 13075 }, { "completion_length": 453.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 453.0, "completions/max_terminated_length": 436.2, "completions/mean_length": 89.13828125, "completions/mean_terminated_length": 88.08221588134765, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01183738259904758, "frac_reward_zero_std": 0.9, "grad_norm": 10.907933235168457, "kl": 13.09468749818625, "learning_rate": 4.517539682539682e-07, "loss": 0.0131, "num_tokens": 894597728.0, "reward": 0.2734375, "reward_std": 0.0863807775080204, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.955322515964508, "step": 13080 }, { "completion_length": 463.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 463.6, "completions/max_terminated_length": 386.8, "completions/mean_length": 88.77421875, "completions/mean_terminated_length": 88.24755249023437, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01184190759239584, "frac_reward_zero_std": 0.86875, "grad_norm": 7.0105977058410645, "kl": 15.411210117046721, "learning_rate": 4.517142857142857e-07, "loss": 0.0154, "num_tokens": 894908791.0, "reward": 0.3546875, "reward_std": 0.11399316936731338, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9330353736877441, "step": 13085 }, { "completion_length": 449.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 449.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 84.4265625, "completions/mean_terminated_length": 83.88944702148437, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0118464325857441, "frac_reward_zero_std": 0.95, "grad_norm": 0.49248096346855164, "kl": 2.3656018738634885, "learning_rate": 4.516746031746032e-07, "loss": 0.0024, "num_tokens": 895214745.0, "reward": 0.371875, "reward_std": 0.04308430515229702, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9217543601989746, "step": 13090 }, { "completion_length": 300.4, "completions/clipped_ratio": 0.0, "completions/max_length": 300.4, "completions/max_terminated_length": 300.4, "completions/mean_length": 85.80078125, "completions/mean_terminated_length": 85.80078125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011850957579092358, "frac_reward_zero_std": 0.9625, "grad_norm": 3.1267855167388916, "kl": 3.769011013954878, "learning_rate": 4.516349206349206e-07, "loss": 0.0038, "num_tokens": 895522458.0, "reward": 0.384375, "reward_std": 0.03061639815568924, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.8991190075874329, "step": 13095 }, { "completion_length": 278.8, "completions/clipped_ratio": 0.0, "completions/max_length": 278.8, "completions/max_terminated_length": 278.8, "completions/mean_length": 91.3640625, "completions/mean_terminated_length": 91.3640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011855482572440618, "frac_reward_zero_std": 0.91875, "grad_norm": 21.471765518188477, "kl": 2.544147417275235, "learning_rate": 4.515952380952381e-07, "loss": 0.0025, "num_tokens": 895839116.0, "reward": 0.390625, "reward_std": 0.07154340110719204, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.8916261672973633, "step": 13100 }, { "completion_length": 323.4, "completions/clipped_ratio": 0.0, "completions/max_length": 323.4, "completions/max_terminated_length": 323.4, "completions/mean_length": 87.30546875, "completions/mean_terminated_length": 87.30546875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011860007565788878, "frac_reward_zero_std": 0.9, "grad_norm": 0.43037256598472595, "kl": 2.7946363410446793, "learning_rate": 4.5155555555555554e-07, "loss": 0.0028, "num_tokens": 896149667.0, "reward": 0.290625, "reward_std": 0.09005731940269471, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9508809447288513, "step": 13105 }, { "completion_length": 550.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 550.8, "completions/max_terminated_length": 530.0, "completions/mean_length": 90.00234375, "completions/mean_terminated_length": 89.46887664794922, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011864532559137138, "frac_reward_zero_std": 0.9375, "grad_norm": 11.699840545654297, "kl": 2.3362554402439857, "learning_rate": 4.51515873015873e-07, "loss": 0.0023, "num_tokens": 896462438.0, "reward": 0.3515625, "reward_std": 0.05124015063047409, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9242001056671143, "step": 13110 }, { "completion_length": 291.6, "completions/clipped_ratio": 0.0, "completions/max_length": 291.6, "completions/max_terminated_length": 291.6, "completions/mean_length": 82.99140625, "completions/mean_terminated_length": 82.99140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011869057552485398, "frac_reward_zero_std": 0.975, "grad_norm": 0.48806822299957275, "kl": 1.0847784222569317, "learning_rate": 4.5147619047619045e-07, "loss": 0.0011, "num_tokens": 896763547.0, "reward": 0.3203125, "reward_std": 0.019044627994298936, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9239413976669312, "step": 13115 }, { "completion_length": 337.6, "completions/clipped_ratio": 0.0, "completions/max_length": 337.6, "completions/max_terminated_length": 337.6, "completions/mean_length": 89.56015625, "completions/mean_terminated_length": 89.56015625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011873582545833658, "frac_reward_zero_std": 0.9375, "grad_norm": 4.266815185546875, "kl": 2.368019398674369, "learning_rate": 4.514365079365079e-07, "loss": 0.0024, "num_tokens": 897077776.0, "reward": 0.2421875, "reward_std": 0.05124015137553215, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9592183351516723, "step": 13120 }, { "completion_length": 266.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 81.09765625, "completions/mean_terminated_length": 81.09765625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011878107539181917, "frac_reward_zero_std": 0.94375, "grad_norm": 9.45446491241455, "kl": 1.130396976089105, "learning_rate": 4.513968253968254e-07, "loss": 0.0011, "num_tokens": 897378357.0, "reward": 0.35, "reward_std": 0.04682073332369328, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9124268531799317, "step": 13125 }, { "completion_length": 382.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 382.4, "completions/max_terminated_length": 310.2, "completions/mean_length": 85.6609375, "completions/mean_terminated_length": 85.1269744873047, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011882632532530177, "frac_reward_zero_std": 0.925, "grad_norm": 33.67985916137695, "kl": 1.347666952898726, "learning_rate": 4.513571428571428e-07, "loss": 0.0013, "num_tokens": 897684091.0, "reward": 0.3109375, "reward_std": 0.059867801144719124, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9460315346717835, "step": 13130 }, { "completion_length": 373.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 373.6, "completions/max_terminated_length": 350.8, "completions/mean_length": 85.6359375, "completions/mean_terminated_length": 84.57560272216797, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011887157525878437, "frac_reward_zero_std": 0.94375, "grad_norm": 14.658065795898438, "kl": 0.5562111372011713, "learning_rate": 4.513174603174603e-07, "loss": 0.0006, "num_tokens": 897991121.0, "reward": 0.415625, "reward_std": 0.05202893018722534, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9057571768760682, "step": 13135 }, { "completion_length": 443.4, "completions/clipped_ratio": 0.0, "completions/max_length": 443.4, "completions/max_terminated_length": 443.4, "completions/mean_length": 91.63515625, "completions/mean_terminated_length": 91.63515625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011891682519226697, "frac_reward_zero_std": 0.95625, "grad_norm": 4.1318817138671875, "kl": 0.3902024372015148, "learning_rate": 4.5127777777777777e-07, "loss": 0.0004, "num_tokens": 898307910.0, "reward": 0.3234375, "reward_std": 0.03571978583931923, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9413725137710571, "step": 13140 }, { "completion_length": 287.6, "completions/clipped_ratio": 0.0, "completions/max_length": 287.6, "completions/max_terminated_length": 287.6, "completions/mean_length": 92.4484375, "completions/mean_terminated_length": 92.4484375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011896207512574957, "frac_reward_zero_std": 0.95625, "grad_norm": 9.341815948486328, "kl": 0.3713763473555446, "learning_rate": 4.5123809523809517e-07, "loss": 0.0004, "num_tokens": 898627588.0, "reward": 0.3125, "reward_std": 0.03887705430388451, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.914953863620758, "step": 13145 }, { "completion_length": 356.4, "completions/clipped_ratio": 0.0, "completions/max_length": 356.4, "completions/max_terminated_length": 356.4, "completions/mean_length": 85.99609375, "completions/mean_terminated_length": 85.99609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011900732505923215, "frac_reward_zero_std": 0.9625, "grad_norm": 0.5284102559089661, "kl": 0.19885624954476952, "learning_rate": 4.511984126984127e-07, "loss": 0.0002, "num_tokens": 898934567.0, "reward": 0.275, "reward_std": 0.03629541657865047, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9604009509086608, "step": 13150 }, { "completion_length": 458.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 458.6, "completions/max_terminated_length": 452.4, "completions/mean_length": 85.06328125, "completions/mean_terminated_length": 84.52896270751953, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011905257499271476, "frac_reward_zero_std": 0.93125, "grad_norm": 6.278136730194092, "kl": 0.28759082737378777, "learning_rate": 4.5115873015873013e-07, "loss": 0.0003, "num_tokens": 899240528.0, "reward": 0.33125, "reward_std": 0.06180940866470337, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9305826425552368, "step": 13155 }, { "completion_length": 564.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 564.8, "completions/max_terminated_length": 383.6, "completions/mean_length": 91.58671875, "completions/mean_terminated_length": 90.52906799316406, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011909782492619736, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0018668597331270576, "kl": 0.3387625715928152, "learning_rate": 4.5111904761904764e-07, "loss": 0.0003, "num_tokens": 899558231.0, "reward": 0.3515625, "reward_std": 0.03356248140335083, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9036986827850342, "step": 13160 }, { "completion_length": 457.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 457.2, "completions/max_terminated_length": 427.8, "completions/mean_length": 90.29453125, "completions/mean_terminated_length": 89.77920532226562, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011914307485967996, "frac_reward_zero_std": 0.95625, "grad_norm": 3.9410972595214844, "kl": 0.2525344133726321, "learning_rate": 4.5107936507936504e-07, "loss": 0.0003, "num_tokens": 899872768.0, "reward": 0.459375, "reward_std": 0.037981899082660676, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8838858842849732, "step": 13165 }, { "completion_length": 289.2, "completions/clipped_ratio": 0.0, "completions/max_length": 289.2, "completions/max_terminated_length": 289.2, "completions/mean_length": 86.6890625, "completions/mean_terminated_length": 86.6890625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011918832479316256, "frac_reward_zero_std": 0.9375, "grad_norm": 10.728577613830566, "kl": 0.19234066009521483, "learning_rate": 4.510396825396825e-07, "loss": 0.0002, "num_tokens": 900181290.0, "reward": 0.2265625, "reward_std": 0.06191421747207641, "rewards/verify_chess_move/mean": 0.2265625, "rewards/verify_chess_move/std": 0.9537909269332886, "step": 13170 }, { "completion_length": 391.6, "completions/clipped_ratio": 0.0, "completions/max_length": 391.6, "completions/max_terminated_length": 391.6, "completions/mean_length": 86.96484375, "completions/mean_terminated_length": 86.96484375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011923357472664516, "frac_reward_zero_std": 0.925, "grad_norm": 3.9973301887512207, "kl": 0.3474556275177747, "learning_rate": 4.51e-07, "loss": 0.0003, "num_tokens": 900489701.0, "reward": 0.4015625, "reward_std": 0.06486285179853439, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.887345016002655, "step": 13175 }, { "completion_length": 368.2, "completions/clipped_ratio": 0.0, "completions/max_length": 368.2, "completions/max_terminated_length": 368.2, "completions/mean_length": 89.621875, "completions/mean_terminated_length": 89.621875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011927882466012774, "frac_reward_zero_std": 0.9625, "grad_norm": 15.853581428527832, "kl": 0.5727200875524432, "learning_rate": 4.5096031746031746e-07, "loss": 0.0006, "num_tokens": 900802313.0, "reward": 0.4234375, "reward_std": 0.03492846004664898, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8957647323608399, "step": 13180 }, { "completion_length": 408.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 408.0, "completions/max_terminated_length": 400.4, "completions/mean_length": 91.959375, "completions/mean_terminated_length": 91.43684539794921, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.011932407459361034, "frac_reward_zero_std": 0.95625, "grad_norm": 4.285308837890625, "kl": 1.0950571586145088, "learning_rate": 4.509206349206349e-07, "loss": 0.0011, "num_tokens": 901119333.0, "reward": 0.140625, "reward_std": 0.0359319519251585, "rewards/verify_chess_move/mean": 0.140625, "rewards/verify_chess_move/std": 0.9839522242546082, "step": 13185 }, { "completion_length": 333.8, "completions/clipped_ratio": 0.0, "completions/max_length": 333.8, "completions/max_terminated_length": 333.8, "completions/mean_length": 94.509375, "completions/mean_terminated_length": 94.509375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011936932452709294, "frac_reward_zero_std": 0.975, "grad_norm": 0.1015312597155571, "kl": 0.4136024337960407, "learning_rate": 4.5088095238095236e-07, "loss": 0.0004, "num_tokens": 901441801.0, "reward": 0.325, "reward_std": 0.023356688022613526, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9462376594543457, "step": 13190 }, { "completion_length": 458.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 458.6, "completions/max_terminated_length": 395.6, "completions/mean_length": 93.93125, "completions/mean_terminated_length": 93.40477447509765, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.011941457446057555, "frac_reward_zero_std": 0.9375, "grad_norm": 16.325376510620117, "kl": 1.704305725125596, "learning_rate": 4.508412698412698e-07, "loss": 0.0017, "num_tokens": 901762753.0, "reward": 0.38125, "reward_std": 0.058072981983423234, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9194788932800293, "step": 13195 }, { "completion_length": 442.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 442.8, "completions/max_terminated_length": 348.2, "completions/mean_length": 93.8703125, "completions/mean_terminated_length": 93.34032897949218, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011945982439405815, "frac_reward_zero_std": 0.95, "grad_norm": 6.283607482910156, "kl": 1.952591547742486, "learning_rate": 4.5080158730158727e-07, "loss": 0.002, "num_tokens": 902083675.0, "reward": 0.2578125, "reward_std": 0.04240131638944149, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9630145907402039, "step": 13200 }, { "completion_length": 328.4, "completions/clipped_ratio": 0.0, "completions/max_length": 328.4, "completions/max_terminated_length": 328.4, "completions/mean_length": 88.3859375, "completions/mean_terminated_length": 88.3859375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.011950507432754073, "frac_reward_zero_std": 0.93125, "grad_norm": 7.003514766693115, "kl": 1.0621647046646103, "learning_rate": 4.507619047619047e-07, "loss": 0.0011, "num_tokens": 902393713.0, "reward": 0.3640625, "reward_std": 0.06386033743619919, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.918933916091919, "step": 13205 }, { "completion_length": 444.4, "completions/clipped_ratio": 0.0, "completions/max_length": 444.4, "completions/max_terminated_length": 444.4, "completions/mean_length": 84.69296875, "completions/mean_terminated_length": 84.69296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011955032426102333, "frac_reward_zero_std": 0.925, "grad_norm": 6.09194278717041, "kl": 2.1776776691200213, "learning_rate": 4.5072222222222223e-07, "loss": 0.0022, "num_tokens": 902696336.0, "reward": 0.4109375, "reward_std": 0.06828073635697365, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.901929771900177, "step": 13210 }, { "completion_length": 272.2, "completions/clipped_ratio": 0.0, "completions/max_length": 272.2, "completions/max_terminated_length": 272.2, "completions/mean_length": 87.821875, "completions/mean_terminated_length": 87.821875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011959557419450593, "frac_reward_zero_std": 0.99375, "grad_norm": 3.2117536067962646, "kl": 0.5555365533335135, "learning_rate": 4.506825396825397e-07, "loss": 0.0006, "num_tokens": 903006356.0, "reward": 0.371875, "reward_std": 0.005786375701427459, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9253827452659606, "step": 13215 }, { "completion_length": 386.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 386.8, "completions/max_terminated_length": 329.4, "completions/mean_length": 91.1640625, "completions/mean_terminated_length": 90.0958480834961, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011964082412798853, "frac_reward_zero_std": 0.9375, "grad_norm": 11.004076957702637, "kl": 1.3679233325878157, "learning_rate": 4.506428571428571e-07, "loss": 0.0014, "num_tokens": 903321774.0, "reward": 0.284375, "reward_std": 0.05713133439421654, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9458828806877136, "step": 13220 }, { "completion_length": 542.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 542.0, "completions/max_terminated_length": 519.8, "completions/mean_length": 90.70703125, "completions/mean_terminated_length": 89.13747863769531, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011968607406147113, "frac_reward_zero_std": 0.94375, "grad_norm": 14.808318138122559, "kl": 1.0438109904294834, "learning_rate": 4.506031746031746e-07, "loss": 0.001, "num_tokens": 903634631.0, "reward": 0.3828125, "reward_std": 0.049342484772205354, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9173547387123108, "step": 13225 }, { "completion_length": 284.2, "completions/clipped_ratio": 0.0, "completions/max_length": 284.2, "completions/max_terminated_length": 284.2, "completions/mean_length": 86.83828125, "completions/mean_terminated_length": 86.83828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.011973132399495373, "frac_reward_zero_std": 0.975, "grad_norm": 0.6079545021057129, "kl": 0.4046687394380569, "learning_rate": 4.5056349206349205e-07, "loss": 0.0004, "num_tokens": 903942760.0, "reward": 0.3578125, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9222240447998047, "step": 13230 }, { "completion_length": 323.4, "completions/clipped_ratio": 0.0, "completions/max_length": 323.4, "completions/max_terminated_length": 323.4, "completions/mean_length": 92.41953125, "completions/mean_terminated_length": 92.41953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.011977657392843632, "frac_reward_zero_std": 0.91875, "grad_norm": 8.368127822875977, "kl": 2.655868194834329, "learning_rate": 4.505238095238095e-07, "loss": 0.0027, "num_tokens": 904260225.0, "reward": 0.371875, "reward_std": 0.06633716672658921, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9259037971496582, "step": 13235 }, { "completion_length": 289.8, "completions/clipped_ratio": 0.0, "completions/max_length": 289.8, "completions/max_terminated_length": 289.8, "completions/mean_length": 90.2125, "completions/mean_terminated_length": 90.2125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011982182386191892, "frac_reward_zero_std": 0.925, "grad_norm": 0.0028505439404398203, "kl": 1.3323178614489735, "learning_rate": 4.5048412698412696e-07, "loss": 0.0013, "num_tokens": 904574689.0, "reward": 0.4203125, "reward_std": 0.06644197553396225, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.893009614944458, "step": 13240 }, { "completion_length": 452.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 88.13125, "completions/mean_terminated_length": 88.13125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.011986707379540152, "frac_reward_zero_std": 0.95, "grad_norm": 0.3672820031642914, "kl": 1.7265148265054449, "learning_rate": 4.504444444444444e-07, "loss": 0.0017, "num_tokens": 904884273.0, "reward": 0.2390625, "reward_std": 0.04603038765490055, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9573310732841491, "step": 13245 }, { "completion_length": 308.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 88.6328125, "completions/mean_terminated_length": 88.6328125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.011991232372888412, "frac_reward_zero_std": 0.93125, "grad_norm": 13.707475662231445, "kl": 1.0787743476452305, "learning_rate": 4.504047619047619e-07, "loss": 0.0011, "num_tokens": 905197499.0, "reward": 0.3203125, "reward_std": 0.05749734900891781, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9394412398338318, "step": 13250 }, { "completion_length": 501.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 501.6, "completions/max_terminated_length": 325.0, "completions/mean_length": 96.534375, "completions/mean_terminated_length": 95.48704071044922, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.011995757366236672, "frac_reward_zero_std": 0.95625, "grad_norm": 0.44044405221939087, "kl": 3.106813927984331, "learning_rate": 4.503650793650793e-07, "loss": 0.0031, "num_tokens": 905523439.0, "reward": 0.31875, "reward_std": 0.03640277497470379, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9027238368988038, "step": 13255 }, { "completion_length": 362.6, "completions/clipped_ratio": 0.0, "completions/max_length": 362.6, "completions/max_terminated_length": 362.6, "completions/mean_length": 89.546875, "completions/mean_terminated_length": 89.546875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01200028235958493, "frac_reward_zero_std": 0.96875, "grad_norm": 0.11471165716648102, "kl": 1.7608937173616142, "learning_rate": 4.503253968253968e-07, "loss": 0.0018, "num_tokens": 905837075.0, "reward": 0.253125, "reward_std": 0.031663833558559416, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9513713598251343, "step": 13260 }, { "completion_length": 380.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 380.0, "completions/max_terminated_length": 357.4, "completions/mean_length": 97.02109375, "completions/mean_terminated_length": 96.5065673828125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01200480735293319, "frac_reward_zero_std": 0.95625, "grad_norm": 2.162888526916504, "kl": 4.409307922516018, "learning_rate": 4.502857142857143e-07, "loss": 0.0044, "num_tokens": 906162518.0, "reward": 0.259375, "reward_std": 0.042081793397665025, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.941780149936676, "step": 13265 }, { "completion_length": 377.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 377.8, "completions/max_terminated_length": 345.0, "completions/mean_length": 90.215625, "completions/mean_terminated_length": 89.15667419433593, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01200933234628145, "frac_reward_zero_std": 0.9375, "grad_norm": 10.711796760559082, "kl": 1.8241650948068127, "learning_rate": 4.5024603174603173e-07, "loss": 0.0018, "num_tokens": 906477090.0, "reward": 0.4140625, "reward_std": 0.0592287540435791, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.8999288082122803, "step": 13270 }, { "completion_length": 321.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 95.4046875, "completions/mean_terminated_length": 95.4046875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01201385733962971, "frac_reward_zero_std": 0.975, "grad_norm": 0.14231623709201813, "kl": 2.8027788824401796, "learning_rate": 4.502063492063492e-07, "loss": 0.0028, "num_tokens": 906798768.0, "reward": 0.3546875, "reward_std": 0.024039676785469054, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9122311234474182, "step": 13275 }, { "completion_length": 311.6, "completions/clipped_ratio": 0.0, "completions/max_length": 311.6, "completions/max_terminated_length": 311.6, "completions/mean_length": 92.334375, "completions/mean_terminated_length": 92.334375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012018382332977971, "frac_reward_zero_std": 0.93125, "grad_norm": 15.438239097595215, "kl": 1.480896059772931, "learning_rate": 4.5016666666666664e-07, "loss": 0.0015, "num_tokens": 907117116.0, "reward": 0.4453125, "reward_std": 0.06044245027005672, "rewards/verify_chess_move/mean": 0.4453125, "rewards/verify_chess_move/std": 0.867991316318512, "step": 13280 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 366.0, "completions/max_terminated_length": 361.6, "completions/mean_length": 82.51953125, "completions/mean_terminated_length": 81.98817443847656, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012022907326326231, "frac_reward_zero_std": 0.96875, "grad_norm": 3.6897997856140137, "kl": 1.5980104171205312, "learning_rate": 4.5012698412698415e-07, "loss": 0.0016, "num_tokens": 907419173.0, "reward": 0.284375, "reward_std": 0.02709311693906784, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9570700287818908, "step": 13285 }, { "completion_length": 274.8, "completions/clipped_ratio": 0.0, "completions/max_length": 274.8, "completions/max_terminated_length": 274.8, "completions/mean_length": 87.33828125, "completions/mean_terminated_length": 87.33828125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01202743231967449, "frac_reward_zero_std": 0.975, "grad_norm": 0.03460387885570526, "kl": 1.2714497552486137, "learning_rate": 4.5008730158730155e-07, "loss": 0.0013, "num_tokens": 907728782.0, "reward": 0.4625, "reward_std": 0.02177756354212761, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.8694353699684143, "step": 13290 }, { "completion_length": 372.4, "completions/clipped_ratio": 0.0, "completions/max_length": 372.4, "completions/max_terminated_length": 372.4, "completions/mean_length": 96.41953125, "completions/mean_terminated_length": 96.41953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01203195731302275, "frac_reward_zero_std": 0.93125, "grad_norm": 14.316489219665527, "kl": 2.1479025955311952, "learning_rate": 4.50047619047619e-07, "loss": 0.0021, "num_tokens": 908052983.0, "reward": 0.3234375, "reward_std": 0.057710496336221696, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9367906332015992, "step": 13295 }, { "completion_length": 316.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 91.67421875, "completions/mean_terminated_length": 91.67421875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01203648230637101, "frac_reward_zero_std": 0.94375, "grad_norm": 10.67195987701416, "kl": 4.5289729087846355, "learning_rate": 4.500079365079365e-07, "loss": 0.0045, "num_tokens": 908369446.0, "reward": 0.3484375, "reward_std": 0.049082846567034724, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9303852319717407, "step": 13300 }, { "completion_length": 392.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 392.2, "completions/max_terminated_length": 392.0, "completions/mean_length": 99.65390625, "completions/mean_terminated_length": 99.13326721191406, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01204100729971927, "frac_reward_zero_std": 0.9, "grad_norm": 2.398505926132202, "kl": 3.287482509005349, "learning_rate": 4.4996825396825396e-07, "loss": 0.0033, "num_tokens": 908697531.0, "reward": 0.3640625, "reward_std": 0.08979866318404675, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.925446879863739, "step": 13305 }, { "completion_length": 440.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 440.8, "completions/max_terminated_length": 340.2, "completions/mean_length": 95.9234375, "completions/mean_terminated_length": 95.39395904541016, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01204553229306753, "frac_reward_zero_std": 0.95, "grad_norm": 6.892808437347412, "kl": 3.5077290606335736, "learning_rate": 4.499285714285714e-07, "loss": 0.0035, "num_tokens": 909019289.0, "reward": 0.3515625, "reward_std": 0.04492208622395992, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9310031294822693, "step": 13310 }, { "completion_length": 310.2, "completions/clipped_ratio": 0.0, "completions/max_length": 310.2, "completions/max_terminated_length": 310.2, "completions/mean_length": 87.446875, "completions/mean_terminated_length": 87.446875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01205005728641579, "frac_reward_zero_std": 0.9375, "grad_norm": 18.110124588012695, "kl": 1.4658441715408117, "learning_rate": 4.4988888888888887e-07, "loss": 0.0015, "num_tokens": 909328741.0, "reward": 0.365625, "reward_std": 0.056707002222537994, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.8896223664283752, "step": 13315 }, { "completion_length": 316.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 92.36015625, "completions/mean_terminated_length": 92.36015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012054582279764048, "frac_reward_zero_std": 0.9375, "grad_norm": 3.5523271560668945, "kl": 0.27800462269224224, "learning_rate": 4.498492063492063e-07, "loss": 0.0003, "num_tokens": 909647690.0, "reward": 0.334375, "reward_std": 0.05965210571885109, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9354050397872925, "step": 13320 }, { "completion_length": 309.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 90.7765625, "completions/mean_terminated_length": 90.7765625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012059107273112308, "frac_reward_zero_std": 0.95, "grad_norm": 4.310946941375732, "kl": 0.766899812198244, "learning_rate": 4.498095238095238e-07, "loss": 0.0008, "num_tokens": 909962724.0, "reward": 0.353125, "reward_std": 0.04103435724973679, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9254178404808044, "step": 13325 }, { "completion_length": 418.8, "completions/clipped_ratio": 0.0, "completions/max_length": 418.8, "completions/max_terminated_length": 418.8, "completions/mean_length": 90.91796875, "completions/mean_terminated_length": 90.91796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012063632266460568, "frac_reward_zero_std": 0.91875, "grad_norm": 0.21453957259655, "kl": 1.192783141997643, "learning_rate": 4.4976984126984123e-07, "loss": 0.0012, "num_tokens": 910279859.0, "reward": 0.2890625, "reward_std": 0.06838613077998161, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9463361382484436, "step": 13330 }, { "completion_length": 458.4, "completions/clipped_ratio": 0.0, "completions/max_length": 458.4, "completions/max_terminated_length": 458.4, "completions/mean_length": 87.953125, "completions/mean_terminated_length": 87.953125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.012068157259808828, "frac_reward_zero_std": 0.9375, "grad_norm": 11.85755443572998, "kl": 9.242352405679412, "learning_rate": 4.4973015873015874e-07, "loss": 0.0092, "num_tokens": 910587927.0, "reward": 0.3703125, "reward_std": 0.05486922301352024, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9194631338119507, "step": 13335 }, { "completion_length": 324.2, "completions/clipped_ratio": 0.0, "completions/max_length": 324.2, "completions/max_terminated_length": 324.2, "completions/mean_length": 94.159375, "completions/mean_terminated_length": 94.159375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012072682253157089, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2532481253147125, "kl": 4.966171618318185, "learning_rate": 4.496904761904762e-07, "loss": 0.005, "num_tokens": 910908771.0, "reward": 0.2734375, "reward_std": 0.05444488972425461, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9469860792160034, "step": 13340 }, { "completion_length": 315.8, "completions/clipped_ratio": 0.0, "completions/max_length": 315.8, "completions/max_terminated_length": 315.8, "completions/mean_length": 88.03515625, "completions/mean_terminated_length": 88.03515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.012077207246505347, "frac_reward_zero_std": 0.94375, "grad_norm": 9.111397743225098, "kl": 2.2317400680156423, "learning_rate": 4.496507936507936e-07, "loss": 0.0022, "num_tokens": 911218976.0, "reward": 0.3796875, "reward_std": 0.04455862157046795, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.908746874332428, "step": 13345 }, { "completion_length": 298.6, "completions/clipped_ratio": 0.0, "completions/max_length": 298.6, "completions/max_terminated_length": 298.6, "completions/mean_length": 92.32265625, "completions/mean_terminated_length": 92.32265625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.012081732239853607, "frac_reward_zero_std": 0.94375, "grad_norm": 5.360095500946045, "kl": 3.8673410958610477, "learning_rate": 4.496111111111111e-07, "loss": 0.0039, "num_tokens": 911536693.0, "reward": 0.4046875, "reward_std": 0.049082846567034724, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9139262437820435, "step": 13350 }, { "completion_length": 364.4, "completions/clipped_ratio": 0.0, "completions/max_length": 364.4, "completions/max_terminated_length": 364.4, "completions/mean_length": 92.0890625, "completions/mean_terminated_length": 92.0890625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012086257233201867, "frac_reward_zero_std": 0.98125, "grad_norm": 20.42605972290039, "kl": 0.7146865020738915, "learning_rate": 4.4957142857142856e-07, "loss": 0.0007, "num_tokens": 911854863.0, "reward": 0.359375, "reward_std": 0.01872510462999344, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9110239148139954, "step": 13355 }, { "completion_length": 366.6, "completions/clipped_ratio": 0.0, "completions/max_length": 366.6, "completions/max_terminated_length": 366.6, "completions/mean_length": 86.684375, "completions/mean_terminated_length": 86.684375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012090782226550127, "frac_reward_zero_std": 0.95, "grad_norm": 11.6556396484375, "kl": 1.3831063052173704, "learning_rate": 4.49531746031746e-07, "loss": 0.0014, "num_tokens": 912162019.0, "reward": 0.3265625, "reward_std": 0.04560605585575104, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9354976773262024, "step": 13360 }, { "completion_length": 295.4, "completions/clipped_ratio": 0.0, "completions/max_length": 295.4, "completions/max_terminated_length": 295.4, "completions/mean_length": 89.80625, "completions/mean_terminated_length": 89.80625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012095307219898387, "frac_reward_zero_std": 0.95, "grad_norm": 1.584868311882019, "kl": 2.7045292779570445, "learning_rate": 4.4949206349206346e-07, "loss": 0.0027, "num_tokens": 912477011.0, "reward": 0.2078125, "reward_std": 0.043296470493078235, "rewards/verify_chess_move/mean": 0.2078125, "rewards/verify_chess_move/std": 0.9705015420913696, "step": 13365 }, { "completion_length": 449.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 449.2, "completions/max_terminated_length": 448.6, "completions/mean_length": 89.35234375, "completions/mean_terminated_length": 88.82633056640626, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012099832213246647, "frac_reward_zero_std": 0.95625, "grad_norm": 18.797731399536133, "kl": 3.636658887937665, "learning_rate": 4.494523809523809e-07, "loss": 0.0036, "num_tokens": 912788582.0, "reward": 0.3875, "reward_std": 0.037981899455189705, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9193676948547364, "step": 13370 }, { "completion_length": 294.2, "completions/clipped_ratio": 0.0, "completions/max_length": 294.2, "completions/max_terminated_length": 294.2, "completions/mean_length": 91.878125, "completions/mean_terminated_length": 91.878125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012104357206594906, "frac_reward_zero_std": 0.96875, "grad_norm": 1.9423612356185913, "kl": 1.3825708801159635, "learning_rate": 4.494126984126984e-07, "loss": 0.0014, "num_tokens": 913104594.0, "reward": 0.3359375, "reward_std": 0.024831003695726394, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9172020792961121, "step": 13375 }, { "completion_length": 403.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 90.64296875, "completions/mean_terminated_length": 90.64296875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012108882199943166, "frac_reward_zero_std": 0.95625, "grad_norm": 11.519205093383789, "kl": 1.7545305368490518, "learning_rate": 4.4937301587301583e-07, "loss": 0.0018, "num_tokens": 913420881.0, "reward": 0.2703125, "reward_std": 0.04318911507725716, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9627882719039917, "step": 13380 }, { "completion_length": 299.2, "completions/clipped_ratio": 0.0, "completions/max_length": 299.2, "completions/max_terminated_length": 299.2, "completions/mean_length": 81.7921875, "completions/mean_terminated_length": 81.7921875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.012113407193291426, "frac_reward_zero_std": 0.96875, "grad_norm": 12.989363670349121, "kl": 0.4790870146593079, "learning_rate": 4.493333333333333e-07, "loss": 0.0005, "num_tokens": 913719575.0, "reward": 0.4890625, "reward_std": 0.03098084479570389, "rewards/verify_chess_move/mean": 0.4890625, "rewards/verify_chess_move/std": 0.8578991174697876, "step": 13385 }, { "completion_length": 324.8, "completions/clipped_ratio": 0.0, "completions/max_length": 324.8, "completions/max_terminated_length": 324.8, "completions/mean_length": 87.73203125, "completions/mean_terminated_length": 87.73203125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012117932186639686, "frac_reward_zero_std": 0.96875, "grad_norm": 3.341263771057129, "kl": 0.25998980975709857, "learning_rate": 4.492936507936508e-07, "loss": 0.0003, "num_tokens": 914029584.0, "reward": 0.340625, "reward_std": 0.026409146562218665, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9292871475219726, "step": 13390 }, { "completion_length": 451.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 93.62265625, "completions/mean_terminated_length": 93.62265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012122457179987946, "frac_reward_zero_std": 0.9375, "grad_norm": 0.4723723530769348, "kl": 0.5670833233161829, "learning_rate": 4.4925396825396824e-07, "loss": 0.0006, "num_tokens": 914350285.0, "reward": 0.3953125, "reward_std": 0.05124015025794506, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9088901162147522, "step": 13395 }, { "completion_length": 473.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 473.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 89.278125, "completions/mean_terminated_length": 88.22326202392578, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012126982173336204, "frac_reward_zero_std": 0.93125, "grad_norm": 13.455509185791016, "kl": 0.9654836448724382, "learning_rate": 4.492142857142857e-07, "loss": 0.001, "num_tokens": 914662665.0, "reward": 0.3359375, "reward_std": 0.06023126505315304, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9372910737991333, "step": 13400 }, { "completion_length": 391.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 97.58203125, "completions/mean_terminated_length": 97.58203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012131507166684465, "frac_reward_zero_std": 0.9625, "grad_norm": 0.004510205704718828, "kl": 0.19810570229310542, "learning_rate": 4.4917460317460315e-07, "loss": 0.0002, "num_tokens": 914988050.0, "reward": 0.390625, "reward_std": 0.03424546979367733, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9120364427566529, "step": 13405 }, { "completion_length": 306.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 92.546875, "completions/mean_terminated_length": 92.546875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012136032160032725, "frac_reward_zero_std": 0.93125, "grad_norm": 13.279236793518066, "kl": 0.32063783921767025, "learning_rate": 4.491349206349206e-07, "loss": 0.0003, "num_tokens": 915304110.0, "reward": 0.4515625, "reward_std": 0.05634255670011044, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8656131148338317, "step": 13410 }, { "completion_length": 289.2, "completions/clipped_ratio": 0.0, "completions/max_length": 289.2, "completions/max_terminated_length": 289.2, "completions/mean_length": 79.5765625, "completions/mean_terminated_length": 79.5765625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012140557153380985, "frac_reward_zero_std": 0.9625, "grad_norm": 0.11079901456832886, "kl": 0.2249411356402561, "learning_rate": 4.4909523809523806e-07, "loss": 0.0002, "num_tokens": 915599456.0, "reward": 0.3890625, "reward_std": 0.03492845855653286, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9180670142173767, "step": 13415 }, { "completion_length": 283.4, "completions/clipped_ratio": 0.0, "completions/max_length": 283.4, "completions/max_terminated_length": 283.4, "completions/mean_length": 92.26015625, "completions/mean_terminated_length": 92.26015625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.012145082146729245, "frac_reward_zero_std": 0.9625, "grad_norm": 0.012795308604836464, "kl": 0.32952715931460264, "learning_rate": 4.490555555555555e-07, "loss": 0.0003, "num_tokens": 915917877.0, "reward": 0.403125, "reward_std": 0.03424546979367733, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9121041178703309, "step": 13420 }, { "completion_length": 462.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 462.6, "completions/max_terminated_length": 458.8, "completions/mean_length": 95.77265625, "completions/mean_terminated_length": 94.73248596191407, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012149607140077505, "frac_reward_zero_std": 0.95, "grad_norm": 2.7295358180999756, "kl": 1.8238832313683815, "learning_rate": 4.49015873015873e-07, "loss": 0.0018, "num_tokens": 916238770.0, "reward": 0.33125, "reward_std": 0.04513425230979919, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9273011326789856, "step": 13425 }, { "completion_length": 377.4, "completions/clipped_ratio": 0.0, "completions/max_length": 377.4, "completions/max_terminated_length": 377.4, "completions/mean_length": 83.21640625, "completions/mean_terminated_length": 83.21640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012154132133425763, "frac_reward_zero_std": 0.95625, "grad_norm": 2.3074405193328857, "kl": 0.30829997655237096, "learning_rate": 4.4897619047619047e-07, "loss": 0.0003, "num_tokens": 916540223.0, "reward": 0.3046875, "reward_std": 0.033669837936759, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9530230164527893, "step": 13430 }, { "completion_length": 387.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 387.8, "completions/max_terminated_length": 319.2, "completions/mean_length": 90.9515625, "completions/mean_terminated_length": 90.42715148925781, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012158657126774023, "frac_reward_zero_std": 0.95, "grad_norm": 0.7069880962371826, "kl": 0.8277134581468999, "learning_rate": 4.4893650793650787e-07, "loss": 0.0008, "num_tokens": 916855473.0, "reward": 0.365625, "reward_std": 0.043768275529146194, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9235346436500549, "step": 13435 }, { "completion_length": 480.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 480.2, "completions/max_terminated_length": 382.6, "completions/mean_length": 88.9828125, "completions/mean_terminated_length": 87.9033706665039, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.012163182120122283, "frac_reward_zero_std": 0.95625, "grad_norm": 5.573620796203613, "kl": 0.7091945898486302, "learning_rate": 4.488968253968254e-07, "loss": 0.0007, "num_tokens": 917166099.0, "reward": 0.4078125, "reward_std": 0.037769732996821404, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.9097385883331299, "step": 13440 }, { "completion_length": 435.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 435.4, "completions/max_terminated_length": 352.6, "completions/mean_length": 93.61875, "completions/mean_terminated_length": 93.08966674804688, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012167707113470544, "frac_reward_zero_std": 0.975, "grad_norm": 9.900766372680664, "kl": 0.5231267744442448, "learning_rate": 4.4885714285714283e-07, "loss": 0.0005, "num_tokens": 917485243.0, "reward": 0.4125, "reward_std": 0.019727616384625436, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.897387421131134, "step": 13445 }, { "completion_length": 344.8, "completions/clipped_ratio": 0.0, "completions/max_length": 344.8, "completions/max_terminated_length": 344.8, "completions/mean_length": 91.1796875, "completions/mean_terminated_length": 91.1796875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012172232106818804, "frac_reward_zero_std": 0.96875, "grad_norm": 9.097612380981445, "kl": 0.8428274664562195, "learning_rate": 4.4881746031746034e-07, "loss": 0.0008, "num_tokens": 917800777.0, "reward": 0.3234375, "reward_std": 0.02777610570192337, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9456029891967773, "step": 13450 }, { "completion_length": 367.4, "completions/clipped_ratio": 0.0, "completions/max_length": 367.4, "completions/max_terminated_length": 367.4, "completions/mean_length": 97.925, "completions/mean_terminated_length": 97.925, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012176757100167062, "frac_reward_zero_std": 0.95625, "grad_norm": 5.000816822052002, "kl": 0.7728267756756395, "learning_rate": 4.4877777777777774e-07, "loss": 0.0008, "num_tokens": 918128161.0, "reward": 0.26875, "reward_std": 0.03845272101461887, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9616543889045716, "step": 13455 }, { "completion_length": 326.8, "completions/clipped_ratio": 0.0, "completions/max_length": 326.8, "completions/max_terminated_length": 326.8, "completions/mean_length": 90.43671875, "completions/mean_terminated_length": 90.43671875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012181282093515322, "frac_reward_zero_std": 0.9625, "grad_norm": 0.017811810597777367, "kl": 1.6771648790920153, "learning_rate": 4.487380952380952e-07, "loss": 0.0017, "num_tokens": 918442216.0, "reward": 0.3203125, "reward_std": 0.03561242893338203, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.942713713645935, "step": 13460 }, { "completion_length": 431.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 431.2, "completions/max_terminated_length": 336.8, "completions/mean_length": 88.7375, "completions/mean_terminated_length": 88.20948638916016, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012185807086863582, "frac_reward_zero_std": 0.95, "grad_norm": 10.74189281463623, "kl": 1.0450114352512174, "learning_rate": 4.486984126984127e-07, "loss": 0.001, "num_tokens": 918753440.0, "reward": 0.309375, "reward_std": 0.044239097461104396, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9421341180801391, "step": 13465 }, { "completion_length": 325.6, "completions/clipped_ratio": 0.0, "completions/max_length": 325.6, "completions/max_terminated_length": 325.6, "completions/mean_length": 95.2078125, "completions/mean_terminated_length": 95.2078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012190332080211842, "frac_reward_zero_std": 0.93125, "grad_norm": 8.667315483093262, "kl": 2.289178528776392, "learning_rate": 4.486587301587301e-07, "loss": 0.0023, "num_tokens": 919076938.0, "reward": 0.325, "reward_std": 0.06381286717951298, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9258642435073853, "step": 13470 }, { "completion_length": 295.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 81.91875, "completions/mean_terminated_length": 81.91875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.012194857073560102, "frac_reward_zero_std": 0.9625, "grad_norm": 9.662023544311523, "kl": 1.3100198335363529, "learning_rate": 4.486190476190476e-07, "loss": 0.0013, "num_tokens": 919377122.0, "reward": 0.3359375, "reward_std": 0.030617379397153855, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9335513830184936, "step": 13475 }, { "completion_length": 280.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 88.21953125, "completions/mean_terminated_length": 88.21953125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012199382066908362, "frac_reward_zero_std": 0.93125, "grad_norm": 17.677532196044922, "kl": 1.6531117282342165, "learning_rate": 4.4857936507936506e-07, "loss": 0.0017, "num_tokens": 919687955.0, "reward": 0.3484375, "reward_std": 0.064071524143219, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.937584114074707, "step": 13480 }, { "completion_length": 464.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 464.0, "completions/max_terminated_length": 407.4, "completions/mean_length": 90.3703125, "completions/mean_terminated_length": 89.84838104248047, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01220390706025662, "frac_reward_zero_std": 0.93125, "grad_norm": 7.686020851135254, "kl": 3.105699821084272, "learning_rate": 4.485396825396825e-07, "loss": 0.0031, "num_tokens": 920002613.0, "reward": 0.403125, "reward_std": 0.05613039135932922, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9155347228050232, "step": 13485 }, { "completion_length": 326.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 96.12578125, "completions/mean_terminated_length": 96.12578125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012208432053604881, "frac_reward_zero_std": 0.96875, "grad_norm": 0.04026356711983681, "kl": 1.3793572471709923, "learning_rate": 4.4849999999999997e-07, "loss": 0.0014, "num_tokens": 920328526.0, "reward": 0.371875, "reward_std": 0.025513992831110954, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9149271607398987, "step": 13490 }, { "completion_length": 346.2, "completions/clipped_ratio": 0.0, "completions/max_length": 346.2, "completions/max_terminated_length": 346.2, "completions/mean_length": 92.09453125, "completions/mean_terminated_length": 92.09453125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012212957046953141, "frac_reward_zero_std": 0.93125, "grad_norm": 0.04554350674152374, "kl": 2.4346945804310964, "learning_rate": 4.4846031746031743e-07, "loss": 0.0024, "num_tokens": 920646239.0, "reward": 0.4484375, "reward_std": 0.06360070034861565, "rewards/verify_chess_move/mean": 0.4484375, "rewards/verify_chess_move/std": 0.8706506013870239, "step": 13495 }, { "completion_length": 419.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 419.8, "completions/max_terminated_length": 327.0, "completions/mean_length": 89.14609375, "completions/mean_terminated_length": 88.61463317871093, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012217482040301401, "frac_reward_zero_std": 0.94375, "grad_norm": 7.221227169036865, "kl": 1.537296906323172, "learning_rate": 4.4842063492063493e-07, "loss": 0.0015, "num_tokens": 920958490.0, "reward": 0.3796875, "reward_std": 0.04750372171401977, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9242557644844055, "step": 13500 }, { "completion_length": 399.4, "completions/clipped_ratio": 0.0, "completions/max_length": 399.4, "completions/max_terminated_length": 399.4, "completions/mean_length": 83.665625, "completions/mean_terminated_length": 83.665625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012222007033649661, "frac_reward_zero_std": 0.94375, "grad_norm": 4.418682098388672, "kl": 2.6224758128053507, "learning_rate": 4.4838095238095233e-07, "loss": 0.0026, "num_tokens": 921260262.0, "reward": 0.484375, "reward_std": 0.050025473535060885, "rewards/verify_chess_move/mean": 0.484375, "rewards/verify_chess_move/std": 0.8659189701080322, "step": 13505 }, { "completion_length": 313.6, "completions/clipped_ratio": 0.0, "completions/max_length": 313.6, "completions/max_terminated_length": 313.6, "completions/mean_length": 85.0328125, "completions/mean_terminated_length": 85.0328125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01222653202699792, "frac_reward_zero_std": 0.96875, "grad_norm": 0.22085972130298615, "kl": 2.8185639064526185, "learning_rate": 4.483412698412698e-07, "loss": 0.0028, "num_tokens": 921564568.0, "reward": 0.4515625, "reward_std": 0.02777610570192337, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8915515780448914, "step": 13510 }, { "completion_length": 419.8, "completions/clipped_ratio": 0.0, "completions/max_length": 419.8, "completions/max_terminated_length": 419.8, "completions/mean_length": 100.08984375, "completions/mean_terminated_length": 100.08984375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01223105702034618, "frac_reward_zero_std": 0.9625, "grad_norm": 9.257256507873535, "kl": 2.3748037768062202, "learning_rate": 4.483015873015873e-07, "loss": 0.0024, "num_tokens": 921894659.0, "reward": 0.2390625, "reward_std": 0.02993340939283371, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9689783334732056, "step": 13515 }, { "completion_length": 375.8, "completions/clipped_ratio": 0.0, "completions/max_length": 375.8, "completions/max_terminated_length": 375.8, "completions/mean_length": 85.446875, "completions/mean_terminated_length": 85.446875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01223558201369444, "frac_reward_zero_std": 0.975, "grad_norm": 6.179490089416504, "kl": 3.3381194938672705, "learning_rate": 4.4826190476190475e-07, "loss": 0.0033, "num_tokens": 922200351.0, "reward": 0.425, "reward_std": 0.0245114803314209, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.8854163885116577, "step": 13520 }, { "completion_length": 471.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 471.6, "completions/max_terminated_length": 439.8, "completions/mean_length": 98.87109375, "completions/mean_terminated_length": 98.35054321289063, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0122401070070427, "frac_reward_zero_std": 0.95625, "grad_norm": 2.3535068035125732, "kl": 3.252160304924473, "learning_rate": 4.482222222222222e-07, "loss": 0.0033, "num_tokens": 922528306.0, "reward": 0.4421875, "reward_std": 0.03729890994727612, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8945988416671753, "step": 13525 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 99.24140625, "completions/mean_terminated_length": 99.24140625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01224463200039096, "frac_reward_zero_std": 0.95, "grad_norm": 4.151458740234375, "kl": 2.0777494597947226, "learning_rate": 4.4818253968253966e-07, "loss": 0.0021, "num_tokens": 922857711.0, "reward": 0.2984375, "reward_std": 0.0415061604231596, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9536880254745483, "step": 13530 }, { "completion_length": 444.6, "completions/clipped_ratio": 0.0, "completions/max_length": 444.6, "completions/max_terminated_length": 444.6, "completions/mean_length": 93.16015625, "completions/mean_terminated_length": 93.16015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01224915699373922, "frac_reward_zero_std": 0.95625, "grad_norm": 12.627819061279297, "kl": 1.535335206030868, "learning_rate": 4.481428571428571e-07, "loss": 0.0015, "num_tokens": 923175532.0, "reward": 0.446875, "reward_std": 0.03230287954211235, "rewards/verify_chess_move/mean": 0.446875, "rewards/verify_chess_move/std": 0.8861581921577454, "step": 13535 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/max_terminated_length": 359.2, "completions/mean_length": 98.98359375, "completions/mean_terminated_length": 98.98359375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012253681987087478, "frac_reward_zero_std": 0.925, "grad_norm": 9.823542594909668, "kl": 4.970394715387374, "learning_rate": 4.481031746031746e-07, "loss": 0.005, "num_tokens": 923503503.0, "reward": 0.396875, "reward_std": 0.06076197475194931, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.8963746309280396, "step": 13540 }, { "completion_length": 413.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 413.0, "completions/max_terminated_length": 321.4, "completions/mean_length": 93.8515625, "completions/mean_terminated_length": 93.32007751464843, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012258206980435738, "frac_reward_zero_std": 0.8875, "grad_norm": 23.027774810791016, "kl": 2.8139465942163953, "learning_rate": 4.48063492063492e-07, "loss": 0.0028, "num_tokens": 923822017.0, "reward": 0.2765625, "reward_std": 0.09842435121536255, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9539449095726014, "step": 13545 }, { "completion_length": 321.6, "completions/clipped_ratio": 0.0, "completions/max_length": 321.6, "completions/max_terminated_length": 321.6, "completions/mean_length": 86.59453125, "completions/mean_terminated_length": 86.59453125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012262731973783999, "frac_reward_zero_std": 0.9625, "grad_norm": 0.1627970188856125, "kl": 1.2195680083939806, "learning_rate": 4.480238095238095e-07, "loss": 0.0012, "num_tokens": 924130402.0, "reward": 0.2890625, "reward_std": 0.03719155341386795, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9323653221130371, "step": 13550 }, { "completion_length": 567.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 567.2, "completions/max_terminated_length": 482.4, "completions/mean_length": 95.28515625, "completions/mean_terminated_length": 94.74989624023438, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012267256967132259, "frac_reward_zero_std": 0.9375, "grad_norm": 9.804206848144531, "kl": 1.6703974312753416, "learning_rate": 4.47984126984127e-07, "loss": 0.0017, "num_tokens": 924452511.0, "reward": 0.3046875, "reward_std": 0.05307891331613064, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9410781264305115, "step": 13555 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 89.4078125, "completions/mean_terminated_length": 89.4078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012271781960480519, "frac_reward_zero_std": 0.9625, "grad_norm": 3.9640626907348633, "kl": 1.7081414927262812, "learning_rate": 4.479444444444444e-07, "loss": 0.0017, "num_tokens": 924763873.0, "reward": 0.40625, "reward_std": 0.0306163989007473, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9128809690475463, "step": 13560 }, { "completion_length": 286.6, "completions/clipped_ratio": 0.0, "completions/max_length": 286.6, "completions/max_terminated_length": 286.6, "completions/mean_length": 93.6390625, "completions/mean_terminated_length": 93.6390625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.012276306953828777, "frac_reward_zero_std": 0.975, "grad_norm": 0.23838600516319275, "kl": 1.2710676059592516, "learning_rate": 4.479047619047619e-07, "loss": 0.0013, "num_tokens": 925083403.0, "reward": 0.3234375, "reward_std": 0.019044627994298936, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9442860245704651, "step": 13565 }, { "completion_length": 286.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 92.51953125, "completions/mean_terminated_length": 92.51953125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012280831947177037, "frac_reward_zero_std": 0.9375, "grad_norm": 6.097043037414551, "kl": 2.048880698601715, "learning_rate": 4.4786507936507934e-07, "loss": 0.002, "num_tokens": 925402692.0, "reward": 0.4203125, "reward_std": 0.05555319339036942, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8819689631462098, "step": 13570 }, { "completion_length": 390.8, "completions/clipped_ratio": 0.0, "completions/max_length": 390.8, "completions/max_terminated_length": 390.8, "completions/mean_length": 81.16875, "completions/mean_terminated_length": 81.16875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012285356940525297, "frac_reward_zero_std": 0.96875, "grad_norm": 18.849918365478516, "kl": 1.4801165360957385, "learning_rate": 4.4782539682539685e-07, "loss": 0.0015, "num_tokens": 925701340.0, "reward": 0.253125, "reward_std": 0.025513992458581925, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9672653198242187, "step": 13575 }, { "completion_length": 345.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 89.7640625, "completions/mean_terminated_length": 89.7640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012289881933873557, "frac_reward_zero_std": 0.95, "grad_norm": 1.2073346376419067, "kl": 2.2179929120698945, "learning_rate": 4.4778571428571425e-07, "loss": 0.0022, "num_tokens": 926014702.0, "reward": 0.1984375, "reward_std": 0.04240131564438343, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.9693786382675171, "step": 13580 }, { "completion_length": 393.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 393.2, "completions/max_terminated_length": 391.2, "completions/mean_length": 85.97734375, "completions/mean_terminated_length": 85.45132751464844, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012294406927221817, "frac_reward_zero_std": 0.95, "grad_norm": 10.767751693725586, "kl": 0.9178845600225032, "learning_rate": 4.477460317460317e-07, "loss": 0.0009, "num_tokens": 926321129.0, "reward": 0.41875, "reward_std": 0.04492306672036648, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8791290163993836, "step": 13585 }, { "completion_length": 293.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 88.30625, "completions/mean_terminated_length": 88.30625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012298931920570078, "frac_reward_zero_std": 0.95, "grad_norm": 3.2485837936401367, "kl": 2.734885406401008, "learning_rate": 4.477063492063492e-07, "loss": 0.0027, "num_tokens": 926632569.0, "reward": 0.35, "reward_std": 0.04534740038216114, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9271889925003052, "step": 13590 }, { "completion_length": 278.4, "completions/clipped_ratio": 0.0, "completions/max_length": 278.4, "completions/max_terminated_length": 278.4, "completions/mean_length": 87.10234375, "completions/mean_terminated_length": 87.10234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012303456913918336, "frac_reward_zero_std": 0.94375, "grad_norm": 2.397298812866211, "kl": 1.2244275572244079, "learning_rate": 4.476666666666666e-07, "loss": 0.0012, "num_tokens": 926943004.0, "reward": 0.39375, "reward_std": 0.04955465085804463, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.9161169409751893, "step": 13595 }, { "completion_length": 292.4, "completions/clipped_ratio": 0.0, "completions/max_length": 292.4, "completions/max_terminated_length": 292.4, "completions/mean_length": 83.60625, "completions/mean_terminated_length": 83.60625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012307981907266596, "frac_reward_zero_std": 0.95625, "grad_norm": 5.699654579162598, "kl": 0.643917353451252, "learning_rate": 4.476269841269841e-07, "loss": 0.0006, "num_tokens": 927247028.0, "reward": 0.2953125, "reward_std": 0.03571978434920311, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9325159549713135, "step": 13600 }, { "completion_length": 374.2, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 92.31640625, "completions/mean_terminated_length": 92.31640625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012312506900614856, "frac_reward_zero_std": 0.9625, "grad_norm": 1.58925461769104, "kl": 0.36475690675433725, "learning_rate": 4.4758730158730157e-07, "loss": 0.0004, "num_tokens": 927562961.0, "reward": 0.2890625, "reward_std": 0.032667326182127, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9434988737106323, "step": 13605 }, { "completion_length": 448.6, "completions/clipped_ratio": 0.0, "completions/max_length": 448.6, "completions/max_terminated_length": 448.6, "completions/mean_length": 83.61015625, "completions/mean_terminated_length": 83.61015625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012317031893963116, "frac_reward_zero_std": 0.95625, "grad_norm": 0.05316130444407463, "kl": 1.185577236081008, "learning_rate": 4.4754761904761903e-07, "loss": 0.0012, "num_tokens": 927864838.0, "reward": 0.39375, "reward_std": 0.035036797448992726, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.911237919330597, "step": 13610 }, { "completion_length": 309.4, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/max_terminated_length": 309.4, "completions/mean_length": 91.04140625, "completions/mean_terminated_length": 91.04140625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012321556887311376, "frac_reward_zero_std": 0.9875, "grad_norm": 0.2068227082490921, "kl": 0.7354175436543301, "learning_rate": 4.475079365079365e-07, "loss": 0.0007, "num_tokens": 928182691.0, "reward": 0.3109375, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9464634418487549, "step": 13615 }, { "completion_length": 340.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 96.60859375, "completions/mean_terminated_length": 96.60859375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012326081880659635, "frac_reward_zero_std": 0.925, "grad_norm": 4.69100284576416, "kl": 0.5442435313947499, "learning_rate": 4.4746825396825393e-07, "loss": 0.0005, "num_tokens": 928507662.0, "reward": 0.4203125, "reward_std": 0.06191774569451809, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8934035420417785, "step": 13620 }, { "completion_length": 287.4, "completions/clipped_ratio": 0.0, "completions/max_length": 287.4, "completions/max_terminated_length": 287.4, "completions/mean_length": 91.9140625, "completions/mean_terminated_length": 91.9140625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012330606874007895, "frac_reward_zero_std": 0.93125, "grad_norm": 14.76817798614502, "kl": 0.44092725722584875, "learning_rate": 4.4742857142857144e-07, "loss": 0.0004, "num_tokens": 928824472.0, "reward": 0.2578125, "reward_std": 0.05839250311255455, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9582268834114075, "step": 13625 }, { "completion_length": 399.2, "completions/clipped_ratio": 0.0, "completions/max_length": 399.2, "completions/max_terminated_length": 399.2, "completions/mean_length": 97.690625, "completions/mean_terminated_length": 97.690625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012335131867356155, "frac_reward_zero_std": 0.93125, "grad_norm": 6.951592922210693, "kl": 1.220489654969424, "learning_rate": 4.473888888888889e-07, "loss": 0.0012, "num_tokens": 929150580.0, "reward": 0.2890625, "reward_std": 0.056813379377126695, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.953091561794281, "step": 13630 }, { "completion_length": 572.4, "completions/clipped_ratio": 0.003125, "completions/max_length": 572.4, "completions/max_terminated_length": 523.6, "completions/mean_length": 96.50703125, "completions/mean_terminated_length": 94.43641967773438, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012339656860704415, "frac_reward_zero_std": 0.94375, "grad_norm": 0.5796162486076355, "kl": 0.8400028305011802, "learning_rate": 4.473492063492063e-07, "loss": 0.0008, "num_tokens": 929474109.0, "reward": 0.4125, "reward_std": 0.050920628011226654, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.8780403375625611, "step": 13635 }, { "completion_length": 316.2, "completions/clipped_ratio": 0.0, "completions/max_length": 316.2, "completions/max_terminated_length": 316.2, "completions/mean_length": 82.08359375, "completions/mean_terminated_length": 82.08359375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012344181854052675, "frac_reward_zero_std": 0.9625, "grad_norm": 8.548069953918457, "kl": 0.7636636043898761, "learning_rate": 4.473095238095238e-07, "loss": 0.0008, "num_tokens": 929776368.0, "reward": 0.3921875, "reward_std": 0.031983356550335885, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9179409861564636, "step": 13640 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 91.43046875, "completions/mean_terminated_length": 91.43046875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012348706847400935, "frac_reward_zero_std": 0.93125, "grad_norm": 14.509827613830566, "kl": 1.8407615900272503, "learning_rate": 4.4726984126984126e-07, "loss": 0.0018, "num_tokens": 930092151.0, "reward": 0.4703125, "reward_std": 0.05954729653894901, "rewards/verify_chess_move/mean": 0.4703125, "rewards/verify_chess_move/std": 0.8742824792861938, "step": 13645 }, { "completion_length": 624.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 624.6, "completions/max_terminated_length": 553.8, "completions/mean_length": 95.63828125, "completions/mean_terminated_length": 94.07629089355468, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012353231840749193, "frac_reward_zero_std": 0.93125, "grad_norm": 10.365503311157227, "kl": 1.9029917588224634, "learning_rate": 4.472301587301587e-07, "loss": 0.0019, "num_tokens": 930413432.0, "reward": 0.271875, "reward_std": 0.06133858561515808, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9556421756744384, "step": 13650 }, { "completion_length": 293.2, "completions/clipped_ratio": 0.0, "completions/max_length": 293.2, "completions/max_terminated_length": 293.2, "completions/mean_length": 89.89609375, "completions/mean_terminated_length": 89.89609375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012357756834097454, "frac_reward_zero_std": 0.94375, "grad_norm": 9.232946395874023, "kl": 0.988986179116182, "learning_rate": 4.4719047619047617e-07, "loss": 0.001, "num_tokens": 930727291.0, "reward": 0.4109375, "reward_std": 0.04797454625368118, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.9066898226737976, "step": 13655 }, { "completion_length": 322.2, "completions/clipped_ratio": 0.0, "completions/max_length": 322.2, "completions/max_terminated_length": 322.2, "completions/mean_length": 92.44765625, "completions/mean_terminated_length": 92.44765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012362281827445714, "frac_reward_zero_std": 0.93125, "grad_norm": 9.285995483398438, "kl": 1.9487893290584908, "learning_rate": 4.471507936507936e-07, "loss": 0.0019, "num_tokens": 931044864.0, "reward": 0.325, "reward_std": 0.06543848365545273, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9409350752830505, "step": 13660 }, { "completion_length": 330.2, "completions/clipped_ratio": 0.0, "completions/max_length": 330.2, "completions/max_terminated_length": 330.2, "completions/mean_length": 86.98359375, "completions/mean_terminated_length": 86.98359375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012366806820793974, "frac_reward_zero_std": 0.95625, "grad_norm": 0.9125006198883057, "kl": 3.4643127942457794, "learning_rate": 4.4711111111111113e-07, "loss": 0.0035, "num_tokens": 931353099.0, "reward": 0.4265625, "reward_std": 0.04071483500301838, "rewards/verify_chess_move/mean": 0.4265625, "rewards/verify_chess_move/std": 0.8713345289230346, "step": 13665 }, { "completion_length": 338.6, "completions/clipped_ratio": 0.0, "completions/max_length": 338.6, "completions/max_terminated_length": 338.6, "completions/mean_length": 91.85234375, "completions/mean_terminated_length": 91.85234375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012371331814142234, "frac_reward_zero_std": 0.96875, "grad_norm": 9.689765930175781, "kl": 1.4246282132109627, "learning_rate": 4.4707142857142853e-07, "loss": 0.0014, "num_tokens": 931671558.0, "reward": 0.3390625, "reward_std": 0.02414703369140625, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9390253186225891, "step": 13670 }, { "completion_length": 429.4, "completions/clipped_ratio": 0.0, "completions/max_length": 429.4, "completions/max_terminated_length": 429.4, "completions/mean_length": 91.753125, "completions/mean_terminated_length": 91.753125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012375856807490492, "frac_reward_zero_std": 0.94375, "grad_norm": 17.975543975830078, "kl": 2.195030330074951, "learning_rate": 4.4703174603174603e-07, "loss": 0.0022, "num_tokens": 931988490.0, "reward": 0.328125, "reward_std": 0.050920627638697626, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9447761178016663, "step": 13675 }, { "completion_length": 406.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 406.8, "completions/max_terminated_length": 326.8, "completions/mean_length": 88.38515625, "completions/mean_terminated_length": 87.84449920654296, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012380381800838752, "frac_reward_zero_std": 0.9375, "grad_norm": 15.845841407775879, "kl": 0.9714588402886875, "learning_rate": 4.469920634920635e-07, "loss": 0.001, "num_tokens": 932300351.0, "reward": 0.3390625, "reward_std": 0.05512886010110378, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9375898599624634, "step": 13680 }, { "completion_length": 391.6, "completions/clipped_ratio": 0.0, "completions/max_length": 391.6, "completions/max_terminated_length": 391.6, "completions/mean_length": 93.28828125, "completions/mean_terminated_length": 93.28828125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012384906794187012, "frac_reward_zero_std": 0.95, "grad_norm": 3.1839795112609863, "kl": 2.083809086692054, "learning_rate": 4.469523809523809e-07, "loss": 0.0021, "num_tokens": 932620768.0, "reward": 0.303125, "reward_std": 0.04308430477976799, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9285055279731751, "step": 13685 }, { "completion_length": 335.6, "completions/clipped_ratio": 0.0, "completions/max_length": 335.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 87.4953125, "completions/mean_terminated_length": 87.4953125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012389431787535272, "frac_reward_zero_std": 0.9375, "grad_norm": 10.299342155456543, "kl": 2.4382756252540276, "learning_rate": 4.469126984126984e-07, "loss": 0.0024, "num_tokens": 932930642.0, "reward": 0.340625, "reward_std": 0.05307793132960796, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9051625370979309, "step": 13690 }, { "completion_length": 412.2, "completions/clipped_ratio": 0.0, "completions/max_length": 412.2, "completions/max_terminated_length": 412.2, "completions/mean_length": 82.60625, "completions/mean_terminated_length": 82.60625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012393956780883533, "frac_reward_zero_std": 0.9375, "grad_norm": 8.670393943786621, "kl": 1.8485332337208091, "learning_rate": 4.4687301587301585e-07, "loss": 0.0018, "num_tokens": 933230794.0, "reward": 0.4703125, "reward_std": 0.05444489233195782, "rewards/verify_chess_move/mean": 0.4703125, "rewards/verify_chess_move/std": 0.8804496049880981, "step": 13695 }, { "completion_length": 491.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 491.6, "completions/max_terminated_length": 406.8, "completions/mean_length": 91.65078125, "completions/mean_terminated_length": 90.59744110107422, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.012398481774231793, "frac_reward_zero_std": 0.9375, "grad_norm": 12.126840591430664, "kl": 1.8841224566102028, "learning_rate": 4.4683333333333336e-07, "loss": 0.0019, "num_tokens": 933546243.0, "reward": 0.265625, "reward_std": 0.05739097446203232, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.9559686183929443, "step": 13700 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 369.2, "completions/max_terminated_length": 336.4, "completions/mean_length": 90.44375, "completions/mean_terminated_length": 89.91396484375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012403006767580051, "frac_reward_zero_std": 0.94375, "grad_norm": 18.383098602294922, "kl": 1.1316723937285134, "learning_rate": 4.4679365079365076e-07, "loss": 0.0011, "num_tokens": 933860763.0, "reward": 0.3484375, "reward_std": 0.048658515140414235, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.92985520362854, "step": 13705 }, { "completion_length": 377.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 377.2, "completions/max_terminated_length": 341.6, "completions/mean_length": 80.29765625, "completions/mean_terminated_length": 79.7673324584961, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012407531760928311, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03601764142513275, "kl": 0.40418951625470073, "learning_rate": 4.467539682539682e-07, "loss": 0.0004, "num_tokens": 934158776.0, "reward": 0.403125, "reward_std": 0.028247909247875215, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9146329402923584, "step": 13710 }, { "completion_length": 461.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 461.4, "completions/max_terminated_length": 295.4, "completions/mean_length": 87.43046875, "completions/mean_terminated_length": 85.83064117431641, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012412056754276571, "frac_reward_zero_std": 0.95625, "grad_norm": 4.550685882568359, "kl": 0.9286166983190924, "learning_rate": 4.467142857142857e-07, "loss": 0.0009, "num_tokens": 934469463.0, "reward": 0.2234375, "reward_std": 0.037298908829689024, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9697602033615113, "step": 13715 }, { "completion_length": 383.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 383.2, "completions/max_terminated_length": 287.4, "completions/mean_length": 95.50859375, "completions/mean_terminated_length": 94.98611907958984, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012416581747624831, "frac_reward_zero_std": 0.9375, "grad_norm": 7.775527000427246, "kl": 0.31394267388386654, "learning_rate": 4.4667460317460317e-07, "loss": 0.0003, "num_tokens": 934794058.0, "reward": 0.321875, "reward_std": 0.05307793319225311, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9394501209259033, "step": 13720 }, { "completion_length": 404.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 89.534375, "completions/mean_terminated_length": 89.534375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012421106740973091, "frac_reward_zero_std": 0.975, "grad_norm": 0.540615439414978, "kl": 0.7317762213526293, "learning_rate": 4.4663492063492063e-07, "loss": 0.0007, "num_tokens": 935107158.0, "reward": 0.3515625, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9274076819419861, "step": 13725 }, { "completion_length": 329.8, "completions/clipped_ratio": 0.0, "completions/max_length": 329.8, "completions/max_terminated_length": 329.8, "completions/mean_length": 82.64765625, "completions/mean_terminated_length": 82.64765625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01242563173432135, "frac_reward_zero_std": 0.96875, "grad_norm": 10.459733009338379, "kl": 0.4048871246399358, "learning_rate": 4.465952380952381e-07, "loss": 0.0004, "num_tokens": 935408555.0, "reward": 0.290625, "reward_std": 0.025513992458581925, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9473110556602478, "step": 13730 }, { "completion_length": 338.4, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 97.03515625, "completions/mean_terminated_length": 97.03515625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01243015672766961, "frac_reward_zero_std": 0.94375, "grad_norm": 4.847263813018799, "kl": 0.4655273240059614, "learning_rate": 4.4655555555555554e-07, "loss": 0.0005, "num_tokens": 935734728.0, "reward": 0.3484375, "reward_std": 0.04955366887152195, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.932516622543335, "step": 13735 }, { "completion_length": 425.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 425.6, "completions/max_terminated_length": 330.6, "completions/mean_length": 86.86640625, "completions/mean_terminated_length": 86.34093322753907, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01243468172101787, "frac_reward_zero_std": 0.9625, "grad_norm": 20.417892456054688, "kl": 1.1021426364546643, "learning_rate": 4.46515873015873e-07, "loss": 0.0011, "num_tokens": 936043917.0, "reward": 0.3, "reward_std": 0.03719057217240333, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9230977177619935, "step": 13740 }, { "completion_length": 402.4, "completions/clipped_ratio": 0.0, "completions/max_length": 402.4, "completions/max_terminated_length": 402.4, "completions/mean_length": 87.15390625, "completions/mean_terminated_length": 87.15390625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01243920671436613, "frac_reward_zero_std": 0.9375, "grad_norm": 2.2633860111236572, "kl": 0.23122291445033624, "learning_rate": 4.4647619047619044e-07, "loss": 0.0002, "num_tokens": 936350674.0, "reward": 0.3265625, "reward_std": 0.052135304734110835, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9353760242462158, "step": 13745 }, { "completion_length": 489.4, "completions/clipped_ratio": 0.0, "completions/max_length": 489.4, "completions/max_terminated_length": 489.4, "completions/mean_length": 90.59609375, "completions/mean_terminated_length": 90.59609375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01244373170771439, "frac_reward_zero_std": 0.95, "grad_norm": 1.61968195438385, "kl": 0.807655976805836, "learning_rate": 4.464365079365079e-07, "loss": 0.0008, "num_tokens": 936662869.0, "reward": 0.3109375, "reward_std": 0.045606055110692975, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9229877114295959, "step": 13750 }, { "completion_length": 353.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 90.53828125, "completions/mean_terminated_length": 90.53828125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01244825670106265, "frac_reward_zero_std": 0.96875, "grad_norm": 16.207759857177734, "kl": 0.5885222520446405, "learning_rate": 4.463968253968254e-07, "loss": 0.0006, "num_tokens": 936978566.0, "reward": 0.3453125, "reward_std": 0.028460076451301573, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9301599740982056, "step": 13755 }, { "completion_length": 361.4, "completions/clipped_ratio": 0.0, "completions/max_length": 361.4, "completions/max_terminated_length": 361.4, "completions/mean_length": 91.06953125, "completions/mean_terminated_length": 91.06953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012452781694410909, "frac_reward_zero_std": 0.9625, "grad_norm": 0.005069556646049023, "kl": 2.5587736021494494, "learning_rate": 4.463571428571428e-07, "loss": 0.0026, "num_tokens": 937293343.0, "reward": 0.3640625, "reward_std": 0.03198335729539394, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9278542160987854, "step": 13760 }, { "completion_length": 324.4, "completions/clipped_ratio": 0.0, "completions/max_length": 324.4, "completions/max_terminated_length": 324.4, "completions/mean_length": 93.80078125, "completions/mean_terminated_length": 93.80078125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012457306687759169, "frac_reward_zero_std": 0.975, "grad_norm": 1.2739399671554565, "kl": 1.1576052844990046, "learning_rate": 4.463174603174603e-07, "loss": 0.0012, "num_tokens": 937615552.0, "reward": 0.409375, "reward_std": 0.023356688022613526, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.8964225292205811, "step": 13765 }, { "completion_length": 465.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 465.0, "completions/max_terminated_length": 367.8, "completions/mean_length": 96.03359375, "completions/mean_terminated_length": 95.50682067871094, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012461831681107429, "frac_reward_zero_std": 0.96875, "grad_norm": 1.465075135231018, "kl": 1.130070722894743, "learning_rate": 4.4627777777777777e-07, "loss": 0.0011, "num_tokens": 937938899.0, "reward": 0.3375, "reward_std": 0.025513992831110954, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9127062559127808, "step": 13770 }, { "completion_length": 436.8, "completions/clipped_ratio": 0.00234375, "completions/max_length": 436.8, "completions/max_terminated_length": 416.8, "completions/mean_length": 93.2625, "completions/mean_terminated_length": 91.66353454589844, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012466356674455689, "frac_reward_zero_std": 0.95625, "grad_norm": 0.002353955525904894, "kl": 0.2569245248567313, "learning_rate": 4.4623809523809517e-07, "loss": 0.0003, "num_tokens": 938258299.0, "reward": 0.403125, "reward_std": 0.042552615702152255, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.8798584461212158, "step": 13775 }, { "completion_length": 322.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 90.5609375, "completions/mean_terminated_length": 90.5609375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012470881667803949, "frac_reward_zero_std": 0.9625, "grad_norm": 3.1482553482055664, "kl": 1.459560235706158, "learning_rate": 4.461984126984127e-07, "loss": 0.0015, "num_tokens": 938574337.0, "reward": 0.371875, "reward_std": 0.02925042174756527, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9265636682510376, "step": 13780 }, { "completion_length": 336.2, "completions/clipped_ratio": 0.0, "completions/max_length": 336.2, "completions/max_terminated_length": 336.2, "completions/mean_length": 86.603125, "completions/mean_terminated_length": 86.603125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012475406661152207, "frac_reward_zero_std": 0.9375, "grad_norm": 5.3781962394714355, "kl": 0.8615400116890669, "learning_rate": 4.4615873015873013e-07, "loss": 0.0009, "num_tokens": 938883613.0, "reward": 0.303125, "reward_std": 0.0537619024515152, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9486968517303467, "step": 13785 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 94.63515625, "completions/mean_terminated_length": 94.63515625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012479931654500467, "frac_reward_zero_std": 0.95625, "grad_norm": 1.0798158645629883, "kl": 1.5637803153134882, "learning_rate": 4.4611904761904763e-07, "loss": 0.0016, "num_tokens": 939206178.0, "reward": 0.2984375, "reward_std": 0.037769732624292375, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9490845441818238, "step": 13790 }, { "completion_length": 381.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 93.25078125, "completions/mean_terminated_length": 93.25078125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012484456647848727, "frac_reward_zero_std": 0.94375, "grad_norm": 0.673958420753479, "kl": 2.6365719925961457, "learning_rate": 4.4607936507936504e-07, "loss": 0.0026, "num_tokens": 939525795.0, "reward": 0.309375, "reward_std": 0.044086816534399983, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9356370210647583, "step": 13795 }, { "completion_length": 297.8, "completions/clipped_ratio": 0.0, "completions/max_length": 297.8, "completions/max_terminated_length": 297.8, "completions/mean_length": 88.175, "completions/mean_terminated_length": 88.175, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012488981641196988, "frac_reward_zero_std": 0.95, "grad_norm": 7.704982757568359, "kl": 1.7698301663855092, "learning_rate": 4.460396825396825e-07, "loss": 0.0018, "num_tokens": 939836835.0, "reward": 0.3625, "reward_std": 0.045818221569061277, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9217177987098694, "step": 13800 }, { "completion_length": 381.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 381.8, "completions/max_terminated_length": 323.2, "completions/mean_length": 90.7390625, "completions/mean_terminated_length": 90.21522827148438, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012493506634545248, "frac_reward_zero_std": 0.95, "grad_norm": 10.946720123291016, "kl": 0.6213038160931319, "learning_rate": 4.46e-07, "loss": 0.0006, "num_tokens": 940151341.0, "reward": 0.5171875, "reward_std": 0.04308528564870358, "rewards/verify_chess_move/mean": 0.5171875, "rewards/verify_chess_move/std": 0.8336009621620178, "step": 13805 }, { "completion_length": 336.8, "completions/clipped_ratio": 0.0, "completions/max_length": 336.8, "completions/max_terminated_length": 336.8, "completions/mean_length": 92.15078125, "completions/mean_terminated_length": 92.15078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012498031627893508, "frac_reward_zero_std": 0.9375, "grad_norm": 18.08649444580078, "kl": 3.210861660260707, "learning_rate": 4.4596031746031745e-07, "loss": 0.0032, "num_tokens": 940469734.0, "reward": 0.3765625, "reward_std": 0.05581086836755276, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.917759382724762, "step": 13810 }, { "completion_length": 264.4, "completions/clipped_ratio": 0.0, "completions/max_length": 264.4, "completions/max_terminated_length": 264.4, "completions/mean_length": 87.20078125, "completions/mean_terminated_length": 87.20078125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012502556621241766, "frac_reward_zero_std": 0.98125, "grad_norm": 0.007248949725180864, "kl": 0.148629281274043, "learning_rate": 4.459206349206349e-07, "loss": 0.0001, "num_tokens": 940780007.0, "reward": 0.4625, "reward_std": 0.016675157845020293, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.876534616947174, "step": 13815 }, { "completion_length": 567.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 567.2, "completions/max_terminated_length": 416.0, "completions/mean_length": 96.6828125, "completions/mean_terminated_length": 95.12975311279297, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.012507081614590026, "frac_reward_zero_std": 0.9625, "grad_norm": 0.004076177254319191, "kl": 2.799395631253719, "learning_rate": 4.4588095238095236e-07, "loss": 0.0028, "num_tokens": 941105465.0, "reward": 0.24375, "reward_std": 0.03424547016620636, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9450334191322327, "step": 13820 }, { "completion_length": 460.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 460.4, "completions/max_terminated_length": 397.2, "completions/mean_length": 84.56953125, "completions/mean_terminated_length": 84.02745819091797, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012511606607938286, "frac_reward_zero_std": 0.9625, "grad_norm": 17.678791046142578, "kl": 3.0412223247811196, "learning_rate": 4.458412698412698e-07, "loss": 0.003, "num_tokens": 941409114.0, "reward": 0.5125, "reward_std": 0.031300367787480354, "rewards/verify_chess_move/mean": 0.5125, "rewards/verify_chess_move/std": 0.8580702900886535, "step": 13825 }, { "completion_length": 275.6, "completions/clipped_ratio": 0.0, "completions/max_length": 275.6, "completions/max_terminated_length": 275.6, "completions/mean_length": 80.70625, "completions/mean_terminated_length": 80.70625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012516131601286546, "frac_reward_zero_std": 0.9375, "grad_norm": 0.017903635278344154, "kl": 0.7102462187875063, "learning_rate": 4.4580158730158727e-07, "loss": 0.0007, "num_tokens": 941707010.0, "reward": 0.5453125, "reward_std": 0.056024014949798584, "rewards/verify_chess_move/mean": 0.5453125, "rewards/verify_chess_move/std": 0.8219718933105469, "step": 13830 }, { "completion_length": 381.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 91.53828125, "completions/mean_terminated_length": 91.53828125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012520656594634806, "frac_reward_zero_std": 0.95, "grad_norm": 0.0939299687743187, "kl": 0.49315714484546336, "learning_rate": 4.457619047619047e-07, "loss": 0.0005, "num_tokens": 942024931.0, "reward": 0.3140625, "reward_std": 0.043556108698248865, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9501711130142212, "step": 13835 }, { "completion_length": 388.4, "completions/clipped_ratio": 0.0, "completions/max_length": 388.4, "completions/max_terminated_length": 388.4, "completions/mean_length": 91.215625, "completions/mean_terminated_length": 91.215625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.012525181587983065, "frac_reward_zero_std": 0.93125, "grad_norm": 20.56283187866211, "kl": 1.7177413681289182, "learning_rate": 4.4572222222222223e-07, "loss": 0.0017, "num_tokens": 942342071.0, "reward": 0.2640625, "reward_std": 0.05818132013082504, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9635833740234375, "step": 13840 }, { "completion_length": 418.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 93.81484375, "completions/mean_terminated_length": 93.81484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012529706581331325, "frac_reward_zero_std": 0.9125, "grad_norm": 10.78567886352539, "kl": 2.48105883304961, "learning_rate": 4.456825396825397e-07, "loss": 0.0025, "num_tokens": 942661610.0, "reward": 0.334375, "reward_std": 0.07890890017151833, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9367896676063537, "step": 13845 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 80.81171875, "completions/mean_terminated_length": 80.81171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012534231574679585, "frac_reward_zero_std": 0.96875, "grad_norm": 25.267671585083008, "kl": 6.7142376460833475, "learning_rate": 4.456428571428571e-07, "loss": 0.0067, "num_tokens": 942961129.0, "reward": 0.4484375, "reward_std": 0.027564920112490655, "rewards/verify_chess_move/mean": 0.4484375, "rewards/verify_chess_move/std": 0.8694368004798889, "step": 13850 }, { "completion_length": 591.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 591.4, "completions/max_terminated_length": 498.0, "completions/mean_length": 97.47109375, "completions/mean_terminated_length": 96.93780364990235, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012538756568027845, "frac_reward_zero_std": 0.9375, "grad_norm": 4.946943283081055, "kl": 10.419283183140214, "learning_rate": 4.456031746031746e-07, "loss": 0.0104, "num_tokens": 943287132.0, "reward": 0.3171875, "reward_std": 0.05124015025794506, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9208624601364136, "step": 13855 }, { "completion_length": 326.8, "completions/clipped_ratio": 0.0, "completions/max_length": 326.8, "completions/max_terminated_length": 326.8, "completions/mean_length": 83.67421875, "completions/mean_terminated_length": 83.67421875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012543281561376105, "frac_reward_zero_std": 0.93125, "grad_norm": 7.252847671508789, "kl": 2.4019881268730385, "learning_rate": 4.4556349206349204e-07, "loss": 0.0024, "num_tokens": 943590251.0, "reward": 0.384375, "reward_std": 0.06249337755143643, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9132934808731079, "step": 13860 }, { "completion_length": 411.4, "completions/clipped_ratio": 0.0, "completions/max_length": 411.4, "completions/max_terminated_length": 411.4, "completions/mean_length": 94.671875, "completions/mean_terminated_length": 94.671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012547806554724365, "frac_reward_zero_std": 0.9375, "grad_norm": 11.593186378479004, "kl": 6.277386583504267, "learning_rate": 4.4552380952380955e-07, "loss": 0.0063, "num_tokens": 943912495.0, "reward": 0.3328125, "reward_std": 0.05239494368433952, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9393522500991821, "step": 13865 }, { "completion_length": 301.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 93.046875, "completions/mean_terminated_length": 93.046875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012552331548072624, "frac_reward_zero_std": 0.95, "grad_norm": 4.969786643981934, "kl": 0.7978169463574887, "learning_rate": 4.4548412698412695e-07, "loss": 0.0008, "num_tokens": 944233339.0, "reward": 0.3234375, "reward_std": 0.04445126354694366, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9405418753623962, "step": 13870 }, { "completion_length": 316.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 84.4953125, "completions/mean_terminated_length": 84.4953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012556856541420884, "frac_reward_zero_std": 0.93125, "grad_norm": 6.792377948760986, "kl": 2.360035869642161, "learning_rate": 4.454444444444444e-07, "loss": 0.0024, "num_tokens": 944538285.0, "reward": 0.4375, "reward_std": 0.05818033888936043, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8928818941116333, "step": 13875 }, { "completion_length": 320.4, "completions/clipped_ratio": 0.0, "completions/max_length": 320.4, "completions/max_terminated_length": 320.4, "completions/mean_length": 89.01484375, "completions/mean_terminated_length": 89.01484375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012561381534769144, "frac_reward_zero_std": 0.94375, "grad_norm": 9.271404266357422, "kl": 0.7431009885156528, "learning_rate": 4.454047619047619e-07, "loss": 0.0007, "num_tokens": 944853464.0, "reward": 0.4140625, "reward_std": 0.04908284731209278, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.885945463180542, "step": 13880 }, { "completion_length": 332.8, "completions/clipped_ratio": 0.0, "completions/max_length": 332.8, "completions/max_terminated_length": 332.8, "completions/mean_length": 89.5453125, "completions/mean_terminated_length": 89.5453125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012565906528117404, "frac_reward_zero_std": 0.9625, "grad_norm": 9.462688446044922, "kl": 0.5555504236370326, "learning_rate": 4.453650793650793e-07, "loss": 0.0006, "num_tokens": 945165330.0, "reward": 0.3640625, "reward_std": 0.02993340939283371, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9268692970275879, "step": 13885 }, { "completion_length": 396.4, "completions/clipped_ratio": 0.0, "completions/max_length": 396.4, "completions/max_terminated_length": 396.4, "completions/mean_length": 90.396875, "completions/mean_terminated_length": 90.396875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012570431521465664, "frac_reward_zero_std": 0.95, "grad_norm": 12.630802154541016, "kl": 0.3228490558685735, "learning_rate": 4.453253968253968e-07, "loss": 0.0003, "num_tokens": 945479702.0, "reward": 0.35625, "reward_std": 0.042873119562864305, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9336000323295593, "step": 13890 }, { "completion_length": 402.8, "completions/clipped_ratio": 0.0, "completions/max_length": 402.8, "completions/max_terminated_length": 402.8, "completions/mean_length": 89.46953125, "completions/mean_terminated_length": 89.46953125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012574956514813922, "frac_reward_zero_std": 0.95, "grad_norm": 16.00882339477539, "kl": 2.4631610656855627, "learning_rate": 4.452857142857143e-07, "loss": 0.0025, "num_tokens": 945791415.0, "reward": 0.403125, "reward_std": 0.046973013877868654, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.913924777507782, "step": 13895 }, { "completion_length": 390.8, "completions/clipped_ratio": 0.0, "completions/max_length": 390.8, "completions/max_terminated_length": 390.8, "completions/mean_length": 90.84765625, "completions/mean_terminated_length": 90.84765625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012579481508162182, "frac_reward_zero_std": 0.975, "grad_norm": 0.16742680966854095, "kl": 0.5283354206476361, "learning_rate": 4.4524603174603173e-07, "loss": 0.0005, "num_tokens": 946104972.0, "reward": 0.3328125, "reward_std": 0.021778544411063194, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.935850989818573, "step": 13900 }, { "completion_length": 370.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 370.0, "completions/max_terminated_length": 305.6, "completions/mean_length": 90.1703125, "completions/mean_terminated_length": 89.64364929199219, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012584006501510443, "frac_reward_zero_std": 0.975, "grad_norm": 10.598573684692383, "kl": 2.8958755197236314, "learning_rate": 4.452063492063492e-07, "loss": 0.0029, "num_tokens": 946419894.0, "reward": 0.4890625, "reward_std": 0.024039676412940025, "rewards/verify_chess_move/mean": 0.4890625, "rewards/verify_chess_move/std": 0.8575030922889709, "step": 13905 }, { "completion_length": 338.4, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 83.80078125, "completions/mean_terminated_length": 83.80078125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.012588531494858703, "frac_reward_zero_std": 0.96875, "grad_norm": 0.009562856517732143, "kl": 1.6185156900668516, "learning_rate": 4.4516666666666664e-07, "loss": 0.0016, "num_tokens": 946724583.0, "reward": 0.28125, "reward_std": 0.02756393849849701, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9458833456039428, "step": 13910 }, { "completion_length": 495.6, "completions/clipped_ratio": 0.0, "completions/max_length": 495.6, "completions/max_terminated_length": 495.6, "completions/mean_length": 90.49765625, "completions/mean_terminated_length": 90.49765625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.012593056488206963, "frac_reward_zero_std": 0.94375, "grad_norm": 13.2465238571167, "kl": 1.3003385601681656, "learning_rate": 4.4512698412698414e-07, "loss": 0.0013, "num_tokens": 947039484.0, "reward": 0.4171875, "reward_std": 0.04387465007603168, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.9083101153373718, "step": 13915 }, { "completion_length": 295.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 97.27265625, "completions/mean_terminated_length": 97.27265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012597581481555223, "frac_reward_zero_std": 0.925, "grad_norm": 10.451923370361328, "kl": 8.720685129845515, "learning_rate": 4.4508730158730154e-07, "loss": 0.0087, "num_tokens": 947365897.0, "reward": 0.3875, "reward_std": 0.05939599797129631, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.8797137379646301, "step": 13920 }, { "completion_length": 331.2, "completions/clipped_ratio": 0.0, "completions/max_length": 331.2, "completions/max_terminated_length": 331.2, "completions/mean_length": 87.675, "completions/mean_terminated_length": 87.675, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012602106474903481, "frac_reward_zero_std": 0.95, "grad_norm": 0.7954887747764587, "kl": 3.10978718935512, "learning_rate": 4.45047619047619e-07, "loss": 0.0031, "num_tokens": 947674809.0, "reward": 0.3359375, "reward_std": 0.040822192654013635, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9288051962852478, "step": 13925 }, { "completion_length": 504.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 504.4, "completions/max_terminated_length": 415.6, "completions/mean_length": 92.98359375, "completions/mean_terminated_length": 92.45133056640626, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012606631468251741, "frac_reward_zero_std": 0.93125, "grad_norm": 7.732799530029297, "kl": 4.163808863854501, "learning_rate": 4.450079365079365e-07, "loss": 0.0042, "num_tokens": 947994012.0, "reward": 0.2265625, "reward_std": 0.05792168155312538, "rewards/verify_chess_move/mean": 0.2265625, "rewards/verify_chess_move/std": 0.9566613078117371, "step": 13930 }, { "completion_length": 415.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 91.7, "completions/mean_terminated_length": 91.7, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012611156461600001, "frac_reward_zero_std": 0.96875, "grad_norm": 1.017687439918518, "kl": 2.6093725998187436, "learning_rate": 4.4496825396825396e-07, "loss": 0.0026, "num_tokens": 948310900.0, "reward": 0.3703125, "reward_std": 0.03051002249121666, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9225232124328613, "step": 13935 }, { "completion_length": 263.8, "completions/clipped_ratio": 0.0, "completions/max_length": 263.8, "completions/max_terminated_length": 263.8, "completions/mean_length": 90.23046875, "completions/mean_terminated_length": 90.23046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012615681454948261, "frac_reward_zero_std": 0.9625, "grad_norm": 0.15758271515369415, "kl": 0.37638815597165376, "learning_rate": 4.449285714285714e-07, "loss": 0.0004, "num_tokens": 948627227.0, "reward": 0.346875, "reward_std": 0.03424547016620636, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9333954930305481, "step": 13940 }, { "completion_length": 417.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 417.4, "completions/max_terminated_length": 335.0, "completions/mean_length": 88.565625, "completions/mean_terminated_length": 88.03456573486328, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012620206448296522, "frac_reward_zero_std": 0.9625, "grad_norm": 3.1503446102142334, "kl": 1.256453863380011, "learning_rate": 4.4488888888888887e-07, "loss": 0.0013, "num_tokens": 948938471.0, "reward": 0.378125, "reward_std": 0.037190573662519454, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9213185667991638, "step": 13945 }, { "completion_length": 419.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 419.0, "completions/max_terminated_length": 385.2, "completions/mean_length": 85.0390625, "completions/mean_terminated_length": 84.49821472167969, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01262473144164478, "frac_reward_zero_std": 0.94375, "grad_norm": 0.4750935733318329, "kl": 0.5164618293638341, "learning_rate": 4.448492063492063e-07, "loss": 0.0005, "num_tokens": 949242505.0, "reward": 0.453125, "reward_std": 0.050025473535060885, "rewards/verify_chess_move/mean": 0.453125, "rewards/verify_chess_move/std": 0.8888858914375305, "step": 13950 }, { "completion_length": 278.8, "completions/clipped_ratio": 0.0, "completions/max_length": 278.8, "completions/max_terminated_length": 278.8, "completions/mean_length": 87.46015625, "completions/mean_terminated_length": 87.46015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01262925643499304, "frac_reward_zero_std": 0.925, "grad_norm": 0.5885481834411621, "kl": 0.8453262974042446, "learning_rate": 4.448095238095238e-07, "loss": 0.0008, "num_tokens": 949552958.0, "reward": 0.39375, "reward_std": 0.06917490772902965, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.913848090171814, "step": 13955 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.0, "completions/max_length": 457.4, "completions/max_terminated_length": 457.4, "completions/mean_length": 104.11875, "completions/mean_terminated_length": 104.11875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0126337814283413, "frac_reward_zero_std": 0.94375, "grad_norm": 0.42106765508651733, "kl": 1.7007275788113474, "learning_rate": 4.4476984126984123e-07, "loss": 0.0017, "num_tokens": 949892142.0, "reward": 0.2328125, "reward_std": 0.04545377567410469, "rewards/verify_chess_move/mean": 0.2328125, "rewards/verify_chess_move/std": 0.9661232352256774, "step": 13960 }, { "completion_length": 307.6, "completions/clipped_ratio": 0.0, "completions/max_length": 307.6, "completions/max_terminated_length": 307.6, "completions/mean_length": 90.46328125, "completions/mean_terminated_length": 90.46328125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01263830642168956, "frac_reward_zero_std": 0.975, "grad_norm": 0.3694351613521576, "kl": 0.785171381640248, "learning_rate": 4.4473015873015874e-07, "loss": 0.0008, "num_tokens": 950208231.0, "reward": 0.415625, "reward_std": 0.02177756354212761, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9055006742477417, "step": 13965 }, { "completion_length": 545.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 545.8, "completions/max_terminated_length": 438.4, "completions/mean_length": 86.97734375, "completions/mean_terminated_length": 85.91929168701172, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01264283141503782, "frac_reward_zero_std": 0.925, "grad_norm": 4.11160945892334, "kl": 1.0234364861738867, "learning_rate": 4.446904761904762e-07, "loss": 0.001, "num_tokens": 950515442.0, "reward": 0.378125, "reward_std": 0.06533465348184109, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9052493214607239, "step": 13970 }, { "completion_length": 428.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 428.6, "completions/max_terminated_length": 340.4, "completions/mean_length": 94.06796875, "completions/mean_terminated_length": 93.54437561035157, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01264735640838608, "frac_reward_zero_std": 0.9125, "grad_norm": 10.77508544921875, "kl": 3.6629001907305794, "learning_rate": 4.446507936507936e-07, "loss": 0.0037, "num_tokens": 950835937.0, "reward": 0.328125, "reward_std": 0.07165075466036797, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9220895171165466, "step": 13975 }, { "completion_length": 471.2, "completions/clipped_ratio": 0.0, "completions/max_length": 471.2, "completions/max_terminated_length": 471.2, "completions/mean_length": 94.60703125, "completions/mean_terminated_length": 94.60703125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012651881401734339, "frac_reward_zero_std": 0.9375, "grad_norm": 10.57803726196289, "kl": 0.8734394169645384, "learning_rate": 4.446111111111111e-07, "loss": 0.0009, "num_tokens": 951155482.0, "reward": 0.5109375, "reward_std": 0.05397406741976738, "rewards/verify_chess_move/mean": 0.5109375, "rewards/verify_chess_move/std": 0.8369932651519776, "step": 13980 }, { "completion_length": 522.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 522.6, "completions/max_terminated_length": 451.6, "completions/mean_length": 95.08203125, "completions/mean_terminated_length": 93.48460235595704, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012656406395082599, "frac_reward_zero_std": 0.9375, "grad_norm": 6.43428373336792, "kl": 0.20081598615506663, "learning_rate": 4.4457142857142855e-07, "loss": 0.0002, "num_tokens": 951478467.0, "reward": 0.425, "reward_std": 0.053973087668418886, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.902616274356842, "step": 13985 }, { "completion_length": 445.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 445.6, "completions/max_terminated_length": 364.2, "completions/mean_length": 87.97578125, "completions/mean_terminated_length": 87.4430923461914, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012660931388430859, "frac_reward_zero_std": 0.95, "grad_norm": 5.414128303527832, "kl": 0.2594410634716041, "learning_rate": 4.4453174603174606e-07, "loss": 0.0003, "num_tokens": 951789516.0, "reward": 0.234375, "reward_std": 0.04013920240104198, "rewards/verify_chess_move/mean": 0.234375, "rewards/verify_chess_move/std": 0.9461039900779724, "step": 13990 }, { "completion_length": 429.6, "completions/clipped_ratio": 0.0, "completions/max_length": 429.6, "completions/max_terminated_length": 429.6, "completions/mean_length": 93.4375, "completions/mean_terminated_length": 93.4375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012665456381779119, "frac_reward_zero_std": 0.95, "grad_norm": 4.973396301269531, "kl": 0.5642476099601481, "learning_rate": 4.4449206349206346e-07, "loss": 0.0006, "num_tokens": 952106852.0, "reward": 0.46875, "reward_std": 0.041034359112381937, "rewards/verify_chess_move/mean": 0.46875, "rewards/verify_chess_move/std": 0.8803266763687134, "step": 13995 }, { "completion_length": 381.4, "completions/clipped_ratio": 0.0, "completions/max_length": 381.4, "completions/max_terminated_length": 381.4, "completions/mean_length": 88.328125, "completions/mean_terminated_length": 88.328125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012669981375127379, "frac_reward_zero_std": 0.95625, "grad_norm": 27.085941314697266, "kl": 0.5054098594933748, "learning_rate": 4.444523809523809e-07, "loss": 0.0005, "num_tokens": 952419184.0, "reward": 0.30625, "reward_std": 0.034352826327085494, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.949279534816742, "step": 14000 }, { "completion_length": 461.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 461.6, "completions/max_terminated_length": 430.2, "completions/mean_length": 92.53515625, "completions/mean_terminated_length": 92.0110855102539, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01267450636847564, "frac_reward_zero_std": 0.95, "grad_norm": 11.012228012084961, "kl": 0.3298710813978687, "learning_rate": 4.444126984126984e-07, "loss": 0.0003, "num_tokens": 952735781.0, "reward": 0.4, "reward_std": 0.0408231720328331, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.9061472773551941, "step": 14005 }, { "completion_length": 535.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 535.8, "completions/max_terminated_length": 457.0, "completions/mean_length": 91.3578125, "completions/mean_terminated_length": 90.29554138183593, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012679031361823898, "frac_reward_zero_std": 0.96875, "grad_norm": 0.1520875096321106, "kl": 0.2132695549284108, "learning_rate": 4.443730158730158e-07, "loss": 0.0002, "num_tokens": 953050095.0, "reward": 0.3296875, "reward_std": 0.025726158916950227, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9329352021217346, "step": 14010 }, { "completion_length": 316.6, "completions/clipped_ratio": 0.0, "completions/max_length": 316.6, "completions/max_terminated_length": 316.6, "completions/mean_length": 86.234375, "completions/mean_terminated_length": 86.234375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012683556355172158, "frac_reward_zero_std": 0.95, "grad_norm": 0.4931328296661377, "kl": 1.4199155554408207, "learning_rate": 4.4433333333333333e-07, "loss": 0.0014, "num_tokens": 953358963.0, "reward": 0.375, "reward_std": 0.04308430477976799, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9044018864631653, "step": 14015 }, { "completion_length": 309.4, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/max_terminated_length": 309.4, "completions/mean_length": 90.53046875, "completions/mean_terminated_length": 90.53046875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012688081348520418, "frac_reward_zero_std": 0.96875, "grad_norm": 3.663573741912842, "kl": 0.2099335164297372, "learning_rate": 4.442936507936508e-07, "loss": 0.0002, "num_tokens": 953672306.0, "reward": 0.3234375, "reward_std": 0.02414703369140625, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9431273698806762, "step": 14020 }, { "completion_length": 289.2, "completions/clipped_ratio": 0.0, "completions/max_length": 289.2, "completions/max_terminated_length": 289.2, "completions/mean_length": 85.425, "completions/mean_terminated_length": 85.425, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.012692606341868678, "frac_reward_zero_std": 0.9375, "grad_norm": 0.8351932764053345, "kl": 0.5274456440936774, "learning_rate": 4.4425396825396824e-07, "loss": 0.0005, "num_tokens": 953979642.0, "reward": 0.303125, "reward_std": 0.05102798528969288, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9496872067451477, "step": 14025 }, { "completion_length": 317.6, "completions/clipped_ratio": 0.0, "completions/max_length": 317.6, "completions/max_terminated_length": 317.6, "completions/mean_length": 91.615625, "completions/mean_terminated_length": 91.615625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012697131335216938, "frac_reward_zero_std": 0.9625, "grad_norm": 7.2017741203308105, "kl": 1.9631012857425958, "learning_rate": 4.442142857142857e-07, "loss": 0.002, "num_tokens": 954298382.0, "reward": 0.196875, "reward_std": 0.03130036816000938, "rewards/verify_chess_move/mean": 0.196875, "rewards/verify_chess_move/std": 0.9716455221176148, "step": 14030 }, { "completion_length": 431.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 431.0, "completions/max_terminated_length": 406.8, "completions/mean_length": 94.546875, "completions/mean_terminated_length": 94.02981414794922, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012701656328565196, "frac_reward_zero_std": 0.94375, "grad_norm": 6.046407699584961, "kl": 0.8894813836552202, "learning_rate": 4.4417460317460314e-07, "loss": 0.0009, "num_tokens": 954616282.0, "reward": 0.36875, "reward_std": 0.04660954922437668, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9168949842453002, "step": 14035 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0, "completions/max_length": 389.6, "completions/max_terminated_length": 389.6, "completions/mean_length": 87.69140625, "completions/mean_terminated_length": 87.69140625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012706181321913456, "frac_reward_zero_std": 0.9625, "grad_norm": 5.722429275512695, "kl": 1.3662498391931877, "learning_rate": 4.4413492063492065e-07, "loss": 0.0014, "num_tokens": 954924847.0, "reward": 0.3828125, "reward_std": 0.03356248140335083, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9132529020309448, "step": 14040 }, { "completion_length": 494.2, "completions/clipped_ratio": 0.0, "completions/max_length": 494.2, "completions/max_terminated_length": 494.2, "completions/mean_length": 94.384375, "completions/mean_terminated_length": 94.384375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012710706315261716, "frac_reward_zero_std": 0.9375, "grad_norm": 28.05841636657715, "kl": 1.4075105204014107, "learning_rate": 4.4409523809523805e-07, "loss": 0.0014, "num_tokens": 955245187.0, "reward": 0.3140625, "reward_std": 0.054185253009200095, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.934941577911377, "step": 14045 }, { "completion_length": 430.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 430.6, "completions/max_terminated_length": 333.0, "completions/mean_length": 86.97421875, "completions/mean_terminated_length": 86.445849609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012715231308609977, "frac_reward_zero_std": 0.93125, "grad_norm": 13.06766414642334, "kl": 0.6476433984935284, "learning_rate": 4.440555555555555e-07, "loss": 0.0006, "num_tokens": 955553442.0, "reward": 0.3734375, "reward_std": 0.05702652558684349, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9277796983718872, "step": 14050 }, { "completion_length": 499.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 499.4, "completions/max_terminated_length": 372.2, "completions/mean_length": 92.6984375, "completions/mean_terminated_length": 91.65424194335938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012719756301958237, "frac_reward_zero_std": 0.9375, "grad_norm": 1.690643548965454, "kl": 0.5722888383432292, "learning_rate": 4.44015873015873e-07, "loss": 0.0006, "num_tokens": 955871152.0, "reward": 0.39375, "reward_std": 0.05350226499140263, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.8811159491539001, "step": 14055 }, { "completion_length": 480.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 480.2, "completions/max_terminated_length": 409.2, "completions/mean_length": 84.7625, "completions/mean_terminated_length": 84.22480163574218, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.012724281295306497, "frac_reward_zero_std": 0.94375, "grad_norm": 18.947940826416016, "kl": 1.9245208099950104, "learning_rate": 4.4397619047619047e-07, "loss": 0.0019, "num_tokens": 956174960.0, "reward": 0.421875, "reward_std": 0.05249975323677063, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8940592288970948, "step": 14060 }, { "completion_length": 491.2, "completions/clipped_ratio": 0.0, "completions/max_length": 491.2, "completions/max_terminated_length": 491.2, "completions/mean_length": 88.2890625, "completions/mean_terminated_length": 88.2890625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012728806288654755, "frac_reward_zero_std": 0.93125, "grad_norm": 0.008263207972049713, "kl": 1.1233963397564366, "learning_rate": 4.439365079365079e-07, "loss": 0.0011, "num_tokens": 956484762.0, "reward": 0.4234375, "reward_std": 0.059076473116874695, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8956797122955322, "step": 14065 }, { "completion_length": 576.8, "completions/clipped_ratio": 0.0, "completions/max_length": 576.8, "completions/max_terminated_length": 576.8, "completions/mean_length": 102.87265625, "completions/mean_terminated_length": 102.87265625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012733331282003015, "frac_reward_zero_std": 0.95625, "grad_norm": 0.19940216839313507, "kl": 1.4951803074451164, "learning_rate": 4.438968253968254e-07, "loss": 0.0015, "num_tokens": 956816775.0, "reward": 0.4578125, "reward_std": 0.04071483463048935, "rewards/verify_chess_move/mean": 0.4578125, "rewards/verify_chess_move/std": 0.8875946998596191, "step": 14070 }, { "completion_length": 413.2, "completions/clipped_ratio": 0.0, "completions/max_length": 413.2, "completions/max_terminated_length": 413.2, "completions/mean_length": 86.1453125, "completions/mean_terminated_length": 86.1453125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012737856275351275, "frac_reward_zero_std": 0.94375, "grad_norm": 25.132150650024414, "kl": 5.029857169999741, "learning_rate": 4.4385714285714283e-07, "loss": 0.005, "num_tokens": 957123937.0, "reward": 0.4125, "reward_std": 0.04613676369190216, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.9006668448448181, "step": 14075 }, { "completion_length": 361.6, "completions/clipped_ratio": 0.0, "completions/max_length": 361.6, "completions/max_terminated_length": 361.6, "completions/mean_length": 89.6890625, "completions/mean_terminated_length": 89.6890625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012742381268699535, "frac_reward_zero_std": 0.93125, "grad_norm": 8.20674991607666, "kl": 2.2689075608737768, "learning_rate": 4.4381746031746034e-07, "loss": 0.0023, "num_tokens": 957435627.0, "reward": 0.428125, "reward_std": 0.05408044271171093, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8975615859031677, "step": 14080 }, { "completion_length": 374.2, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/max_terminated_length": 374.2, "completions/mean_length": 94.36640625, "completions/mean_terminated_length": 94.36640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012746906262047795, "frac_reward_zero_std": 0.95625, "grad_norm": 0.13502717018127441, "kl": 2.219591690832749, "learning_rate": 4.4377777777777774e-07, "loss": 0.0022, "num_tokens": 957757480.0, "reward": 0.446875, "reward_std": 0.03503679595887661, "rewards/verify_chess_move/mean": 0.446875, "rewards/verify_chess_move/std": 0.8899943351745605, "step": 14085 }, { "completion_length": 476.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 476.0, "completions/max_terminated_length": 406.4, "completions/mean_length": 99.33828125, "completions/mean_terminated_length": 98.81290588378906, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012751431255396054, "frac_reward_zero_std": 0.94375, "grad_norm": 0.0036786324344575405, "kl": 2.063175674818922, "learning_rate": 4.4373809523809524e-07, "loss": 0.0021, "num_tokens": 958086921.0, "reward": 0.346875, "reward_std": 0.04592557735741139, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9324875712394715, "step": 14090 }, { "completion_length": 492.2, "completions/clipped_ratio": 0.003125, "completions/max_length": 492.2, "completions/max_terminated_length": 302.4, "completions/mean_length": 99.24140625, "completions/mean_terminated_length": 97.12530517578125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012755956248744314, "frac_reward_zero_std": 0.95, "grad_norm": 5.0346245765686035, "kl": 0.4979581384337507, "learning_rate": 4.436984126984127e-07, "loss": 0.0005, "num_tokens": 958416614.0, "reward": 0.2765625, "reward_std": 0.04240131676197052, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9556626439094543, "step": 14095 }, { "completion_length": 275.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 89.55, "completions/mean_terminated_length": 89.55, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.012760481242092574, "frac_reward_zero_std": 0.98125, "grad_norm": 4.815569877624512, "kl": 0.6443027205765247, "learning_rate": 4.436587301587301e-07, "loss": 0.0006, "num_tokens": 958729614.0, "reward": 0.4671875, "reward_std": 0.01893727108836174, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.8633296966552735, "step": 14100 }, { "completion_length": 496.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 496.2, "completions/max_terminated_length": 391.0, "completions/mean_length": 90.1578125, "completions/mean_terminated_length": 89.07859802246094, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012765006235440834, "frac_reward_zero_std": 0.925, "grad_norm": 1.3087588548660278, "kl": 0.3095090846996754, "learning_rate": 4.436190476190476e-07, "loss": 0.0003, "num_tokens": 959040640.0, "reward": 0.2859375, "reward_std": 0.06439202874898911, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9403385758399964, "step": 14105 }, { "completion_length": 323.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 96.0328125, "completions/mean_terminated_length": 96.0328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.012769531228789094, "frac_reward_zero_std": 0.94375, "grad_norm": 1.782264232635498, "kl": 1.562021675845608, "learning_rate": 4.4357936507936506e-07, "loss": 0.0016, "num_tokens": 959366938.0, "reward": 0.3015625, "reward_std": 0.051132794842123984, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9446106791496277, "step": 14110 }, { "completion_length": 458.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 458.0, "completions/max_terminated_length": 453.2, "completions/mean_length": 93.325, "completions/mean_terminated_length": 92.81406097412109, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012774056222137354, "frac_reward_zero_std": 0.96875, "grad_norm": 8.363750457763672, "kl": 0.29757255347212774, "learning_rate": 4.4353968253968257e-07, "loss": 0.0003, "num_tokens": 959684666.0, "reward": 0.4109375, "reward_std": 0.028930898010730743, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.8801244139671326, "step": 14115 }, { "completion_length": 413.2, "completions/clipped_ratio": 0.0, "completions/max_length": 413.2, "completions/max_terminated_length": 413.2, "completions/mean_length": 89.81953125, "completions/mean_terminated_length": 89.81953125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012778581215485613, "frac_reward_zero_std": 0.95625, "grad_norm": 10.176056861877441, "kl": 0.859496869915165, "learning_rate": 4.4349999999999997e-07, "loss": 0.0009, "num_tokens": 959996459.0, "reward": 0.4, "reward_std": 0.03640277422964573, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.8919529795646668, "step": 14120 }, { "completion_length": 508.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 508.2, "completions/max_terminated_length": 459.4, "completions/mean_length": 96.015625, "completions/mean_terminated_length": 95.4969253540039, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012783106208833873, "frac_reward_zero_std": 0.95625, "grad_norm": 10.03741455078125, "kl": 0.2854888183064759, "learning_rate": 4.434603174603174e-07, "loss": 0.0003, "num_tokens": 960317751.0, "reward": 0.365625, "reward_std": 0.0350367970764637, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9100672960281372, "step": 14125 }, { "completion_length": 439.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 439.6, "completions/max_terminated_length": 395.2, "completions/mean_length": 101.128125, "completions/mean_terminated_length": 100.61298217773438, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012787631202182133, "frac_reward_zero_std": 0.96875, "grad_norm": 7.656935214996338, "kl": 0.4696543917991221, "learning_rate": 4.4342063492063493e-07, "loss": 0.0005, "num_tokens": 960648643.0, "reward": 0.3609375, "reward_std": 0.029826052486896515, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9180354475975037, "step": 14130 }, { "completion_length": 436.8, "completions/clipped_ratio": 0.0, "completions/max_length": 436.8, "completions/max_terminated_length": 436.8, "completions/mean_length": 95.08828125, "completions/mean_terminated_length": 95.08828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012792156195530393, "frac_reward_zero_std": 0.94375, "grad_norm": 2.470374345779419, "kl": 1.3952991229481995, "learning_rate": 4.4338095238095233e-07, "loss": 0.0014, "num_tokens": 960972332.0, "reward": 0.3109375, "reward_std": 0.05245228260755539, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9364976763725281, "step": 14135 }, { "completion_length": 418.4, "completions/clipped_ratio": 0.0, "completions/max_length": 418.4, "completions/max_terminated_length": 418.4, "completions/mean_length": 84.54453125, "completions/mean_terminated_length": 84.54453125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012796681188878653, "frac_reward_zero_std": 0.96875, "grad_norm": 0.007827223278582096, "kl": 0.4909196695894934, "learning_rate": 4.433412698412698e-07, "loss": 0.0005, "num_tokens": 961275853.0, "reward": 0.4625, "reward_std": 0.029613886773586274, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.868663239479065, "step": 14140 }, { "completion_length": 270.8, "completions/clipped_ratio": 0.0, "completions/max_length": 270.8, "completions/max_terminated_length": 270.8, "completions/mean_length": 89.69765625, "completions/mean_terminated_length": 89.69765625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012801206182226911, "frac_reward_zero_std": 0.9625, "grad_norm": 13.653964042663574, "kl": 0.32694672578945755, "learning_rate": 4.433015873015873e-07, "loss": 0.0003, "num_tokens": 961590506.0, "reward": 0.3265625, "reward_std": 0.0299334105104208, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.939684271812439, "step": 14145 }, { "completion_length": 352.8, "completions/clipped_ratio": 0.0, "completions/max_length": 352.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 88.95703125, "completions/mean_terminated_length": 88.95703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012805731175575171, "frac_reward_zero_std": 0.98125, "grad_norm": 2.3667068481445312, "kl": 3.0555565083166583, "learning_rate": 4.4326190476190474e-07, "loss": 0.0031, "num_tokens": 961902291.0, "reward": 0.4546875, "reward_std": 0.01530819907784462, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8869262576103211, "step": 14150 }, { "completion_length": 424.6, "completions/clipped_ratio": 0.0, "completions/max_length": 424.6, "completions/max_terminated_length": 424.6, "completions/mean_length": 94.12890625, "completions/mean_terminated_length": 94.12890625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012810256168923432, "frac_reward_zero_std": 0.99375, "grad_norm": 0.021235017105937004, "kl": 2.4099910867982546, "learning_rate": 4.432222222222222e-07, "loss": 0.0024, "num_tokens": 962225232.0, "reward": 0.3109375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9385220646858216, "step": 14155 }, { "completion_length": 439.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 439.2, "completions/max_terminated_length": 424.2, "completions/mean_length": 89.9671875, "completions/mean_terminated_length": 88.91551361083984, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012814781162271692, "frac_reward_zero_std": 0.95625, "grad_norm": 0.036677174270153046, "kl": 0.5095990991801955, "learning_rate": 4.4318253968253965e-07, "loss": 0.0005, "num_tokens": 962538974.0, "reward": 0.428125, "reward_std": 0.03798190020024776, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8840435385704041, "step": 14160 }, { "completion_length": 307.8, "completions/clipped_ratio": 0.0, "completions/max_length": 307.8, "completions/max_terminated_length": 307.8, "completions/mean_length": 85.303125, "completions/mean_terminated_length": 85.303125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.012819306155619952, "frac_reward_zero_std": 0.93125, "grad_norm": 0.759621798992157, "kl": 1.3512871796265244, "learning_rate": 4.431428571428571e-07, "loss": 0.0014, "num_tokens": 962845346.0, "reward": 0.3359375, "reward_std": 0.05997162833809853, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9253792405128479, "step": 14165 }, { "completion_length": 348.6, "completions/clipped_ratio": 0.0, "completions/max_length": 348.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 85.07734375, "completions/mean_terminated_length": 85.07734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012823831148968212, "frac_reward_zero_std": 0.95625, "grad_norm": 19.715543746948242, "kl": 0.21842155437916516, "learning_rate": 4.431031746031746e-07, "loss": 0.0002, "num_tokens": 963150933.0, "reward": 0.521875, "reward_std": 0.03913669064640999, "rewards/verify_chess_move/mean": 0.521875, "rewards/verify_chess_move/std": 0.8505025744438172, "step": 14170 }, { "completion_length": 440.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 440.8, "completions/max_terminated_length": 344.2, "completions/mean_length": 91.76171875, "completions/mean_terminated_length": 91.24235076904297, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01282835614231647, "frac_reward_zero_std": 0.96875, "grad_norm": 0.9907578825950623, "kl": 0.1902669514180161, "learning_rate": 4.43063492063492e-07, "loss": 0.0002, "num_tokens": 963468684.0, "reward": 0.49375, "reward_std": 0.02845909371972084, "rewards/verify_chess_move/mean": 0.49375, "rewards/verify_chess_move/std": 0.8623724818229676, "step": 14175 }, { "completion_length": 374.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 374.6, "completions/max_terminated_length": 287.8, "completions/mean_length": 91.2015625, "completions/mean_terminated_length": 90.67585601806641, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01283288113566473, "frac_reward_zero_std": 0.96875, "grad_norm": 1.4401428699493408, "kl": 0.20202018067939206, "learning_rate": 4.430238095238095e-07, "loss": 0.0002, "num_tokens": 963785398.0, "reward": 0.340625, "reward_std": 0.02709311693906784, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9376920104026795, "step": 14180 }, { "completion_length": 395.8, "completions/clipped_ratio": 0.0, "completions/max_length": 395.8, "completions/max_terminated_length": 395.8, "completions/mean_length": 82.653125, "completions/mean_terminated_length": 82.653125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01283740612901299, "frac_reward_zero_std": 0.975, "grad_norm": 0.011507590301334858, "kl": 0.20859317143913358, "learning_rate": 4.42984126984127e-07, "loss": 0.0002, "num_tokens": 964087466.0, "reward": 0.3359375, "reward_std": 0.02109457477927208, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9274922132492065, "step": 14185 }, { "completion_length": 295.2, "completions/clipped_ratio": 0.0, "completions/max_length": 295.2, "completions/max_terminated_length": 295.2, "completions/mean_length": 88.39609375, "completions/mean_terminated_length": 88.39609375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01284193112236125, "frac_reward_zero_std": 0.96875, "grad_norm": 1.0452988147735596, "kl": 0.22135160146281124, "learning_rate": 4.429444444444444e-07, "loss": 0.0002, "num_tokens": 964397053.0, "reward": 0.4890625, "reward_std": 0.028930898010730743, "rewards/verify_chess_move/mean": 0.4890625, "rewards/verify_chess_move/std": 0.8674521803855896, "step": 14190 }, { "completion_length": 546.2, "completions/clipped_ratio": 0.0, "completions/max_length": 546.2, "completions/max_terminated_length": 546.2, "completions/mean_length": 98.7359375, "completions/mean_terminated_length": 98.7359375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01284645611570951, "frac_reward_zero_std": 0.91875, "grad_norm": 10.701545715332031, "kl": 0.6795646287268028, "learning_rate": 4.429047619047619e-07, "loss": 0.0007, "num_tokens": 964723867.0, "reward": 0.3140625, "reward_std": 0.0711200475692749, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9436717867851258, "step": 14195 }, { "completion_length": 342.4, "completions/clipped_ratio": 0.0, "completions/max_length": 342.4, "completions/max_terminated_length": 342.4, "completions/mean_length": 86.56484375, "completions/mean_terminated_length": 86.56484375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012850981109057769, "frac_reward_zero_std": 0.95, "grad_norm": 0.011777793057262897, "kl": 1.979893451393582, "learning_rate": 4.4286507936507934e-07, "loss": 0.002, "num_tokens": 965030966.0, "reward": 0.45625, "reward_std": 0.03945523202419281, "rewards/verify_chess_move/mean": 0.45625, "rewards/verify_chess_move/std": 0.8693315863609314, "step": 14200 }, { "completion_length": 319.6, "completions/clipped_ratio": 0.0, "completions/max_length": 319.6, "completions/max_terminated_length": 319.6, "completions/mean_length": 83.95859375, "completions/mean_terminated_length": 83.95859375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012855506102406029, "frac_reward_zero_std": 0.96875, "grad_norm": 0.06916859745979309, "kl": 0.3892395328963175, "learning_rate": 4.4282539682539684e-07, "loss": 0.0004, "num_tokens": 965333665.0, "reward": 0.3984375, "reward_std": 0.02414703369140625, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9135731101036072, "step": 14205 }, { "completion_length": 288.2, "completions/clipped_ratio": 0.0, "completions/max_length": 288.2, "completions/max_terminated_length": 288.2, "completions/mean_length": 86.39375, "completions/mean_terminated_length": 86.39375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012860031095754289, "frac_reward_zero_std": 0.98125, "grad_norm": 8.247723579406738, "kl": 0.30911340974271295, "learning_rate": 4.4278571428571425e-07, "loss": 0.0003, "num_tokens": 965640921.0, "reward": 0.390625, "reward_std": 0.01552036516368389, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9134121894836426, "step": 14210 }, { "completion_length": 448.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 448.0, "completions/max_terminated_length": 402.8, "completions/mean_length": 95.24609375, "completions/mean_terminated_length": 94.21663970947266, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01286455608910255, "frac_reward_zero_std": 0.96875, "grad_norm": 30.474496841430664, "kl": 0.49997180548962206, "learning_rate": 4.427460317460317e-07, "loss": 0.0005, "num_tokens": 965960900.0, "reward": 0.4578125, "reward_std": 0.02824692875146866, "rewards/verify_chess_move/mean": 0.4578125, "rewards/verify_chess_move/std": 0.8550233125686646, "step": 14215 }, { "completion_length": 427.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 427.8, "completions/max_terminated_length": 347.6, "completions/mean_length": 89.83671875, "completions/mean_terminated_length": 88.77404022216797, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01286908108245081, "frac_reward_zero_std": 0.95625, "grad_norm": 6.385611534118652, "kl": 1.9358516362030058, "learning_rate": 4.427063492063492e-07, "loss": 0.0019, "num_tokens": 966273843.0, "reward": 0.25, "reward_std": 0.04208179414272308, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9577676773071289, "step": 14220 }, { "completion_length": 329.4, "completions/clipped_ratio": 0.0, "completions/max_length": 329.4, "completions/max_terminated_length": 329.4, "completions/mean_length": 96.51015625, "completions/mean_terminated_length": 96.51015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01287360607579907, "frac_reward_zero_std": 0.95, "grad_norm": 2.643730878829956, "kl": 1.0540630569215863, "learning_rate": 4.426666666666666e-07, "loss": 0.0011, "num_tokens": 966598008.0, "reward": 0.4421875, "reward_std": 0.04219013154506683, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.885304069519043, "step": 14225 }, { "completion_length": 334.8, "completions/clipped_ratio": 0.0, "completions/max_length": 334.8, "completions/max_terminated_length": 334.8, "completions/mean_length": 92.14296875, "completions/mean_terminated_length": 92.14296875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.012878131069147328, "frac_reward_zero_std": 0.9625, "grad_norm": 0.27877500653266907, "kl": 1.248435107129626, "learning_rate": 4.426269841269841e-07, "loss": 0.0012, "num_tokens": 966915431.0, "reward": 0.3390625, "reward_std": 0.03356248140335083, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9372966766357422, "step": 14230 }, { "completion_length": 605.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 605.4, "completions/max_terminated_length": 574.6, "completions/mean_length": 93.69609375, "completions/mean_terminated_length": 92.6611572265625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012882656062495588, "frac_reward_zero_std": 0.94375, "grad_norm": 0.4675285220146179, "kl": 0.8017355680698529, "learning_rate": 4.4258730158730157e-07, "loss": 0.0008, "num_tokens": 967234906.0, "reward": 0.253125, "reward_std": 0.04729155525565147, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9439074158668518, "step": 14235 }, { "completion_length": 332.6, "completions/clipped_ratio": 0.0, "completions/max_length": 332.6, "completions/max_terminated_length": 332.6, "completions/mean_length": 85.41640625, "completions/mean_terminated_length": 85.41640625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012887181055843848, "frac_reward_zero_std": 0.96875, "grad_norm": 17.958547592163086, "kl": 0.2783458915306255, "learning_rate": 4.42547619047619e-07, "loss": 0.0003, "num_tokens": 967540303.0, "reward": 0.4234375, "reward_std": 0.025726158171892166, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8964382410049438, "step": 14240 }, { "completion_length": 337.6, "completions/clipped_ratio": 0.0, "completions/max_length": 337.6, "completions/max_terminated_length": 337.6, "completions/mean_length": 90.06015625, "completions/mean_terminated_length": 90.06015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012891706049192108, "frac_reward_zero_std": 0.9625, "grad_norm": 12.04513931274414, "kl": 0.9574502068106086, "learning_rate": 4.425079365079365e-07, "loss": 0.001, "num_tokens": 967854276.0, "reward": 0.28125, "reward_std": 0.028566450998187064, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9544920563697815, "step": 14245 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 358.4, "completions/max_terminated_length": 268.8, "completions/mean_length": 89.94609375, "completions/mean_terminated_length": 89.43555908203125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012896231042540368, "frac_reward_zero_std": 0.93125, "grad_norm": 9.096311569213867, "kl": 5.659578974568285, "learning_rate": 4.4246825396825393e-07, "loss": 0.0057, "num_tokens": 968167935.0, "reward": 0.3875, "reward_std": 0.054764412343502045, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9200615167617798, "step": 14250 }, { "completion_length": 501.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 501.4, "completions/max_terminated_length": 419.2, "completions/mean_length": 96.18046875, "completions/mean_terminated_length": 95.65844421386718, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012900756035888626, "frac_reward_zero_std": 0.9625, "grad_norm": 10.24559497833252, "kl": 0.7020671345642768, "learning_rate": 4.4242857142857144e-07, "loss": 0.0007, "num_tokens": 968492046.0, "reward": 0.353125, "reward_std": 0.03335031494498253, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9182881593704224, "step": 14255 }, { "completion_length": 432.4, "completions/clipped_ratio": 0.0, "completions/max_length": 432.4, "completions/max_terminated_length": 432.4, "completions/mean_length": 99.77421875, "completions/mean_terminated_length": 99.77421875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012905281029236887, "frac_reward_zero_std": 0.95, "grad_norm": 5.335239410400391, "kl": 0.2332557180663571, "learning_rate": 4.423888888888889e-07, "loss": 0.0002, "num_tokens": 968821061.0, "reward": 0.3390625, "reward_std": 0.04150616116821766, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9416876435279846, "step": 14260 }, { "completion_length": 484.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 484.0, "completions/max_terminated_length": 327.2, "completions/mean_length": 89.4265625, "completions/mean_terminated_length": 88.35822143554688, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012909806022585147, "frac_reward_zero_std": 0.95625, "grad_norm": 10.48742389678955, "kl": 0.5617394400760531, "learning_rate": 4.423492063492063e-07, "loss": 0.0006, "num_tokens": 969133223.0, "reward": 0.4140625, "reward_std": 0.03776973336935043, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.9090097069740295, "step": 14265 }, { "completion_length": 385.2, "completions/clipped_ratio": 0.0, "completions/max_length": 385.2, "completions/max_terminated_length": 385.2, "completions/mean_length": 91.0875, "completions/mean_terminated_length": 91.0875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012914331015933407, "frac_reward_zero_std": 0.96875, "grad_norm": 4.060704231262207, "kl": 0.36503767567919565, "learning_rate": 4.423095238095238e-07, "loss": 0.0004, "num_tokens": 969446847.0, "reward": 0.4203125, "reward_std": 0.02777610532939434, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8859761714935303, "step": 14270 }, { "completion_length": 354.2, "completions/clipped_ratio": 0.0, "completions/max_length": 354.2, "completions/max_terminated_length": 354.2, "completions/mean_length": 95.61953125, "completions/mean_terminated_length": 95.61953125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012918856009281667, "frac_reward_zero_std": 0.9625, "grad_norm": 1.8098593950271606, "kl": 0.4744215635932051, "learning_rate": 4.4226984126984125e-07, "loss": 0.0005, "num_tokens": 969770032.0, "reward": 0.4140625, "reward_std": 0.03766237571835518, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.8874218344688416, "step": 14275 }, { "completion_length": 334.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 90.16953125, "completions/mean_terminated_length": 90.16953125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012923381002629927, "frac_reward_zero_std": 0.9625, "grad_norm": 0.050015922635793686, "kl": 0.2627982993144542, "learning_rate": 4.422301587301587e-07, "loss": 0.0003, "num_tokens": 970084057.0, "reward": 0.3671875, "reward_std": 0.03287851139903068, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.929820442199707, "step": 14280 }, { "completion_length": 493.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 493.4, "completions/max_terminated_length": 466.2, "completions/mean_length": 96.628125, "completions/mean_terminated_length": 96.11270294189453, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012927905995978185, "frac_reward_zero_std": 0.95, "grad_norm": 3.3746087551116943, "kl": 0.18342345741111785, "learning_rate": 4.4219047619047616e-07, "loss": 0.0002, "num_tokens": 970405709.0, "reward": 0.3484375, "reward_std": 0.040351370349526405, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9296130657196044, "step": 14285 }, { "completion_length": 448.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 99.6984375, "completions/mean_terminated_length": 99.6984375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012932430989326445, "frac_reward_zero_std": 0.95, "grad_norm": 1.9937843084335327, "kl": 0.2691073330584913, "learning_rate": 4.421507936507936e-07, "loss": 0.0003, "num_tokens": 970734883.0, "reward": 0.3609375, "reward_std": 0.04897549077868461, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.919677197933197, "step": 14290 }, { "completion_length": 323.2, "completions/clipped_ratio": 0.0, "completions/max_length": 323.2, "completions/max_terminated_length": 323.2, "completions/mean_length": 90.35625, "completions/mean_terminated_length": 90.35625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012936955982674705, "frac_reward_zero_std": 0.975, "grad_norm": 0.001428319257684052, "kl": 0.12513173657935112, "learning_rate": 4.421111111111111e-07, "loss": 0.0001, "num_tokens": 971048283.0, "reward": 0.421875, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8975714683532715, "step": 14295 }, { "completion_length": 311.4, "completions/clipped_ratio": 0.0, "completions/max_length": 311.4, "completions/max_terminated_length": 311.4, "completions/mean_length": 90.5234375, "completions/mean_terminated_length": 90.5234375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012941480976022966, "frac_reward_zero_std": 0.95, "grad_norm": 2.624758243560791, "kl": 0.4335777543252334, "learning_rate": 4.420714285714285e-07, "loss": 0.0004, "num_tokens": 971362321.0, "reward": 0.3328125, "reward_std": 0.04534641802310944, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9364734888076782, "step": 14300 }, { "completion_length": 393.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 393.4, "completions/max_terminated_length": 350.6, "completions/mean_length": 89.99921875, "completions/mean_terminated_length": 89.47250671386719, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.012946005969371226, "frac_reward_zero_std": 0.9375, "grad_norm": 2.958620548248291, "kl": 0.278748054546304, "learning_rate": 4.4203174603174603e-07, "loss": 0.0003, "num_tokens": 971674760.0, "reward": 0.3484375, "reward_std": 0.05239494405686855, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9119350194931031, "step": 14305 }, { "completion_length": 531.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 531.6, "completions/max_terminated_length": 414.0, "completions/mean_length": 101.76328125, "completions/mean_terminated_length": 100.70484924316406, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012950530962719484, "frac_reward_zero_std": 0.95, "grad_norm": 1.8366745710372925, "kl": 0.2539684817194939, "learning_rate": 4.419920634920635e-07, "loss": 0.0003, "num_tokens": 972006225.0, "reward": 0.3609375, "reward_std": 0.046501210704445836, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9297589778900146, "step": 14310 }, { "completion_length": 385.6, "completions/clipped_ratio": 0.0, "completions/max_length": 385.6, "completions/max_terminated_length": 385.6, "completions/mean_length": 97.309375, "completions/mean_terminated_length": 97.309375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.012955055956067744, "frac_reward_zero_std": 0.925, "grad_norm": 2.872243881225586, "kl": 0.5312512211152353, "learning_rate": 4.419523809523809e-07, "loss": 0.0005, "num_tokens": 972330981.0, "reward": 0.446875, "reward_std": 0.06486383192241192, "rewards/verify_chess_move/mean": 0.446875, "rewards/verify_chess_move/std": 0.8819049000740051, "step": 14315 }, { "completion_length": 450.6, "completions/clipped_ratio": 0.0, "completions/max_length": 450.6, "completions/max_terminated_length": 450.6, "completions/mean_length": 88.2703125, "completions/mean_terminated_length": 88.2703125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.012959580949416004, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0010511665605008602, "kl": 0.2225172363454476, "learning_rate": 4.419126984126984e-07, "loss": 0.0002, "num_tokens": 972639815.0, "reward": 0.3953125, "reward_std": 0.033669838309288026, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9128103852272034, "step": 14320 }, { "completion_length": 334.2, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/max_terminated_length": 334.2, "completions/mean_length": 90.92734375, "completions/mean_terminated_length": 90.92734375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.012964105942764264, "frac_reward_zero_std": 0.95625, "grad_norm": 3.0545408725738525, "kl": 0.42792005809023975, "learning_rate": 4.4187301587301585e-07, "loss": 0.0004, "num_tokens": 972955650.0, "reward": 0.2109375, "reward_std": 0.03298586755990982, "rewards/verify_chess_move/mean": 0.2109375, "rewards/verify_chess_move/std": 0.9643134236335754, "step": 14325 }, { "completion_length": 398.6, "completions/clipped_ratio": 0.0, "completions/max_length": 398.6, "completions/max_terminated_length": 398.6, "completions/mean_length": 93.21171875, "completions/mean_terminated_length": 93.21171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.012968630936112524, "frac_reward_zero_std": 0.9625, "grad_norm": 9.632599830627441, "kl": 1.4937981761759147, "learning_rate": 4.4183333333333335e-07, "loss": 0.0015, "num_tokens": 973273697.0, "reward": 0.3265625, "reward_std": 0.03240768909454346, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9443271160125732, "step": 14330 }, { "completion_length": 272.6, "completions/clipped_ratio": 0.0, "completions/max_length": 272.6, "completions/max_terminated_length": 272.6, "completions/mean_length": 86.325, "completions/mean_terminated_length": 86.325, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012973155929460784, "frac_reward_zero_std": 0.9625, "grad_norm": 4.1911821365356445, "kl": 0.24094159710220991, "learning_rate": 4.4179365079365075e-07, "loss": 0.0002, "num_tokens": 973581265.0, "reward": 0.41875, "reward_std": 0.0354002621024847, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8554489493370057, "step": 14335 }, { "completion_length": 390.6, "completions/clipped_ratio": 0.0, "completions/max_length": 390.6, "completions/max_terminated_length": 390.6, "completions/mean_length": 89.3203125, "completions/mean_terminated_length": 89.3203125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.012977680922809043, "frac_reward_zero_std": 0.95625, "grad_norm": 8.756795883178711, "kl": 0.8270891987951472, "learning_rate": 4.417539682539682e-07, "loss": 0.0008, "num_tokens": 973891931.0, "reward": 0.3078125, "reward_std": 0.03956004194915295, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9441657781600952, "step": 14340 }, { "completion_length": 326.6, "completions/clipped_ratio": 0.0, "completions/max_length": 326.6, "completions/max_terminated_length": 326.6, "completions/mean_length": 90.95390625, "completions/mean_terminated_length": 90.95390625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.012982205916157303, "frac_reward_zero_std": 0.96875, "grad_norm": 2.204237461090088, "kl": 0.5103823390323668, "learning_rate": 4.417142857142857e-07, "loss": 0.0005, "num_tokens": 974206344.0, "reward": 0.5203125, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.5203125, "rewards/verify_chess_move/std": 0.846467936038971, "step": 14345 }, { "completion_length": 325.2, "completions/clipped_ratio": 0.0, "completions/max_length": 325.2, "completions/max_terminated_length": 325.2, "completions/mean_length": 96.79609375, "completions/mean_terminated_length": 96.79609375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012986730909505563, "frac_reward_zero_std": 0.94375, "grad_norm": 0.029174620285630226, "kl": 1.9506590977543965, "learning_rate": 4.4167460317460317e-07, "loss": 0.002, "num_tokens": 974530827.0, "reward": 0.428125, "reward_std": 0.04592557847499847, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.88228679895401, "step": 14350 }, { "completion_length": 414.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 414.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 78.65625, "completions/mean_terminated_length": 78.11251068115234, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.012991255902853823, "frac_reward_zero_std": 0.96875, "grad_norm": 7.2443671226501465, "kl": 1.0248537935782225, "learning_rate": 4.416349206349206e-07, "loss": 0.001, "num_tokens": 974825035.0, "reward": 0.421875, "reward_std": 0.02346404492855072, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.9039905786514282, "step": 14355 }, { "completion_length": 362.6, "completions/clipped_ratio": 0.0, "completions/max_length": 362.6, "completions/max_terminated_length": 362.6, "completions/mean_length": 91.13203125, "completions/mean_terminated_length": 91.13203125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.012995780896202083, "frac_reward_zero_std": 0.95, "grad_norm": 11.841338157653809, "kl": 1.6835681485710665, "learning_rate": 4.415952380952381e-07, "loss": 0.0017, "num_tokens": 975139588.0, "reward": 0.4015625, "reward_std": 0.04650121033191681, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9061951160430908, "step": 14360 }, { "completion_length": 344.2, "completions/clipped_ratio": 0.0, "completions/max_length": 344.2, "completions/max_terminated_length": 344.2, "completions/mean_length": 95.36015625, "completions/mean_terminated_length": 95.36015625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013000305889550342, "frac_reward_zero_std": 0.95625, "grad_norm": 8.964737892150879, "kl": 1.5595265212468803, "learning_rate": 4.4155555555555553e-07, "loss": 0.0016, "num_tokens": 975464417.0, "reward": 0.2546875, "reward_std": 0.04092798158526421, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9629568219184875, "step": 14365 }, { "completion_length": 308.6, "completions/clipped_ratio": 0.0, "completions/max_length": 308.6, "completions/max_terminated_length": 308.6, "completions/mean_length": 93.6765625, "completions/mean_terminated_length": 93.6765625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013004830882898602, "frac_reward_zero_std": 0.94375, "grad_norm": 1.150577187538147, "kl": 1.3805571742588654, "learning_rate": 4.41515873015873e-07, "loss": 0.0014, "num_tokens": 975783675.0, "reward": 0.378125, "reward_std": 0.04682073444128036, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9094022870063782, "step": 14370 }, { "completion_length": 439.6, "completions/clipped_ratio": 0.0, "completions/max_length": 439.6, "completions/max_terminated_length": 439.6, "completions/mean_length": 90.02890625, "completions/mean_terminated_length": 90.02890625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013009355876246862, "frac_reward_zero_std": 0.94375, "grad_norm": 5.727823257446289, "kl": 0.9974395132157952, "learning_rate": 4.4147619047619044e-07, "loss": 0.001, "num_tokens": 976096120.0, "reward": 0.4546875, "reward_std": 0.044558620825409886, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8418025135993957, "step": 14375 }, { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 87.79296875, "completions/mean_terminated_length": 87.79296875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013013880869595122, "frac_reward_zero_std": 0.94375, "grad_norm": 14.431135177612305, "kl": 1.3439418382244184, "learning_rate": 4.4143650793650795e-07, "loss": 0.0013, "num_tokens": 976406367.0, "reward": 0.35625, "reward_std": 0.052028930932283404, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9328030467033386, "step": 14380 }, { "completion_length": 355.8, "completions/clipped_ratio": 0.0, "completions/max_length": 355.8, "completions/max_terminated_length": 355.8, "completions/mean_length": 93.67734375, "completions/mean_terminated_length": 93.67734375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013018405862943382, "frac_reward_zero_std": 0.94375, "grad_norm": 1.421414852142334, "kl": 1.109842214267701, "learning_rate": 4.413968253968254e-07, "loss": 0.0011, "num_tokens": 976726194.0, "reward": 0.353125, "reward_std": 0.04797552637755871, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9229938507080078, "step": 14385 }, { "completion_length": 352.6, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/max_terminated_length": 352.6, "completions/mean_length": 89.88984375, "completions/mean_terminated_length": 89.88984375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013022930856291642, "frac_reward_zero_std": 0.975, "grad_norm": 11.328083038330078, "kl": 1.2776763125322759, "learning_rate": 4.413571428571428e-07, "loss": 0.0013, "num_tokens": 977041029.0, "reward": 0.3828125, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9098278522491455, "step": 14390 }, { "completion_length": 324.2, "completions/clipped_ratio": 0.0, "completions/max_length": 324.2, "completions/max_terminated_length": 324.2, "completions/mean_length": 84.15703125, "completions/mean_terminated_length": 84.15703125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0130274558496399, "frac_reward_zero_std": 0.9875, "grad_norm": 0.008578184060752392, "kl": 0.6790635693818331, "learning_rate": 4.413174603174603e-07, "loss": 0.0007, "num_tokens": 977344566.0, "reward": 0.5078125, "reward_std": 0.010205793380737304, "rewards/verify_chess_move/mean": 0.5078125, "rewards/verify_chess_move/std": 0.852743947505951, "step": 14395 }, { "completion_length": 302.8, "completions/clipped_ratio": 0.0, "completions/max_length": 302.8, "completions/max_terminated_length": 302.8, "completions/mean_length": 90.86796875, "completions/mean_terminated_length": 90.86796875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01303198084298816, "frac_reward_zero_std": 0.95, "grad_norm": 8.739147186279297, "kl": 3.389805128984153, "learning_rate": 4.4127777777777776e-07, "loss": 0.0034, "num_tokens": 977659141.0, "reward": 0.4671875, "reward_std": 0.04629002511501312, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.875481927394867, "step": 14400 }, { "completion_length": 426.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 426.2, "completions/max_terminated_length": 373.0, "completions/mean_length": 92.41328125, "completions/mean_terminated_length": 91.35643463134765, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01303650583633642, "frac_reward_zero_std": 0.93125, "grad_norm": 1.4163442850112915, "kl": 1.351469250372611, "learning_rate": 4.4123809523809527e-07, "loss": 0.0014, "num_tokens": 977976366.0, "reward": 0.3734375, "reward_std": 0.0631763681769371, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9223950028419494, "step": 14405 }, { "completion_length": 570.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 570.0, "completions/max_terminated_length": 445.6, "completions/mean_length": 95.48125, "completions/mean_terminated_length": 94.43314666748047, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01304103082968468, "frac_reward_zero_std": 0.925, "grad_norm": 19.68670654296875, "kl": 0.7803699384443462, "learning_rate": 4.4119841269841267e-07, "loss": 0.0008, "num_tokens": 978297046.0, "reward": 0.3796875, "reward_std": 0.061233778670430186, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9186360239982605, "step": 14410 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 90.296875, "completions/mean_terminated_length": 90.296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01304555582303294, "frac_reward_zero_std": 0.975, "grad_norm": 1.1840754747390747, "kl": 0.19033793068956584, "learning_rate": 4.411587301587301e-07, "loss": 0.0002, "num_tokens": 978611658.0, "reward": 0.396875, "reward_std": 0.02041158601641655, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9097729325294495, "step": 14415 }, { "completion_length": 475.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 475.2, "completions/max_terminated_length": 387.2, "completions/mean_length": 87.9703125, "completions/mean_terminated_length": 87.43222503662109, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013050080816381199, "frac_reward_zero_std": 0.975, "grad_norm": 3.3537869453430176, "kl": 1.162437863671221, "learning_rate": 4.4111904761904763e-07, "loss": 0.0012, "num_tokens": 978920492.0, "reward": 0.4703125, "reward_std": 0.021778544783592223, "rewards/verify_chess_move/mean": 0.4703125, "rewards/verify_chess_move/std": 0.8665383458137512, "step": 14420 }, { "completion_length": 554.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 554.8, "completions/max_terminated_length": 544.6, "completions/mean_length": 93.315625, "completions/mean_terminated_length": 92.28434448242187, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01305460580972946, "frac_reward_zero_std": 0.94375, "grad_norm": 4.269708156585693, "kl": 3.6090308290091344, "learning_rate": 4.4107936507936503e-07, "loss": 0.0036, "num_tokens": 979237912.0, "reward": 0.421875, "reward_std": 0.0466095469892025, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.9015698432922363, "step": 14425 }, { "completion_length": 410.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 410.6, "completions/max_terminated_length": 354.6, "completions/mean_length": 91.5671875, "completions/mean_terminated_length": 89.96417388916015, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01305913080307772, "frac_reward_zero_std": 0.9375, "grad_norm": 23.61195182800293, "kl": 1.5545842894585804, "learning_rate": 4.4103968253968254e-07, "loss": 0.0016, "num_tokens": 979552702.0, "reward": 0.3234375, "reward_std": 0.05828514620661736, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9295454263687134, "step": 14430 }, { "completion_length": 308.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 89.18359375, "completions/mean_terminated_length": 89.18359375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01306365579642598, "frac_reward_zero_std": 0.94375, "grad_norm": 1.8095146417617798, "kl": 0.6941388857550919, "learning_rate": 4.41e-07, "loss": 0.0007, "num_tokens": 979864641.0, "reward": 0.378125, "reward_std": 0.0486594945192337, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9034443736076355, "step": 14435 }, { "completion_length": 283.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 84.890625, "completions/mean_terminated_length": 84.890625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01306818078977424, "frac_reward_zero_std": 0.9625, "grad_norm": 1.3423279523849487, "kl": 1.0046479006530717, "learning_rate": 4.4096031746031745e-07, "loss": 0.001, "num_tokens": 980170341.0, "reward": 0.3453125, "reward_std": 0.032878512144088747, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.93737952709198, "step": 14440 }, { "completion_length": 305.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 93.23984375, "completions/mean_terminated_length": 93.23984375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0130727057831225, "frac_reward_zero_std": 0.9875, "grad_norm": 0.5818870663642883, "kl": 2.0956901764962823, "learning_rate": 4.409206349206349e-07, "loss": 0.0021, "num_tokens": 980489256.0, "reward": 0.40625, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.8931894421577453, "step": 14445 }, { "completion_length": 420.6, "completions/clipped_ratio": 0.0, "completions/max_length": 420.6, "completions/max_terminated_length": 420.6, "completions/mean_length": 97.73203125, "completions/mean_terminated_length": 97.73203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013077230776470758, "frac_reward_zero_std": 0.98125, "grad_norm": 0.9706653952598572, "kl": 0.7032940269331448, "learning_rate": 4.4088095238095235e-07, "loss": 0.0007, "num_tokens": 980816969.0, "reward": 0.25625, "reward_std": 0.01462521068751812, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9547247648239136, "step": 14450 }, { "completion_length": 343.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 93.3734375, "completions/mean_terminated_length": 93.3734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013081755769819018, "frac_reward_zero_std": 0.95, "grad_norm": 0.002117094350978732, "kl": 1.4619767839321867, "learning_rate": 4.4084126984126986e-07, "loss": 0.0015, "num_tokens": 981138623.0, "reward": 0.24375, "reward_std": 0.04671337604522705, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9652850151062011, "step": 14455 }, { "completion_length": 273.4, "completions/clipped_ratio": 0.0, "completions/max_length": 273.4, "completions/max_terminated_length": 273.4, "completions/mean_length": 87.21796875, "completions/mean_terminated_length": 87.21796875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.013086280763167278, "frac_reward_zero_std": 0.95625, "grad_norm": 4.1672821044921875, "kl": 0.7017618798883631, "learning_rate": 4.4080158730158726e-07, "loss": 0.0007, "num_tokens": 981446870.0, "reward": 0.459375, "reward_std": 0.036402773857116696, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8834821343421936, "step": 14460 }, { "completion_length": 576.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 576.0, "completions/max_terminated_length": 395.4, "completions/mean_length": 99.90234375, "completions/mean_terminated_length": 98.34164428710938, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013090805756515538, "frac_reward_zero_std": 0.90625, "grad_norm": 22.763465881347656, "kl": 2.7951127002481373, "learning_rate": 4.407619047619047e-07, "loss": 0.0028, "num_tokens": 981775001.0, "reward": 0.2296875, "reward_std": 0.0829039853066206, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9722997188568115, "step": 14465 }, { "completion_length": 381.2, "completions/clipped_ratio": 0.0, "completions/max_length": 381.2, "completions/max_terminated_length": 381.2, "completions/mean_length": 83.9609375, "completions/mean_terminated_length": 83.9609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013095330749863798, "frac_reward_zero_std": 0.9875, "grad_norm": 0.011001748964190483, "kl": 1.1478959497995675, "learning_rate": 4.407222222222222e-07, "loss": 0.0011, "num_tokens": 982079911.0, "reward": 0.33125, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9320532798767089, "step": 14470 }, { "completion_length": 515.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 515.2, "completions/max_terminated_length": 460.8, "completions/mean_length": 86.58828125, "completions/mean_terminated_length": 86.04820556640625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013099855743212057, "frac_reward_zero_std": 0.96875, "grad_norm": 0.7590919137001038, "kl": 2.2904542325413786, "learning_rate": 4.406825396825397e-07, "loss": 0.0023, "num_tokens": 982387056.0, "reward": 0.334375, "reward_std": 0.03208816573023796, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9053478837013245, "step": 14475 }, { "completion_length": 428.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 428.0, "completions/max_terminated_length": 339.2, "completions/mean_length": 91.50078125, "completions/mean_terminated_length": 90.96781616210937, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013104380736560317, "frac_reward_zero_std": 0.94375, "grad_norm": 2.188019037246704, "kl": 1.803728573070839, "learning_rate": 4.4064285714285713e-07, "loss": 0.0018, "num_tokens": 982703265.0, "reward": 0.396875, "reward_std": 0.04771588891744614, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9004221558570862, "step": 14480 }, { "completion_length": 427.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 427.2, "completions/max_terminated_length": 314.8, "completions/mean_length": 89.28515625, "completions/mean_terminated_length": 88.73729248046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013108905729908577, "frac_reward_zero_std": 0.9375, "grad_norm": 23.86819076538086, "kl": 3.8869890366098843, "learning_rate": 4.406031746031746e-07, "loss": 0.0039, "num_tokens": 983015710.0, "reward": 0.3734375, "reward_std": 0.05444488972425461, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9172836542129517, "step": 14485 }, { "completion_length": 288.6, "completions/clipped_ratio": 0.0, "completions/max_length": 288.6, "completions/max_terminated_length": 288.6, "completions/mean_length": 86.97578125, "completions/mean_terminated_length": 86.97578125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.013113430723256837, "frac_reward_zero_std": 0.96875, "grad_norm": 23.533634185791016, "kl": 0.9145472564734518, "learning_rate": 4.4056349206349204e-07, "loss": 0.0009, "num_tokens": 983324751.0, "reward": 0.3328125, "reward_std": 0.02867126017808914, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9354920506477356, "step": 14490 }, { "completion_length": 407.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 91.39453125, "completions/mean_terminated_length": 91.39453125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013117955716605097, "frac_reward_zero_std": 0.975, "grad_norm": 0.8512874841690063, "kl": 1.6986693075625225, "learning_rate": 4.4052380952380955e-07, "loss": 0.0017, "num_tokens": 983640760.0, "reward": 0.2875, "reward_std": 0.0245114803314209, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9581959128379822, "step": 14495 }, { "completion_length": 417.4, "completions/clipped_ratio": 0.0, "completions/max_length": 417.4, "completions/max_terminated_length": 417.4, "completions/mean_length": 92.57734375, "completions/mean_terminated_length": 92.57734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013122480709953357, "frac_reward_zero_std": 0.96875, "grad_norm": 4.131455898284912, "kl": 0.1454925615922548, "learning_rate": 4.4048412698412695e-07, "loss": 0.0001, "num_tokens": 983958611.0, "reward": 0.4, "reward_std": 0.030297856032848357, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.8816133737564087, "step": 14500 }, { "completion_length": 355.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 91.1421875, "completions/mean_terminated_length": 91.1421875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013127005703301615, "frac_reward_zero_std": 0.95, "grad_norm": 0.005864683538675308, "kl": 0.2517187770921737, "learning_rate": 4.4044444444444445e-07, "loss": 0.0003, "num_tokens": 984275921.0, "reward": 0.3359375, "reward_std": 0.04629002511501312, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9220069050788879, "step": 14505 }, { "completion_length": 291.6, "completions/clipped_ratio": 0.0, "completions/max_length": 291.6, "completions/max_terminated_length": 291.6, "completions/mean_length": 80.99453125, "completions/mean_terminated_length": 80.99453125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.013131530696649876, "frac_reward_zero_std": 0.96875, "grad_norm": 3.0319323539733887, "kl": 0.19358279404696077, "learning_rate": 4.404047619047619e-07, "loss": 0.0002, "num_tokens": 984575002.0, "reward": 0.3609375, "reward_std": 0.02414703443646431, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9220932722091675, "step": 14510 }, { "completion_length": 340.4, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/max_terminated_length": 340.4, "completions/mean_length": 93.67734375, "completions/mean_terminated_length": 93.67734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013136055689998136, "frac_reward_zero_std": 0.9625, "grad_norm": 0.00146579893771559, "kl": 0.44208491188474, "learning_rate": 4.403650793650793e-07, "loss": 0.0004, "num_tokens": 984894909.0, "reward": 0.3609375, "reward_std": 0.032667326182127, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9282713651657104, "step": 14515 }, { "completion_length": 289.4, "completions/clipped_ratio": 0.0, "completions/max_length": 289.4, "completions/max_terminated_length": 289.4, "completions/mean_length": 91.90234375, "completions/mean_terminated_length": 91.90234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013140580683346396, "frac_reward_zero_std": 0.96875, "grad_norm": 0.012449285015463829, "kl": 0.2152169046457857, "learning_rate": 4.403253968253968e-07, "loss": 0.0002, "num_tokens": 985212504.0, "reward": 0.275, "reward_std": 0.028247909247875215, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9583871364593506, "step": 14520 }, { "completion_length": 408.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 408.8, "completions/max_terminated_length": 311.6, "completions/mean_length": 91.8421875, "completions/mean_terminated_length": 91.3147689819336, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013145105676694656, "frac_reward_zero_std": 0.95625, "grad_norm": 0.00862171035259962, "kl": 0.9183482587104663, "learning_rate": 4.4028571428571427e-07, "loss": 0.0009, "num_tokens": 985530382.0, "reward": 0.28125, "reward_std": 0.04045617952942848, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9353413939476013, "step": 14525 }, { "completion_length": 440.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 440.6, "completions/max_terminated_length": 421.8, "completions/mean_length": 93.434375, "completions/mean_terminated_length": 92.38265228271484, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013149630670042914, "frac_reward_zero_std": 0.9375, "grad_norm": 10.714143753051758, "kl": 1.646577888331376, "learning_rate": 4.402460317460317e-07, "loss": 0.0016, "num_tokens": 985848890.0, "reward": 0.3, "reward_std": 0.05670700445771217, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9276710391044617, "step": 14530 }, { "completion_length": 386.2, "completions/clipped_ratio": 0.0, "completions/max_length": 386.2, "completions/max_terminated_length": 386.2, "completions/mean_length": 90.7765625, "completions/mean_terminated_length": 90.7765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013154155663391174, "frac_reward_zero_std": 0.9625, "grad_norm": 1.5105870962142944, "kl": 0.7499898385023698, "learning_rate": 4.402063492063492e-07, "loss": 0.0007, "num_tokens": 986164156.0, "reward": 0.321875, "reward_std": 0.031300367787480354, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9361206650733948, "step": 14535 }, { "completion_length": 480.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 480.2, "completions/max_terminated_length": 416.2, "completions/mean_length": 94.74921875, "completions/mean_terminated_length": 94.22825775146484, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013158680656739434, "frac_reward_zero_std": 0.9375, "grad_norm": 15.69190788269043, "kl": 0.5732310632592998, "learning_rate": 4.4016666666666663e-07, "loss": 0.0006, "num_tokens": 986483395.0, "reward": 0.3703125, "reward_std": 0.053290098905563354, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9284684896469116, "step": 14540 }, { "completion_length": 325.4, "completions/clipped_ratio": 0.0, "completions/max_length": 325.4, "completions/max_terminated_length": 325.4, "completions/mean_length": 87.38828125, "completions/mean_terminated_length": 87.38828125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013163205650087694, "frac_reward_zero_std": 0.95625, "grad_norm": 4.317090034484863, "kl": 0.23283716778969393, "learning_rate": 4.4012698412698414e-07, "loss": 0.0002, "num_tokens": 986791220.0, "reward": 0.4078125, "reward_std": 0.040244012326002124, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.9097463607788085, "step": 14545 }, { "completion_length": 436.4, "completions/clipped_ratio": 0.0, "completions/max_length": 436.4, "completions/max_terminated_length": 436.4, "completions/mean_length": 93.01015625, "completions/mean_terminated_length": 93.01015625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013167730643435955, "frac_reward_zero_std": 0.9375, "grad_norm": 0.9683947563171387, "kl": 0.23604918718338014, "learning_rate": 4.4008730158730154e-07, "loss": 0.0002, "num_tokens": 987108433.0, "reward": 0.3453125, "reward_std": 0.05465607568621635, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9280910015106201, "step": 14550 }, { "completion_length": 391.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 391.0, "completions/max_terminated_length": 295.2, "completions/mean_length": 87.18359375, "completions/mean_terminated_length": 86.64522094726563, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013172255636784215, "frac_reward_zero_std": 0.95, "grad_norm": 2.0978119373321533, "kl": 0.16133390681352466, "learning_rate": 4.40047619047619e-07, "loss": 0.0002, "num_tokens": 987416292.0, "reward": 0.484375, "reward_std": 0.03898441046476364, "rewards/verify_chess_move/mean": 0.484375, "rewards/verify_chess_move/std": 0.8234276831150055, "step": 14555 }, { "completion_length": 282.4, "completions/clipped_ratio": 0.0, "completions/max_length": 282.4, "completions/max_terminated_length": 282.4, "completions/mean_length": 89.02890625, "completions/mean_terminated_length": 89.02890625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013176780630132473, "frac_reward_zero_std": 0.975, "grad_norm": 0.0026981213595718145, "kl": 0.28723984365351496, "learning_rate": 4.400079365079365e-07, "loss": 0.0003, "num_tokens": 987729001.0, "reward": 0.38125, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9242313146591187, "step": 14560 }, { "completion_length": 297.2, "completions/clipped_ratio": 0.0, "completions/max_length": 297.2, "completions/max_terminated_length": 297.2, "completions/mean_length": 91.8171875, "completions/mean_terminated_length": 91.8171875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.013181305623480733, "frac_reward_zero_std": 0.95, "grad_norm": 15.499650955200195, "kl": 1.3318768305005506, "learning_rate": 4.3996825396825395e-07, "loss": 0.0013, "num_tokens": 988045711.0, "reward": 0.3578125, "reward_std": 0.043556108698248865, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9175934791564941, "step": 14565 }, { "completion_length": 304.8, "completions/clipped_ratio": 0.0, "completions/max_length": 304.8, "completions/max_terminated_length": 304.8, "completions/mean_length": 89.22734375, "completions/mean_terminated_length": 89.22734375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013185830616828993, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0016255427617579699, "kl": 0.22548690666444599, "learning_rate": 4.399285714285714e-07, "loss": 0.0002, "num_tokens": 988357066.0, "reward": 0.325, "reward_std": 0.025513992458581925, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9140689849853516, "step": 14570 }, { "completion_length": 541.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 541.4, "completions/max_terminated_length": 490.2, "completions/mean_length": 101.0515625, "completions/mean_terminated_length": 100.02999572753906, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.013190355610177253, "frac_reward_zero_std": 0.94375, "grad_norm": 9.261900901794434, "kl": 0.9875709657440893, "learning_rate": 4.3988888888888886e-07, "loss": 0.001, "num_tokens": 988688308.0, "reward": 0.3328125, "reward_std": 0.04792805500328541, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9305615305900574, "step": 14575 }, { "completion_length": 493.4, "completions/clipped_ratio": 0.003125, "completions/max_length": 493.4, "completions/max_terminated_length": 404.6, "completions/mean_length": 101.6828125, "completions/mean_terminated_length": 99.62018585205078, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013194880603525513, "frac_reward_zero_std": 0.925, "grad_norm": 25.798105239868164, "kl": 1.6076299390872009, "learning_rate": 4.398492063492063e-07, "loss": 0.0016, "num_tokens": 989020038.0, "reward": 0.271875, "reward_std": 0.0648618683218956, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.955473792552948, "step": 14580 }, { "completion_length": 261.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 91.63671875, "completions/mean_terminated_length": 91.63671875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013199405596873772, "frac_reward_zero_std": 0.9375, "grad_norm": 8.61567211151123, "kl": 0.656476643262431, "learning_rate": 4.398095238095238e-07, "loss": 0.0007, "num_tokens": 989336781.0, "reward": 0.3140625, "reward_std": 0.05623519979417324, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9332590937614441, "step": 14585 }, { "completion_length": 308.4, "completions/clipped_ratio": 0.0, "completions/max_length": 308.4, "completions/max_terminated_length": 308.4, "completions/mean_length": 89.328125, "completions/mean_terminated_length": 89.328125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013203930590222032, "frac_reward_zero_std": 0.95625, "grad_norm": 5.900920867919922, "kl": 1.6746580956038088, "learning_rate": 4.397698412698412e-07, "loss": 0.0017, "num_tokens": 989648113.0, "reward": 0.33125, "reward_std": 0.03913669139146805, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.944172739982605, "step": 14590 }, { "completion_length": 304.4, "completions/clipped_ratio": 0.0, "completions/max_length": 304.4, "completions/max_terminated_length": 304.4, "completions/mean_length": 91.40859375, "completions/mean_terminated_length": 91.40859375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013208455583570292, "frac_reward_zero_std": 0.95, "grad_norm": 15.021952629089355, "kl": 1.1628352328436449, "learning_rate": 4.3973015873015873e-07, "loss": 0.0012, "num_tokens": 989963228.0, "reward": 0.521875, "reward_std": 0.044663429260253906, "rewards/verify_chess_move/mean": 0.521875, "rewards/verify_chess_move/std": 0.827098274230957, "step": 14595 }, { "completion_length": 405.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 405.6, "completions/max_terminated_length": 331.8, "completions/mean_length": 88.12578125, "completions/mean_terminated_length": 87.60491790771485, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013212980576918552, "frac_reward_zero_std": 0.95, "grad_norm": 0.28689903020858765, "kl": 1.6492794542689808, "learning_rate": 4.396904761904762e-07, "loss": 0.0016, "num_tokens": 990273221.0, "reward": 0.2828125, "reward_std": 0.03672229796648026, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9509523987770081, "step": 14600 }, { "completion_length": 305.2, "completions/clipped_ratio": 0.0, "completions/max_length": 305.2, "completions/max_terminated_length": 305.2, "completions/mean_length": 90.4484375, "completions/mean_terminated_length": 90.4484375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013217505570266812, "frac_reward_zero_std": 0.9625, "grad_norm": 21.496950149536133, "kl": 0.9104306817054748, "learning_rate": 4.396507936507936e-07, "loss": 0.0009, "num_tokens": 990587539.0, "reward": 0.34375, "reward_std": 0.03335031531751156, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9237117886543273, "step": 14605 }, { "completion_length": 328.2, "completions/clipped_ratio": 0.0, "completions/max_length": 328.2, "completions/max_terminated_length": 328.2, "completions/mean_length": 96.35703125, "completions/mean_terminated_length": 96.35703125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.013222030563615072, "frac_reward_zero_std": 0.9375, "grad_norm": 13.62981128692627, "kl": 0.4005522113060579, "learning_rate": 4.396111111111111e-07, "loss": 0.0004, "num_tokens": 990911316.0, "reward": 0.44375, "reward_std": 0.047823245823383334, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8899821281433106, "step": 14610 }, { "completion_length": 340.8, "completions/clipped_ratio": 0.0, "completions/max_length": 340.8, "completions/max_terminated_length": 340.8, "completions/mean_length": 90.078125, "completions/mean_terminated_length": 90.078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01322655555696333, "frac_reward_zero_std": 0.925, "grad_norm": 5.1688008308410645, "kl": 1.1242231484036893, "learning_rate": 4.3957142857142855e-07, "loss": 0.0011, "num_tokens": 991224208.0, "reward": 0.3625, "reward_std": 0.06896176487207413, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9280871272087097, "step": 14615 }, { "completion_length": 499.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 499.2, "completions/max_terminated_length": 420.4, "completions/mean_length": 94.6765625, "completions/mean_terminated_length": 94.14385223388672, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01323108055031159, "frac_reward_zero_std": 0.94375, "grad_norm": 0.026787741109728813, "kl": 1.2133405563305133, "learning_rate": 4.3953174603174605e-07, "loss": 0.0012, "num_tokens": 991543530.0, "reward": 0.4015625, "reward_std": 0.049553670734167096, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9103242754936218, "step": 14620 }, { "completion_length": 357.2, "completions/clipped_ratio": 0.0, "completions/max_length": 357.2, "completions/max_terminated_length": 357.2, "completions/mean_length": 94.00546875, "completions/mean_terminated_length": 94.00546875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01323560554365985, "frac_reward_zero_std": 0.94375, "grad_norm": 12.285889625549316, "kl": 1.0627576350001617, "learning_rate": 4.3949206349206345e-07, "loss": 0.0011, "num_tokens": 991862409.0, "reward": 0.2453125, "reward_std": 0.04955367110669613, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9674586653709412, "step": 14625 }, { "completion_length": 283.6, "completions/clipped_ratio": 0.0, "completions/max_length": 283.6, "completions/max_terminated_length": 283.6, "completions/mean_length": 93.6203125, "completions/mean_terminated_length": 93.6203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01324013053700811, "frac_reward_zero_std": 0.95, "grad_norm": 8.800822257995605, "kl": 0.6258118040394038, "learning_rate": 4.394523809523809e-07, "loss": 0.0006, "num_tokens": 992180419.0, "reward": 0.3359375, "reward_std": 0.04287213869392872, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9246177911758423, "step": 14630 }, { "completion_length": 476.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 476.4, "completions/max_terminated_length": 335.2, "completions/mean_length": 98.1203125, "completions/mean_terminated_length": 96.55000305175781, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013244655530356371, "frac_reward_zero_std": 0.93125, "grad_norm": 39.90863037109375, "kl": 1.8072088692802937, "learning_rate": 4.394126984126984e-07, "loss": 0.0018, "num_tokens": 992506637.0, "reward": 0.365625, "reward_std": 0.06296419948339463, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9070710897445678, "step": 14635 }, { "completion_length": 390.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 93.99921875, "completions/mean_terminated_length": 93.99921875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01324918052370463, "frac_reward_zero_std": 0.9625, "grad_norm": 21.25297737121582, "kl": 1.853051367891021, "learning_rate": 4.393730158730158e-07, "loss": 0.0019, "num_tokens": 992828100.0, "reward": 0.353125, "reward_std": 0.031300367787480354, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9312444567680359, "step": 14640 }, { "completion_length": 452.2, "completions/clipped_ratio": 0.0, "completions/max_length": 452.2, "completions/max_terminated_length": 452.2, "completions/mean_length": 96.28359375, "completions/mean_terminated_length": 96.28359375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.01325370551705289, "frac_reward_zero_std": 0.93125, "grad_norm": 2.1328845024108887, "kl": 4.267087531764991, "learning_rate": 4.393333333333333e-07, "loss": 0.0043, "num_tokens": 993150895.0, "reward": 0.38125, "reward_std": 0.05681436099112034, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9110489606857299, "step": 14645 }, { "completion_length": 395.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 395.8, "completions/max_terminated_length": 355.8, "completions/mean_length": 97.5875, "completions/mean_terminated_length": 97.07184600830078, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01325823051040115, "frac_reward_zero_std": 0.9625, "grad_norm": 9.825688362121582, "kl": 9.45708537721075, "learning_rate": 4.392936507936508e-07, "loss": 0.0095, "num_tokens": 993475887.0, "reward": 0.3921875, "reward_std": 0.03356248214840889, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.8636661767959595, "step": 14650 }, { "completion_length": 337.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 91.33046875, "completions/mean_terminated_length": 91.33046875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01326275550374941, "frac_reward_zero_std": 0.975, "grad_norm": 2.2206833362579346, "kl": 1.1325407927157358, "learning_rate": 4.3925396825396823e-07, "loss": 0.0011, "num_tokens": 993790086.0, "reward": 0.453125, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.453125, "rewards/verify_chess_move/std": 0.8483083724975586, "step": 14655 }, { "completion_length": 486.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 486.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 96.40546875, "completions/mean_terminated_length": 94.82468109130859, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01326728049709767, "frac_reward_zero_std": 0.93125, "grad_norm": 18.187244415283203, "kl": 1.2520648338482716, "learning_rate": 4.392142857142857e-07, "loss": 0.0013, "num_tokens": 994114325.0, "reward": 0.453125, "reward_std": 0.05886430740356445, "rewards/verify_chess_move/mean": 0.453125, "rewards/verify_chess_move/std": 0.8781209945678711, "step": 14660 }, { "completion_length": 446.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 446.8, "completions/max_terminated_length": 442.8, "completions/mean_length": 89.14453125, "completions/mean_terminated_length": 88.61763610839844, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01327180549044593, "frac_reward_zero_std": 0.95, "grad_norm": 12.564739227294922, "kl": 1.2936096174409613, "learning_rate": 4.3917460317460314e-07, "loss": 0.0013, "num_tokens": 994424142.0, "reward": 0.43125, "reward_std": 0.037405284494161604, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.896221661567688, "step": 14665 }, { "completion_length": 436.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.4, "completions/max_terminated_length": 417.2, "completions/mean_length": 92.26484375, "completions/mean_terminated_length": 91.73747406005859, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013276330483794188, "frac_reward_zero_std": 0.95625, "grad_norm": 0.829077959060669, "kl": 0.5872547626262531, "learning_rate": 4.3913492063492065e-07, "loss": 0.0006, "num_tokens": 994740545.0, "reward": 0.296875, "reward_std": 0.03729793019592762, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9519661903381348, "step": 14670 }, { "completion_length": 334.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 90.10703125, "completions/mean_terminated_length": 90.10703125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013280855477142448, "frac_reward_zero_std": 0.96875, "grad_norm": 0.011794797144830227, "kl": 0.32704115002416073, "learning_rate": 4.390952380952381e-07, "loss": 0.0003, "num_tokens": 995053874.0, "reward": 0.3875, "reward_std": 0.027563939988613128, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9043850660324096, "step": 14675 }, { "completion_length": 384.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 384.2, "completions/max_terminated_length": 287.2, "completions/mean_length": 95.93359375, "completions/mean_terminated_length": 95.39720001220704, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.013285380470490708, "frac_reward_zero_std": 0.95, "grad_norm": 0.013765871524810791, "kl": 1.0149735984043218, "learning_rate": 4.390555555555555e-07, "loss": 0.001, "num_tokens": 995378589.0, "reward": 0.33125, "reward_std": 0.04013920240104198, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9404839277267456, "step": 14680 }, { "completion_length": 450.2, "completions/clipped_ratio": 0.0, "completions/max_length": 450.2, "completions/max_terminated_length": 450.2, "completions/mean_length": 99.24765625, "completions/mean_terminated_length": 99.24765625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013289905463838968, "frac_reward_zero_std": 0.93125, "grad_norm": 2.1545608043670654, "kl": 0.6446047536097467, "learning_rate": 4.39015873015873e-07, "loss": 0.0006, "num_tokens": 995708626.0, "reward": 0.3015625, "reward_std": 0.06023126617074013, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9396051883697509, "step": 14685 }, { "completion_length": 399.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 85.4421875, "completions/mean_terminated_length": 85.4421875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013294430457187228, "frac_reward_zero_std": 0.975, "grad_norm": 0.004250905476510525, "kl": 0.17556245173327625, "learning_rate": 4.3897619047619046e-07, "loss": 0.0002, "num_tokens": 996012520.0, "reward": 0.43125, "reward_std": 0.022461533173918725, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.9026264548301697, "step": 14690 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 93.2875, "completions/mean_terminated_length": 93.2875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013298955450535487, "frac_reward_zero_std": 0.975, "grad_norm": 1.2418899536132812, "kl": 0.3222371890442446, "learning_rate": 4.389365079365079e-07, "loss": 0.0003, "num_tokens": 996332016.0, "reward": 0.3203125, "reward_std": 0.019939782842993737, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9336365103721619, "step": 14695 }, { "completion_length": 328.8, "completions/clipped_ratio": 0.0, "completions/max_length": 328.8, "completions/max_terminated_length": 328.8, "completions/mean_length": 95.453125, "completions/mean_terminated_length": 95.453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013303480443883747, "frac_reward_zero_std": 0.95625, "grad_norm": 11.571839332580566, "kl": 0.5027659797342494, "learning_rate": 4.3889682539682537e-07, "loss": 0.0005, "num_tokens": 996654052.0, "reward": 0.2984375, "reward_std": 0.03571978472173214, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9481733322143555, "step": 14700 }, { "completion_length": 333.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 92.74921875, "completions/mean_terminated_length": 92.74921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013308005437232007, "frac_reward_zero_std": 0.9625, "grad_norm": 3.196136236190796, "kl": 1.038936357642524, "learning_rate": 4.388571428571428e-07, "loss": 0.001, "num_tokens": 996971915.0, "reward": 0.2953125, "reward_std": 0.03240768946707249, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9361641049385071, "step": 14705 }, { "completion_length": 410.4, "completions/clipped_ratio": 0.0, "completions/max_length": 410.4, "completions/max_terminated_length": 410.4, "completions/mean_length": 94.796875, "completions/mean_terminated_length": 94.796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013312530430580267, "frac_reward_zero_std": 0.95, "grad_norm": 13.56872272491455, "kl": 0.972387383505702, "learning_rate": 4.3881746031746033e-07, "loss": 0.001, "num_tokens": 997293815.0, "reward": 0.275, "reward_std": 0.04833899140357971, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9515522003173829, "step": 14710 }, { "completion_length": 416.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 416.8, "completions/max_terminated_length": 330.0, "completions/mean_length": 90.56953125, "completions/mean_terminated_length": 90.03464813232422, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013317055423928527, "frac_reward_zero_std": 0.9625, "grad_norm": 0.8028386235237122, "kl": 2.9003309698658994, "learning_rate": 4.3877777777777773e-07, "loss": 0.0029, "num_tokens": 997608520.0, "reward": 0.340625, "reward_std": 0.031984337419271466, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9347674369812011, "step": 14715 }, { "completion_length": 308.6, "completions/clipped_ratio": 0.0, "completions/max_length": 308.6, "completions/max_terminated_length": 308.6, "completions/mean_length": 95.86953125, "completions/mean_terminated_length": 95.86953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013321580417276787, "frac_reward_zero_std": 0.96875, "grad_norm": 0.09574650973081589, "kl": 0.8852377810748294, "learning_rate": 4.3873809523809524e-07, "loss": 0.0009, "num_tokens": 997933081.0, "reward": 0.3171875, "reward_std": 0.028246928378939627, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9446279048919678, "step": 14720 }, { "completion_length": 410.4, "completions/clipped_ratio": 0.0, "completions/max_length": 410.4, "completions/max_terminated_length": 410.4, "completions/mean_length": 96.51875, "completions/mean_terminated_length": 96.51875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013326105410625046, "frac_reward_zero_std": 0.96875, "grad_norm": 0.003912066109478474, "kl": 1.86598213436082, "learning_rate": 4.386984126984127e-07, "loss": 0.0019, "num_tokens": 998258345.0, "reward": 0.4234375, "reward_std": 0.029826052486896515, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8913766026496888, "step": 14725 }, { "completion_length": 328.2, "completions/clipped_ratio": 0.0, "completions/max_length": 328.2, "completions/max_terminated_length": 328.2, "completions/mean_length": 92.4203125, "completions/mean_terminated_length": 92.4203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013330630403973306, "frac_reward_zero_std": 0.9625, "grad_norm": 37.9493293762207, "kl": 0.5052173983072862, "learning_rate": 4.386587301587301e-07, "loss": 0.0005, "num_tokens": 998574651.0, "reward": 0.3734375, "reward_std": 0.0344576358795166, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9227065682411194, "step": 14730 }, { "completion_length": 399.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 399.2, "completions/max_terminated_length": 298.0, "completions/mean_length": 90.55703125, "completions/mean_terminated_length": 90.02671966552734, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013335155397321566, "frac_reward_zero_std": 0.925, "grad_norm": 23.428152084350586, "kl": 1.2482829907443374, "learning_rate": 4.386190476190476e-07, "loss": 0.0012, "num_tokens": 998890940.0, "reward": 0.30625, "reward_std": 0.06759578585624695, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9351041078567505, "step": 14735 }, { "completion_length": 337.6, "completions/clipped_ratio": 0.0, "completions/max_length": 337.6, "completions/max_terminated_length": 337.6, "completions/mean_length": 88.40078125, "completions/mean_terminated_length": 88.40078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013339680390669826, "frac_reward_zero_std": 0.9375, "grad_norm": 1.5543763637542725, "kl": 2.2931476834695785, "learning_rate": 4.3857936507936505e-07, "loss": 0.0023, "num_tokens": 999202333.0, "reward": 0.3921875, "reward_std": 0.0539740689098835, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9188912510871887, "step": 14740 }, { "completion_length": 338.8, "completions/clipped_ratio": 0.0, "completions/max_length": 338.8, "completions/max_terminated_length": 338.8, "completions/mean_length": 86.9078125, "completions/mean_terminated_length": 86.9078125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013344205384018086, "frac_reward_zero_std": 0.975, "grad_norm": 0.03583265841007233, "kl": 4.319262890308164, "learning_rate": 4.3853968253968256e-07, "loss": 0.0043, "num_tokens": 999509847.0, "reward": 0.4359375, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.9000658392906189, "step": 14745 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.0, "completions/max_length": 457.4, "completions/max_terminated_length": 457.4, "completions/mean_length": 92.853125, "completions/mean_terminated_length": 92.853125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013348730377366346, "frac_reward_zero_std": 0.975, "grad_norm": 3.799739360809326, "kl": 1.292414071620442, "learning_rate": 4.3849999999999996e-07, "loss": 0.0013, "num_tokens": 999826883.0, "reward": 0.5046875, "reward_std": 0.02109457477927208, "rewards/verify_chess_move/mean": 0.5046875, "rewards/verify_chess_move/std": 0.8494638442993164, "step": 14750 }, { "completion_length": 324.2, "completions/clipped_ratio": 0.0, "completions/max_length": 324.2, "completions/max_terminated_length": 324.2, "completions/mean_length": 93.265625, "completions/mean_terminated_length": 93.265625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013353255370714604, "frac_reward_zero_std": 0.9625, "grad_norm": 6.390530586242676, "kl": 0.7669282756978646, "learning_rate": 4.384603174603174e-07, "loss": 0.0008, "num_tokens": 1000145623.0, "reward": 0.4421875, "reward_std": 0.031983356922864914, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8948202848434448, "step": 14755 }, { "completion_length": 341.4, "completions/clipped_ratio": 0.0, "completions/max_length": 341.4, "completions/max_terminated_length": 341.4, "completions/mean_length": 85.45546875, "completions/mean_terminated_length": 85.45546875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013357780364062865, "frac_reward_zero_std": 0.925, "grad_norm": 10.856996536254883, "kl": 0.9431827245047316, "learning_rate": 4.384206349206349e-07, "loss": 0.0009, "num_tokens": 1000451358.0, "reward": 0.3796875, "reward_std": 0.0669127967208624, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9235788464546204, "step": 14760 }, { "completion_length": 430.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 430.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 96.31484375, "completions/mean_terminated_length": 95.7857177734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013362305357411125, "frac_reward_zero_std": 0.95625, "grad_norm": 11.148587226867676, "kl": 2.807739374239463, "learning_rate": 4.383809523809524e-07, "loss": 0.0028, "num_tokens": 1000776729.0, "reward": 0.3703125, "reward_std": 0.03908922150731087, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.920844841003418, "step": 14765 }, { "completion_length": 368.2, "completions/clipped_ratio": 0.0, "completions/max_length": 368.2, "completions/max_terminated_length": 368.2, "completions/mean_length": 85.75546875, "completions/mean_terminated_length": 85.75546875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013366830350759385, "frac_reward_zero_std": 0.975, "grad_norm": 0.5722194314002991, "kl": 0.5293825183762237, "learning_rate": 4.3834126984126983e-07, "loss": 0.0005, "num_tokens": 1001082192.0, "reward": 0.390625, "reward_std": 0.019727616757154464, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9166219234466553, "step": 14770 }, { "completion_length": 332.6, "completions/clipped_ratio": 0.0, "completions/max_length": 332.6, "completions/max_terminated_length": 332.6, "completions/mean_length": 90.0890625, "completions/mean_terminated_length": 90.0890625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013371355344107645, "frac_reward_zero_std": 0.96875, "grad_norm": 4.957676410675049, "kl": 0.8111614233930595, "learning_rate": 4.383015873015873e-07, "loss": 0.0008, "num_tokens": 1001396762.0, "reward": 0.3109375, "reward_std": 0.024831004068255426, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9513146162033081, "step": 14775 }, { "completion_length": 321.6, "completions/clipped_ratio": 0.0, "completions/max_length": 321.6, "completions/max_terminated_length": 321.6, "completions/mean_length": 89.86875, "completions/mean_terminated_length": 89.86875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013375880337455903, "frac_reward_zero_std": 0.975, "grad_norm": 0.13254468142986298, "kl": 0.2530745025025681, "learning_rate": 4.3826190476190474e-07, "loss": 0.0003, "num_tokens": 1001711378.0, "reward": 0.4234375, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.9058368563652038, "step": 14780 }, { "completion_length": 451.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 451.4, "completions/max_terminated_length": 369.6, "completions/mean_length": 88.8171875, "completions/mean_terminated_length": 88.29928131103516, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013380405330804163, "frac_reward_zero_std": 0.9375, "grad_norm": 7.2608208656311035, "kl": 0.4667541048140265, "learning_rate": 4.382222222222222e-07, "loss": 0.0005, "num_tokens": 1002021624.0, "reward": 0.3125, "reward_std": 0.04987319335341454, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.933510673046112, "step": 14785 }, { "completion_length": 308.6, "completions/clipped_ratio": 0.0, "completions/max_length": 308.6, "completions/max_terminated_length": 308.6, "completions/mean_length": 84.6109375, "completions/mean_terminated_length": 84.6109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013384930324152423, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0020355398301035166, "kl": 0.8969576860312373, "learning_rate": 4.3818253968253965e-07, "loss": 0.0009, "num_tokens": 1002327278.0, "reward": 0.4609375, "reward_std": 0.04160999022424221, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8658651113510132, "step": 14790 }, { "completion_length": 299.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 93.07265625, "completions/mean_terminated_length": 93.07265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013389455317500683, "frac_reward_zero_std": 0.96875, "grad_norm": 0.5554459691047668, "kl": 0.4218937170226127, "learning_rate": 4.3814285714285715e-07, "loss": 0.0004, "num_tokens": 1002649947.0, "reward": 0.3828125, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.917918598651886, "step": 14795 }, { "completion_length": 421.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 421.2, "completions/max_terminated_length": 325.4, "completions/mean_length": 83.98046875, "completions/mean_terminated_length": 83.45374145507813, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013393980310848944, "frac_reward_zero_std": 0.9875, "grad_norm": 4.300911903381348, "kl": 0.9454974644817412, "learning_rate": 4.381031746031746e-07, "loss": 0.0009, "num_tokens": 1002953090.0, "reward": 0.428125, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8915583729743958, "step": 14800 }, { "completion_length": 357.8, "completions/clipped_ratio": 0.0, "completions/max_length": 357.8, "completions/max_terminated_length": 357.8, "completions/mean_length": 83.8765625, "completions/mean_terminated_length": 83.8765625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013398505304197204, "frac_reward_zero_std": 0.9625, "grad_norm": 0.018460026010870934, "kl": 0.6636011595604941, "learning_rate": 4.38063492063492e-07, "loss": 0.0007, "num_tokens": 1003254764.0, "reward": 0.384375, "reward_std": 0.034929439425468445, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9114007353782654, "step": 14805 }, { "completion_length": 572.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 572.8, "completions/max_terminated_length": 483.6, "completions/mean_length": 102.57265625, "completions/mean_terminated_length": 102.05454711914062, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013403030297545462, "frac_reward_zero_std": 0.95, "grad_norm": 8.216849327087402, "kl": 1.2387624700553714, "learning_rate": 4.380238095238095e-07, "loss": 0.0012, "num_tokens": 1003590001.0, "reward": 0.284375, "reward_std": 0.04308430440723896, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9578689932823181, "step": 14810 }, { "completion_length": 348.4, "completions/clipped_ratio": 0.0, "completions/max_length": 348.4, "completions/max_terminated_length": 348.4, "completions/mean_length": 91.07421875, "completions/mean_terminated_length": 91.07421875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013407555290893722, "frac_reward_zero_std": 0.95, "grad_norm": 0.017519937828183174, "kl": 0.9842917426954955, "learning_rate": 4.3798412698412697e-07, "loss": 0.001, "num_tokens": 1003906160.0, "reward": 0.4328125, "reward_std": 0.04492208622395992, "rewards/verify_chess_move/mean": 0.4328125, "rewards/verify_chess_move/std": 0.889263117313385, "step": 14815 }, { "completion_length": 532.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 532.2, "completions/max_terminated_length": 482.8, "completions/mean_length": 101.64140625, "completions/mean_terminated_length": 100.59532928466797, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013412080284241982, "frac_reward_zero_std": 0.9625, "grad_norm": 7.953854084014893, "kl": 0.604294177016709, "learning_rate": 4.379444444444444e-07, "loss": 0.0006, "num_tokens": 1004238117.0, "reward": 0.2515625, "reward_std": 0.03287851139903068, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9374019265174866, "step": 14820 }, { "completion_length": 329.6, "completions/clipped_ratio": 0.0, "completions/max_length": 329.6, "completions/max_terminated_length": 329.6, "completions/mean_length": 93.48125, "completions/mean_terminated_length": 93.48125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013416605277590242, "frac_reward_zero_std": 0.975, "grad_norm": 0.16808964312076569, "kl": 0.310765050817281, "learning_rate": 4.379047619047619e-07, "loss": 0.0003, "num_tokens": 1004555949.0, "reward": 0.4234375, "reward_std": 0.022673700004816055, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8952791094779968, "step": 14825 }, { "completion_length": 299.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 90.06015625, "completions/mean_terminated_length": 90.06015625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013421130270938502, "frac_reward_zero_std": 0.95, "grad_norm": 6.890025615692139, "kl": 1.7810909907100723, "learning_rate": 4.3786507936507933e-07, "loss": 0.0018, "num_tokens": 1004870506.0, "reward": 0.2984375, "reward_std": 0.03945621512830257, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9483781695365906, "step": 14830 }, { "completion_length": 397.4, "completions/clipped_ratio": 0.0, "completions/max_length": 397.4, "completions/max_terminated_length": 397.4, "completions/mean_length": 91.44375, "completions/mean_terminated_length": 91.44375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01342565526428676, "frac_reward_zero_std": 0.96875, "grad_norm": 11.716842651367188, "kl": 0.4327281669480726, "learning_rate": 4.3782539682539684e-07, "loss": 0.0004, "num_tokens": 1005185122.0, "reward": 0.4671875, "reward_std": 0.028930897638201714, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.883440899848938, "step": 14835 }, { "completion_length": 396.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 96.15546875, "completions/mean_terminated_length": 96.15546875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01343018025763502, "frac_reward_zero_std": 0.9375, "grad_norm": 18.954204559326172, "kl": 1.3524228390771895, "learning_rate": 4.3778571428571424e-07, "loss": 0.0014, "num_tokens": 1005507073.0, "reward": 0.471875, "reward_std": 0.05581184923648834, "rewards/verify_chess_move/mean": 0.471875, "rewards/verify_chess_move/std": 0.8781631231307984, "step": 14840 }, { "completion_length": 296.6, "completions/clipped_ratio": 0.0, "completions/max_length": 296.6, "completions/max_terminated_length": 296.6, "completions/mean_length": 92.55703125, "completions/mean_terminated_length": 92.55703125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013434705250983281, "frac_reward_zero_std": 0.9375, "grad_norm": 12.131851196289062, "kl": 1.4427613539621233, "learning_rate": 4.3774603174603175e-07, "loss": 0.0014, "num_tokens": 1005825866.0, "reward": 0.3140625, "reward_std": 0.04966102614998817, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9497156023979187, "step": 14845 }, { "completion_length": 351.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 92.1734375, "completions/mean_terminated_length": 92.1734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013439230244331541, "frac_reward_zero_std": 0.96875, "grad_norm": 0.40131819248199463, "kl": 7.740032546082512, "learning_rate": 4.377063492063492e-07, "loss": 0.0077, "num_tokens": 1006144136.0, "reward": 0.303125, "reward_std": 0.02346404567360878, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9436566233634949, "step": 14850 }, { "completion_length": 366.4, "completions/clipped_ratio": 0.0, "completions/max_length": 366.4, "completions/max_terminated_length": 366.4, "completions/mean_length": 88.4625, "completions/mean_terminated_length": 88.4625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013443755237679801, "frac_reward_zero_std": 0.975, "grad_norm": 10.185698509216309, "kl": 0.7815671907039359, "learning_rate": 4.376666666666666e-07, "loss": 0.0008, "num_tokens": 1006453984.0, "reward": 0.5125, "reward_std": 0.023356688767671586, "rewards/verify_chess_move/mean": 0.5125, "rewards/verify_chess_move/std": 0.8451654434204101, "step": 14855 }, { "completion_length": 417.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 417.4, "completions/max_terminated_length": 416.8, "completions/mean_length": 88.3609375, "completions/mean_terminated_length": 87.29384765625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013448280231028061, "frac_reward_zero_std": 0.94375, "grad_norm": 2.6318438053131104, "kl": 6.0079337219125595, "learning_rate": 4.376269841269841e-07, "loss": 0.006, "num_tokens": 1006762814.0, "reward": 0.46875, "reward_std": 0.04887068048119545, "rewards/verify_chess_move/mean": 0.46875, "rewards/verify_chess_move/std": 0.8654278159141541, "step": 14860 }, { "completion_length": 399.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 86.44375, "completions/mean_terminated_length": 86.44375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.01345280522437632, "frac_reward_zero_std": 0.99375, "grad_norm": 0.3744383454322815, "kl": 7.256154853734188, "learning_rate": 4.3758730158730156e-07, "loss": 0.0073, "num_tokens": 1007070190.0, "reward": 0.2734375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9556743860244751, "step": 14865 }, { "completion_length": 382.8, "completions/clipped_ratio": 0.0, "completions/max_length": 382.8, "completions/max_terminated_length": 382.8, "completions/mean_length": 91.0328125, "completions/mean_terminated_length": 91.0328125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01345733021772458, "frac_reward_zero_std": 0.925, "grad_norm": 25.58297348022461, "kl": 4.266804535640404, "learning_rate": 4.3754761904761907e-07, "loss": 0.0043, "num_tokens": 1007384840.0, "reward": 0.4609375, "reward_std": 0.06870310753583908, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8751269102096557, "step": 14870 }, { "completion_length": 599.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 599.0, "completions/max_terminated_length": 498.8, "completions/mean_length": 90.69453125, "completions/mean_terminated_length": 88.06837158203125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01346185521107284, "frac_reward_zero_std": 0.9625, "grad_norm": 9.052474975585938, "kl": 0.7629543295595795, "learning_rate": 4.3750793650793647e-07, "loss": 0.0008, "num_tokens": 1007698297.0, "reward": 0.421875, "reward_std": 0.03745020925998688, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8942368865013123, "step": 14875 }, { "completion_length": 312.4, "completions/clipped_ratio": 0.0, "completions/max_length": 312.4, "completions/max_terminated_length": 312.4, "completions/mean_length": 92.18359375, "completions/mean_terminated_length": 92.18359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0134663802044211, "frac_reward_zero_std": 0.975, "grad_norm": 0.0024107571225613356, "kl": 0.34370225835591556, "learning_rate": 4.374682539682539e-07, "loss": 0.0003, "num_tokens": 1008015628.0, "reward": 0.475, "reward_std": 0.02041158638894558, "rewards/verify_chess_move/mean": 0.475, "rewards/verify_chess_move/std": 0.861469280719757, "step": 14880 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/max_terminated_length": 359.2, "completions/mean_length": 84.91796875, "completions/mean_terminated_length": 84.91796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01347090519776936, "frac_reward_zero_std": 0.95625, "grad_norm": 4.677147388458252, "kl": 0.5045423929346725, "learning_rate": 4.3742857142857143e-07, "loss": 0.0005, "num_tokens": 1008321379.0, "reward": 0.49375, "reward_std": 0.041610969603061675, "rewards/verify_chess_move/mean": 0.49375, "rewards/verify_chess_move/std": 0.8673789381980896, "step": 14885 }, { "completion_length": 312.2, "completions/clipped_ratio": 0.0, "completions/max_length": 312.2, "completions/max_terminated_length": 312.2, "completions/mean_length": 94.8375, "completions/mean_terminated_length": 94.8375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.013475430191117618, "frac_reward_zero_std": 0.95625, "grad_norm": 0.009268849156796932, "kl": 2.072720892401412, "learning_rate": 4.373888888888889e-07, "loss": 0.0021, "num_tokens": 1008643147.0, "reward": 0.365625, "reward_std": 0.035247981920838355, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9247413992881774, "step": 14890 }, { "completion_length": 398.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 398.4, "completions/max_terminated_length": 397.6, "completions/mean_length": 100.15078125, "completions/mean_terminated_length": 99.63030090332032, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013479955184465878, "frac_reward_zero_std": 0.95, "grad_norm": 0.01074539590626955, "kl": 0.8126546634128318, "learning_rate": 4.3734920634920634e-07, "loss": 0.0008, "num_tokens": 1008973236.0, "reward": 0.3421875, "reward_std": 0.04035136960446835, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9281939029693603, "step": 14895 }, { "completion_length": 468.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 468.4, "completions/max_terminated_length": 462.4, "completions/mean_length": 85.9046875, "completions/mean_terminated_length": 85.38071746826172, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013484480177814138, "frac_reward_zero_std": 0.98125, "grad_norm": 3.0416531562805176, "kl": 1.9359319523908198, "learning_rate": 4.373095238095238e-07, "loss": 0.0019, "num_tokens": 1009281266.0, "reward": 0.25, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9633062839508056, "step": 14900 }, { "completion_length": 407.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 407.6, "completions/max_terminated_length": 375.2, "completions/mean_length": 100.1796875, "completions/mean_terminated_length": 99.65582885742188, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013489005171162399, "frac_reward_zero_std": 0.95, "grad_norm": 22.809778213500977, "kl": 0.5512907987111249, "learning_rate": 4.3726984126984125e-07, "loss": 0.0006, "num_tokens": 1009610752.0, "reward": 0.3171875, "reward_std": 0.04287213832139969, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9127422094345092, "step": 14905 }, { "completion_length": 296.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 87.12890625, "completions/mean_terminated_length": 87.12890625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013493530164510659, "frac_reward_zero_std": 0.96875, "grad_norm": 6.823322772979736, "kl": 0.908563616941683, "learning_rate": 4.372301587301587e-07, "loss": 0.0009, "num_tokens": 1009919173.0, "reward": 0.2984375, "reward_std": 0.027776104956865312, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9506171226501465, "step": 14910 }, { "completion_length": 409.8, "completions/clipped_ratio": 0.0, "completions/max_length": 409.8, "completions/max_terminated_length": 409.8, "completions/mean_length": 91.78671875, "completions/mean_terminated_length": 91.78671875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013498055157858919, "frac_reward_zero_std": 0.94375, "grad_norm": 0.1224861592054367, "kl": 2.7429630754049867, "learning_rate": 4.3719047619047616e-07, "loss": 0.0027, "num_tokens": 1010234652.0, "reward": 0.471875, "reward_std": 0.05297057554125786, "rewards/verify_chess_move/mean": 0.471875, "rewards/verify_chess_move/std": 0.8751039624214172, "step": 14915 }, { "completion_length": 306.4, "completions/clipped_ratio": 0.0, "completions/max_length": 306.4, "completions/max_terminated_length": 306.4, "completions/mean_length": 84.12265625, "completions/mean_terminated_length": 84.12265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013502580151207177, "frac_reward_zero_std": 0.96875, "grad_norm": 1.123600959777832, "kl": 0.665639515616931, "learning_rate": 4.371507936507936e-07, "loss": 0.0007, "num_tokens": 1010539105.0, "reward": 0.3671875, "reward_std": 0.025726158171892166, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9219825744628907, "step": 14920 }, { "completion_length": 420.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 420.6, "completions/max_terminated_length": 324.6, "completions/mean_length": 90.88125, "completions/mean_terminated_length": 90.35788116455078, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013507105144555437, "frac_reward_zero_std": 0.95625, "grad_norm": 4.3979010581970215, "kl": 1.1908809190033935, "learning_rate": 4.371111111111111e-07, "loss": 0.0012, "num_tokens": 1010853321.0, "reward": 0.2765625, "reward_std": 0.037769732996821404, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9484102249145507, "step": 14925 }, { "completion_length": 277.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 89.00703125, "completions/mean_terminated_length": 89.00703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013511630137903697, "frac_reward_zero_std": 0.95625, "grad_norm": 2.9292819499969482, "kl": 1.239912196388468, "learning_rate": 4.370714285714285e-07, "loss": 0.0012, "num_tokens": 1011166234.0, "reward": 0.459375, "reward_std": 0.037086743861436844, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8709755182266236, "step": 14930 }, { "completion_length": 534.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 534.2, "completions/max_terminated_length": 347.2, "completions/mean_length": 92.29921875, "completions/mean_terminated_length": 91.23370361328125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013516155131251957, "frac_reward_zero_std": 0.925, "grad_norm": 29.625804901123047, "kl": 1.989761516766157, "learning_rate": 4.37031746031746e-07, "loss": 0.002, "num_tokens": 1011484785.0, "reward": 0.2125, "reward_std": 0.06896176300942898, "rewards/verify_chess_move/mean": 0.2125, "rewards/verify_chess_move/std": 0.9733428835868836, "step": 14935 }, { "completion_length": 387.2, "completions/clipped_ratio": 0.0, "completions/max_length": 387.2, "completions/max_terminated_length": 387.2, "completions/mean_length": 84.45078125, "completions/mean_terminated_length": 84.45078125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013520680124600217, "frac_reward_zero_std": 0.98125, "grad_norm": 17.3248291015625, "kl": 1.2328233361127787, "learning_rate": 4.369920634920635e-07, "loss": 0.0012, "num_tokens": 1011788682.0, "reward": 0.3328125, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9181019306182862, "step": 14940 }, { "completion_length": 353.8, "completions/clipped_ratio": 0.0, "completions/max_length": 353.8, "completions/max_terminated_length": 353.8, "completions/mean_length": 91.86796875, "completions/mean_terminated_length": 91.86796875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013525205117948476, "frac_reward_zero_std": 0.96875, "grad_norm": 13.685101509094238, "kl": 0.27511859270744027, "learning_rate": 4.369523809523809e-07, "loss": 0.0003, "num_tokens": 1012107057.0, "reward": 0.34375, "reward_std": 0.027563939988613128, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9227148175239563, "step": 14945 }, { "completion_length": 422.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 422.2, "completions/max_terminated_length": 363.8, "completions/mean_length": 92.00625, "completions/mean_terminated_length": 90.93536529541015, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013529730111296736, "frac_reward_zero_std": 0.95625, "grad_norm": 3.5220654010772705, "kl": 3.2921493934001775, "learning_rate": 4.369126984126984e-07, "loss": 0.0033, "num_tokens": 1012426641.0, "reward": 0.134375, "reward_std": 0.035247981920838355, "rewards/verify_chess_move/mean": 0.134375, "rewards/verify_chess_move/std": 0.9814298152923584, "step": 14950 }, { "completion_length": 269.2, "completions/clipped_ratio": 0.0, "completions/max_length": 269.2, "completions/max_terminated_length": 269.2, "completions/mean_length": 78.88828125, "completions/mean_terminated_length": 78.88828125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013534255104644996, "frac_reward_zero_std": 0.975, "grad_norm": 9.294381141662598, "kl": 1.2187961819116027, "learning_rate": 4.3687301587301584e-07, "loss": 0.0012, "num_tokens": 1012722666.0, "reward": 0.4625, "reward_std": 0.019727616757154464, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.8037056982517242, "step": 14955 }, { "completion_length": 286.8, "completions/clipped_ratio": 0.0, "completions/max_length": 286.8, "completions/max_terminated_length": 286.8, "completions/mean_length": 90.06953125, "completions/mean_terminated_length": 90.06953125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013538780097993256, "frac_reward_zero_std": 0.96875, "grad_norm": 0.19984596967697144, "kl": 0.7680414567235857, "learning_rate": 4.3683333333333335e-07, "loss": 0.0008, "num_tokens": 1013038627.0, "reward": 0.2453125, "reward_std": 0.024831003695726394, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9681201934814453, "step": 14960 }, { "completion_length": 304.6, "completions/clipped_ratio": 0.0, "completions/max_length": 304.6, "completions/max_terminated_length": 304.6, "completions/mean_length": 88.54140625, "completions/mean_terminated_length": 88.54140625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013543305091341516, "frac_reward_zero_std": 0.95625, "grad_norm": 15.43554401397705, "kl": 0.9109444645699114, "learning_rate": 4.3679365079365075e-07, "loss": 0.0009, "num_tokens": 1013351088.0, "reward": 0.4140625, "reward_std": 0.038664887100458144, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.894789743423462, "step": 14965 }, { "completion_length": 271.4, "completions/clipped_ratio": 0.0, "completions/max_length": 271.4, "completions/max_terminated_length": 271.4, "completions/mean_length": 95.13125, "completions/mean_terminated_length": 95.13125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013547830084689776, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002927103778347373, "kl": 2.971206295560114, "learning_rate": 4.367539682539682e-07, "loss": 0.003, "num_tokens": 1013675920.0, "reward": 0.221875, "reward_std": 0.02709311656653881, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.9571886181831359, "step": 14970 }, { "completion_length": 280.6, "completions/clipped_ratio": 0.0, "completions/max_length": 280.6, "completions/max_terminated_length": 280.6, "completions/mean_length": 87.075, "completions/mean_terminated_length": 87.075, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013552355078038035, "frac_reward_zero_std": 0.9625, "grad_norm": 9.257515907287598, "kl": 4.754195170034654, "learning_rate": 4.367142857142857e-07, "loss": 0.0048, "num_tokens": 1013986096.0, "reward": 0.45, "reward_std": 0.03424547016620636, "rewards/verify_chess_move/mean": 0.45, "rewards/verify_chess_move/std": 0.8817134022712707, "step": 14975 }, { "completion_length": 377.2, "completions/clipped_ratio": 0.0, "completions/max_length": 377.2, "completions/max_terminated_length": 377.2, "completions/mean_length": 85.22578125, "completions/mean_terminated_length": 85.22578125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013556880071386295, "frac_reward_zero_std": 0.95, "grad_norm": 21.291542053222656, "kl": 3.6908357434906067, "learning_rate": 4.3667460317460316e-07, "loss": 0.0037, "num_tokens": 1014292913.0, "reward": 0.3765625, "reward_std": 0.04171734638512135, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9217031836509705, "step": 14980 }, { "completion_length": 395.4, "completions/clipped_ratio": 0.0, "completions/max_length": 395.4, "completions/max_terminated_length": 395.4, "completions/mean_length": 91.40546875, "completions/mean_terminated_length": 91.40546875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013561405064734555, "frac_reward_zero_std": 0.95625, "grad_norm": 1.8076000213623047, "kl": 8.435619041300379, "learning_rate": 4.366349206349206e-07, "loss": 0.0084, "num_tokens": 1014610416.0, "reward": 0.384375, "reward_std": 0.04161097034811974, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9146311998367309, "step": 14985 }, { "completion_length": 353.6, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/max_terminated_length": 353.6, "completions/mean_length": 87.03046875, "completions/mean_terminated_length": 87.03046875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013565930058082815, "frac_reward_zero_std": 0.9375, "grad_norm": 13.966277122497559, "kl": 4.1408995111705735, "learning_rate": 4.3659523809523807e-07, "loss": 0.0041, "num_tokens": 1014920319.0, "reward": 0.2703125, "reward_std": 0.05124015100300312, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9386923313140869, "step": 14990 }, { "completion_length": 372.8, "completions/clipped_ratio": 0.0, "completions/max_length": 372.8, "completions/max_terminated_length": 372.8, "completions/mean_length": 88.08984375, "completions/mean_terminated_length": 88.08984375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013570455051431075, "frac_reward_zero_std": 0.9625, "grad_norm": 26.267406463623047, "kl": 1.7531853915425017, "learning_rate": 4.365555555555555e-07, "loss": 0.0018, "num_tokens": 1015231130.0, "reward": 0.378125, "reward_std": 0.03014557622373104, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9231788516044617, "step": 14995 }, { "completion_length": 506.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 506.8, "completions/max_terminated_length": 475.0, "completions/mean_length": 94.63984375, "completions/mean_terminated_length": 94.11874847412109, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013574980044779333, "frac_reward_zero_std": 0.93125, "grad_norm": 12.083730697631836, "kl": 1.2527525599114597, "learning_rate": 4.36515873015873e-07, "loss": 0.0013, "num_tokens": 1015551581.0, "reward": 0.453125, "reward_std": 0.05408044382929802, "rewards/verify_chess_move/mean": 0.453125, "rewards/verify_chess_move/std": 0.8850670576095581, "step": 15000 }, { "completion_length": 505.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 505.2, "completions/max_terminated_length": 428.0, "completions/mean_length": 95.5875, "completions/mean_terminated_length": 94.54169006347657, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013579505038127593, "frac_reward_zero_std": 0.975, "grad_norm": 0.17025701701641083, "kl": 0.4081786280730739, "learning_rate": 4.3647619047619043e-07, "loss": 0.0004, "num_tokens": 1015874981.0, "reward": 0.35625, "reward_std": 0.02041158601641655, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9320450782775879, "step": 15005 }, { "completion_length": 344.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 90.1203125, "completions/mean_terminated_length": 90.1203125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013584030031475854, "frac_reward_zero_std": 0.975, "grad_norm": 6.5499420166015625, "kl": 0.4726567645790055, "learning_rate": 4.3643650793650794e-07, "loss": 0.0005, "num_tokens": 1016188767.0, "reward": 0.428125, "reward_std": 0.02177756391465664, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8567200899124146, "step": 15010 }, { "completion_length": 488.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 488.2, "completions/max_terminated_length": 332.8, "completions/mean_length": 89.4328125, "completions/mean_terminated_length": 88.37394104003906, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.013588555024824114, "frac_reward_zero_std": 0.9625, "grad_norm": 7.364249229431152, "kl": 0.22737963137915357, "learning_rate": 4.363968253968254e-07, "loss": 0.0002, "num_tokens": 1016500337.0, "reward": 0.253125, "reward_std": 0.02925041988492012, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9419628024101258, "step": 15015 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 90.62109375, "completions/mean_terminated_length": 90.62109375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013593080018172374, "frac_reward_zero_std": 0.9375, "grad_norm": 29.323057174682617, "kl": 1.4536695244722069, "learning_rate": 4.363571428571428e-07, "loss": 0.0015, "num_tokens": 1016814436.0, "reward": 0.428125, "reward_std": 0.05944092087447643, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8715027570724487, "step": 15020 }, { "completion_length": 369.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 369.8, "completions/max_terminated_length": 352.8, "completions/mean_length": 96.75703125, "completions/mean_terminated_length": 96.22270050048829, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.013597605011520634, "frac_reward_zero_std": 0.95, "grad_norm": 5.961207389831543, "kl": 1.1388322644284927, "learning_rate": 4.363174603174603e-07, "loss": 0.0011, "num_tokens": 1017140509.0, "reward": 0.2859375, "reward_std": 0.04513523355126381, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9592764139175415, "step": 15025 }, { "completion_length": 340.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 85.575, "completions/mean_terminated_length": 85.575, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013602130004868892, "frac_reward_zero_std": 0.9625, "grad_norm": 0.8387723565101624, "kl": 0.26348596124444157, "learning_rate": 4.3627777777777776e-07, "loss": 0.0003, "num_tokens": 1017446301.0, "reward": 0.371875, "reward_std": 0.032195523381233215, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9209322094917297, "step": 15030 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/max_terminated_length": 359.2, "completions/mean_length": 95.91484375, "completions/mean_terminated_length": 95.91484375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013606654998217152, "frac_reward_zero_std": 0.90625, "grad_norm": 10.59262466430664, "kl": 0.46221360107883813, "learning_rate": 4.3623809523809526e-07, "loss": 0.0005, "num_tokens": 1017770840.0, "reward": 0.2453125, "reward_std": 0.07470419779419898, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.9597150325775147, "step": 15035 }, { "completion_length": 381.6, "completions/clipped_ratio": 0.0, "completions/max_length": 381.6, "completions/max_terminated_length": 381.6, "completions/mean_length": 98.71484375, "completions/mean_terminated_length": 98.71484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013611179991565412, "frac_reward_zero_std": 0.96875, "grad_norm": 2.539234161376953, "kl": 0.834452651720494, "learning_rate": 4.3619841269841266e-07, "loss": 0.0008, "num_tokens": 1018097643.0, "reward": 0.240625, "reward_std": 0.02845909409224987, "rewards/verify_chess_move/mean": 0.240625, "rewards/verify_chess_move/std": 0.9667160272598266, "step": 15040 }, { "completion_length": 315.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 92.06484375, "completions/mean_terminated_length": 92.06484375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013615704984913672, "frac_reward_zero_std": 0.95, "grad_norm": 14.11117172241211, "kl": 0.22323361807502806, "learning_rate": 4.361587301587301e-07, "loss": 0.0002, "num_tokens": 1018413918.0, "reward": 0.271875, "reward_std": 0.04671337604522705, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9581132531166077, "step": 15045 }, { "completion_length": 376.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 376.2, "completions/max_terminated_length": 343.8, "completions/mean_length": 93.26015625, "completions/mean_terminated_length": 92.73349304199219, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013620229978261933, "frac_reward_zero_std": 0.95, "grad_norm": 10.932992935180664, "kl": 0.6906936765182763, "learning_rate": 4.361190476190476e-07, "loss": 0.0007, "num_tokens": 1018732131.0, "reward": 0.359375, "reward_std": 0.04287312030792236, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9127314448356628, "step": 15050 }, { "completion_length": 391.2, "completions/clipped_ratio": 0.0, "completions/max_length": 391.2, "completions/max_terminated_length": 391.2, "completions/mean_length": 88.64765625, "completions/mean_terminated_length": 88.64765625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013624754971610191, "frac_reward_zero_std": 0.96875, "grad_norm": 0.13779661059379578, "kl": 0.6761464044218883, "learning_rate": 4.36079365079365e-07, "loss": 0.0007, "num_tokens": 1019044128.0, "reward": 0.3765625, "reward_std": 0.028930897638201714, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9263232827186585, "step": 15055 }, { "completion_length": 260.8, "completions/clipped_ratio": 0.0, "completions/max_length": 260.8, "completions/max_terminated_length": 260.8, "completions/mean_length": 86.7234375, "completions/mean_terminated_length": 86.7234375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013629279964958451, "frac_reward_zero_std": 0.95625, "grad_norm": 0.002404646249487996, "kl": 0.5957735973410309, "learning_rate": 4.3603968253968253e-07, "loss": 0.0006, "num_tokens": 1019352966.0, "reward": 0.2703125, "reward_std": 0.04229395985603333, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9583493232727051, "step": 15060 }, { "completion_length": 346.6, "completions/clipped_ratio": 0.0, "completions/max_length": 346.6, "completions/max_terminated_length": 346.6, "completions/mean_length": 91.93671875, "completions/mean_terminated_length": 91.93671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013633804958306711, "frac_reward_zero_std": 0.975, "grad_norm": 2.4376375675201416, "kl": 0.4315874878084287, "learning_rate": 4.36e-07, "loss": 0.0004, "num_tokens": 1019670477.0, "reward": 0.19375, "reward_std": 0.02041158601641655, "rewards/verify_chess_move/mean": 0.19375, "rewards/verify_chess_move/std": 0.9710312008857727, "step": 15065 }, { "completion_length": 539.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 539.0, "completions/max_terminated_length": 452.4, "completions/mean_length": 99.8015625, "completions/mean_terminated_length": 98.75779266357422, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013638329951654971, "frac_reward_zero_std": 0.94375, "grad_norm": 0.6873733401298523, "kl": 1.4668669854174368, "learning_rate": 4.3596031746031744e-07, "loss": 0.0015, "num_tokens": 1019997903.0, "reward": 0.3703125, "reward_std": 0.043403827399015424, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.8881883502006531, "step": 15070 }, { "completion_length": 340.6, "completions/clipped_ratio": 0.0, "completions/max_length": 340.6, "completions/max_terminated_length": 340.6, "completions/mean_length": 91.01953125, "completions/mean_terminated_length": 91.01953125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013642854945003231, "frac_reward_zero_std": 0.975, "grad_norm": 2.8819470405578613, "kl": 1.3194331439444795, "learning_rate": 4.359206349206349e-07, "loss": 0.0013, "num_tokens": 1020313808.0, "reward": 0.3, "reward_std": 0.01767766922712326, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9267109870910645, "step": 15075 }, { "completion_length": 361.8, "completions/clipped_ratio": 0.0, "completions/max_length": 361.8, "completions/max_terminated_length": 361.8, "completions/mean_length": 91.36796875, "completions/mean_terminated_length": 91.36796875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.013647379938351491, "frac_reward_zero_std": 0.95625, "grad_norm": 1.57822847366333, "kl": 0.19971099575050175, "learning_rate": 4.3588095238095235e-07, "loss": 0.0002, "num_tokens": 1020628975.0, "reward": 0.4375, "reward_std": 0.03956102356314659, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8898382902145385, "step": 15080 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 93.40859375, "completions/mean_terminated_length": 93.40859375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01365190493169975, "frac_reward_zero_std": 0.9875, "grad_norm": 1.0608266592025757, "kl": 0.5058252423885279, "learning_rate": 4.3584126984126986e-07, "loss": 0.0005, "num_tokens": 1020948898.0, "reward": 0.2765625, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9505837202072144, "step": 15085 }, { "completion_length": 261.4, "completions/clipped_ratio": 0.0, "completions/max_length": 261.4, "completions/max_terminated_length": 261.4, "completions/mean_length": 83.1734375, "completions/mean_terminated_length": 83.1734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01365642992504801, "frac_reward_zero_std": 0.975, "grad_norm": 8.790922164916992, "kl": 0.5682322420645505, "learning_rate": 4.3580158730158726e-07, "loss": 0.0006, "num_tokens": 1021251640.0, "reward": 0.41875, "reward_std": 0.024935813248157503, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8938482403755188, "step": 15090 }, { "completion_length": 331.8, "completions/clipped_ratio": 0.0, "completions/max_length": 331.8, "completions/max_terminated_length": 331.8, "completions/mean_length": 93.46875, "completions/mean_terminated_length": 93.46875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01366095491839627, "frac_reward_zero_std": 1.0, "grad_norm": 0.030931593850255013, "kl": 0.15668350358027966, "learning_rate": 4.357619047619047e-07, "loss": 0.0002, "num_tokens": 1021571648.0, "reward": 0.4125, "reward_std": 0.0, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.8968540787696838, "step": 15095 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 84.81796875, "completions/mean_terminated_length": 84.81796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01366547991174453, "frac_reward_zero_std": 0.975, "grad_norm": 0.008990702219307423, "kl": 0.883507170365192, "learning_rate": 4.357222222222222e-07, "loss": 0.0009, "num_tokens": 1021876991.0, "reward": 0.390625, "reward_std": 0.023356687650084494, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9046484947204589, "step": 15100 }, { "completion_length": 443.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 443.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 87.62265625, "completions/mean_terminated_length": 87.09837799072265, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01367000490509279, "frac_reward_zero_std": 0.95, "grad_norm": 11.233281135559082, "kl": 0.5644495715387166, "learning_rate": 4.3568253968253967e-07, "loss": 0.0006, "num_tokens": 1022184660.0, "reward": 0.346875, "reward_std": 0.04855213835835457, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9138374090194702, "step": 15105 }, { "completion_length": 271.8, "completions/clipped_ratio": 0.0, "completions/max_length": 271.8, "completions/max_terminated_length": 271.8, "completions/mean_length": 84.3453125, "completions/mean_terminated_length": 84.3453125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013674529898441048, "frac_reward_zero_std": 0.96875, "grad_norm": 10.201594352722168, "kl": 0.2771319935563952, "learning_rate": 4.356428571428571e-07, "loss": 0.0003, "num_tokens": 1022489406.0, "reward": 0.4171875, "reward_std": 0.025726158544421195, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.892254376411438, "step": 15110 }, { "completion_length": 375.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 375.4, "completions/max_terminated_length": 277.2, "completions/mean_length": 84.24140625, "completions/mean_terminated_length": 83.71253814697266, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013679054891789309, "frac_reward_zero_std": 0.9875, "grad_norm": 0.02851611189544201, "kl": 0.1735558870364912, "learning_rate": 4.356031746031746e-07, "loss": 0.0002, "num_tokens": 1022792787.0, "reward": 0.3, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9495175957679749, "step": 15115 }, { "completion_length": 384.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 384.2, "completions/max_terminated_length": 354.2, "completions/mean_length": 88.01875, "completions/mean_terminated_length": 86.95004272460938, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013683579885137569, "frac_reward_zero_std": 0.9625, "grad_norm": 2.761319637298584, "kl": 3.907875012687873, "learning_rate": 4.3556349206349203e-07, "loss": 0.0039, "num_tokens": 1023103315.0, "reward": 0.465625, "reward_std": 0.03335031531751156, "rewards/verify_chess_move/mean": 0.465625, "rewards/verify_chess_move/std": 0.8536284804344177, "step": 15120 }, { "completion_length": 358.8, "completions/clipped_ratio": 0.0, "completions/max_length": 358.8, "completions/max_terminated_length": 358.8, "completions/mean_length": 89.709375, "completions/mean_terminated_length": 89.709375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013688104878485829, "frac_reward_zero_std": 0.96875, "grad_norm": 4.3463568687438965, "kl": 1.2735764778451995, "learning_rate": 4.3552380952380954e-07, "loss": 0.0013, "num_tokens": 1023416519.0, "reward": 0.4375, "reward_std": 0.027988271787762642, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8905329823493957, "step": 15125 }, { "completion_length": 291.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 86.65078125, "completions/mean_terminated_length": 86.65078125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013692629871834089, "frac_reward_zero_std": 0.975, "grad_norm": 1.7561557292938232, "kl": 0.5074997765943408, "learning_rate": 4.3548412698412694e-07, "loss": 0.0005, "num_tokens": 1023723576.0, "reward": 0.475, "reward_std": 0.02130674123764038, "rewards/verify_chess_move/mean": 0.475, "rewards/verify_chess_move/std": 0.8773319721221924, "step": 15130 }, { "completion_length": 517.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 517.4, "completions/max_terminated_length": 510.0, "completions/mean_length": 92.57734375, "completions/mean_terminated_length": 91.51991271972656, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013697154865182349, "frac_reward_zero_std": 0.95, "grad_norm": 21.600364685058594, "kl": 1.07440121085383, "learning_rate": 4.3544444444444445e-07, "loss": 0.0011, "num_tokens": 1024039875.0, "reward": 0.325, "reward_std": 0.046029407531023026, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9392740845680236, "step": 15135 }, { "completion_length": 388.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 388.6, "completions/max_terminated_length": 359.8, "completions/mean_length": 89.078125, "completions/mean_terminated_length": 88.01613006591796, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013701679858530607, "frac_reward_zero_std": 0.95, "grad_norm": 0.0032707371283322573, "kl": 3.4202259448124095, "learning_rate": 4.354047619047619e-07, "loss": 0.0034, "num_tokens": 1024352103.0, "reward": 0.3921875, "reward_std": 0.043556108698248865, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9112790942192077, "step": 15140 }, { "completion_length": 407.6, "completions/clipped_ratio": 0.0, "completions/max_length": 407.6, "completions/max_terminated_length": 407.6, "completions/mean_length": 89.62265625, "completions/mean_terminated_length": 89.62265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013706204851878867, "frac_reward_zero_std": 0.96875, "grad_norm": 18.337949752807617, "kl": 1.3394020217936486, "learning_rate": 4.353650793650793e-07, "loss": 0.0013, "num_tokens": 1024664892.0, "reward": 0.3546875, "reward_std": 0.02414703480899334, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9117201447486878, "step": 15145 }, { "completion_length": 268.6, "completions/clipped_ratio": 0.0, "completions/max_length": 268.6, "completions/max_terminated_length": 268.6, "completions/mean_length": 88.2890625, "completions/mean_terminated_length": 88.2890625, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.013710729845227127, "frac_reward_zero_std": 0.98125, "grad_norm": 1.041237711906433, "kl": 0.32661966448649765, "learning_rate": 4.353253968253968e-07, "loss": 0.0003, "num_tokens": 1024977774.0, "reward": 0.33125, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9396863341331482, "step": 15150 }, { "completion_length": 293.6, "completions/clipped_ratio": 0.0, "completions/max_length": 293.6, "completions/max_terminated_length": 293.6, "completions/mean_length": 89.284375, "completions/mean_terminated_length": 89.284375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013715254838575388, "frac_reward_zero_std": 0.95625, "grad_norm": 20.088163375854492, "kl": 0.9702720056287945, "learning_rate": 4.3528571428571426e-07, "loss": 0.001, "num_tokens": 1025290394.0, "reward": 0.38125, "reward_std": 0.04139782302081585, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9095259547233582, "step": 15155 }, { "completion_length": 302.4, "completions/clipped_ratio": 0.0, "completions/max_length": 302.4, "completions/max_terminated_length": 302.4, "completions/mean_length": 86.00546875, "completions/mean_terminated_length": 86.00546875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013719779831923648, "frac_reward_zero_std": 0.96875, "grad_norm": 2.6510074138641357, "kl": 0.4436098205624148, "learning_rate": 4.3524603174603177e-07, "loss": 0.0004, "num_tokens": 1025598041.0, "reward": 0.3796875, "reward_std": 0.024831003323197366, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.921716320514679, "step": 15160 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 387.6, "completions/max_terminated_length": 357.0, "completions/mean_length": 85.47421875, "completions/mean_terminated_length": 84.94673767089844, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013724304825271906, "frac_reward_zero_std": 0.94375, "grad_norm": 12.934524536132812, "kl": 1.0243366305017845, "learning_rate": 4.3520634920634917e-07, "loss": 0.001, "num_tokens": 1025903784.0, "reward": 0.3515625, "reward_std": 0.050237640365958215, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9301422357559204, "step": 15165 }, { "completion_length": 385.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 385.8, "completions/max_terminated_length": 325.8, "completions/mean_length": 94.0046875, "completions/mean_terminated_length": 93.49313201904297, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013728829818620166, "frac_reward_zero_std": 0.94375, "grad_norm": 21.520248413085938, "kl": 0.2235546022420749, "learning_rate": 4.351666666666666e-07, "loss": 0.0002, "num_tokens": 1026223550.0, "reward": 0.4375, "reward_std": 0.0511337760835886, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8839164018630982, "step": 15170 }, { "completion_length": 296.8, "completions/clipped_ratio": 0.0, "completions/max_length": 296.8, "completions/max_terminated_length": 296.8, "completions/mean_length": 95.3265625, "completions/mean_terminated_length": 95.3265625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013733354811968426, "frac_reward_zero_std": 0.94375, "grad_norm": 20.953380584716797, "kl": 0.7442441942403093, "learning_rate": 4.3512698412698413e-07, "loss": 0.0007, "num_tokens": 1026545376.0, "reward": 0.415625, "reward_std": 0.04955465048551559, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.8848519921302795, "step": 15175 }, { "completion_length": 264.8, "completions/clipped_ratio": 0.0, "completions/max_length": 264.8, "completions/max_terminated_length": 264.8, "completions/mean_length": 83.8171875, "completions/mean_terminated_length": 83.8171875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013737879805316686, "frac_reward_zero_std": 0.98125, "grad_norm": 25.840282440185547, "kl": 0.373698185198009, "learning_rate": 4.3508730158730153e-07, "loss": 0.0004, "num_tokens": 1026848470.0, "reward": 0.3734375, "reward_std": 0.016887323558330537, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9051685214042664, "step": 15180 }, { "completion_length": 592.8, "completions/clipped_ratio": 0.0, "completions/max_length": 592.8, "completions/max_terminated_length": 592.8, "completions/mean_length": 95.15546875, "completions/mean_terminated_length": 95.15546875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013742404798664946, "frac_reward_zero_std": 0.99375, "grad_norm": 0.4985013008117676, "kl": 0.3445463331183419, "learning_rate": 4.3504761904761904e-07, "loss": 0.0003, "num_tokens": 1027170557.0, "reward": 0.48125, "reward_std": 0.006681530922651291, "rewards/verify_chess_move/mean": 0.48125, "rewards/verify_chess_move/std": 0.8635448217391968, "step": 15185 }, { "completion_length": 472.8, "completions/clipped_ratio": 0.0, "completions/max_length": 472.8, "completions/max_terminated_length": 472.8, "completions/mean_length": 94.578125, "completions/mean_terminated_length": 94.578125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013746929792013207, "frac_reward_zero_std": 0.95, "grad_norm": 1.0889639854431152, "kl": 2.1959662978420966, "learning_rate": 4.350079365079365e-07, "loss": 0.0022, "num_tokens": 1027491025.0, "reward": 0.4015625, "reward_std": 0.04240131564438343, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.8996417760848999, "step": 15190 }, { "completion_length": 386.8, "completions/clipped_ratio": 0.0, "completions/max_length": 386.8, "completions/max_terminated_length": 386.8, "completions/mean_length": 92.6265625, "completions/mean_terminated_length": 92.6265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013751454785361465, "frac_reward_zero_std": 0.94375, "grad_norm": 22.702762603759766, "kl": 2.262822174280882, "learning_rate": 4.3496825396825395e-07, "loss": 0.0023, "num_tokens": 1027810323.0, "reward": 0.4296875, "reward_std": 0.05155712589621544, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8905691027641296, "step": 15195 }, { "completion_length": 372.6, "completions/clipped_ratio": 0.0, "completions/max_length": 372.6, "completions/max_terminated_length": 372.6, "completions/mean_length": 92.10703125, "completions/mean_terminated_length": 92.10703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013755979778709725, "frac_reward_zero_std": 0.95, "grad_norm": 1.3726798295974731, "kl": 1.3551219480577856, "learning_rate": 4.349285714285714e-07, "loss": 0.0014, "num_tokens": 1028128876.0, "reward": 0.2640625, "reward_std": 0.04445126354694366, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9496280312538147, "step": 15200 }, { "completion_length": 366.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 366.8, "completions/max_terminated_length": 281.8, "completions/mean_length": 89.628125, "completions/mean_terminated_length": 89.09873809814454, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013760504772057985, "frac_reward_zero_std": 0.98125, "grad_norm": 4.917295455932617, "kl": 0.6129832518519833, "learning_rate": 4.3488888888888886e-07, "loss": 0.0006, "num_tokens": 1028441896.0, "reward": 0.371875, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9084725379943848, "step": 15205 }, { "completion_length": 326.4, "completions/clipped_ratio": 0.0, "completions/max_length": 326.4, "completions/max_terminated_length": 326.4, "completions/mean_length": 84.9078125, "completions/mean_terminated_length": 84.9078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013765029765406245, "frac_reward_zero_std": 0.95, "grad_norm": 13.104535102844238, "kl": 0.4308207815280184, "learning_rate": 4.3484920634920636e-07, "loss": 0.0004, "num_tokens": 1028747458.0, "reward": 0.35625, "reward_std": 0.045818221569061277, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9319738268852233, "step": 15210 }, { "completion_length": 325.4, "completions/clipped_ratio": 0.0, "completions/max_length": 325.4, "completions/max_terminated_length": 325.4, "completions/mean_length": 82.9234375, "completions/mean_terminated_length": 82.9234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013769554758754505, "frac_reward_zero_std": 0.9875, "grad_norm": 1.4479643106460571, "kl": 0.5913738740608097, "learning_rate": 4.348095238095238e-07, "loss": 0.0006, "num_tokens": 1029049272.0, "reward": 0.51875, "reward_std": 0.011572751402854919, "rewards/verify_chess_move/mean": 0.51875, "rewards/verify_chess_move/std": 0.8330656886100769, "step": 15215 }, { "completion_length": 311.4, "completions/clipped_ratio": 0.0, "completions/max_length": 311.4, "completions/max_terminated_length": 311.4, "completions/mean_length": 84.8453125, "completions/mean_terminated_length": 84.8453125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013774079752102764, "frac_reward_zero_std": 0.94375, "grad_norm": 16.02690315246582, "kl": 0.6121322860941291, "learning_rate": 4.347698412698412e-07, "loss": 0.0006, "num_tokens": 1029353018.0, "reward": 0.4125, "reward_std": 0.048186710476875304, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.89752037525177, "step": 15220 }, { "completion_length": 353.6, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/max_terminated_length": 353.6, "completions/mean_length": 88.91953125, "completions/mean_terminated_length": 88.91953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013778604745451024, "frac_reward_zero_std": 0.94375, "grad_norm": 2.8785006999969482, "kl": 0.7863008380169049, "learning_rate": 4.347301587301587e-07, "loss": 0.0008, "num_tokens": 1029666019.0, "reward": 0.41875, "reward_std": 0.04660954885184765, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.9010817170143127, "step": 15225 }, { "completion_length": 508.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 508.6, "completions/max_terminated_length": 472.2, "completions/mean_length": 96.61640625, "completions/mean_terminated_length": 95.04566040039063, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013783129738799284, "frac_reward_zero_std": 0.96875, "grad_norm": 0.2327592372894287, "kl": 0.6144313689204864, "learning_rate": 4.346904761904762e-07, "loss": 0.0006, "num_tokens": 1029989040.0, "reward": 0.45, "reward_std": 0.025513992458581925, "rewards/verify_chess_move/mean": 0.45, "rewards/verify_chess_move/std": 0.8882807970046998, "step": 15230 }, { "completion_length": 408.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 89.63203125, "completions/mean_terminated_length": 89.63203125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013787654732147544, "frac_reward_zero_std": 0.95625, "grad_norm": 3.8477818965911865, "kl": 0.4890334472293034, "learning_rate": 4.3465079365079363e-07, "loss": 0.0005, "num_tokens": 1030300873.0, "reward": 0.4390625, "reward_std": 0.03571978583931923, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8939005851745605, "step": 15235 }, { "completion_length": 367.2, "completions/clipped_ratio": 0.0, "completions/max_length": 367.2, "completions/max_terminated_length": 367.2, "completions/mean_length": 84.80078125, "completions/mean_terminated_length": 84.80078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013792179725495804, "frac_reward_zero_std": 0.96875, "grad_norm": 2.0938668251037598, "kl": 0.3442415744997561, "learning_rate": 4.346111111111111e-07, "loss": 0.0003, "num_tokens": 1030606946.0, "reward": 0.334375, "reward_std": 0.025513991713523865, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9436192512512207, "step": 15240 }, { "completion_length": 329.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 89.28984375, "completions/mean_terminated_length": 89.28984375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.013796704718844064, "frac_reward_zero_std": 0.95625, "grad_norm": 14.321568489074707, "kl": 0.8355101065477356, "learning_rate": 4.3457142857142854e-07, "loss": 0.0008, "num_tokens": 1030919901.0, "reward": 0.3375, "reward_std": 0.038877053558826445, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9249803781509399, "step": 15245 }, { "completion_length": 600.4, "completions/clipped_ratio": 0.00546875, "completions/max_length": 600.4, "completions/max_terminated_length": 390.2, "completions/mean_length": 95.1890625, "completions/mean_terminated_length": 91.48468627929688, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013801229712192322, "frac_reward_zero_std": 0.98125, "grad_norm": 14.715144157409668, "kl": 1.0842222783830948, "learning_rate": 4.3453174603174605e-07, "loss": 0.0011, "num_tokens": 1031241831.0, "reward": 0.2875, "reward_std": 0.017570313066244125, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9584309101104737, "step": 15250 }, { "completion_length": 411.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 411.4, "completions/max_terminated_length": 316.6, "completions/mean_length": 94.125, "completions/mean_terminated_length": 93.59955749511718, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013805754705540582, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0027517713606357574, "kl": 0.3443013982381672, "learning_rate": 4.3449206349206345e-07, "loss": 0.0003, "num_tokens": 1031563735.0, "reward": 0.3640625, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9187743067741394, "step": 15255 }, { "completion_length": 518.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 518.6, "completions/max_terminated_length": 434.8, "completions/mean_length": 91.7921875, "completions/mean_terminated_length": 90.72667388916015, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013810279698888843, "frac_reward_zero_std": 0.94375, "grad_norm": 3.2699337005615234, "kl": 0.8860165822901763, "learning_rate": 4.3445238095238096e-07, "loss": 0.0009, "num_tokens": 1031880309.0, "reward": 0.40625, "reward_std": 0.04887068085372448, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9034591436386108, "step": 15260 }, { "completion_length": 450.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 450.2, "completions/max_terminated_length": 347.8, "completions/mean_length": 88.13515625, "completions/mean_terminated_length": 87.59808959960938, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013814804692237103, "frac_reward_zero_std": 0.95625, "grad_norm": 16.6415958404541, "kl": 0.2845638718572445, "learning_rate": 4.344126984126984e-07, "loss": 0.0003, "num_tokens": 1032191274.0, "reward": 0.3640625, "reward_std": 0.0413988035172224, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9278870582580566, "step": 15265 }, { "completion_length": 387.4, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/max_terminated_length": 387.4, "completions/mean_length": 83.67109375, "completions/mean_terminated_length": 83.67109375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.013819329685585363, "frac_reward_zero_std": 0.9875, "grad_norm": 0.001375438179820776, "kl": 0.10343176894821227, "learning_rate": 4.343730158730158e-07, "loss": 0.0001, "num_tokens": 1032494349.0, "reward": 0.4328125, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.4328125, "rewards/verify_chess_move/std": 0.8698860049247742, "step": 15270 }, { "completion_length": 468.6, "completions/clipped_ratio": 0.0, "completions/max_length": 468.6, "completions/max_terminated_length": 468.6, "completions/mean_length": 101.4796875, "completions/mean_terminated_length": 101.4796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013823854678933621, "frac_reward_zero_std": 0.95, "grad_norm": 8.698070526123047, "kl": 0.5629813551204279, "learning_rate": 4.343333333333333e-07, "loss": 0.0006, "num_tokens": 1032827659.0, "reward": 0.3515625, "reward_std": 0.043556108698248865, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.928546154499054, "step": 15275 }, { "completion_length": 403.2, "completions/clipped_ratio": 0.0, "completions/max_length": 403.2, "completions/max_terminated_length": 403.2, "completions/mean_length": 84.58125, "completions/mean_terminated_length": 84.58125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013828379672281881, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017263501649722457, "kl": 0.11083843312226235, "learning_rate": 4.3429365079365077e-07, "loss": 0.0001, "num_tokens": 1033131363.0, "reward": 0.3796875, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9214497566223144, "step": 15280 }, { "completion_length": 324.8, "completions/clipped_ratio": 0.0, "completions/max_length": 324.8, "completions/max_terminated_length": 324.8, "completions/mean_length": 86.91640625, "completions/mean_terminated_length": 86.91640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013832904665630141, "frac_reward_zero_std": 0.9625, "grad_norm": 1.4708222150802612, "kl": 0.13281428788322955, "learning_rate": 4.342539682539683e-07, "loss": 0.0001, "num_tokens": 1033441496.0, "reward": 0.3078125, "reward_std": 0.031983356922864914, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9438002943992615, "step": 15285 }, { "completion_length": 355.4, "completions/clipped_ratio": 0.0, "completions/max_length": 355.4, "completions/max_terminated_length": 355.4, "completions/mean_length": 86.00078125, "completions/mean_terminated_length": 86.00078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013837429658978401, "frac_reward_zero_std": 0.95625, "grad_norm": 1.6265195608139038, "kl": 0.20693365519400686, "learning_rate": 4.342142857142857e-07, "loss": 0.0002, "num_tokens": 1033749961.0, "reward": 0.3296875, "reward_std": 0.037298910319805145, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9322799205780029, "step": 15290 }, { "completion_length": 442.4, "completions/clipped_ratio": 0.0, "completions/max_length": 442.4, "completions/max_terminated_length": 442.4, "completions/mean_length": 92.3109375, "completions/mean_terminated_length": 92.3109375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013841954652326662, "frac_reward_zero_std": 0.98125, "grad_norm": 0.002159307012334466, "kl": 0.19862516467692332, "learning_rate": 4.3417460317460313e-07, "loss": 0.0002, "num_tokens": 1034069623.0, "reward": 0.2359375, "reward_std": 0.017358146235346796, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9525859236717225, "step": 15295 }, { "completion_length": 426.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 426.2, "completions/max_terminated_length": 370.2, "completions/mean_length": 96.50703125, "completions/mean_terminated_length": 95.48095397949218, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013846479645674922, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0014667916111648083, "kl": 0.15200439256150275, "learning_rate": 4.3413492063492064e-07, "loss": 0.0002, "num_tokens": 1034393536.0, "reward": 0.3640625, "reward_std": 0.031983356922864914, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.93108149766922, "step": 15300 }, { "completion_length": 293.6, "completions/clipped_ratio": 0.0, "completions/max_length": 293.6, "completions/max_terminated_length": 293.6, "completions/mean_length": 86.64140625, "completions/mean_terminated_length": 86.64140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01385100463902318, "frac_reward_zero_std": 0.99375, "grad_norm": 0.7664117813110352, "kl": 0.11641886043362319, "learning_rate": 4.340952380952381e-07, "loss": 0.0001, "num_tokens": 1034701477.0, "reward": 0.390625, "reward_std": 0.005786375701427459, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9149729609489441, "step": 15305 }, { "completion_length": 457.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 457.6, "completions/max_terminated_length": 401.0, "completions/mean_length": 91.2859375, "completions/mean_terminated_length": 90.76156921386719, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01385552963237144, "frac_reward_zero_std": 0.98125, "grad_norm": 0.1832635998725891, "kl": 0.2530815253499895, "learning_rate": 4.340555555555555e-07, "loss": 0.0003, "num_tokens": 1035015723.0, "reward": 0.3796875, "reward_std": 0.01530819907784462, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9203927159309387, "step": 15310 }, { "completion_length": 407.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 407.4, "completions/max_terminated_length": 321.0, "completions/mean_length": 87.25234375, "completions/mean_terminated_length": 86.72072143554688, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.0138600546257197, "frac_reward_zero_std": 0.95625, "grad_norm": 19.31708526611328, "kl": 0.7787794779753312, "learning_rate": 4.34015873015873e-07, "loss": 0.0008, "num_tokens": 1035323942.0, "reward": 0.3125, "reward_std": 0.04071581587195396, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9394720554351806, "step": 15315 }, { "completion_length": 324.6, "completions/clipped_ratio": 0.0, "completions/max_length": 324.6, "completions/max_terminated_length": 324.6, "completions/mean_length": 87.7796875, "completions/mean_terminated_length": 87.7796875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01386457961906796, "frac_reward_zero_std": 0.975, "grad_norm": 0.05494372174143791, "kl": 0.791600727243349, "learning_rate": 4.3397619047619046e-07, "loss": 0.0008, "num_tokens": 1035634556.0, "reward": 0.2875, "reward_std": 0.020411586761474608, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9540603518486023, "step": 15320 }, { "completion_length": 396.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 396.8, "completions/max_terminated_length": 323.2, "completions/mean_length": 87.70078125, "completions/mean_terminated_length": 87.16634979248047, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01386910461241622, "frac_reward_zero_std": 0.9625, "grad_norm": 1.9808329343795776, "kl": 0.4602869640570134, "learning_rate": 4.339365079365079e-07, "loss": 0.0005, "num_tokens": 1035944749.0, "reward": 0.2453125, "reward_std": 0.0347172737121582, "rewards/verify_chess_move/mean": 0.2453125, "rewards/verify_chess_move/std": 0.928373146057129, "step": 15325 }, { "completion_length": 434.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 434.0, "completions/max_terminated_length": 359.6, "completions/mean_length": 89.653125, "completions/mean_terminated_length": 89.1163833618164, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.013873629605764479, "frac_reward_zero_std": 0.9625, "grad_norm": 3.49541974067688, "kl": 0.41792041377630085, "learning_rate": 4.3389682539682536e-07, "loss": 0.0004, "num_tokens": 1036256169.0, "reward": 0.415625, "reward_std": 0.03335031494498253, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9048586845397949, "step": 15330 }, { "completion_length": 360.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 360.2, "completions/max_terminated_length": 282.8, "completions/mean_length": 86.2859375, "completions/mean_terminated_length": 85.75977783203125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013878154599112739, "frac_reward_zero_std": 0.96875, "grad_norm": 31.244218826293945, "kl": 0.21780964814824982, "learning_rate": 4.338571428571428e-07, "loss": 0.0002, "num_tokens": 1036563927.0, "reward": 0.4171875, "reward_std": 0.02867126017808914, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.9089403271675109, "step": 15335 }, { "completion_length": 391.6, "completions/clipped_ratio": 0.0, "completions/max_length": 391.6, "completions/max_terminated_length": 391.6, "completions/mean_length": 85.0390625, "completions/mean_terminated_length": 85.0390625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013882679592460999, "frac_reward_zero_std": 0.975, "grad_norm": 0.035455476492643356, "kl": 1.1136523516615853, "learning_rate": 4.338174603174603e-07, "loss": 0.0011, "num_tokens": 1036869889.0, "reward": 0.35, "reward_std": 0.02177756354212761, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9335846900939941, "step": 15340 }, { "completion_length": 376.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 376.2, "completions/max_terminated_length": 283.4, "completions/mean_length": 88.41484375, "completions/mean_terminated_length": 87.88408508300782, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013887204585809259, "frac_reward_zero_std": 0.96875, "grad_norm": 3.0409741401672363, "kl": 0.7475323094753549, "learning_rate": 4.3377777777777773e-07, "loss": 0.0007, "num_tokens": 1037181604.0, "reward": 0.278125, "reward_std": 0.02798827216029167, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9544988512992859, "step": 15345 }, { "completion_length": 324.2, "completions/clipped_ratio": 0.0, "completions/max_length": 324.2, "completions/max_terminated_length": 324.2, "completions/mean_length": 88.1609375, "completions/mean_terminated_length": 88.1609375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013891729579157519, "frac_reward_zero_std": 0.9625, "grad_norm": 5.449366092681885, "kl": 0.3778908017789945, "learning_rate": 4.3373809523809523e-07, "loss": 0.0004, "num_tokens": 1037492050.0, "reward": 0.378125, "reward_std": 0.0349294401705265, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9147553324699402, "step": 15350 }, { "completion_length": 411.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 94.25625, "completions/mean_terminated_length": 94.25625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01389625457250578, "frac_reward_zero_std": 0.9625, "grad_norm": 1.3874311447143555, "kl": 3.0405792455887424, "learning_rate": 4.336984126984127e-07, "loss": 0.003, "num_tokens": 1037812010.0, "reward": 0.4359375, "reward_std": 0.02993340976536274, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.8910308957099915, "step": 15355 }, { "completion_length": 459.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 459.8, "completions/max_terminated_length": 392.2, "completions/mean_length": 90.78203125, "completions/mean_terminated_length": 90.25764465332031, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013900779565854037, "frac_reward_zero_std": 0.975, "grad_norm": 1.5912836790084839, "kl": 0.4530131601030007, "learning_rate": 4.336587301587301e-07, "loss": 0.0005, "num_tokens": 1038125099.0, "reward": 0.4296875, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8874347448348999, "step": 15360 }, { "completion_length": 375.6, "completions/clipped_ratio": 0.0, "completions/max_length": 375.6, "completions/max_terminated_length": 375.6, "completions/mean_length": 84.76796875, "completions/mean_terminated_length": 84.76796875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.013905304559202298, "frac_reward_zero_std": 0.9375, "grad_norm": 13.938426971435547, "kl": 1.6416533469920978, "learning_rate": 4.336190476190476e-07, "loss": 0.0016, "num_tokens": 1038429498.0, "reward": 0.3171875, "reward_std": 0.051924120634794235, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9390872359275818, "step": 15365 }, { "completion_length": 287.2, "completions/clipped_ratio": 0.0, "completions/max_length": 287.2, "completions/max_terminated_length": 287.2, "completions/mean_length": 90.2796875, "completions/mean_terminated_length": 90.2796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013909829552550558, "frac_reward_zero_std": 0.95625, "grad_norm": 0.012360516004264355, "kl": 2.200180216971785, "learning_rate": 4.3357936507936505e-07, "loss": 0.0022, "num_tokens": 1038743152.0, "reward": 0.31875, "reward_std": 0.04003184586763382, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9364825010299682, "step": 15370 }, { "completion_length": 440.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 440.6, "completions/max_terminated_length": 351.0, "completions/mean_length": 90.83125, "completions/mean_terminated_length": 89.77062683105468, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013914354545898818, "frac_reward_zero_std": 0.95625, "grad_norm": 14.592997550964355, "kl": 1.3818179502966814, "learning_rate": 4.3353968253968256e-07, "loss": 0.0014, "num_tokens": 1039058808.0, "reward": 0.2828125, "reward_std": 0.03661493957042694, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9517320394515991, "step": 15375 }, { "completion_length": 400.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 400.6, "completions/max_terminated_length": 298.6, "completions/mean_length": 86.40390625, "completions/mean_terminated_length": 85.87293243408203, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013918879539247078, "frac_reward_zero_std": 0.95, "grad_norm": 19.951128005981445, "kl": 2.273821039975155, "learning_rate": 4.3349999999999996e-07, "loss": 0.0023, "num_tokens": 1039366797.0, "reward": 0.34375, "reward_std": 0.04171832650899887, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9316792964935303, "step": 15380 }, { "completion_length": 480.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 480.4, "completions/max_terminated_length": 337.0, "completions/mean_length": 98.4390625, "completions/mean_terminated_length": 97.40273742675781, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.013923404532595336, "frac_reward_zero_std": 0.9625, "grad_norm": 30.7304744720459, "kl": 2.6040640769526364, "learning_rate": 4.334603174603174e-07, "loss": 0.0026, "num_tokens": 1039695103.0, "reward": 0.2546875, "reward_std": 0.02993340939283371, "rewards/verify_chess_move/mean": 0.2546875, "rewards/verify_chess_move/std": 0.9638108968734741, "step": 15385 }, { "completion_length": 467.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 467.6, "completions/max_terminated_length": 464.4, "completions/mean_length": 92.89609375, "completions/mean_terminated_length": 92.37302856445312, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013927929525943596, "frac_reward_zero_std": 0.94375, "grad_norm": 9.268698692321777, "kl": 1.7316338271251879, "learning_rate": 4.334206349206349e-07, "loss": 0.0017, "num_tokens": 1040012058.0, "reward": 0.3734375, "reward_std": 0.050921608507633206, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9178491711616517, "step": 15390 }, { "completion_length": 409.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 409.6, "completions/max_terminated_length": 355.2, "completions/mean_length": 88.0578125, "completions/mean_terminated_length": 87.53028564453125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013932454519291856, "frac_reward_zero_std": 0.9625, "grad_norm": 0.27897700667381287, "kl": 1.1127347343717702, "learning_rate": 4.3338095238095237e-07, "loss": 0.0011, "num_tokens": 1040323284.0, "reward": 0.425, "reward_std": 0.033350315690040586, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.8967999815940857, "step": 15395 }, { "completion_length": 341.8, "completions/clipped_ratio": 0.0, "completions/max_length": 341.8, "completions/max_terminated_length": 341.8, "completions/mean_length": 85.43203125, "completions/mean_terminated_length": 85.43203125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.013936979512640117, "frac_reward_zero_std": 0.9625, "grad_norm": 16.667423248291016, "kl": 1.8898937692982145, "learning_rate": 4.333412698412698e-07, "loss": 0.0019, "num_tokens": 1040629477.0, "reward": 0.4640625, "reward_std": 0.03082856573164463, "rewards/verify_chess_move/mean": 0.4640625, "rewards/verify_chess_move/std": 0.8807757496833801, "step": 15400 }, { "completion_length": 482.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 482.2, "completions/max_terminated_length": 442.0, "completions/mean_length": 98.06328125, "completions/mean_terminated_length": 96.47976837158203, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013941504505988377, "frac_reward_zero_std": 0.96875, "grad_norm": 0.07800247520208359, "kl": 7.769618812471163, "learning_rate": 4.333015873015873e-07, "loss": 0.0078, "num_tokens": 1040956430.0, "reward": 0.375, "reward_std": 0.029143064096570016, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.8736035823822021, "step": 15405 }, { "completion_length": 450.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 450.8, "completions/max_terminated_length": 350.8, "completions/mean_length": 92.0390625, "completions/mean_terminated_length": 91.50665588378907, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013946029499336637, "frac_reward_zero_std": 0.95, "grad_norm": 24.01871109008789, "kl": 4.5833294342039155, "learning_rate": 4.3326190476190473e-07, "loss": 0.0046, "num_tokens": 1041274176.0, "reward": 0.24375, "reward_std": 0.046713375672698024, "rewards/verify_chess_move/mean": 0.24375, "rewards/verify_chess_move/std": 0.9586309432983399, "step": 15410 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 366.0, "completions/max_terminated_length": 271.2, "completions/mean_length": 91.13984375, "completions/mean_terminated_length": 90.62390747070313, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.013950554492684895, "frac_reward_zero_std": 0.975, "grad_norm": 0.7564879655838013, "kl": 0.4429065579548478, "learning_rate": 4.332222222222222e-07, "loss": 0.0004, "num_tokens": 1041592043.0, "reward": 0.35, "reward_std": 0.02130674086511135, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9311512351036072, "step": 15415 }, { "completion_length": 303.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 90.7234375, "completions/mean_terminated_length": 90.7234375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.013955079486033155, "frac_reward_zero_std": 0.975, "grad_norm": 0.015651075169444084, "kl": 0.275544425426051, "learning_rate": 4.3318253968253964e-07, "loss": 0.0003, "num_tokens": 1041908273.0, "reward": 0.3140625, "reward_std": 0.022673700004816055, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9431218385696412, "step": 15420 }, { "completion_length": 425.4, "completions/clipped_ratio": 0.0, "completions/max_length": 425.4, "completions/max_terminated_length": 425.4, "completions/mean_length": 91.15234375, "completions/mean_terminated_length": 91.15234375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.013959604479381415, "frac_reward_zero_std": 0.95625, "grad_norm": 0.4576057195663452, "kl": 0.9564602441387251, "learning_rate": 4.3314285714285715e-07, "loss": 0.001, "num_tokens": 1042222972.0, "reward": 0.434375, "reward_std": 0.034352827444672586, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.879340648651123, "step": 15425 }, { "completion_length": 368.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 368.6, "completions/max_terminated_length": 268.6, "completions/mean_length": 84.090625, "completions/mean_terminated_length": 83.55271911621094, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013964129472729675, "frac_reward_zero_std": 0.95, "grad_norm": 17.518169403076172, "kl": 0.18870644813869147, "learning_rate": 4.331031746031746e-07, "loss": 0.0002, "num_tokens": 1042525968.0, "reward": 0.3671875, "reward_std": 0.04376729428768158, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9227258205413819, "step": 15430 }, { "completion_length": 393.2, "completions/clipped_ratio": 0.0, "completions/max_length": 393.2, "completions/max_terminated_length": 393.2, "completions/mean_length": 88.85078125, "completions/mean_terminated_length": 88.85078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013968654466077935, "frac_reward_zero_std": 0.975, "grad_norm": 18.124547958374023, "kl": 0.28818120297510175, "learning_rate": 4.33063492063492e-07, "loss": 0.0003, "num_tokens": 1042839265.0, "reward": 0.3, "reward_std": 0.024935813248157503, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9440756678581238, "step": 15435 }, { "completion_length": 485.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.2, "completions/max_terminated_length": 443.4, "completions/mean_length": 89.2609375, "completions/mean_terminated_length": 88.72855224609376, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.013973179459426196, "frac_reward_zero_std": 0.975, "grad_norm": 3.548523426055908, "kl": 0.21651765797287226, "learning_rate": 4.330238095238095e-07, "loss": 0.0002, "num_tokens": 1043151247.0, "reward": 0.3671875, "reward_std": 0.024039677157998086, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9260929584503174, "step": 15440 }, { "completion_length": 491.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 491.2, "completions/max_terminated_length": 385.2, "completions/mean_length": 94.79921875, "completions/mean_terminated_length": 94.26859741210937, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013977704452774454, "frac_reward_zero_std": 0.9625, "grad_norm": 7.429944038391113, "kl": 0.5488534841104411, "learning_rate": 4.3298412698412697e-07, "loss": 0.0005, "num_tokens": 1043473246.0, "reward": 0.3671875, "reward_std": 0.02993340939283371, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9109992265701294, "step": 15445 }, { "completion_length": 403.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 403.4, "completions/max_terminated_length": 310.6, "completions/mean_length": 89.296875, "completions/mean_terminated_length": 88.26087646484375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013982229446122714, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0376124382019043, "kl": 1.0810294933384283, "learning_rate": 4.329444444444444e-07, "loss": 0.0011, "num_tokens": 1043785858.0, "reward": 0.38125, "reward_std": 0.03913669139146805, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9096181035041809, "step": 15450 }, { "completion_length": 371.4, "completions/clipped_ratio": 0.0, "completions/max_length": 371.4, "completions/max_terminated_length": 371.4, "completions/mean_length": 90.2796875, "completions/mean_terminated_length": 90.2796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013986754439470974, "frac_reward_zero_std": 0.975, "grad_norm": 3.635343551635742, "kl": 0.6676547271781601, "learning_rate": 4.3290476190476187e-07, "loss": 0.0007, "num_tokens": 1044100480.0, "reward": 0.4078125, "reward_std": 0.023144522309303285, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.9122000217437745, "step": 15455 }, { "completion_length": 479.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 479.2, "completions/max_terminated_length": 460.2, "completions/mean_length": 91.98984375, "completions/mean_terminated_length": 90.931982421875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.013991279432819234, "frac_reward_zero_std": 0.94375, "grad_norm": 1.9834954738616943, "kl": 0.9850107328034937, "learning_rate": 4.3286507936507933e-07, "loss": 0.001, "num_tokens": 1044417011.0, "reward": 0.3375, "reward_std": 0.05228856801986694, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9344272971153259, "step": 15460 }, { "completion_length": 377.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 88.77109375, "completions/mean_terminated_length": 88.77109375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.013995804426167494, "frac_reward_zero_std": 0.94375, "grad_norm": 7.909889221191406, "kl": 0.7500311427284032, "learning_rate": 4.3282539682539683e-07, "loss": 0.0008, "num_tokens": 1044727534.0, "reward": 0.359375, "reward_std": 0.0488706823438406, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9212192893028259, "step": 15465 }, { "completion_length": 341.4, "completions/clipped_ratio": 0.0, "completions/max_length": 341.4, "completions/max_terminated_length": 341.4, "completions/mean_length": 93.49296875, "completions/mean_terminated_length": 93.49296875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014000329419515753, "frac_reward_zero_std": 0.95, "grad_norm": 12.784421920776367, "kl": 0.8035046593286097, "learning_rate": 4.3278571428571424e-07, "loss": 0.0008, "num_tokens": 1045045909.0, "reward": 0.3984375, "reward_std": 0.04376729428768158, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.8874533176422119, "step": 15470 }, { "completion_length": 471.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 471.8, "completions/max_terminated_length": 409.0, "completions/mean_length": 90.67890625, "completions/mean_terminated_length": 90.14773101806641, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014004854412864013, "frac_reward_zero_std": 0.94375, "grad_norm": 0.8602224588394165, "kl": 0.8965435397229158, "learning_rate": 4.3274603174603174e-07, "loss": 0.0009, "num_tokens": 1045360778.0, "reward": 0.3421875, "reward_std": 0.046137744560837746, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.925578773021698, "step": 15475 }, { "completion_length": 289.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 86.4515625, "completions/mean_terminated_length": 86.4515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014009379406212273, "frac_reward_zero_std": 0.95625, "grad_norm": 0.003367713186889887, "kl": 1.8412019740091636, "learning_rate": 4.327063492063492e-07, "loss": 0.0018, "num_tokens": 1045670788.0, "reward": 0.4234375, "reward_std": 0.03456499353051186, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.9059947848320007, "step": 15480 }, { "completion_length": 486.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 486.2, "completions/max_terminated_length": 456.6, "completions/mean_length": 87.9984375, "completions/mean_terminated_length": 86.95861206054687, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014013904399560533, "frac_reward_zero_std": 0.95625, "grad_norm": 27.153841018676758, "kl": 0.703803117456846, "learning_rate": 4.3266666666666665e-07, "loss": 0.0007, "num_tokens": 1045981178.0, "reward": 0.221875, "reward_std": 0.04003184586763382, "rewards/verify_chess_move/mean": 0.221875, "rewards/verify_chess_move/std": 0.971134877204895, "step": 15485 }, { "completion_length": 380.6, "completions/clipped_ratio": 0.0, "completions/max_length": 380.6, "completions/max_terminated_length": 380.6, "completions/mean_length": 91.2046875, "completions/mean_terminated_length": 91.2046875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014018429392908793, "frac_reward_zero_std": 0.9875, "grad_norm": 0.23187468945980072, "kl": 0.7810264134081081, "learning_rate": 4.326269841269841e-07, "loss": 0.0008, "num_tokens": 1046296920.0, "reward": 0.5015625, "reward_std": 0.01225574016571045, "rewards/verify_chess_move/mean": 0.5015625, "rewards/verify_chess_move/std": 0.8598742127418518, "step": 15490 }, { "completion_length": 266.2, "completions/clipped_ratio": 0.0, "completions/max_length": 266.2, "completions/max_terminated_length": 266.2, "completions/mean_length": 88.74453125, "completions/mean_terminated_length": 88.74453125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014022954386257053, "frac_reward_zero_std": 0.94375, "grad_norm": 14.684865951538086, "kl": 0.6625192995648831, "learning_rate": 4.3258730158730156e-07, "loss": 0.0007, "num_tokens": 1046609417.0, "reward": 0.3546875, "reward_std": 0.04750372357666492, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.932313883304596, "step": 15495 }, { "completion_length": 390.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 390.4, "completions/max_terminated_length": 316.2, "completions/mean_length": 94.0734375, "completions/mean_terminated_length": 93.55184478759766, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014027479379605311, "frac_reward_zero_std": 0.9625, "grad_norm": 1.817082405090332, "kl": 1.0604907585307957, "learning_rate": 4.3254761904761906e-07, "loss": 0.0011, "num_tokens": 1046927959.0, "reward": 0.4609375, "reward_std": 0.02993340976536274, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8822807192802429, "step": 15500 }, { "completion_length": 296.6, "completions/clipped_ratio": 0.0, "completions/max_length": 296.6, "completions/max_terminated_length": 296.6, "completions/mean_length": 96.08125, "completions/mean_terminated_length": 96.08125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014032004372953572, "frac_reward_zero_std": 0.96875, "grad_norm": 5.499467372894287, "kl": 0.23920632349327206, "learning_rate": 4.3250793650793647e-07, "loss": 0.0002, "num_tokens": 1047251047.0, "reward": 0.30625, "reward_std": 0.02619796171784401, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9417397499084472, "step": 15505 }, { "completion_length": 370.2, "completions/clipped_ratio": 0.0, "completions/max_length": 370.2, "completions/max_terminated_length": 370.2, "completions/mean_length": 78.99375, "completions/mean_terminated_length": 78.99375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014036529366301832, "frac_reward_zero_std": 0.975, "grad_norm": 0.002038294915109873, "kl": 0.22950936827110127, "learning_rate": 4.324682539682539e-07, "loss": 0.0002, "num_tokens": 1047545599.0, "reward": 0.4703125, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.4703125, "rewards/verify_chess_move/std": 0.8722904682159424, "step": 15510 }, { "completion_length": 272.8, "completions/clipped_ratio": 0.0, "completions/max_length": 272.8, "completions/max_terminated_length": 272.8, "completions/mean_length": 86.009375, "completions/mean_terminated_length": 86.009375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014041054359650092, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0065179551020264626, "kl": 0.2952511213719845, "learning_rate": 4.3242857142857143e-07, "loss": 0.0003, "num_tokens": 1047851595.0, "reward": 0.415625, "reward_std": 0.028247909247875215, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9058630704879761, "step": 15515 }, { "completion_length": 524.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 524.0, "completions/max_terminated_length": 466.2, "completions/mean_length": 90.01171875, "completions/mean_terminated_length": 89.48943786621093, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014045579352998352, "frac_reward_zero_std": 0.975, "grad_norm": 0.01593988575041294, "kl": 0.7584148851805367, "learning_rate": 4.323888888888889e-07, "loss": 0.0008, "num_tokens": 1048164570.0, "reward": 0.3984375, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9166329145431519, "step": 15520 }, { "completion_length": 270.6, "completions/clipped_ratio": 0.0, "completions/max_length": 270.6, "completions/max_terminated_length": 270.6, "completions/mean_length": 80.04765625, "completions/mean_terminated_length": 80.04765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01405010434634661, "frac_reward_zero_std": 0.98125, "grad_norm": 9.651371002197266, "kl": 0.3290292561985552, "learning_rate": 4.3234920634920633e-07, "loss": 0.0003, "num_tokens": 1048462055.0, "reward": 0.4375, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8820338726043702, "step": 15525 }, { "completion_length": 441.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 441.4, "completions/max_terminated_length": 363.4, "completions/mean_length": 92.79453125, "completions/mean_terminated_length": 92.27358856201172, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01405462933969487, "frac_reward_zero_std": 0.94375, "grad_norm": 9.87375545501709, "kl": 2.1615773468394766, "learning_rate": 4.323095238095238e-07, "loss": 0.0022, "num_tokens": 1048782616.0, "reward": 0.325, "reward_std": 0.04750470407307148, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9291407823562622, "step": 15530 }, { "completion_length": 471.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 471.2, "completions/max_terminated_length": 448.8, "completions/mean_length": 88.66796875, "completions/mean_terminated_length": 88.12896270751953, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01405915433304313, "frac_reward_zero_std": 0.9625, "grad_norm": 8.442049026489258, "kl": 0.5185308534419164, "learning_rate": 4.3226984126984124e-07, "loss": 0.0005, "num_tokens": 1049092431.0, "reward": 0.471875, "reward_std": 0.03335031494498253, "rewards/verify_chess_move/mean": 0.471875, "rewards/verify_chess_move/std": 0.8769481897354126, "step": 15535 }, { "completion_length": 379.2, "completions/clipped_ratio": 0.0, "completions/max_length": 379.2, "completions/max_terminated_length": 379.2, "completions/mean_length": 94.28984375, "completions/mean_terminated_length": 94.28984375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01406367932639139, "frac_reward_zero_std": 0.95625, "grad_norm": 0.036404892802238464, "kl": 0.9497038602596148, "learning_rate": 4.322301587301587e-07, "loss": 0.0009, "num_tokens": 1049414842.0, "reward": 0.4484375, "reward_std": 0.03571978472173214, "rewards/verify_chess_move/mean": 0.4484375, "rewards/verify_chess_move/std": 0.8716011881828308, "step": 15540 }, { "completion_length": 291.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 87.490625, "completions/mean_terminated_length": 87.490625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01406820431973965, "frac_reward_zero_std": 0.96875, "grad_norm": 0.4892078638076782, "kl": 2.8121545903617515, "learning_rate": 4.3219047619047615e-07, "loss": 0.0028, "num_tokens": 1049725862.0, "reward": 0.403125, "reward_std": 0.025513991713523865, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9004034280776978, "step": 15545 }, { "completion_length": 548.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 548.4, "completions/max_terminated_length": 452.4, "completions/mean_length": 93.58359375, "completions/mean_terminated_length": 92.01148071289063, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01407272931308791, "frac_reward_zero_std": 0.90625, "grad_norm": 25.85696792602539, "kl": 1.602851727080997, "learning_rate": 4.3215079365079366e-07, "loss": 0.0016, "num_tokens": 1050042905.0, "reward": 0.30625, "reward_std": 0.0817501712590456, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9322871088981628, "step": 15550 }, { "completion_length": 284.6, "completions/clipped_ratio": 0.0, "completions/max_length": 284.6, "completions/max_terminated_length": 284.6, "completions/mean_length": 82.23046875, "completions/mean_terminated_length": 82.23046875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014077254306436169, "frac_reward_zero_std": 0.95625, "grad_norm": 26.18150520324707, "kl": 1.6660449056886137, "learning_rate": 4.321111111111111e-07, "loss": 0.0017, "num_tokens": 1050343160.0, "reward": 0.4140625, "reward_std": 0.03981967978179455, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.9050396919250489, "step": 15555 }, { "completion_length": 397.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 90.7171875, "completions/mean_terminated_length": 90.7171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014081779299784429, "frac_reward_zero_std": 0.95, "grad_norm": 2.5362346172332764, "kl": 1.597093031986151, "learning_rate": 4.320714285714285e-07, "loss": 0.0016, "num_tokens": 1050656430.0, "reward": 0.49375, "reward_std": 0.04718419872224331, "rewards/verify_chess_move/mean": 0.49375, "rewards/verify_chess_move/std": 0.8633765578269958, "step": 15560 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 389.6, "completions/max_terminated_length": 296.2, "completions/mean_length": 90.303125, "completions/mean_terminated_length": 89.78233642578125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01408630429313269, "frac_reward_zero_std": 0.95625, "grad_norm": 6.355713844299316, "kl": 0.38070323592983185, "learning_rate": 4.32031746031746e-07, "loss": 0.0004, "num_tokens": 1050970266.0, "reward": 0.4609375, "reward_std": 0.033669837191700934, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8782249450683594, "step": 15565 }, { "completion_length": 337.2, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/max_terminated_length": 337.2, "completions/mean_length": 91.20234375, "completions/mean_terminated_length": 91.20234375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01409082928648095, "frac_reward_zero_std": 0.975, "grad_norm": 2.665940046310425, "kl": 2.7844643997959793, "learning_rate": 4.3199206349206347e-07, "loss": 0.0028, "num_tokens": 1051285709.0, "reward": 0.4359375, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.893068265914917, "step": 15570 }, { "completion_length": 359.8, "completions/clipped_ratio": 0.0, "completions/max_length": 359.8, "completions/max_terminated_length": 359.8, "completions/mean_length": 87.93984375, "completions/mean_terminated_length": 87.93984375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01409535427982921, "frac_reward_zero_std": 0.95, "grad_norm": 10.588834762573242, "kl": 0.6722279714071192, "learning_rate": 4.31952380952381e-07, "loss": 0.0007, "num_tokens": 1051595456.0, "reward": 0.4671875, "reward_std": 0.04398044124245644, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.8757899284362793, "step": 15575 }, { "completion_length": 370.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 94.54375, "completions/mean_terminated_length": 94.54375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014099879273177468, "frac_reward_zero_std": 0.9875, "grad_norm": 0.16773855686187744, "kl": 2.561920250556432, "learning_rate": 4.319126984126984e-07, "loss": 0.0026, "num_tokens": 1051916248.0, "reward": 0.4359375, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.8889544010162354, "step": 15580 }, { "completion_length": 461.2, "completions/clipped_ratio": 0.0, "completions/max_length": 461.2, "completions/max_terminated_length": 461.2, "completions/mean_length": 88.309375, "completions/mean_terminated_length": 88.309375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014104404266525728, "frac_reward_zero_std": 0.98125, "grad_norm": 0.6330658197402954, "kl": 1.333426034031436, "learning_rate": 4.3187301587301584e-07, "loss": 0.0013, "num_tokens": 1052228292.0, "reward": 0.271875, "reward_std": 0.016675157845020293, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.95395587682724, "step": 15585 }, { "completion_length": 441.4, "completions/clipped_ratio": 0.0, "completions/max_length": 441.4, "completions/max_terminated_length": 441.4, "completions/mean_length": 89.4421875, "completions/mean_terminated_length": 89.4421875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014108929259873988, "frac_reward_zero_std": 0.95, "grad_norm": 2.829421043395996, "kl": 11.260063844756223, "learning_rate": 4.3183333333333334e-07, "loss": 0.0113, "num_tokens": 1052539938.0, "reward": 0.3390625, "reward_std": 0.04355610907077789, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9180680394172669, "step": 15590 }, { "completion_length": 410.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 410.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 86.50703125, "completions/mean_terminated_length": 85.97692260742187, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014113454253222248, "frac_reward_zero_std": 0.9625, "grad_norm": 3.8124759197235107, "kl": 2.886759452789556, "learning_rate": 4.3179365079365074e-07, "loss": 0.0029, "num_tokens": 1052846667.0, "reward": 0.4046875, "reward_std": 0.03356248140335083, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9001060605049134, "step": 15595 }, { "completion_length": 396.4, "completions/clipped_ratio": 0.0, "completions/max_length": 396.4, "completions/max_terminated_length": 396.4, "completions/mean_length": 91.43046875, "completions/mean_terminated_length": 91.43046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014117979246570508, "frac_reward_zero_std": 0.9875, "grad_norm": 20.3248233795166, "kl": 1.3154432166717016, "learning_rate": 4.3175396825396825e-07, "loss": 0.0013, "num_tokens": 1053162058.0, "reward": 0.440625, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8905897617340088, "step": 15600 }, { "completion_length": 296.2, "completions/clipped_ratio": 0.0, "completions/max_length": 296.2, "completions/max_terminated_length": 296.2, "completions/mean_length": 83.95703125, "completions/mean_terminated_length": 83.95703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014122504239918768, "frac_reward_zero_std": 0.96875, "grad_norm": 8.04029655456543, "kl": 2.4004617393016816, "learning_rate": 4.317142857142857e-07, "loss": 0.0024, "num_tokens": 1053466803.0, "reward": 0.30625, "reward_std": 0.02709311693906784, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9417837262153625, "step": 15605 }, { "completion_length": 508.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 508.2, "completions/max_terminated_length": 498.8, "completions/mean_length": 89.02890625, "completions/mean_terminated_length": 88.488232421875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014127029233267027, "frac_reward_zero_std": 0.9625, "grad_norm": 0.7306330800056458, "kl": 3.067796354566235, "learning_rate": 4.3167460317460316e-07, "loss": 0.0031, "num_tokens": 1053776656.0, "reward": 0.3703125, "reward_std": 0.03403330445289612, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.917079496383667, "step": 15610 }, { "completion_length": 408.2, "completions/clipped_ratio": 0.0, "completions/max_length": 408.2, "completions/max_terminated_length": 408.2, "completions/mean_length": 95.7234375, "completions/mean_terminated_length": 95.7234375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014131554226615287, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00124232133384794, "kl": 0.39902334795333444, "learning_rate": 4.316349206349206e-07, "loss": 0.0004, "num_tokens": 1054098518.0, "reward": 0.3421875, "reward_std": 0.028930898010730743, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9355259895324707, "step": 15615 }, { "completion_length": 279.6, "completions/clipped_ratio": 0.0, "completions/max_length": 279.6, "completions/max_terminated_length": 279.6, "completions/mean_length": 94.39921875, "completions/mean_terminated_length": 94.39921875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014136079219963547, "frac_reward_zero_std": 0.975, "grad_norm": 1.3283190727233887, "kl": 0.161041080346331, "learning_rate": 4.3159523809523807e-07, "loss": 0.0002, "num_tokens": 1054420469.0, "reward": 0.4203125, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8945372343063355, "step": 15620 }, { "completion_length": 311.8, "completions/clipped_ratio": 0.0, "completions/max_length": 311.8, "completions/max_terminated_length": 311.8, "completions/mean_length": 82.28828125, "completions/mean_terminated_length": 82.28828125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014140604213311807, "frac_reward_zero_std": 0.975, "grad_norm": 0.001580402022227645, "kl": 0.14720884033013135, "learning_rate": 4.3155555555555557e-07, "loss": 0.0001, "num_tokens": 1054722094.0, "reward": 0.2484375, "reward_std": 0.021778544783592223, "rewards/verify_chess_move/mean": 0.2484375, "rewards/verify_chess_move/std": 0.9698426485061645, "step": 15625 }, { "completion_length": 378.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 378.2, "completions/max_terminated_length": 319.2, "completions/mean_length": 95.18046875, "completions/mean_terminated_length": 93.59695587158203, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014145129206660067, "frac_reward_zero_std": 0.95625, "grad_norm": 1.4163538217544556, "kl": 0.695913570036646, "learning_rate": 4.31515873015873e-07, "loss": 0.0007, "num_tokens": 1055044533.0, "reward": 0.453125, "reward_std": 0.04003184549510479, "rewards/verify_chess_move/mean": 0.453125, "rewards/verify_chess_move/std": 0.8855796575546264, "step": 15630 }, { "completion_length": 368.2, "completions/clipped_ratio": 0.0, "completions/max_length": 368.2, "completions/max_terminated_length": 368.2, "completions/mean_length": 94.63359375, "completions/mean_terminated_length": 94.63359375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.014149654200008325, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0018813916249200702, "kl": 0.16619092444889247, "learning_rate": 4.3147619047619043e-07, "loss": 0.0002, "num_tokens": 1055367648.0, "reward": 0.30625, "reward_std": 0.01552036553621292, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9439481139183045, "step": 15635 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 457.4, "completions/max_terminated_length": 361.2, "completions/mean_length": 92.84296875, "completions/mean_terminated_length": 92.32051849365234, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014154179193356585, "frac_reward_zero_std": 0.975, "grad_norm": 0.3504483997821808, "kl": 0.3899774947552942, "learning_rate": 4.3143650793650793e-07, "loss": 0.0004, "num_tokens": 1055684895.0, "reward": 0.396875, "reward_std": 0.019727617129683496, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9111876010894775, "step": 15640 }, { "completion_length": 413.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 413.8, "completions/max_terminated_length": 337.8, "completions/mean_length": 94.77890625, "completions/mean_terminated_length": 94.24704132080078, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.014158704186704845, "frac_reward_zero_std": 0.95625, "grad_norm": 7.499078750610352, "kl": 0.341248337039724, "learning_rate": 4.313968253968254e-07, "loss": 0.0003, "num_tokens": 1056005620.0, "reward": 0.4640625, "reward_std": 0.03708576261997223, "rewards/verify_chess_move/mean": 0.4640625, "rewards/verify_chess_move/std": 0.8729402661323548, "step": 15645 }, { "completion_length": 444.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.6, "completions/max_terminated_length": 430.4, "completions/mean_length": 93.565625, "completions/mean_terminated_length": 93.03172912597657, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014163229180053106, "frac_reward_zero_std": 0.95, "grad_norm": 1.919466495513916, "kl": 1.4610459673800507, "learning_rate": 4.3135714285714284e-07, "loss": 0.0015, "num_tokens": 1056325376.0, "reward": 0.39375, "reward_std": 0.047137710824608804, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.9101765036582947, "step": 15650 }, { "completion_length": 283.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 82.0984375, "completions/mean_terminated_length": 82.0984375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014167754173401366, "frac_reward_zero_std": 0.9625, "grad_norm": 2.7398509979248047, "kl": 0.4971031391527504, "learning_rate": 4.313174603174603e-07, "loss": 0.0005, "num_tokens": 1056625918.0, "reward": 0.4375, "reward_std": 0.03424546979367733, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8984203100204468, "step": 15655 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 95.07734375, "completions/mean_terminated_length": 95.07734375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014172279166749626, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0023684301413595676, "kl": 1.3161840501008555, "learning_rate": 4.3127777777777775e-07, "loss": 0.0013, "num_tokens": 1056948825.0, "reward": 0.21875, "reward_std": 0.04092700108885765, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9689412832260131, "step": 15660 }, { "completion_length": 322.4, "completions/clipped_ratio": 0.0, "completions/max_length": 322.4, "completions/max_terminated_length": 322.4, "completions/mean_length": 91.22734375, "completions/mean_terminated_length": 91.22734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014176804160097884, "frac_reward_zero_std": 0.9625, "grad_norm": 0.08940713852643967, "kl": 0.7068982411641628, "learning_rate": 4.3123809523809526e-07, "loss": 0.0007, "num_tokens": 1057265188.0, "reward": 0.371875, "reward_std": 0.03130036816000938, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9186908364295959, "step": 15665 }, { "completion_length": 296.6, "completions/clipped_ratio": 0.0, "completions/max_length": 296.6, "completions/max_terminated_length": 296.6, "completions/mean_length": 84.70546875, "completions/mean_terminated_length": 84.70546875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014181329153446144, "frac_reward_zero_std": 0.9375, "grad_norm": 6.772106647491455, "kl": 1.6025332348654047, "learning_rate": 4.3119841269841266e-07, "loss": 0.0016, "num_tokens": 1057567987.0, "reward": 0.48125, "reward_std": 0.05713133662939072, "rewards/verify_chess_move/mean": 0.48125, "rewards/verify_chess_move/std": 0.8646466493606567, "step": 15670 }, { "completion_length": 315.4, "completions/clipped_ratio": 0.0, "completions/max_length": 315.4, "completions/max_terminated_length": 315.4, "completions/mean_length": 89.3984375, "completions/mean_terminated_length": 89.3984375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.014185854146794404, "frac_reward_zero_std": 0.9625, "grad_norm": 15.291133880615234, "kl": 0.9775617849081755, "learning_rate": 4.3115873015873017e-07, "loss": 0.001, "num_tokens": 1057880793.0, "reward": 0.50625, "reward_std": 0.031040730699896813, "rewards/verify_chess_move/mean": 0.50625, "rewards/verify_chess_move/std": 0.8540774345397949, "step": 15675 }, { "completion_length": 284.6, "completions/clipped_ratio": 0.0, "completions/max_length": 284.6, "completions/max_terminated_length": 284.6, "completions/mean_length": 89.70625, "completions/mean_terminated_length": 89.70625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014190379140142664, "frac_reward_zero_std": 0.96875, "grad_norm": 19.727588653564453, "kl": 0.6624969126656651, "learning_rate": 4.311190476190476e-07, "loss": 0.0007, "num_tokens": 1058194753.0, "reward": 0.3171875, "reward_std": 0.028460075333714484, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9310022950172424, "step": 15680 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 88.56171875, "completions/mean_terminated_length": 88.56171875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.014194904133490924, "frac_reward_zero_std": 0.95, "grad_norm": 0.33903974294662476, "kl": 1.9063893545418977, "learning_rate": 4.31079365079365e-07, "loss": 0.0019, "num_tokens": 1058506504.0, "reward": 0.3109375, "reward_std": 0.04739636480808258, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9331860303878784, "step": 15685 }, { "completion_length": 380.6, "completions/clipped_ratio": 0.0, "completions/max_length": 380.6, "completions/max_terminated_length": 380.6, "completions/mean_length": 89.11171875, "completions/mean_terminated_length": 89.11171875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014199429126839183, "frac_reward_zero_std": 0.94375, "grad_norm": 0.6804895997047424, "kl": 0.49981783209368585, "learning_rate": 4.3103968253968253e-07, "loss": 0.0005, "num_tokens": 1058819223.0, "reward": 0.46875, "reward_std": 0.0524997528642416, "rewards/verify_chess_move/mean": 0.46875, "rewards/verify_chess_move/std": 0.8809513807296753, "step": 15690 }, { "completion_length": 389.2, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/max_terminated_length": 389.2, "completions/mean_length": 90.46875, "completions/mean_terminated_length": 90.46875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014203954120187443, "frac_reward_zero_std": 0.9875, "grad_norm": 2.9256792068481445, "kl": 0.3815333720529452, "learning_rate": 4.31e-07, "loss": 0.0004, "num_tokens": 1059133463.0, "reward": 0.39375, "reward_std": 0.011572751402854919, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.9145597815513611, "step": 15695 }, { "completion_length": 596.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 596.2, "completions/max_terminated_length": 474.6, "completions/mean_length": 93.7078125, "completions/mean_terminated_length": 92.65940856933594, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014208479113535703, "frac_reward_zero_std": 0.9125, "grad_norm": 22.0346622467041, "kl": 2.2877936171833424, "learning_rate": 4.309603174603175e-07, "loss": 0.0023, "num_tokens": 1059451225.0, "reward": 0.3859375, "reward_std": 0.07280653230845928, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.8908375382423401, "step": 15700 }, { "completion_length": 518.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 518.6, "completions/max_terminated_length": 389.2, "completions/mean_length": 89.96484375, "completions/mean_terminated_length": 88.89993438720703, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014213004106883963, "frac_reward_zero_std": 0.975, "grad_norm": 37.37646484375, "kl": 0.8159817409818061, "learning_rate": 4.309206349206349e-07, "loss": 0.0008, "num_tokens": 1059762988.0, "reward": 0.484375, "reward_std": 0.02177756428718567, "rewards/verify_chess_move/mean": 0.484375, "rewards/verify_chess_move/std": 0.8660805225372314, "step": 15705 }, { "completion_length": 268.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 86.33359375, "completions/mean_terminated_length": 86.33359375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014217529100232223, "frac_reward_zero_std": 0.975, "grad_norm": 9.598307609558105, "kl": 1.0033816652838141, "learning_rate": 4.3088095238095234e-07, "loss": 0.001, "num_tokens": 1060069063.0, "reward": 0.5515625, "reward_std": 0.025194469094276428, "rewards/verify_chess_move/mean": 0.5515625, "rewards/verify_chess_move/std": 0.8312040567398071, "step": 15710 }, { "completion_length": 414.8, "completions/clipped_ratio": 0.0, "completions/max_length": 414.8, "completions/max_terminated_length": 414.8, "completions/mean_length": 91.7125, "completions/mean_terminated_length": 91.7125, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.014222054093580483, "frac_reward_zero_std": 0.9625, "grad_norm": 1.5534987449645996, "kl": 1.167280691396445, "learning_rate": 4.3084126984126985e-07, "loss": 0.0012, "num_tokens": 1060387415.0, "reward": 0.290625, "reward_std": 0.032195523381233215, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9117600083351135, "step": 15715 }, { "completion_length": 560.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 560.4, "completions/max_terminated_length": 290.6, "completions/mean_length": 85.3484375, "completions/mean_terminated_length": 83.74534606933594, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014226579086928742, "frac_reward_zero_std": 0.95625, "grad_norm": 15.971055030822754, "kl": 1.1398669095360674, "learning_rate": 4.3080158730158725e-07, "loss": 0.0011, "num_tokens": 1060689981.0, "reward": 0.3859375, "reward_std": 0.038664887100458144, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9206425428390503, "step": 15720 }, { "completion_length": 422.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 422.8, "completions/max_terminated_length": 340.4, "completions/mean_length": 87.40859375, "completions/mean_terminated_length": 86.8650619506836, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014231104080277002, "frac_reward_zero_std": 0.95625, "grad_norm": 0.25075021386146545, "kl": 1.1050082986708731, "learning_rate": 4.307619047619047e-07, "loss": 0.0011, "num_tokens": 1061000104.0, "reward": 0.396875, "reward_std": 0.03934787660837173, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.8992718577384948, "step": 15725 }, { "completion_length": 376.2, "completions/clipped_ratio": 0.0, "completions/max_length": 376.2, "completions/max_terminated_length": 376.2, "completions/mean_length": 88.7171875, "completions/mean_terminated_length": 88.7171875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014235629073625262, "frac_reward_zero_std": 0.975, "grad_norm": 8.913947105407715, "kl": 0.5618843860225752, "learning_rate": 4.307222222222222e-07, "loss": 0.0006, "num_tokens": 1061311950.0, "reward": 0.415625, "reward_std": 0.01767766885459423, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.8931599497795105, "step": 15730 }, { "completion_length": 318.8, "completions/clipped_ratio": 0.0, "completions/max_length": 318.8, "completions/max_terminated_length": 318.8, "completions/mean_length": 89.03828125, "completions/mean_terminated_length": 89.03828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014240154066973522, "frac_reward_zero_std": 0.975, "grad_norm": 0.5940341949462891, "kl": 0.6950178276747465, "learning_rate": 4.3068253968253967e-07, "loss": 0.0007, "num_tokens": 1061623623.0, "reward": 0.5203125, "reward_std": 0.023144521936774253, "rewards/verify_chess_move/mean": 0.5203125, "rewards/verify_chess_move/std": 0.8350409626960754, "step": 15735 }, { "completion_length": 319.2, "completions/clipped_ratio": 0.0, "completions/max_length": 319.2, "completions/max_terminated_length": 319.2, "completions/mean_length": 91.278125, "completions/mean_terminated_length": 91.278125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014244679060321782, "frac_reward_zero_std": 0.975, "grad_norm": 2.7592580318450928, "kl": 0.8735025165835395, "learning_rate": 4.306428571428571e-07, "loss": 0.0009, "num_tokens": 1061940491.0, "reward": 0.1984375, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.1984375, "rewards/verify_chess_move/std": 0.9654842972755432, "step": 15740 }, { "completion_length": 304.6, "completions/clipped_ratio": 0.0, "completions/max_length": 304.6, "completions/max_terminated_length": 304.6, "completions/mean_length": 89.40078125, "completions/mean_terminated_length": 89.40078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01424920405367004, "frac_reward_zero_std": 0.9625, "grad_norm": 8.6509428024292, "kl": 0.14617209455464036, "learning_rate": 4.306031746031746e-07, "loss": 0.0001, "num_tokens": 1062252812.0, "reward": 0.4125, "reward_std": 0.03130036816000938, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.9023956894874573, "step": 15745 }, { "completion_length": 381.6, "completions/clipped_ratio": 0.0, "completions/max_length": 381.6, "completions/max_terminated_length": 381.6, "completions/mean_length": 86.16640625, "completions/mean_terminated_length": 86.16640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0142537290470183, "frac_reward_zero_std": 0.9625, "grad_norm": 2.4715285301208496, "kl": 0.61840513816569, "learning_rate": 4.3056349206349203e-07, "loss": 0.0006, "num_tokens": 1062558417.0, "reward": 0.425, "reward_std": 0.03130036890506745, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.9009027242660522, "step": 15750 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 93.0046875, "completions/mean_terminated_length": 93.0046875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01425825404036656, "frac_reward_zero_std": 0.975, "grad_norm": 0.014467164874076843, "kl": 0.4661012392025441, "learning_rate": 4.3052380952380954e-07, "loss": 0.0005, "num_tokens": 1062876655.0, "reward": 0.3765625, "reward_std": 0.02198973037302494, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9195046067237854, "step": 15755 }, { "completion_length": 401.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 401.4, "completions/max_terminated_length": 372.4, "completions/mean_length": 89.153125, "completions/mean_terminated_length": 88.09342346191406, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01426277903371482, "frac_reward_zero_std": 0.9375, "grad_norm": 13.37839412689209, "kl": 2.900183375319466, "learning_rate": 4.3048412698412694e-07, "loss": 0.0029, "num_tokens": 1063187123.0, "reward": 0.4578125, "reward_std": 0.050344996899366376, "rewards/verify_chess_move/mean": 0.4578125, "rewards/verify_chess_move/std": 0.8854338884353637, "step": 15760 }, { "completion_length": 502.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 502.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 97.52421875, "completions/mean_terminated_length": 97.00587463378906, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01426730402706308, "frac_reward_zero_std": 0.94375, "grad_norm": 0.020108750090003014, "kl": 1.5386063262121752, "learning_rate": 4.3044444444444444e-07, "loss": 0.0015, "num_tokens": 1063512090.0, "reward": 0.2765625, "reward_std": 0.04934248328208923, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9549507379531861, "step": 15765 }, { "completion_length": 401.8, "completions/clipped_ratio": 0.0, "completions/max_length": 401.8, "completions/max_terminated_length": 401.8, "completions/mean_length": 92.05703125, "completions/mean_terminated_length": 92.05703125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01427182902041134, "frac_reward_zero_std": 0.9625, "grad_norm": 0.005515174008905888, "kl": 0.7036334455595352, "learning_rate": 4.304047619047619e-07, "loss": 0.0007, "num_tokens": 1063828755.0, "reward": 0.4140625, "reward_std": 0.032667326554656026, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.8919719696044922, "step": 15770 }, { "completion_length": 507.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 507.6, "completions/max_terminated_length": 468.0, "completions/mean_length": 91.8671875, "completions/mean_terminated_length": 91.33490905761718, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0142763540137596, "frac_reward_zero_std": 0.95625, "grad_norm": 9.875418663024902, "kl": 3.9314157637185416, "learning_rate": 4.303650793650793e-07, "loss": 0.0039, "num_tokens": 1064144057.0, "reward": 0.4109375, "reward_std": 0.03866488784551621, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.9077800035476684, "step": 15775 }, { "completion_length": 321.6, "completions/clipped_ratio": 0.0, "completions/max_length": 321.6, "completions/max_terminated_length": 321.6, "completions/mean_length": 89.4921875, "completions/mean_terminated_length": 89.4921875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01428087900710786, "frac_reward_zero_std": 0.9625, "grad_norm": 2.024200916290283, "kl": 0.3699845735216513, "learning_rate": 4.303253968253968e-07, "loss": 0.0004, "num_tokens": 1064456975.0, "reward": 0.3078125, "reward_std": 0.030617379397153855, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9464658260345459, "step": 15780 }, { "completion_length": 501.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 501.4, "completions/max_terminated_length": 345.2, "completions/mean_length": 93.16796875, "completions/mean_terminated_length": 92.12214050292968, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01428540400045612, "frac_reward_zero_std": 0.975, "grad_norm": 0.008022662252187729, "kl": 0.8187814506702125, "learning_rate": 4.3028571428571426e-07, "loss": 0.0008, "num_tokens": 1064774310.0, "reward": 0.3203125, "reward_std": 0.024039677530527114, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9457858920097351, "step": 15785 }, { "completion_length": 286.6, "completions/clipped_ratio": 0.0, "completions/max_length": 286.6, "completions/max_terminated_length": 286.6, "completions/mean_length": 89.58828125, "completions/mean_terminated_length": 89.58828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01428992899380438, "frac_reward_zero_std": 0.95625, "grad_norm": 9.002176284790039, "kl": 0.37285892642103136, "learning_rate": 4.3024603174603177e-07, "loss": 0.0004, "num_tokens": 1065087687.0, "reward": 0.3796875, "reward_std": 0.03640375509858131, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.8789059281349182, "step": 15790 }, { "completion_length": 361.2, "completions/clipped_ratio": 0.0, "completions/max_length": 361.2, "completions/max_terminated_length": 361.2, "completions/mean_length": 78.63125, "completions/mean_terminated_length": 78.63125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01429445398715264, "frac_reward_zero_std": 0.975, "grad_norm": 0.7448951601982117, "kl": 1.2034881191095337, "learning_rate": 4.3020634920634917e-07, "loss": 0.0012, "num_tokens": 1065382711.0, "reward": 0.425, "reward_std": 0.02177756354212761, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.9020922303199768, "step": 15795 }, { "completion_length": 342.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 91.58359375, "completions/mean_terminated_length": 91.58359375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014298978980500898, "frac_reward_zero_std": 0.975, "grad_norm": 13.473316192626953, "kl": 0.26037042811512945, "learning_rate": 4.301666666666666e-07, "loss": 0.0003, "num_tokens": 1065699346.0, "reward": 0.3546875, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9268287062644959, "step": 15800 }, { "completion_length": 443.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 443.6, "completions/max_terminated_length": 349.8, "completions/mean_length": 97.9328125, "completions/mean_terminated_length": 96.88785095214844, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014303503973849158, "frac_reward_zero_std": 0.94375, "grad_norm": 5.403481960296631, "kl": 0.17745043649338185, "learning_rate": 4.3012698412698413e-07, "loss": 0.0002, "num_tokens": 1066026276.0, "reward": 0.3046875, "reward_std": 0.04818769246339798, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.948190200328827, "step": 15805 }, { "completion_length": 452.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 452.4, "completions/max_terminated_length": 435.4, "completions/mean_length": 97.28515625, "completions/mean_terminated_length": 96.76051330566406, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014308028967197418, "frac_reward_zero_std": 0.925, "grad_norm": 0.002177139976993203, "kl": 1.6228131126845255, "learning_rate": 4.3008730158730153e-07, "loss": 0.0016, "num_tokens": 1066352553.0, "reward": 0.378125, "reward_std": 0.06417986005544662, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9249892354011535, "step": 15810 }, { "completion_length": 376.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 376.4, "completions/max_terminated_length": 282.6, "completions/mean_length": 93.34375, "completions/mean_terminated_length": 91.78968658447266, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014312553960545678, "frac_reward_zero_std": 0.9875, "grad_norm": 0.4264196753501892, "kl": 0.315237008780241, "learning_rate": 4.3004761904761904e-07, "loss": 0.0003, "num_tokens": 1066671849.0, "reward": 0.3671875, "reward_std": 0.011100948229432106, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9184264898300171, "step": 15815 }, { "completion_length": 403.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 403.2, "completions/max_terminated_length": 325.6, "completions/mean_length": 97.2640625, "completions/mean_terminated_length": 96.7399398803711, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.014317078953893938, "frac_reward_zero_std": 0.98125, "grad_norm": 0.001695395214483142, "kl": 0.2394581962376833, "learning_rate": 4.300079365079365e-07, "loss": 0.0002, "num_tokens": 1066996003.0, "reward": 0.4625, "reward_std": 0.017570312693715097, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.8817407965660096, "step": 15820 }, { "completion_length": 280.8, "completions/clipped_ratio": 0.0, "completions/max_length": 280.8, "completions/max_terminated_length": 280.8, "completions/mean_length": 93.54609375, "completions/mean_terminated_length": 93.54609375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014321603947242198, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0017917610239237547, "kl": 0.27307498920708895, "learning_rate": 4.2996825396825394e-07, "loss": 0.0003, "num_tokens": 1067316782.0, "reward": 0.3421875, "reward_std": 0.037769732996821404, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9386862277984619, "step": 15825 }, { "completion_length": 491.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 491.2, "completions/max_terminated_length": 415.8, "completions/mean_length": 86.8734375, "completions/mean_terminated_length": 86.33953857421875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014326128940590457, "frac_reward_zero_std": 0.95, "grad_norm": 51.41184616088867, "kl": 1.4692886090837418, "learning_rate": 4.299285714285714e-07, "loss": 0.0015, "num_tokens": 1067623980.0, "reward": 0.3640625, "reward_std": 0.0444512639194727, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9263355255126953, "step": 15830 }, { "completion_length": 262.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 82.834375, "completions/mean_terminated_length": 82.834375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014330653933938717, "frac_reward_zero_std": 0.96875, "grad_norm": 17.033733367919922, "kl": 0.17024752735160292, "learning_rate": 4.2988888888888885e-07, "loss": 0.0002, "num_tokens": 1067926784.0, "reward": 0.328125, "reward_std": 0.028883427381515503, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.90389643907547, "step": 15835 }, { "completion_length": 270.4, "completions/clipped_ratio": 0.0, "completions/max_length": 270.4, "completions/max_terminated_length": 270.4, "completions/mean_length": 84.46171875, "completions/mean_terminated_length": 84.46171875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014335178927286977, "frac_reward_zero_std": 0.9875, "grad_norm": 0.9308593273162842, "kl": 0.12829747716896236, "learning_rate": 4.2984920634920636e-07, "loss": 0.0001, "num_tokens": 1068230855.0, "reward": 0.4515625, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8721375226974487, "step": 15840 }, { "completion_length": 374.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 99.125, "completions/mean_terminated_length": 99.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014339703920635237, "frac_reward_zero_std": 0.94375, "grad_norm": 24.65160369873047, "kl": 1.3100105601362884, "learning_rate": 4.298095238095238e-07, "loss": 0.0013, "num_tokens": 1068557847.0, "reward": 0.44375, "reward_std": 0.046820733696222305, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8872060894966125, "step": 15845 }, { "completion_length": 286.4, "completions/clipped_ratio": 0.0, "completions/max_length": 286.4, "completions/max_terminated_length": 286.4, "completions/mean_length": 91.68671875, "completions/mean_terminated_length": 91.68671875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014344228913983497, "frac_reward_zero_std": 0.96875, "grad_norm": 5.220074653625488, "kl": 2.951460825977847, "learning_rate": 4.297698412698412e-07, "loss": 0.003, "num_tokens": 1068877318.0, "reward": 0.2984375, "reward_std": 0.0277761060744524, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9551705241203308, "step": 15850 }, { "completion_length": 451.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 451.8, "completions/max_terminated_length": 353.4, "completions/mean_length": 84.52578125, "completions/mean_terminated_length": 83.99701080322265, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014348753907331755, "frac_reward_zero_std": 0.9875, "grad_norm": 4.384960174560547, "kl": 2.6976311480626465, "learning_rate": 4.297301587301587e-07, "loss": 0.0027, "num_tokens": 1069181215.0, "reward": 0.38125, "reward_std": 0.011572751402854919, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9179788827896118, "step": 15855 }, { "completion_length": 335.4, "completions/clipped_ratio": 0.0, "completions/max_length": 335.4, "completions/max_terminated_length": 335.4, "completions/mean_length": 92.42890625, "completions/mean_terminated_length": 92.42890625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014353278900680016, "frac_reward_zero_std": 0.95625, "grad_norm": 17.835655212402344, "kl": 0.9439863269682973, "learning_rate": 4.296904761904762e-07, "loss": 0.0009, "num_tokens": 1069497948.0, "reward": 0.3625, "reward_std": 0.03982065878808498, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9278267383575439, "step": 15860 }, { "completion_length": 453.6, "completions/clipped_ratio": 0.0, "completions/max_length": 453.6, "completions/max_terminated_length": 453.6, "completions/mean_length": 94.30234375, "completions/mean_terminated_length": 94.30234375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014357803894028276, "frac_reward_zero_std": 0.99375, "grad_norm": 0.6346418261528015, "kl": 0.46569810605142264, "learning_rate": 4.2965079365079363e-07, "loss": 0.0005, "num_tokens": 1069819791.0, "reward": 0.309375, "reward_std": 0.005786375701427459, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9518654584884644, "step": 15865 }, { "completion_length": 405.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 405.2, "completions/max_terminated_length": 323.6, "completions/mean_length": 94.1671875, "completions/mean_terminated_length": 93.13202514648438, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014362328887376536, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021337801590561867, "kl": 1.1081467172363773, "learning_rate": 4.296111111111111e-07, "loss": 0.0011, "num_tokens": 1070137701.0, "reward": 0.3734375, "reward_std": 0.024831003323197366, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9018653988838196, "step": 15870 }, { "completion_length": 274.4, "completions/clipped_ratio": 0.0, "completions/max_length": 274.4, "completions/max_terminated_length": 274.4, "completions/mean_length": 86.371875, "completions/mean_terminated_length": 86.371875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014366853880724796, "frac_reward_zero_std": 0.9875, "grad_norm": 0.007574026472866535, "kl": 0.7371487976051867, "learning_rate": 4.2957142857142854e-07, "loss": 0.0007, "num_tokens": 1070445953.0, "reward": 0.4, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.9120412588119506, "step": 15875 }, { "completion_length": 444.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 444.8, "completions/max_terminated_length": 351.6, "completions/mean_length": 91.91015625, "completions/mean_terminated_length": 90.86636352539062, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014371378874073056, "frac_reward_zero_std": 0.99375, "grad_norm": 7.049808025360107, "kl": 1.581681486289017, "learning_rate": 4.2953174603174604e-07, "loss": 0.0016, "num_tokens": 1070763526.0, "reward": 0.434375, "reward_std": 0.005786375701427459, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.8991016626358033, "step": 15880 }, { "completion_length": 378.4, "completions/clipped_ratio": 0.0, "completions/max_length": 378.4, "completions/max_terminated_length": 378.4, "completions/mean_length": 103.01171875, "completions/mean_terminated_length": 103.01171875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014375903867421314, "frac_reward_zero_std": 0.95625, "grad_norm": 23.94818687438965, "kl": 4.703831462841481, "learning_rate": 4.2949206349206344e-07, "loss": 0.0047, "num_tokens": 1071100797.0, "reward": 0.25625, "reward_std": 0.04003184512257576, "rewards/verify_chess_move/mean": 0.25625, "rewards/verify_chess_move/std": 0.9579355001449585, "step": 15885 }, { "completion_length": 413.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 413.8, "completions/max_terminated_length": 321.6, "completions/mean_length": 86.096875, "completions/mean_terminated_length": 85.56091766357422, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014380428860769574, "frac_reward_zero_std": 0.95, "grad_norm": 26.261383056640625, "kl": 1.631474501930643, "learning_rate": 4.2945238095238095e-07, "loss": 0.0016, "num_tokens": 1071408593.0, "reward": 0.328125, "reward_std": 0.04218915067613125, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9443780064582825, "step": 15890 }, { "completion_length": 443.6, "completions/clipped_ratio": 0.0, "completions/max_length": 443.6, "completions/max_terminated_length": 443.6, "completions/mean_length": 101.2328125, "completions/mean_terminated_length": 101.2328125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014384953854117834, "frac_reward_zero_std": 0.95, "grad_norm": 21.770854949951172, "kl": 1.005197437480092, "learning_rate": 4.294126984126984e-07, "loss": 0.001, "num_tokens": 1071741083.0, "reward": 0.1734375, "reward_std": 0.04629002511501312, "rewards/verify_chess_move/mean": 0.1734375, "rewards/verify_chess_move/std": 0.9804054141044617, "step": 15895 }, { "completion_length": 414.4, "completions/clipped_ratio": 0.0, "completions/max_length": 414.4, "completions/max_terminated_length": 414.4, "completions/mean_length": 100.85625, "completions/mean_terminated_length": 100.85625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014389478847466095, "frac_reward_zero_std": 0.93125, "grad_norm": 34.673301696777344, "kl": 3.3001168500166385, "learning_rate": 4.293730158730158e-07, "loss": 0.0033, "num_tokens": 1072073363.0, "reward": 0.2140625, "reward_std": 0.06244590915739536, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.9686714768409729, "step": 15900 }, { "completion_length": 289.8, "completions/clipped_ratio": 0.0, "completions/max_length": 289.8, "completions/max_terminated_length": 289.8, "completions/mean_length": 89.728125, "completions/mean_terminated_length": 89.728125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014394003840814355, "frac_reward_zero_std": 0.9625, "grad_norm": 1.6835628747940063, "kl": 4.492825621366501, "learning_rate": 4.293333333333333e-07, "loss": 0.0045, "num_tokens": 1072387023.0, "reward": 0.290625, "reward_std": 0.031300367787480354, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9546215772628784, "step": 15905 }, { "completion_length": 261.4, "completions/clipped_ratio": 0.0, "completions/max_length": 261.4, "completions/max_terminated_length": 261.4, "completions/mean_length": 80.40078125, "completions/mean_terminated_length": 80.40078125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014398528834162613, "frac_reward_zero_std": 0.96875, "grad_norm": 7.421642303466797, "kl": 2.0339046090608464, "learning_rate": 4.2929365079365077e-07, "loss": 0.002, "num_tokens": 1072685096.0, "reward": 0.3578125, "reward_std": 0.028930898010730743, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9322473049163819, "step": 15910 }, { "completion_length": 485.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.4, "completions/max_terminated_length": 442.0, "completions/mean_length": 95.4671875, "completions/mean_terminated_length": 94.94210357666016, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.014403053827510873, "frac_reward_zero_std": 0.95625, "grad_norm": 5.057626724243164, "kl": 2.631249211495742, "learning_rate": 4.292539682539683e-07, "loss": 0.0026, "num_tokens": 1073005894.0, "reward": 0.4296875, "reward_std": 0.03661493994295597, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8738789439201355, "step": 15915 }, { "completion_length": 391.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 391.8, "completions/max_terminated_length": 296.4, "completions/mean_length": 84.6828125, "completions/mean_terminated_length": 84.14962768554688, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014407578820859133, "frac_reward_zero_std": 0.95625, "grad_norm": 22.162670135498047, "kl": 0.36672391279134897, "learning_rate": 4.292142857142857e-07, "loss": 0.0004, "num_tokens": 1073309144.0, "reward": 0.459375, "reward_std": 0.03982066065073013, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8776255369186401, "step": 15920 }, { "completion_length": 403.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 403.2, "completions/max_terminated_length": 322.8, "completions/mean_length": 93.140625, "completions/mean_terminated_length": 92.62129669189453, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014412103814207393, "frac_reward_zero_std": 0.95, "grad_norm": 0.11335225403308868, "kl": 1.1583837512880564, "learning_rate": 4.2917460317460313e-07, "loss": 0.0012, "num_tokens": 1073628260.0, "reward": 0.2625, "reward_std": 0.043768275156617166, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9529136419296265, "step": 15925 }, { "completion_length": 436.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 436.6, "completions/max_terminated_length": 343.4, "completions/mean_length": 91.828125, "completions/mean_terminated_length": 91.29679870605469, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014416628807555653, "frac_reward_zero_std": 0.9625, "grad_norm": 0.34801527857780457, "kl": 0.7027591051533818, "learning_rate": 4.2913492063492064e-07, "loss": 0.0007, "num_tokens": 1073944736.0, "reward": 0.3625, "reward_std": 0.03219552449882031, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9308522939682007, "step": 15930 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 97.24140625, "completions/mean_terminated_length": 97.24140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014421153800903913, "frac_reward_zero_std": 0.975, "grad_norm": 64.40113067626953, "kl": 4.885919507336803, "learning_rate": 4.290952380952381e-07, "loss": 0.0049, "num_tokens": 1074270941.0, "reward": 0.3, "reward_std": 0.02177756391465664, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9457934617996215, "step": 15935 }, { "completion_length": 433.8, "completions/clipped_ratio": 0.0, "completions/max_length": 433.8, "completions/max_terminated_length": 433.8, "completions/mean_length": 86.8640625, "completions/mean_terminated_length": 86.8640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014425678794252172, "frac_reward_zero_std": 0.96875, "grad_norm": 12.495222091674805, "kl": 1.1272369380807503, "learning_rate": 4.2905555555555554e-07, "loss": 0.0011, "num_tokens": 1074579911.0, "reward": 0.309375, "reward_std": 0.026409147679805754, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9254719734191894, "step": 15940 }, { "completion_length": 302.2, "completions/clipped_ratio": 0.0, "completions/max_length": 302.2, "completions/max_terminated_length": 302.2, "completions/mean_length": 82.7515625, "completions/mean_terminated_length": 82.7515625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014430203787600432, "frac_reward_zero_std": 0.9625, "grad_norm": 0.002084751147776842, "kl": 0.22864994276314973, "learning_rate": 4.29015873015873e-07, "loss": 0.0002, "num_tokens": 1074880929.0, "reward": 0.4171875, "reward_std": 0.03330284468829632, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.9063334584236145, "step": 15945 }, { "completion_length": 308.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 91.94375, "completions/mean_terminated_length": 91.94375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014434728780948692, "frac_reward_zero_std": 0.93125, "grad_norm": 22.851179122924805, "kl": 0.4975398362847045, "learning_rate": 4.2897619047619045e-07, "loss": 0.0005, "num_tokens": 1075197825.0, "reward": 0.371875, "reward_std": 0.05886430740356445, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9280598044395447, "step": 15950 }, { "completion_length": 408.8, "completions/clipped_ratio": 0.0, "completions/max_length": 408.8, "completions/max_terminated_length": 408.8, "completions/mean_length": 88.290625, "completions/mean_terminated_length": 88.290625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014439253774296952, "frac_reward_zero_std": 0.975, "grad_norm": 9.736886024475098, "kl": 0.27627953633200375, "learning_rate": 4.289365079365079e-07, "loss": 0.0003, "num_tokens": 1075507805.0, "reward": 0.325, "reward_std": 0.02177756354212761, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9420396327972412, "step": 15955 }, { "completion_length": 443.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 443.6, "completions/max_terminated_length": 442.6, "completions/mean_length": 95.21171875, "completions/mean_terminated_length": 94.6989501953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014443778767645212, "frac_reward_zero_std": 0.96875, "grad_norm": 0.021708332002162933, "kl": 1.1381418598932214, "learning_rate": 4.2889682539682536e-07, "loss": 0.0011, "num_tokens": 1075828884.0, "reward": 0.328125, "reward_std": 0.025513992458581925, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.939511501789093, "step": 15960 }, { "completion_length": 269.2, "completions/clipped_ratio": 0.0, "completions/max_length": 269.2, "completions/max_terminated_length": 269.2, "completions/mean_length": 81.8078125, "completions/mean_terminated_length": 81.8078125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01444830376099347, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0021868532057851553, "kl": 0.7338306274497881, "learning_rate": 4.2885714285714287e-07, "loss": 0.0007, "num_tokens": 1076130294.0, "reward": 0.3203125, "reward_std": 0.032667326554656026, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.93100026845932, "step": 15965 }, { "completion_length": 428.4, "completions/clipped_ratio": 0.0, "completions/max_length": 428.4, "completions/max_terminated_length": 428.4, "completions/mean_length": 88.3953125, "completions/mean_terminated_length": 88.3953125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01445282875434173, "frac_reward_zero_std": 0.98125, "grad_norm": 3.919867992401123, "kl": 0.4400255350978114, "learning_rate": 4.288174603174603e-07, "loss": 0.0004, "num_tokens": 1076439384.0, "reward": 0.384375, "reward_std": 0.01872510462999344, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9202402353286743, "step": 15970 }, { "completion_length": 365.6, "completions/clipped_ratio": 0.0, "completions/max_length": 365.6, "completions/max_terminated_length": 365.6, "completions/mean_length": 101.36171875, "completions/mean_terminated_length": 101.36171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01445735374768999, "frac_reward_zero_std": 0.94375, "grad_norm": 2.705688953399658, "kl": 0.9254183614859357, "learning_rate": 4.287777777777777e-07, "loss": 0.0009, "num_tokens": 1076772655.0, "reward": 0.2578125, "reward_std": 0.0518167644739151, "rewards/verify_chess_move/mean": 0.2578125, "rewards/verify_chess_move/std": 0.9655938386917114, "step": 15975 }, { "completion_length": 433.4, "completions/clipped_ratio": 0.0, "completions/max_length": 433.4, "completions/max_terminated_length": 433.4, "completions/mean_length": 90.365625, "completions/mean_terminated_length": 90.365625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01446187874103825, "frac_reward_zero_std": 0.95625, "grad_norm": 8.353869438171387, "kl": 0.2010274731554091, "learning_rate": 4.2873809523809523e-07, "loss": 0.0002, "num_tokens": 1077087683.0, "reward": 0.4359375, "reward_std": 0.03661494068801403, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.8955950260162353, "step": 15980 }, { "completion_length": 416.8, "completions/clipped_ratio": 0.0, "completions/max_length": 416.8, "completions/max_terminated_length": 416.8, "completions/mean_length": 92.72265625, "completions/mean_terminated_length": 92.72265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014466403734386511, "frac_reward_zero_std": 0.94375, "grad_norm": 0.9876240491867065, "kl": 1.0432199944159948, "learning_rate": 4.286984126984127e-07, "loss": 0.001, "num_tokens": 1077404488.0, "reward": 0.475, "reward_std": 0.052028929442167283, "rewards/verify_chess_move/mean": 0.475, "rewards/verify_chess_move/std": 0.8692773222923279, "step": 15985 }, { "completion_length": 345.8, "completions/clipped_ratio": 0.0, "completions/max_length": 345.8, "completions/max_terminated_length": 345.8, "completions/mean_length": 87.01796875, "completions/mean_terminated_length": 87.01796875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014470928727734771, "frac_reward_zero_std": 0.94375, "grad_norm": 15.696674346923828, "kl": 1.127606478636153, "learning_rate": 4.286587301587302e-07, "loss": 0.0011, "num_tokens": 1077711479.0, "reward": 0.3625, "reward_std": 0.04682073295116425, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9152785062789917, "step": 15990 }, { "completion_length": 389.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 389.2, "completions/max_terminated_length": 348.0, "completions/mean_length": 89.89609375, "completions/mean_terminated_length": 89.37223815917969, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01447545372108303, "frac_reward_zero_std": 0.94375, "grad_norm": 6.591354846954346, "kl": 1.0981618643272668, "learning_rate": 4.286190476190476e-07, "loss": 0.0011, "num_tokens": 1078024186.0, "reward": 0.38125, "reward_std": 0.04524160884320736, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9116376638412476, "step": 15995 }, { "completion_length": 406.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 406.6, "completions/max_terminated_length": 322.4, "completions/mean_length": 97.665625, "completions/mean_terminated_length": 97.15547637939453, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01447997871443129, "frac_reward_zero_std": 0.9625, "grad_norm": 0.8468102216720581, "kl": 0.7780950974789448, "learning_rate": 4.2857936507936504e-07, "loss": 0.0008, "num_tokens": 1078351582.0, "reward": 0.2625, "reward_std": 0.03582459464669228, "rewards/verify_chess_move/mean": 0.2625, "rewards/verify_chess_move/std": 0.9570918798446655, "step": 16000 }, { "completion_length": 336.2, "completions/clipped_ratio": 0.0, "completions/max_length": 336.2, "completions/max_terminated_length": 336.2, "completions/mean_length": 91.0796875, "completions/mean_terminated_length": 91.0796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01448450370777955, "frac_reward_zero_std": 0.9625, "grad_norm": 1.6714764833450317, "kl": 0.5952342502539978, "learning_rate": 4.2853968253968255e-07, "loss": 0.0006, "num_tokens": 1078666468.0, "reward": 0.3515625, "reward_std": 0.03650758340954781, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.933574628829956, "step": 16005 }, { "completion_length": 384.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 384.8, "completions/max_terminated_length": 291.2, "completions/mean_length": 90.70625, "completions/mean_terminated_length": 90.18377532958985, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01448902870112781, "frac_reward_zero_std": 0.94375, "grad_norm": 17.781187057495117, "kl": 0.3717656533932313, "learning_rate": 4.2849999999999995e-07, "loss": 0.0004, "num_tokens": 1078979204.0, "reward": 0.38125, "reward_std": 0.04929501377046108, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9083955883979797, "step": 16010 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/max_terminated_length": 369.2, "completions/mean_length": 85.6296875, "completions/mean_terminated_length": 85.6296875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01449355369447607, "frac_reward_zero_std": 0.9625, "grad_norm": 8.109200477600098, "kl": 1.2620231795823202, "learning_rate": 4.2846031746031746e-07, "loss": 0.0013, "num_tokens": 1079284218.0, "reward": 0.4421875, "reward_std": 0.03266732692718506, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8930928587913514, "step": 16015 }, { "completion_length": 429.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 429.4, "completions/max_terminated_length": 317.8, "completions/mean_length": 89.1, "completions/mean_terminated_length": 88.56217346191406, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.014498078687824328, "frac_reward_zero_std": 0.96875, "grad_norm": 0.07599248737096786, "kl": 0.273477334540803, "learning_rate": 4.284206349206349e-07, "loss": 0.0003, "num_tokens": 1079594586.0, "reward": 0.4515625, "reward_std": 0.028930897638201714, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8725861430168151, "step": 16020 }, { "completion_length": 310.4, "completions/clipped_ratio": 0.0, "completions/max_length": 310.4, "completions/max_terminated_length": 310.4, "completions/mean_length": 88.7484375, "completions/mean_terminated_length": 88.7484375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014502603681172588, "frac_reward_zero_std": 0.96875, "grad_norm": 2.489797830581665, "kl": 0.21267242636531591, "learning_rate": 4.2838095238095237e-07, "loss": 0.0002, "num_tokens": 1079906728.0, "reward": 0.334375, "reward_std": 0.02346404530107975, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9420133590698242, "step": 16025 }, { "completion_length": 323.6, "completions/clipped_ratio": 0.0, "completions/max_length": 323.6, "completions/max_terminated_length": 323.6, "completions/mean_length": 90.3859375, "completions/mean_terminated_length": 90.3859375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014507128674520848, "frac_reward_zero_std": 0.9375, "grad_norm": 18.49922752380371, "kl": 0.8966167463688179, "learning_rate": 4.283412698412698e-07, "loss": 0.0009, "num_tokens": 1080220910.0, "reward": 0.4046875, "reward_std": 0.052394944429397586, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.8981793045997619, "step": 16030 }, { "completion_length": 360.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 89.24765625, "completions/mean_terminated_length": 89.24765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014511653667869108, "frac_reward_zero_std": 0.95, "grad_norm": 0.9261398315429688, "kl": 1.96147160155233, "learning_rate": 4.283015873015873e-07, "loss": 0.002, "num_tokens": 1080534811.0, "reward": 0.325, "reward_std": 0.04287312030792236, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9395694613456727, "step": 16035 }, { "completion_length": 376.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 87.21640625, "completions/mean_terminated_length": 87.21640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014516178661217368, "frac_reward_zero_std": 0.9625, "grad_norm": 1.0695891380310059, "kl": 0.27538575334474447, "learning_rate": 4.282619047619048e-07, "loss": 0.0003, "num_tokens": 1080841984.0, "reward": 0.4546875, "reward_std": 0.03629639893770218, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8834324359893799, "step": 16040 }, { "completion_length": 377.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 96.853125, "completions/mean_terminated_length": 96.853125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014520703654565629, "frac_reward_zero_std": 0.95, "grad_norm": 12.805479049682617, "kl": 0.9874060781556182, "learning_rate": 4.282222222222222e-07, "loss": 0.001, "num_tokens": 1081167884.0, "reward": 0.315625, "reward_std": 0.04513425305485726, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9249073386192321, "step": 16045 }, { "completion_length": 342.8, "completions/clipped_ratio": 0.0, "completions/max_length": 342.8, "completions/max_terminated_length": 342.8, "completions/mean_length": 83.55, "completions/mean_terminated_length": 83.55, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014525228647913887, "frac_reward_zero_std": 0.975, "grad_norm": 7.24401330947876, "kl": 0.32041129090357573, "learning_rate": 4.2818253968253964e-07, "loss": 0.0003, "num_tokens": 1081469844.0, "reward": 0.3796875, "reward_std": 0.019939782470464705, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9119413733482361, "step": 16050 }, { "completion_length": 411.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 84.03828125, "completions/mean_terminated_length": 84.03828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014529753641262147, "frac_reward_zero_std": 0.9625, "grad_norm": 7.001246929168701, "kl": 1.6966543391579763, "learning_rate": 4.2814285714285714e-07, "loss": 0.0017, "num_tokens": 1081774637.0, "reward": 0.415625, "reward_std": 0.031300367787480354, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.90918869972229, "step": 16055 }, { "completion_length": 316.4, "completions/clipped_ratio": 0.0, "completions/max_length": 316.4, "completions/max_terminated_length": 316.4, "completions/mean_length": 86.346875, "completions/mean_terminated_length": 86.346875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014534278634610407, "frac_reward_zero_std": 0.96875, "grad_norm": 15.431589126586914, "kl": 0.5893721795175224, "learning_rate": 4.281031746031746e-07, "loss": 0.0006, "num_tokens": 1082083201.0, "reward": 0.4421875, "reward_std": 0.029826052859425543, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.895898985862732, "step": 16060 }, { "completion_length": 434.4, "completions/clipped_ratio": 0.0, "completions/max_length": 434.4, "completions/max_terminated_length": 434.4, "completions/mean_length": 84.74453125, "completions/mean_terminated_length": 84.74453125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014538803627958667, "frac_reward_zero_std": 0.9625, "grad_norm": 9.255831718444824, "kl": 0.7332677419995889, "learning_rate": 4.2806349206349205e-07, "loss": 0.0007, "num_tokens": 1082386930.0, "reward": 0.4421875, "reward_std": 0.030617379769682884, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8923977613449097, "step": 16065 }, { "completion_length": 328.2, "completions/clipped_ratio": 0.0, "completions/max_length": 328.2, "completions/max_terminated_length": 328.2, "completions/mean_length": 95.50625, "completions/mean_terminated_length": 95.50625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014543328621306927, "frac_reward_zero_std": 0.95, "grad_norm": 14.349655151367188, "kl": 4.269787261122838, "learning_rate": 4.280238095238095e-07, "loss": 0.0043, "num_tokens": 1082711322.0, "reward": 0.3796875, "reward_std": 0.044451264664530754, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9149580836296082, "step": 16070 }, { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 90.5828125, "completions/mean_terminated_length": 90.5828125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014547853614655186, "frac_reward_zero_std": 0.98125, "grad_norm": 0.013107843697071075, "kl": 0.8983581788372248, "learning_rate": 4.2798412698412696e-07, "loss": 0.0009, "num_tokens": 1083027092.0, "reward": 0.4109375, "reward_std": 0.01893727108836174, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.9002862095832824, "step": 16075 }, { "completion_length": 341.6, "completions/clipped_ratio": 0.0, "completions/max_length": 341.6, "completions/max_terminated_length": 341.6, "completions/mean_length": 89.090625, "completions/mean_terminated_length": 89.090625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014552378608003446, "frac_reward_zero_std": 0.95625, "grad_norm": 9.648019790649414, "kl": 3.5801389243453743, "learning_rate": 4.2794444444444447e-07, "loss": 0.0036, "num_tokens": 1083339568.0, "reward": 0.2640625, "reward_std": 0.03661494143307209, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9619562506675721, "step": 16080 }, { "completion_length": 314.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 94.409375, "completions/mean_terminated_length": 94.409375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014556903601351706, "frac_reward_zero_std": 0.94375, "grad_norm": 11.819607734680176, "kl": 3.7827525817556307, "learning_rate": 4.2790476190476187e-07, "loss": 0.0038, "num_tokens": 1083660012.0, "reward": 0.35, "reward_std": 0.045241609960794446, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9205087184906006, "step": 16085 }, { "completion_length": 472.4, "completions/clipped_ratio": 0.003125, "completions/max_length": 472.4, "completions/max_terminated_length": 347.8, "completions/mean_length": 87.80390625, "completions/mean_terminated_length": 85.667041015625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014561428594699966, "frac_reward_zero_std": 0.9375, "grad_norm": 15.220780372619629, "kl": 12.363510284409859, "learning_rate": 4.278650793650794e-07, "loss": 0.0124, "num_tokens": 1083969033.0, "reward": 0.3453125, "reward_std": 0.054444891214370725, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9164073228836059, "step": 16090 }, { "completion_length": 410.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 410.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 90.19921875, "completions/mean_terminated_length": 89.67183074951171, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014565953588048226, "frac_reward_zero_std": 0.95, "grad_norm": 18.842283248901367, "kl": 4.364011000399477, "learning_rate": 4.2782539682539683e-07, "loss": 0.0044, "num_tokens": 1084282360.0, "reward": 0.309375, "reward_std": 0.04445224441587925, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.939832067489624, "step": 16095 }, { "completion_length": 429.4, "completions/clipped_ratio": 0.0, "completions/max_length": 429.4, "completions/max_terminated_length": 429.4, "completions/mean_length": 94.8625, "completions/mean_terminated_length": 94.8625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014570478581396486, "frac_reward_zero_std": 0.95625, "grad_norm": 17.97134017944336, "kl": 5.246978793223389, "learning_rate": 4.2778571428571423e-07, "loss": 0.0052, "num_tokens": 1084603392.0, "reward": 0.4171875, "reward_std": 0.037085762992501256, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8864774107933044, "step": 16100 }, { "completion_length": 302.2, "completions/clipped_ratio": 0.0, "completions/max_length": 302.2, "completions/max_terminated_length": 302.2, "completions/mean_length": 86.82421875, "completions/mean_terminated_length": 86.82421875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014575003574744744, "frac_reward_zero_std": 0.95625, "grad_norm": 0.001985939685255289, "kl": 1.3174367419444024, "learning_rate": 4.2774603174603174e-07, "loss": 0.0013, "num_tokens": 1084912431.0, "reward": 0.3171875, "reward_std": 0.037769732996821404, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9300567030906677, "step": 16105 }, { "completion_length": 534.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 534.4, "completions/max_terminated_length": 506.8, "completions/mean_length": 88.26328125, "completions/mean_terminated_length": 87.74174194335937, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014579528568093005, "frac_reward_zero_std": 0.9625, "grad_norm": 5.6443986892700195, "kl": 0.24856023533502594, "learning_rate": 4.277063492063492e-07, "loss": 0.0002, "num_tokens": 1085222736.0, "reward": 0.365625, "reward_std": 0.03014557547867298, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9008556842803955, "step": 16110 }, { "completion_length": 392.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 392.8, "completions/max_terminated_length": 297.2, "completions/mean_length": 87.5796875, "completions/mean_terminated_length": 87.04750061035156, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014584053561441265, "frac_reward_zero_std": 0.975, "grad_norm": 0.6160318851470947, "kl": 1.1827134321210906, "learning_rate": 4.2766666666666664e-07, "loss": 0.0012, "num_tokens": 1085533558.0, "reward": 0.31875, "reward_std": 0.023356688395142555, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9150004863739014, "step": 16115 }, { "completion_length": 429.4, "completions/clipped_ratio": 0.0, "completions/max_length": 429.4, "completions/max_terminated_length": 429.4, "completions/mean_length": 90.02109375, "completions/mean_terminated_length": 90.02109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014588578554789525, "frac_reward_zero_std": 0.95625, "grad_norm": 6.044546127319336, "kl": 1.0055213416926563, "learning_rate": 4.276269841269841e-07, "loss": 0.001, "num_tokens": 1085847289.0, "reward": 0.3015625, "reward_std": 0.0400328267365694, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9504785060882568, "step": 16120 }, { "completion_length": 449.2, "completions/clipped_ratio": 0.0, "completions/max_length": 449.2, "completions/max_terminated_length": 449.2, "completions/mean_length": 89.38984375, "completions/mean_terminated_length": 89.38984375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.014593103548137785, "frac_reward_zero_std": 0.9625, "grad_norm": 25.677947998046875, "kl": 0.36923299164045603, "learning_rate": 4.2758730158730155e-07, "loss": 0.0004, "num_tokens": 1086158132.0, "reward": 0.4734375, "reward_std": 0.035612428188323976, "rewards/verify_chess_move/mean": 0.4734375, "rewards/verify_chess_move/std": 0.8786509513854981, "step": 16125 }, { "completion_length": 497.4, "completions/clipped_ratio": 0.0, "completions/max_length": 497.4, "completions/max_terminated_length": 497.4, "completions/mean_length": 93.54453125, "completions/mean_terminated_length": 93.54453125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014597628541486043, "frac_reward_zero_std": 0.9375, "grad_norm": 6.7060322761535645, "kl": 1.537552338419482, "learning_rate": 4.2754761904761906e-07, "loss": 0.0015, "num_tokens": 1086477357.0, "reward": 0.2671875, "reward_std": 0.05281927585601807, "rewards/verify_chess_move/mean": 0.2671875, "rewards/verify_chess_move/std": 0.9521496534347534, "step": 16130 }, { "completion_length": 426.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 426.8, "completions/max_terminated_length": 326.2, "completions/mean_length": 90.553125, "completions/mean_terminated_length": 90.02391662597657, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014602153534834303, "frac_reward_zero_std": 0.95, "grad_norm": 24.21625328063965, "kl": 0.30002844538539647, "learning_rate": 4.2750793650793646e-07, "loss": 0.0003, "num_tokens": 1086792529.0, "reward": 0.2765625, "reward_std": 0.04124652519822121, "rewards/verify_chess_move/mean": 0.2765625, "rewards/verify_chess_move/std": 0.9605720400810241, "step": 16135 }, { "completion_length": 334.4, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/max_terminated_length": 334.4, "completions/mean_length": 91.10703125, "completions/mean_terminated_length": 91.10703125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014606678528182563, "frac_reward_zero_std": 0.9625, "grad_norm": 0.002573170233517885, "kl": 0.5215984507929534, "learning_rate": 4.274682539682539e-07, "loss": 0.0005, "num_tokens": 1087108474.0, "reward": 0.3953125, "reward_std": 0.03198335766792297, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.914298152923584, "step": 16140 }, { "completion_length": 384.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 91.82265625, "completions/mean_terminated_length": 91.82265625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014611203521530823, "frac_reward_zero_std": 0.96875, "grad_norm": 5.0820770263671875, "kl": 0.15826049505267292, "learning_rate": 4.274285714285714e-07, "loss": 0.0002, "num_tokens": 1087424255.0, "reward": 0.396875, "reward_std": 0.026409146934747697, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9134395003318787, "step": 16145 }, { "completion_length": 376.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 376.0, "completions/max_terminated_length": 304.8, "completions/mean_length": 97.7328125, "completions/mean_terminated_length": 97.21900634765625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014615728514879084, "frac_reward_zero_std": 0.95625, "grad_norm": 7.174679756164551, "kl": 0.18377889012917875, "learning_rate": 4.273888888888889e-07, "loss": 0.0002, "num_tokens": 1087748721.0, "reward": 0.4375, "reward_std": 0.03913669139146805, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8992234468460083, "step": 16150 }, { "completion_length": 368.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 368.2, "completions/max_terminated_length": 364.2, "completions/mean_length": 83.51171875, "completions/mean_terminated_length": 82.44958801269532, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014620253508227344, "frac_reward_zero_std": 0.98125, "grad_norm": 0.038626592606306076, "kl": 0.19586197310127318, "learning_rate": 4.2734920634920633e-07, "loss": 0.0002, "num_tokens": 1088051336.0, "reward": 0.3171875, "reward_std": 0.015992168709635733, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9438372135162354, "step": 16155 }, { "completion_length": 449.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 449.6, "completions/max_terminated_length": 372.0, "completions/mean_length": 93.00859375, "completions/mean_terminated_length": 92.4790267944336, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014624778501575602, "frac_reward_zero_std": 0.95, "grad_norm": 4.404189109802246, "kl": 0.7262747468892485, "learning_rate": 4.273095238095238e-07, "loss": 0.0007, "num_tokens": 1088369995.0, "reward": 0.3765625, "reward_std": 0.04082219153642654, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9163041472434997, "step": 16160 }, { "completion_length": 570.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 570.0, "completions/max_terminated_length": 518.6, "completions/mean_length": 90.528125, "completions/mean_terminated_length": 90.00813598632813, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014629303494923862, "frac_reward_zero_std": 0.9625, "grad_norm": 9.418479919433594, "kl": 1.0817360549001023, "learning_rate": 4.2726984126984124e-07, "loss": 0.0011, "num_tokens": 1088681815.0, "reward": 0.3796875, "reward_std": 0.03287851177155972, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9223140001296997, "step": 16165 }, { "completion_length": 584.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 584.6, "completions/max_terminated_length": 580.2, "completions/mean_length": 97.86796875, "completions/mean_terminated_length": 97.35667114257812, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014633828488272122, "frac_reward_zero_std": 0.925, "grad_norm": 25.97823715209961, "kl": 3.311878288211301, "learning_rate": 4.2723015873015874e-07, "loss": 0.0033, "num_tokens": 1089007094.0, "reward": 0.45625, "reward_std": 0.06780893132090568, "rewards/verify_chess_move/mean": 0.45625, "rewards/verify_chess_move/std": 0.8626707077026368, "step": 16170 }, { "completion_length": 368.6, "completions/clipped_ratio": 0.0, "completions/max_length": 368.6, "completions/max_terminated_length": 368.6, "completions/mean_length": 90.19375, "completions/mean_terminated_length": 90.19375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014638353481620382, "frac_reward_zero_std": 0.9625, "grad_norm": 0.01322547160089016, "kl": 4.300937909353524, "learning_rate": 4.2719047619047615e-07, "loss": 0.0043, "num_tokens": 1089320814.0, "reward": 0.3125, "reward_std": 0.032195523381233215, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9202508449554443, "step": 16175 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 91.0453125, "completions/mean_terminated_length": 91.0453125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014642878474968642, "frac_reward_zero_std": 0.96875, "grad_norm": 0.5520508885383606, "kl": 0.5372143701184541, "learning_rate": 4.2715079365079365e-07, "loss": 0.0005, "num_tokens": 1089637336.0, "reward": 0.39375, "reward_std": 0.02709311693906784, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.9054550290107727, "step": 16180 }, { "completion_length": 322.4, "completions/clipped_ratio": 0.0, "completions/max_length": 322.4, "completions/max_terminated_length": 322.4, "completions/mean_length": 94.54765625, "completions/mean_terminated_length": 94.54765625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.014647403468316902, "frac_reward_zero_std": 0.99375, "grad_norm": 0.41067951917648315, "kl": 0.4023169622058049, "learning_rate": 4.271111111111111e-07, "loss": 0.0004, "num_tokens": 1089959053.0, "reward": 0.428125, "reward_std": 0.005786375701427459, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.899273943901062, "step": 16185 }, { "completion_length": 383.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 88.88515625, "completions/mean_terminated_length": 88.88515625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01465192846166516, "frac_reward_zero_std": 0.9625, "grad_norm": 1.3501609563827515, "kl": 0.6981725603109226, "learning_rate": 4.270714285714285e-07, "loss": 0.0007, "num_tokens": 1090270938.0, "reward": 0.459375, "reward_std": 0.03130036816000938, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.879926347732544, "step": 16190 }, { "completion_length": 326.8, "completions/clipped_ratio": 0.0, "completions/max_length": 326.8, "completions/max_terminated_length": 326.8, "completions/mean_length": 88.26796875, "completions/mean_terminated_length": 88.26796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014656453455013421, "frac_reward_zero_std": 0.95, "grad_norm": 1.6313128471374512, "kl": 0.4722281150985509, "learning_rate": 4.27031746031746e-07, "loss": 0.0005, "num_tokens": 1090580953.0, "reward": 0.390625, "reward_std": 0.04218915067613125, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.914792263507843, "step": 16195 }, { "completion_length": 370.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 370.6, "completions/max_terminated_length": 355.6, "completions/mean_length": 91.3359375, "completions/mean_terminated_length": 90.81062927246094, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.014660978448361681, "frac_reward_zero_std": 0.9875, "grad_norm": 0.10492869466543198, "kl": 0.4849117882666178, "learning_rate": 4.2699206349206347e-07, "loss": 0.0005, "num_tokens": 1090897119.0, "reward": 0.325, "reward_std": 0.01293872892856598, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.940523910522461, "step": 16200 }, { "completion_length": 342.6, "completions/clipped_ratio": 0.0, "completions/max_length": 342.6, "completions/max_terminated_length": 342.6, "completions/mean_length": 98.63671875, "completions/mean_terminated_length": 98.63671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014665503441709941, "frac_reward_zero_std": 0.94375, "grad_norm": 1.7064766883850098, "kl": 0.44564696692395955, "learning_rate": 4.26952380952381e-07, "loss": 0.0004, "num_tokens": 1091223270.0, "reward": 0.3859375, "reward_std": 0.04455862008035183, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9193918943405152, "step": 16205 }, { "completion_length": 349.2, "completions/clipped_ratio": 0.0, "completions/max_length": 349.2, "completions/max_terminated_length": 349.2, "completions/mean_length": 91.08359375, "completions/mean_terminated_length": 91.08359375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014670028435058201, "frac_reward_zero_std": 0.96875, "grad_norm": 16.212507247924805, "kl": 0.6979252829682082, "learning_rate": 4.269126984126984e-07, "loss": 0.0007, "num_tokens": 1091539385.0, "reward": 0.4296875, "reward_std": 0.02688095048069954, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8935702443122864, "step": 16210 }, { "completion_length": 419.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 419.6, "completions/max_terminated_length": 348.0, "completions/mean_length": 86.990625, "completions/mean_terminated_length": 86.46195526123047, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.01467455342840646, "frac_reward_zero_std": 0.95625, "grad_norm": 1.3737637996673584, "kl": 0.4430746323429048, "learning_rate": 4.2687301587301583e-07, "loss": 0.0004, "num_tokens": 1091849125.0, "reward": 0.25, "reward_std": 0.03708674423396587, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9367452263832092, "step": 16215 }, { "completion_length": 384.4, "completions/clipped_ratio": 0.0, "completions/max_length": 384.4, "completions/max_terminated_length": 384.4, "completions/mean_length": 89.16953125, "completions/mean_terminated_length": 89.16953125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01467907842175472, "frac_reward_zero_std": 0.95625, "grad_norm": 4.929984092712402, "kl": 1.9496728412341326, "learning_rate": 4.2683333333333334e-07, "loss": 0.0019, "num_tokens": 1092160550.0, "reward": 0.4296875, "reward_std": 0.037769732624292375, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8736903071403503, "step": 16220 }, { "completion_length": 431.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 431.8, "completions/max_terminated_length": 406.0, "completions/mean_length": 97.29765625, "completions/mean_terminated_length": 96.78013305664062, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01468360341510298, "frac_reward_zero_std": 0.95625, "grad_norm": 0.5096997022628784, "kl": 1.0736141381552442, "learning_rate": 4.2679365079365074e-07, "loss": 0.0011, "num_tokens": 1092485427.0, "reward": 0.5046875, "reward_std": 0.033669838309288026, "rewards/verify_chess_move/mean": 0.5046875, "rewards/verify_chess_move/std": 0.8615496993064881, "step": 16225 }, { "completion_length": 428.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 428.0, "completions/max_terminated_length": 332.6, "completions/mean_length": 88.51640625, "completions/mean_terminated_length": 87.98545684814454, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01468812840845124, "frac_reward_zero_std": 0.9625, "grad_norm": 12.01846981048584, "kl": 0.49602406448684633, "learning_rate": 4.2675396825396825e-07, "loss": 0.0005, "num_tokens": 1092796504.0, "reward": 0.4125, "reward_std": 0.033350315690040586, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.9115035891532898, "step": 16230 }, { "completion_length": 302.8, "completions/clipped_ratio": 0.0, "completions/max_length": 302.8, "completions/max_terminated_length": 302.8, "completions/mean_length": 90.70703125, "completions/mean_terminated_length": 90.70703125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0146926534017995, "frac_reward_zero_std": 0.98125, "grad_norm": 0.06541117280721664, "kl": 1.0032110310625284, "learning_rate": 4.267142857142857e-07, "loss": 0.001, "num_tokens": 1093111345.0, "reward": 0.328125, "reward_std": 0.017570312321186065, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9346646070480347, "step": 16235 }, { "completion_length": 303.2, "completions/clipped_ratio": 0.0, "completions/max_length": 303.2, "completions/max_terminated_length": 303.2, "completions/mean_length": 89.5140625, "completions/mean_terminated_length": 89.5140625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01469717839514776, "frac_reward_zero_std": 0.975, "grad_norm": 12.72575855255127, "kl": 0.7829838742036372, "learning_rate": 4.2667460317460315e-07, "loss": 0.0008, "num_tokens": 1093422219.0, "reward": 0.475, "reward_std": 0.02130674123764038, "rewards/verify_chess_move/mean": 0.475, "rewards/verify_chess_move/std": 0.8537684202194213, "step": 16240 }, { "completion_length": 373.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 373.6, "completions/max_terminated_length": 349.4, "completions/mean_length": 89.8578125, "completions/mean_terminated_length": 89.34446868896484, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014701703388496018, "frac_reward_zero_std": 0.94375, "grad_norm": 5.362157344818115, "kl": 4.522302593663335, "learning_rate": 4.266349206349206e-07, "loss": 0.0045, "num_tokens": 1093735197.0, "reward": 0.2640625, "reward_std": 0.05023763999342919, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9532466053962707, "step": 16245 }, { "completion_length": 284.2, "completions/clipped_ratio": 0.0, "completions/max_length": 284.2, "completions/max_terminated_length": 284.2, "completions/mean_length": 86.2125, "completions/mean_terminated_length": 86.2125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014706228381844278, "frac_reward_zero_std": 0.94375, "grad_norm": 14.157174110412598, "kl": 3.5076717451680453, "learning_rate": 4.2659523809523806e-07, "loss": 0.0035, "num_tokens": 1094042141.0, "reward": 0.38125, "reward_std": 0.051344961300492285, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9217184782028198, "step": 16250 }, { "completion_length": 497.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 497.4, "completions/max_terminated_length": 469.2, "completions/mean_length": 95.72109375, "completions/mean_terminated_length": 94.663232421875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014710753375192539, "frac_reward_zero_std": 0.9625, "grad_norm": 20.740299224853516, "kl": 1.119574000686407, "learning_rate": 4.2655555555555557e-07, "loss": 0.0011, "num_tokens": 1094364520.0, "reward": 0.3546875, "reward_std": 0.030617379397153855, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9264343619346619, "step": 16255 }, { "completion_length": 415.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 415.6, "completions/max_terminated_length": 391.2, "completions/mean_length": 94.615625, "completions/mean_terminated_length": 93.56309051513672, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.014715278368540799, "frac_reward_zero_std": 0.95625, "grad_norm": 1.3079564571380615, "kl": 2.0153918097377757, "learning_rate": 4.26515873015873e-07, "loss": 0.002, "num_tokens": 1094684860.0, "reward": 0.3875, "reward_std": 0.03524798229336738, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9207677841186523, "step": 16260 }, { "completion_length": 293.2, "completions/clipped_ratio": 0.0, "completions/max_length": 293.2, "completions/max_terminated_length": 293.2, "completions/mean_length": 90.7578125, "completions/mean_terminated_length": 90.7578125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014719803361889059, "frac_reward_zero_std": 0.95625, "grad_norm": 1.0859936475753784, "kl": 1.329010714427568, "learning_rate": 4.264761904761904e-07, "loss": 0.0013, "num_tokens": 1094999702.0, "reward": 0.4453125, "reward_std": 0.03456499315798282, "rewards/verify_chess_move/mean": 0.4453125, "rewards/verify_chess_move/std": 0.886792254447937, "step": 16265 }, { "completion_length": 418.2, "completions/clipped_ratio": 0.0, "completions/max_length": 418.2, "completions/max_terminated_length": 418.2, "completions/mean_length": 90.3953125, "completions/mean_terminated_length": 90.3953125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014724328355237317, "frac_reward_zero_std": 0.95625, "grad_norm": 1.9458109140396118, "kl": 1.6498599750222638, "learning_rate": 4.2643650793650793e-07, "loss": 0.0016, "num_tokens": 1095312968.0, "reward": 0.3453125, "reward_std": 0.03934885822236538, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9389959812164307, "step": 16270 }, { "completion_length": 334.8, "completions/clipped_ratio": 0.0, "completions/max_length": 334.8, "completions/max_terminated_length": 334.8, "completions/mean_length": 91.06875, "completions/mean_terminated_length": 91.06875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014728853348585577, "frac_reward_zero_std": 0.99375, "grad_norm": 1.8344788551330566, "kl": 0.8237109483452514, "learning_rate": 4.263968253968254e-07, "loss": 0.0008, "num_tokens": 1095627936.0, "reward": 0.3859375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.8957189679145813, "step": 16275 }, { "completion_length": 412.2, "completions/clipped_ratio": 0.0, "completions/max_length": 412.2, "completions/max_terminated_length": 412.2, "completions/mean_length": 89.934375, "completions/mean_terminated_length": 89.934375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014733378341933837, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0018022347940132022, "kl": 0.42199460363481195, "learning_rate": 4.2635714285714284e-07, "loss": 0.0004, "num_tokens": 1095939708.0, "reward": 0.3703125, "reward_std": 0.01225574016571045, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9130850553512573, "step": 16280 }, { "completion_length": 599.6, "completions/clipped_ratio": 0.003125, "completions/max_length": 599.6, "completions/max_terminated_length": 506.0, "completions/mean_length": 92.10078125, "completions/mean_terminated_length": 89.99839324951172, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014737903335282097, "frac_reward_zero_std": 0.975, "grad_norm": 5.701510906219482, "kl": 1.1436220634030179, "learning_rate": 4.263174603174603e-07, "loss": 0.0011, "num_tokens": 1096253037.0, "reward": 0.40625, "reward_std": 0.019727616384625436, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9084602236747742, "step": 16285 }, { "completion_length": 404.8, "completions/clipped_ratio": 0.0, "completions/max_length": 404.8, "completions/max_terminated_length": 404.8, "completions/mean_length": 89.26328125, "completions/mean_terminated_length": 89.26328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014742428328630357, "frac_reward_zero_std": 0.96875, "grad_norm": 0.6721853017807007, "kl": 0.1379148870939389, "learning_rate": 4.2627777777777775e-07, "loss": 0.0001, "num_tokens": 1096564694.0, "reward": 0.328125, "reward_std": 0.027563939243555068, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9118321537971497, "step": 16290 }, { "completion_length": 381.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 381.2, "completions/max_terminated_length": 317.6, "completions/mean_length": 88.63046875, "completions/mean_terminated_length": 88.09031677246094, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014746953321978618, "frac_reward_zero_std": 0.9625, "grad_norm": 2.4805080890655518, "kl": 0.47111634676111863, "learning_rate": 4.2623809523809525e-07, "loss": 0.0005, "num_tokens": 1096876773.0, "reward": 0.3328125, "reward_std": 0.029933410137891768, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9368561863899231, "step": 16295 }, { "completion_length": 296.6, "completions/clipped_ratio": 0.0, "completions/max_length": 296.6, "completions/max_terminated_length": 296.6, "completions/mean_length": 87.8, "completions/mean_terminated_length": 87.8, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014751478315326876, "frac_reward_zero_std": 0.9625, "grad_norm": 6.282450199127197, "kl": 0.4704572624061257, "learning_rate": 4.2619841269841265e-07, "loss": 0.0005, "num_tokens": 1097185653.0, "reward": 0.415625, "reward_std": 0.03377464860677719, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.891085410118103, "step": 16300 }, { "completion_length": 423.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 423.0, "completions/max_terminated_length": 388.8, "completions/mean_length": 93.1078125, "completions/mean_terminated_length": 92.5797103881836, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014756003308675136, "frac_reward_zero_std": 0.975, "grad_norm": 0.5199241042137146, "kl": 0.18493179861688985, "learning_rate": 4.2615873015873016e-07, "loss": 0.0002, "num_tokens": 1097504399.0, "reward": 0.4515625, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8629519701004028, "step": 16305 }, { "completion_length": 323.6, "completions/clipped_ratio": 0.0, "completions/max_length": 323.6, "completions/max_terminated_length": 323.6, "completions/mean_length": 88.05234375, "completions/mean_terminated_length": 88.05234375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014760528302023396, "frac_reward_zero_std": 0.99375, "grad_norm": 0.043160587549209595, "kl": 0.19265499289613217, "learning_rate": 4.261190476190476e-07, "loss": 0.0002, "num_tokens": 1097814578.0, "reward": 0.3703125, "reward_std": 0.00646936446428299, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9097554326057434, "step": 16310 }, { "completion_length": 420.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 420.8, "completions/max_terminated_length": 346.2, "completions/mean_length": 96.30625, "completions/mean_terminated_length": 95.78519439697266, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014765053295371656, "frac_reward_zero_std": 0.98125, "grad_norm": 0.11460530012845993, "kl": 0.21238363523734732, "learning_rate": 4.26079365079365e-07, "loss": 0.0002, "num_tokens": 1098138106.0, "reward": 0.278125, "reward_std": 0.017570312693715097, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9548294305801391, "step": 16315 }, { "completion_length": 570.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 570.6, "completions/max_terminated_length": 415.8, "completions/mean_length": 91.79375, "completions/mean_terminated_length": 90.20180206298828, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.014769578288719916, "frac_reward_zero_std": 0.94375, "grad_norm": 10.416481971740723, "kl": 0.4430376647505909, "learning_rate": 4.260396825396825e-07, "loss": 0.0004, "num_tokens": 1098454930.0, "reward": 0.328125, "reward_std": 0.050920628011226654, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9315691709518432, "step": 16320 }, { "completion_length": 299.8, "completions/clipped_ratio": 0.0, "completions/max_length": 299.8, "completions/max_terminated_length": 299.8, "completions/mean_length": 96.096875, "completions/mean_terminated_length": 96.096875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014774103282068175, "frac_reward_zero_std": 0.96875, "grad_norm": 9.65577220916748, "kl": 0.9605100037064404, "learning_rate": 4.26e-07, "loss": 0.001, "num_tokens": 1098778630.0, "reward": 0.375, "reward_std": 0.026409146934747697, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9152550816535949, "step": 16325 }, { "completion_length": 386.2, "completions/clipped_ratio": 0.0, "completions/max_length": 386.2, "completions/max_terminated_length": 386.2, "completions/mean_length": 95.61796875, "completions/mean_terminated_length": 95.61796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014778628275416435, "frac_reward_zero_std": 0.95625, "grad_norm": 3.0290894508361816, "kl": 1.3419794665882363, "learning_rate": 4.259603174603175e-07, "loss": 0.0013, "num_tokens": 1099100773.0, "reward": 0.346875, "reward_std": 0.038452721387147906, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9336893439292908, "step": 16330 }, { "completion_length": 528.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 528.0, "completions/max_terminated_length": 470.4, "completions/mean_length": 93.0453125, "completions/mean_terminated_length": 91.99484252929688, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014783153268764695, "frac_reward_zero_std": 0.9375, "grad_norm": 3.5651135444641113, "kl": 0.3495593072846532, "learning_rate": 4.259206349206349e-07, "loss": 0.0003, "num_tokens": 1099415679.0, "reward": 0.5453125, "reward_std": 0.050344996899366376, "rewards/verify_chess_move/mean": 0.5453125, "rewards/verify_chess_move/std": 0.8126548528671265, "step": 16335 }, { "completion_length": 392.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 392.4, "completions/max_terminated_length": 333.6, "completions/mean_length": 86.6359375, "completions/mean_terminated_length": 86.10236968994141, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014787678262112955, "frac_reward_zero_std": 0.95625, "grad_norm": 10.861044883728027, "kl": 1.814911506487988, "learning_rate": 4.2588095238095234e-07, "loss": 0.0018, "num_tokens": 1099723293.0, "reward": 0.34375, "reward_std": 0.041870607435703276, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9261577010154725, "step": 16340 }, { "completion_length": 355.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 355.0, "completions/max_terminated_length": 314.2, "completions/mean_length": 78.471875, "completions/mean_terminated_length": 77.93849639892578, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014792203255461215, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0018383768619969487, "kl": 1.845253039582167, "learning_rate": 4.2584126984126985e-07, "loss": 0.0018, "num_tokens": 1100018073.0, "reward": 0.4203125, "reward_std": 0.02993340976536274, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8959639191627502, "step": 16345 }, { "completion_length": 476.6, "completions/clipped_ratio": 0.0, "completions/max_length": 476.6, "completions/max_terminated_length": 476.6, "completions/mean_length": 91.071875, "completions/mean_terminated_length": 91.071875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014796728248809475, "frac_reward_zero_std": 0.975, "grad_norm": 0.056105099618434906, "kl": 1.884051431773696, "learning_rate": 4.2580158730158725e-07, "loss": 0.0019, "num_tokens": 1100335541.0, "reward": 0.309375, "reward_std": 0.023356688022613526, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9371774435043335, "step": 16350 }, { "completion_length": 455.8, "completions/clipped_ratio": 0.003125, "completions/max_length": 455.8, "completions/max_terminated_length": 441.4, "completions/mean_length": 92.35625, "completions/mean_terminated_length": 90.24650421142579, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014801253242157733, "frac_reward_zero_std": 0.96875, "grad_norm": 0.043606072664260864, "kl": 0.34890169820282607, "learning_rate": 4.2576190476190475e-07, "loss": 0.0003, "num_tokens": 1100652629.0, "reward": 0.403125, "reward_std": 0.0284590944647789, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.8998024821281433, "step": 16355 }, { "completion_length": 395.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 395.2, "completions/max_terminated_length": 307.2, "completions/mean_length": 88.3296875, "completions/mean_terminated_length": 87.80132751464843, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014805778235505994, "frac_reward_zero_std": 0.95625, "grad_norm": 11.916292190551758, "kl": 0.8976569347432815, "learning_rate": 4.257222222222222e-07, "loss": 0.0009, "num_tokens": 1100962835.0, "reward": 0.5265625, "reward_std": 0.03640375584363938, "rewards/verify_chess_move/mean": 0.5265625, "rewards/verify_chess_move/std": 0.8479023337364197, "step": 16360 }, { "completion_length": 446.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 446.2, "completions/max_terminated_length": 393.4, "completions/mean_length": 89.71171875, "completions/mean_terminated_length": 88.64938507080078, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014810303228854254, "frac_reward_zero_std": 0.95625, "grad_norm": 5.563848495483398, "kl": 3.962623954098672, "learning_rate": 4.2568253968253966e-07, "loss": 0.004, "num_tokens": 1101275066.0, "reward": 0.5078125, "reward_std": 0.03546014800667763, "rewards/verify_chess_move/mean": 0.5078125, "rewards/verify_chess_move/std": 0.8605921149253846, "step": 16365 }, { "completion_length": 461.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 461.6, "completions/max_terminated_length": 372.6, "completions/mean_length": 86.2265625, "completions/mean_terminated_length": 85.68512268066407, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014814828222202514, "frac_reward_zero_std": 0.975, "grad_norm": 0.6576524376869202, "kl": 0.7298268190701492, "learning_rate": 4.256428571428571e-07, "loss": 0.0007, "num_tokens": 1101582292.0, "reward": 0.35, "reward_std": 0.01767766922712326, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9346193552017212, "step": 16370 }, { "completion_length": 380.8, "completions/clipped_ratio": 0.0, "completions/max_length": 380.8, "completions/max_terminated_length": 380.8, "completions/mean_length": 85.90546875, "completions/mean_terminated_length": 85.90546875, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.014819353215550774, "frac_reward_zero_std": 0.9625, "grad_norm": 13.921040534973145, "kl": 2.1339247495867313, "learning_rate": 4.2560317460317457e-07, "loss": 0.0021, "num_tokens": 1101888267.0, "reward": 0.4671875, "reward_std": 0.03198335766792297, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.8719544768333435, "step": 16375 }, { "completion_length": 447.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.6, "completions/max_terminated_length": 405.0, "completions/mean_length": 93.975, "completions/mean_terminated_length": 93.44805297851562, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014823878208899032, "frac_reward_zero_std": 0.9625, "grad_norm": 0.976107656955719, "kl": 3.119312308798544, "learning_rate": 4.255634920634921e-07, "loss": 0.0031, "num_tokens": 1102208883.0, "reward": 0.3109375, "reward_std": 0.03471727333962917, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.934138011932373, "step": 16380 }, { "completion_length": 290.2, "completions/clipped_ratio": 0.0, "completions/max_length": 290.2, "completions/max_terminated_length": 290.2, "completions/mean_length": 84.43125, "completions/mean_terminated_length": 84.43125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014828403202247292, "frac_reward_zero_std": 0.95625, "grad_norm": 3.653608560562134, "kl": 3.261969312187284, "learning_rate": 4.2552380952380953e-07, "loss": 0.0033, "num_tokens": 1102512747.0, "reward": 0.35625, "reward_std": 0.038452721387147906, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9290890216827392, "step": 16385 }, { "completion_length": 628.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 628.0, "completions/max_terminated_length": 620.6, "completions/mean_length": 93.2234375, "completions/mean_terminated_length": 92.68663940429687, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014832928195595552, "frac_reward_zero_std": 0.94375, "grad_norm": 2.312955379486084, "kl": 1.6848330603796058, "learning_rate": 4.2548412698412693e-07, "loss": 0.0017, "num_tokens": 1102830897.0, "reward": 0.3296875, "reward_std": 0.053653563186526296, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9346559524536133, "step": 16390 }, { "completion_length": 306.2, "completions/clipped_ratio": 0.0, "completions/max_length": 306.2, "completions/max_terminated_length": 306.2, "completions/mean_length": 91.66796875, "completions/mean_terminated_length": 91.66796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014837453188943812, "frac_reward_zero_std": 0.98125, "grad_norm": 4.8148603439331055, "kl": 0.24960989921819418, "learning_rate": 4.2544444444444444e-07, "loss": 0.0002, "num_tokens": 1103147024.0, "reward": 0.428125, "reward_std": 0.01462521068751812, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8766329169273377, "step": 16395 }, { "completion_length": 496.4, "completions/clipped_ratio": 0.0, "completions/max_length": 496.4, "completions/max_terminated_length": 496.4, "completions/mean_length": 96.7328125, "completions/mean_terminated_length": 96.7328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014841978182292073, "frac_reward_zero_std": 0.95625, "grad_norm": 11.309985160827637, "kl": 0.5345034091966226, "learning_rate": 4.254047619047619e-07, "loss": 0.0005, "num_tokens": 1103473346.0, "reward": 0.2875, "reward_std": 0.04003184624016285, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9548773288726806, "step": 16400 }, { "completion_length": 381.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 381.0, "completions/max_terminated_length": 302.8, "completions/mean_length": 87.1796875, "completions/mean_terminated_length": 86.65919494628906, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014846503175640333, "frac_reward_zero_std": 0.94375, "grad_norm": 3.2341322898864746, "kl": 0.9977056697942317, "learning_rate": 4.2536507936507935e-07, "loss": 0.001, "num_tokens": 1103783344.0, "reward": 0.3484375, "reward_std": 0.04340382777154446, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9114702343940735, "step": 16405 }, { "completion_length": 301.6, "completions/clipped_ratio": 0.0, "completions/max_length": 301.6, "completions/max_terminated_length": 301.6, "completions/mean_length": 93.36875, "completions/mean_terminated_length": 93.36875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014851028168988591, "frac_reward_zero_std": 0.96875, "grad_norm": 14.010419845581055, "kl": 1.509229054208845, "learning_rate": 4.253253968253968e-07, "loss": 0.0015, "num_tokens": 1104103424.0, "reward": 0.315625, "reward_std": 0.0275639396160841, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9484797954559326, "step": 16410 }, { "completion_length": 379.2, "completions/clipped_ratio": 0.0, "completions/max_length": 379.2, "completions/max_terminated_length": 379.2, "completions/mean_length": 84.54921875, "completions/mean_terminated_length": 84.54921875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014855553162336851, "frac_reward_zero_std": 0.9625, "grad_norm": 3.7822744846343994, "kl": 0.6582862574607133, "learning_rate": 4.2528571428571425e-07, "loss": 0.0007, "num_tokens": 1104409023.0, "reward": 0.45, "reward_std": 0.030145575851202012, "rewards/verify_chess_move/mean": 0.45, "rewards/verify_chess_move/std": 0.8919515252113343, "step": 16415 }, { "completion_length": 467.8, "completions/clipped_ratio": 0.0, "completions/max_length": 467.8, "completions/max_terminated_length": 467.8, "completions/mean_length": 86.3859375, "completions/mean_terminated_length": 86.3859375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014860078155685111, "frac_reward_zero_std": 0.93125, "grad_norm": 23.18781089782715, "kl": 1.692007945640944, "learning_rate": 4.2524603174603176e-07, "loss": 0.0017, "num_tokens": 1104715653.0, "reward": 0.4671875, "reward_std": 0.057921680808067325, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.8603696584701538, "step": 16420 }, { "completion_length": 435.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 435.2, "completions/max_terminated_length": 414.4, "completions/mean_length": 93.903125, "completions/mean_terminated_length": 93.39429931640625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014864603149033371, "frac_reward_zero_std": 0.9375, "grad_norm": 12.381122589111328, "kl": 1.1549850849434733, "learning_rate": 4.2520634920634916e-07, "loss": 0.0012, "num_tokens": 1105034553.0, "reward": 0.3859375, "reward_std": 0.05991076231002808, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.912692379951477, "step": 16425 }, { "completion_length": 311.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 88.34453125, "completions/mean_terminated_length": 88.34453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014869128142381631, "frac_reward_zero_std": 0.96875, "grad_norm": 1.5282915830612183, "kl": 0.2360487164231017, "learning_rate": 4.2516666666666667e-07, "loss": 0.0002, "num_tokens": 1105345530.0, "reward": 0.384375, "reward_std": 0.027563939988613128, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9092799305915833, "step": 16430 }, { "completion_length": 392.8, "completions/clipped_ratio": 0.0, "completions/max_length": 392.8, "completions/max_terminated_length": 392.8, "completions/mean_length": 85.8609375, "completions/mean_terminated_length": 85.8609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01487365313572989, "frac_reward_zero_std": 0.95, "grad_norm": 0.002479361603036523, "kl": 2.682068053481635, "learning_rate": 4.251269841269841e-07, "loss": 0.0027, "num_tokens": 1105653256.0, "reward": 0.446875, "reward_std": 0.044663430377841, "rewards/verify_chess_move/mean": 0.446875, "rewards/verify_chess_move/std": 0.891146433353424, "step": 16435 }, { "completion_length": 353.4, "completions/clipped_ratio": 0.0, "completions/max_length": 353.4, "completions/max_terminated_length": 353.4, "completions/mean_length": 92.19296875, "completions/mean_terminated_length": 92.19296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01487817812907815, "frac_reward_zero_std": 0.9375, "grad_norm": 25.061494827270508, "kl": 0.4229226717259735, "learning_rate": 4.250873015873015e-07, "loss": 0.0004, "num_tokens": 1105969647.0, "reward": 0.38125, "reward_std": 0.053502263873815535, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9158148646354676, "step": 16440 }, { "completion_length": 445.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 445.2, "completions/max_terminated_length": 373.8, "completions/mean_length": 93.58203125, "completions/mean_terminated_length": 93.07220153808593, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01488270312242641, "frac_reward_zero_std": 0.95625, "grad_norm": 3.4335246086120605, "kl": 0.17364207457285374, "learning_rate": 4.2504761904761903e-07, "loss": 0.0002, "num_tokens": 1106289120.0, "reward": 0.321875, "reward_std": 0.03798189871013165, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.930206561088562, "step": 16445 }, { "completion_length": 316.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 86.75078125, "completions/mean_terminated_length": 86.75078125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01488722811577467, "frac_reward_zero_std": 0.975, "grad_norm": 13.7905855178833, "kl": 2.1159979575313628, "learning_rate": 4.250079365079365e-07, "loss": 0.0021, "num_tokens": 1106596513.0, "reward": 0.484375, "reward_std": 0.02041158638894558, "rewards/verify_chess_move/mean": 0.484375, "rewards/verify_chess_move/std": 0.868684446811676, "step": 16450 }, { "completion_length": 413.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 83.14296875, "completions/mean_terminated_length": 83.14296875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01489175310912293, "frac_reward_zero_std": 0.94375, "grad_norm": 0.0024079589638859034, "kl": 1.188149804796558, "learning_rate": 4.24968253968254e-07, "loss": 0.0012, "num_tokens": 1106896912.0, "reward": 0.434375, "reward_std": 0.0452416080981493, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.8954945921897888, "step": 16455 }, { "completion_length": 444.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 444.6, "completions/max_terminated_length": 387.0, "completions/mean_length": 90.10390625, "completions/mean_terminated_length": 89.04918823242187, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01489627810247119, "frac_reward_zero_std": 0.9625, "grad_norm": 0.055517613887786865, "kl": 1.017412179405801, "learning_rate": 4.249285714285714e-07, "loss": 0.001, "num_tokens": 1107210597.0, "reward": 0.353125, "reward_std": 0.034034284949302676, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9265239834785461, "step": 16460 }, { "completion_length": 366.8, "completions/clipped_ratio": 0.0, "completions/max_length": 366.8, "completions/max_terminated_length": 366.8, "completions/mean_length": 94.35703125, "completions/mean_terminated_length": 94.35703125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014900803095819449, "frac_reward_zero_std": 0.9625, "grad_norm": 4.477126121520996, "kl": 0.19477154477499425, "learning_rate": 4.2488888888888885e-07, "loss": 0.0002, "num_tokens": 1107532958.0, "reward": 0.48125, "reward_std": 0.028566450253129004, "rewards/verify_chess_move/mean": 0.48125, "rewards/verify_chess_move/std": 0.8410582542419434, "step": 16465 }, { "completion_length": 374.8, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 92.07734375, "completions/mean_terminated_length": 92.07734375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.014905328089167709, "frac_reward_zero_std": 0.975, "grad_norm": 0.01826825737953186, "kl": 0.669625430717133, "learning_rate": 4.2484920634920635e-07, "loss": 0.0007, "num_tokens": 1107851457.0, "reward": 0.325, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9289332628250122, "step": 16470 }, { "completion_length": 343.8, "completions/clipped_ratio": 0.0, "completions/max_length": 343.8, "completions/max_terminated_length": 343.8, "completions/mean_length": 86.6921875, "completions/mean_terminated_length": 86.6921875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014909853082515969, "frac_reward_zero_std": 0.95, "grad_norm": 2.1997601985931396, "kl": 0.40561113322619347, "learning_rate": 4.248095238095238e-07, "loss": 0.0004, "num_tokens": 1108158735.0, "reward": 0.3171875, "reward_std": 0.04329647086560726, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9357131361961365, "step": 16475 }, { "completion_length": 497.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 497.0, "completions/max_terminated_length": 445.4, "completions/mean_length": 99.1234375, "completions/mean_terminated_length": 98.60030517578124, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.014914378075864229, "frac_reward_zero_std": 0.96875, "grad_norm": 10.670717239379883, "kl": 0.906081607285887, "learning_rate": 4.2476984126984126e-07, "loss": 0.0009, "num_tokens": 1108486309.0, "reward": 0.5015625, "reward_std": 0.025726158171892166, "rewards/verify_chess_move/mean": 0.5015625, "rewards/verify_chess_move/std": 0.8468278527259827, "step": 16480 }, { "completion_length": 352.4, "completions/clipped_ratio": 0.0, "completions/max_length": 352.4, "completions/max_terminated_length": 352.4, "completions/mean_length": 92.31015625, "completions/mean_terminated_length": 92.31015625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.014918903069212489, "frac_reward_zero_std": 0.98125, "grad_norm": 3.9059648513793945, "kl": 0.16483265531715005, "learning_rate": 4.247301587301587e-07, "loss": 0.0002, "num_tokens": 1108804402.0, "reward": 0.3875, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9149532318115234, "step": 16485 }, { "completion_length": 608.6, "completions/clipped_ratio": 0.00390625, "completions/max_length": 608.6, "completions/max_terminated_length": 530.8, "completions/mean_length": 94.2875, "completions/mean_terminated_length": 91.641845703125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.014923428062560747, "frac_reward_zero_std": 0.925, "grad_norm": 27.807857513427734, "kl": 3.3151219985564238, "learning_rate": 4.2469047619047617e-07, "loss": 0.0033, "num_tokens": 1109123802.0, "reward": 0.353125, "reward_std": 0.06875057816505432, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9108528256416321, "step": 16490 }, { "completion_length": 351.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 86.16171875, "completions/mean_terminated_length": 86.16171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014927953055909007, "frac_reward_zero_std": 0.9875, "grad_norm": 0.1250009834766388, "kl": 0.4386288185138255, "learning_rate": 4.246507936507936e-07, "loss": 0.0004, "num_tokens": 1109429889.0, "reward": 0.4734375, "reward_std": 0.01315089464187622, "rewards/verify_chess_move/mean": 0.4734375, "rewards/verify_chess_move/std": 0.8499438166618347, "step": 16495 }, { "completion_length": 346.4, "completions/clipped_ratio": 0.0, "completions/max_length": 346.4, "completions/max_terminated_length": 346.4, "completions/mean_length": 83.59453125, "completions/mean_terminated_length": 83.59453125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014932478049257267, "frac_reward_zero_std": 0.95625, "grad_norm": 0.25807005167007446, "kl": 0.4173617573687807, "learning_rate": 4.246111111111111e-07, "loss": 0.0004, "num_tokens": 1109733402.0, "reward": 0.340625, "reward_std": 0.03640277497470379, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9326104044914245, "step": 16500 }, { "completion_length": 485.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.8, "completions/max_terminated_length": 479.8, "completions/mean_length": 90.82421875, "completions/mean_terminated_length": 90.31151123046875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014937003042605528, "frac_reward_zero_std": 0.96875, "grad_norm": 24.335424423217773, "kl": 0.6860353823401966, "learning_rate": 4.2457142857142853e-07, "loss": 0.0007, "num_tokens": 1110048465.0, "reward": 0.26875, "reward_std": 0.025513992086052893, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9598509907722473, "step": 16505 }, { "completion_length": 298.8, "completions/clipped_ratio": 0.0, "completions/max_length": 298.8, "completions/max_terminated_length": 298.8, "completions/mean_length": 94.34296875, "completions/mean_terminated_length": 94.34296875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014941528035953788, "frac_reward_zero_std": 0.9625, "grad_norm": 19.978116989135742, "kl": 1.0322220523376018, "learning_rate": 4.2453174603174604e-07, "loss": 0.001, "num_tokens": 1110369912.0, "reward": 0.1875, "reward_std": 0.03130036853253841, "rewards/verify_chess_move/mean": 0.1875, "rewards/verify_chess_move/std": 0.9621013879776001, "step": 16510 }, { "completion_length": 464.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 86.4046875, "completions/mean_terminated_length": 86.4046875, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.014946053029302048, "frac_reward_zero_std": 0.9625, "grad_norm": 1.0744335651397705, "kl": 4.172363125812263, "learning_rate": 4.2449206349206344e-07, "loss": 0.0042, "num_tokens": 1110677030.0, "reward": 0.246875, "reward_std": 0.03130036816000938, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9662070989608764, "step": 16515 }, { "completion_length": 477.6, "completions/clipped_ratio": 0.0, "completions/max_length": 477.6, "completions/max_terminated_length": 477.6, "completions/mean_length": 92.6390625, "completions/mean_terminated_length": 92.6390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.014950578022650306, "frac_reward_zero_std": 0.96875, "grad_norm": 0.7172502279281616, "kl": 1.4931613980792462, "learning_rate": 4.2445238095238095e-07, "loss": 0.0015, "num_tokens": 1110994680.0, "reward": 0.3703125, "reward_std": 0.028930898010730743, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9108813643455506, "step": 16520 }, { "completion_length": 426.6, "completions/clipped_ratio": 0.0, "completions/max_length": 426.6, "completions/max_terminated_length": 426.6, "completions/mean_length": 84.2796875, "completions/mean_terminated_length": 84.2796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014955103015998566, "frac_reward_zero_std": 0.9625, "grad_norm": 0.4860730469226837, "kl": 0.5625272067962215, "learning_rate": 4.244126984126984e-07, "loss": 0.0006, "num_tokens": 1111299310.0, "reward": 0.4234375, "reward_std": 0.03492845892906189, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.9030228376388549, "step": 16525 }, { "completion_length": 459.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 459.2, "completions/max_terminated_length": 394.6, "completions/mean_length": 88.68203125, "completions/mean_terminated_length": 88.15113220214843, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.014959628009346826, "frac_reward_zero_std": 0.9375, "grad_norm": 4.648522853851318, "kl": 2.7353267948026767, "learning_rate": 4.243730158730158e-07, "loss": 0.0027, "num_tokens": 1111610935.0, "reward": 0.421875, "reward_std": 0.053077931702136996, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.9053661823272705, "step": 16530 }, { "completion_length": 395.2, "completions/clipped_ratio": 0.0, "completions/max_length": 395.2, "completions/max_terminated_length": 395.2, "completions/mean_length": 87.1140625, "completions/mean_terminated_length": 87.1140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014964153002695086, "frac_reward_zero_std": 0.975, "grad_norm": 0.23588992655277252, "kl": 0.637561872904189, "learning_rate": 4.243333333333333e-07, "loss": 0.0006, "num_tokens": 1111918033.0, "reward": 0.478125, "reward_std": 0.019727616384625436, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8562669515609741, "step": 16535 }, { "completion_length": 370.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 370.8, "completions/max_terminated_length": 307.0, "completions/mean_length": 94.47734375, "completions/mean_terminated_length": 93.96533203125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.014968677996043346, "frac_reward_zero_std": 0.975, "grad_norm": 0.8493472933769226, "kl": 0.8408355967141687, "learning_rate": 4.2429365079365076e-07, "loss": 0.0008, "num_tokens": 1112240620.0, "reward": 0.3140625, "reward_std": 0.02109457477927208, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9357981443405151, "step": 16540 }, { "completion_length": 418.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 418.4, "completions/max_terminated_length": 340.8, "completions/mean_length": 83.5671875, "completions/mean_terminated_length": 83.03856048583984, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.014973202989391605, "frac_reward_zero_std": 0.95, "grad_norm": 11.998475074768066, "kl": 0.33127893971977757, "learning_rate": 4.2425396825396827e-07, "loss": 0.0003, "num_tokens": 1112543314.0, "reward": 0.4046875, "reward_std": 0.043556108698248865, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9071272253990174, "step": 16545 }, { "completion_length": 495.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 495.0, "completions/max_terminated_length": 448.8, "completions/mean_length": 94.275, "completions/mean_terminated_length": 93.24432220458985, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.014977727982739865, "frac_reward_zero_std": 0.94375, "grad_norm": 19.037677764892578, "kl": 0.9477424633922056, "learning_rate": 4.2421428571428567e-07, "loss": 0.0009, "num_tokens": 1112862690.0, "reward": 0.3578125, "reward_std": 0.046348930522799495, "rewards/verify_chess_move/mean": 0.3578125, "rewards/verify_chess_move/std": 0.9221076726913452, "step": 16550 }, { "completion_length": 332.6, "completions/clipped_ratio": 0.0, "completions/max_length": 332.6, "completions/max_terminated_length": 332.6, "completions/mean_length": 90.8984375, "completions/mean_terminated_length": 90.8984375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.014982252976088125, "frac_reward_zero_std": 0.96875, "grad_norm": 3.1758902072906494, "kl": 0.6736696868669242, "learning_rate": 4.241746031746031e-07, "loss": 0.0007, "num_tokens": 1113176528.0, "reward": 0.4125, "reward_std": 0.027563939243555068, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.9025603175163269, "step": 16555 }, { "completion_length": 351.2, "completions/clipped_ratio": 0.0, "completions/max_length": 351.2, "completions/max_terminated_length": 351.2, "completions/mean_length": 87.79453125, "completions/mean_terminated_length": 87.79453125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.014986777969436385, "frac_reward_zero_std": 0.94375, "grad_norm": 1.7049182653427124, "kl": 0.14910863370168953, "learning_rate": 4.2413492063492063e-07, "loss": 0.0001, "num_tokens": 1113487225.0, "reward": 0.425, "reward_std": 0.04524160847067833, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.8910587668418884, "step": 16560 }, { "completion_length": 562.4, "completions/clipped_ratio": 0.00234375, "completions/max_length": 562.4, "completions/max_terminated_length": 508.2, "completions/mean_length": 96.1859375, "completions/mean_terminated_length": 94.62582550048828, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.014991302962784645, "frac_reward_zero_std": 0.94375, "grad_norm": 21.512392044067383, "kl": 0.5184140824712813, "learning_rate": 4.240952380952381e-07, "loss": 0.0005, "num_tokens": 1113809567.0, "reward": 0.36875, "reward_std": 0.04887068048119545, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9268843054771423, "step": 16565 }, { "completion_length": 311.4, "completions/clipped_ratio": 0.0, "completions/max_length": 311.4, "completions/max_terminated_length": 311.4, "completions/mean_length": 85.5703125, "completions/mean_terminated_length": 85.5703125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.014995827956132905, "frac_reward_zero_std": 0.9875, "grad_norm": 7.242623329162598, "kl": 0.34705140369478615, "learning_rate": 4.2405555555555554e-07, "loss": 0.0003, "num_tokens": 1114114793.0, "reward": 0.45, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.45, "rewards/verify_chess_move/std": 0.8553874611854553, "step": 16570 }, { "completion_length": 559.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 559.2, "completions/max_terminated_length": 521.0, "completions/mean_length": 90.9125, "completions/mean_terminated_length": 89.87712249755859, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015000352949481164, "frac_reward_zero_std": 0.96875, "grad_norm": 7.6350226402282715, "kl": 0.3468116622650996, "learning_rate": 4.24015873015873e-07, "loss": 0.0003, "num_tokens": 1114429193.0, "reward": 0.4078125, "reward_std": 0.02777610570192337, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.8893776416778565, "step": 16575 }, { "completion_length": 347.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 93.65078125, "completions/mean_terminated_length": 93.65078125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015004877942829424, "frac_reward_zero_std": 0.94375, "grad_norm": 1.1063624620437622, "kl": 0.5644950983580201, "learning_rate": 4.2397619047619045e-07, "loss": 0.0006, "num_tokens": 1114748354.0, "reward": 0.3125, "reward_std": 0.047715889662504195, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9478920459747314, "step": 16580 }, { "completion_length": 346.2, "completions/clipped_ratio": 0.0, "completions/max_length": 346.2, "completions/max_terminated_length": 346.2, "completions/mean_length": 89.69609375, "completions/mean_terminated_length": 89.69609375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015009402936177684, "frac_reward_zero_std": 0.9625, "grad_norm": 0.016114849597215652, "kl": 0.4041665272321552, "learning_rate": 4.239365079365079e-07, "loss": 0.0004, "num_tokens": 1115061621.0, "reward": 0.353125, "reward_std": 0.03014557547867298, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9313838362693787, "step": 16585 }, { "completion_length": 350.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 92.275, "completions/mean_terminated_length": 92.275, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015013927929525944, "frac_reward_zero_std": 0.9375, "grad_norm": 4.0511016845703125, "kl": 0.41412878714036194, "learning_rate": 4.2389682539682535e-07, "loss": 0.0004, "num_tokens": 1115376981.0, "reward": 0.421875, "reward_std": 0.05307793281972408, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8975699782371521, "step": 16590 }, { "completion_length": 508.6, "completions/clipped_ratio": 0.0, "completions/max_length": 508.6, "completions/max_terminated_length": 508.6, "completions/mean_length": 92.2703125, "completions/mean_terminated_length": 92.2703125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015018452922874204, "frac_reward_zero_std": 0.9375, "grad_norm": 0.22760747373104095, "kl": 0.2633078265353106, "learning_rate": 4.2385714285714286e-07, "loss": 0.0003, "num_tokens": 1115693687.0, "reward": 0.4171875, "reward_std": 0.05418525189161301, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8928666830062866, "step": 16595 }, { "completion_length": 328.6, "completions/clipped_ratio": 0.0, "completions/max_length": 328.6, "completions/max_terminated_length": 328.6, "completions/mean_length": 86.44765625, "completions/mean_terminated_length": 86.44765625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015022977916222462, "frac_reward_zero_std": 0.9625, "grad_norm": 7.332956314086914, "kl": 0.6327760744839906, "learning_rate": 4.238174603174603e-07, "loss": 0.0006, "num_tokens": 1116001356.0, "reward": 0.321875, "reward_std": 0.03471629172563553, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9416512370109558, "step": 16600 }, { "completion_length": 417.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 417.8, "completions/max_terminated_length": 402.4, "completions/mean_length": 89.81640625, "completions/mean_terminated_length": 88.76149139404296, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.015027502909570722, "frac_reward_zero_std": 0.95625, "grad_norm": 0.00411604717373848, "kl": 0.4601716702221893, "learning_rate": 4.237777777777777e-07, "loss": 0.0005, "num_tokens": 1116312097.0, "reward": 0.4609375, "reward_std": 0.038664887100458144, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8610430359840393, "step": 16605 }, { "completion_length": 408.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 408.8, "completions/max_terminated_length": 358.2, "completions/mean_length": 89.4578125, "completions/mean_terminated_length": 88.93605499267578, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015032027902918983, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0023043861147016287, "kl": 0.40525465140817685, "learning_rate": 4.237380952380952e-07, "loss": 0.0004, "num_tokens": 1116624315.0, "reward": 0.3359375, "reward_std": 0.02993340902030468, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9317179918289185, "step": 16610 }, { "completion_length": 372.8, "completions/clipped_ratio": 0.0, "completions/max_length": 372.8, "completions/max_terminated_length": 372.8, "completions/mean_length": 90.665625, "completions/mean_terminated_length": 90.665625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015036552896267243, "frac_reward_zero_std": 0.95, "grad_norm": 0.008305029012262821, "kl": 1.3356774553656579, "learning_rate": 4.236984126984127e-07, "loss": 0.0013, "num_tokens": 1116938095.0, "reward": 0.5046875, "reward_std": 0.041246522963047025, "rewards/verify_chess_move/mean": 0.5046875, "rewards/verify_chess_move/std": 0.8585571885108948, "step": 16615 }, { "completion_length": 420.4, "completions/clipped_ratio": 0.0, "completions/max_length": 420.4, "completions/max_terminated_length": 420.4, "completions/mean_length": 92.54765625, "completions/mean_terminated_length": 92.54765625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015041077889615503, "frac_reward_zero_std": 0.975, "grad_norm": 1.1864533424377441, "kl": 0.17737399709876628, "learning_rate": 4.236587301587302e-07, "loss": 0.0002, "num_tokens": 1117254508.0, "reward": 0.43125, "reward_std": 0.01767766922712326, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.8877305269241333, "step": 16620 }, { "completion_length": 269.8, "completions/clipped_ratio": 0.0, "completions/max_length": 269.8, "completions/max_terminated_length": 269.8, "completions/mean_length": 96.1359375, "completions/mean_terminated_length": 96.1359375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015045602882963763, "frac_reward_zero_std": 0.98125, "grad_norm": 1.501010537147522, "kl": 0.23685410926118494, "learning_rate": 4.236190476190476e-07, "loss": 0.0002, "num_tokens": 1117579162.0, "reward": 0.4125, "reward_std": 0.01462521068751812, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.9083035349845886, "step": 16625 }, { "completion_length": 447.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 447.4, "completions/max_terminated_length": 432.2, "completions/mean_length": 97.3875, "completions/mean_terminated_length": 96.33707580566406, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015050127876312021, "frac_reward_zero_std": 0.95625, "grad_norm": 1.4014089107513428, "kl": 0.6823242718703113, "learning_rate": 4.2357936507936504e-07, "loss": 0.0007, "num_tokens": 1117904802.0, "reward": 0.4421875, "reward_std": 0.03776973150670528, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8597548604011536, "step": 16630 }, { "completion_length": 367.2, "completions/clipped_ratio": 0.0, "completions/max_length": 367.2, "completions/max_terminated_length": 367.2, "completions/mean_length": 87.9234375, "completions/mean_terminated_length": 87.9234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015054652869660281, "frac_reward_zero_std": 0.98125, "grad_norm": 15.725296974182129, "kl": 0.19705999144352973, "learning_rate": 4.2353968253968255e-07, "loss": 0.0002, "num_tokens": 1118214800.0, "reward": 0.3109375, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9370764374732972, "step": 16635 }, { "completion_length": 394.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 394.0, "completions/max_terminated_length": 359.2, "completions/mean_length": 94.2, "completions/mean_terminated_length": 93.68444061279297, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015059177863008541, "frac_reward_zero_std": 0.95, "grad_norm": 6.1511688232421875, "kl": 0.37773480350151656, "learning_rate": 4.2349999999999995e-07, "loss": 0.0004, "num_tokens": 1118535136.0, "reward": 0.2890625, "reward_std": 0.04739636518061161, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9554783225059509, "step": 16640 }, { "completion_length": 285.4, "completions/clipped_ratio": 0.0, "completions/max_length": 285.4, "completions/max_terminated_length": 285.4, "completions/mean_length": 95.64375, "completions/mean_terminated_length": 95.64375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015063702856356801, "frac_reward_zero_std": 0.95625, "grad_norm": 0.08952431380748749, "kl": 0.4607054274063557, "learning_rate": 4.2346031746031745e-07, "loss": 0.0005, "num_tokens": 1118859960.0, "reward": 0.2421875, "reward_std": 0.037769732624292375, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9594817399978638, "step": 16645 }, { "completion_length": 391.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 391.8, "completions/max_terminated_length": 289.8, "completions/mean_length": 94.56328125, "completions/mean_terminated_length": 94.03353424072266, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015068227849705062, "frac_reward_zero_std": 0.96875, "grad_norm": 0.003185572801157832, "kl": 0.2762691881507635, "learning_rate": 4.234206349206349e-07, "loss": 0.0003, "num_tokens": 1119181257.0, "reward": 0.4015625, "reward_std": 0.025726159289479255, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9119141340255738, "step": 16650 }, { "completion_length": 376.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 376.6, "completions/max_terminated_length": 319.6, "completions/mean_length": 85.96875, "completions/mean_terminated_length": 84.90006103515626, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01507275284305332, "frac_reward_zero_std": 0.95, "grad_norm": 0.001636339700780809, "kl": 0.4924662109231576, "learning_rate": 4.2338095238095236e-07, "loss": 0.0005, "num_tokens": 1119487905.0, "reward": 0.4203125, "reward_std": 0.04398044161498547, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8769579410552979, "step": 16655 }, { "completion_length": 271.2, "completions/clipped_ratio": 0.0, "completions/max_length": 271.2, "completions/max_terminated_length": 271.2, "completions/mean_length": 86.1078125, "completions/mean_terminated_length": 86.1078125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01507727783640158, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024268196430057287, "kl": 0.20986733734607696, "learning_rate": 4.233412698412698e-07, "loss": 0.0002, "num_tokens": 1119793763.0, "reward": 0.4765625, "reward_std": 0.024831002578139305, "rewards/verify_chess_move/mean": 0.4765625, "rewards/verify_chess_move/std": 0.8757208943367004, "step": 16660 }, { "completion_length": 327.4, "completions/clipped_ratio": 0.0, "completions/max_length": 327.4, "completions/max_terminated_length": 327.4, "completions/mean_length": 95.03046875, "completions/mean_terminated_length": 95.03046875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01508180282974984, "frac_reward_zero_std": 0.9375, "grad_norm": 10.858845710754395, "kl": 0.7602505017770455, "learning_rate": 4.2330158730158727e-07, "loss": 0.0008, "num_tokens": 1120114802.0, "reward": 0.3734375, "reward_std": 0.05303046144545078, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.899452006816864, "step": 16665 }, { "completion_length": 401.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 88.36796875, "completions/mean_terminated_length": 88.36796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0150863278230981, "frac_reward_zero_std": 0.95625, "grad_norm": 25.092777252197266, "kl": 0.5080095324898138, "learning_rate": 4.232619047619048e-07, "loss": 0.0005, "num_tokens": 1120425345.0, "reward": 0.3109375, "reward_std": 0.035035816580057146, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9354473114013672, "step": 16670 }, { "completion_length": 353.6, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/max_terminated_length": 353.6, "completions/mean_length": 85.57578125, "completions/mean_terminated_length": 85.57578125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01509085281644636, "frac_reward_zero_std": 0.95, "grad_norm": 24.47315216064453, "kl": 2.389761696266942, "learning_rate": 4.232222222222222e-07, "loss": 0.0024, "num_tokens": 1120731474.0, "reward": 0.4078125, "reward_std": 0.04308528676629066, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.906105923652649, "step": 16675 }, { "completion_length": 405.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 405.0, "completions/max_terminated_length": 306.8, "completions/mean_length": 86.01171875, "completions/mean_terminated_length": 85.47910003662109, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01509537780979462, "frac_reward_zero_std": 0.9625, "grad_norm": 6.592018127441406, "kl": 2.2825548238120974, "learning_rate": 4.2318253968253963e-07, "loss": 0.0023, "num_tokens": 1121039825.0, "reward": 0.378125, "reward_std": 0.03219552300870419, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9161825180053711, "step": 16680 }, { "completion_length": 422.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 90.24609375, "completions/mean_terminated_length": 90.24609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015099902803142879, "frac_reward_zero_std": 0.9625, "grad_norm": 0.012308907695114613, "kl": 1.8943145404104142, "learning_rate": 4.2314285714285714e-07, "loss": 0.0019, "num_tokens": 1121353172.0, "reward": 0.246875, "reward_std": 0.0337746474891901, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.9608209252357482, "step": 16685 }, { "completion_length": 366.8, "completions/clipped_ratio": 0.0, "completions/max_length": 366.8, "completions/max_terminated_length": 366.8, "completions/mean_length": 93.24453125, "completions/mean_terminated_length": 93.24453125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015104427796491139, "frac_reward_zero_std": 0.9625, "grad_norm": 0.03568021580576897, "kl": 1.7736717281164602, "learning_rate": 4.231031746031746e-07, "loss": 0.0018, "num_tokens": 1121673669.0, "reward": 0.290625, "reward_std": 0.02925042025744915, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9303469181060791, "step": 16690 }, { "completion_length": 266.6, "completions/clipped_ratio": 0.0, "completions/max_length": 266.6, "completions/max_terminated_length": 266.6, "completions/mean_length": 83.084375, "completions/mean_terminated_length": 83.084375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.015108952789839399, "frac_reward_zero_std": 0.96875, "grad_norm": 19.834741592407227, "kl": 2.425463260570541, "learning_rate": 4.2306349206349205e-07, "loss": 0.0024, "num_tokens": 1121975697.0, "reward": 0.409375, "reward_std": 0.027563939243555068, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.9067108631134033, "step": 16695 }, { "completion_length": 296.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 84.66015625, "completions/mean_terminated_length": 84.66015625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015113477783187659, "frac_reward_zero_std": 0.9625, "grad_norm": 9.039095878601074, "kl": 2.478190026152879, "learning_rate": 4.230238095238095e-07, "loss": 0.0025, "num_tokens": 1122280086.0, "reward": 0.34375, "reward_std": 0.032195523381233215, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9330493927001953, "step": 16700 }, { "completion_length": 346.2, "completions/clipped_ratio": 0.0, "completions/max_length": 346.2, "completions/max_terminated_length": 346.2, "completions/mean_length": 86.51484375, "completions/mean_terminated_length": 86.51484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015118002776535919, "frac_reward_zero_std": 0.96875, "grad_norm": 8.906179428100586, "kl": 7.229153965855948, "learning_rate": 4.2298412698412696e-07, "loss": 0.0072, "num_tokens": 1122589401.0, "reward": 0.3921875, "reward_std": 0.0277761060744524, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9095942378044128, "step": 16705 }, { "completion_length": 395.8, "completions/clipped_ratio": 0.0, "completions/max_length": 395.8, "completions/max_terminated_length": 395.8, "completions/mean_length": 96.4765625, "completions/mean_terminated_length": 96.4765625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015122527769884177, "frac_reward_zero_std": 0.95, "grad_norm": 0.20674441754817963, "kl": 3.0059180593816563, "learning_rate": 4.2294444444444446e-07, "loss": 0.003, "num_tokens": 1122914291.0, "reward": 0.378125, "reward_std": 0.04376827478408814, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.921132504940033, "step": 16710 }, { "completion_length": 331.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 89.990625, "completions/mean_terminated_length": 89.990625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015127052763232438, "frac_reward_zero_std": 0.975, "grad_norm": 0.18491362035274506, "kl": 1.7584845699369906, "learning_rate": 4.2290476190476186e-07, "loss": 0.0018, "num_tokens": 1123226815.0, "reward": 0.309375, "reward_std": 0.02041158601641655, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9515680313110352, "step": 16715 }, { "completion_length": 425.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 425.6, "completions/max_terminated_length": 341.4, "completions/mean_length": 93.0078125, "completions/mean_terminated_length": 92.48488159179688, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015131577756580698, "frac_reward_zero_std": 0.95625, "grad_norm": 17.498577117919922, "kl": 2.052232451119926, "learning_rate": 4.2286507936507937e-07, "loss": 0.0021, "num_tokens": 1123545633.0, "reward": 0.340625, "reward_std": 0.040031847357749936, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9366654634475708, "step": 16720 }, { "completion_length": 361.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 361.6, "completions/max_terminated_length": 349.6, "completions/mean_length": 84.55078125, "completions/mean_terminated_length": 84.02174377441406, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015136102749928958, "frac_reward_zero_std": 0.975, "grad_norm": 33.42262268066406, "kl": 1.2200788793037645, "learning_rate": 4.228253968253968e-07, "loss": 0.0012, "num_tokens": 1123849370.0, "reward": 0.4625, "reward_std": 0.02177756354212761, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.8511798739433288, "step": 16725 }, { "completion_length": 418.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 418.6, "completions/max_terminated_length": 314.0, "completions/mean_length": 90.61796875, "completions/mean_terminated_length": 90.08679656982422, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015140627743277218, "frac_reward_zero_std": 0.95, "grad_norm": 0.43716439604759216, "kl": 2.244163321494125, "learning_rate": 4.227857142857142e-07, "loss": 0.0022, "num_tokens": 1124164169.0, "reward": 0.4515625, "reward_std": 0.04534641914069652, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8746610045433044, "step": 16730 }, { "completion_length": 394.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 394.6, "completions/max_terminated_length": 362.4, "completions/mean_length": 87.8859375, "completions/mean_terminated_length": 86.82009735107422, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015145152736625478, "frac_reward_zero_std": 0.94375, "grad_norm": 0.48980817198753357, "kl": 0.7856515628867783, "learning_rate": 4.2274603174603173e-07, "loss": 0.0008, "num_tokens": 1124474751.0, "reward": 0.3765625, "reward_std": 0.04340382814407349, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9214630961418152, "step": 16735 }, { "completion_length": 487.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 487.8, "completions/max_terminated_length": 358.8, "completions/mean_length": 93.4515625, "completions/mean_terminated_length": 92.39508819580078, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015149677729973736, "frac_reward_zero_std": 0.95, "grad_norm": 2.1319901943206787, "kl": 1.2090012583415954, "learning_rate": 4.227063492063492e-07, "loss": 0.0012, "num_tokens": 1124793369.0, "reward": 0.3609375, "reward_std": 0.04240131601691246, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9144522547721863, "step": 16740 }, { "completion_length": 378.6, "completions/clipped_ratio": 0.0, "completions/max_length": 378.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 86.55078125, "completions/mean_terminated_length": 86.55078125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.015154202723321996, "frac_reward_zero_std": 0.9875, "grad_norm": 0.17181052267551422, "kl": 0.2881992734386586, "learning_rate": 4.226666666666667e-07, "loss": 0.0003, "num_tokens": 1125101434.0, "reward": 0.4671875, "reward_std": 0.01225574016571045, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.8742770433425904, "step": 16745 }, { "completion_length": 370.2, "completions/clipped_ratio": 0.0, "completions/max_length": 370.2, "completions/max_terminated_length": 370.2, "completions/mean_length": 91.7359375, "completions/mean_terminated_length": 91.7359375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015158727716670256, "frac_reward_zero_std": 0.95625, "grad_norm": 11.356369972229004, "kl": 0.7048772299545817, "learning_rate": 4.226269841269841e-07, "loss": 0.0007, "num_tokens": 1125418720.0, "reward": 0.5015625, "reward_std": 0.03571978583931923, "rewards/verify_chess_move/mean": 0.5015625, "rewards/verify_chess_move/std": 0.8526713609695434, "step": 16750 }, { "completion_length": 445.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 445.6, "completions/max_terminated_length": 431.2, "completions/mean_length": 97.38828125, "completions/mean_terminated_length": 95.82271270751953, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015163252710018517, "frac_reward_zero_std": 0.9625, "grad_norm": 12.645208358764648, "kl": 1.1674074617912993, "learning_rate": 4.2258730158730155e-07, "loss": 0.0012, "num_tokens": 1125744049.0, "reward": 0.3484375, "reward_std": 0.031983356550335885, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9347173810005188, "step": 16755 }, { "completion_length": 514.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 514.6, "completions/max_terminated_length": 408.6, "completions/mean_length": 104.653125, "completions/mean_terminated_length": 103.07581481933593, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015167777703366777, "frac_reward_zero_std": 0.95625, "grad_norm": 2.345832109451294, "kl": 0.9275366997462697, "learning_rate": 4.2254761904761905e-07, "loss": 0.0009, "num_tokens": 1126080901.0, "reward": 0.375, "reward_std": 0.03708674274384975, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9242016315460205, "step": 16760 }, { "completion_length": 495.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 495.2, "completions/max_terminated_length": 400.0, "completions/mean_length": 93.77578125, "completions/mean_terminated_length": 93.25157012939454, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015172302696715035, "frac_reward_zero_std": 0.95, "grad_norm": 16.51352882385254, "kl": 0.45992591861868276, "learning_rate": 4.2250793650793646e-07, "loss": 0.0005, "num_tokens": 1126400998.0, "reward": 0.3171875, "reward_std": 0.04082219228148461, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9357090473175049, "step": 16765 }, { "completion_length": 355.2, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/max_terminated_length": 355.2, "completions/mean_length": 89.053125, "completions/mean_terminated_length": 89.053125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015176827690063295, "frac_reward_zero_std": 0.96875, "grad_norm": 25.388988494873047, "kl": 0.42951636780053376, "learning_rate": 4.2246825396825396e-07, "loss": 0.0004, "num_tokens": 1126713738.0, "reward": 0.478125, "reward_std": 0.025513992086052893, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8763361096382141, "step": 16770 }, { "completion_length": 466.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 466.8, "completions/max_terminated_length": 390.6, "completions/mean_length": 88.96796875, "completions/mean_terminated_length": 88.43420715332032, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015181352683411555, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017058057710528374, "kl": 0.7824026592075825, "learning_rate": 4.224285714285714e-07, "loss": 0.0008, "num_tokens": 1127024073.0, "reward": 0.371875, "reward_std": 0.025513992831110954, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9206655144691467, "step": 16775 }, { "completion_length": 285.4, "completions/clipped_ratio": 0.0, "completions/max_length": 285.4, "completions/max_terminated_length": 285.4, "completions/mean_length": 91.74140625, "completions/mean_terminated_length": 91.74140625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015185877676759815, "frac_reward_zero_std": 0.95, "grad_norm": 13.786118507385254, "kl": 0.4054068499710411, "learning_rate": 4.2238888888888887e-07, "loss": 0.0004, "num_tokens": 1127340638.0, "reward": 0.18125, "reward_std": 0.04308430477976799, "rewards/verify_chess_move/mean": 0.18125, "rewards/verify_chess_move/std": 0.9605384469032288, "step": 16780 }, { "completion_length": 388.6, "completions/clipped_ratio": 0.0, "completions/max_length": 388.6, "completions/max_terminated_length": 388.6, "completions/mean_length": 92.01484375, "completions/mean_terminated_length": 92.01484375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015190402670108075, "frac_reward_zero_std": 0.9625, "grad_norm": 0.7518346309661865, "kl": 1.2609585269121453, "learning_rate": 4.223492063492063e-07, "loss": 0.0013, "num_tokens": 1127658185.0, "reward": 0.3125, "reward_std": 0.031300367414951326, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9492805600166321, "step": 16785 }, { "completion_length": 306.4, "completions/clipped_ratio": 0.0, "completions/max_length": 306.4, "completions/max_terminated_length": 306.4, "completions/mean_length": 91.42421875, "completions/mean_terminated_length": 91.42421875, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.015194927663456335, "frac_reward_zero_std": 0.98125, "grad_norm": 8.646574974060059, "kl": 0.8548181462101638, "learning_rate": 4.223095238095238e-07, "loss": 0.0009, "num_tokens": 1127973608.0, "reward": 0.3953125, "reward_std": 0.018042115867137908, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9095044016838074, "step": 16790 }, { "completion_length": 334.2, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/max_terminated_length": 334.2, "completions/mean_length": 83.32734375, "completions/mean_terminated_length": 83.32734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015199452656804594, "frac_reward_zero_std": 0.9625, "grad_norm": 42.92293930053711, "kl": 0.24379630342591554, "learning_rate": 4.222698412698413e-07, "loss": 0.0002, "num_tokens": 1128274707.0, "reward": 0.3484375, "reward_std": 0.03445763550698757, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9254837393760681, "step": 16795 }, { "completion_length": 320.8, "completions/clipped_ratio": 0.0, "completions/max_length": 320.8, "completions/max_terminated_length": 320.8, "completions/mean_length": 92.22109375, "completions/mean_terminated_length": 92.22109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015203977650152854, "frac_reward_zero_std": 0.95, "grad_norm": 1.0798184871673584, "kl": 1.294735369249247, "learning_rate": 4.2223015873015874e-07, "loss": 0.0013, "num_tokens": 1128590798.0, "reward": 0.31875, "reward_std": 0.04923414662480354, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9413592576980591, "step": 16800 }, { "completion_length": 442.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 442.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 91.83203125, "completions/mean_terminated_length": 91.30185089111328, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015208502643501114, "frac_reward_zero_std": 0.95625, "grad_norm": 10.291162490844727, "kl": 0.46543789902934807, "learning_rate": 4.2219047619047614e-07, "loss": 0.0005, "num_tokens": 1128909335.0, "reward": 0.2296875, "reward_std": 0.033669838681817055, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.958220613002777, "step": 16805 }, { "completion_length": 452.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 452.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 91.3859375, "completions/mean_terminated_length": 89.82978057861328, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015213027636849374, "frac_reward_zero_std": 0.96875, "grad_norm": 27.296035766601562, "kl": 1.0528896033763886, "learning_rate": 4.2215079365079365e-07, "loss": 0.0011, "num_tokens": 1129221813.0, "reward": 0.4640625, "reward_std": 0.02777610570192337, "rewards/verify_chess_move/mean": 0.4640625, "rewards/verify_chess_move/std": 0.8807074189186096, "step": 16810 }, { "completion_length": 374.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 374.6, "completions/max_terminated_length": 358.6, "completions/mean_length": 85.96953125, "completions/mean_terminated_length": 85.4426483154297, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015217552630197634, "frac_reward_zero_std": 0.9625, "grad_norm": 21.956865310668945, "kl": 1.6087922119768336, "learning_rate": 4.221111111111111e-07, "loss": 0.0016, "num_tokens": 1129528814.0, "reward": 0.36875, "reward_std": 0.0326663464307785, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9006666660308837, "step": 16815 }, { "completion_length": 474.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 474.6, "completions/max_terminated_length": 318.4, "completions/mean_length": 95.90859375, "completions/mean_terminated_length": 94.87289123535156, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015222077623545893, "frac_reward_zero_std": 0.96875, "grad_norm": 5.6875152587890625, "kl": 1.3372050857055, "learning_rate": 4.2207142857142856e-07, "loss": 0.0013, "num_tokens": 1129851209.0, "reward": 0.36875, "reward_std": 0.02709311693906784, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.8883615374565125, "step": 16820 }, { "completion_length": 432.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 432.4, "completions/max_terminated_length": 394.6, "completions/mean_length": 91.68984375, "completions/mean_terminated_length": 91.16504974365235, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015226602616894153, "frac_reward_zero_std": 0.95, "grad_norm": 0.7497193813323975, "kl": 1.86636590535054, "learning_rate": 4.22031746031746e-07, "loss": 0.0019, "num_tokens": 1130167900.0, "reward": 0.296875, "reward_std": 0.041034357994794844, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9488668322563172, "step": 16825 }, { "completion_length": 391.6, "completions/clipped_ratio": 0.0, "completions/max_length": 391.6, "completions/max_terminated_length": 391.6, "completions/mean_length": 92.5328125, "completions/mean_terminated_length": 92.5328125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015231127610242413, "frac_reward_zero_std": 0.9625, "grad_norm": 5.092483043670654, "kl": 1.0735948594054208, "learning_rate": 4.2199206349206346e-07, "loss": 0.0011, "num_tokens": 1130485486.0, "reward": 0.4859375, "reward_std": 0.030828564241528512, "rewards/verify_chess_move/mean": 0.4859375, "rewards/verify_chess_move/std": 0.8481248617172241, "step": 16830 }, { "completion_length": 411.4, "completions/clipped_ratio": 0.0, "completions/max_length": 411.4, "completions/max_terminated_length": 411.4, "completions/mean_length": 86.5078125, "completions/mean_terminated_length": 86.5078125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015235652603590673, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021767267026007175, "kl": 2.0049825438763946, "learning_rate": 4.2195238095238097e-07, "loss": 0.002, "num_tokens": 1130791632.0, "reward": 0.375, "reward_std": 0.029613886773586274, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9268303513526917, "step": 16835 }, { "completion_length": 428.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 428.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 92.97734375, "completions/mean_terminated_length": 92.44835510253907, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015240177596938933, "frac_reward_zero_std": 0.95625, "grad_norm": 35.1787109375, "kl": 3.0381849035271444, "learning_rate": 4.2191269841269837e-07, "loss": 0.003, "num_tokens": 1131109603.0, "reward": 0.4421875, "reward_std": 0.042764782160520556, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8829870343208313, "step": 16840 }, { "completion_length": 442.2, "completions/clipped_ratio": 0.0, "completions/max_length": 442.2, "completions/max_terminated_length": 442.2, "completions/mean_length": 88.68984375, "completions/mean_terminated_length": 88.68984375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015244702590287193, "frac_reward_zero_std": 0.94375, "grad_norm": 23.952247619628906, "kl": 2.923862616927363, "learning_rate": 4.218730158730159e-07, "loss": 0.0029, "num_tokens": 1131420390.0, "reward": 0.36875, "reward_std": 0.050238621234893796, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9212243437767029, "step": 16845 }, { "completion_length": 273.2, "completions/clipped_ratio": 0.0, "completions/max_length": 273.2, "completions/max_terminated_length": 273.2, "completions/mean_length": 89.85390625, "completions/mean_terminated_length": 89.85390625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015249227583635451, "frac_reward_zero_std": 0.9625, "grad_norm": 8.185240745544434, "kl": 1.8395998136606067, "learning_rate": 4.2183333333333333e-07, "loss": 0.0018, "num_tokens": 1131734371.0, "reward": 0.278125, "reward_std": 0.028566450998187064, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9417358279228211, "step": 16850 }, { "completion_length": 326.2, "completions/clipped_ratio": 0.0, "completions/max_length": 326.2, "completions/max_terminated_length": 326.2, "completions/mean_length": 92.959375, "completions/mean_terminated_length": 92.959375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015253752576983711, "frac_reward_zero_std": 0.94375, "grad_norm": 11.08674144744873, "kl": 2.696150969224982, "learning_rate": 4.2179365079365073e-07, "loss": 0.0027, "num_tokens": 1132053847.0, "reward": 0.421875, "reward_std": 0.05139144919812679, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.9030793190002442, "step": 16855 }, { "completion_length": 378.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 378.0, "completions/max_terminated_length": 307.8, "completions/mean_length": 90.6640625, "completions/mean_terminated_length": 90.13121185302734, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015258277570331972, "frac_reward_zero_std": 0.96875, "grad_norm": 14.976373672485352, "kl": 0.38369078884134067, "learning_rate": 4.2175396825396824e-07, "loss": 0.0004, "num_tokens": 1132369513.0, "reward": 0.375, "reward_std": 0.025513992831110954, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.8906014204025269, "step": 16860 }, { "completion_length": 400.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 400.6, "completions/max_terminated_length": 307.0, "completions/mean_length": 95.0234375, "completions/mean_terminated_length": 94.4996551513672, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015262802563680232, "frac_reward_zero_std": 0.9875, "grad_norm": 0.3369416892528534, "kl": 0.7945653256261721, "learning_rate": 4.217142857142857e-07, "loss": 0.0008, "num_tokens": 1132692359.0, "reward": 0.4109375, "reward_std": 0.01315089538693428, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.9110048651695252, "step": 16865 }, { "completion_length": 300.4, "completions/clipped_ratio": 0.0, "completions/max_length": 300.4, "completions/max_terminated_length": 300.4, "completions/mean_length": 81.8640625, "completions/mean_terminated_length": 81.8640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015267327557028492, "frac_reward_zero_std": 0.96875, "grad_norm": 0.5893779397010803, "kl": 2.617398229963146, "learning_rate": 4.216746031746032e-07, "loss": 0.0026, "num_tokens": 1132990729.0, "reward": 0.4375, "reward_std": 0.025513992831110954, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8928272008895874, "step": 16870 }, { "completion_length": 329.8, "completions/clipped_ratio": 0.0, "completions/max_length": 329.8, "completions/max_terminated_length": 329.8, "completions/mean_length": 93.475, "completions/mean_terminated_length": 93.475, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015271852550376752, "frac_reward_zero_std": 0.9875, "grad_norm": 0.08948374539613724, "kl": 0.41272651236504315, "learning_rate": 4.216349206349206e-07, "loss": 0.0004, "num_tokens": 1133310881.0, "reward": 0.3671875, "reward_std": 0.011100948229432106, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.8937939763069153, "step": 16875 }, { "completion_length": 382.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 382.2, "completions/max_terminated_length": 300.8, "completions/mean_length": 88.3015625, "completions/mean_terminated_length": 87.78348236083984, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01527637754372501, "frac_reward_zero_std": 0.96875, "grad_norm": 17.428817749023438, "kl": 3.3368696273886598, "learning_rate": 4.2159523809523806e-07, "loss": 0.0033, "num_tokens": 1133622635.0, "reward": 0.3859375, "reward_std": 0.029355230927467345, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9074082732200622, "step": 16880 }, { "completion_length": 484.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 484.8, "completions/max_terminated_length": 407.6, "completions/mean_length": 86.20078125, "completions/mean_terminated_length": 85.66978302001954, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01528090253707327, "frac_reward_zero_std": 0.98125, "grad_norm": 4.724565505981445, "kl": 1.6256099455058575, "learning_rate": 4.2155555555555556e-07, "loss": 0.0016, "num_tokens": 1133930052.0, "reward": 0.390625, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9194617986679077, "step": 16885 }, { "completion_length": 341.4, "completions/clipped_ratio": 0.0, "completions/max_length": 341.4, "completions/max_terminated_length": 341.4, "completions/mean_length": 96.50546875, "completions/mean_terminated_length": 96.50546875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01528542753042153, "frac_reward_zero_std": 0.9875, "grad_norm": 4.799892425537109, "kl": 0.22488589244894683, "learning_rate": 4.21515873015873e-07, "loss": 0.0002, "num_tokens": 1134254403.0, "reward": 0.5015625, "reward_std": 0.010205793380737304, "rewards/verify_chess_move/mean": 0.5015625, "rewards/verify_chess_move/std": 0.8586928367614746, "step": 16890 }, { "completion_length": 470.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 470.4, "completions/max_terminated_length": 330.2, "completions/mean_length": 92.16015625, "completions/mean_terminated_length": 91.105517578125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01528995252376979, "frac_reward_zero_std": 0.9625, "grad_norm": 103.07241821289062, "kl": 5.768917050072924, "learning_rate": 4.214761904761904e-07, "loss": 0.0058, "num_tokens": 1134572760.0, "reward": 0.33125, "reward_std": 0.03130036853253841, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9355065226554871, "step": 16895 }, { "completion_length": 334.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 88.90546875, "completions/mean_terminated_length": 88.90546875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01529447751711805, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0020842922385782003, "kl": 2.354435419139918, "learning_rate": 4.214365079365079e-07, "loss": 0.0024, "num_tokens": 1134885727.0, "reward": 0.3765625, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9216644167900085, "step": 16900 }, { "completion_length": 263.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 83.196875, "completions/mean_terminated_length": 83.196875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015299002510466309, "frac_reward_zero_std": 0.9625, "grad_norm": 29.218013763427734, "kl": 0.24549918482080102, "learning_rate": 4.213968253968254e-07, "loss": 0.0002, "num_tokens": 1135189475.0, "reward": 0.4859375, "reward_std": 0.031983356922864914, "rewards/verify_chess_move/mean": 0.4859375, "rewards/verify_chess_move/std": 0.8330986142158509, "step": 16905 }, { "completion_length": 305.4, "completions/clipped_ratio": 0.0, "completions/max_length": 305.4, "completions/max_terminated_length": 305.4, "completions/mean_length": 92.1078125, "completions/mean_terminated_length": 92.1078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015303527503814569, "frac_reward_zero_std": 0.9875, "grad_norm": 0.7342997789382935, "kl": 0.36089640052523464, "learning_rate": 4.2135714285714283e-07, "loss": 0.0004, "num_tokens": 1135508413.0, "reward": 0.4484375, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.4484375, "rewards/verify_chess_move/std": 0.8841413140296936, "step": 16910 }, { "completion_length": 349.8, "completions/clipped_ratio": 0.0, "completions/max_length": 349.8, "completions/max_terminated_length": 349.8, "completions/mean_length": 91.525, "completions/mean_terminated_length": 91.525, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015308052497162829, "frac_reward_zero_std": 0.98125, "grad_norm": 4.218549728393555, "kl": 5.300854088971391, "learning_rate": 4.213174603174603e-07, "loss": 0.0053, "num_tokens": 1135826621.0, "reward": 0.3046875, "reward_std": 0.015992169082164765, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9264910221099854, "step": 16915 }, { "completion_length": 285.2, "completions/clipped_ratio": 0.0, "completions/max_length": 285.2, "completions/max_terminated_length": 285.2, "completions/mean_length": 86.86328125, "completions/mean_terminated_length": 86.86328125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01531257749051109, "frac_reward_zero_std": 0.96875, "grad_norm": 14.238639831542969, "kl": 1.3794081307249144, "learning_rate": 4.2127777777777774e-07, "loss": 0.0014, "num_tokens": 1136135678.0, "reward": 0.346875, "reward_std": 0.0284590944647789, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.909203040599823, "step": 16920 }, { "completion_length": 492.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 492.4, "completions/max_terminated_length": 429.2, "completions/mean_length": 91.134375, "completions/mean_terminated_length": 90.10648345947266, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01531710248385935, "frac_reward_zero_std": 0.95, "grad_norm": 9.521799087524414, "kl": 1.066979073442053, "learning_rate": 4.2123809523809525e-07, "loss": 0.0011, "num_tokens": 1136450898.0, "reward": 0.35, "reward_std": 0.04287311993539333, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9298113942146301, "step": 16925 }, { "completion_length": 303.8, "completions/clipped_ratio": 0.0, "completions/max_length": 303.8, "completions/max_terminated_length": 303.8, "completions/mean_length": 86.425, "completions/mean_terminated_length": 86.425, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01532162747720761, "frac_reward_zero_std": 0.9625, "grad_norm": 37.49019241333008, "kl": 2.493688868987374, "learning_rate": 4.2119841269841265e-07, "loss": 0.0025, "num_tokens": 1136757834.0, "reward": 0.553125, "reward_std": 0.02651650346815586, "rewards/verify_chess_move/mean": 0.553125, "rewards/verify_chess_move/std": 0.8270524382591248, "step": 16930 }, { "completion_length": 295.4, "completions/clipped_ratio": 0.0, "completions/max_length": 295.4, "completions/max_terminated_length": 295.4, "completions/mean_length": 90.90546875, "completions/mean_terminated_length": 90.90546875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015326152470555868, "frac_reward_zero_std": 0.975, "grad_norm": 0.0041428073309361935, "kl": 2.9844177229562776, "learning_rate": 4.2115873015873016e-07, "loss": 0.003, "num_tokens": 1137073865.0, "reward": 0.2140625, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.2140625, "rewards/verify_chess_move/std": 0.9693052887916564, "step": 16935 }, { "completion_length": 326.4, "completions/clipped_ratio": 0.0, "completions/max_length": 326.4, "completions/max_terminated_length": 326.4, "completions/mean_length": 90.16328125, "completions/mean_terminated_length": 90.16328125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015330677463904128, "frac_reward_zero_std": 0.9625, "grad_norm": 0.36145371198654175, "kl": 0.6616609043907374, "learning_rate": 4.211190476190476e-07, "loss": 0.0007, "num_tokens": 1137387538.0, "reward": 0.3875, "reward_std": 0.028566450998187064, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9199163675308227, "step": 16940 }, { "completion_length": 311.2, "completions/clipped_ratio": 0.0, "completions/max_length": 311.2, "completions/max_terminated_length": 311.2, "completions/mean_length": 89.8421875, "completions/mean_terminated_length": 89.8421875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015335202457252388, "frac_reward_zero_std": 0.975, "grad_norm": 35.40050506591797, "kl": 1.0595166189363225, "learning_rate": 4.21079365079365e-07, "loss": 0.0011, "num_tokens": 1137703528.0, "reward": 0.41875, "reward_std": 0.022461533173918725, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.9012622117996216, "step": 16945 }, { "completion_length": 381.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 381.6, "completions/max_terminated_length": 286.8, "completions/mean_length": 87.1765625, "completions/mean_terminated_length": 86.10521087646484, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015339727450600648, "frac_reward_zero_std": 0.98125, "grad_norm": 0.00255245971493423, "kl": 1.523557967983652, "learning_rate": 4.210396825396825e-07, "loss": 0.0015, "num_tokens": 1138010394.0, "reward": 0.371875, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9241015911102295, "step": 16950 }, { "completion_length": 393.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 393.8, "completions/max_terminated_length": 381.8, "completions/mean_length": 97.44609375, "completions/mean_terminated_length": 96.40062713623047, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015344252443948908, "frac_reward_zero_std": 0.9625, "grad_norm": 8.45707893371582, "kl": 3.0048564836150033, "learning_rate": 4.2099999999999997e-07, "loss": 0.003, "num_tokens": 1138334397.0, "reward": 0.4171875, "reward_std": 0.02993340976536274, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8863927841186523, "step": 16955 }, { "completion_length": 457.2, "completions/clipped_ratio": 0.0, "completions/max_length": 457.2, "completions/max_terminated_length": 457.2, "completions/mean_length": 93.83828125, "completions/mean_terminated_length": 93.83828125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015348777437297166, "frac_reward_zero_std": 0.9625, "grad_norm": 21.564348220825195, "kl": 1.5043332126573659, "learning_rate": 4.209603174603175e-07, "loss": 0.0015, "num_tokens": 1138651574.0, "reward": 0.3328125, "reward_std": 0.031512534245848656, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9348775744438171, "step": 16960 }, { "completion_length": 274.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 93.1296875, "completions/mean_terminated_length": 93.1296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015353302430645427, "frac_reward_zero_std": 0.96875, "grad_norm": 3.21842885017395, "kl": 2.3759960164316, "learning_rate": 4.209206349206349e-07, "loss": 0.0024, "num_tokens": 1138971284.0, "reward": 0.428125, "reward_std": 0.025513992086052893, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8911587595939636, "step": 16965 }, { "completion_length": 358.2, "completions/clipped_ratio": 0.0, "completions/max_length": 358.2, "completions/max_terminated_length": 358.2, "completions/mean_length": 90.0109375, "completions/mean_terminated_length": 90.0109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015357827423993687, "frac_reward_zero_std": 0.9625, "grad_norm": 0.8585144281387329, "kl": 2.7045310137793424, "learning_rate": 4.2088095238095233e-07, "loss": 0.0027, "num_tokens": 1139282650.0, "reward": 0.3765625, "reward_std": 0.03198335766792297, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9159708142280578, "step": 16970 }, { "completion_length": 266.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 88.13671875, "completions/mean_terminated_length": 88.13671875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015362352417341947, "frac_reward_zero_std": 0.975, "grad_norm": 24.798255920410156, "kl": 0.5891756351105869, "learning_rate": 4.2084126984126984e-07, "loss": 0.0006, "num_tokens": 1139593913.0, "reward": 0.3515625, "reward_std": 0.02198973037302494, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9335421562194824, "step": 16975 }, { "completion_length": 469.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 469.2, "completions/max_terminated_length": 383.0, "completions/mean_length": 95.1203125, "completions/mean_terminated_length": 94.5816162109375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015366877410690207, "frac_reward_zero_std": 0.9625, "grad_norm": 3.7431530952453613, "kl": 0.5077321137767286, "learning_rate": 4.208015873015873e-07, "loss": 0.0005, "num_tokens": 1139915547.0, "reward": 0.41875, "reward_std": 0.028566450625658036, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.9003271102905274, "step": 16980 }, { "completion_length": 354.6, "completions/clipped_ratio": 0.0, "completions/max_length": 354.6, "completions/max_terminated_length": 354.6, "completions/mean_length": 90.66796875, "completions/mean_terminated_length": 90.66796875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.015371402404038467, "frac_reward_zero_std": 0.975, "grad_norm": 0.00783849973231554, "kl": 0.8322408218169585, "learning_rate": 4.2076190476190475e-07, "loss": 0.0008, "num_tokens": 1140228930.0, "reward": 0.490625, "reward_std": 0.023827511072158813, "rewards/verify_chess_move/mean": 0.490625, "rewards/verify_chess_move/std": 0.8616740226745605, "step": 16985 }, { "completion_length": 340.4, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/max_terminated_length": 340.4, "completions/mean_length": 85.39921875, "completions/mean_terminated_length": 85.39921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015375927397386725, "frac_reward_zero_std": 0.9625, "grad_norm": 6.201296329498291, "kl": 1.1260709187365137, "learning_rate": 4.207222222222222e-07, "loss": 0.0011, "num_tokens": 1140534937.0, "reward": 0.390625, "reward_std": 0.02651650384068489, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9099587559700012, "step": 16990 }, { "completion_length": 346.8, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/max_terminated_length": 346.8, "completions/mean_length": 90.29921875, "completions/mean_terminated_length": 90.29921875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.015380452390734985, "frac_reward_zero_std": 0.95, "grad_norm": 3.5655250549316406, "kl": 1.838660328858532, "learning_rate": 4.2068253968253966e-07, "loss": 0.0018, "num_tokens": 1140848016.0, "reward": 0.2828125, "reward_std": 0.04739636592566967, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9391804575920105, "step": 16995 }, { "completion_length": 461.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 461.0, "completions/max_terminated_length": 368.2, "completions/mean_length": 88.075, "completions/mean_terminated_length": 87.55411529541016, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015384977384083245, "frac_reward_zero_std": 0.9125, "grad_norm": 26.240331649780273, "kl": 1.9683537322096527, "learning_rate": 4.206428571428571e-07, "loss": 0.002, "num_tokens": 1141157992.0, "reward": 0.453125, "reward_std": 0.07758941426873207, "rewards/verify_chess_move/mean": 0.453125, "rewards/verify_chess_move/std": 0.8839996933937073, "step": 17000 }, { "completion_length": 344.6, "completions/clipped_ratio": 0.0, "completions/max_length": 344.6, "completions/max_terminated_length": 344.6, "completions/mean_length": 93.91875, "completions/mean_terminated_length": 93.91875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015389502377431506, "frac_reward_zero_std": 0.9875, "grad_norm": 0.10540508478879929, "kl": 0.9264767767628654, "learning_rate": 4.2060317460317456e-07, "loss": 0.0009, "num_tokens": 1141476368.0, "reward": 0.478125, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8632181525230408, "step": 17005 }, { "completion_length": 298.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 87.58125, "completions/mean_terminated_length": 87.58125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015394027370779766, "frac_reward_zero_std": 0.96875, "grad_norm": 27.3303165435791, "kl": 0.2528275791555643, "learning_rate": 4.2056349206349207e-07, "loss": 0.0003, "num_tokens": 1141786360.0, "reward": 0.3125, "reward_std": 0.026409146934747697, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9332822799682617, "step": 17010 }, { "completion_length": 410.2, "completions/clipped_ratio": 0.0, "completions/max_length": 410.2, "completions/max_terminated_length": 410.2, "completions/mean_length": 89.8390625, "completions/mean_terminated_length": 89.8390625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015398552364128024, "frac_reward_zero_std": 0.95625, "grad_norm": 0.002874838188290596, "kl": 1.54809242982883, "learning_rate": 4.205238095238095e-07, "loss": 0.0015, "num_tokens": 1142098946.0, "reward": 0.3296875, "reward_std": 0.03571978583931923, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9228779435157776, "step": 17015 }, { "completion_length": 331.6, "completions/clipped_ratio": 0.0, "completions/max_length": 331.6, "completions/max_terminated_length": 331.6, "completions/mean_length": 90.375, "completions/mean_terminated_length": 90.375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015403077357476284, "frac_reward_zero_std": 0.96875, "grad_norm": 2.9387600421905518, "kl": 1.408084056340158, "learning_rate": 4.204841269841269e-07, "loss": 0.0014, "num_tokens": 1142412314.0, "reward": 0.44375, "reward_std": 0.025513992831110954, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8912846446037292, "step": 17020 }, { "completion_length": 495.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 495.6, "completions/max_terminated_length": 483.2, "completions/mean_length": 95.44296875, "completions/mean_terminated_length": 93.8651351928711, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015407602350824544, "frac_reward_zero_std": 0.96875, "grad_norm": 10.964625358581543, "kl": 2.1018339893547817, "learning_rate": 4.2044444444444443e-07, "loss": 0.0021, "num_tokens": 1142734145.0, "reward": 0.3546875, "reward_std": 0.024831004068255426, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9147978186607361, "step": 17025 }, { "completion_length": 258.2, "completions/clipped_ratio": 0.0, "completions/max_length": 258.2, "completions/max_terminated_length": 258.2, "completions/mean_length": 85.53359375, "completions/mean_terminated_length": 85.53359375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.015412127344172804, "frac_reward_zero_std": 0.98125, "grad_norm": 21.6756591796875, "kl": 0.93596361563541, "learning_rate": 4.204047619047619e-07, "loss": 0.0009, "num_tokens": 1143041028.0, "reward": 0.28125, "reward_std": 0.016675157845020293, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9553849101066589, "step": 17030 }, { "completion_length": 294.2, "completions/clipped_ratio": 0.0, "completions/max_length": 294.2, "completions/max_terminated_length": 294.2, "completions/mean_length": 98.02265625, "completions/mean_terminated_length": 98.02265625, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.015416652337521064, "frac_reward_zero_std": 0.95, "grad_norm": 0.006817217450588942, "kl": 2.266943026194349, "learning_rate": 4.2036507936507934e-07, "loss": 0.0023, "num_tokens": 1143370793.0, "reward": 0.3671875, "reward_std": 0.04287213943898678, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9288706183433533, "step": 17035 }, { "completion_length": 288.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 84.3734375, "completions/mean_terminated_length": 84.3734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015421177330869324, "frac_reward_zero_std": 0.98125, "grad_norm": 0.2697228193283081, "kl": 0.39763158322311937, "learning_rate": 4.203253968253968e-07, "loss": 0.0004, "num_tokens": 1143674951.0, "reward": 0.525, "reward_std": 0.01872510462999344, "rewards/verify_chess_move/mean": 0.525, "rewards/verify_chess_move/std": 0.8293063282966614, "step": 17040 }, { "completion_length": 384.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 384.8, "completions/max_terminated_length": 330.6, "completions/mean_length": 95.39921875, "completions/mean_terminated_length": 94.87429351806641, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015425702324217583, "frac_reward_zero_std": 0.9625, "grad_norm": 6.242747783660889, "kl": 3.8650762712117284, "learning_rate": 4.2028571428571425e-07, "loss": 0.0039, "num_tokens": 1143998054.0, "reward": 0.3046875, "reward_std": 0.031983356550335885, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.946716594696045, "step": 17045 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 95.771875, "completions/mean_terminated_length": 95.771875, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.015430227317565843, "frac_reward_zero_std": 0.975, "grad_norm": 0.0021758931688964367, "kl": 0.9533528469735757, "learning_rate": 4.2024603174603176e-07, "loss": 0.001, "num_tokens": 1144320826.0, "reward": 0.409375, "reward_std": 0.01767766922712326, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.9010716080665588, "step": 17050 }, { "completion_length": 291.8, "completions/clipped_ratio": 0.0, "completions/max_length": 291.8, "completions/max_terminated_length": 291.8, "completions/mean_length": 83.2546875, "completions/mean_terminated_length": 83.2546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015434752310914103, "frac_reward_zero_std": 0.96875, "grad_norm": 1.8757879734039307, "kl": 1.4381422506878152, "learning_rate": 4.2020634920634916e-07, "loss": 0.0014, "num_tokens": 1144621952.0, "reward": 0.4296875, "reward_std": 0.028930898010730743, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8938907861709595, "step": 17055 }, { "completion_length": 317.8, "completions/clipped_ratio": 0.0, "completions/max_length": 317.8, "completions/max_terminated_length": 317.8, "completions/mean_length": 94.01640625, "completions/mean_terminated_length": 94.01640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015439277304262363, "frac_reward_zero_std": 0.9375, "grad_norm": 31.327194213867188, "kl": 1.8671565861906856, "learning_rate": 4.2016666666666666e-07, "loss": 0.0019, "num_tokens": 1144941037.0, "reward": 0.396875, "reward_std": 0.053973089158535006, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9179164648056031, "step": 17060 }, { "completion_length": 414.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 414.4, "completions/max_terminated_length": 395.0, "completions/mean_length": 92.275, "completions/mean_terminated_length": 91.2166519165039, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015443802297610623, "frac_reward_zero_std": 0.95, "grad_norm": 3.3540611267089844, "kl": 1.4219278497505001, "learning_rate": 4.201269841269841e-07, "loss": 0.0014, "num_tokens": 1145257757.0, "reward": 0.3359375, "reward_std": 0.04560605548322201, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9368629932403565, "step": 17065 }, { "completion_length": 321.4, "completions/clipped_ratio": 0.0, "completions/max_length": 321.4, "completions/max_terminated_length": 321.4, "completions/mean_length": 89.66796875, "completions/mean_terminated_length": 89.66796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015448327290958882, "frac_reward_zero_std": 0.9375, "grad_norm": 6.983442306518555, "kl": 1.2652412660187111, "learning_rate": 4.2008730158730157e-07, "loss": 0.0013, "num_tokens": 1145570260.0, "reward": 0.421875, "reward_std": 0.055081389471888544, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8894571781158447, "step": 17070 }, { "completion_length": 307.4, "completions/clipped_ratio": 0.0, "completions/max_length": 307.4, "completions/max_terminated_length": 307.4, "completions/mean_length": 94.76171875, "completions/mean_terminated_length": 94.76171875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015452852284307142, "frac_reward_zero_std": 0.975, "grad_norm": 0.0242473017424345, "kl": 1.9324621228734031, "learning_rate": 4.20047619047619e-07, "loss": 0.0019, "num_tokens": 1145893699.0, "reward": 0.2390625, "reward_std": 0.023568854853510856, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.960409653186798, "step": 17075 }, { "completion_length": 397.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 397.6, "completions/max_terminated_length": 336.8, "completions/mean_length": 90.0765625, "completions/mean_terminated_length": 89.55057678222656, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015457377277655402, "frac_reward_zero_std": 0.975, "grad_norm": 4.078164100646973, "kl": 1.626616515009664, "learning_rate": 4.200079365079365e-07, "loss": 0.0016, "num_tokens": 1146207037.0, "reward": 0.3953125, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9112160801887512, "step": 17080 }, { "completion_length": 455.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 455.0, "completions/max_terminated_length": 391.6, "completions/mean_length": 95.46015625, "completions/mean_terminated_length": 94.40317993164062, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015461902271003662, "frac_reward_zero_std": 0.96875, "grad_norm": 5.321981906890869, "kl": 1.2480002740281635, "learning_rate": 4.19968253968254e-07, "loss": 0.0012, "num_tokens": 1146530498.0, "reward": 0.296875, "reward_std": 0.02798827216029167, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9470124840736389, "step": 17085 }, { "completion_length": 459.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 459.8, "completions/max_terminated_length": 362.2, "completions/mean_length": 89.5390625, "completions/mean_terminated_length": 89.00236053466797, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015466427264351922, "frac_reward_zero_std": 0.9625, "grad_norm": 16.63535499572754, "kl": 0.8952004569233395, "learning_rate": 4.199285714285714e-07, "loss": 0.0009, "num_tokens": 1146842780.0, "reward": 0.4359375, "reward_std": 0.02993340939283371, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.8883450865745545, "step": 17090 }, { "completion_length": 558.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 558.0, "completions/max_terminated_length": 493.6, "completions/mean_length": 87.62421875, "completions/mean_terminated_length": 84.95915985107422, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.015470952257700182, "frac_reward_zero_std": 0.975, "grad_norm": 0.002239018213003874, "kl": 0.7807193013955839, "learning_rate": 4.1988888888888884e-07, "loss": 0.0008, "num_tokens": 1147150019.0, "reward": 0.46875, "reward_std": 0.023356688767671586, "rewards/verify_chess_move/mean": 0.46875, "rewards/verify_chess_move/std": 0.8595072388648987, "step": 17095 }, { "completion_length": 333.4, "completions/clipped_ratio": 0.0, "completions/max_length": 333.4, "completions/max_terminated_length": 333.4, "completions/mean_length": 92.61171875, "completions/mean_terminated_length": 92.61171875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01547547725104844, "frac_reward_zero_std": 0.94375, "grad_norm": 0.009696656838059425, "kl": 0.9036638893652708, "learning_rate": 4.1984920634920635e-07, "loss": 0.0009, "num_tokens": 1147468706.0, "reward": 0.4625, "reward_std": 0.05297057330608368, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.8717746615409852, "step": 17100 }, { "completion_length": 271.8, "completions/clipped_ratio": 0.0, "completions/max_length": 271.8, "completions/max_terminated_length": 271.8, "completions/mean_length": 92.153125, "completions/mean_terminated_length": 92.153125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0154800022443967, "frac_reward_zero_std": 0.975, "grad_norm": 2.8988876342773438, "kl": 0.36881088172085585, "learning_rate": 4.198095238095238e-07, "loss": 0.0004, "num_tokens": 1147785198.0, "reward": 0.3765625, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9118528723716736, "step": 17105 }, { "completion_length": 312.6, "completions/clipped_ratio": 0.0, "completions/max_length": 312.6, "completions/max_terminated_length": 312.6, "completions/mean_length": 93.146875, "completions/mean_terminated_length": 93.146875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01548452723774496, "frac_reward_zero_std": 0.98125, "grad_norm": 17.852500915527344, "kl": 0.2193090746877715, "learning_rate": 4.1976984126984126e-07, "loss": 0.0002, "num_tokens": 1148102370.0, "reward": 0.4015625, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.9106740236282349, "step": 17110 }, { "completion_length": 244.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 81.98984375, "completions/mean_terminated_length": 81.98984375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01548905223109322, "frac_reward_zero_std": 0.95, "grad_norm": 37.29969787597656, "kl": 0.17922365255653858, "learning_rate": 4.197301587301587e-07, "loss": 0.0002, "num_tokens": 1148403685.0, "reward": 0.3703125, "reward_std": 0.040822190791368486, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9072924256324768, "step": 17115 }, { "completion_length": 370.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 370.6, "completions/max_terminated_length": 342.8, "completions/mean_length": 93.8515625, "completions/mean_terminated_length": 92.25741424560547, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01549357722444148, "frac_reward_zero_std": 0.95625, "grad_norm": 10.643607139587402, "kl": 2.0958321700105444, "learning_rate": 4.1969047619047616e-07, "loss": 0.0021, "num_tokens": 1148724583.0, "reward": 0.2796875, "reward_std": 0.03819406554102898, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9445738196372986, "step": 17120 }, { "completion_length": 289.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 91.84609375, "completions/mean_terminated_length": 91.84609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015498102217789739, "frac_reward_zero_std": 0.975, "grad_norm": 0.42835840582847595, "kl": 1.5758094315184281, "learning_rate": 4.196507936507936e-07, "loss": 0.0016, "num_tokens": 1149040930.0, "reward": 0.4265625, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.4265625, "rewards/verify_chess_move/std": 0.8925123572349548, "step": 17125 }, { "completion_length": 311.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 86.9046875, "completions/mean_terminated_length": 86.9046875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015502627211138, "frac_reward_zero_std": 0.95, "grad_norm": 2.0170774459838867, "kl": 0.6076540638227016, "learning_rate": 4.1961111111111107e-07, "loss": 0.0006, "num_tokens": 1149350576.0, "reward": 0.359375, "reward_std": 0.045818221569061277, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.933149516582489, "step": 17130 }, { "completion_length": 395.2, "completions/clipped_ratio": 0.0, "completions/max_length": 395.2, "completions/max_terminated_length": 395.2, "completions/mean_length": 92.8484375, "completions/mean_terminated_length": 92.8484375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01550715220448626, "frac_reward_zero_std": 0.96875, "grad_norm": 17.61446189880371, "kl": 0.6504685846506618, "learning_rate": 4.195714285714286e-07, "loss": 0.0007, "num_tokens": 1149670550.0, "reward": 0.3421875, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9297125339508057, "step": 17135 }, { "completion_length": 516.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 516.0, "completions/max_terminated_length": 449.8, "completions/mean_length": 89.4015625, "completions/mean_terminated_length": 88.87250671386718, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01551167719783452, "frac_reward_zero_std": 0.9625, "grad_norm": 10.542283058166504, "kl": 0.21848676059162245, "learning_rate": 4.1953174603174603e-07, "loss": 0.0002, "num_tokens": 1149981512.0, "reward": 0.3375, "reward_std": 0.032666345685720445, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9231838345527649, "step": 17140 }, { "completion_length": 446.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 446.8, "completions/max_terminated_length": 377.2, "completions/mean_length": 89.23515625, "completions/mean_terminated_length": 88.17695007324218, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.01551620219118278, "frac_reward_zero_std": 0.96875, "grad_norm": 14.18880558013916, "kl": 2.2131731887231583, "learning_rate": 4.1949206349206343e-07, "loss": 0.0022, "num_tokens": 1150292733.0, "reward": 0.29375, "reward_std": 0.02619796246290207, "rewards/verify_chess_move/mean": 0.29375, "rewards/verify_chess_move/std": 0.955563998222351, "step": 17145 }, { "completion_length": 369.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 369.0, "completions/max_terminated_length": 276.8, "completions/mean_length": 88.8890625, "completions/mean_terminated_length": 88.35485076904297, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.01552072718453104, "frac_reward_zero_std": 0.98125, "grad_norm": 0.023574167862534523, "kl": 0.7817941357381641, "learning_rate": 4.1945238095238094e-07, "loss": 0.0008, "num_tokens": 1150604095.0, "reward": 0.446875, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.446875, "rewards/verify_chess_move/std": 0.8859438896179199, "step": 17150 }, { "completion_length": 381.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 381.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 88.56796875, "completions/mean_terminated_length": 88.03043975830079, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015525252177879298, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020709913223981857, "kl": 0.2713028295431286, "learning_rate": 4.194126984126984e-07, "loss": 0.0003, "num_tokens": 1150914990.0, "reward": 0.2359375, "reward_std": 0.03051002323627472, "rewards/verify_chess_move/mean": 0.2359375, "rewards/verify_chess_move/std": 0.9538371801376343, "step": 17155 }, { "completion_length": 357.6, "completions/clipped_ratio": 0.0, "completions/max_length": 357.6, "completions/max_terminated_length": 357.6, "completions/mean_length": 87.43046875, "completions/mean_terminated_length": 87.43046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015529777171227558, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0013094190508127213, "kl": 0.1572794143576175, "learning_rate": 4.193730158730159e-07, "loss": 0.0002, "num_tokens": 1151223357.0, "reward": 0.33125, "reward_std": 0.011572751402854919, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9038065314292908, "step": 17160 }, { "completion_length": 443.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 443.2, "completions/max_terminated_length": 401.4, "completions/mean_length": 89.76796875, "completions/mean_terminated_length": 89.23780670166016, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015534302164575818, "frac_reward_zero_std": 0.94375, "grad_norm": 0.04889964684844017, "kl": 0.721744974446483, "learning_rate": 4.193333333333333e-07, "loss": 0.0007, "num_tokens": 1151536828.0, "reward": 0.4296875, "reward_std": 0.0445586197078228, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8921790242195129, "step": 17165 }, { "completion_length": 294.6, "completions/clipped_ratio": 0.0, "completions/max_length": 294.6, "completions/max_terminated_length": 294.6, "completions/mean_length": 91.9375, "completions/mean_terminated_length": 91.9375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015538827157924078, "frac_reward_zero_std": 0.96875, "grad_norm": 23.53706169128418, "kl": 0.29343430770095436, "learning_rate": 4.1929365079365076e-07, "loss": 0.0003, "num_tokens": 1151855004.0, "reward": 0.39375, "reward_std": 0.02709311805665493, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.9092936515808105, "step": 17170 }, { "completion_length": 337.8, "completions/clipped_ratio": 0.0, "completions/max_length": 337.8, "completions/max_terminated_length": 337.8, "completions/mean_length": 97.28515625, "completions/mean_terminated_length": 97.28515625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015543352151272338, "frac_reward_zero_std": 0.98125, "grad_norm": 1.4313201904296875, "kl": 0.17843961988110096, "learning_rate": 4.1925396825396826e-07, "loss": 0.0002, "num_tokens": 1152181217.0, "reward": 0.3046875, "reward_std": 0.01893727108836174, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.939203679561615, "step": 17175 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 348.2, "completions/max_terminated_length": 261.2, "completions/mean_length": 84.96796875, "completions/mean_terminated_length": 84.44319915771484, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015547877144620597, "frac_reward_zero_std": 0.9375, "grad_norm": 29.520061492919922, "kl": 4.444856408319902, "learning_rate": 4.1921428571428566e-07, "loss": 0.0044, "num_tokens": 1152485928.0, "reward": 0.4421875, "reward_std": 0.054185252636671066, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8589540123939514, "step": 17180 }, { "completion_length": 426.8, "completions/clipped_ratio": 0.0, "completions/max_length": 426.8, "completions/max_terminated_length": 426.8, "completions/mean_length": 91.80390625, "completions/mean_terminated_length": 91.80390625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015552402137968857, "frac_reward_zero_std": 0.96875, "grad_norm": 0.3082351088523865, "kl": 2.3025069429539142, "learning_rate": 4.1917460317460317e-07, "loss": 0.0023, "num_tokens": 1152802437.0, "reward": 0.3046875, "reward_std": 0.028460075706243516, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9535065412521362, "step": 17185 }, { "completion_length": 460.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 460.4, "completions/max_terminated_length": 370.2, "completions/mean_length": 96.7578125, "completions/mean_terminated_length": 96.2335220336914, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015556927131317117, "frac_reward_zero_std": 0.975, "grad_norm": 8.113030433654785, "kl": 2.3677099674940107, "learning_rate": 4.191349206349206e-07, "loss": 0.0024, "num_tokens": 1153129519.0, "reward": 0.3296875, "reward_std": 0.02109457477927208, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9395146369934082, "step": 17190 }, { "completion_length": 312.8, "completions/clipped_ratio": 0.0, "completions/max_length": 312.8, "completions/max_terminated_length": 312.8, "completions/mean_length": 91.62734375, "completions/mean_terminated_length": 91.62734375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015561452124665377, "frac_reward_zero_std": 0.925, "grad_norm": 0.0052048820070922375, "kl": 2.1726252312306316, "learning_rate": 4.190952380952381e-07, "loss": 0.0022, "num_tokens": 1153445802.0, "reward": 0.31875, "reward_std": 0.06507501564919949, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9163851618766785, "step": 17195 }, { "completion_length": 330.4, "completions/clipped_ratio": 0.0, "completions/max_length": 330.4, "completions/max_terminated_length": 330.4, "completions/mean_length": 86.82265625, "completions/mean_terminated_length": 86.82265625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015565977118013637, "frac_reward_zero_std": 0.9625, "grad_norm": 18.11391830444336, "kl": 2.2487745970021935, "learning_rate": 4.1905555555555553e-07, "loss": 0.0022, "num_tokens": 1153754687.0, "reward": 0.4125, "reward_std": 0.034034284949302676, "rewards/verify_chess_move/mean": 0.4125, "rewards/verify_chess_move/std": 0.9004213333129882, "step": 17200 }, { "completion_length": 326.4, "completions/clipped_ratio": 0.0, "completions/max_length": 326.4, "completions/max_terminated_length": 326.4, "completions/mean_length": 91.28359375, "completions/mean_terminated_length": 91.28359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015570502111361897, "frac_reward_zero_std": 0.975, "grad_norm": 2.4335999488830566, "kl": 0.4843011154793203, "learning_rate": 4.19015873015873e-07, "loss": 0.0005, "num_tokens": 1154070306.0, "reward": 0.4203125, "reward_std": 0.023144521936774253, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8986517071723938, "step": 17205 }, { "completion_length": 366.6, "completions/clipped_ratio": 0.0, "completions/max_length": 366.6, "completions/max_terminated_length": 366.6, "completions/mean_length": 94.93046875, "completions/mean_terminated_length": 94.93046875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015575027104710155, "frac_reward_zero_std": 0.98125, "grad_norm": 0.02070448361337185, "kl": 0.34184882321860643, "learning_rate": 4.189761904761905e-07, "loss": 0.0003, "num_tokens": 1154391297.0, "reward": 0.3078125, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.946921968460083, "step": 17210 }, { "completion_length": 402.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 94.4734375, "completions/mean_terminated_length": 94.4734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015579552098058416, "frac_reward_zero_std": 0.975, "grad_norm": 22.039487838745117, "kl": 0.1941279891645536, "learning_rate": 4.189365079365079e-07, "loss": 0.0002, "num_tokens": 1154712447.0, "reward": 0.3015625, "reward_std": 0.02198973000049591, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9474023699760437, "step": 17215 }, { "completion_length": 312.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 95.11953125, "completions/mean_terminated_length": 95.11953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015584077091406676, "frac_reward_zero_std": 0.9625, "grad_norm": 3.4177405834198, "kl": 0.5237375357421115, "learning_rate": 4.1889682539682535e-07, "loss": 0.0005, "num_tokens": 1155035056.0, "reward": 0.2296875, "reward_std": 0.03198335729539394, "rewards/verify_chess_move/mean": 0.2296875, "rewards/verify_chess_move/std": 0.9631929993629456, "step": 17220 }, { "completion_length": 351.6, "completions/clipped_ratio": 0.0, "completions/max_length": 351.6, "completions/max_terminated_length": 351.6, "completions/mean_length": 90.8765625, "completions/mean_terminated_length": 90.8765625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015588602084754936, "frac_reward_zero_std": 0.96875, "grad_norm": 21.39618682861328, "kl": 1.1123426680453121, "learning_rate": 4.1885714285714286e-07, "loss": 0.0011, "num_tokens": 1155349922.0, "reward": 0.2828125, "reward_std": 0.026621313393116, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9421403288841248, "step": 17225 }, { "completion_length": 377.4, "completions/clipped_ratio": 0.0, "completions/max_length": 377.4, "completions/max_terminated_length": 377.4, "completions/mean_length": 95.64765625, "completions/mean_terminated_length": 95.64765625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015593127078103196, "frac_reward_zero_std": 0.91875, "grad_norm": 24.11616325378418, "kl": 1.1802259464049711, "learning_rate": 4.188174603174603e-07, "loss": 0.0012, "num_tokens": 1155673743.0, "reward": 0.3140625, "reward_std": 0.06381541341543198, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.940949547290802, "step": 17230 }, { "completion_length": 357.2, "completions/clipped_ratio": 0.0, "completions/max_length": 357.2, "completions/max_terminated_length": 357.2, "completions/mean_length": 90.02421875, "completions/mean_terminated_length": 90.02421875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015597652071451454, "frac_reward_zero_std": 0.95625, "grad_norm": 18.57054328918457, "kl": 1.562647702288814, "learning_rate": 4.1877777777777776e-07, "loss": 0.0016, "num_tokens": 1155990182.0, "reward": 0.3296875, "reward_std": 0.04050364941358566, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9400956034660339, "step": 17235 }, { "completion_length": 415.4, "completions/clipped_ratio": 0.0, "completions/max_length": 415.4, "completions/max_terminated_length": 415.4, "completions/mean_length": 91.69375, "completions/mean_terminated_length": 91.69375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015602177064799714, "frac_reward_zero_std": 0.95625, "grad_norm": 38.14213562011719, "kl": 2.1612188917351887, "learning_rate": 4.187380952380952e-07, "loss": 0.0022, "num_tokens": 1156307278.0, "reward": 0.396875, "reward_std": 0.03729792907834053, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.8934137225151062, "step": 17240 }, { "completion_length": 283.4, "completions/clipped_ratio": 0.0, "completions/max_length": 283.4, "completions/max_terminated_length": 283.4, "completions/mean_length": 83.7515625, "completions/mean_terminated_length": 83.7515625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015606702058147974, "frac_reward_zero_std": 0.9625, "grad_norm": 3.3862743377685547, "kl": 4.129469518363476, "learning_rate": 4.1869841269841267e-07, "loss": 0.0041, "num_tokens": 1156609424.0, "reward": 0.478125, "reward_std": 0.03061639852821827, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8704942107200623, "step": 17245 }, { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 88.04296875, "completions/mean_terminated_length": 88.04296875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015611227051496234, "frac_reward_zero_std": 0.975, "grad_norm": 22.433425903320312, "kl": 2.689197105448693, "learning_rate": 4.186587301587302e-07, "loss": 0.0027, "num_tokens": 1156920303.0, "reward": 0.4375, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8990341663360596, "step": 17250 }, { "completion_length": 451.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 451.6, "completions/max_terminated_length": 394.8, "completions/mean_length": 85.25078125, "completions/mean_terminated_length": 84.72026214599609, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015615752044844495, "frac_reward_zero_std": 0.9625, "grad_norm": 6.563592910766602, "kl": 0.3669039910659194, "learning_rate": 4.186190476190476e-07, "loss": 0.0004, "num_tokens": 1157224736.0, "reward": 0.4015625, "reward_std": 0.032667326554656026, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.8955878019332886, "step": 17255 }, { "completion_length": 457.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 457.2, "completions/max_terminated_length": 311.6, "completions/mean_length": 91.7875, "completions/mean_terminated_length": 90.1968505859375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015620277038192755, "frac_reward_zero_std": 0.9875, "grad_norm": 0.5451962947845459, "kl": 0.7056057489011437, "learning_rate": 4.185793650793651e-07, "loss": 0.0007, "num_tokens": 1157543272.0, "reward": 0.4, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.8944802045822143, "step": 17260 }, { "completion_length": 452.6, "completions/clipped_ratio": 0.0, "completions/max_length": 452.6, "completions/max_terminated_length": 452.6, "completions/mean_length": 88.25234375, "completions/mean_terminated_length": 88.25234375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015624802031541013, "frac_reward_zero_std": 0.95, "grad_norm": 8.046059608459473, "kl": 0.7205579502508044, "learning_rate": 4.1853968253968254e-07, "loss": 0.0007, "num_tokens": 1157853027.0, "reward": 0.4671875, "reward_std": 0.04171734787523747, "rewards/verify_chess_move/mean": 0.4671875, "rewards/verify_chess_move/std": 0.870238208770752, "step": 17265 }, { "completion_length": 337.2, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/max_terminated_length": 337.2, "completions/mean_length": 86.646875, "completions/mean_terminated_length": 86.646875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015629327024889273, "frac_reward_zero_std": 0.975, "grad_norm": 21.84215545654297, "kl": 1.9060803968226536, "learning_rate": 4.1849999999999994e-07, "loss": 0.0019, "num_tokens": 1158162439.0, "reward": 0.4046875, "reward_std": 0.021778544783592223, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9019173383712769, "step": 17270 }, { "completion_length": 490.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 490.6, "completions/max_terminated_length": 389.4, "completions/mean_length": 87.63359375, "completions/mean_terminated_length": 87.10291748046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015633852018237533, "frac_reward_zero_std": 0.9875, "grad_norm": 3.297083616256714, "kl": 8.429900107788853, "learning_rate": 4.1846031746031745e-07, "loss": 0.0084, "num_tokens": 1158472018.0, "reward": 0.2734375, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.9394201636314392, "step": 17275 }, { "completion_length": 403.8, "completions/clipped_ratio": 0.0, "completions/max_length": 403.8, "completions/max_terminated_length": 403.8, "completions/mean_length": 97.9375, "completions/mean_terminated_length": 97.9375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015638377011585793, "frac_reward_zero_std": 0.9625, "grad_norm": 0.034893814474344254, "kl": 1.4212325874483214, "learning_rate": 4.184206349206349e-07, "loss": 0.0014, "num_tokens": 1158798666.0, "reward": 0.3671875, "reward_std": 0.03356248140335083, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9135337710380554, "step": 17280 }, { "completion_length": 298.2, "completions/clipped_ratio": 0.0, "completions/max_length": 298.2, "completions/max_terminated_length": 298.2, "completions/mean_length": 81.42578125, "completions/mean_terminated_length": 81.42578125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015642902004934053, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0025349410716444254, "kl": 1.0054832890629768, "learning_rate": 4.1838095238095236e-07, "loss": 0.001, "num_tokens": 1159098963.0, "reward": 0.4734375, "reward_std": 0.024831003695726394, "rewards/verify_chess_move/mean": 0.4734375, "rewards/verify_chess_move/std": 0.8758452773094177, "step": 17285 }, { "completion_length": 307.8, "completions/clipped_ratio": 0.0, "completions/max_length": 307.8, "completions/max_terminated_length": 307.8, "completions/mean_length": 99.4609375, "completions/mean_terminated_length": 99.4609375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015647426998282313, "frac_reward_zero_std": 0.9625, "grad_norm": 5.157090663909912, "kl": 0.8354532843455672, "learning_rate": 4.183412698412698e-07, "loss": 0.0008, "num_tokens": 1159429425.0, "reward": 0.2921875, "reward_std": 0.03561242893338203, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9516657948493957, "step": 17290 }, { "completion_length": 425.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 89.25390625, "completions/mean_terminated_length": 89.25390625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015651951991630574, "frac_reward_zero_std": 0.95, "grad_norm": 3.8977415561676025, "kl": 3.761957788444124, "learning_rate": 4.1830158730158727e-07, "loss": 0.0038, "num_tokens": 1159743854.0, "reward": 0.35, "reward_std": 0.044239097833633424, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9182279706001282, "step": 17295 }, { "completion_length": 318.8, "completions/clipped_ratio": 0.0, "completions/max_length": 318.8, "completions/max_terminated_length": 318.8, "completions/mean_length": 89.23671875, "completions/mean_terminated_length": 89.23671875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015656476984978834, "frac_reward_zero_std": 0.95625, "grad_norm": 10.10927677154541, "kl": 0.69151459292043, "learning_rate": 4.1826190476190477e-07, "loss": 0.0007, "num_tokens": 1160056133.0, "reward": 0.3984375, "reward_std": 0.038664887100458144, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9134389400482178, "step": 17300 }, { "completion_length": 270.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 84.28203125, "completions/mean_terminated_length": 84.28203125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01566100197832709, "frac_reward_zero_std": 0.9375, "grad_norm": 28.583574295043945, "kl": 1.168098474247381, "learning_rate": 4.1822222222222217e-07, "loss": 0.0012, "num_tokens": 1160359694.0, "reward": 0.2875, "reward_std": 0.055127878487110135, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9536653637886048, "step": 17305 }, { "completion_length": 475.6, "completions/clipped_ratio": 0.00546875, "completions/max_length": 475.6, "completions/max_terminated_length": 338.0, "completions/mean_length": 99.7015625, "completions/mean_terminated_length": 96.05809936523437, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01566552697167535, "frac_reward_zero_std": 0.925, "grad_norm": 0.38935598731040955, "kl": 0.7222017924999818, "learning_rate": 4.1818253968253963e-07, "loss": 0.0007, "num_tokens": 1160686768.0, "reward": 0.378125, "reward_std": 0.0628119207918644, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9152417302131652, "step": 17310 }, { "completion_length": 404.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 404.4, "completions/max_terminated_length": 384.6, "completions/mean_length": 87.57578125, "completions/mean_terminated_length": 87.05618743896484, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01567005196502361, "frac_reward_zero_std": 0.9625, "grad_norm": 11.394143104553223, "kl": 0.19264380484819413, "learning_rate": 4.1814285714285713e-07, "loss": 0.0002, "num_tokens": 1160995121.0, "reward": 0.4375, "reward_std": 0.03130036816000938, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8898903846740722, "step": 17315 }, { "completion_length": 457.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 457.2, "completions/max_terminated_length": 362.0, "completions/mean_length": 90.25078125, "completions/mean_terminated_length": 89.71143493652343, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01567457695837187, "frac_reward_zero_std": 0.9625, "grad_norm": 0.010144233703613281, "kl": 0.398809031402925, "learning_rate": 4.181031746031746e-07, "loss": 0.0004, "num_tokens": 1161310722.0, "reward": 0.3078125, "reward_std": 0.030617378652095795, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9350520491600036, "step": 17320 }, { "completion_length": 380.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 380.2, "completions/max_terminated_length": 287.8, "completions/mean_length": 87.38515625, "completions/mean_terminated_length": 86.8426727294922, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01567910195172013, "frac_reward_zero_std": 0.975, "grad_norm": 2.361140251159668, "kl": 0.4353341756737791, "learning_rate": 4.1806349206349204e-07, "loss": 0.0004, "num_tokens": 1161620815.0, "reward": 0.428125, "reward_std": 0.0245114803314209, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.9008752465248108, "step": 17325 }, { "completion_length": 294.8, "completions/clipped_ratio": 0.0, "completions/max_length": 294.8, "completions/max_terminated_length": 294.8, "completions/mean_length": 92.65234375, "completions/mean_terminated_length": 92.65234375, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.01568362694506839, "frac_reward_zero_std": 0.975, "grad_norm": 2.825256824493408, "kl": 0.7112473480170592, "learning_rate": 4.180238095238095e-07, "loss": 0.0007, "num_tokens": 1161938322.0, "reward": 0.3203125, "reward_std": 0.019939782470464705, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9377402305603028, "step": 17330 }, { "completion_length": 261.6, "completions/clipped_ratio": 0.0, "completions/max_length": 261.6, "completions/max_terminated_length": 261.6, "completions/mean_length": 85.034375, "completions/mean_terminated_length": 85.034375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.01568815193841665, "frac_reward_zero_std": 0.96875, "grad_norm": 0.012020832858979702, "kl": 0.3942442837636918, "learning_rate": 4.1798412698412695e-07, "loss": 0.0004, "num_tokens": 1162244654.0, "reward": 0.3328125, "reward_std": 0.0286712609231472, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.9335789442062378, "step": 17335 }, { "completion_length": 308.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 83.17109375, "completions/mean_terminated_length": 83.17109375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01569267693176491, "frac_reward_zero_std": 0.98125, "grad_norm": 3.0102100372314453, "kl": 0.34800253331195563, "learning_rate": 4.1794444444444446e-07, "loss": 0.0003, "num_tokens": 1162547577.0, "reward": 0.325, "reward_std": 0.017570312693715097, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9264091730117798, "step": 17340 }, { "completion_length": 313.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 95.45546875, "completions/mean_terminated_length": 95.45546875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01569720192511317, "frac_reward_zero_std": 0.95625, "grad_norm": 2.6494476795196533, "kl": 0.7823264194186776, "learning_rate": 4.1790476190476186e-07, "loss": 0.0008, "num_tokens": 1162869896.0, "reward": 0.3265625, "reward_std": 0.03661494068801403, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9395082473754883, "step": 17345 }, { "completion_length": 260.2, "completions/clipped_ratio": 0.0, "completions/max_length": 260.2, "completions/max_terminated_length": 260.2, "completions/mean_length": 80.00703125, "completions/mean_terminated_length": 80.00703125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01570172691846143, "frac_reward_zero_std": 0.9625, "grad_norm": 1.6318340301513672, "kl": 0.8592862501740456, "learning_rate": 4.1786507936507936e-07, "loss": 0.0009, "num_tokens": 1163163801.0, "reward": 0.43125, "reward_std": 0.031300367787480354, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.8861462593078613, "step": 17350 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/max_terminated_length": 359.2, "completions/mean_length": 94.5765625, "completions/mean_terminated_length": 94.5765625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01570625191180969, "frac_reward_zero_std": 0.975, "grad_norm": 0.0064734118059277534, "kl": 0.3705457841278985, "learning_rate": 4.178253968253968e-07, "loss": 0.0004, "num_tokens": 1163483435.0, "reward": 0.4546875, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8919669389724731, "step": 17355 }, { "completion_length": 414.4, "completions/clipped_ratio": 0.0, "completions/max_length": 414.4, "completions/max_terminated_length": 414.4, "completions/mean_length": 88.8, "completions/mean_terminated_length": 88.8, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015710776905157948, "frac_reward_zero_std": 0.975, "grad_norm": 0.0010130986338481307, "kl": 0.3265581165906042, "learning_rate": 4.177857142857142e-07, "loss": 0.0003, "num_tokens": 1163793323.0, "reward": 0.375, "reward_std": 0.01767766922712326, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9197045207023621, "step": 17360 }, { "completion_length": 309.6, "completions/clipped_ratio": 0.0, "completions/max_length": 309.6, "completions/max_terminated_length": 309.6, "completions/mean_length": 91.68828125, "completions/mean_terminated_length": 91.68828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015715301898506208, "frac_reward_zero_std": 0.9625, "grad_norm": 11.979403495788574, "kl": 0.3213797816541046, "learning_rate": 4.1774603174603173e-07, "loss": 0.0003, "num_tokens": 1164109932.0, "reward": 0.21875, "reward_std": 0.0319843377918005, "rewards/verify_chess_move/mean": 0.21875, "rewards/verify_chess_move/std": 0.9646952271461486, "step": 17365 }, { "completion_length": 317.2, "completions/clipped_ratio": 0.0, "completions/max_length": 317.2, "completions/max_terminated_length": 317.2, "completions/mean_length": 90.4109375, "completions/mean_terminated_length": 90.4109375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015719826891854468, "frac_reward_zero_std": 0.94375, "grad_norm": 30.054378509521484, "kl": 1.1429870683234185, "learning_rate": 4.177063492063492e-07, "loss": 0.0011, "num_tokens": 1164424506.0, "reward": 0.4265625, "reward_std": 0.050237638503313066, "rewards/verify_chess_move/mean": 0.4265625, "rewards/verify_chess_move/std": 0.8979169249534606, "step": 17370 }, { "completion_length": 421.4, "completions/clipped_ratio": 0.0, "completions/max_length": 421.4, "completions/max_terminated_length": 421.4, "completions/mean_length": 91.76171875, "completions/mean_terminated_length": 91.76171875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015724351885202728, "frac_reward_zero_std": 0.96875, "grad_norm": 5.321216583251953, "kl": 1.904496351024136, "learning_rate": 4.176666666666667e-07, "loss": 0.0019, "num_tokens": 1164739513.0, "reward": 0.51875, "reward_std": 0.02619796246290207, "rewards/verify_chess_move/mean": 0.51875, "rewards/verify_chess_move/std": 0.8506159305572509, "step": 17375 }, { "completion_length": 306.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 86.0578125, "completions/mean_terminated_length": 86.0578125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015728876878550988, "frac_reward_zero_std": 0.95625, "grad_norm": 5.63361930847168, "kl": 0.32662376421503725, "learning_rate": 4.176269841269841e-07, "loss": 0.0003, "num_tokens": 1165045691.0, "reward": 0.3890625, "reward_std": 0.038453702628612516, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9148507237434387, "step": 17380 }, { "completion_length": 469.2, "completions/clipped_ratio": 0.0, "completions/max_length": 469.2, "completions/max_terminated_length": 469.2, "completions/mean_length": 93.00078125, "completions/mean_terminated_length": 93.00078125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01573340187189925, "frac_reward_zero_std": 0.95, "grad_norm": 41.242652893066406, "kl": 0.8195022841449827, "learning_rate": 4.1758730158730154e-07, "loss": 0.0008, "num_tokens": 1165364204.0, "reward": 0.440625, "reward_std": 0.0437682744115591, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8698813199996949, "step": 17385 }, { "completion_length": 305.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 92.3140625, "completions/mean_terminated_length": 92.3140625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01573792686524751, "frac_reward_zero_std": 0.9875, "grad_norm": 28.09021759033203, "kl": 0.9264310955069959, "learning_rate": 4.1754761904761905e-07, "loss": 0.0009, "num_tokens": 1165682678.0, "reward": 0.403125, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9005169987678527, "step": 17390 }, { "completion_length": 260.2, "completions/clipped_ratio": 0.0, "completions/max_length": 260.2, "completions/max_terminated_length": 260.2, "completions/mean_length": 87.7796875, "completions/mean_terminated_length": 87.7796875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01574245185859577, "frac_reward_zero_std": 0.95625, "grad_norm": 0.22000287473201752, "kl": 0.8084799263160676, "learning_rate": 4.1750793650793645e-07, "loss": 0.0008, "num_tokens": 1165993188.0, "reward": 0.4328125, "reward_std": 0.03456499315798282, "rewards/verify_chess_move/mean": 0.4328125, "rewards/verify_chess_move/std": 0.8834993720054627, "step": 17395 }, { "completion_length": 435.2, "completions/clipped_ratio": 0.0, "completions/max_length": 435.2, "completions/max_terminated_length": 435.2, "completions/mean_length": 92.56796875, "completions/mean_terminated_length": 92.56796875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01574697685194403, "frac_reward_zero_std": 0.96875, "grad_norm": 4.6923418045043945, "kl": 1.4092646413715557, "learning_rate": 4.1746825396825396e-07, "loss": 0.0014, "num_tokens": 1166310699.0, "reward": 0.3609375, "reward_std": 0.025726158171892166, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9016141533851624, "step": 17400 }, { "completion_length": 322.6, "completions/clipped_ratio": 0.0, "completions/max_length": 322.6, "completions/max_terminated_length": 322.6, "completions/mean_length": 88.31328125, "completions/mean_terminated_length": 88.31328125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01575150184529229, "frac_reward_zero_std": 0.96875, "grad_norm": 0.17905376851558685, "kl": 3.6158635765779765, "learning_rate": 4.174285714285714e-07, "loss": 0.0036, "num_tokens": 1166622396.0, "reward": 0.3609375, "reward_std": 0.02777610532939434, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9275570392608643, "step": 17405 }, { "completion_length": 320.6, "completions/clipped_ratio": 0.0, "completions/max_length": 320.6, "completions/max_terminated_length": 320.6, "completions/mean_length": 84.97109375, "completions/mean_terminated_length": 84.97109375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01575602683864055, "frac_reward_zero_std": 0.96875, "grad_norm": 0.10506176948547363, "kl": 0.23166645786259324, "learning_rate": 4.1738888888888887e-07, "loss": 0.0002, "num_tokens": 1166927359.0, "reward": 0.4703125, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.4703125, "rewards/verify_chess_move/std": 0.8660680055618286, "step": 17410 }, { "completion_length": 318.6, "completions/clipped_ratio": 0.0, "completions/max_length": 318.6, "completions/max_terminated_length": 318.6, "completions/mean_length": 98.41015625, "completions/mean_terminated_length": 98.41015625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015760551831988805, "frac_reward_zero_std": 0.98125, "grad_norm": 1.8839999437332153, "kl": 0.8610737752402201, "learning_rate": 4.173492063492063e-07, "loss": 0.0009, "num_tokens": 1167254388.0, "reward": 0.359375, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9227653026580811, "step": 17415 }, { "completion_length": 321.6, "completions/clipped_ratio": 0.0, "completions/max_length": 321.6, "completions/max_terminated_length": 321.6, "completions/mean_length": 86.31171875, "completions/mean_terminated_length": 86.31171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015765076825337065, "frac_reward_zero_std": 0.93125, "grad_norm": 0.00382182071916759, "kl": 1.8025907571194693, "learning_rate": 4.1730952380952377e-07, "loss": 0.0018, "num_tokens": 1167563379.0, "reward": 0.3375, "reward_std": 0.06338853649795055, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9224232077598572, "step": 17420 }, { "completion_length": 393.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 393.2, "completions/max_terminated_length": 301.6, "completions/mean_length": 93.3859375, "completions/mean_terminated_length": 92.85879058837891, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015769601818685326, "frac_reward_zero_std": 0.975, "grad_norm": 4.268989562988281, "kl": 0.6303860984975472, "learning_rate": 4.172698412698413e-07, "loss": 0.0006, "num_tokens": 1167883353.0, "reward": 0.3390625, "reward_std": 0.023568854480981827, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9188488125801086, "step": 17425 }, { "completion_length": 305.8, "completions/clipped_ratio": 0.0, "completions/max_length": 305.8, "completions/max_terminated_length": 305.8, "completions/mean_length": 89.55546875, "completions/mean_terminated_length": 89.55546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015774126812033586, "frac_reward_zero_std": 0.9625, "grad_norm": 4.615702152252197, "kl": 2.747794999321923, "learning_rate": 4.1723015873015873e-07, "loss": 0.0027, "num_tokens": 1168196576.0, "reward": 0.3734375, "reward_std": 0.03356248140335083, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9262160420417785, "step": 17430 }, { "completion_length": 271.8, "completions/clipped_ratio": 0.0, "completions/max_length": 271.8, "completions/max_terminated_length": 271.8, "completions/mean_length": 90.7296875, "completions/mean_terminated_length": 90.7296875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015778651805381846, "frac_reward_zero_std": 0.96875, "grad_norm": 0.007069986313581467, "kl": 0.5606521158013493, "learning_rate": 4.1719047619047614e-07, "loss": 0.0006, "num_tokens": 1168510126.0, "reward": 0.31875, "reward_std": 0.030297856032848357, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9427780866622925, "step": 17435 }, { "completion_length": 298.8, "completions/clipped_ratio": 0.0, "completions/max_length": 298.8, "completions/max_terminated_length": 298.8, "completions/mean_length": 90.88515625, "completions/mean_terminated_length": 90.88515625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.015783176798730106, "frac_reward_zero_std": 0.95625, "grad_norm": 0.03406490013003349, "kl": 1.8728594158310443, "learning_rate": 4.1715079365079364e-07, "loss": 0.0019, "num_tokens": 1168824467.0, "reward": 0.4765625, "reward_std": 0.03934885673224926, "rewards/verify_chess_move/mean": 0.4765625, "rewards/verify_chess_move/std": 0.8744112491607666, "step": 17440 }, { "completion_length": 322.6, "completions/clipped_ratio": 0.0, "completions/max_length": 322.6, "completions/max_terminated_length": 322.6, "completions/mean_length": 92.14375, "completions/mean_terminated_length": 92.14375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015787701792078366, "frac_reward_zero_std": 0.98125, "grad_norm": 12.002989768981934, "kl": 0.777899022307247, "learning_rate": 4.171111111111111e-07, "loss": 0.0008, "num_tokens": 1169141483.0, "reward": 0.359375, "reward_std": 0.016675157845020293, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9244667410850524, "step": 17445 }, { "completion_length": 306.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 89.2765625, "completions/mean_terminated_length": 89.2765625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.015792226785426626, "frac_reward_zero_std": 0.98125, "grad_norm": 14.573124885559082, "kl": 0.2959259521914646, "learning_rate": 4.1707142857142855e-07, "loss": 0.0003, "num_tokens": 1169454357.0, "reward": 0.4953125, "reward_std": 0.01940809339284897, "rewards/verify_chess_move/mean": 0.4953125, "rewards/verify_chess_move/std": 0.8675588011741638, "step": 17450 }, { "completion_length": 281.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 89.67109375, "completions/mean_terminated_length": 89.67109375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.015796751778774886, "frac_reward_zero_std": 0.98125, "grad_norm": 2.704754114151001, "kl": 0.5550332748796791, "learning_rate": 4.17031746031746e-07, "loss": 0.0006, "num_tokens": 1169769152.0, "reward": 0.2234375, "reward_std": 0.015992169082164765, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9678121089935303, "step": 17455 }, { "completion_length": 401.4, "completions/clipped_ratio": 0.0, "completions/max_length": 401.4, "completions/max_terminated_length": 401.4, "completions/mean_length": 89.41484375, "completions/mean_terminated_length": 89.41484375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015801276772123146, "frac_reward_zero_std": 0.95, "grad_norm": 26.669052124023438, "kl": 1.61390426189173, "learning_rate": 4.1699206349206346e-07, "loss": 0.0016, "num_tokens": 1170081827.0, "reward": 0.3625, "reward_std": 0.041034357994794844, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.9249990105628967, "step": 17460 }, { "completion_length": 418.4, "completions/clipped_ratio": 0.0, "completions/max_length": 418.4, "completions/max_terminated_length": 418.4, "completions/mean_length": 92.409375, "completions/mean_terminated_length": 92.409375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015805801765471406, "frac_reward_zero_std": 0.98125, "grad_norm": 0.04010232910513878, "kl": 1.2564007283421232, "learning_rate": 4.1695238095238097e-07, "loss": 0.0013, "num_tokens": 1170399839.0, "reward": 0.3484375, "reward_std": 0.015992169082164765, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9157090187072754, "step": 17465 }, { "completion_length": 384.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 384.6, "completions/max_terminated_length": 291.6, "completions/mean_length": 94.61875, "completions/mean_terminated_length": 94.09244842529297, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015810326758819663, "frac_reward_zero_std": 0.95, "grad_norm": 16.52318000793457, "kl": 0.5896645066328347, "learning_rate": 4.1691269841269837e-07, "loss": 0.0006, "num_tokens": 1170720159.0, "reward": 0.325, "reward_std": 0.04397945925593376, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.943104374408722, "step": 17470 }, { "completion_length": 335.2, "completions/clipped_ratio": 0.0, "completions/max_length": 335.2, "completions/max_terminated_length": 335.2, "completions/mean_length": 91.146875, "completions/mean_terminated_length": 91.146875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015814851752167923, "frac_reward_zero_std": 0.95625, "grad_norm": 11.654617309570312, "kl": 2.012954010465182, "learning_rate": 4.1687301587301587e-07, "loss": 0.002, "num_tokens": 1171035419.0, "reward": 0.26875, "reward_std": 0.037981897965073584, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9508647203445435, "step": 17475 }, { "completion_length": 332.6, "completions/clipped_ratio": 0.0, "completions/max_length": 332.6, "completions/max_terminated_length": 332.6, "completions/mean_length": 90.0265625, "completions/mean_terminated_length": 90.0265625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015819376745516183, "frac_reward_zero_std": 0.94375, "grad_norm": 0.3481996953487396, "kl": 2.671951302862726, "learning_rate": 4.1683333333333333e-07, "loss": 0.0027, "num_tokens": 1171350197.0, "reward": 0.4109375, "reward_std": 0.04660856947302818, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.8696833372116088, "step": 17480 }, { "completion_length": 455.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 455.8, "completions/max_terminated_length": 377.4, "completions/mean_length": 94.98984375, "completions/mean_terminated_length": 94.46562957763672, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015823901738864443, "frac_reward_zero_std": 0.95625, "grad_norm": 14.321617126464844, "kl": 2.9160014741821216, "learning_rate": 4.1679365079365073e-07, "loss": 0.0029, "num_tokens": 1171674032.0, "reward": 0.396875, "reward_std": 0.037981897592544556, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9135552644729614, "step": 17485 }, { "completion_length": 438.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 93.115625, "completions/mean_terminated_length": 93.115625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015828426732212703, "frac_reward_zero_std": 0.9875, "grad_norm": 2.503028392791748, "kl": 6.888356669363565, "learning_rate": 4.1675396825396823e-07, "loss": 0.0069, "num_tokens": 1171992348.0, "reward": 0.36875, "reward_std": 0.010888782143592835, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9217553615570069, "step": 17490 }, { "completion_length": 353.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 88.2625, "completions/mean_terminated_length": 88.2625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015832951725560963, "frac_reward_zero_std": 0.95625, "grad_norm": 4.2706298828125, "kl": 2.4583707529585808, "learning_rate": 4.167142857142857e-07, "loss": 0.0025, "num_tokens": 1172302932.0, "reward": 0.184375, "reward_std": 0.03230288103222847, "rewards/verify_chess_move/mean": 0.184375, "rewards/verify_chess_move/std": 0.978633189201355, "step": 17495 }, { "completion_length": 315.6, "completions/clipped_ratio": 0.0, "completions/max_length": 315.6, "completions/max_terminated_length": 315.6, "completions/mean_length": 85.7015625, "completions/mean_terminated_length": 85.7015625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015837476718909223, "frac_reward_zero_std": 0.95, "grad_norm": 1.2224311828613281, "kl": 0.9715993081219494, "learning_rate": 4.166746031746032e-07, "loss": 0.001, "num_tokens": 1172609406.0, "reward": 0.4796875, "reward_std": 0.03761745244264603, "rewards/verify_chess_move/mean": 0.4796875, "rewards/verify_chess_move/std": 0.8784575223922729, "step": 17500 }, { "completion_length": 316.6, "completions/clipped_ratio": 0.0, "completions/max_length": 316.6, "completions/max_terminated_length": 316.6, "completions/mean_length": 94.590625, "completions/mean_terminated_length": 94.590625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.015842001712257484, "frac_reward_zero_std": 0.975, "grad_norm": 0.0028681624680757523, "kl": 0.724348517647013, "learning_rate": 4.166349206349206e-07, "loss": 0.0007, "num_tokens": 1172932258.0, "reward": 0.359375, "reward_std": 0.019727616384625436, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.92888023853302, "step": 17505 }, { "completion_length": 415.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 415.8, "completions/max_terminated_length": 313.2, "completions/mean_length": 89.93671875, "completions/mean_terminated_length": 89.4023941040039, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.015846526705605744, "frac_reward_zero_std": 0.975, "grad_norm": 0.0022823456674814224, "kl": 0.44734833943657576, "learning_rate": 4.1659523809523805e-07, "loss": 0.0004, "num_tokens": 1173246073.0, "reward": 0.465625, "reward_std": 0.019727616757154464, "rewards/verify_chess_move/mean": 0.465625, "rewards/verify_chess_move/std": 0.8843745470046998, "step": 17510 }, { "completion_length": 339.8, "completions/clipped_ratio": 0.0, "completions/max_length": 339.8, "completions/max_terminated_length": 339.8, "completions/mean_length": 89.51328125, "completions/mean_terminated_length": 89.51328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015851051698954004, "frac_reward_zero_std": 0.9625, "grad_norm": 22.319177627563477, "kl": 1.8301911025773734, "learning_rate": 4.1655555555555556e-07, "loss": 0.0018, "num_tokens": 1173558218.0, "reward": 0.4609375, "reward_std": 0.0347172737121582, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8320951342582703, "step": 17515 }, { "completion_length": 267.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 91.63828125, "completions/mean_terminated_length": 91.63828125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015855576692302264, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0038676115218549967, "kl": 0.8948753731325269, "learning_rate": 4.16515873015873e-07, "loss": 0.0009, "num_tokens": 1173874267.0, "reward": 0.425, "reward_std": 0.02925042025744915, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.9032961249351501, "step": 17520 }, { "completion_length": 395.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 395.2, "completions/max_terminated_length": 386.0, "completions/mean_length": 98.27890625, "completions/mean_terminated_length": 97.76202392578125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01586010168565052, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0019856817089021206, "kl": 0.3853266744874418, "learning_rate": 4.1647619047619047e-07, "loss": 0.0004, "num_tokens": 1174200880.0, "reward": 0.2609375, "reward_std": 0.05213530585169792, "rewards/verify_chess_move/mean": 0.2609375, "rewards/verify_chess_move/std": 0.9485105752944947, "step": 17525 }, { "completion_length": 263.8, "completions/clipped_ratio": 0.0, "completions/max_length": 263.8, "completions/max_terminated_length": 263.8, "completions/mean_length": 90.26328125, "completions/mean_terminated_length": 90.26328125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01586462667899878, "frac_reward_zero_std": 0.98125, "grad_norm": 0.8287296891212463, "kl": 0.9082852242980153, "learning_rate": 4.164365079365079e-07, "loss": 0.0009, "num_tokens": 1174516361.0, "reward": 0.4453125, "reward_std": 0.016887323930859566, "rewards/verify_chess_move/mean": 0.4453125, "rewards/verify_chess_move/std": 0.892969012260437, "step": 17530 }, { "completion_length": 386.8, "completions/clipped_ratio": 0.0, "completions/max_length": 386.8, "completions/max_terminated_length": 386.8, "completions/mean_length": 92.78046875, "completions/mean_terminated_length": 92.78046875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01586915167234704, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0016277922550216317, "kl": 3.1046420291764663, "learning_rate": 4.163968253968254e-07, "loss": 0.0031, "num_tokens": 1174833560.0, "reward": 0.384375, "reward_std": 0.03424546979367733, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9183338761329651, "step": 17535 }, { "completion_length": 307.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 90.04453125, "completions/mean_terminated_length": 90.04453125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.0158736766656953, "frac_reward_zero_std": 0.94375, "grad_norm": 22.279842376708984, "kl": 1.443163010943681, "learning_rate": 4.1635714285714283e-07, "loss": 0.0014, "num_tokens": 1175149297.0, "reward": 0.353125, "reward_std": 0.04955465085804463, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9260960102081299, "step": 17540 }, { "completion_length": 270.4, "completions/clipped_ratio": 0.0, "completions/max_length": 270.4, "completions/max_terminated_length": 270.4, "completions/mean_length": 84.25546875, "completions/mean_terminated_length": 84.25546875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01587820165904356, "frac_reward_zero_std": 0.9875, "grad_norm": 0.20976252853870392, "kl": 0.35095764582511035, "learning_rate": 4.163174603174603e-07, "loss": 0.0004, "num_tokens": 1175453032.0, "reward": 0.440625, "reward_std": 0.010888782143592835, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8719533801078796, "step": 17545 }, { "completion_length": 646.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 646.2, "completions/max_terminated_length": 478.8, "completions/mean_length": 87.85234375, "completions/mean_terminated_length": 86.25646209716797, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01588272665239182, "frac_reward_zero_std": 0.9625, "grad_norm": 26.039939880371094, "kl": 2.4394789153710006, "learning_rate": 4.162777777777778e-07, "loss": 0.0024, "num_tokens": 1175762667.0, "reward": 0.4390625, "reward_std": 0.03719155415892601, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8955420613288879, "step": 17550 }, { "completion_length": 453.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 453.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 93.596875, "completions/mean_terminated_length": 92.04376678466797, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01588725164574008, "frac_reward_zero_std": 0.98125, "grad_norm": 1.1018033027648926, "kl": 2.4946369946934284, "learning_rate": 4.1623809523809524e-07, "loss": 0.0025, "num_tokens": 1176080695.0, "reward": 0.334375, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9419652819633484, "step": 17555 }, { "completion_length": 330.4, "completions/clipped_ratio": 0.0, "completions/max_length": 330.4, "completions/max_terminated_length": 330.4, "completions/mean_length": 90.18515625, "completions/mean_terminated_length": 90.18515625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01589177663908834, "frac_reward_zero_std": 0.9875, "grad_norm": 13.188584327697754, "kl": 0.32378089539706706, "learning_rate": 4.1619841269841264e-07, "loss": 0.0003, "num_tokens": 1176395516.0, "reward": 0.3703125, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9187801003456115, "step": 17560 }, { "completion_length": 340.8, "completions/clipped_ratio": 0.0, "completions/max_length": 340.8, "completions/max_terminated_length": 340.8, "completions/mean_length": 90.496875, "completions/mean_terminated_length": 90.496875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0158963016324366, "frac_reward_zero_std": 0.9625, "grad_norm": 15.09897232055664, "kl": 1.4401027497602628, "learning_rate": 4.1615873015873015e-07, "loss": 0.0014, "num_tokens": 1176709512.0, "reward": 0.3765625, "reward_std": 0.03198335617780686, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9130232691764831, "step": 17565 }, { "completion_length": 304.6, "completions/clipped_ratio": 0.0, "completions/max_length": 304.6, "completions/max_terminated_length": 304.6, "completions/mean_length": 87.58046875, "completions/mean_terminated_length": 87.58046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01590082662578486, "frac_reward_zero_std": 0.95625, "grad_norm": 19.01466941833496, "kl": 0.9443520696368068, "learning_rate": 4.161190476190476e-07, "loss": 0.0009, "num_tokens": 1177019359.0, "reward": 0.41875, "reward_std": 0.0359319519251585, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8971199154853821, "step": 17570 }, { "completion_length": 384.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 384.6, "completions/max_terminated_length": 335.6, "completions/mean_length": 89.46640625, "completions/mean_terminated_length": 88.93712158203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01590535161913312, "frac_reward_zero_std": 0.975, "grad_norm": 2.5367400646209717, "kl": 0.6924343825085089, "learning_rate": 4.160793650793651e-07, "loss": 0.0007, "num_tokens": 1177331940.0, "reward": 0.275, "reward_std": 0.02177756391465664, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9563099265098571, "step": 17575 }, { "completion_length": 403.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 403.2, "completions/max_terminated_length": 316.6, "completions/mean_length": 87.9796875, "completions/mean_terminated_length": 87.43189392089843, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015909876612481378, "frac_reward_zero_std": 0.98125, "grad_norm": 0.05650315433740616, "kl": 0.766923634451814, "learning_rate": 4.160396825396825e-07, "loss": 0.0008, "num_tokens": 1177640730.0, "reward": 0.5125, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.5125, "rewards/verify_chess_move/std": 0.8534539580345154, "step": 17580 }, { "completion_length": 318.8, "completions/clipped_ratio": 0.0, "completions/max_length": 318.8, "completions/max_terminated_length": 318.8, "completions/mean_length": 87.3671875, "completions/mean_terminated_length": 87.3671875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015914401605829638, "frac_reward_zero_std": 0.98125, "grad_norm": 10.789968490600586, "kl": 0.16432193662039934, "learning_rate": 4.1599999999999997e-07, "loss": 0.0002, "num_tokens": 1177951176.0, "reward": 0.428125, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8952305316925049, "step": 17585 }, { "completion_length": 262.2, "completions/clipped_ratio": 0.0, "completions/max_length": 262.2, "completions/max_terminated_length": 262.2, "completions/mean_length": 89.2203125, "completions/mean_terminated_length": 89.2203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.015918926599177898, "frac_reward_zero_std": 0.95, "grad_norm": 0.010991757735610008, "kl": 0.4878112151287496, "learning_rate": 4.1596031746031747e-07, "loss": 0.0005, "num_tokens": 1178264546.0, "reward": 0.3546875, "reward_std": 0.045817240700125696, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.920215106010437, "step": 17590 }, { "completion_length": 440.6, "completions/clipped_ratio": 0.0, "completions/max_length": 440.6, "completions/max_terminated_length": 440.6, "completions/mean_length": 93.09921875, "completions/mean_terminated_length": 93.09921875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01592345159252616, "frac_reward_zero_std": 0.9625, "grad_norm": 1.9162079095840454, "kl": 0.2301769478712231, "learning_rate": 4.159206349206349e-07, "loss": 0.0002, "num_tokens": 1178582401.0, "reward": 0.4859375, "reward_std": 0.0351416066288948, "rewards/verify_chess_move/mean": 0.4859375, "rewards/verify_chess_move/std": 0.871047067642212, "step": 17595 }, { "completion_length": 269.4, "completions/clipped_ratio": 0.0, "completions/max_length": 269.4, "completions/max_terminated_length": 269.4, "completions/mean_length": 83.5421875, "completions/mean_terminated_length": 83.5421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01592797658587442, "frac_reward_zero_std": 0.96875, "grad_norm": 0.30679261684417725, "kl": 0.8526814201846719, "learning_rate": 4.158809523809524e-07, "loss": 0.0009, "num_tokens": 1178886591.0, "reward": 0.340625, "reward_std": 0.025513992831110954, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9375835418701172, "step": 17600 }, { "completion_length": 411.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 411.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 86.66484375, "completions/mean_terminated_length": 85.61131286621094, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01593250157922268, "frac_reward_zero_std": 0.975, "grad_norm": 14.072759628295898, "kl": 0.7179423298686742, "learning_rate": 4.1584126984126984e-07, "loss": 0.0007, "num_tokens": 1179194610.0, "reward": 0.384375, "reward_std": 0.022461533173918725, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9179256796836853, "step": 17605 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 365.0, "completions/max_terminated_length": 270.2, "completions/mean_length": 89.4984375, "completions/mean_terminated_length": 88.97707824707031, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01593702657257094, "frac_reward_zero_std": 0.9625, "grad_norm": 24.088102340698242, "kl": 0.5064174222527071, "learning_rate": 4.158015873015873e-07, "loss": 0.0005, "num_tokens": 1179509088.0, "reward": 0.296875, "reward_std": 0.034929439425468445, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9399136900901794, "step": 17610 }, { "completion_length": 267.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 90.5265625, "completions/mean_terminated_length": 90.5265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0159415515659192, "frac_reward_zero_std": 0.96875, "grad_norm": 0.07551904767751694, "kl": 0.9617207459174096, "learning_rate": 4.1576190476190474e-07, "loss": 0.001, "num_tokens": 1179824394.0, "reward": 0.2515625, "reward_std": 0.028246928378939627, "rewards/verify_chess_move/mean": 0.2515625, "rewards/verify_chess_move/std": 0.9672754168510437, "step": 17615 }, { "completion_length": 349.8, "completions/clipped_ratio": 0.0, "completions/max_length": 349.8, "completions/max_terminated_length": 349.8, "completions/mean_length": 81.7703125, "completions/mean_terminated_length": 81.7703125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01594607655926746, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0038002533838152885, "kl": 0.5743673825636506, "learning_rate": 4.157222222222222e-07, "loss": 0.0006, "num_tokens": 1180125092.0, "reward": 0.4765625, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.4765625, "rewards/verify_chess_move/std": 0.8753651142120361, "step": 17620 }, { "completion_length": 364.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 89.53203125, "completions/mean_terminated_length": 89.53203125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01595060155261572, "frac_reward_zero_std": 0.975, "grad_norm": 22.428916931152344, "kl": 3.690147078386508, "learning_rate": 4.156825396825397e-07, "loss": 0.0037, "num_tokens": 1180437469.0, "reward": 0.3171875, "reward_std": 0.02109457477927208, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.9483794808387757, "step": 17625 }, { "completion_length": 378.2, "completions/clipped_ratio": 0.0, "completions/max_length": 378.2, "completions/max_terminated_length": 378.2, "completions/mean_length": 90.4265625, "completions/mean_terminated_length": 90.4265625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01595512654596398, "frac_reward_zero_std": 0.975, "grad_norm": 6.1056599617004395, "kl": 1.5091384403174744, "learning_rate": 4.156428571428571e-07, "loss": 0.0015, "num_tokens": 1180749959.0, "reward": 0.5328125, "reward_std": 0.02109457477927208, "rewards/verify_chess_move/mean": 0.5328125, "rewards/verify_chess_move/std": 0.8348210453987122, "step": 17630 }, { "completion_length": 327.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 92.809375, "completions/mean_terminated_length": 92.809375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015959651539312236, "frac_reward_zero_std": 0.99375, "grad_norm": 0.41936421394348145, "kl": 0.45705892248079183, "learning_rate": 4.1560317460317456e-07, "loss": 0.0005, "num_tokens": 1181069379.0, "reward": 0.36875, "reward_std": 0.006681530922651291, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9129774332046509, "step": 17635 }, { "completion_length": 464.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 464.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 93.13125, "completions/mean_terminated_length": 92.07364044189453, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015964176532660496, "frac_reward_zero_std": 0.96875, "grad_norm": 10.492576599121094, "kl": 1.4719104710733517, "learning_rate": 4.1556349206349207e-07, "loss": 0.0015, "num_tokens": 1181388187.0, "reward": 0.4265625, "reward_std": 0.026196981593966485, "rewards/verify_chess_move/mean": 0.4265625, "rewards/verify_chess_move/std": 0.8917383313179016, "step": 17640 }, { "completion_length": 283.8, "completions/clipped_ratio": 0.0, "completions/max_length": 283.8, "completions/max_terminated_length": 283.8, "completions/mean_length": 84.97578125, "completions/mean_terminated_length": 84.97578125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015968701526008756, "frac_reward_zero_std": 0.975, "grad_norm": 34.70383071899414, "kl": 0.4192071196855977, "learning_rate": 4.155238095238095e-07, "loss": 0.0004, "num_tokens": 1181692036.0, "reward": 0.3828125, "reward_std": 0.02446401007473469, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.923443603515625, "step": 17645 }, { "completion_length": 347.6, "completions/clipped_ratio": 0.0, "completions/max_length": 347.6, "completions/max_terminated_length": 347.6, "completions/mean_length": 88.22109375, "completions/mean_terminated_length": 88.22109375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015973226519357016, "frac_reward_zero_std": 0.9625, "grad_norm": 23.69658660888672, "kl": 1.0613185643451288, "learning_rate": 4.15484126984127e-07, "loss": 0.0011, "num_tokens": 1182002567.0, "reward": 0.4078125, "reward_std": 0.031983356922864914, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.9012580037117004, "step": 17650 }, { "completion_length": 311.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 87.340625, "completions/mean_terminated_length": 87.340625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.015977751512705276, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017313726712018251, "kl": 0.6718296078033745, "learning_rate": 4.1544444444444443e-07, "loss": 0.0007, "num_tokens": 1182310867.0, "reward": 0.284375, "reward_std": 0.030509041622281075, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9490399241447449, "step": 17655 }, { "completion_length": 327.4, "completions/clipped_ratio": 0.0, "completions/max_length": 327.4, "completions/max_terminated_length": 327.4, "completions/mean_length": 92.08359375, "completions/mean_terminated_length": 92.08359375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.015982276506053536, "frac_reward_zero_std": 0.9625, "grad_norm": 24.13123893737793, "kl": 0.649355377489701, "learning_rate": 4.154047619047619e-07, "loss": 0.0006, "num_tokens": 1182627198.0, "reward": 0.35625, "reward_std": 0.02925042174756527, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9240997672080994, "step": 17660 }, { "completion_length": 278.4, "completions/clipped_ratio": 0.0, "completions/max_length": 278.4, "completions/max_terminated_length": 278.4, "completions/mean_length": 85.61640625, "completions/mean_terminated_length": 85.61640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.015986801499401796, "frac_reward_zero_std": 0.975, "grad_norm": 14.423163414001465, "kl": 0.20111921848729253, "learning_rate": 4.153650793650794e-07, "loss": 0.0002, "num_tokens": 1182934539.0, "reward": 0.265625, "reward_std": 0.019727615639567375, "rewards/verify_chess_move/mean": 0.265625, "rewards/verify_chess_move/std": 0.949441134929657, "step": 17665 }, { "completion_length": 425.2, "completions/clipped_ratio": 0.0, "completions/max_length": 425.2, "completions/max_terminated_length": 425.2, "completions/mean_length": 87.51015625, "completions/mean_terminated_length": 87.51015625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015991326492750056, "frac_reward_zero_std": 0.95625, "grad_norm": 0.003650920232757926, "kl": 1.9555202032090164, "learning_rate": 4.153253968253968e-07, "loss": 0.002, "num_tokens": 1183242864.0, "reward": 0.4171875, "reward_std": 0.03934885822236538, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.900949239730835, "step": 17670 }, { "completion_length": 377.8, "completions/clipped_ratio": 0.0, "completions/max_length": 377.8, "completions/max_terminated_length": 377.8, "completions/mean_length": 94.6265625, "completions/mean_terminated_length": 94.6265625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.015995851486098316, "frac_reward_zero_std": 0.95625, "grad_norm": 1.280983328819275, "kl": 0.681379295210354, "learning_rate": 4.1528571428571424e-07, "loss": 0.0007, "num_tokens": 1183565082.0, "reward": 0.409375, "reward_std": 0.035247981920838355, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.9072713375091552, "step": 17675 }, { "completion_length": 510.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 510.6, "completions/max_terminated_length": 446.0, "completions/mean_length": 96.61015625, "completions/mean_terminated_length": 96.09669494628906, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016000376479446576, "frac_reward_zero_std": 0.98125, "grad_norm": 1.7593895196914673, "kl": 1.4359479924431071, "learning_rate": 4.1524603174603175e-07, "loss": 0.0014, "num_tokens": 1183889335.0, "reward": 0.353125, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9114519715309143, "step": 17680 }, { "completion_length": 280.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 92.83671875, "completions/mean_terminated_length": 92.83671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016004901472794836, "frac_reward_zero_std": 0.9625, "grad_norm": 24.885215759277344, "kl": 0.7890730308834464, "learning_rate": 4.1520634920634915e-07, "loss": 0.0008, "num_tokens": 1184208838.0, "reward": 0.359375, "reward_std": 0.034929439425468445, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.921811318397522, "step": 17685 }, { "completion_length": 297.2, "completions/clipped_ratio": 0.0, "completions/max_length": 297.2, "completions/max_terminated_length": 297.2, "completions/mean_length": 91.08828125, "completions/mean_terminated_length": 91.08828125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016009426466143093, "frac_reward_zero_std": 0.975, "grad_norm": 17.711654663085938, "kl": 0.2542657315498218, "learning_rate": 4.1516666666666666e-07, "loss": 0.0003, "num_tokens": 1184525063.0, "reward": 0.540625, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.540625, "rewards/verify_chess_move/std": 0.8335115313529968, "step": 17690 }, { "completion_length": 274.6, "completions/clipped_ratio": 0.0, "completions/max_length": 274.6, "completions/max_terminated_length": 274.6, "completions/mean_length": 88.51875, "completions/mean_terminated_length": 88.51875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016013951459491353, "frac_reward_zero_std": 0.975, "grad_norm": 0.002004192443564534, "kl": 0.19664363856427372, "learning_rate": 4.151269841269841e-07, "loss": 0.0002, "num_tokens": 1184835487.0, "reward": 0.4203125, "reward_std": 0.023144522309303285, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.9055395603179932, "step": 17695 }, { "completion_length": 285.6, "completions/clipped_ratio": 0.0, "completions/max_length": 285.6, "completions/max_terminated_length": 285.6, "completions/mean_length": 92.0515625, "completions/mean_terminated_length": 92.0515625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016018476452839613, "frac_reward_zero_std": 0.975, "grad_norm": 2.1714675426483154, "kl": 1.1372810020577162, "learning_rate": 4.1508730158730157e-07, "loss": 0.0011, "num_tokens": 1185153409.0, "reward": 0.3296875, "reward_std": 0.025194469094276428, "rewards/verify_chess_move/mean": 0.3296875, "rewards/verify_chess_move/std": 0.9443731427192688, "step": 17700 }, { "completion_length": 535.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 90.7703125, "completions/mean_terminated_length": 90.7703125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016023001446187873, "frac_reward_zero_std": 0.96875, "grad_norm": 3.6344141960144043, "kl": 0.2679188067908399, "learning_rate": 4.15047619047619e-07, "loss": 0.0003, "num_tokens": 1185465643.0, "reward": 0.3921875, "reward_std": 0.028246928378939627, "rewards/verify_chess_move/mean": 0.3921875, "rewards/verify_chess_move/std": 0.9209800004959107, "step": 17705 }, { "completion_length": 419.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 419.8, "completions/max_terminated_length": 388.6, "completions/mean_length": 85.55, "completions/mean_terminated_length": 84.48886566162109, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016027526439536133, "frac_reward_zero_std": 0.975, "grad_norm": 22.247007369995117, "kl": 0.11913538698572665, "learning_rate": 4.150079365079365e-07, "loss": 0.0001, "num_tokens": 1185771619.0, "reward": 0.425, "reward_std": 0.02220189608633518, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.8873160719871521, "step": 17710 }, { "completion_length": 266.4, "completions/clipped_ratio": 0.0, "completions/max_length": 266.4, "completions/max_terminated_length": 266.4, "completions/mean_length": 85.57734375, "completions/mean_terminated_length": 85.57734375, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.016032051432884394, "frac_reward_zero_std": 0.9625, "grad_norm": 33.9835319519043, "kl": 0.23259663484059273, "learning_rate": 4.14968253968254e-07, "loss": 0.0002, "num_tokens": 1186079550.0, "reward": 0.321875, "reward_std": 0.03061639815568924, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9264591813087464, "step": 17715 }, { "completion_length": 452.2, "completions/clipped_ratio": 0.0, "completions/max_length": 452.2, "completions/max_terminated_length": 452.2, "completions/mean_length": 92.6125, "completions/mean_terminated_length": 92.6125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016036576426232654, "frac_reward_zero_std": 0.975, "grad_norm": 4.124180793762207, "kl": 0.325562088820152, "learning_rate": 4.149285714285714e-07, "loss": 0.0003, "num_tokens": 1186397974.0, "reward": 0.259375, "reward_std": 0.019727616384625436, "rewards/verify_chess_move/mean": 0.259375, "rewards/verify_chess_move/std": 0.9611868858337402, "step": 17720 }, { "completion_length": 306.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 89.6796875, "completions/mean_terminated_length": 89.6796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016041101419580914, "frac_reward_zero_std": 0.95, "grad_norm": 3.573777914047241, "kl": 0.6673506863415242, "learning_rate": 4.1488888888888884e-07, "loss": 0.0007, "num_tokens": 1186711556.0, "reward": 0.303125, "reward_std": 0.04308430515229702, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9456048727035522, "step": 17725 }, { "completion_length": 282.4, "completions/clipped_ratio": 0.0, "completions/max_length": 282.4, "completions/max_terminated_length": 282.4, "completions/mean_length": 94.1328125, "completions/mean_terminated_length": 94.1328125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.016045626412929174, "frac_reward_zero_std": 0.94375, "grad_norm": 0.978064239025116, "kl": 0.39622682495974004, "learning_rate": 4.1484920634920634e-07, "loss": 0.0004, "num_tokens": 1187032598.0, "reward": 0.3265625, "reward_std": 0.04455862008035183, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9404894471168518, "step": 17730 }, { "completion_length": 278.4, "completions/clipped_ratio": 0.0, "completions/max_length": 278.4, "completions/max_terminated_length": 278.4, "completions/mean_length": 84.74296875, "completions/mean_terminated_length": 84.74296875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016050151406277434, "frac_reward_zero_std": 0.96875, "grad_norm": 6.291760444641113, "kl": 0.8999800469959155, "learning_rate": 4.148095238095238e-07, "loss": 0.0009, "num_tokens": 1187336421.0, "reward": 0.5171875, "reward_std": 0.03187599927186966, "rewards/verify_chess_move/mean": 0.5171875, "rewards/verify_chess_move/std": 0.8544797539710999, "step": 17735 }, { "completion_length": 386.6, "completions/clipped_ratio": 0.0, "completions/max_length": 386.6, "completions/max_terminated_length": 386.6, "completions/mean_length": 86.88359375, "completions/mean_terminated_length": 86.88359375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016054676399625694, "frac_reward_zero_std": 0.95625, "grad_norm": 19.572555541992188, "kl": 1.084314311738126, "learning_rate": 4.1476984126984125e-07, "loss": 0.0011, "num_tokens": 1187644784.0, "reward": 0.4046875, "reward_std": 0.043448750674724576, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9072871923446655, "step": 17740 }, { "completion_length": 346.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 87.1875, "completions/mean_terminated_length": 87.1875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01605920139297395, "frac_reward_zero_std": 0.98125, "grad_norm": 3.464468479156494, "kl": 0.7997851456049829, "learning_rate": 4.147301587301587e-07, "loss": 0.0008, "num_tokens": 1187952408.0, "reward": 0.43125, "reward_std": 0.01552036553621292, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.8942653894424438, "step": 17745 }, { "completion_length": 490.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 490.6, "completions/max_terminated_length": 479.8, "completions/mean_length": 89.38984375, "completions/mean_terminated_length": 88.34424896240235, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01606372638632221, "frac_reward_zero_std": 0.94375, "grad_norm": 2.3419620990753174, "kl": 1.6978401844622568, "learning_rate": 4.1469047619047616e-07, "loss": 0.0017, "num_tokens": 1188264691.0, "reward": 0.5015625, "reward_std": 0.04818769246339798, "rewards/verify_chess_move/mean": 0.5015625, "rewards/verify_chess_move/std": 0.845726466178894, "step": 17750 }, { "completion_length": 317.6, "completions/clipped_ratio": 0.0, "completions/max_length": 317.6, "completions/max_terminated_length": 317.6, "completions/mean_length": 92.26484375, "completions/mean_terminated_length": 92.26484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01606825137967047, "frac_reward_zero_std": 0.95, "grad_norm": 0.7218247652053833, "kl": 1.082440860872157, "learning_rate": 4.1465079365079367e-07, "loss": 0.0011, "num_tokens": 1188583406.0, "reward": 0.2109375, "reward_std": 0.04240131601691246, "rewards/verify_chess_move/mean": 0.2109375, "rewards/verify_chess_move/std": 0.9524569630622863, "step": 17755 }, { "completion_length": 545.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 545.4, "completions/max_terminated_length": 420.6, "completions/mean_length": 89.34765625, "completions/mean_terminated_length": 88.28299865722656, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.01607277637301873, "frac_reward_zero_std": 0.95625, "grad_norm": 6.9690022468566895, "kl": 1.9642391355708242, "learning_rate": 4.1461111111111107e-07, "loss": 0.002, "num_tokens": 1188896059.0, "reward": 0.421875, "reward_std": 0.03640277422964573, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.897525179386139, "step": 17760 }, { "completion_length": 316.6, "completions/clipped_ratio": 0.0, "completions/max_length": 316.6, "completions/max_terminated_length": 316.6, "completions/mean_length": 90.8765625, "completions/mean_terminated_length": 90.8765625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01607730136636699, "frac_reward_zero_std": 0.9625, "grad_norm": 1.5596128702163696, "kl": 4.347078472189605, "learning_rate": 4.145714285714286e-07, "loss": 0.0043, "num_tokens": 1189211133.0, "reward": 0.3421875, "reward_std": 0.03424645103514194, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9288663625717163, "step": 17765 }, { "completion_length": 317.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 87.815625, "completions/mean_terminated_length": 87.815625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01608182635971525, "frac_reward_zero_std": 0.975, "grad_norm": 0.001662620110437274, "kl": 1.4967176253907382, "learning_rate": 4.1453174603174603e-07, "loss": 0.0015, "num_tokens": 1189523553.0, "reward": 0.346875, "reward_std": 0.023356688022613526, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9340908527374268, "step": 17770 }, { "completion_length": 259.2, "completions/clipped_ratio": 0.0, "completions/max_length": 259.2, "completions/max_terminated_length": 259.2, "completions/mean_length": 92.27890625, "completions/mean_terminated_length": 92.27890625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01608635135306351, "frac_reward_zero_std": 0.9875, "grad_norm": 0.01600453443825245, "kl": 0.4306253204587847, "learning_rate": 4.1449206349206343e-07, "loss": 0.0004, "num_tokens": 1189842046.0, "reward": 0.2828125, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9414979219436646, "step": 17775 }, { "completion_length": 396.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 396.4, "completions/max_terminated_length": 342.4, "completions/mean_length": 89.89140625, "completions/mean_terminated_length": 89.37034454345704, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01609087634641177, "frac_reward_zero_std": 0.93125, "grad_norm": 17.55250358581543, "kl": 0.22888824969995766, "learning_rate": 4.1445238095238094e-07, "loss": 0.0002, "num_tokens": 1190155267.0, "reward": 0.38125, "reward_std": 0.061762919276952745, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9251431345939636, "step": 17780 }, { "completion_length": 444.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.8, "completions/max_terminated_length": 407.6, "completions/mean_length": 88.27578125, "completions/mean_terminated_length": 87.7605224609375, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.01609540133976003, "frac_reward_zero_std": 0.96875, "grad_norm": 0.010490402579307556, "kl": 1.2752892303513363, "learning_rate": 4.144126984126984e-07, "loss": 0.0013, "num_tokens": 1190465716.0, "reward": 0.36875, "reward_std": 0.024359199777245523, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9115031003952027, "step": 17785 }, { "completion_length": 434.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 434.8, "completions/max_terminated_length": 331.8, "completions/mean_length": 91.0390625, "completions/mean_terminated_length": 90.5145050048828, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "epoch": 0.01609992633310829, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0015629420522600412, "kl": 1.840001677768305, "learning_rate": 4.143730158730159e-07, "loss": 0.0018, "num_tokens": 1190780646.0, "reward": 0.565625, "reward_std": 0.017570311948657037, "rewards/verify_chess_move/mean": 0.565625, "rewards/verify_chess_move/std": 0.8140824437141418, "step": 17790 }, { "completion_length": 450.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 87.846875, "completions/mean_terminated_length": 87.846875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01610445132645655, "frac_reward_zero_std": 0.96875, "grad_norm": 13.890937805175781, "kl": 1.5094673355109989, "learning_rate": 4.143333333333333e-07, "loss": 0.0015, "num_tokens": 1191090762.0, "reward": 0.4359375, "reward_std": 0.025726158916950227, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.8784589529037475, "step": 17795 }, { "completion_length": 309.4, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/max_terminated_length": 309.4, "completions/mean_length": 93.21328125, "completions/mean_terminated_length": 93.21328125, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "epoch": 0.016108976319804808, "frac_reward_zero_std": 0.9625, "grad_norm": 0.9646919369697571, "kl": 2.009605328645557, "learning_rate": 4.1429365079365075e-07, "loss": 0.002, "num_tokens": 1191410115.0, "reward": 0.3609375, "reward_std": 0.0335624810308218, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9242535948753356, "step": 17800 }, { "completion_length": 578.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 578.8, "completions/max_terminated_length": 452.4, "completions/mean_length": 88.4234375, "completions/mean_terminated_length": 87.36480865478515, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01611350131315307, "frac_reward_zero_std": 0.975, "grad_norm": 18.26969337463379, "kl": 0.21472909897565842, "learning_rate": 4.1425396825396826e-07, "loss": 0.0002, "num_tokens": 1191720649.0, "reward": 0.2390625, "reward_std": 0.02109457477927208, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9560458540916443, "step": 17805 }, { "completion_length": 282.8, "completions/clipped_ratio": 0.0, "completions/max_length": 282.8, "completions/max_terminated_length": 282.8, "completions/mean_length": 91.03984375, "completions/mean_terminated_length": 91.03984375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01611802630650133, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002693971386179328, "kl": 0.9720562704838812, "learning_rate": 4.1421428571428566e-07, "loss": 0.001, "num_tokens": 1192037636.0, "reward": 0.31875, "reward_std": 0.025513992831110954, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.9441166520118713, "step": 17810 }, { "completion_length": 333.2, "completions/clipped_ratio": 0.0, "completions/max_length": 333.2, "completions/max_terminated_length": 333.2, "completions/mean_length": 85.50234375, "completions/mean_terminated_length": 85.50234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01612255129984959, "frac_reward_zero_std": 0.96875, "grad_norm": 20.864578247070312, "kl": 1.300798740144819, "learning_rate": 4.1417460317460317e-07, "loss": 0.0013, "num_tokens": 1192344383.0, "reward": 0.3984375, "reward_std": 0.026196981221437453, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9170761466026306, "step": 17815 }, { "completion_length": 402.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 402.8, "completions/max_terminated_length": 313.0, "completions/mean_length": 88.9171875, "completions/mean_terminated_length": 87.8660675048828, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01612707629319785, "frac_reward_zero_std": 0.975, "grad_norm": 0.004353631287813187, "kl": 2.2854554941179233, "learning_rate": 4.141349206349206e-07, "loss": 0.0023, "num_tokens": 1192656941.0, "reward": 0.4, "reward_std": 0.02130674086511135, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.9058875560760498, "step": 17820 }, { "completion_length": 329.6, "completions/clipped_ratio": 0.0, "completions/max_length": 329.6, "completions/max_terminated_length": 329.6, "completions/mean_length": 87.1015625, "completions/mean_terminated_length": 87.1015625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01613160128654611, "frac_reward_zero_std": 0.94375, "grad_norm": 0.08121728152036667, "kl": 4.541430485993624, "learning_rate": 4.140952380952381e-07, "loss": 0.0045, "num_tokens": 1192965367.0, "reward": 0.3875, "reward_std": 0.050920627638697626, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.8991042375564575, "step": 17825 }, { "completion_length": 580.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 580.6, "completions/max_terminated_length": 519.2, "completions/mean_length": 92.44375, "completions/mean_terminated_length": 91.91634979248047, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01613612627989437, "frac_reward_zero_std": 0.95625, "grad_norm": 34.86895751953125, "kl": 1.9887003166251815, "learning_rate": 4.1405555555555553e-07, "loss": 0.002, "num_tokens": 1193282439.0, "reward": 0.328125, "reward_std": 0.03708674423396587, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9235845446586609, "step": 17830 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 87.73828125, "completions/mean_terminated_length": 87.73828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01614065127324263, "frac_reward_zero_std": 0.99375, "grad_norm": 0.06483093649148941, "kl": 0.3478174635441974, "learning_rate": 4.14015873015873e-07, "loss": 0.0003, "num_tokens": 1193593640.0, "reward": 0.3109375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9128762245178222, "step": 17835 }, { "completion_length": 501.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 93.50390625, "completions/mean_terminated_length": 93.50390625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01614517626659089, "frac_reward_zero_std": 0.95, "grad_norm": 36.27326583862305, "kl": 2.86527943671681, "learning_rate": 4.139761904761905e-07, "loss": 0.0029, "num_tokens": 1193913373.0, "reward": 0.284375, "reward_std": 0.04308430440723896, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9522780299186706, "step": 17840 }, { "completion_length": 621.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 621.4, "completions/max_terminated_length": 572.4, "completions/mean_length": 88.9640625, "completions/mean_terminated_length": 88.42422332763672, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01614970125993915, "frac_reward_zero_std": 0.95625, "grad_norm": 2.0541234016418457, "kl": 1.0634337895200587, "learning_rate": 4.139365079365079e-07, "loss": 0.0011, "num_tokens": 1194223711.0, "reward": 0.459375, "reward_std": 0.04050266854465008, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8839003086090088, "step": 17845 }, { "completion_length": 532.2, "completions/clipped_ratio": 0.0, "completions/max_length": 532.2, "completions/max_terminated_length": 532.2, "completions/mean_length": 90.5734375, "completions/mean_terminated_length": 90.5734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01615422625328741, "frac_reward_zero_std": 0.975, "grad_norm": 7.999907970428467, "kl": 0.8675930958474055, "learning_rate": 4.1389682539682534e-07, "loss": 0.0009, "num_tokens": 1194537629.0, "reward": 0.5125, "reward_std": 0.023356688767671586, "rewards/verify_chess_move/mean": 0.5125, "rewards/verify_chess_move/std": 0.850926399230957, "step": 17850 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 89.23671875, "completions/mean_terminated_length": 89.23671875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016158751246635666, "frac_reward_zero_std": 0.98125, "grad_norm": 24.061941146850586, "kl": 1.7752402093261481, "learning_rate": 4.1385714285714285e-07, "loss": 0.0018, "num_tokens": 1194851876.0, "reward": 0.36875, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9077076196670533, "step": 17855 }, { "completion_length": 378.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 378.2, "completions/max_terminated_length": 325.8, "completions/mean_length": 90.32734375, "completions/mean_terminated_length": 88.76309051513672, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016163276239983926, "frac_reward_zero_std": 0.95625, "grad_norm": 55.064735412597656, "kl": 1.8824321926105767, "learning_rate": 4.138174603174603e-07, "loss": 0.0019, "num_tokens": 1195166119.0, "reward": 0.4234375, "reward_std": 0.0391357097774744, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8734071254730225, "step": 17860 }, { "completion_length": 406.4, "completions/clipped_ratio": 0.0, "completions/max_length": 406.4, "completions/max_terminated_length": 406.4, "completions/mean_length": 94.503125, "completions/mean_terminated_length": 94.503125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.016167801233332186, "frac_reward_zero_std": 0.95625, "grad_norm": 15.342275619506836, "kl": 2.715329567133449, "learning_rate": 4.1377777777777776e-07, "loss": 0.0027, "num_tokens": 1195485979.0, "reward": 0.415625, "reward_std": 0.04092700034379959, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.9067019104957581, "step": 17865 }, { "completion_length": 463.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 463.2, "completions/max_terminated_length": 418.4, "completions/mean_length": 98.08984375, "completions/mean_terminated_length": 96.51194458007812, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016172326226680446, "frac_reward_zero_std": 0.9625, "grad_norm": 5.678046703338623, "kl": 8.14853254284244, "learning_rate": 4.137380952380952e-07, "loss": 0.0081, "num_tokens": 1195812174.0, "reward": 0.4828125, "reward_std": 0.031983356550335885, "rewards/verify_chess_move/mean": 0.4828125, "rewards/verify_chess_move/std": 0.8467629671096801, "step": 17870 }, { "completion_length": 392.6, "completions/clipped_ratio": 0.0, "completions/max_length": 392.6, "completions/max_terminated_length": 392.6, "completions/mean_length": 88.12578125, "completions/mean_terminated_length": 88.12578125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016176851220028706, "frac_reward_zero_std": 0.98125, "grad_norm": 1.3383644819259644, "kl": 3.5468580155400558, "learning_rate": 4.1369841269841267e-07, "loss": 0.0035, "num_tokens": 1196122671.0, "reward": 0.3671875, "reward_std": 0.017358146235346796, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9089790940284729, "step": 17875 }, { "completion_length": 264.2, "completions/clipped_ratio": 0.0, "completions/max_length": 264.2, "completions/max_terminated_length": 264.2, "completions/mean_length": 86.6421875, "completions/mean_terminated_length": 86.6421875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016181376213376966, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0031241702381521463, "kl": 0.27393652126193047, "learning_rate": 4.136587301587302e-07, "loss": 0.0003, "num_tokens": 1196431181.0, "reward": 0.4546875, "reward_std": 0.017358146607875824, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8761286497116089, "step": 17880 }, { "completion_length": 394.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 394.2, "completions/max_terminated_length": 320.4, "completions/mean_length": 85.06484375, "completions/mean_terminated_length": 84.01438903808594, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016185901206725226, "frac_reward_zero_std": 0.95625, "grad_norm": 1.839676022529602, "kl": 2.494004511879757, "learning_rate": 4.136190476190476e-07, "loss": 0.0025, "num_tokens": 1196736320.0, "reward": 0.2703125, "reward_std": 0.04024401269853115, "rewards/verify_chess_move/mean": 0.2703125, "rewards/verify_chess_move/std": 0.9526196122169495, "step": 17885 }, { "completion_length": 320.4, "completions/clipped_ratio": 0.0, "completions/max_length": 320.4, "completions/max_terminated_length": 320.4, "completions/mean_length": 85.15390625, "completions/mean_terminated_length": 85.15390625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016190426200073486, "frac_reward_zero_std": 0.9875, "grad_norm": 0.01962696760892868, "kl": 6.101979850232601, "learning_rate": 4.135793650793651e-07, "loss": 0.0061, "num_tokens": 1197042565.0, "reward": 0.3109375, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9312273383140564, "step": 17890 }, { "completion_length": 420.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 420.8, "completions/max_terminated_length": 326.8, "completions/mean_length": 93.9265625, "completions/mean_terminated_length": 93.4088363647461, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016194951193421746, "frac_reward_zero_std": 0.98125, "grad_norm": 8.83961296081543, "kl": 0.6035275789909065, "learning_rate": 4.1353968253968254e-07, "loss": 0.0006, "num_tokens": 1197360167.0, "reward": 0.409375, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.901740288734436, "step": 17895 }, { "completion_length": 431.2, "completions/clipped_ratio": 0.0, "completions/max_length": 431.2, "completions/max_terminated_length": 431.2, "completions/mean_length": 88.88125, "completions/mean_terminated_length": 88.88125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016199476186770007, "frac_reward_zero_std": 0.95, "grad_norm": 0.0064521306194365025, "kl": 0.2813346589682624, "learning_rate": 4.1349999999999994e-07, "loss": 0.0003, "num_tokens": 1197670711.0, "reward": 0.3421875, "reward_std": 0.04650121033191681, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9350923895835876, "step": 17900 }, { "completion_length": 360.2, "completions/clipped_ratio": 0.0, "completions/max_length": 360.2, "completions/max_terminated_length": 360.2, "completions/mean_length": 83.240625, "completions/mean_terminated_length": 83.240625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.016204001180118267, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02763286419212818, "kl": 0.5465722534572706, "learning_rate": 4.1346031746031744e-07, "loss": 0.0005, "num_tokens": 1197972523.0, "reward": 0.3671875, "reward_std": 0.028930898010730743, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9241614460945129, "step": 17905 }, { "completion_length": 411.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 411.8, "completions/max_terminated_length": 321.0, "completions/mean_length": 95.97109375, "completions/mean_terminated_length": 95.4488525390625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016208526173466523, "frac_reward_zero_std": 0.95625, "grad_norm": 28.538524627685547, "kl": 1.0486736169550568, "learning_rate": 4.134206349206349e-07, "loss": 0.001, "num_tokens": 1198297670.0, "reward": 0.2171875, "reward_std": 0.038664887100458144, "rewards/verify_chess_move/mean": 0.2171875, "rewards/verify_chess_move/std": 0.9472934603691101, "step": 17910 }, { "completion_length": 414.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 414.8, "completions/max_terminated_length": 400.2, "completions/mean_length": 95.709375, "completions/mean_terminated_length": 94.67076110839844, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016213051166814783, "frac_reward_zero_std": 0.975, "grad_norm": 45.64506912231445, "kl": 0.6936924650915899, "learning_rate": 4.133809523809524e-07, "loss": 0.0007, "num_tokens": 1198620586.0, "reward": 0.2984375, "reward_std": 0.0260896235704422, "rewards/verify_chess_move/mean": 0.2984375, "rewards/verify_chess_move/std": 0.9429117679595947, "step": 17915 }, { "completion_length": 333.2, "completions/clipped_ratio": 0.0, "completions/max_length": 333.2, "completions/max_terminated_length": 333.2, "completions/mean_length": 88.80390625, "completions/mean_terminated_length": 88.80390625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016217576160163043, "frac_reward_zero_std": 0.975, "grad_norm": 0.3448624014854431, "kl": 0.7507717857602983, "learning_rate": 4.133412698412698e-07, "loss": 0.0008, "num_tokens": 1198933719.0, "reward": 0.3125, "reward_std": 0.02041158601641655, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9393702030181885, "step": 17920 }, { "completion_length": 276.6, "completions/clipped_ratio": 0.0, "completions/max_length": 276.6, "completions/max_terminated_length": 276.6, "completions/mean_length": 102.3484375, "completions/mean_terminated_length": 102.3484375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016222101153511304, "frac_reward_zero_std": 0.95, "grad_norm": 3.4665608406066895, "kl": 1.0499932065606117, "learning_rate": 4.1330158730158726e-07, "loss": 0.001, "num_tokens": 1199268477.0, "reward": 0.278125, "reward_std": 0.04650219082832337, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9355814337730408, "step": 17925 }, { "completion_length": 477.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 477.2, "completions/max_terminated_length": 333.6, "completions/mean_length": 83.85546875, "completions/mean_terminated_length": 82.77163696289062, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016226626146859564, "frac_reward_zero_std": 0.98125, "grad_norm": 13.409700393676758, "kl": 1.7702064851415344, "learning_rate": 4.1326190476190477e-07, "loss": 0.0018, "num_tokens": 1199572956.0, "reward": 0.36875, "reward_std": 0.01825428232550621, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9233259081840515, "step": 17930 }, { "completion_length": 445.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 445.6, "completions/max_terminated_length": 346.4, "completions/mean_length": 93.6375, "completions/mean_terminated_length": 93.11024932861328, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016231151140207824, "frac_reward_zero_std": 0.96875, "grad_norm": 12.632257461547852, "kl": 0.5466757182031869, "learning_rate": 4.1322222222222217e-07, "loss": 0.0005, "num_tokens": 1199893244.0, "reward": 0.384375, "reward_std": 0.027563939243555068, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9003610610961914, "step": 17935 }, { "completion_length": 322.8, "completions/clipped_ratio": 0.0, "completions/max_length": 322.8, "completions/max_terminated_length": 322.8, "completions/mean_length": 89.55859375, "completions/mean_terminated_length": 89.55859375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016235676133556084, "frac_reward_zero_std": 0.9875, "grad_norm": 0.20085807144641876, "kl": 0.6462669832864776, "learning_rate": 4.131825396825397e-07, "loss": 0.0006, "num_tokens": 1200206055.0, "reward": 0.353125, "reward_std": 0.012467906624078751, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9312185764312744, "step": 17940 }, { "completion_length": 284.8, "completions/clipped_ratio": 0.0, "completions/max_length": 284.8, "completions/max_terminated_length": 284.8, "completions/mean_length": 84.509375, "completions/mean_terminated_length": 84.509375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016240201126904344, "frac_reward_zero_std": 0.9875, "grad_norm": 1.4410666227340698, "kl": 0.20262080698739737, "learning_rate": 4.1314285714285713e-07, "loss": 0.0002, "num_tokens": 1200508875.0, "reward": 0.41875, "reward_std": 0.011572751402854919, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8988862872123718, "step": 17945 }, { "completion_length": 279.2, "completions/clipped_ratio": 0.0, "completions/max_length": 279.2, "completions/max_terminated_length": 279.2, "completions/mean_length": 90.8703125, "completions/mean_terminated_length": 90.8703125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016244726120252604, "frac_reward_zero_std": 0.98125, "grad_norm": 20.96994972229004, "kl": 0.25709670507349075, "learning_rate": 4.131031746031746e-07, "loss": 0.0003, "num_tokens": 1200823877.0, "reward": 0.3640625, "reward_std": 0.016887323930859566, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9184218525886536, "step": 17950 }, { "completion_length": 300.6, "completions/clipped_ratio": 0.0, "completions/max_length": 300.6, "completions/max_terminated_length": 300.6, "completions/mean_length": 96.31796875, "completions/mean_terminated_length": 96.31796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016249251113600864, "frac_reward_zero_std": 0.95625, "grad_norm": 3.9137725830078125, "kl": 2.007296570390463, "learning_rate": 4.1306349206349204e-07, "loss": 0.002, "num_tokens": 1201148364.0, "reward": 0.3265625, "reward_std": 0.037769732624292375, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9112488627433777, "step": 17955 }, { "completion_length": 254.8, "completions/clipped_ratio": 0.0, "completions/max_length": 254.8, "completions/max_terminated_length": 254.8, "completions/mean_length": 87.66640625, "completions/mean_terminated_length": 87.66640625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.016253776106949124, "frac_reward_zero_std": 0.96875, "grad_norm": 3.9272098541259766, "kl": 3.401623951946385, "learning_rate": 4.130238095238095e-07, "loss": 0.0034, "num_tokens": 1201457585.0, "reward": 0.4515625, "reward_std": 0.026196981221437453, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8764912009239196, "step": 17960 }, { "completion_length": 501.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 501.8, "completions/max_terminated_length": 471.6, "completions/mean_length": 93.0640625, "completions/mean_terminated_length": 92.5409423828125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016258301100297384, "frac_reward_zero_std": 0.96875, "grad_norm": 6.669892311096191, "kl": 1.0502573365811259, "learning_rate": 4.12984126984127e-07, "loss": 0.0011, "num_tokens": 1201775515.0, "reward": 0.3109375, "reward_std": 0.024831003323197366, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9487349390983582, "step": 17965 }, { "completion_length": 568.2, "completions/clipped_ratio": 0.003125, "completions/max_length": 568.2, "completions/max_terminated_length": 452.8, "completions/mean_length": 95.96171875, "completions/mean_terminated_length": 93.86047058105468, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01626282609364564, "frac_reward_zero_std": 0.96875, "grad_norm": 19.93068504333496, "kl": 2.217761848005466, "learning_rate": 4.1294444444444445e-07, "loss": 0.0022, "num_tokens": 1202098010.0, "reward": 0.3109375, "reward_std": 0.02824692875146866, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9412540912628173, "step": 17970 }, { "completion_length": 373.4, "completions/clipped_ratio": 0.00390625, "completions/max_length": 373.4, "completions/max_terminated_length": 362.6, "completions/mean_length": 95.17265625, "completions/mean_terminated_length": 92.59471893310547, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0162673510869939, "frac_reward_zero_std": 0.93125, "grad_norm": 12.080682754516602, "kl": 1.1043313637841492, "learning_rate": 4.1290476190476185e-07, "loss": 0.0011, "num_tokens": 1202418535.0, "reward": 0.3265625, "reward_std": 0.06202157735824585, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.9430597305297852, "step": 17975 }, { "completion_length": 439.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 439.2, "completions/max_terminated_length": 419.8, "completions/mean_length": 88.596875, "completions/mean_terminated_length": 87.56559448242187, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01627187608034216, "frac_reward_zero_std": 0.9375, "grad_norm": 15.882723808288574, "kl": 0.8859536746283994, "learning_rate": 4.1286507936507936e-07, "loss": 0.0009, "num_tokens": 1202728099.0, "reward": 0.3515625, "reward_std": 0.0485062338411808, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9291828155517579, "step": 17980 }, { "completion_length": 480.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 480.2, "completions/max_terminated_length": 393.4, "completions/mean_length": 92.0296875, "completions/mean_terminated_length": 91.50162811279297, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01627640107369042, "frac_reward_zero_std": 0.9625, "grad_norm": 11.694845199584961, "kl": 2.0386887237313203, "learning_rate": 4.128253968253968e-07, "loss": 0.002, "num_tokens": 1203046497.0, "reward": 0.2234375, "reward_std": 0.03445763699710369, "rewards/verify_chess_move/mean": 0.2234375, "rewards/verify_chess_move/std": 0.9693378686904908, "step": 17985 }, { "completion_length": 281.6, "completions/clipped_ratio": 0.0, "completions/max_length": 281.6, "completions/max_terminated_length": 281.6, "completions/mean_length": 88.53046875, "completions/mean_terminated_length": 88.53046875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01628092606703868, "frac_reward_zero_std": 0.94375, "grad_norm": 25.01140785217285, "kl": 1.9164604475721716, "learning_rate": 4.1278571428571427e-07, "loss": 0.0019, "num_tokens": 1203357568.0, "reward": 0.4328125, "reward_std": 0.05817779004573822, "rewards/verify_chess_move/mean": 0.4328125, "rewards/verify_chess_move/std": 0.8945096492767334, "step": 17990 }, { "completion_length": 358.6, "completions/clipped_ratio": 0.0, "completions/max_length": 358.6, "completions/max_terminated_length": 358.6, "completions/mean_length": 86.1609375, "completions/mean_terminated_length": 86.1609375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01628545106038694, "frac_reward_zero_std": 0.975, "grad_norm": 0.0009715622290968895, "kl": 0.891368435882032, "learning_rate": 4.127460317460317e-07, "loss": 0.0009, "num_tokens": 1203664974.0, "reward": 0.4515625, "reward_std": 0.023144522309303285, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8867542982101441, "step": 17995 }, { "completion_length": 389.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 389.6, "completions/max_terminated_length": 348.6, "completions/mean_length": 89.4484375, "completions/mean_terminated_length": 88.40255889892578, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0162899760537352, "frac_reward_zero_std": 0.96875, "grad_norm": 25.3475399017334, "kl": 0.23254071690607817, "learning_rate": 4.127063492063492e-07, "loss": 0.0002, "num_tokens": 1203977476.0, "reward": 0.365625, "reward_std": 0.026409146934747697, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9295343160629272, "step": 18000 }, { "completion_length": 285.2, "completions/clipped_ratio": 0.0, "completions/max_length": 285.2, "completions/max_terminated_length": 285.2, "completions/mean_length": 86.809375, "completions/mean_terminated_length": 86.809375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01629450104708346, "frac_reward_zero_std": 0.99375, "grad_norm": 0.0026070557069033384, "kl": 0.9519420058932155, "learning_rate": 4.126666666666667e-07, "loss": 0.001, "num_tokens": 1204285656.0, "reward": 0.4234375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8874192953109741, "step": 18005 }, { "completion_length": 480.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 480.2, "completions/max_terminated_length": 376.8, "completions/mean_length": 94.18359375, "completions/mean_terminated_length": 93.64991149902343, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.01629902604043172, "frac_reward_zero_std": 0.95, "grad_norm": 0.33999380469322205, "kl": 0.2513264827197418, "learning_rate": 4.126269841269841e-07, "loss": 0.0003, "num_tokens": 1204605347.0, "reward": 0.3390625, "reward_std": 0.04739636480808258, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9219579935073853, "step": 18010 }, { "completion_length": 366.2, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/max_terminated_length": 366.2, "completions/mean_length": 90.884375, "completions/mean_terminated_length": 90.884375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016303551033779982, "frac_reward_zero_std": 0.975, "grad_norm": 1.7851285934448242, "kl": 0.4146396612282842, "learning_rate": 4.125873015873016e-07, "loss": 0.0004, "num_tokens": 1204920247.0, "reward": 0.3703125, "reward_std": 0.023144522309303285, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9163749694824219, "step": 18015 }, { "completion_length": 463.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 463.6, "completions/max_terminated_length": 383.8, "completions/mean_length": 91.6796875, "completions/mean_terminated_length": 91.15672302246094, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016308076027128242, "frac_reward_zero_std": 0.96875, "grad_norm": 23.782867431640625, "kl": 0.22285422995919363, "learning_rate": 4.1254761904761904e-07, "loss": 0.0002, "num_tokens": 1205238101.0, "reward": 0.4875, "reward_std": 0.025513992458581925, "rewards/verify_chess_move/mean": 0.4875, "rewards/verify_chess_move/std": 0.8681458234786987, "step": 18020 }, { "completion_length": 446.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 446.8, "completions/max_terminated_length": 438.0, "completions/mean_length": 95.6046875, "completions/mean_terminated_length": 95.08034057617188, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0163126010204765, "frac_reward_zero_std": 0.975, "grad_norm": 6.633412837982178, "kl": 0.4683644344448112, "learning_rate": 4.1250793650793645e-07, "loss": 0.0005, "num_tokens": 1205561995.0, "reward": 0.3796875, "reward_std": 0.019939782470464705, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.9238223671913147, "step": 18025 }, { "completion_length": 457.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 457.4, "completions/max_terminated_length": 457.0, "completions/mean_length": 99.084375, "completions/mean_terminated_length": 98.06301727294922, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01631712601382476, "frac_reward_zero_std": 0.925, "grad_norm": 0.24542495608329773, "kl": 0.6010857121669687, "learning_rate": 4.1246825396825395e-07, "loss": 0.0006, "num_tokens": 1205888231.0, "reward": 0.38125, "reward_std": 0.06759578585624695, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9044639587402343, "step": 18030 }, { "completion_length": 254.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 79.96796875, "completions/mean_terminated_length": 79.96796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01632165100717302, "frac_reward_zero_std": 0.95625, "grad_norm": 0.5960729718208313, "kl": 0.3711176009615883, "learning_rate": 4.124285714285714e-07, "loss": 0.0004, "num_tokens": 1206184070.0, "reward": 0.528125, "reward_std": 0.04092700071632862, "rewards/verify_chess_move/mean": 0.528125, "rewards/verify_chess_move/std": 0.8305941939353942, "step": 18035 }, { "completion_length": 574.8, "completions/clipped_ratio": 0.00234375, "completions/max_length": 574.8, "completions/max_terminated_length": 419.2, "completions/mean_length": 96.2546875, "completions/mean_terminated_length": 94.65304718017578, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01632617600052128, "frac_reward_zero_std": 0.94375, "grad_norm": 17.211885452270508, "kl": 0.2982483863015659, "learning_rate": 4.123888888888889e-07, "loss": 0.0003, "num_tokens": 1206507524.0, "reward": 0.35625, "reward_std": 0.04887068048119545, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9282206654548645, "step": 18040 }, { "completion_length": 279.2, "completions/clipped_ratio": 0.0, "completions/max_length": 279.2, "completions/max_terminated_length": 279.2, "completions/mean_length": 88.35078125, "completions/mean_terminated_length": 88.35078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01633070099386954, "frac_reward_zero_std": 0.95, "grad_norm": 0.9449410438537598, "kl": 0.3019571125973016, "learning_rate": 4.123492063492063e-07, "loss": 0.0003, "num_tokens": 1206818613.0, "reward": 0.4421875, "reward_std": 0.04240131638944149, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8892425179481507, "step": 18045 }, { "completion_length": 498.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 498.0, "completions/max_terminated_length": 486.4, "completions/mean_length": 95.60703125, "completions/mean_terminated_length": 95.08299407958984, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.0163352259872178, "frac_reward_zero_std": 0.96875, "grad_norm": 0.005243966821581125, "kl": 0.183101289649494, "learning_rate": 4.1230952380952377e-07, "loss": 0.0002, "num_tokens": 1207141718.0, "reward": 0.3546875, "reward_std": 0.03140517696738243, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9190487861633301, "step": 18050 }, { "completion_length": 335.4, "completions/clipped_ratio": 0.0, "completions/max_length": 335.4, "completions/max_terminated_length": 335.4, "completions/mean_length": 93.06328125, "completions/mean_terminated_length": 93.06328125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01633975098056606, "frac_reward_zero_std": 0.96875, "grad_norm": 1.418154239654541, "kl": 0.5606194077059626, "learning_rate": 4.122698412698413e-07, "loss": 0.0006, "num_tokens": 1207460079.0, "reward": 0.3890625, "reward_std": 0.0286712609231472, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9151763558387757, "step": 18055 }, { "completion_length": 328.6, "completions/clipped_ratio": 0.0, "completions/max_length": 328.6, "completions/max_terminated_length": 328.6, "completions/mean_length": 90.63671875, "completions/mean_terminated_length": 90.63671875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01634427597391432, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0022978903725743294, "kl": 0.7181788872461766, "learning_rate": 4.1223015873015873e-07, "loss": 0.0007, "num_tokens": 1207773814.0, "reward": 0.309375, "reward_std": 0.037981899455189705, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9407610058784485, "step": 18060 }, { "completion_length": 485.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 485.4, "completions/max_terminated_length": 426.4, "completions/mean_length": 92.79296875, "completions/mean_terminated_length": 92.26002502441406, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.01634880096726258, "frac_reward_zero_std": 0.95, "grad_norm": 16.964019775390625, "kl": 0.2642094312934205, "learning_rate": 4.121904761904762e-07, "loss": 0.0003, "num_tokens": 1208091909.0, "reward": 0.4203125, "reward_std": 0.0421901311725378, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8911486148834229, "step": 18065 }, { "completion_length": 563.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 563.2, "completions/max_terminated_length": 557.6, "completions/mean_length": 90.64609375, "completions/mean_terminated_length": 90.11920166015625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01635332596061084, "frac_reward_zero_std": 0.94375, "grad_norm": 4.5234174728393555, "kl": 0.08271489669568836, "learning_rate": 4.1215079365079364e-07, "loss": 0.0001, "num_tokens": 1208403568.0, "reward": 0.396875, "reward_std": 0.04613676369190216, "rewards/verify_chess_move/mean": 0.396875, "rewards/verify_chess_move/std": 0.9171158909797669, "step": 18070 }, { "completion_length": 351.4, "completions/clipped_ratio": 0.0, "completions/max_length": 351.4, "completions/max_terminated_length": 351.4, "completions/mean_length": 90.096875, "completions/mean_terminated_length": 90.096875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0163578509539591, "frac_reward_zero_std": 0.96875, "grad_norm": 18.893125534057617, "kl": 0.13698543368373067, "learning_rate": 4.121111111111111e-07, "loss": 0.0001, "num_tokens": 1208717636.0, "reward": 0.3125, "reward_std": 0.030509040877223014, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9297500967979431, "step": 18075 }, { "completion_length": 398.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 398.8, "completions/max_terminated_length": 299.8, "completions/mean_length": 93.0734375, "completions/mean_terminated_length": 92.54768829345703, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016362375947307356, "frac_reward_zero_std": 0.95, "grad_norm": 24.029518127441406, "kl": 1.0919408239424229, "learning_rate": 4.1207142857142855e-07, "loss": 0.0011, "num_tokens": 1209036250.0, "reward": 0.2796875, "reward_std": 0.036722297221422194, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9374522686004638, "step": 18080 }, { "completion_length": 413.2, "completions/clipped_ratio": 0.0, "completions/max_length": 413.2, "completions/max_terminated_length": 413.2, "completions/mean_length": 89.5359375, "completions/mean_terminated_length": 89.5359375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016366900940655616, "frac_reward_zero_std": 0.95625, "grad_norm": 0.012430369853973389, "kl": 0.5857205295702442, "learning_rate": 4.12031746031746e-07, "loss": 0.0006, "num_tokens": 1209349056.0, "reward": 0.3453125, "reward_std": 0.03981968015432358, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9334509372711182, "step": 18085 }, { "completion_length": 338.4, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 86.49609375, "completions/mean_terminated_length": 86.49609375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016371425934003876, "frac_reward_zero_std": 0.9875, "grad_norm": 0.025437848642468452, "kl": 0.4321988191222772, "learning_rate": 4.1199206349206345e-07, "loss": 0.0004, "num_tokens": 1209656243.0, "reward": 0.45, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.45, "rewards/verify_chess_move/std": 0.8702051520347596, "step": 18090 }, { "completion_length": 342.2, "completions/clipped_ratio": 0.0, "completions/max_length": 342.2, "completions/max_terminated_length": 342.2, "completions/mean_length": 88.50078125, "completions/mean_terminated_length": 88.50078125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016375950927352136, "frac_reward_zero_std": 0.9625, "grad_norm": 23.47358512878418, "kl": 0.18695259105879813, "learning_rate": 4.1195238095238096e-07, "loss": 0.0002, "num_tokens": 1209968524.0, "reward": 0.353125, "reward_std": 0.032195523753762244, "rewards/verify_chess_move/mean": 0.353125, "rewards/verify_chess_move/std": 0.9270328521728516, "step": 18095 }, { "completion_length": 366.4, "completions/clipped_ratio": 0.0, "completions/max_length": 366.4, "completions/max_terminated_length": 366.4, "completions/mean_length": 94.159375, "completions/mean_terminated_length": 94.159375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016380475920700396, "frac_reward_zero_std": 0.98125, "grad_norm": 2.544832229614258, "kl": 0.2563507670070976, "learning_rate": 4.1191269841269836e-07, "loss": 0.0003, "num_tokens": 1210290872.0, "reward": 0.48125, "reward_std": 0.01552036553621292, "rewards/verify_chess_move/mean": 0.48125, "rewards/verify_chess_move/std": 0.8648823976516724, "step": 18100 }, { "completion_length": 550.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 550.6, "completions/max_terminated_length": 539.4, "completions/mean_length": 94.71953125, "completions/mean_terminated_length": 93.13648223876953, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016385000914048656, "frac_reward_zero_std": 0.95625, "grad_norm": 26.825843811035156, "kl": 0.5920687067438848, "learning_rate": 4.1187301587301587e-07, "loss": 0.0006, "num_tokens": 1210613305.0, "reward": 0.3734375, "reward_std": 0.03661494068801403, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9189263224601746, "step": 18105 }, { "completion_length": 406.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 406.4, "completions/max_terminated_length": 328.2, "completions/mean_length": 87.0125, "completions/mean_terminated_length": 86.48847198486328, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016389525907396917, "frac_reward_zero_std": 0.95625, "grad_norm": 0.05665210634469986, "kl": 0.5775977433891967, "learning_rate": 4.118333333333333e-07, "loss": 0.0006, "num_tokens": 1210922017.0, "reward": 0.3109375, "reward_std": 0.04003282710909843, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.948052453994751, "step": 18110 }, { "completion_length": 271.8, "completions/clipped_ratio": 0.0, "completions/max_length": 271.8, "completions/max_terminated_length": 271.8, "completions/mean_length": 82.85546875, "completions/mean_terminated_length": 82.85546875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016394050900745177, "frac_reward_zero_std": 0.975, "grad_norm": 0.09054084867238998, "kl": 1.0833496378269047, "learning_rate": 4.117936507936507e-07, "loss": 0.0011, "num_tokens": 1211224248.0, "reward": 0.4765625, "reward_std": 0.02382849156856537, "rewards/verify_chess_move/mean": 0.4765625, "rewards/verify_chess_move/std": 0.8493565082550049, "step": 18115 }, { "completion_length": 533.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 533.2, "completions/max_terminated_length": 429.6, "completions/mean_length": 83.89921875, "completions/mean_terminated_length": 82.8363037109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016398575894093437, "frac_reward_zero_std": 0.98125, "grad_norm": 5.628748893737793, "kl": 0.1816788950469345, "learning_rate": 4.1175396825396823e-07, "loss": 0.0002, "num_tokens": 1211526143.0, "reward": 0.503125, "reward_std": 0.016675157845020293, "rewards/verify_chess_move/mean": 0.503125, "rewards/verify_chess_move/std": 0.8565815091133118, "step": 18120 }, { "completion_length": 285.8, "completions/clipped_ratio": 0.0, "completions/max_length": 285.8, "completions/max_terminated_length": 285.8, "completions/mean_length": 85.90390625, "completions/mean_terminated_length": 85.90390625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016403100887441697, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0017352929571643472, "kl": 0.21042723185382783, "learning_rate": 4.117142857142857e-07, "loss": 0.0002, "num_tokens": 1211832060.0, "reward": 0.4828125, "reward_std": 0.028930897638201714, "rewards/verify_chess_move/mean": 0.4828125, "rewards/verify_chess_move/std": 0.8634475350379944, "step": 18125 }, { "completion_length": 401.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 401.8, "completions/max_terminated_length": 372.6, "completions/mean_length": 88.16796875, "completions/mean_terminated_length": 87.63282928466796, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.016407625880789957, "frac_reward_zero_std": 0.95625, "grad_norm": 28.1292724609375, "kl": 1.06842206325382, "learning_rate": 4.116746031746032e-07, "loss": 0.0011, "num_tokens": 1212141243.0, "reward": 0.4484375, "reward_std": 0.03866488784551621, "rewards/verify_chess_move/mean": 0.4484375, "rewards/verify_chess_move/std": 0.8896542668342591, "step": 18130 }, { "completion_length": 480.2, "completions/clipped_ratio": 0.0046875, "completions/max_length": 480.2, "completions/max_terminated_length": 388.2, "completions/mean_length": 97.23671875, "completions/mean_terminated_length": 94.0938720703125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016412150874138214, "frac_reward_zero_std": 0.9625, "grad_norm": 3.8969597816467285, "kl": 0.37073933247011154, "learning_rate": 4.116349206349206e-07, "loss": 0.0004, "num_tokens": 1212465578.0, "reward": 0.378125, "reward_std": 0.028566451370716096, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9066098213195801, "step": 18135 }, { "completion_length": 519.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 519.8, "completions/max_terminated_length": 482.6, "completions/mean_length": 89.09609375, "completions/mean_terminated_length": 88.55651397705078, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.016416675867486474, "frac_reward_zero_std": 0.98125, "grad_norm": 0.07069935649633408, "kl": 0.6215480866376311, "learning_rate": 4.1159523809523805e-07, "loss": 0.0006, "num_tokens": 1212776421.0, "reward": 0.409375, "reward_std": 0.016675157845020293, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.9043204545974731, "step": 18140 }, { "completion_length": 425.6, "completions/clipped_ratio": 0.0, "completions/max_length": 425.6, "completions/max_terminated_length": 425.6, "completions/mean_length": 93.44375, "completions/mean_terminated_length": 93.44375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016421200860834734, "frac_reward_zero_std": 0.96875, "grad_norm": 0.8820291757583618, "kl": 0.6102033066330478, "learning_rate": 4.1155555555555555e-07, "loss": 0.0006, "num_tokens": 1213095821.0, "reward": 0.4296875, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.895055890083313, "step": 18145 }, { "completion_length": 552.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 552.8, "completions/max_terminated_length": 524.4, "completions/mean_length": 89.19375, "completions/mean_terminated_length": 88.66563873291015, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016425725854182994, "frac_reward_zero_std": 0.975, "grad_norm": 0.11921710520982742, "kl": 0.17100105073768646, "learning_rate": 4.11515873015873e-07, "loss": 0.0002, "num_tokens": 1213408117.0, "reward": 0.3734375, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9287868022918702, "step": 18150 }, { "completion_length": 343.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 91.296875, "completions/mean_terminated_length": 91.296875, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.016430250847531254, "frac_reward_zero_std": 0.96875, "grad_norm": 0.34564873576164246, "kl": 0.1908505380852148, "learning_rate": 4.1147619047619046e-07, "loss": 0.0002, "num_tokens": 1213723841.0, "reward": 0.2421875, "reward_std": 0.030296875163912773, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9444739580154419, "step": 18155 }, { "completion_length": 480.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 480.0, "completions/max_terminated_length": 394.8, "completions/mean_length": 87.62109375, "completions/mean_terminated_length": 87.10966796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016434775840879514, "frac_reward_zero_std": 0.95625, "grad_norm": 0.09054158627986908, "kl": 0.5800339878769591, "learning_rate": 4.114365079365079e-07, "loss": 0.0006, "num_tokens": 1214032868.0, "reward": 0.3359375, "reward_std": 0.03503581620752812, "rewards/verify_chess_move/mean": 0.3359375, "rewards/verify_chess_move/std": 0.9276216745376586, "step": 18160 }, { "completion_length": 479.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 479.6, "completions/max_terminated_length": 421.4, "completions/mean_length": 91.440625, "completions/mean_terminated_length": 90.90597229003906, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016439300834227774, "frac_reward_zero_std": 0.98125, "grad_norm": 0.027424555271863937, "kl": 0.09999359630746767, "learning_rate": 4.1139682539682537e-07, "loss": 0.0001, "num_tokens": 1214349824.0, "reward": 0.26875, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.26875, "rewards/verify_chess_move/std": 0.9479615330696106, "step": 18165 }, { "completion_length": 383.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 383.0, "completions/max_terminated_length": 339.8, "completions/mean_length": 96.40703125, "completions/mean_terminated_length": 94.825927734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016443825827576034, "frac_reward_zero_std": 0.95625, "grad_norm": 23.29130744934082, "kl": 0.23070599290076643, "learning_rate": 4.113571428571428e-07, "loss": 0.0002, "num_tokens": 1214675673.0, "reward": 0.3140625, "reward_std": 0.03866488784551621, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9446581363677978, "step": 18170 }, { "completion_length": 493.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 493.0, "completions/max_terminated_length": 408.2, "completions/mean_length": 89.6546875, "completions/mean_terminated_length": 89.11591033935547, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016448350820924294, "frac_reward_zero_std": 0.9625, "grad_norm": 10.11093807220459, "kl": 0.340870795736555, "learning_rate": 4.113174603174603e-07, "loss": 0.0003, "num_tokens": 1214987607.0, "reward": 0.3984375, "reward_std": 0.03198335729539394, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.9146909952163697, "step": 18175 }, { "completion_length": 283.6, "completions/clipped_ratio": 0.0, "completions/max_length": 283.6, "completions/max_terminated_length": 283.6, "completions/mean_length": 91.3640625, "completions/mean_terminated_length": 91.3640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016452875814272554, "frac_reward_zero_std": 0.975, "grad_norm": 2.3012208938598633, "kl": 0.16580850402824582, "learning_rate": 4.112777777777778e-07, "loss": 0.0002, "num_tokens": 1215303041.0, "reward": 0.465625, "reward_std": 0.0245114803314209, "rewards/verify_chess_move/mean": 0.465625, "rewards/verify_chess_move/std": 0.881714940071106, "step": 18180 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/max_terminated_length": 358.4, "completions/mean_length": 87.19921875, "completions/mean_terminated_length": 87.19921875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016457400807620814, "frac_reward_zero_std": 0.975, "grad_norm": 0.00237980205565691, "kl": 0.17027291404083372, "learning_rate": 4.1123809523809524e-07, "loss": 0.0002, "num_tokens": 1215611568.0, "reward": 0.340625, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.340625, "rewards/verify_chess_move/std": 0.9368404030799866, "step": 18185 }, { "completion_length": 418.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 418.2, "completions/max_terminated_length": 367.2, "completions/mean_length": 87.078125, "completions/mean_terminated_length": 86.54219512939453, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01646192580096907, "frac_reward_zero_std": 0.98125, "grad_norm": 15.295534133911133, "kl": 0.3445881623076275, "learning_rate": 4.1119841269841264e-07, "loss": 0.0003, "num_tokens": 1215919916.0, "reward": 0.403125, "reward_std": 0.01735912710428238, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.8913679599761963, "step": 18190 }, { "completion_length": 329.4, "completions/clipped_ratio": 0.0, "completions/max_length": 329.4, "completions/max_terminated_length": 329.4, "completions/mean_length": 87.33125, "completions/mean_terminated_length": 87.33125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01646645079431733, "frac_reward_zero_std": 0.95, "grad_norm": 0.0019359205616638064, "kl": 1.2063380872132257, "learning_rate": 4.1115873015873015e-07, "loss": 0.0012, "num_tokens": 1216229700.0, "reward": 0.3859375, "reward_std": 0.044451264292001726, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9013116359710693, "step": 18195 }, { "completion_length": 411.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 411.6, "completions/max_terminated_length": 309.0, "completions/mean_length": 88.55859375, "completions/mean_terminated_length": 88.02733306884765, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.01647097578766559, "frac_reward_zero_std": 0.9875, "grad_norm": 0.002432822482660413, "kl": 0.12106607425957919, "learning_rate": 4.111190476190476e-07, "loss": 0.0001, "num_tokens": 1216542607.0, "reward": 0.2953125, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9453303456306458, "step": 18200 }, { "completion_length": 402.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 402.8, "completions/max_terminated_length": 356.8, "completions/mean_length": 89.1625, "completions/mean_terminated_length": 88.63647766113282, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01647550078101385, "frac_reward_zero_std": 0.94375, "grad_norm": 31.06525421142578, "kl": 0.5085732494713738, "learning_rate": 4.110793650793651e-07, "loss": 0.0005, "num_tokens": 1216855959.0, "reward": 0.35, "reward_std": 0.05070944242179394, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9295671582221985, "step": 18205 }, { "completion_length": 374.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 374.0, "completions/max_terminated_length": 282.8, "completions/mean_length": 87.196875, "completions/mean_terminated_length": 86.67158508300781, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01648002577436211, "frac_reward_zero_std": 0.96875, "grad_norm": 0.028790319338440895, "kl": 0.23372237933799625, "learning_rate": 4.110396825396825e-07, "loss": 0.0002, "num_tokens": 1217164331.0, "reward": 0.359375, "reward_std": 0.026409147679805754, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9300041198730469, "step": 18210 }, { "completion_length": 458.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 458.6, "completions/max_terminated_length": 362.2, "completions/mean_length": 89.734375, "completions/mean_terminated_length": 88.67576293945312, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01648455076771037, "frac_reward_zero_std": 0.94375, "grad_norm": 16.64847755432129, "kl": 0.9728936762199737, "learning_rate": 4.1099999999999996e-07, "loss": 0.001, "num_tokens": 1217479319.0, "reward": 0.334375, "reward_std": 0.051815781742334366, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9392278671264649, "step": 18215 }, { "completion_length": 384.4, "completions/clipped_ratio": 0.0, "completions/max_length": 384.4, "completions/max_terminated_length": 384.4, "completions/mean_length": 83.67734375, "completions/mean_terminated_length": 83.67734375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01648907576105863, "frac_reward_zero_std": 0.96875, "grad_norm": 25.846385955810547, "kl": 0.26898652473464607, "learning_rate": 4.1096031746031747e-07, "loss": 0.0003, "num_tokens": 1217782722.0, "reward": 0.471875, "reward_std": 0.028247909247875215, "rewards/verify_chess_move/mean": 0.471875, "rewards/verify_chess_move/std": 0.8780145406723022, "step": 18220 }, { "completion_length": 404.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 88.9859375, "completions/mean_terminated_length": 88.9859375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016493600754406892, "frac_reward_zero_std": 0.975, "grad_norm": 15.992337226867676, "kl": 0.47645836023148147, "learning_rate": 4.1092063492063487e-07, "loss": 0.0005, "num_tokens": 1218097136.0, "reward": 0.271875, "reward_std": 0.019727616757154464, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9596749186515808, "step": 18225 }, { "completion_length": 271.6, "completions/clipped_ratio": 0.0, "completions/max_length": 271.6, "completions/max_terminated_length": 271.6, "completions/mean_length": 85.80078125, "completions/mean_terminated_length": 85.80078125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.016498125747755152, "frac_reward_zero_std": 0.975, "grad_norm": 0.0018503431929275393, "kl": 0.2915938647929579, "learning_rate": 4.108809523809524e-07, "loss": 0.0003, "num_tokens": 1218405201.0, "reward": 0.4375, "reward_std": 0.022461533546447754, "rewards/verify_chess_move/mean": 0.4375, "rewards/verify_chess_move/std": 0.8736087441444397, "step": 18230 }, { "completion_length": 388.8, "completions/clipped_ratio": 0.0, "completions/max_length": 388.8, "completions/max_terminated_length": 388.8, "completions/mean_length": 95.37421875, "completions/mean_terminated_length": 95.37421875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016502650741103412, "frac_reward_zero_std": 0.98125, "grad_norm": 0.01017728541046381, "kl": 0.18518382038455455, "learning_rate": 4.1084126984126983e-07, "loss": 0.0002, "num_tokens": 1218727184.0, "reward": 0.2890625, "reward_std": 0.015992168709635733, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9417834043502807, "step": 18235 }, { "completion_length": 402.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 402.2, "completions/max_terminated_length": 391.4, "completions/mean_length": 91.21328125, "completions/mean_terminated_length": 90.69440002441407, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016507175734451672, "frac_reward_zero_std": 0.96875, "grad_norm": 3.3060996532440186, "kl": 0.19819507314823567, "learning_rate": 4.108015873015873e-07, "loss": 0.0002, "num_tokens": 1219043473.0, "reward": 0.378125, "reward_std": 0.02346404492855072, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.8917038202285766, "step": 18240 }, { "completion_length": 288.4, "completions/clipped_ratio": 0.0, "completions/max_length": 288.4, "completions/max_terminated_length": 288.4, "completions/mean_length": 86.2890625, "completions/mean_terminated_length": 86.2890625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01651170072779993, "frac_reward_zero_std": 0.95, "grad_norm": 1.9652689695358276, "kl": 0.611330641945824, "learning_rate": 4.1076190476190474e-07, "loss": 0.0006, "num_tokens": 1219352779.0, "reward": 0.4546875, "reward_std": 0.04492208585143089, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8836601734161377, "step": 18245 }, { "completion_length": 480.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 480.2, "completions/max_terminated_length": 384.6, "completions/mean_length": 93.18671875, "completions/mean_terminated_length": 92.64859466552734, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01651622572114819, "frac_reward_zero_std": 0.96875, "grad_norm": 0.7220855355262756, "kl": 0.6231194521998986, "learning_rate": 4.107222222222222e-07, "loss": 0.0006, "num_tokens": 1219672842.0, "reward": 0.2796875, "reward_std": 0.029355230554938317, "rewards/verify_chess_move/mean": 0.2796875, "rewards/verify_chess_move/std": 0.9518437504768371, "step": 18250 }, { "completion_length": 332.2, "completions/clipped_ratio": 0.0, "completions/max_length": 332.2, "completions/max_terminated_length": 332.2, "completions/mean_length": 90.73203125, "completions/mean_terminated_length": 90.73203125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01652075071449645, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0018204302759841084, "kl": 0.23014452354982495, "learning_rate": 4.106825396825397e-07, "loss": 0.0002, "num_tokens": 1219987451.0, "reward": 0.415625, "reward_std": 0.01293872892856598, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.891271424293518, "step": 18255 }, { "completion_length": 368.6, "completions/clipped_ratio": 0.0, "completions/max_length": 368.6, "completions/max_terminated_length": 368.6, "completions/mean_length": 87.01953125, "completions/mean_terminated_length": 87.01953125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01652527570784471, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0023967998567968607, "kl": 0.14945813666563482, "learning_rate": 4.106428571428571e-07, "loss": 0.0001, "num_tokens": 1220298044.0, "reward": 0.3515625, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9258715391159058, "step": 18260 }, { "completion_length": 393.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 393.2, "completions/max_terminated_length": 337.4, "completions/mean_length": 90.5640625, "completions/mean_terminated_length": 90.03960571289062, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01652980070119297, "frac_reward_zero_std": 0.95625, "grad_norm": 0.2722800076007843, "kl": 0.4057722938596271, "learning_rate": 4.1060317460317455e-07, "loss": 0.0004, "num_tokens": 1220614406.0, "reward": 0.3703125, "reward_std": 0.04113916680216789, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9204434990882874, "step": 18265 }, { "completion_length": 431.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 431.4, "completions/max_terminated_length": 358.0, "completions/mean_length": 94.35859375, "completions/mean_terminated_length": 93.32001647949218, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01653432569454123, "frac_reward_zero_std": 0.95625, "grad_norm": 14.287515640258789, "kl": 0.43392346655018627, "learning_rate": 4.1056349206349206e-07, "loss": 0.0004, "num_tokens": 1220936329.0, "reward": 0.378125, "reward_std": 0.03708674423396587, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.9193025588989258, "step": 18270 }, { "completion_length": 407.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 407.8, "completions/max_terminated_length": 357.8, "completions/mean_length": 89.625, "completions/mean_terminated_length": 89.09345092773438, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01653885068788949, "frac_reward_zero_std": 0.975, "grad_norm": 0.7708641290664673, "kl": 0.2041951424558647, "learning_rate": 4.105238095238095e-07, "loss": 0.0002, "num_tokens": 1221250561.0, "reward": 0.41875, "reward_std": 0.02041158601641655, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.8906075835227967, "step": 18275 }, { "completion_length": 480.8, "completions/clipped_ratio": 0.00234375, "completions/max_length": 480.8, "completions/max_terminated_length": 379.4, "completions/mean_length": 91.815625, "completions/mean_terminated_length": 90.22245178222656, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.01654337568123775, "frac_reward_zero_std": 0.94375, "grad_norm": 4.924991607666016, "kl": 0.4623891977709718, "learning_rate": 4.1048412698412697e-07, "loss": 0.0005, "num_tokens": 1221564165.0, "reward": 0.484375, "reward_std": 0.050920628011226654, "rewards/verify_chess_move/mean": 0.484375, "rewards/verify_chess_move/std": 0.8566677212715149, "step": 18280 }, { "completion_length": 286.8, "completions/clipped_ratio": 0.0, "completions/max_length": 286.8, "completions/max_terminated_length": 286.8, "completions/mean_length": 83.22265625, "completions/mean_terminated_length": 83.22265625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01654790067458601, "frac_reward_zero_std": 0.95, "grad_norm": 3.5151238441467285, "kl": 0.31155844796448945, "learning_rate": 4.104444444444444e-07, "loss": 0.0003, "num_tokens": 1221867002.0, "reward": 0.4890625, "reward_std": 0.04376729428768158, "rewards/verify_chess_move/mean": 0.4890625, "rewards/verify_chess_move/std": 0.8604406356811524, "step": 18285 }, { "completion_length": 471.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 471.6, "completions/max_terminated_length": 442.8, "completions/mean_length": 91.146875, "completions/mean_terminated_length": 90.61148071289062, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01655242566793427, "frac_reward_zero_std": 0.95, "grad_norm": 0.2086605429649353, "kl": 1.098636005341541, "learning_rate": 4.104047619047619e-07, "loss": 0.0011, "num_tokens": 1222183558.0, "reward": 0.3734375, "reward_std": 0.04171734601259232, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9261009097099304, "step": 18290 }, { "completion_length": 338.6, "completions/clipped_ratio": 0.0, "completions/max_length": 338.6, "completions/max_terminated_length": 338.6, "completions/mean_length": 85.58984375, "completions/mean_terminated_length": 85.58984375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01655695066128253, "frac_reward_zero_std": 0.95625, "grad_norm": 18.562946319580078, "kl": 1.6345865471055732, "learning_rate": 4.103650793650794e-07, "loss": 0.0016, "num_tokens": 1222489945.0, "reward": 0.4390625, "reward_std": 0.03298586867749691, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8950446128845215, "step": 18295 }, { "completion_length": 386.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 386.2, "completions/max_terminated_length": 352.4, "completions/mean_length": 91.184375, "completions/mean_terminated_length": 90.6664321899414, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016561475654630786, "frac_reward_zero_std": 0.95, "grad_norm": 0.0016568319406360388, "kl": 0.8825979052926414, "learning_rate": 4.103253968253968e-07, "loss": 0.0009, "num_tokens": 1222803957.0, "reward": 0.4625, "reward_std": 0.04513425230979919, "rewards/verify_chess_move/mean": 0.4625, "rewards/verify_chess_move/std": 0.8722740173339844, "step": 18300 }, { "completion_length": 417.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 417.0, "completions/max_terminated_length": 322.2, "completions/mean_length": 89.35234375, "completions/mean_terminated_length": 88.82950439453126, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016566000647979046, "frac_reward_zero_std": 0.975, "grad_norm": 2.277296543121338, "kl": 0.5205860051326454, "learning_rate": 4.102857142857143e-07, "loss": 0.0005, "num_tokens": 1223117016.0, "reward": 0.321875, "reward_std": 0.01767766922712326, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9271778464317322, "step": 18305 }, { "completion_length": 428.4, "completions/clipped_ratio": 0.0, "completions/max_length": 428.4, "completions/max_terminated_length": 428.4, "completions/mean_length": 89.5515625, "completions/mean_terminated_length": 89.5515625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016570525641327306, "frac_reward_zero_std": 0.98125, "grad_norm": 26.25677490234375, "kl": 0.26282350949477407, "learning_rate": 4.1024603174603175e-07, "loss": 0.0003, "num_tokens": 1223429834.0, "reward": 0.3546875, "reward_std": 0.018042115867137908, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9324610710144043, "step": 18310 }, { "completion_length": 406.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 406.6, "completions/max_terminated_length": 372.0, "completions/mean_length": 86.58046875, "completions/mean_terminated_length": 86.06422271728516, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016575050634675566, "frac_reward_zero_std": 0.975, "grad_norm": 28.093509674072266, "kl": 0.20958991476800293, "learning_rate": 4.1020634920634915e-07, "loss": 0.0002, "num_tokens": 1223736537.0, "reward": 0.4328125, "reward_std": 0.02198973000049591, "rewards/verify_chess_move/mean": 0.4328125, "rewards/verify_chess_move/std": 0.8620880842208862, "step": 18315 }, { "completion_length": 468.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 468.4, "completions/max_terminated_length": 463.4, "completions/mean_length": 97.7734375, "completions/mean_terminated_length": 96.7437255859375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016579575628023827, "frac_reward_zero_std": 0.95, "grad_norm": 0.001640779315494001, "kl": 0.5663649625843391, "learning_rate": 4.1016666666666665e-07, "loss": 0.0006, "num_tokens": 1224062407.0, "reward": 0.35, "reward_std": 0.041034357994794844, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9265529394149781, "step": 18320 }, { "completion_length": 377.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 377.0, "completions/max_terminated_length": 298.6, "completions/mean_length": 86.596875, "completions/mean_terminated_length": 86.0598358154297, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016584100621372087, "frac_reward_zero_std": 0.96875, "grad_norm": 1.694606065750122, "kl": 0.2477013026480563, "learning_rate": 4.101269841269841e-07, "loss": 0.0002, "num_tokens": 1224370587.0, "reward": 0.4109375, "reward_std": 0.027564920112490655, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.8918369174003601, "step": 18325 }, { "completion_length": 311.4, "completions/clipped_ratio": 0.0, "completions/max_length": 311.4, "completions/max_terminated_length": 311.4, "completions/mean_length": 87.534375, "completions/mean_terminated_length": 87.534375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.016588625614720347, "frac_reward_zero_std": 0.98125, "grad_norm": 0.001972949830815196, "kl": 0.33925598815549163, "learning_rate": 4.100873015873016e-07, "loss": 0.0003, "num_tokens": 1224678935.0, "reward": 0.35, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9189841866493225, "step": 18330 }, { "completion_length": 307.2, "completions/clipped_ratio": 0.0, "completions/max_length": 307.2, "completions/max_terminated_length": 307.2, "completions/mean_length": 95.53515625, "completions/mean_terminated_length": 95.53515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016593150608068607, "frac_reward_zero_std": 0.95625, "grad_norm": 9.109490394592285, "kl": 0.377796174120158, "learning_rate": 4.10047619047619e-07, "loss": 0.0004, "num_tokens": 1225001036.0, "reward": 0.36875, "reward_std": 0.04003184661269188, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.926612937450409, "step": 18335 }, { "completion_length": 401.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 401.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 90.2984375, "completions/mean_terminated_length": 89.76773986816406, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016597675601416867, "frac_reward_zero_std": 0.99375, "grad_norm": 0.0011407453566789627, "kl": 0.24983555321814493, "learning_rate": 4.1000793650793647e-07, "loss": 0.0002, "num_tokens": 1225316682.0, "reward": 0.346875, "reward_std": 0.005786375701427459, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9157299041748047, "step": 18340 }, { "completion_length": 277.8, "completions/clipped_ratio": 0.0, "completions/max_length": 277.8, "completions/max_terminated_length": 277.8, "completions/mean_length": 89.6421875, "completions/mean_terminated_length": 89.6421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016602200594765127, "frac_reward_zero_std": 0.975, "grad_norm": 0.16869603097438812, "kl": 0.8032066041603685, "learning_rate": 4.09968253968254e-07, "loss": 0.0008, "num_tokens": 1225631000.0, "reward": 0.31875, "reward_std": 0.019727616384625436, "rewards/verify_chess_move/mean": 0.31875, "rewards/verify_chess_move/std": 0.925985050201416, "step": 18345 }, { "completion_length": 272.4, "completions/clipped_ratio": 0.0, "completions/max_length": 272.4, "completions/max_terminated_length": 272.4, "completions/mean_length": 90.0203125, "completions/mean_terminated_length": 90.0203125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016606725588113387, "frac_reward_zero_std": 0.9625, "grad_norm": 0.583579421043396, "kl": 0.35021071741357446, "learning_rate": 4.099285714285714e-07, "loss": 0.0004, "num_tokens": 1225945186.0, "reward": 0.4140625, "reward_std": 0.031512534245848656, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.8920680165290833, "step": 18350 }, { "completion_length": 403.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 403.8, "completions/max_terminated_length": 338.2, "completions/mean_length": 85.75, "completions/mean_terminated_length": 85.21714324951172, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016611250581461644, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002211464336141944, "kl": 2.1679388148942964, "learning_rate": 4.098888888888889e-07, "loss": 0.0022, "num_tokens": 1226252554.0, "reward": 0.2734375, "reward_std": 0.024831003695726394, "rewards/verify_chess_move/mean": 0.2734375, "rewards/verify_chess_move/std": 0.950696873664856, "step": 18355 }, { "completion_length": 298.2, "completions/clipped_ratio": 0.0, "completions/max_length": 298.2, "completions/max_terminated_length": 298.2, "completions/mean_length": 95.84921875, "completions/mean_terminated_length": 95.84921875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016615775574809904, "frac_reward_zero_std": 0.95, "grad_norm": 25.515270233154297, "kl": 1.0312040408607572, "learning_rate": 4.0984920634920634e-07, "loss": 0.001, "num_tokens": 1226576329.0, "reward": 0.471875, "reward_std": 0.04397945925593376, "rewards/verify_chess_move/mean": 0.471875, "rewards/verify_chess_move/std": 0.8805157780647278, "step": 18360 }, { "completion_length": 334.8, "completions/clipped_ratio": 0.0, "completions/max_length": 334.8, "completions/max_terminated_length": 334.8, "completions/mean_length": 85.9171875, "completions/mean_terminated_length": 85.9171875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016620300568158164, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0021659908816218376, "kl": 0.9138546205824241, "learning_rate": 4.098095238095238e-07, "loss": 0.0009, "num_tokens": 1226881879.0, "reward": 0.440625, "reward_std": 0.01552036553621292, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8928126931190491, "step": 18365 }, { "completion_length": 398.2, "completions/clipped_ratio": 0.0, "completions/max_length": 398.2, "completions/max_terminated_length": 398.2, "completions/mean_length": 98.746875, "completions/mean_terminated_length": 98.746875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016624825561506424, "frac_reward_zero_std": 0.94375, "grad_norm": 2.804823160171509, "kl": 0.5924222604488023, "learning_rate": 4.0976984126984125e-07, "loss": 0.0006, "num_tokens": 1227210075.0, "reward": 0.3109375, "reward_std": 0.04387465007603168, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9419145107269287, "step": 18370 }, { "completion_length": 359.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 359.2, "completions/max_terminated_length": 262.8, "completions/mean_length": 86.32578125, "completions/mean_terminated_length": 85.79378509521484, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016629350554854684, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0031498263124376535, "kl": 0.47319222732912747, "learning_rate": 4.097301587301587e-07, "loss": 0.0005, "num_tokens": 1227516228.0, "reward": 0.4890625, "reward_std": 0.03776973225176335, "rewards/verify_chess_move/mean": 0.4890625, "rewards/verify_chess_move/std": 0.8499322175979614, "step": 18375 }, { "completion_length": 372.8, "completions/clipped_ratio": 0.0, "completions/max_length": 372.8, "completions/max_terminated_length": 372.8, "completions/mean_length": 93.775, "completions/mean_terminated_length": 93.775, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016633875548202944, "frac_reward_zero_std": 0.96875, "grad_norm": 5.885088920593262, "kl": 0.789936694712378, "learning_rate": 4.096904761904762e-07, "loss": 0.0008, "num_tokens": 1227835660.0, "reward": 0.3859375, "reward_std": 0.025726158916950227, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9154704689979554, "step": 18380 }, { "completion_length": 259.4, "completions/clipped_ratio": 0.0, "completions/max_length": 259.4, "completions/max_terminated_length": 259.4, "completions/mean_length": 85.66796875, "completions/mean_terminated_length": 85.66796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016638400541551204, "frac_reward_zero_std": 0.99375, "grad_norm": 2.3627736568450928, "kl": 0.13663739790208637, "learning_rate": 4.0965079365079366e-07, "loss": 0.0001, "num_tokens": 1228142195.0, "reward": 0.5359375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.5359375, "rewards/verify_chess_move/std": 0.8354346990585327, "step": 18385 }, { "completion_length": 290.2, "completions/clipped_ratio": 0.0, "completions/max_length": 290.2, "completions/max_terminated_length": 290.2, "completions/mean_length": 93.12890625, "completions/mean_terminated_length": 93.12890625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.016642925534899464, "frac_reward_zero_std": 0.95625, "grad_norm": 0.11788808554410934, "kl": 0.15255208918824792, "learning_rate": 4.0961111111111106e-07, "loss": 0.0002, "num_tokens": 1228461840.0, "reward": 0.35625, "reward_std": 0.038452721759676935, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9318332195281982, "step": 18390 }, { "completion_length": 437.8, "completions/clipped_ratio": 0.00390625, "completions/max_length": 437.8, "completions/max_terminated_length": 405.6, "completions/mean_length": 99.4296875, "completions/mean_terminated_length": 96.81866302490235, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.016647450528247724, "frac_reward_zero_std": 0.95625, "grad_norm": 30.900514602661133, "kl": 0.24887810642831026, "learning_rate": 4.0957142857142857e-07, "loss": 0.0002, "num_tokens": 1228789646.0, "reward": 0.425, "reward_std": 0.03956102356314659, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.8960837721824646, "step": 18395 }, { "completion_length": 477.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 477.6, "completions/max_terminated_length": 404.6, "completions/mean_length": 90.07734375, "completions/mean_terminated_length": 89.54664001464843, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016651975521595985, "frac_reward_zero_std": 0.94375, "grad_norm": 19.7624568939209, "kl": 0.7494277431280352, "learning_rate": 4.09531746031746e-07, "loss": 0.0007, "num_tokens": 1229102609.0, "reward": 0.446875, "reward_std": 0.047291556373238564, "rewards/verify_chess_move/mean": 0.446875, "rewards/verify_chess_move/std": 0.877817690372467, "step": 18400 }, { "completion_length": 362.4, "completions/clipped_ratio": 0.0, "completions/max_length": 362.4, "completions/max_terminated_length": 362.4, "completions/mean_length": 96.2265625, "completions/mean_terminated_length": 96.2265625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016656500514944245, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02126091532409191, "kl": 2.423489729454741, "learning_rate": 4.094920634920635e-07, "loss": 0.0024, "num_tokens": 1229428387.0, "reward": 0.321875, "reward_std": 0.02709311656653881, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9338126420974732, "step": 18405 }, { "completion_length": 354.2, "completions/clipped_ratio": 0.0, "completions/max_length": 354.2, "completions/max_terminated_length": 354.2, "completions/mean_length": 85.28203125, "completions/mean_terminated_length": 85.28203125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.0166610255082925, "frac_reward_zero_std": 0.9625, "grad_norm": 3.735823392868042, "kl": 0.8279499661410228, "learning_rate": 4.0945238095238093e-07, "loss": 0.0008, "num_tokens": 1229734316.0, "reward": 0.4359375, "reward_std": 0.03287851139903068, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.8909478068351746, "step": 18410 }, { "completion_length": 295.4, "completions/clipped_ratio": 0.0, "completions/max_length": 295.4, "completions/max_terminated_length": 295.4, "completions/mean_length": 92.65390625, "completions/mean_terminated_length": 92.65390625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01666555050164076, "frac_reward_zero_std": 0.975, "grad_norm": 1.746604084968567, "kl": 0.7955009057419374, "learning_rate": 4.094126984126984e-07, "loss": 0.0008, "num_tokens": 1230055449.0, "reward": 0.3015625, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9376966118812561, "step": 18415 }, { "completion_length": 401.4, "completions/clipped_ratio": 0.0, "completions/max_length": 401.4, "completions/max_terminated_length": 401.4, "completions/mean_length": 90.0015625, "completions/mean_terminated_length": 90.0015625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01667007549498902, "frac_reward_zero_std": 0.98125, "grad_norm": 1.8921233415603638, "kl": 0.4637344516464509, "learning_rate": 4.093730158730159e-07, "loss": 0.0005, "num_tokens": 1230367387.0, "reward": 0.3859375, "reward_std": 0.01893727108836174, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.8894542694091797, "step": 18420 }, { "completion_length": 301.6, "completions/clipped_ratio": 0.0, "completions/max_length": 301.6, "completions/max_terminated_length": 301.6, "completions/mean_length": 87.778125, "completions/mean_terminated_length": 87.778125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01667460048833728, "frac_reward_zero_std": 0.975, "grad_norm": 0.10308749973773956, "kl": 1.2854787326185033, "learning_rate": 4.093333333333333e-07, "loss": 0.0013, "num_tokens": 1230677479.0, "reward": 0.321875, "reward_std": 0.021306741610169412, "rewards/verify_chess_move/mean": 0.321875, "rewards/verify_chess_move/std": 0.9480937719345093, "step": 18425 }, { "completion_length": 379.2, "completions/clipped_ratio": 0.0, "completions/max_length": 379.2, "completions/max_terminated_length": 379.2, "completions/mean_length": 87.42734375, "completions/mean_terminated_length": 87.42734375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01667912548168554, "frac_reward_zero_std": 0.9875, "grad_norm": 0.9579446315765381, "kl": 2.790106586436741, "learning_rate": 4.092936507936508e-07, "loss": 0.0028, "num_tokens": 1230986738.0, "reward": 0.4359375, "reward_std": 0.010205793008208276, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.8877734065055847, "step": 18430 }, { "completion_length": 602.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 602.0, "completions/max_terminated_length": 411.8, "completions/mean_length": 91.16875, "completions/mean_terminated_length": 89.58160247802735, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.016683650475033802, "frac_reward_zero_std": 0.95625, "grad_norm": 0.018185779452323914, "kl": 5.079779737978242, "learning_rate": 4.0925396825396825e-07, "loss": 0.0051, "num_tokens": 1231301658.0, "reward": 0.3, "reward_std": 0.03845272213220596, "rewards/verify_chess_move/mean": 0.3, "rewards/verify_chess_move/std": 0.9420216917991638, "step": 18435 }, { "completion_length": 338.4, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/max_terminated_length": 338.4, "completions/mean_length": 89.5078125, "completions/mean_terminated_length": 89.5078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016688175468382062, "frac_reward_zero_std": 0.96875, "grad_norm": 0.4232262372970581, "kl": 0.5015170603059232, "learning_rate": 4.0921428571428565e-07, "loss": 0.0005, "num_tokens": 1231615588.0, "reward": 0.3234375, "reward_std": 0.02688094973564148, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.940101146697998, "step": 18440 }, { "completion_length": 454.6, "completions/clipped_ratio": 0.00234375, "completions/max_length": 454.6, "completions/max_terminated_length": 439.6, "completions/mean_length": 89.95859375, "completions/mean_terminated_length": 88.39018096923829, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016692700461730322, "frac_reward_zero_std": 0.925, "grad_norm": 4.13083028793335, "kl": 3.6601701707113534, "learning_rate": 4.0917460317460316e-07, "loss": 0.0037, "num_tokens": 1231927231.0, "reward": 0.3421875, "reward_std": 0.07028223499655724, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.9377496361732482, "step": 18445 }, { "completion_length": 356.4, "completions/clipped_ratio": 0.0, "completions/max_length": 356.4, "completions/max_terminated_length": 356.4, "completions/mean_length": 95.5890625, "completions/mean_terminated_length": 95.5890625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016697225455078582, "frac_reward_zero_std": 0.9625, "grad_norm": 0.0015595948789268732, "kl": 0.46862879316322503, "learning_rate": 4.091349206349206e-07, "loss": 0.0005, "num_tokens": 1232249137.0, "reward": 0.409375, "reward_std": 0.031300367414951326, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.906326699256897, "step": 18450 }, { "completion_length": 519.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 519.6, "completions/max_terminated_length": 442.6, "completions/mean_length": 94.9375, "completions/mean_terminated_length": 94.41315307617188, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016701750448426842, "frac_reward_zero_std": 0.95625, "grad_norm": 0.32420802116394043, "kl": 0.42209355962695555, "learning_rate": 4.090952380952381e-07, "loss": 0.0004, "num_tokens": 1232569281.0, "reward": 0.4796875, "reward_std": 0.04255359619855881, "rewards/verify_chess_move/mean": 0.4796875, "rewards/verify_chess_move/std": 0.8485823154449463, "step": 18455 }, { "completion_length": 289.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 89.30625, "completions/mean_terminated_length": 89.30625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016706275441775102, "frac_reward_zero_std": 0.96875, "grad_norm": 10.70682144165039, "kl": 1.806161651108414, "learning_rate": 4.090555555555555e-07, "loss": 0.0018, "num_tokens": 1232880857.0, "reward": 0.359375, "reward_std": 0.025513992086052893, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9242718696594239, "step": 18460 }, { "completion_length": 341.8, "completions/clipped_ratio": 0.0, "completions/max_length": 341.8, "completions/max_terminated_length": 341.8, "completions/mean_length": 88.43203125, "completions/mean_terminated_length": 88.43203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01671080043512336, "frac_reward_zero_std": 0.99375, "grad_norm": 13.826568603515625, "kl": 0.5752119206124917, "learning_rate": 4.09015873015873e-07, "loss": 0.0006, "num_tokens": 1233192762.0, "reward": 0.3953125, "reward_std": 0.00646936446428299, "rewards/verify_chess_move/mean": 0.3953125, "rewards/verify_chess_move/std": 0.9114110708236695, "step": 18465 }, { "completion_length": 366.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 366.0, "completions/max_terminated_length": 278.2, "completions/mean_length": 86.59453125, "completions/mean_terminated_length": 86.06321563720704, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01671532542847162, "frac_reward_zero_std": 0.96875, "grad_norm": 10.175949096679688, "kl": 0.7431452804943547, "learning_rate": 4.089761904761905e-07, "loss": 0.0007, "num_tokens": 1233501035.0, "reward": 0.4140625, "reward_std": 0.02961486726999283, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.8772005319595337, "step": 18470 }, { "completion_length": 346.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 85.546875, "completions/mean_terminated_length": 85.546875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01671985042181988, "frac_reward_zero_std": 0.98125, "grad_norm": 0.31523948907852173, "kl": 1.947101429104805, "learning_rate": 4.0893650793650794e-07, "loss": 0.0019, "num_tokens": 1233805959.0, "reward": 0.4078125, "reward_std": 0.016887323930859566, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.908616304397583, "step": 18475 }, { "completion_length": 324.4, "completions/clipped_ratio": 0.0, "completions/max_length": 324.4, "completions/max_terminated_length": 324.4, "completions/mean_length": 89.028125, "completions/mean_terminated_length": 89.028125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01672437541516814, "frac_reward_zero_std": 0.98125, "grad_norm": 11.87348461151123, "kl": 0.3637644920032471, "learning_rate": 4.0889682539682534e-07, "loss": 0.0004, "num_tokens": 1234118811.0, "reward": 0.475, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.475, "rewards/verify_chess_move/std": 0.8645813941955567, "step": 18480 }, { "completion_length": 324.6, "completions/clipped_ratio": 0.0, "completions/max_length": 324.6, "completions/max_terminated_length": 324.6, "completions/mean_length": 91.65625, "completions/mean_terminated_length": 91.65625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.0167289004085164, "frac_reward_zero_std": 0.95625, "grad_norm": 0.7374626398086548, "kl": 0.8720581702888012, "learning_rate": 4.0885714285714285e-07, "loss": 0.0009, "num_tokens": 1234436155.0, "reward": 0.3171875, "reward_std": 0.037298910692334174, "rewards/verify_chess_move/mean": 0.3171875, "rewards/verify_chess_move/std": 0.924455177783966, "step": 18485 }, { "completion_length": 404.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 404.8, "completions/max_terminated_length": 313.4, "completions/mean_length": 89.94921875, "completions/mean_terminated_length": 89.42557067871094, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01673342540186466, "frac_reward_zero_std": 0.98125, "grad_norm": 16.416614532470703, "kl": 0.45365792538505045, "learning_rate": 4.088174603174603e-07, "loss": 0.0005, "num_tokens": 1234748330.0, "reward": 0.384375, "reward_std": 0.01872510462999344, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9202588438987732, "step": 18490 }, { "completion_length": 628.2, "completions/clipped_ratio": 0.003125, "completions/max_length": 628.2, "completions/max_terminated_length": 460.4, "completions/mean_length": 95.41953125, "completions/mean_terminated_length": 93.3107177734375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01673795039521292, "frac_reward_zero_std": 0.93125, "grad_norm": 10.876378059387207, "kl": 1.488656047254335, "learning_rate": 4.0877777777777775e-07, "loss": 0.0015, "num_tokens": 1235071067.0, "reward": 0.34375, "reward_std": 0.055659566447138785, "rewards/verify_chess_move/mean": 0.34375, "rewards/verify_chess_move/std": 0.9187273144721985, "step": 18495 }, { "completion_length": 416.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 416.6, "completions/max_terminated_length": 313.2, "completions/mean_length": 87.11796875, "completions/mean_terminated_length": 86.59970703125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01674247538856118, "frac_reward_zero_std": 0.96875, "grad_norm": 0.06640180945396423, "kl": 0.4282999636605382, "learning_rate": 4.087380952380952e-07, "loss": 0.0004, "num_tokens": 1235379586.0, "reward": 0.45, "reward_std": 0.027563939988613128, "rewards/verify_chess_move/mean": 0.45, "rewards/verify_chess_move/std": 0.8710056781768799, "step": 18500 }, { "completion_length": 337.4, "completions/clipped_ratio": 0.0, "completions/max_length": 337.4, "completions/max_terminated_length": 337.4, "completions/mean_length": 93.39609375, "completions/mean_terminated_length": 93.39609375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01674700038190944, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0039571369998157024, "kl": 0.8916089520789683, "learning_rate": 4.0869841269841266e-07, "loss": 0.0009, "num_tokens": 1235698309.0, "reward": 0.40625, "reward_std": 0.03435282669961452, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.8912690281867981, "step": 18505 }, { "completion_length": 360.2, "completions/clipped_ratio": 0.0, "completions/max_length": 360.2, "completions/max_terminated_length": 360.2, "completions/mean_length": 91.1921875, "completions/mean_terminated_length": 91.1921875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0167515253752577, "frac_reward_zero_std": 0.98125, "grad_norm": 0.20020592212677002, "kl": 0.7051445391727611, "learning_rate": 4.0865873015873017e-07, "loss": 0.0007, "num_tokens": 1236012795.0, "reward": 0.375, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.911587655544281, "step": 18510 }, { "completion_length": 441.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 441.0, "completions/max_terminated_length": 406.4, "completions/mean_length": 93.028125, "completions/mean_terminated_length": 92.5045654296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01675605036860596, "frac_reward_zero_std": 0.95625, "grad_norm": 15.083582878112793, "kl": 2.011082174698822, "learning_rate": 4.0861904761904757e-07, "loss": 0.002, "num_tokens": 1236330607.0, "reward": 0.3828125, "reward_std": 0.04071483500301838, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9018239498138427, "step": 18515 }, { "completion_length": 480.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 480.8, "completions/max_terminated_length": 336.6, "completions/mean_length": 88.4546875, "completions/mean_terminated_length": 87.38233947753906, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016760575361954216, "frac_reward_zero_std": 0.93125, "grad_norm": 18.095800399780273, "kl": 4.063958250009454, "learning_rate": 4.085793650793651e-07, "loss": 0.0041, "num_tokens": 1236641389.0, "reward": 0.35, "reward_std": 0.049980548396706584, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9291206240653992, "step": 18520 }, { "completion_length": 349.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 95.015625, "completions/mean_terminated_length": 95.015625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016765100355302476, "frac_reward_zero_std": 0.95625, "grad_norm": 19.719942092895508, "kl": 3.3874852249166, "learning_rate": 4.0853968253968253e-07, "loss": 0.0034, "num_tokens": 1236963033.0, "reward": 0.475, "reward_std": 0.03777071312069893, "rewards/verify_chess_move/mean": 0.475, "rewards/verify_chess_move/std": 0.8630287289619446, "step": 18525 }, { "completion_length": 548.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 548.0, "completions/max_terminated_length": 523.4, "completions/mean_length": 89.3828125, "completions/mean_terminated_length": 88.84727478027344, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016769625348650737, "frac_reward_zero_std": 0.98125, "grad_norm": 0.009080075658857822, "kl": 2.821849069185555, "learning_rate": 4.0849999999999993e-07, "loss": 0.0028, "num_tokens": 1237276123.0, "reward": 0.36875, "reward_std": 0.01825428232550621, "rewards/verify_chess_move/mean": 0.36875, "rewards/verify_chess_move/std": 0.9253239631652832, "step": 18530 }, { "completion_length": 333.8, "completions/clipped_ratio": 0.0, "completions/max_length": 333.8, "completions/max_terminated_length": 333.8, "completions/mean_length": 90.10703125, "completions/mean_terminated_length": 90.10703125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016774150341998997, "frac_reward_zero_std": 0.975, "grad_norm": 1.2196950912475586, "kl": 5.879492429783568, "learning_rate": 4.0846031746031744e-07, "loss": 0.0059, "num_tokens": 1237590244.0, "reward": 0.39375, "reward_std": 0.022461533173918725, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.902296268939972, "step": 18535 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 368.0, "completions/max_terminated_length": 293.4, "completions/mean_length": 86.69140625, "completions/mean_terminated_length": 86.15468292236328, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016778675335347257, "frac_reward_zero_std": 0.96875, "grad_norm": 15.334653854370117, "kl": 1.1818098071031273, "learning_rate": 4.084206349206349e-07, "loss": 0.0012, "num_tokens": 1237899009.0, "reward": 0.2953125, "reward_std": 0.02777610570192337, "rewards/verify_chess_move/mean": 0.2953125, "rewards/verify_chess_move/std": 0.9507977485656738, "step": 18540 }, { "completion_length": 506.8, "completions/clipped_ratio": 0.00234375, "completions/max_length": 506.8, "completions/max_terminated_length": 363.8, "completions/mean_length": 95.26796875, "completions/mean_terminated_length": 93.67736511230468, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016783200328695517, "frac_reward_zero_std": 0.9625, "grad_norm": 0.13735249638557434, "kl": 0.42271496216999366, "learning_rate": 4.083809523809524e-07, "loss": 0.0004, "num_tokens": 1238221504.0, "reward": 0.5046875, "reward_std": 0.027883461862802505, "rewards/verify_chess_move/mean": 0.5046875, "rewards/verify_chess_move/std": 0.8181620001792907, "step": 18545 }, { "completion_length": 340.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 340.2, "completions/max_terminated_length": 322.2, "completions/mean_length": 87.334375, "completions/mean_terminated_length": 86.81329803466797, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016787725322043777, "frac_reward_zero_std": 0.98125, "grad_norm": 16.168821334838867, "kl": 0.6087850823649206, "learning_rate": 4.083412698412698e-07, "loss": 0.0006, "num_tokens": 1238530900.0, "reward": 0.334375, "reward_std": 0.01914943754673004, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9369790077209472, "step": 18550 }, { "completion_length": 376.8, "completions/clipped_ratio": 0.0, "completions/max_length": 376.8, "completions/max_terminated_length": 376.8, "completions/mean_length": 87.45625, "completions/mean_terminated_length": 87.45625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016792250315392037, "frac_reward_zero_std": 0.96875, "grad_norm": 31.158315658569336, "kl": 0.8701894341036678, "learning_rate": 4.0830158730158726e-07, "loss": 0.0009, "num_tokens": 1238842268.0, "reward": 0.3859375, "reward_std": 0.026196981221437453, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9161070704460144, "step": 18555 }, { "completion_length": 365.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 365.4, "completions/max_terminated_length": 339.0, "completions/mean_length": 90.528125, "completions/mean_terminated_length": 90.01147155761718, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.016796775308740297, "frac_reward_zero_std": 0.96875, "grad_norm": 0.11834707856178284, "kl": 0.9331795876845718, "learning_rate": 4.0826190476190476e-07, "loss": 0.0009, "num_tokens": 1239157336.0, "reward": 0.2421875, "reward_std": 0.025726158544421195, "rewards/verify_chess_move/mean": 0.2421875, "rewards/verify_chess_move/std": 0.9531105875968933, "step": 18560 }, { "completion_length": 375.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 87.21796875, "completions/mean_terminated_length": 87.21796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016801300302088557, "frac_reward_zero_std": 0.98125, "grad_norm": 17.356191635131836, "kl": 0.7155548705952242, "learning_rate": 4.082222222222222e-07, "loss": 0.0007, "num_tokens": 1239466743.0, "reward": 0.35, "reward_std": 0.016675157472491264, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9232994556427002, "step": 18565 }, { "completion_length": 342.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 89.0765625, "completions/mean_terminated_length": 89.0765625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016805825295436817, "frac_reward_zero_std": 0.96875, "grad_norm": 7.535909652709961, "kl": 0.583556788531132, "learning_rate": 4.0818253968253967e-07, "loss": 0.0006, "num_tokens": 1239778537.0, "reward": 0.5140625, "reward_std": 0.024831003323197366, "rewards/verify_chess_move/mean": 0.5140625, "rewards/verify_chess_move/std": 0.8518327832221985, "step": 18570 }, { "completion_length": 290.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 90.490625, "completions/mean_terminated_length": 90.490625, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.016810350288785074, "frac_reward_zero_std": 0.96875, "grad_norm": 0.024029027670621872, "kl": 0.6907249201089144, "learning_rate": 4.081428571428571e-07, "loss": 0.0007, "num_tokens": 1240091773.0, "reward": 0.4265625, "reward_std": 0.027776104956865312, "rewards/verify_chess_move/mean": 0.4265625, "rewards/verify_chess_move/std": 0.900044071674347, "step": 18575 }, { "completion_length": 298.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 83.1359375, "completions/mean_terminated_length": 83.1359375, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.016814875282133334, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00263842917047441, "kl": 2.2920608864165843, "learning_rate": 4.081031746031746e-07, "loss": 0.0023, "num_tokens": 1240393099.0, "reward": 0.425, "reward_std": 0.025513992458581925, "rewards/verify_chess_move/mean": 0.425, "rewards/verify_chess_move/std": 0.9054726243019104, "step": 18580 }, { "completion_length": 442.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 442.2, "completions/max_terminated_length": 351.6, "completions/mean_length": 85.31484375, "completions/mean_terminated_length": 84.78252563476562, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016819400275481594, "frac_reward_zero_std": 0.96875, "grad_norm": 26.04119873046875, "kl": 1.7307016068603844, "learning_rate": 4.0806349206349203e-07, "loss": 0.0017, "num_tokens": 1240700526.0, "reward": 0.1921875, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.1921875, "rewards/verify_chess_move/std": 0.9598446488380432, "step": 18585 }, { "completion_length": 297.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 86.50546875, "completions/mean_terminated_length": 86.50546875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016823925268829854, "frac_reward_zero_std": 0.9625, "grad_norm": 30.789756774902344, "kl": 0.4288541738176718, "learning_rate": 4.080238095238095e-07, "loss": 0.0004, "num_tokens": 1241009941.0, "reward": 0.3671875, "reward_std": 0.02993340864777565, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9278370261192321, "step": 18590 }, { "completion_length": 309.6, "completions/clipped_ratio": 0.0, "completions/max_length": 309.6, "completions/max_terminated_length": 309.6, "completions/mean_length": 94.91328125, "completions/mean_terminated_length": 94.91328125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016828450262178114, "frac_reward_zero_std": 0.95, "grad_norm": 0.2647341787815094, "kl": 0.6058032606262713, "learning_rate": 4.07984126984127e-07, "loss": 0.0006, "num_tokens": 1241332582.0, "reward": 0.4984375, "reward_std": 0.04082219153642654, "rewards/verify_chess_move/mean": 0.4984375, "rewards/verify_chess_move/std": 0.8593950390815734, "step": 18595 }, { "completion_length": 455.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 455.2, "completions/max_terminated_length": 370.6, "completions/mean_length": 86.3859375, "completions/mean_terminated_length": 85.85184478759766, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016832975255526374, "frac_reward_zero_std": 0.975, "grad_norm": 30.08034324645996, "kl": 0.9822858765022829, "learning_rate": 4.0794444444444445e-07, "loss": 0.001, "num_tokens": 1241640716.0, "reward": 0.5453125, "reward_std": 0.024039676785469054, "rewards/verify_chess_move/mean": 0.5453125, "rewards/verify_chess_move/std": 0.8344867467880249, "step": 18600 }, { "completion_length": 438.4, "completions/clipped_ratio": 0.0, "completions/max_length": 438.4, "completions/max_terminated_length": 438.4, "completions/mean_length": 88.8296875, "completions/mean_terminated_length": 88.8296875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016837500248874634, "frac_reward_zero_std": 0.98125, "grad_norm": 29.179134368896484, "kl": 2.1826823876355776, "learning_rate": 4.0790476190476185e-07, "loss": 0.0022, "num_tokens": 1241953354.0, "reward": 0.515625, "reward_std": 0.01552036516368389, "rewards/verify_chess_move/mean": 0.515625, "rewards/verify_chess_move/std": 0.8262002110481262, "step": 18605 }, { "completion_length": 421.6, "completions/clipped_ratio": 0.0, "completions/max_length": 421.6, "completions/max_terminated_length": 421.6, "completions/mean_length": 86.9453125, "completions/mean_terminated_length": 86.9453125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016842025242222895, "frac_reward_zero_std": 0.95, "grad_norm": 0.01978943683207035, "kl": 3.2014417076366954, "learning_rate": 4.0786507936507935e-07, "loss": 0.0032, "num_tokens": 1242261436.0, "reward": 0.459375, "reward_std": 0.04944729208946228, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8629661202430725, "step": 18610 }, { "completion_length": 610.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 610.0, "completions/max_terminated_length": 406.6, "completions/mean_length": 91.2265625, "completions/mean_terminated_length": 89.11291809082032, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016846550235571155, "frac_reward_zero_std": 0.94375, "grad_norm": 42.68757247924805, "kl": 2.410719337977935, "learning_rate": 4.078253968253968e-07, "loss": 0.0024, "num_tokens": 1242578318.0, "reward": 0.3765625, "reward_std": 0.05023763924837112, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.898375928401947, "step": 18615 }, { "completion_length": 328.4, "completions/clipped_ratio": 0.0, "completions/max_length": 328.4, "completions/max_terminated_length": 328.4, "completions/mean_length": 82.0953125, "completions/mean_terminated_length": 82.0953125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016851075228919415, "frac_reward_zero_std": 0.96875, "grad_norm": 0.03359060734510422, "kl": 1.6466323896776884, "learning_rate": 4.0778571428571426e-07, "loss": 0.0016, "num_tokens": 1242878400.0, "reward": 0.365625, "reward_std": 0.02845909409224987, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.8963603615760803, "step": 18620 }, { "completion_length": 310.8, "completions/clipped_ratio": 0.0, "completions/max_length": 310.8, "completions/max_terminated_length": 310.8, "completions/mean_length": 86.8828125, "completions/mean_terminated_length": 86.8828125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.016855600222267675, "frac_reward_zero_std": 0.98125, "grad_norm": 0.13345220685005188, "kl": 2.25621293655131, "learning_rate": 4.077460317460317e-07, "loss": 0.0023, "num_tokens": 1243188874.0, "reward": 0.3984375, "reward_std": 0.01530819907784462, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.899623692035675, "step": 18625 }, { "completion_length": 362.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 86.79453125, "completions/mean_terminated_length": 86.79453125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01686012521561593, "frac_reward_zero_std": 0.94375, "grad_norm": 0.0908873900771141, "kl": 0.40514848101884127, "learning_rate": 4.0770634920634917e-07, "loss": 0.0004, "num_tokens": 1243498571.0, "reward": 0.3109375, "reward_std": 0.05113279521465301, "rewards/verify_chess_move/mean": 0.3109375, "rewards/verify_chess_move/std": 0.9406268239021301, "step": 18630 }, { "completion_length": 425.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 93.15078125, "completions/mean_terminated_length": 93.15078125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01686465020896419, "frac_reward_zero_std": 0.95625, "grad_norm": 1.386512041091919, "kl": 1.089637028076686, "learning_rate": 4.076666666666667e-07, "loss": 0.0011, "num_tokens": 1243817236.0, "reward": 0.371875, "reward_std": 0.04050266854465008, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9239655375480652, "step": 18635 }, { "completion_length": 286.2, "completions/clipped_ratio": 0.0, "completions/max_length": 286.2, "completions/max_terminated_length": 286.2, "completions/mean_length": 84.39375, "completions/mean_terminated_length": 84.39375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01686917520231245, "frac_reward_zero_std": 0.95625, "grad_norm": 25.756973266601562, "kl": 0.3434918764512986, "learning_rate": 4.076269841269841e-07, "loss": 0.0003, "num_tokens": 1244122076.0, "reward": 0.4421875, "reward_std": 0.03661494106054306, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8860756039619446, "step": 18640 }, { "completion_length": 423.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 92.25, "completions/mean_terminated_length": 92.25, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.016873700195660712, "frac_reward_zero_std": 0.9625, "grad_norm": 19.980009078979492, "kl": 0.5001735387602821, "learning_rate": 4.075873015873016e-07, "loss": 0.0005, "num_tokens": 1244441380.0, "reward": 0.4046875, "reward_std": 0.029933410137891768, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9141124010086059, "step": 18645 }, { "completion_length": 377.2, "completions/clipped_ratio": 0.0, "completions/max_length": 377.2, "completions/max_terminated_length": 377.2, "completions/mean_length": 92.99375, "completions/mean_terminated_length": 92.99375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016878225189008972, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021438193507492542, "kl": 0.8697067459812388, "learning_rate": 4.0754761904761904e-07, "loss": 0.0009, "num_tokens": 1244759460.0, "reward": 0.4578125, "reward_std": 0.025726158544421195, "rewards/verify_chess_move/mean": 0.4578125, "rewards/verify_chess_move/std": 0.8901670336723327, "step": 18650 }, { "completion_length": 467.4, "completions/clipped_ratio": 0.0, "completions/max_length": 467.4, "completions/max_terminated_length": 467.4, "completions/mean_length": 96.3734375, "completions/mean_terminated_length": 96.3734375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.016882750182357232, "frac_reward_zero_std": 0.9625, "grad_norm": 1.5316715240478516, "kl": 0.6836365667404607, "learning_rate": 4.075079365079365e-07, "loss": 0.0007, "num_tokens": 1245082794.0, "reward": 0.346875, "reward_std": 0.02925042025744915, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9309824466705322, "step": 18655 }, { "completion_length": 290.4, "completions/clipped_ratio": 0.0, "completions/max_length": 290.4, "completions/max_terminated_length": 290.4, "completions/mean_length": 82.84375, "completions/mean_terminated_length": 82.84375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016887275175705492, "frac_reward_zero_std": 0.975, "grad_norm": 1.5477217435836792, "kl": 0.4662271943176165, "learning_rate": 4.0746825396825395e-07, "loss": 0.0005, "num_tokens": 1245385042.0, "reward": 0.371875, "reward_std": 0.022461533173918725, "rewards/verify_chess_move/mean": 0.371875, "rewards/verify_chess_move/std": 0.9103752255439759, "step": 18660 }, { "completion_length": 412.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 412.6, "completions/max_terminated_length": 333.2, "completions/mean_length": 81.91484375, "completions/mean_terminated_length": 81.39162750244141, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.016891800169053752, "frac_reward_zero_std": 0.975, "grad_norm": 0.315236896276474, "kl": 0.278336562612094, "learning_rate": 4.074285714285714e-07, "loss": 0.0003, "num_tokens": 1245684397.0, "reward": 0.296875, "reward_std": 0.01767766922712326, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9517513155937195, "step": 18665 }, { "completion_length": 403.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 403.8, "completions/max_terminated_length": 357.4, "completions/mean_length": 92.5015625, "completions/mean_terminated_length": 91.9680404663086, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.016896325162402012, "frac_reward_zero_std": 0.975, "grad_norm": 14.772910118103027, "kl": 0.7104260542662815, "learning_rate": 4.073888888888889e-07, "loss": 0.0007, "num_tokens": 1246004231.0, "reward": 0.4296875, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8964250802993774, "step": 18670 }, { "completion_length": 274.2, "completions/clipped_ratio": 0.0, "completions/max_length": 274.2, "completions/max_terminated_length": 274.2, "completions/mean_length": 86.02109375, "completions/mean_terminated_length": 86.02109375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016900850155750272, "frac_reward_zero_std": 0.95, "grad_norm": 19.85944938659668, "kl": 0.713280257768929, "learning_rate": 4.073492063492063e-07, "loss": 0.0007, "num_tokens": 1246312586.0, "reward": 0.3859375, "reward_std": 0.04376729391515255, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.919468879699707, "step": 18675 }, { "completion_length": 335.2, "completions/clipped_ratio": 0.0, "completions/max_length": 335.2, "completions/max_terminated_length": 335.2, "completions/mean_length": 89.63046875, "completions/mean_terminated_length": 89.63046875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016905375149098532, "frac_reward_zero_std": 0.96875, "grad_norm": 11.427915573120117, "kl": 0.19722800590097905, "learning_rate": 4.0730952380952376e-07, "loss": 0.0002, "num_tokens": 1246626681.0, "reward": 0.2921875, "reward_std": 0.028460075706243516, "rewards/verify_chess_move/mean": 0.2921875, "rewards/verify_chess_move/std": 0.9519009351730346, "step": 18680 }, { "completion_length": 291.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 86.6109375, "completions/mean_terminated_length": 86.6109375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01690990014244679, "frac_reward_zero_std": 0.9625, "grad_norm": 5.883625507354736, "kl": 0.4069326029159129, "learning_rate": 4.0726984126984127e-07, "loss": 0.0004, "num_tokens": 1246935135.0, "reward": 0.50625, "reward_std": 0.03540026247501373, "rewards/verify_chess_move/mean": 0.50625, "rewards/verify_chess_move/std": 0.8579990863800049, "step": 18685 }, { "completion_length": 283.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 86.846875, "completions/mean_terminated_length": 86.846875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01691442513579505, "frac_reward_zero_std": 0.98125, "grad_norm": 9.260892868041992, "kl": 0.5408039647620171, "learning_rate": 4.072301587301587e-07, "loss": 0.0005, "num_tokens": 1247244947.0, "reward": 0.478125, "reward_std": 0.01552036553621292, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8584128975868225, "step": 18690 }, { "completion_length": 369.2, "completions/clipped_ratio": 0.0015625, "completions/max_length": 369.2, "completions/max_terminated_length": 357.0, "completions/mean_length": 88.428125, "completions/mean_terminated_length": 87.36921081542968, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01691895012914331, "frac_reward_zero_std": 0.975, "grad_norm": 7.613200664520264, "kl": 0.5958142617950216, "learning_rate": 4.071904761904762e-07, "loss": 0.0006, "num_tokens": 1247557119.0, "reward": 0.1953125, "reward_std": 0.0247236467897892, "rewards/verify_chess_move/mean": 0.1953125, "rewards/verify_chess_move/std": 0.9679169297218323, "step": 18695 }, { "completion_length": 551.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 551.0, "completions/max_terminated_length": 448.6, "completions/mean_length": 94.4203125, "completions/mean_terminated_length": 93.37111358642578, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.01692347512249157, "frac_reward_zero_std": 0.95, "grad_norm": 29.32726287841797, "kl": 0.9047527364105917, "learning_rate": 4.0715079365079363e-07, "loss": 0.0009, "num_tokens": 1247877873.0, "reward": 0.30625, "reward_std": 0.0415051806718111, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9391561031341553, "step": 18700 }, { "completion_length": 273.4, "completions/clipped_ratio": 0.0, "completions/max_length": 273.4, "completions/max_terminated_length": 273.4, "completions/mean_length": 81.82265625, "completions/mean_terminated_length": 81.82265625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01692800011583983, "frac_reward_zero_std": 0.9625, "grad_norm": 28.056516647338867, "kl": 0.9804031079169363, "learning_rate": 4.071111111111111e-07, "loss": 0.001, "num_tokens": 1248180374.0, "reward": 0.4421875, "reward_std": 0.03445763662457466, "rewards/verify_chess_move/mean": 0.4421875, "rewards/verify_chess_move/std": 0.8783542275428772, "step": 18705 }, { "completion_length": 314.8, "completions/clipped_ratio": 0.0, "completions/max_length": 314.8, "completions/max_terminated_length": 314.8, "completions/mean_length": 84.2265625, "completions/mean_terminated_length": 84.2265625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01693252510918809, "frac_reward_zero_std": 0.95625, "grad_norm": 12.923566818237305, "kl": 0.5027557721128687, "learning_rate": 4.0707142857142854e-07, "loss": 0.0005, "num_tokens": 1248485880.0, "reward": 0.296875, "reward_std": 0.04092700183391571, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9481060266494751, "step": 18710 }, { "completion_length": 402.4, "completions/clipped_ratio": 0.0, "completions/max_length": 402.4, "completions/max_terminated_length": 402.4, "completions/mean_length": 90.55, "completions/mean_terminated_length": 90.55, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01693705010253635, "frac_reward_zero_std": 0.975, "grad_norm": 28.889883041381836, "kl": 0.8356778721092268, "learning_rate": 4.07031746031746e-07, "loss": 0.0008, "num_tokens": 1248801416.0, "reward": 0.3609375, "reward_std": 0.023144521936774253, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9207241892814636, "step": 18715 }, { "completion_length": 365.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 365.0, "completions/max_terminated_length": 313.2, "completions/mean_length": 89.51796875, "completions/mean_terminated_length": 88.99058380126954, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.01694157509588461, "frac_reward_zero_std": 0.9625, "grad_norm": 6.2236199378967285, "kl": 3.969847796286922, "learning_rate": 4.069920634920635e-07, "loss": 0.004, "num_tokens": 1249114639.0, "reward": 0.428125, "reward_std": 0.028566450625658036, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.9021239519119263, "step": 18720 }, { "completion_length": 384.2, "completions/clipped_ratio": 0.0, "completions/max_length": 384.2, "completions/max_terminated_length": 384.2, "completions/mean_length": 85.18515625, "completions/mean_terminated_length": 85.18515625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01694610008923287, "frac_reward_zero_std": 0.95625, "grad_norm": 36.224605560302734, "kl": 3.0731917331228034, "learning_rate": 4.0695238095238095e-07, "loss": 0.0031, "num_tokens": 1249419764.0, "reward": 0.3828125, "reward_std": 0.03503581546247005, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.9121828317642212, "step": 18725 }, { "completion_length": 280.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 81.625, "completions/mean_terminated_length": 81.625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01695062508258113, "frac_reward_zero_std": 0.96875, "grad_norm": 22.08333969116211, "kl": 3.0321243211627005, "learning_rate": 4.0691269841269836e-07, "loss": 0.003, "num_tokens": 1249718132.0, "reward": 0.490625, "reward_std": 0.027563939243555068, "rewards/verify_chess_move/mean": 0.490625, "rewards/verify_chess_move/std": 0.86013423204422, "step": 18730 }, { "completion_length": 400.4, "completions/clipped_ratio": 0.0, "completions/max_length": 400.4, "completions/max_terminated_length": 400.4, "completions/mean_length": 94.98671875, "completions/mean_terminated_length": 94.98671875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01695515007592939, "frac_reward_zero_std": 0.99375, "grad_norm": 0.505666196346283, "kl": 0.9067224015016109, "learning_rate": 4.0687301587301586e-07, "loss": 0.0009, "num_tokens": 1250041003.0, "reward": 0.4109375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.905071759223938, "step": 18735 }, { "completion_length": 311.2, "completions/clipped_ratio": 0.0, "completions/max_length": 311.2, "completions/max_terminated_length": 311.2, "completions/mean_length": 93.9109375, "completions/mean_terminated_length": 93.9109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016959675069277647, "frac_reward_zero_std": 0.9625, "grad_norm": 20.24138069152832, "kl": 5.133054148964584, "learning_rate": 4.068333333333333e-07, "loss": 0.0051, "num_tokens": 1250361473.0, "reward": 0.3703125, "reward_std": 0.03356248177587986, "rewards/verify_chess_move/mean": 0.3703125, "rewards/verify_chess_move/std": 0.9247174501419068, "step": 18740 }, { "completion_length": 493.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 493.8, "completions/max_terminated_length": 435.6, "completions/mean_length": 95.99765625, "completions/mean_terminated_length": 95.47829132080078, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.016964200062625907, "frac_reward_zero_std": 0.95625, "grad_norm": 0.0477568618953228, "kl": 0.9065163701772689, "learning_rate": 4.067936507936508e-07, "loss": 0.0009, "num_tokens": 1250684414.0, "reward": 0.3828125, "reward_std": 0.03640375547111034, "rewards/verify_chess_move/mean": 0.3828125, "rewards/verify_chess_move/std": 0.8830494523048401, "step": 18745 }, { "completion_length": 276.2, "completions/clipped_ratio": 0.0, "completions/max_length": 276.2, "completions/max_terminated_length": 276.2, "completions/mean_length": 83.3234375, "completions/mean_terminated_length": 83.3234375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.016968725055974167, "frac_reward_zero_std": 0.9875, "grad_norm": 0.015777315944433212, "kl": 0.7140350752510131, "learning_rate": 4.067539682539682e-07, "loss": 0.0007, "num_tokens": 1250987204.0, "reward": 0.3734375, "reward_std": 0.01225574016571045, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.913454270362854, "step": 18750 }, { "completion_length": 333.2, "completions/clipped_ratio": 0.0, "completions/max_length": 333.2, "completions/max_terminated_length": 333.2, "completions/mean_length": 85.8234375, "completions/mean_terminated_length": 85.8234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016973250049322427, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0021598387975245714, "kl": 0.2186621905537322, "learning_rate": 4.067142857142857e-07, "loss": 0.0002, "num_tokens": 1251295818.0, "reward": 0.4609375, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.877879512310028, "step": 18755 }, { "completion_length": 358.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 358.4, "completions/max_terminated_length": 298.2, "completions/mean_length": 86.00859375, "completions/mean_terminated_length": 85.47095031738282, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.016977775042670687, "frac_reward_zero_std": 0.95625, "grad_norm": 20.390239715576172, "kl": 0.19610862797126175, "learning_rate": 4.066746031746032e-07, "loss": 0.0002, "num_tokens": 1251602773.0, "reward": 0.3484375, "reward_std": 0.030935921147465704, "rewards/verify_chess_move/mean": 0.3484375, "rewards/verify_chess_move/std": 0.9341068744659424, "step": 18760 }, { "completion_length": 372.8, "completions/clipped_ratio": 0.0, "completions/max_length": 372.8, "completions/max_terminated_length": 372.8, "completions/mean_length": 82.78203125, "completions/mean_terminated_length": 82.78203125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016982300036018947, "frac_reward_zero_std": 0.975, "grad_norm": 21.75029754638672, "kl": 0.8614436912117526, "learning_rate": 4.066349206349206e-07, "loss": 0.0009, "num_tokens": 1251904382.0, "reward": 0.4984375, "reward_std": 0.019044627994298936, "rewards/verify_chess_move/mean": 0.4984375, "rewards/verify_chess_move/std": 0.8570520877838135, "step": 18765 }, { "completion_length": 273.4, "completions/clipped_ratio": 0.0, "completions/max_length": 273.4, "completions/max_terminated_length": 273.4, "completions/mean_length": 92.42421875, "completions/mean_terminated_length": 92.42421875, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.016986825029367207, "frac_reward_zero_std": 0.96875, "grad_norm": 0.01114074606448412, "kl": 0.2122268565930426, "learning_rate": 4.065952380952381e-07, "loss": 0.0002, "num_tokens": 1252224757.0, "reward": 0.3515625, "reward_std": 0.02688095085322857, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9245989203453064, "step": 18770 }, { "completion_length": 286.6, "completions/clipped_ratio": 0.0, "completions/max_length": 286.6, "completions/max_terminated_length": 286.6, "completions/mean_length": 89.05078125, "completions/mean_terminated_length": 89.05078125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.016991350022715467, "frac_reward_zero_std": 0.94375, "grad_norm": 8.763242721557617, "kl": 1.0204111074563116, "learning_rate": 4.0655555555555555e-07, "loss": 0.001, "num_tokens": 1252539846.0, "reward": 0.284375, "reward_std": 0.04887068197131157, "rewards/verify_chess_move/mean": 0.284375, "rewards/verify_chess_move/std": 0.9414827346801757, "step": 18775 }, { "completion_length": 403.4, "completions/clipped_ratio": 0.0, "completions/max_length": 403.4, "completions/max_terminated_length": 403.4, "completions/mean_length": 95.9453125, "completions/mean_terminated_length": 95.9453125, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.016995875016063727, "frac_reward_zero_std": 0.98125, "grad_norm": 0.07275477796792984, "kl": 0.8268750628223642, "learning_rate": 4.06515873015873e-07, "loss": 0.0008, "num_tokens": 1252863528.0, "reward": 0.4296875, "reward_std": 0.015992168709635733, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8827696561813354, "step": 18780 }, { "completion_length": 467.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 467.4, "completions/max_terminated_length": 407.8, "completions/mean_length": 89.52578125, "completions/mean_terminated_length": 88.46457977294922, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017000400009411987, "frac_reward_zero_std": 0.95625, "grad_norm": 3.035706043243408, "kl": 0.8337525008129887, "learning_rate": 4.0647619047619046e-07, "loss": 0.0008, "num_tokens": 1253176713.0, "reward": 0.35, "reward_std": 0.03729792907834053, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9354765295982361, "step": 18785 }, { "completion_length": 403.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 403.8, "completions/max_terminated_length": 362.0, "completions/mean_length": 85.46953125, "completions/mean_terminated_length": 84.93406982421875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017004925002760247, "frac_reward_zero_std": 0.975, "grad_norm": 0.0008735735900700092, "kl": 1.1150676370831207, "learning_rate": 4.064365079365079e-07, "loss": 0.0011, "num_tokens": 1253483666.0, "reward": 0.3671875, "reward_std": 0.02198973000049591, "rewards/verify_chess_move/mean": 0.3671875, "rewards/verify_chess_move/std": 0.9295914173126221, "step": 18790 }, { "completion_length": 316.8, "completions/clipped_ratio": 0.0, "completions/max_length": 316.8, "completions/max_terminated_length": 316.8, "completions/mean_length": 88.459375, "completions/mean_terminated_length": 88.459375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017009449996108504, "frac_reward_zero_std": 0.98125, "grad_norm": 0.16268113255500793, "kl": 0.14568722697440534, "learning_rate": 4.063968253968254e-07, "loss": 0.0001, "num_tokens": 1253796310.0, "reward": 0.4765625, "reward_std": 0.016887323930859566, "rewards/verify_chess_move/mean": 0.4765625, "rewards/verify_chess_move/std": 0.8271916151046753, "step": 18795 }, { "completion_length": 301.8, "completions/clipped_ratio": 0.0, "completions/max_length": 301.8, "completions/max_terminated_length": 301.8, "completions/mean_length": 84.18828125, "completions/mean_terminated_length": 84.18828125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.017013974989456764, "frac_reward_zero_std": 0.95, "grad_norm": 14.821907043457031, "kl": 0.980456716567278, "learning_rate": 4.063571428571428e-07, "loss": 0.001, "num_tokens": 1254100311.0, "reward": 0.459375, "reward_std": 0.035355338081717494, "rewards/verify_chess_move/mean": 0.459375, "rewards/verify_chess_move/std": 0.8885106325149537, "step": 18800 }, { "completion_length": 251.2, "completions/clipped_ratio": 0.0, "completions/max_length": 251.2, "completions/max_terminated_length": 251.2, "completions/mean_length": 80.08671875, "completions/mean_terminated_length": 80.08671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017018499982805024, "frac_reward_zero_std": 0.9875, "grad_norm": 0.002914207987487316, "kl": 0.1425933690741658, "learning_rate": 4.0631746031746027e-07, "loss": 0.0001, "num_tokens": 1254398910.0, "reward": 0.378125, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.378125, "rewards/verify_chess_move/std": 0.917261278629303, "step": 18805 }, { "completion_length": 405.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 405.0, "completions/max_terminated_length": 363.6, "completions/mean_length": 94.84296875, "completions/mean_terminated_length": 94.32503814697266, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.017023024976153284, "frac_reward_zero_std": 0.95, "grad_norm": 19.55962562561035, "kl": 0.5893298970186152, "learning_rate": 4.062777777777778e-07, "loss": 0.0006, "num_tokens": 1254720829.0, "reward": 0.309375, "reward_std": 0.045134251564741136, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.9499773740768432, "step": 18810 }, { "completion_length": 301.6, "completions/clipped_ratio": 0.0, "completions/max_length": 301.6, "completions/max_terminated_length": 301.6, "completions/mean_length": 82.925, "completions/mean_terminated_length": 82.925, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017027549969501544, "frac_reward_zero_std": 0.9625, "grad_norm": 15.420268058776855, "kl": 4.6314313155831766, "learning_rate": 4.0623809523809523e-07, "loss": 0.0046, "num_tokens": 1255023501.0, "reward": 0.43125, "reward_std": 0.03219552300870419, "rewards/verify_chess_move/mean": 0.43125, "rewards/verify_chess_move/std": 0.8913515925407409, "step": 18815 }, { "completion_length": 352.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 86.60625, "completions/mean_terminated_length": 86.60625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017032074962849805, "frac_reward_zero_std": 0.96875, "grad_norm": 3.30741810798645, "kl": 1.0246642548358067, "learning_rate": 4.061984126984127e-07, "loss": 0.001, "num_tokens": 1255335829.0, "reward": 0.2640625, "reward_std": 0.026196980848908424, "rewards/verify_chess_move/mean": 0.2640625, "rewards/verify_chess_move/std": 0.9574540853500366, "step": 18820 }, { "completion_length": 409.4, "completions/clipped_ratio": 0.0, "completions/max_length": 409.4, "completions/max_terminated_length": 409.4, "completions/mean_length": 90.575, "completions/mean_terminated_length": 90.575, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.017036599956198065, "frac_reward_zero_std": 0.9875, "grad_norm": 3.009413242340088, "kl": 0.3318329869303852, "learning_rate": 4.0615873015873014e-07, "loss": 0.0003, "num_tokens": 1255648485.0, "reward": 0.428125, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.893967080116272, "step": 18825 }, { "completion_length": 264.8, "completions/clipped_ratio": 0.0, "completions/max_length": 264.8, "completions/max_terminated_length": 264.8, "completions/mean_length": 87.77109375, "completions/mean_terminated_length": 87.77109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017041124949546325, "frac_reward_zero_std": 0.975, "grad_norm": 0.10024362057447433, "kl": 0.4196953630074859, "learning_rate": 4.061190476190476e-07, "loss": 0.0004, "num_tokens": 1255960088.0, "reward": 0.3203125, "reward_std": 0.024039677530527114, "rewards/verify_chess_move/mean": 0.3203125, "rewards/verify_chess_move/std": 0.9470470070838928, "step": 18830 }, { "completion_length": 286.2, "completions/clipped_ratio": 0.0, "completions/max_length": 286.2, "completions/max_terminated_length": 286.2, "completions/mean_length": 83.384375, "completions/mean_terminated_length": 83.384375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017045649942894585, "frac_reward_zero_std": 0.99375, "grad_norm": 0.04715120792388916, "kl": 0.14402951044030488, "learning_rate": 4.060793650793651e-07, "loss": 0.0001, "num_tokens": 1256264372.0, "reward": 0.35625, "reward_std": 0.006681530922651291, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9299902439117431, "step": 18835 }, { "completion_length": 312.6, "completions/clipped_ratio": 0.0, "completions/max_length": 312.6, "completions/max_terminated_length": 312.6, "completions/mean_length": 86.78828125, "completions/mean_terminated_length": 86.78828125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017050174936242845, "frac_reward_zero_std": 0.975, "grad_norm": 0.14483347535133362, "kl": 0.3028617488220334, "learning_rate": 4.060396825396825e-07, "loss": 0.0003, "num_tokens": 1256574389.0, "reward": 0.2390625, "reward_std": 0.02608962431550026, "rewards/verify_chess_move/mean": 0.2390625, "rewards/verify_chess_move/std": 0.9576291441917419, "step": 18840 }, { "completion_length": 280.8, "completions/clipped_ratio": 0.0, "completions/max_length": 280.8, "completions/max_terminated_length": 280.8, "completions/mean_length": 88.4859375, "completions/mean_terminated_length": 88.4859375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017054699929591105, "frac_reward_zero_std": 0.9875, "grad_norm": 0.46238502860069275, "kl": 0.5060015820432454, "learning_rate": 4.06e-07, "loss": 0.0005, "num_tokens": 1256886059.0, "reward": 0.365625, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.365625, "rewards/verify_chess_move/std": 0.9156276822090149, "step": 18845 }, { "completion_length": 447.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 447.0, "completions/max_terminated_length": 354.6, "completions/mean_length": 91.15, "completions/mean_terminated_length": 90.62104034423828, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01705922492293936, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0016158638754859567, "kl": 0.12522240551188588, "learning_rate": 4.0596031746031746e-07, "loss": 0.0001, "num_tokens": 1257204299.0, "reward": 0.3078125, "reward_std": 0.024831003695726394, "rewards/verify_chess_move/mean": 0.3078125, "rewards/verify_chess_move/std": 0.9373363256454468, "step": 18850 }, { "completion_length": 410.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 410.0, "completions/max_terminated_length": 335.6, "completions/mean_length": 92.296875, "completions/mean_terminated_length": 91.76911010742188, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017063749916287622, "frac_reward_zero_std": 0.98125, "grad_norm": 1.006295084953308, "kl": 0.1878936668857932, "learning_rate": 4.0592063492063486e-07, "loss": 0.0002, "num_tokens": 1257521671.0, "reward": 0.3328125, "reward_std": 0.017358146235346796, "rewards/verify_chess_move/mean": 0.3328125, "rewards/verify_chess_move/std": 0.940680730342865, "step": 18855 }, { "completion_length": 317.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 84.90234375, "completions/mean_terminated_length": 84.90234375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017068274909635882, "frac_reward_zero_std": 0.95625, "grad_norm": 1.9788365364074707, "kl": 0.2701446164865047, "learning_rate": 4.0588095238095237e-07, "loss": 0.0003, "num_tokens": 1257827754.0, "reward": 0.4171875, "reward_std": 0.038194064795970914, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8995973587036132, "step": 18860 }, { "completion_length": 505.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 88.64609375, "completions/mean_terminated_length": 88.64609375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017072799902984142, "frac_reward_zero_std": 0.98125, "grad_norm": 34.023284912109375, "kl": 0.340358879044652, "learning_rate": 4.058412698412698e-07, "loss": 0.0003, "num_tokens": 1258137917.0, "reward": 0.3859375, "reward_std": 0.018042115867137908, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9144731163978577, "step": 18865 }, { "completion_length": 279.2, "completions/clipped_ratio": 0.0, "completions/max_length": 279.2, "completions/max_terminated_length": 279.2, "completions/mean_length": 83.8921875, "completions/mean_terminated_length": 83.8921875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017077324896332402, "frac_reward_zero_std": 0.95, "grad_norm": 7.773767948150635, "kl": 0.7793055849382654, "learning_rate": 4.058015873015873e-07, "loss": 0.0008, "num_tokens": 1258442995.0, "reward": 0.44375, "reward_std": 0.04287311993539333, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8741742968559265, "step": 18870 }, { "completion_length": 359.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 359.8, "completions/max_terminated_length": 333.2, "completions/mean_length": 86.9140625, "completions/mean_terminated_length": 86.38774871826172, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017081849889680662, "frac_reward_zero_std": 0.975, "grad_norm": 11.322783470153809, "kl": 0.39902836608234793, "learning_rate": 4.0576190476190473e-07, "loss": 0.0004, "num_tokens": 1258752021.0, "reward": 0.4078125, "reward_std": 0.019939782842993737, "rewards/verify_chess_move/mean": 0.4078125, "rewards/verify_chess_move/std": 0.9082431197166443, "step": 18875 }, { "completion_length": 374.8, "completions/clipped_ratio": 0.0, "completions/max_length": 374.8, "completions/max_terminated_length": 374.8, "completions/mean_length": 89.44453125, "completions/mean_terminated_length": 89.44453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.017086374883028922, "frac_reward_zero_std": 0.9625, "grad_norm": 3.4720442295074463, "kl": 0.13017223302740605, "learning_rate": 4.057222222222222e-07, "loss": 0.0001, "num_tokens": 1259066598.0, "reward": 0.3125, "reward_std": 0.03014557547867298, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9450354695320129, "step": 18880 }, { "completion_length": 347.8, "completions/clipped_ratio": 0.0, "completions/max_length": 347.8, "completions/max_terminated_length": 347.8, "completions/mean_length": 87.865625, "completions/mean_terminated_length": 87.865625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017090899876377182, "frac_reward_zero_std": 0.99375, "grad_norm": 1.7609561681747437, "kl": 0.21530097090872005, "learning_rate": 4.056825396825397e-07, "loss": 0.0002, "num_tokens": 1259376410.0, "reward": 0.3765625, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9212231993675232, "step": 18885 }, { "completion_length": 404.6, "completions/clipped_ratio": 0.0, "completions/max_length": 404.6, "completions/max_terminated_length": 404.6, "completions/mean_length": 84.94453125, "completions/mean_terminated_length": 84.94453125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017095424869725442, "frac_reward_zero_std": 0.975, "grad_norm": 0.1668456792831421, "kl": 0.18318267588037998, "learning_rate": 4.056428571428571e-07, "loss": 0.0002, "num_tokens": 1259681395.0, "reward": 0.5078125, "reward_std": 0.019939782470464705, "rewards/verify_chess_move/mean": 0.5078125, "rewards/verify_chess_move/std": 0.8599923729896546, "step": 18890 }, { "completion_length": 295.6, "completions/clipped_ratio": 0.0, "completions/max_length": 295.6, "completions/max_terminated_length": 295.6, "completions/mean_length": 89.14921875, "completions/mean_terminated_length": 89.14921875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.017099949863073702, "frac_reward_zero_std": 0.975, "grad_norm": 12.455784797668457, "kl": 0.2028584676096216, "learning_rate": 4.0560317460317455e-07, "loss": 0.0002, "num_tokens": 1259997306.0, "reward": 0.503125, "reward_std": 0.023827510699629784, "rewards/verify_chess_move/mean": 0.503125, "rewards/verify_chess_move/std": 0.850744080543518, "step": 18895 }, { "completion_length": 458.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 458.8, "completions/max_terminated_length": 360.6, "completions/mean_length": 89.89296875, "completions/mean_terminated_length": 89.3597625732422, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017104474856421963, "frac_reward_zero_std": 0.9625, "grad_norm": 0.9843869805335999, "kl": 0.21102699094917626, "learning_rate": 4.0556349206349206e-07, "loss": 0.0002, "num_tokens": 1260311729.0, "reward": 0.290625, "reward_std": 0.03104073107242584, "rewards/verify_chess_move/mean": 0.290625, "rewards/verify_chess_move/std": 0.9334744095802308, "step": 18900 }, { "completion_length": 307.4, "completions/clipped_ratio": 0.0, "completions/max_length": 307.4, "completions/max_terminated_length": 307.4, "completions/mean_length": 85.0046875, "completions/mean_terminated_length": 85.0046875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01710899984977022, "frac_reward_zero_std": 0.9625, "grad_norm": 1.7002679109573364, "kl": 0.27329798457212745, "learning_rate": 4.055238095238095e-07, "loss": 0.0003, "num_tokens": 1260618959.0, "reward": 0.39375, "reward_std": 0.02925042100250721, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.912419068813324, "step": 18905 }, { "completion_length": 351.4, "completions/clipped_ratio": 0.0, "completions/max_length": 351.4, "completions/max_terminated_length": 351.4, "completions/mean_length": 81.5125, "completions/mean_terminated_length": 81.5125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01711352484311848, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0023661816958338022, "kl": 0.1268428062554449, "learning_rate": 4.0548412698412696e-07, "loss": 0.0001, "num_tokens": 1260917727.0, "reward": 0.4859375, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.4859375, "rewards/verify_chess_move/std": 0.8677755117416381, "step": 18910 }, { "completion_length": 295.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 88.01796875, "completions/mean_terminated_length": 88.01796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01711804983646674, "frac_reward_zero_std": 0.975, "grad_norm": 0.022487560287117958, "kl": 0.2818004135740921, "learning_rate": 4.054444444444444e-07, "loss": 0.0003, "num_tokens": 1261228494.0, "reward": 0.3234375, "reward_std": 0.023144522309303285, "rewards/verify_chess_move/mean": 0.3234375, "rewards/verify_chess_move/std": 0.9432860732078552, "step": 18915 }, { "completion_length": 366.8, "completions/clipped_ratio": 0.0, "completions/max_length": 366.8, "completions/max_terminated_length": 366.8, "completions/mean_length": 86.009375, "completions/mean_terminated_length": 86.009375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017122574829815, "frac_reward_zero_std": 0.9625, "grad_norm": 8.329324722290039, "kl": 1.0342962556285784, "learning_rate": 4.0540476190476187e-07, "loss": 0.001, "num_tokens": 1261534722.0, "reward": 0.4109375, "reward_std": 0.02993340902030468, "rewards/verify_chess_move/mean": 0.4109375, "rewards/verify_chess_move/std": 0.9017691850662232, "step": 18920 }, { "completion_length": 308.2, "completions/clipped_ratio": 0.0, "completions/max_length": 308.2, "completions/max_terminated_length": 308.2, "completions/mean_length": 85.36953125, "completions/mean_terminated_length": 85.36953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01712709982316326, "frac_reward_zero_std": 0.94375, "grad_norm": 14.634833335876465, "kl": 0.4081204941496253, "learning_rate": 4.053650793650794e-07, "loss": 0.0004, "num_tokens": 1261839659.0, "reward": 0.4296875, "reward_std": 0.050921608507633206, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8996849656105042, "step": 18925 }, { "completion_length": 480.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 480.6, "completions/max_terminated_length": 477.6, "completions/mean_length": 92.3, "completions/mean_terminated_length": 91.77967681884766, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01713162481651152, "frac_reward_zero_std": 0.95, "grad_norm": 1.3126420974731445, "kl": 0.40907149490667505, "learning_rate": 4.053253968253968e-07, "loss": 0.0004, "num_tokens": 1262155667.0, "reward": 0.4015625, "reward_std": 0.03877224624156952, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.8970674991607666, "step": 18930 }, { "completion_length": 349.2, "completions/clipped_ratio": 0.0, "completions/max_length": 349.2, "completions/max_terminated_length": 349.2, "completions/mean_length": 86.39375, "completions/mean_terminated_length": 86.39375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01713614980985978, "frac_reward_zero_std": 0.99375, "grad_norm": 0.1711670458316803, "kl": 0.3078289384022355, "learning_rate": 4.052857142857143e-07, "loss": 0.0003, "num_tokens": 1262464203.0, "reward": 0.421875, "reward_std": 0.005786375701427459, "rewards/verify_chess_move/mean": 0.421875, "rewards/verify_chess_move/std": 0.8930481314659119, "step": 18935 }, { "completion_length": 465.8, "completions/clipped_ratio": 0.0015625, "completions/max_length": 465.8, "completions/max_terminated_length": 362.8, "completions/mean_length": 89.02265625, "completions/mean_terminated_length": 87.95537567138672, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.01714067480320804, "frac_reward_zero_std": 0.95625, "grad_norm": 3.1669089794158936, "kl": 0.5982594251283444, "learning_rate": 4.0524603174603174e-07, "loss": 0.0006, "num_tokens": 1262775200.0, "reward": 0.33125, "reward_std": 0.03798189871013165, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9180943369865417, "step": 18940 }, { "completion_length": 315.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 81.56484375, "completions/mean_terminated_length": 81.56484375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.0171451997965563, "frac_reward_zero_std": 0.96875, "grad_norm": 14.517333984375, "kl": 0.4521822360577062, "learning_rate": 4.0520634920634914e-07, "loss": 0.0005, "num_tokens": 1263075355.0, "reward": 0.325, "reward_std": 0.029143064469099044, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.937740707397461, "step": 18945 }, { "completion_length": 428.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 428.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 91.296875, "completions/mean_terminated_length": 90.77020874023438, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01714972478990456, "frac_reward_zero_std": 0.96875, "grad_norm": 0.013560313731431961, "kl": 1.34665358485654, "learning_rate": 4.0516666666666665e-07, "loss": 0.0013, "num_tokens": 1263390495.0, "reward": 0.3515625, "reward_std": 0.030721206590533255, "rewards/verify_chess_move/mean": 0.3515625, "rewards/verify_chess_move/std": 0.9117299556732178, "step": 18950 }, { "completion_length": 480.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 480.0, "completions/max_terminated_length": 377.6, "completions/mean_length": 93.028125, "completions/mean_terminated_length": 91.971875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.01715424978325282, "frac_reward_zero_std": 0.95625, "grad_norm": 1.88641357421875, "kl": 1.7264971429831348, "learning_rate": 4.051269841269841e-07, "loss": 0.0017, "num_tokens": 1263710035.0, "reward": 0.275, "reward_std": 0.03913669139146805, "rewards/verify_chess_move/mean": 0.275, "rewards/verify_chess_move/std": 0.9580431342124939, "step": 18955 }, { "completion_length": 359.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 359.4, "completions/max_terminated_length": 270.6, "completions/mean_length": 88.29296875, "completions/mean_terminated_length": 87.76606750488281, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017158774776601077, "frac_reward_zero_std": 0.9375, "grad_norm": 3.4220972061157227, "kl": 2.6518113663536496, "learning_rate": 4.050873015873016e-07, "loss": 0.0027, "num_tokens": 1264021170.0, "reward": 0.390625, "reward_std": 0.05329107940196991, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9186409473419189, "step": 18960 }, { "completion_length": 280.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 85.909375, "completions/mean_terminated_length": 85.909375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017163299769949337, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0022090349812060595, "kl": 0.24878787458874285, "learning_rate": 4.05047619047619e-07, "loss": 0.0002, "num_tokens": 1264329798.0, "reward": 0.3765625, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.3765625, "rewards/verify_chess_move/std": 0.9095724940299987, "step": 18965 }, { "completion_length": 357.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 93.03984375, "completions/mean_terminated_length": 93.03984375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017167824763297597, "frac_reward_zero_std": 0.95, "grad_norm": 0.8719030618667603, "kl": 0.4287621944909915, "learning_rate": 4.0500793650793646e-07, "loss": 0.0004, "num_tokens": 1264649177.0, "reward": 0.3421875, "reward_std": 0.043556108698248865, "rewards/verify_chess_move/mean": 0.3421875, "rewards/verify_chess_move/std": 0.918314254283905, "step": 18970 }, { "completion_length": 330.8, "completions/clipped_ratio": 0.0, "completions/max_length": 330.8, "completions/max_terminated_length": 330.8, "completions/mean_length": 83.2921875, "completions/mean_terminated_length": 83.2921875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017172349756645857, "frac_reward_zero_std": 0.95, "grad_norm": 0.6836117506027222, "kl": 1.8415254241786898, "learning_rate": 4.0496825396825397e-07, "loss": 0.0018, "num_tokens": 1264951455.0, "reward": 0.25, "reward_std": 0.0458182230591774, "rewards/verify_chess_move/mean": 0.25, "rewards/verify_chess_move/std": 0.9578448534011841, "step": 18975 }, { "completion_length": 334.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 89.18046875, "completions/mean_terminated_length": 89.18046875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017176874749994117, "frac_reward_zero_std": 0.9875, "grad_norm": 18.025955200195312, "kl": 0.2648363061482087, "learning_rate": 4.0492857142857137e-07, "loss": 0.0003, "num_tokens": 1265264326.0, "reward": 0.4234375, "reward_std": 0.01225574016571045, "rewards/verify_chess_move/mean": 0.4234375, "rewards/verify_chess_move/std": 0.8780223250389099, "step": 18980 }, { "completion_length": 459.4, "completions/clipped_ratio": 0.0015625, "completions/max_length": 459.4, "completions/max_terminated_length": 353.0, "completions/mean_length": 90.2875, "completions/mean_terminated_length": 89.24447631835938, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017181399743342377, "frac_reward_zero_std": 0.9375, "grad_norm": 10.814247131347656, "kl": 1.0710878990474157, "learning_rate": 4.048888888888889e-07, "loss": 0.0011, "num_tokens": 1265579246.0, "reward": 0.3125, "reward_std": 0.05192313902080059, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.948540723323822, "step": 18985 }, { "completion_length": 444.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.6, "completions/max_terminated_length": 350.8, "completions/mean_length": 83.9609375, "completions/mean_terminated_length": 83.43024597167968, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017185924736690637, "frac_reward_zero_std": 0.99375, "grad_norm": 0.16003692150115967, "kl": 0.18849107900168746, "learning_rate": 4.0484920634920633e-07, "loss": 0.0002, "num_tokens": 1265883932.0, "reward": 0.4359375, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.4359375, "rewards/verify_chess_move/std": 0.8766697406768799, "step": 18990 }, { "completion_length": 319.2, "completions/clipped_ratio": 0.0, "completions/max_length": 319.2, "completions/max_terminated_length": 319.2, "completions/mean_length": 78.11640625, "completions/mean_terminated_length": 78.11640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017190449730038897, "frac_reward_zero_std": 0.93125, "grad_norm": 20.898530960083008, "kl": 1.2462806939845905, "learning_rate": 4.048095238095238e-07, "loss": 0.0012, "num_tokens": 1266178345.0, "reward": 0.45625, "reward_std": 0.052030496671795844, "rewards/verify_chess_move/mean": 0.45625, "rewards/verify_chess_move/std": 0.8890776872634888, "step": 18995 }, { "completion_length": 404.8, "completions/clipped_ratio": 0.0, "completions/max_length": 404.8, "completions/max_terminated_length": 404.8, "completions/mean_length": 91.1375, "completions/mean_terminated_length": 91.1375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017194974723387157, "frac_reward_zero_std": 0.94375, "grad_norm": 23.69089126586914, "kl": 1.6449778444599361, "learning_rate": 4.0476984126984124e-07, "loss": 0.0016, "num_tokens": 1266493449.0, "reward": 0.571875, "reward_std": 0.04797552525997162, "rewards/verify_chess_move/mean": 0.571875, "rewards/verify_chess_move/std": 0.8122594356536865, "step": 19000 }, { "completion_length": 387.6, "completions/clipped_ratio": 0.0, "completions/max_length": 387.6, "completions/max_terminated_length": 387.6, "completions/mean_length": 89.84296875, "completions/mean_terminated_length": 89.84296875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017199499716735418, "frac_reward_zero_std": 0.9625, "grad_norm": 17.302736282348633, "kl": 0.2192748298868537, "learning_rate": 4.047301587301587e-07, "loss": 0.0002, "num_tokens": 1266807720.0, "reward": 0.4515625, "reward_std": 0.03356248252093792, "rewards/verify_chess_move/mean": 0.4515625, "rewards/verify_chess_move/std": 0.8796605348587037, "step": 19005 }, { "completion_length": 290.4, "completions/clipped_ratio": 0.0, "completions/max_length": 290.4, "completions/max_terminated_length": 290.4, "completions/mean_length": 95.78671875, "completions/mean_terminated_length": 95.78671875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017204024710083678, "frac_reward_zero_std": 0.95625, "grad_norm": 13.877063751220703, "kl": 1.4884370212443172, "learning_rate": 4.046904761904762e-07, "loss": 0.0015, "num_tokens": 1267133295.0, "reward": 0.253125, "reward_std": 0.041397823765873906, "rewards/verify_chess_move/mean": 0.253125, "rewards/verify_chess_move/std": 0.9597575664520264, "step": 19010 }, { "completion_length": 328.6, "completions/clipped_ratio": 0.0, "completions/max_length": 328.6, "completions/max_terminated_length": 328.6, "completions/mean_length": 85.6390625, "completions/mean_terminated_length": 85.6390625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017208549703431934, "frac_reward_zero_std": 0.9625, "grad_norm": 11.475615501403809, "kl": 2.384346276265569, "learning_rate": 4.0465079365079366e-07, "loss": 0.0024, "num_tokens": 1267441241.0, "reward": 0.434375, "reward_std": 0.03130036816000938, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.8776229023933411, "step": 19015 }, { "completion_length": 312.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 87.325, "completions/mean_terminated_length": 87.325, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017213074696780194, "frac_reward_zero_std": 0.9625, "grad_norm": 19.02520751953125, "kl": 0.2726581267779693, "learning_rate": 4.0461111111111106e-07, "loss": 0.0003, "num_tokens": 1267753233.0, "reward": 0.415625, "reward_std": 0.0319843377918005, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.8880198240280152, "step": 19020 }, { "completion_length": 381.2, "completions/clipped_ratio": 0.00078125, "completions/max_length": 381.2, "completions/max_terminated_length": 285.0, "completions/mean_length": 89.6515625, "completions/mean_terminated_length": 89.1221694946289, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017217599690128454, "frac_reward_zero_std": 0.9875, "grad_norm": 0.012516183778643608, "kl": 0.7371709365979768, "learning_rate": 4.0457142857142856e-07, "loss": 0.0007, "num_tokens": 1268066507.0, "reward": 0.346875, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.346875, "rewards/verify_chess_move/std": 0.9257321715354919, "step": 19025 }, { "completion_length": 420.4, "completions/clipped_ratio": 0.0, "completions/max_length": 420.4, "completions/max_terminated_length": 420.4, "completions/mean_length": 86.96875, "completions/mean_terminated_length": 86.96875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017222124683476715, "frac_reward_zero_std": 0.9625, "grad_norm": 7.985255718231201, "kl": 0.46481532651232554, "learning_rate": 4.04531746031746e-07, "loss": 0.0005, "num_tokens": 1268374851.0, "reward": 0.39375, "reward_std": 0.03130036853253841, "rewards/verify_chess_move/mean": 0.39375, "rewards/verify_chess_move/std": 0.9083997964859009, "step": 19030 }, { "completion_length": 295.4, "completions/clipped_ratio": 0.0, "completions/max_length": 295.4, "completions/max_terminated_length": 295.4, "completions/mean_length": 92.68828125, "completions/mean_terminated_length": 92.68828125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017226649676824975, "frac_reward_zero_std": 0.95, "grad_norm": 0.05903227999806404, "kl": 0.5600932884961367, "learning_rate": 4.0449206349206347e-07, "loss": 0.0006, "num_tokens": 1268694772.0, "reward": 0.3046875, "reward_std": 0.039456214383244516, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.937523889541626, "step": 19035 }, { "completion_length": 353.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 83.3546875, "completions/mean_terminated_length": 83.3546875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017231174670173235, "frac_reward_zero_std": 0.95625, "grad_norm": 19.982053756713867, "kl": 0.22691319561563433, "learning_rate": 4.044523809523809e-07, "loss": 0.0002, "num_tokens": 1268998954.0, "reward": 0.428125, "reward_std": 0.037981899082660676, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8992509961128234, "step": 19040 }, { "completion_length": 392.8, "completions/clipped_ratio": 0.0, "completions/max_length": 392.8, "completions/max_terminated_length": 392.8, "completions/mean_length": 88.190625, "completions/mean_terminated_length": 88.190625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017235699663521495, "frac_reward_zero_std": 0.98125, "grad_norm": 2.6384618282318115, "kl": 0.39949381842743603, "learning_rate": 4.044126984126984e-07, "loss": 0.0004, "num_tokens": 1269311654.0, "reward": 0.2875, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.2875, "rewards/verify_chess_move/std": 0.9533960938453674, "step": 19045 }, { "completion_length": 259.8, "completions/clipped_ratio": 0.0, "completions/max_length": 259.8, "completions/max_terminated_length": 259.8, "completions/mean_length": 87.20546875, "completions/mean_terminated_length": 87.20546875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017240224656869755, "frac_reward_zero_std": 0.975, "grad_norm": 15.074098587036133, "kl": 0.5806483549531549, "learning_rate": 4.043730158730159e-07, "loss": 0.0006, "num_tokens": 1269621333.0, "reward": 0.4171875, "reward_std": 0.023144522309303285, "rewards/verify_chess_move/mean": 0.4171875, "rewards/verify_chess_move/std": 0.8982801556587219, "step": 19050 }, { "completion_length": 421.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 421.0, "completions/max_terminated_length": 315.2, "completions/mean_length": 85.43671875, "completions/mean_terminated_length": 84.89803771972656, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017244749650218015, "frac_reward_zero_std": 0.94375, "grad_norm": 16.492565155029297, "kl": 0.6490313725546002, "learning_rate": 4.043333333333333e-07, "loss": 0.0006, "num_tokens": 1269927044.0, "reward": 0.5375, "reward_std": 0.04976583644747734, "rewards/verify_chess_move/mean": 0.5375, "rewards/verify_chess_move/std": 0.8223935246467591, "step": 19055 }, { "completion_length": 548.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 548.4, "completions/max_terminated_length": 500.4, "completions/mean_length": 100.23984375, "completions/mean_terminated_length": 99.71900787353516, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.017249274643566275, "frac_reward_zero_std": 0.94375, "grad_norm": 5.6201605796813965, "kl": 1.37372359035071, "learning_rate": 4.042936507936508e-07, "loss": 0.0014, "num_tokens": 1270259687.0, "reward": 0.33125, "reward_std": 0.046820734068751334, "rewards/verify_chess_move/mean": 0.33125, "rewards/verify_chess_move/std": 0.9127390384674072, "step": 19060 }, { "completion_length": 353.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 353.4, "completions/max_terminated_length": 261.8, "completions/mean_length": 85.27890625, "completions/mean_terminated_length": 84.7455337524414, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017253799636914535, "frac_reward_zero_std": 0.98125, "grad_norm": 0.0075614904053509235, "kl": 0.15289437873288989, "learning_rate": 4.0425396825396825e-07, "loss": 0.0002, "num_tokens": 1270564476.0, "reward": 0.4890625, "reward_std": 0.016887324303388594, "rewards/verify_chess_move/mean": 0.4890625, "rewards/verify_chess_move/std": 0.8678268194198608, "step": 19065 }, { "completion_length": 353.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 353.4, "completions/max_terminated_length": 313.2, "completions/mean_length": 86.62265625, "completions/mean_terminated_length": 86.09572143554688, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017258324630262792, "frac_reward_zero_std": 0.96875, "grad_norm": 17.23283576965332, "kl": 0.3652904239250347, "learning_rate": 4.0421428571428565e-07, "loss": 0.0004, "num_tokens": 1270873737.0, "reward": 0.48125, "reward_std": 0.02346404492855072, "rewards/verify_chess_move/mean": 0.48125, "rewards/verify_chess_move/std": 0.8589780688285827, "step": 19070 }, { "completion_length": 271.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 89.04296875, "completions/mean_terminated_length": 89.04296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017262849623611052, "frac_reward_zero_std": 0.9875, "grad_norm": 0.003494861302897334, "kl": 0.711975920572877, "learning_rate": 4.0417460317460316e-07, "loss": 0.0007, "num_tokens": 1271186600.0, "reward": 0.315625, "reward_std": 0.012467906624078751, "rewards/verify_chess_move/mean": 0.315625, "rewards/verify_chess_move/std": 0.9426332116127014, "step": 19075 }, { "completion_length": 280.4, "completions/clipped_ratio": 0.0, "completions/max_length": 280.4, "completions/max_terminated_length": 280.4, "completions/mean_length": 81.434375, "completions/mean_terminated_length": 81.434375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017267374616959312, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0023430478759109974, "kl": 0.219228082196787, "learning_rate": 4.041349206349206e-07, "loss": 0.0002, "num_tokens": 1271485900.0, "reward": 0.509375, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.509375, "rewards/verify_chess_move/std": 0.8292558193206787, "step": 19080 }, { "completion_length": 268.8, "completions/clipped_ratio": 0.0, "completions/max_length": 268.8, "completions/max_terminated_length": 268.8, "completions/mean_length": 99.9609375, "completions/mean_terminated_length": 99.9609375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017271899610307572, "frac_reward_zero_std": 0.9625, "grad_norm": 1.002946138381958, "kl": 0.17510853796266018, "learning_rate": 4.040952380952381e-07, "loss": 0.0002, "num_tokens": 1271818970.0, "reward": 0.309375, "reward_std": 0.030145575851202012, "rewards/verify_chess_move/mean": 0.309375, "rewards/verify_chess_move/std": 0.938294506072998, "step": 19085 }, { "completion_length": 514.4, "completions/clipped_ratio": 0.0, "completions/max_length": 514.4, "completions/max_terminated_length": 514.4, "completions/mean_length": 88.92109375, "completions/mean_terminated_length": 88.92109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017276424603655832, "frac_reward_zero_std": 0.975, "grad_norm": 2.2196905612945557, "kl": 0.638581629912369, "learning_rate": 4.040555555555555e-07, "loss": 0.0006, "num_tokens": 1272132149.0, "reward": 0.4640625, "reward_std": 0.02382849156856537, "rewards/verify_chess_move/mean": 0.4640625, "rewards/verify_chess_move/std": 0.8852895379066468, "step": 19090 }, { "completion_length": 370.8, "completions/clipped_ratio": 0.0, "completions/max_length": 370.8, "completions/max_terminated_length": 370.8, "completions/mean_length": 82.93828125, "completions/mean_terminated_length": 82.93828125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017280949597004092, "frac_reward_zero_std": 0.975, "grad_norm": 0.0018122087931260467, "kl": 0.6922592274611816, "learning_rate": 4.0401587301587297e-07, "loss": 0.0007, "num_tokens": 1272433046.0, "reward": 0.4140625, "reward_std": 0.02198972962796688, "rewards/verify_chess_move/mean": 0.4140625, "rewards/verify_chess_move/std": 0.9088017106056213, "step": 19095 }, { "completion_length": 320.8, "completions/clipped_ratio": 0.0, "completions/max_length": 320.8, "completions/max_terminated_length": 320.8, "completions/mean_length": 91.1390625, "completions/mean_terminated_length": 91.1390625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017285474590352352, "frac_reward_zero_std": 0.95, "grad_norm": 0.017741519957780838, "kl": 0.16266057665925474, "learning_rate": 4.039761904761905e-07, "loss": 0.0002, "num_tokens": 1272751408.0, "reward": 0.3125, "reward_std": 0.04718419909477234, "rewards/verify_chess_move/mean": 0.3125, "rewards/verify_chess_move/std": 0.9446429371833801, "step": 19100 }, { "completion_length": 416.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 416.4, "completions/max_terminated_length": 320.4, "completions/mean_length": 88.740625, "completions/mean_terminated_length": 88.20442199707031, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017289999583700612, "frac_reward_zero_std": 0.96875, "grad_norm": 0.02615024521946907, "kl": 0.2060445645125583, "learning_rate": 4.0393650793650793e-07, "loss": 0.0002, "num_tokens": 1273065596.0, "reward": 0.325, "reward_std": 0.029613886773586274, "rewards/verify_chess_move/mean": 0.325, "rewards/verify_chess_move/std": 0.9308650493621826, "step": 19105 }, { "completion_length": 303.8, "completions/clipped_ratio": 0.0, "completions/max_length": 303.8, "completions/max_terminated_length": 303.8, "completions/mean_length": 78.9828125, "completions/mean_terminated_length": 78.9828125, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.017294524577048873, "frac_reward_zero_std": 0.96875, "grad_norm": 7.316184043884277, "kl": 0.17753251229878514, "learning_rate": 4.038968253968254e-07, "loss": 0.0002, "num_tokens": 1273362374.0, "reward": 0.359375, "reward_std": 0.027988273277878763, "rewards/verify_chess_move/mean": 0.359375, "rewards/verify_chess_move/std": 0.9321663975715637, "step": 19110 }, { "completion_length": 248.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 80.49609375, "completions/mean_terminated_length": 80.49609375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017299049570397133, "frac_reward_zero_std": 0.9875, "grad_norm": 1.339830994606018, "kl": 0.1464715498033911, "learning_rate": 4.0385714285714284e-07, "loss": 0.0001, "num_tokens": 1273659849.0, "reward": 0.428125, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.883775532245636, "step": 19115 }, { "completion_length": 405.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 405.4, "completions/max_terminated_length": 360.2, "completions/mean_length": 87.56484375, "completions/mean_terminated_length": 87.03812866210937, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017303574563745393, "frac_reward_zero_std": 0.94375, "grad_norm": 7.096429347991943, "kl": 0.314696613047272, "learning_rate": 4.038174603174603e-07, "loss": 0.0003, "num_tokens": 1273968404.0, "reward": 0.4796875, "reward_std": 0.051132792979478835, "rewards/verify_chess_move/mean": 0.4796875, "rewards/verify_chess_move/std": 0.8581032872200012, "step": 19120 }, { "completion_length": 262.4, "completions/clipped_ratio": 0.0, "completions/max_length": 262.4, "completions/max_terminated_length": 262.4, "completions/mean_length": 82.7453125, "completions/mean_terminated_length": 82.7453125, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.01730809955709365, "frac_reward_zero_std": 0.96875, "grad_norm": 6.699337482452393, "kl": 0.6324954739771783, "learning_rate": 4.0377777777777775e-07, "loss": 0.0006, "num_tokens": 1274271446.0, "reward": 0.3375, "reward_std": 0.02619796209037304, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9131881833076477, "step": 19125 }, { "completion_length": 315.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 92.20234375, "completions/mean_terminated_length": 92.20234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01731262455044191, "frac_reward_zero_std": 0.9375, "grad_norm": 10.606189727783203, "kl": 0.7868861329741776, "learning_rate": 4.037380952380952e-07, "loss": 0.0008, "num_tokens": 1274588833.0, "reward": 0.4609375, "reward_std": 0.05896911770105362, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8697861790657043, "step": 19130 }, { "completion_length": 294.2, "completions/clipped_ratio": 0.0, "completions/max_length": 294.2, "completions/max_terminated_length": 294.2, "completions/mean_length": 81.71171875, "completions/mean_terminated_length": 81.71171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01731714954379017, "frac_reward_zero_std": 0.9625, "grad_norm": 0.013431196101009846, "kl": 0.12660644473508, "learning_rate": 4.036984126984127e-07, "loss": 0.0001, "num_tokens": 1274890232.0, "reward": 0.4015625, "reward_std": 0.0344576358795166, "rewards/verify_chess_move/mean": 0.4015625, "rewards/verify_chess_move/std": 0.8978806138038635, "step": 19135 }, { "completion_length": 355.6, "completions/clipped_ratio": 0.0, "completions/max_length": 355.6, "completions/max_terminated_length": 355.6, "completions/mean_length": 92.3671875, "completions/mean_terminated_length": 92.3671875, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.01732167453713843, "frac_reward_zero_std": 0.96875, "grad_norm": 9.035446166992188, "kl": 0.22490290028508753, "learning_rate": 4.0365873015873016e-07, "loss": 0.0002, "num_tokens": 1275209790.0, "reward": 0.390625, "reward_std": 0.028247908875346183, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.8975344061851501, "step": 19140 }, { "completion_length": 390.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 390.6, "completions/max_terminated_length": 374.0, "completions/mean_length": 87.9328125, "completions/mean_terminated_length": 86.86447448730469, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01732619953048669, "frac_reward_zero_std": 0.9625, "grad_norm": 0.09672126173973083, "kl": 0.45962003187742084, "learning_rate": 4.0361904761904757e-07, "loss": 0.0005, "num_tokens": 1275519680.0, "reward": 0.434375, "reward_std": 0.02651650309562683, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.9014070868492127, "step": 19145 }, { "completion_length": 363.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 86.3140625, "completions/mean_terminated_length": 86.3140625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01733072452383495, "frac_reward_zero_std": 0.95, "grad_norm": 24.491106033325195, "kl": 0.6842452239128761, "learning_rate": 4.0357936507936507e-07, "loss": 0.0007, "num_tokens": 1275828042.0, "reward": 0.415625, "reward_std": 0.04218915067613125, "rewards/verify_chess_move/mean": 0.415625, "rewards/verify_chess_move/std": 0.895513939857483, "step": 19150 }, { "completion_length": 309.4, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/max_terminated_length": 309.4, "completions/mean_length": 94.025, "completions/mean_terminated_length": 94.025, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01733524951718321, "frac_reward_zero_std": 0.96875, "grad_norm": 17.02526092529297, "kl": 0.2927357023814693, "learning_rate": 4.035396825396825e-07, "loss": 0.0003, "num_tokens": 1276149610.0, "reward": 0.4265625, "reward_std": 0.025726158171892166, "rewards/verify_chess_move/mean": 0.4265625, "rewards/verify_chess_move/std": 0.9011099100112915, "step": 19155 }, { "completion_length": 376.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 376.0, "completions/max_terminated_length": 353.4, "completions/mean_length": 86.88359375, "completions/mean_terminated_length": 85.83616485595704, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "epoch": 0.01733977451053147, "frac_reward_zero_std": 0.95, "grad_norm": 8.775323867797852, "kl": 1.3925236269133165, "learning_rate": 4.0350000000000003e-07, "loss": 0.0014, "num_tokens": 1276459477.0, "reward": 0.440625, "reward_std": 0.048763323575258255, "rewards/verify_chess_move/mean": 0.440625, "rewards/verify_chess_move/std": 0.8787354707717896, "step": 19160 }, { "completion_length": 296.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 85.21953125, "completions/mean_terminated_length": 85.21953125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01734429950387973, "frac_reward_zero_std": 0.9625, "grad_norm": 22.608488082885742, "kl": 0.20098355039954185, "learning_rate": 4.0346031746031743e-07, "loss": 0.0002, "num_tokens": 1276768158.0, "reward": 0.478125, "reward_std": 0.03219552300870419, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8713658094406128, "step": 19165 }, { "completion_length": 305.2, "completions/clipped_ratio": 0.0, "completions/max_length": 305.2, "completions/max_terminated_length": 305.2, "completions/mean_length": 90.10859375, "completions/mean_terminated_length": 90.10859375, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.01734882449722799, "frac_reward_zero_std": 0.95625, "grad_norm": 0.001187938149087131, "kl": 0.8568507058545947, "learning_rate": 4.034206349206349e-07, "loss": 0.0009, "num_tokens": 1277081761.0, "reward": 0.428125, "reward_std": 0.03845272213220596, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8936637997627258, "step": 19170 }, { "completion_length": 370.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 370.0, "completions/max_terminated_length": 360.4, "completions/mean_length": 94.7484375, "completions/mean_terminated_length": 94.23011169433593, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01735334949057625, "frac_reward_zero_std": 0.95625, "grad_norm": 2.073556900024414, "kl": 1.0771413389127702, "learning_rate": 4.033809523809524e-07, "loss": 0.0011, "num_tokens": 1277405527.0, "reward": 0.246875, "reward_std": 0.03524798229336738, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.964926290512085, "step": 19175 }, { "completion_length": 288.2, "completions/clipped_ratio": 0.0, "completions/max_length": 288.2, "completions/max_terminated_length": 288.2, "completions/mean_length": 89.2640625, "completions/mean_terminated_length": 89.2640625, "completions/min_length": 28.8, "completions/min_terminated_length": 28.8, "epoch": 0.017357874483924507, "frac_reward_zero_std": 0.9625, "grad_norm": 0.24032120406627655, "kl": 0.6505202276865021, "learning_rate": 4.033412698412698e-07, "loss": 0.0007, "num_tokens": 1277720033.0, "reward": 0.2890625, "reward_std": 0.0299334105104208, "rewards/verify_chess_move/mean": 0.2890625, "rewards/verify_chess_move/std": 0.9475658297538757, "step": 19180 }, { "completion_length": 357.2, "completions/clipped_ratio": 0.0, "completions/max_length": 357.2, "completions/max_terminated_length": 357.2, "completions/mean_length": 90.8828125, "completions/mean_terminated_length": 90.8828125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017362399477272767, "frac_reward_zero_std": 0.96875, "grad_norm": 9.971866607666016, "kl": 0.282024267828092, "learning_rate": 4.033015873015873e-07, "loss": 0.0003, "num_tokens": 1278037091.0, "reward": 0.3453125, "reward_std": 0.030250385403633118, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9275768280029297, "step": 19185 }, { "completion_length": 317.4, "completions/clipped_ratio": 0.0, "completions/max_length": 317.4, "completions/max_terminated_length": 317.4, "completions/mean_length": 84.94296875, "completions/mean_terminated_length": 84.94296875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017366924470621027, "frac_reward_zero_std": 0.98125, "grad_norm": 13.00924015045166, "kl": 0.31512980521656575, "learning_rate": 4.0326190476190476e-07, "loss": 0.0003, "num_tokens": 1278342938.0, "reward": 0.3625, "reward_std": 0.017570312693715097, "rewards/verify_chess_move/mean": 0.3625, "rewards/verify_chess_move/std": 0.887827730178833, "step": 19190 }, { "completion_length": 310.2, "completions/clipped_ratio": 0.0, "completions/max_length": 310.2, "completions/max_terminated_length": 310.2, "completions/mean_length": 81.91328125, "completions/mean_terminated_length": 81.91328125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017371449463969287, "frac_reward_zero_std": 0.975, "grad_norm": 10.758060455322266, "kl": 0.3829999841051176, "learning_rate": 4.032222222222222e-07, "loss": 0.0004, "num_tokens": 1278644139.0, "reward": 0.3046875, "reward_std": 0.02109457477927208, "rewards/verify_chess_move/mean": 0.3046875, "rewards/verify_chess_move/std": 0.9395136833190918, "step": 19195 }, { "completion_length": 264.8, "completions/clipped_ratio": 0.0, "completions/max_length": 264.8, "completions/max_terminated_length": 264.8, "completions/mean_length": 83.96640625, "completions/mean_terminated_length": 83.96640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017375974457317547, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024917142000049353, "kl": 0.8807026051683351, "learning_rate": 4.0318253968253966e-07, "loss": 0.0009, "num_tokens": 1278949440.0, "reward": 0.434375, "reward_std": 0.029143063724040984, "rewards/verify_chess_move/mean": 0.434375, "rewards/verify_chess_move/std": 0.8978275775909423, "step": 19200 }, { "completion_length": 283.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 86.06015625, "completions/mean_terminated_length": 86.06015625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017380499450665807, "frac_reward_zero_std": 0.9625, "grad_norm": 19.890918731689453, "kl": 0.26905192006379364, "learning_rate": 4.031428571428571e-07, "loss": 0.0003, "num_tokens": 1279255797.0, "reward": 0.4703125, "reward_std": 0.03403330445289612, "rewards/verify_chess_move/mean": 0.4703125, "rewards/verify_chess_move/std": 0.8793779850006104, "step": 19205 }, { "completion_length": 330.2, "completions/clipped_ratio": 0.0, "completions/max_length": 330.2, "completions/max_terminated_length": 330.2, "completions/mean_length": 87.125, "completions/mean_terminated_length": 87.125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.017385024444014067, "frac_reward_zero_std": 0.9625, "grad_norm": 50.364479064941406, "kl": 0.8538622221676633, "learning_rate": 4.031031746031746e-07, "loss": 0.0009, "num_tokens": 1279565437.0, "reward": 0.409375, "reward_std": 0.03219552263617516, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.8997615694999694, "step": 19210 }, { "completion_length": 451.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 451.0, "completions/max_terminated_length": 354.6, "completions/mean_length": 90.97421875, "completions/mean_terminated_length": 90.45146331787109, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017389549437362328, "frac_reward_zero_std": 0.975, "grad_norm": 13.902870178222656, "kl": 0.5248669320950284, "learning_rate": 4.0306349206349203e-07, "loss": 0.0005, "num_tokens": 1279881764.0, "reward": 0.428125, "reward_std": 0.02177756354212761, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.8773520469665528, "step": 19215 }, { "completion_length": 315.2, "completions/clipped_ratio": 0.0, "completions/max_length": 315.2, "completions/max_terminated_length": 315.2, "completions/mean_length": 89.34921875, "completions/mean_terminated_length": 89.34921875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017394074430710588, "frac_reward_zero_std": 0.9375, "grad_norm": 23.52678108215332, "kl": 0.3878451412310824, "learning_rate": 4.030238095238095e-07, "loss": 0.0004, "num_tokens": 1280196059.0, "reward": 0.20625, "reward_std": 0.056707003712654115, "rewards/verify_chess_move/mean": 0.20625, "rewards/verify_chess_move/std": 0.963345468044281, "step": 19220 }, { "completion_length": 348.2, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/max_terminated_length": 348.2, "completions/mean_length": 88.6296875, "completions/mean_terminated_length": 88.6296875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017398599424058848, "frac_reward_zero_std": 0.9625, "grad_norm": 0.09254211187362671, "kl": 0.6467162651941181, "learning_rate": 4.02984126984127e-07, "loss": 0.0006, "num_tokens": 1280507337.0, "reward": 0.3875, "reward_std": 0.03130036816000938, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9160028696060181, "step": 19225 }, { "completion_length": 264.8, "completions/clipped_ratio": 0.0, "completions/max_length": 264.8, "completions/max_terminated_length": 264.8, "completions/mean_length": 86.12734375, "completions/mean_terminated_length": 86.12734375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017403124417407108, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0022600858937948942, "kl": 6.7811881642788645, "learning_rate": 4.0294444444444444e-07, "loss": 0.0068, "num_tokens": 1280815436.0, "reward": 0.246875, "reward_std": 0.010888782143592835, "rewards/verify_chess_move/mean": 0.246875, "rewards/verify_chess_move/std": 0.950872826576233, "step": 19230 }, { "completion_length": 380.4, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/max_terminated_length": 380.4, "completions/mean_length": 89.20234375, "completions/mean_terminated_length": 89.20234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017407649410755364, "frac_reward_zero_std": 0.98125, "grad_norm": 2.7430973052978516, "kl": 1.5556100399931894, "learning_rate": 4.029047619047619e-07, "loss": 0.0016, "num_tokens": 1281129367.0, "reward": 0.3734375, "reward_std": 0.01530819907784462, "rewards/verify_chess_move/mean": 0.3734375, "rewards/verify_chess_move/std": 0.9234403967857361, "step": 19235 }, { "completion_length": 394.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 394.2, "completions/max_terminated_length": 351.2, "completions/mean_length": 95.81796875, "completions/mean_terminated_length": 94.23852233886718, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017412174404103625, "frac_reward_zero_std": 0.95625, "grad_norm": 2.1611974239349365, "kl": 5.148539908952079, "learning_rate": 4.0286507936507935e-07, "loss": 0.0051, "num_tokens": 1281452662.0, "reward": 0.303125, "reward_std": 0.03866586945950985, "rewards/verify_chess_move/mean": 0.303125, "rewards/verify_chess_move/std": 0.9474705219268799, "step": 19240 }, { "completion_length": 379.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 96.78671875, "completions/mean_terminated_length": 96.78671875, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.017416699397451885, "frac_reward_zero_std": 0.95625, "grad_norm": 8.132758140563965, "kl": 2.4758498783921823, "learning_rate": 4.028253968253968e-07, "loss": 0.0025, "num_tokens": 1281779461.0, "reward": 0.278125, "reward_std": 0.04092700034379959, "rewards/verify_chess_move/mean": 0.278125, "rewards/verify_chess_move/std": 0.9399351954460144, "step": 19245 }, { "completion_length": 383.4, "completions/clipped_ratio": 0.0, "completions/max_length": 383.4, "completions/max_terminated_length": 383.4, "completions/mean_length": 79.9, "completions/mean_terminated_length": 79.9, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017421224390800145, "frac_reward_zero_std": 0.9625, "grad_norm": 1.3826920986175537, "kl": 3.2738310088636355, "learning_rate": 4.027857142857143e-07, "loss": 0.0033, "num_tokens": 1282075085.0, "reward": 0.4546875, "reward_std": 0.03082856573164463, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8868678212165833, "step": 19250 }, { "completion_length": 277.4, "completions/clipped_ratio": 0.0, "completions/max_length": 277.4, "completions/max_terminated_length": 277.4, "completions/mean_length": 87.31328125, "completions/mean_terminated_length": 87.31328125, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.017425749384148405, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002238909015432, "kl": 0.7404167635366321, "learning_rate": 4.027460317460317e-07, "loss": 0.0007, "num_tokens": 1282384542.0, "reward": 0.4046875, "reward_std": 0.02414703369140625, "rewards/verify_chess_move/mean": 0.4046875, "rewards/verify_chess_move/std": 0.9149090766906738, "step": 19255 }, { "completion_length": 399.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 85.084375, "completions/mean_terminated_length": 85.084375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017430274377496665, "frac_reward_zero_std": 0.94375, "grad_norm": 29.37782859802246, "kl": 4.085690222075209, "learning_rate": 4.0270634920634917e-07, "loss": 0.0041, "num_tokens": 1282691362.0, "reward": 0.4328125, "reward_std": 0.04818769320845604, "rewards/verify_chess_move/mean": 0.4328125, "rewards/verify_chess_move/std": 0.9007981419563293, "step": 19260 }, { "completion_length": 478.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 478.6, "completions/max_terminated_length": 378.6, "completions/mean_length": 83.85, "completions/mean_terminated_length": 83.30977783203124, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.017434799370844925, "frac_reward_zero_std": 0.9125, "grad_norm": 19.975692749023438, "kl": 4.090662627713755, "learning_rate": 4.0266666666666667e-07, "loss": 0.0041, "num_tokens": 1282995714.0, "reward": 0.30625, "reward_std": 0.06755086220800877, "rewards/verify_chess_move/mean": 0.30625, "rewards/verify_chess_move/std": 0.9512992382049561, "step": 19265 }, { "completion_length": 483.2, "completions/clipped_ratio": 0.00234375, "completions/max_length": 483.2, "completions/max_terminated_length": 288.0, "completions/mean_length": 89.60703125, "completions/mean_terminated_length": 88.01066741943359, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017439324364193185, "frac_reward_zero_std": 0.975, "grad_norm": 8.722922325134277, "kl": 4.449360462336335, "learning_rate": 4.0262698412698407e-07, "loss": 0.0044, "num_tokens": 1283309011.0, "reward": 0.328125, "reward_std": 0.02130674198269844, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9419077157974243, "step": 19270 }, { "completion_length": 368.6, "completions/clipped_ratio": 0.0, "completions/max_length": 368.6, "completions/max_terminated_length": 368.6, "completions/mean_length": 94.065625, "completions/mean_terminated_length": 94.065625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017443849357541445, "frac_reward_zero_std": 0.95, "grad_norm": 16.66871452331543, "kl": 6.07108821098227, "learning_rate": 4.025873015873016e-07, "loss": 0.0061, "num_tokens": 1283632087.0, "reward": 0.29375, "reward_std": 0.04376827478408814, "rewards/verify_chess_move/mean": 0.29375, "rewards/verify_chess_move/std": 0.9409013509750366, "step": 19275 }, { "completion_length": 324.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 83.3671875, "completions/mean_terminated_length": 83.3671875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017448374350889705, "frac_reward_zero_std": 0.95, "grad_norm": 30.46648597717285, "kl": 5.20236309056636, "learning_rate": 4.0254761904761903e-07, "loss": 0.0052, "num_tokens": 1283934797.0, "reward": 0.3984375, "reward_std": 0.04308528564870358, "rewards/verify_chess_move/mean": 0.3984375, "rewards/verify_chess_move/std": 0.8995887279510498, "step": 19280 }, { "completion_length": 262.2, "completions/clipped_ratio": 0.0, "completions/max_length": 262.2, "completions/max_terminated_length": 262.2, "completions/mean_length": 86.0546875, "completions/mean_terminated_length": 86.0546875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017452899344237965, "frac_reward_zero_std": 0.975, "grad_norm": 4.055591106414795, "kl": 2.422678609052673, "learning_rate": 4.025079365079365e-07, "loss": 0.0024, "num_tokens": 1284243211.0, "reward": 0.45, "reward_std": 0.02177756354212761, "rewards/verify_chess_move/mean": 0.45, "rewards/verify_chess_move/std": 0.8857655763626099, "step": 19285 }, { "completion_length": 294.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 88.303125, "completions/mean_terminated_length": 88.303125, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017457424337586222, "frac_reward_zero_std": 0.95625, "grad_norm": 3.6342570781707764, "kl": 0.8423698944272473, "learning_rate": 4.0246825396825394e-07, "loss": 0.0008, "num_tokens": 1284556471.0, "reward": 0.3609375, "reward_std": 0.037298910319805145, "rewards/verify_chess_move/mean": 0.3609375, "rewards/verify_chess_move/std": 0.9192441463470459, "step": 19290 }, { "completion_length": 272.4, "completions/clipped_ratio": 0.0, "completions/max_length": 272.4, "completions/max_terminated_length": 272.4, "completions/mean_length": 87.8609375, "completions/mean_terminated_length": 87.8609375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017461949330934482, "frac_reward_zero_std": 0.95625, "grad_norm": 0.6879158020019531, "kl": 1.029654531367123, "learning_rate": 4.024285714285714e-07, "loss": 0.001, "num_tokens": 1284866341.0, "reward": 0.4640625, "reward_std": 0.03503581620752812, "rewards/verify_chess_move/mean": 0.4640625, "rewards/verify_chess_move/std": 0.8759124875068665, "step": 19295 }, { "completion_length": 342.8, "completions/clipped_ratio": 0.0, "completions/max_length": 342.8, "completions/max_terminated_length": 342.8, "completions/mean_length": 83.36640625, "completions/mean_terminated_length": 83.36640625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017466474324282742, "frac_reward_zero_std": 0.93125, "grad_norm": 13.581896781921387, "kl": 1.8462942719226703, "learning_rate": 4.023888888888889e-07, "loss": 0.0018, "num_tokens": 1285169266.0, "reward": 0.46875, "reward_std": 0.05928864106535912, "rewards/verify_chess_move/mean": 0.46875, "rewards/verify_chess_move/std": 0.8480712413787842, "step": 19300 }, { "completion_length": 328.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 91.2109375, "completions/mean_terminated_length": 91.2109375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017470999317631002, "frac_reward_zero_std": 0.975, "grad_norm": 0.004604637622833252, "kl": 0.2596985698794015, "learning_rate": 4.023492063492063e-07, "loss": 0.0003, "num_tokens": 1285486008.0, "reward": 0.3796875, "reward_std": 0.019044627994298936, "rewards/verify_chess_move/mean": 0.3796875, "rewards/verify_chess_move/std": 0.8895658612251282, "step": 19305 }, { "completion_length": 302.4, "completions/clipped_ratio": 0.0, "completions/max_length": 302.4, "completions/max_terminated_length": 302.4, "completions/mean_length": 85.24765625, "completions/mean_terminated_length": 85.24765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017475524310979262, "frac_reward_zero_std": 0.975, "grad_norm": 0.0018110175151377916, "kl": 0.7306576711824164, "learning_rate": 4.0230952380952376e-07, "loss": 0.0007, "num_tokens": 1285792813.0, "reward": 0.4734375, "reward_std": 0.02109457515180111, "rewards/verify_chess_move/mean": 0.4734375, "rewards/verify_chess_move/std": 0.8686610221862793, "step": 19310 }, { "completion_length": 369.8, "completions/clipped_ratio": 0.0, "completions/max_length": 369.8, "completions/max_terminated_length": 369.8, "completions/mean_length": 91.46328125, "completions/mean_terminated_length": 91.46328125, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017480049304327522, "frac_reward_zero_std": 0.9875, "grad_norm": 2.2102789878845215, "kl": 0.2787095707608387, "learning_rate": 4.0226984126984127e-07, "loss": 0.0003, "num_tokens": 1286110870.0, "reward": 0.3390625, "reward_std": 0.01225574016571045, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9199816942214966, "step": 19315 }, { "completion_length": 357.8, "completions/clipped_ratio": 0.0, "completions/max_length": 357.8, "completions/max_terminated_length": 357.8, "completions/mean_length": 91.5078125, "completions/mean_terminated_length": 91.5078125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.017484574297675783, "frac_reward_zero_std": 0.9625, "grad_norm": 8.929285049438477, "kl": 0.2683383092284203, "learning_rate": 4.022301587301587e-07, "loss": 0.0003, "num_tokens": 1286427504.0, "reward": 0.3546875, "reward_std": 0.030617379024624823, "rewards/verify_chess_move/mean": 0.3546875, "rewards/verify_chess_move/std": 0.9312562108039856, "step": 19320 }, { "completion_length": 368.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 90.6625, "completions/mean_terminated_length": 90.6625, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.017489099291024043, "frac_reward_zero_std": 0.95625, "grad_norm": 0.7289367914199829, "kl": 0.33575722500681876, "learning_rate": 4.0219047619047617e-07, "loss": 0.0003, "num_tokens": 1286741840.0, "reward": 0.44375, "reward_std": 0.038452721387147906, "rewards/verify_chess_move/mean": 0.44375, "rewards/verify_chess_move/std": 0.8864491701126098, "step": 19325 }, { "completion_length": 353.2, "completions/clipped_ratio": 0.0, "completions/max_length": 353.2, "completions/max_terminated_length": 353.2, "completions/mean_length": 83.4609375, "completions/mean_terminated_length": 83.4609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017493624284372303, "frac_reward_zero_std": 0.96875, "grad_norm": 10.234353065490723, "kl": 0.6558278064243496, "learning_rate": 4.0215079365079363e-07, "loss": 0.0007, "num_tokens": 1287043190.0, "reward": 0.4890625, "reward_std": 0.024831003695726394, "rewards/verify_chess_move/mean": 0.4890625, "rewards/verify_chess_move/std": 0.8496184349060059, "step": 19330 }, { "completion_length": 444.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.8, "completions/max_terminated_length": 430.6, "completions/mean_length": 87.621875, "completions/mean_terminated_length": 87.08682098388672, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017498149277720563, "frac_reward_zero_std": 0.95625, "grad_norm": 12.150105476379395, "kl": 0.13923607461620122, "learning_rate": 4.021111111111111e-07, "loss": 0.0001, "num_tokens": 1287353714.0, "reward": 0.35625, "reward_std": 0.03640277422964573, "rewards/verify_chess_move/mean": 0.35625, "rewards/verify_chess_move/std": 0.9267928123474121, "step": 19335 }, { "completion_length": 280.8, "completions/clipped_ratio": 0.0, "completions/max_length": 280.8, "completions/max_terminated_length": 280.8, "completions/mean_length": 85.796875, "completions/mean_terminated_length": 85.796875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017502674271068823, "frac_reward_zero_std": 0.95625, "grad_norm": 0.07879363745450974, "kl": 0.9418396503664553, "learning_rate": 4.0207142857142854e-07, "loss": 0.0009, "num_tokens": 1287661182.0, "reward": 0.4734375, "reward_std": 0.04050364941358566, "rewards/verify_chess_move/mean": 0.4734375, "rewards/verify_chess_move/std": 0.8533820867538452, "step": 19340 }, { "completion_length": 284.8, "completions/clipped_ratio": 0.0, "completions/max_length": 284.8, "completions/max_terminated_length": 284.8, "completions/mean_length": 89.97265625, "completions/mean_terminated_length": 89.97265625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01750719926441708, "frac_reward_zero_std": 0.9375, "grad_norm": 3.6722190380096436, "kl": 0.5829719272907823, "learning_rate": 4.02031746031746e-07, "loss": 0.0006, "num_tokens": 1287976547.0, "reward": 0.390625, "reward_std": 0.05534102618694305, "rewards/verify_chess_move/mean": 0.390625, "rewards/verify_chess_move/std": 0.9134839177131653, "step": 19345 }, { "completion_length": 302.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 92.15390625, "completions/mean_terminated_length": 92.15390625, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.01751172425776534, "frac_reward_zero_std": 0.94375, "grad_norm": 22.937044143676758, "kl": 0.4480680227279663, "learning_rate": 4.019920634920635e-07, "loss": 0.0004, "num_tokens": 1288294784.0, "reward": 0.5125, "reward_std": 0.045925579592585565, "rewards/verify_chess_move/mean": 0.5125, "rewards/verify_chess_move/std": 0.8495750546455383, "step": 19350 }, { "completion_length": 441.6, "completions/clipped_ratio": 0.0, "completions/max_length": 441.6, "completions/max_terminated_length": 441.6, "completions/mean_length": 87.79609375, "completions/mean_terminated_length": 87.79609375, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.0175162492511136, "frac_reward_zero_std": 0.95, "grad_norm": 10.654855728149414, "kl": 0.8785892789717764, "learning_rate": 4.0195238095238095e-07, "loss": 0.0009, "num_tokens": 1288605083.0, "reward": 0.5125, "reward_std": 0.04171832874417305, "rewards/verify_chess_move/mean": 0.5125, "rewards/verify_chess_move/std": 0.8531105637550354, "step": 19355 }, { "completion_length": 472.8, "completions/clipped_ratio": 0.00078125, "completions/max_length": 472.8, "completions/max_terminated_length": 373.4, "completions/mean_length": 88.0671875, "completions/mean_terminated_length": 87.53697357177734, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.01752077424446186, "frac_reward_zero_std": 0.91875, "grad_norm": 17.460567474365234, "kl": 1.137399513926357, "learning_rate": 4.0191269841269835e-07, "loss": 0.0011, "num_tokens": 1288916089.0, "reward": 0.334375, "reward_std": 0.0720161847770214, "rewards/verify_chess_move/mean": 0.334375, "rewards/verify_chess_move/std": 0.9293598294258117, "step": 19360 }, { "completion_length": 388.6, "completions/clipped_ratio": 0.0, "completions/max_length": 388.6, "completions/max_terminated_length": 388.6, "completions/mean_length": 90.7828125, "completions/mean_terminated_length": 90.7828125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01752529923781012, "frac_reward_zero_std": 0.95625, "grad_norm": 0.05277717486023903, "kl": 0.8012010090518743, "learning_rate": 4.0187301587301586e-07, "loss": 0.0008, "num_tokens": 1289231187.0, "reward": 0.409375, "reward_std": 0.03866586834192276, "rewards/verify_chess_move/mean": 0.409375, "rewards/verify_chess_move/std": 0.8989960432052613, "step": 19365 }, { "completion_length": 297.4, "completions/clipped_ratio": 0.0, "completions/max_length": 297.4, "completions/max_terminated_length": 297.4, "completions/mean_length": 81.28671875, "completions/mean_terminated_length": 81.28671875, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.01752982423115838, "frac_reward_zero_std": 0.975, "grad_norm": 2.1724088191986084, "kl": 0.5313170852372423, "learning_rate": 4.018333333333333e-07, "loss": 0.0005, "num_tokens": 1289530490.0, "reward": 0.50625, "reward_std": 0.024935813248157503, "rewards/verify_chess_move/mean": 0.50625, "rewards/verify_chess_move/std": 0.8381917357444764, "step": 19370 }, { "completion_length": 517.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 517.0, "completions/max_terminated_length": 458.2, "completions/mean_length": 95.9578125, "completions/mean_terminated_length": 94.91571197509765, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.01753434922450664, "frac_reward_zero_std": 0.95, "grad_norm": 0.006761620752513409, "kl": 0.9179654340725392, "learning_rate": 4.017936507936508e-07, "loss": 0.0009, "num_tokens": 1289854796.0, "reward": 0.203125, "reward_std": 0.04013920240104198, "rewards/verify_chess_move/mean": 0.203125, "rewards/verify_chess_move/std": 0.9502345800399781, "step": 19375 }, { "completion_length": 335.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 81.67578125, "completions/mean_terminated_length": 81.67578125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0175388742178549, "frac_reward_zero_std": 0.96875, "grad_norm": 0.040141861885786057, "kl": 0.16277773689944297, "learning_rate": 4.017539682539682e-07, "loss": 0.0002, "num_tokens": 1290154549.0, "reward": 0.446875, "reward_std": 0.0275639396160841, "rewards/verify_chess_move/mean": 0.446875, "rewards/verify_chess_move/std": 0.8907755732536315, "step": 19380 }, { "completion_length": 264.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 89.5453125, "completions/mean_terminated_length": 89.5453125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01754339921120316, "frac_reward_zero_std": 0.975, "grad_norm": 1.380858302116394, "kl": 0.32490453254431484, "learning_rate": 4.017142857142857e-07, "loss": 0.0003, "num_tokens": 1290467375.0, "reward": 0.478125, "reward_std": 0.01767766885459423, "rewards/verify_chess_move/mean": 0.478125, "rewards/verify_chess_move/std": 0.8675707936286926, "step": 19385 }, { "completion_length": 312.2, "completions/clipped_ratio": 0.0, "completions/max_length": 312.2, "completions/max_terminated_length": 312.2, "completions/mean_length": 94.64140625, "completions/mean_terminated_length": 94.64140625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.01754792420455142, "frac_reward_zero_std": 0.9625, "grad_norm": 0.8875670433044434, "kl": 0.5691262340173125, "learning_rate": 4.016746031746032e-07, "loss": 0.0006, "num_tokens": 1290789524.0, "reward": 0.4203125, "reward_std": 0.03540124297142029, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.8941986560821533, "step": 19390 }, { "completion_length": 281.6, "completions/clipped_ratio": 0.0, "completions/max_length": 281.6, "completions/max_terminated_length": 281.6, "completions/mean_length": 91.9640625, "completions/mean_terminated_length": 91.9640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01755244919789968, "frac_reward_zero_std": 0.9625, "grad_norm": 5.86818265914917, "kl": 0.3333254511468112, "learning_rate": 4.016349206349206e-07, "loss": 0.0003, "num_tokens": 1291107614.0, "reward": 0.4, "reward_std": 0.031040730699896813, "rewards/verify_chess_move/mean": 0.4, "rewards/verify_chess_move/std": 0.8993873000144958, "step": 19395 }, { "completion_length": 340.4, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/max_terminated_length": 340.4, "completions/mean_length": 88.29765625, "completions/mean_terminated_length": 88.29765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01755697419124794, "frac_reward_zero_std": 0.99375, "grad_norm": 1.3100817203521729, "kl": 0.1862169418251142, "learning_rate": 4.015952380952381e-07, "loss": 0.0002, "num_tokens": 1291418707.0, "reward": 0.3890625, "reward_std": 0.004419417306780815, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9133622050285339, "step": 19400 }, { "completion_length": 294.8, "completions/clipped_ratio": 0.0, "completions/max_length": 294.8, "completions/max_terminated_length": 294.8, "completions/mean_length": 92.8390625, "completions/mean_terminated_length": 92.8390625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017561499184596197, "frac_reward_zero_std": 0.9875, "grad_norm": 0.20133739709854126, "kl": 0.19435369677376002, "learning_rate": 4.0155555555555554e-07, "loss": 0.0002, "num_tokens": 1291738421.0, "reward": 0.3875, "reward_std": 0.00883883461356163, "rewards/verify_chess_move/mean": 0.3875, "rewards/verify_chess_move/std": 0.9139845728874206, "step": 19405 }, { "completion_length": 288.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 89.51796875, "completions/mean_terminated_length": 89.51796875, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017566024177944457, "frac_reward_zero_std": 0.975, "grad_norm": 12.352751731872559, "kl": 0.45804364925716073, "learning_rate": 4.01515873015873e-07, "loss": 0.0005, "num_tokens": 1292050668.0, "reward": 0.4546875, "reward_std": 0.019044627621769904, "rewards/verify_chess_move/mean": 0.4546875, "rewards/verify_chess_move/std": 0.8722794771194458, "step": 19410 }, { "completion_length": 295.4, "completions/clipped_ratio": 0.0, "completions/max_length": 295.4, "completions/max_terminated_length": 295.4, "completions/mean_length": 94.60234375, "completions/mean_terminated_length": 94.60234375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017570549171292717, "frac_reward_zero_std": 0.95, "grad_norm": 8.423346519470215, "kl": 0.38553792648017404, "learning_rate": 4.0147619047619045e-07, "loss": 0.0004, "num_tokens": 1292372055.0, "reward": 0.375, "reward_std": 0.04671337716281414, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9034479260444641, "step": 19415 }, { "completion_length": 352.6, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/max_terminated_length": 352.6, "completions/mean_length": 83.12890625, "completions/mean_terminated_length": 83.12890625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017575074164640977, "frac_reward_zero_std": 0.96875, "grad_norm": 0.009939503856003284, "kl": 0.1538550986442715, "learning_rate": 4.014365079365079e-07, "loss": 0.0002, "num_tokens": 1292673548.0, "reward": 0.5171875, "reward_std": 0.02777610644698143, "rewards/verify_chess_move/mean": 0.5171875, "rewards/verify_chess_move/std": 0.8230860829353333, "step": 19420 }, { "completion_length": 404.6, "completions/clipped_ratio": 0.0, "completions/max_length": 404.6, "completions/max_terminated_length": 404.6, "completions/mean_length": 98.49453125, "completions/mean_terminated_length": 98.49453125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017579599157989238, "frac_reward_zero_std": 0.975, "grad_norm": 0.17278127372264862, "kl": 0.9459734008647501, "learning_rate": 4.013968253968254e-07, "loss": 0.0009, "num_tokens": 1293004021.0, "reward": 0.328125, "reward_std": 0.023356688022613526, "rewards/verify_chess_move/mean": 0.328125, "rewards/verify_chess_move/std": 0.9366427898406983, "step": 19425 }, { "completion_length": 444.6, "completions/clipped_ratio": 0.00078125, "completions/max_length": 444.6, "completions/max_terminated_length": 402.0, "completions/mean_length": 93.99375, "completions/mean_terminated_length": 93.47228088378907, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017584124151337498, "frac_reward_zero_std": 0.91875, "grad_norm": 24.168771743774414, "kl": 2.3040140451514164, "learning_rate": 4.013571428571428e-07, "loss": 0.0023, "num_tokens": 1293325365.0, "reward": 0.403125, "reward_std": 0.07222736924886704, "rewards/verify_chess_move/mean": 0.403125, "rewards/verify_chess_move/std": 0.9052759170532226, "step": 19430 }, { "completion_length": 383.2, "completions/clipped_ratio": 0.0, "completions/max_length": 383.2, "completions/max_terminated_length": 383.2, "completions/mean_length": 89.915625, "completions/mean_terminated_length": 89.915625, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017588649144685758, "frac_reward_zero_std": 0.98125, "grad_norm": 2.184659481048584, "kl": 0.8039769566734322, "learning_rate": 4.0131746031746027e-07, "loss": 0.0008, "num_tokens": 1293639401.0, "reward": 0.40625, "reward_std": 0.01552036553621292, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.9060974478721618, "step": 19435 }, { "completion_length": 271.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 88.86640625, "completions/mean_terminated_length": 88.86640625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017593174138034018, "frac_reward_zero_std": 0.94375, "grad_norm": 5.610726356506348, "kl": 0.7368432720657438, "learning_rate": 4.0127777777777777e-07, "loss": 0.0007, "num_tokens": 1293951750.0, "reward": 0.2859375, "reward_std": 0.04250867366790771, "rewards/verify_chess_move/mean": 0.2859375, "rewards/verify_chess_move/std": 0.9376341223716735, "step": 19440 }, { "completion_length": 470.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 96.3546875, "completions/mean_terminated_length": 96.3546875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.017597699131382278, "frac_reward_zero_std": 0.95, "grad_norm": 0.003244087565690279, "kl": 3.35858913899865, "learning_rate": 4.0123809523809523e-07, "loss": 0.0034, "num_tokens": 1294278268.0, "reward": 0.384375, "reward_std": 0.04171832762658596, "rewards/verify_chess_move/mean": 0.384375, "rewards/verify_chess_move/std": 0.9099930047988891, "step": 19445 }, { "completion_length": 344.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 89.88203125, "completions/mean_terminated_length": 89.88203125, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.017602224124730538, "frac_reward_zero_std": 0.9875, "grad_norm": 0.001187948859296739, "kl": 0.17740059006027878, "learning_rate": 4.011984126984127e-07, "loss": 0.0002, "num_tokens": 1294594557.0, "reward": 0.4921875, "reward_std": 0.011100948229432106, "rewards/verify_chess_move/mean": 0.4921875, "rewards/verify_chess_move/std": 0.8453811287879944, "step": 19450 }, { "completion_length": 395.2, "completions/clipped_ratio": 0.0, "completions/max_length": 395.2, "completions/max_terminated_length": 395.2, "completions/mean_length": 89.546875, "completions/mean_terminated_length": 89.546875, "completions/min_length": 29.6, "completions/min_terminated_length": 29.6, "epoch": 0.017606749118078798, "frac_reward_zero_std": 0.95, "grad_norm": 0.0021974865812808275, "kl": 0.8532187439617701, "learning_rate": 4.0115873015873014e-07, "loss": 0.0009, "num_tokens": 1294907873.0, "reward": 0.375, "reward_std": 0.04218915067613125, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.9256083130836487, "step": 19455 }, { "completion_length": 265.6, "completions/clipped_ratio": 0.0, "completions/max_length": 265.6, "completions/max_terminated_length": 265.6, "completions/mean_length": 88.41328125, "completions/mean_terminated_length": 88.41328125, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017611274111427055, "frac_reward_zero_std": 0.98125, "grad_norm": 0.2455480545759201, "kl": 0.2808697790838778, "learning_rate": 4.011190476190476e-07, "loss": 0.0003, "num_tokens": 1295219690.0, "reward": 0.45625, "reward_std": 0.01462521031498909, "rewards/verify_chess_move/mean": 0.45625, "rewards/verify_chess_move/std": 0.8692331433296203, "step": 19460 }, { "completion_length": 324.6, "completions/clipped_ratio": 0.0, "completions/max_length": 324.6, "completions/max_terminated_length": 324.6, "completions/mean_length": 83.91171875, "completions/mean_terminated_length": 83.91171875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017615799104775315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023911530151963234, "kl": 0.11664636626373977, "learning_rate": 4.010793650793651e-07, "loss": 0.0001, "num_tokens": 1295522721.0, "reward": 0.35, "reward_std": 0.0, "rewards/verify_chess_move/mean": 0.35, "rewards/verify_chess_move/std": 0.9290006875991821, "step": 19465 }, { "completion_length": 445.8, "completions/clipped_ratio": 0.0, "completions/max_length": 445.8, "completions/max_terminated_length": 445.8, "completions/mean_length": 87.215625, "completions/mean_terminated_length": 87.215625, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017620324098123575, "frac_reward_zero_std": 0.95, "grad_norm": 6.656869411468506, "kl": 0.622638997703325, "learning_rate": 4.010396825396825e-07, "loss": 0.0006, "num_tokens": 1295832445.0, "reward": 0.28125, "reward_std": 0.04261348247528076, "rewards/verify_chess_move/mean": 0.28125, "rewards/verify_chess_move/std": 0.9271968960762024, "step": 19470 }, { "completion_length": 439.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 439.4, "completions/max_terminated_length": 365.2, "completions/mean_length": 88.43671875, "completions/mean_terminated_length": 87.90235443115235, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017624849091471835, "frac_reward_zero_std": 0.94375, "grad_norm": 0.6548981070518494, "kl": 1.6362238475587219, "learning_rate": 4.01e-07, "loss": 0.0016, "num_tokens": 1296143228.0, "reward": 0.3265625, "reward_std": 0.04865851476788521, "rewards/verify_chess_move/mean": 0.3265625, "rewards/verify_chess_move/std": 0.939540708065033, "step": 19475 }, { "completion_length": 351.4, "completions/clipped_ratio": 0.0, "completions/max_length": 351.4, "completions/max_terminated_length": 351.4, "completions/mean_length": 96.65859375, "completions/mean_terminated_length": 96.65859375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017629374084820095, "frac_reward_zero_std": 0.975, "grad_norm": 3.360414505004883, "kl": 0.35942876499611887, "learning_rate": 4.0096031746031746e-07, "loss": 0.0004, "num_tokens": 1296469151.0, "reward": 0.296875, "reward_std": 0.02041158601641655, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9363545179367065, "step": 19480 }, { "completion_length": 270.6, "completions/clipped_ratio": 0.0, "completions/max_length": 270.6, "completions/max_terminated_length": 270.6, "completions/mean_length": 82.9640625, "completions/mean_terminated_length": 82.9640625, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017633899078168355, "frac_reward_zero_std": 0.95, "grad_norm": 2.472153902053833, "kl": 1.3996797410771251, "learning_rate": 4.0092063492063486e-07, "loss": 0.0014, "num_tokens": 1296770849.0, "reward": 0.3390625, "reward_std": 0.03761745169758797, "rewards/verify_chess_move/mean": 0.3390625, "rewards/verify_chess_move/std": 0.9353012084960938, "step": 19485 }, { "completion_length": 334.2, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/max_terminated_length": 334.2, "completions/mean_length": 89.96015625, "completions/mean_terminated_length": 89.96015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017638424071516615, "frac_reward_zero_std": 0.9625, "grad_norm": 3.6204376220703125, "kl": 0.7472873979015275, "learning_rate": 4.0088095238095237e-07, "loss": 0.0007, "num_tokens": 1297086246.0, "reward": 0.3640625, "reward_std": 0.031983356922864914, "rewards/verify_chess_move/mean": 0.3640625, "rewards/verify_chess_move/std": 0.9179085612297058, "step": 19490 }, { "completion_length": 312.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 90.196875, "completions/mean_terminated_length": 90.196875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017642949064864875, "frac_reward_zero_std": 0.9875, "grad_norm": 44.67755126953125, "kl": 0.1476938068633899, "learning_rate": 4.008412698412698e-07, "loss": 0.0001, "num_tokens": 1297401626.0, "reward": 0.4296875, "reward_std": 0.01225574016571045, "rewards/verify_chess_move/mean": 0.4296875, "rewards/verify_chess_move/std": 0.8978821754455566, "step": 19495 }, { "completion_length": 351.6, "completions/clipped_ratio": 0.0, "completions/max_length": 351.6, "completions/max_terminated_length": 351.6, "completions/mean_length": 91.4203125, "completions/mean_terminated_length": 91.4203125, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017647474058213136, "frac_reward_zero_std": 0.96875, "grad_norm": 1.7637873888015747, "kl": 0.5081580123165622, "learning_rate": 4.0080158730158733e-07, "loss": 0.0005, "num_tokens": 1297717084.0, "reward": 0.3375, "reward_std": 0.031663833558559416, "rewards/verify_chess_move/mean": 0.3375, "rewards/verify_chess_move/std": 0.9182679653167725, "step": 19500 }, { "completion_length": 399.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 92.66171875, "completions/mean_terminated_length": 92.66171875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017651999051561396, "frac_reward_zero_std": 0.975, "grad_norm": 4.896932601928711, "kl": 0.17244542890693992, "learning_rate": 4.0076190476190473e-07, "loss": 0.0002, "num_tokens": 1298034155.0, "reward": 0.296875, "reward_std": 0.021306741610169412, "rewards/verify_chess_move/mean": 0.296875, "rewards/verify_chess_move/std": 0.9520601868629456, "step": 19505 }, { "completion_length": 379.6, "completions/clipped_ratio": 0.0015625, "completions/max_length": 379.6, "completions/max_terminated_length": 338.2, "completions/mean_length": 89.4953125, "completions/mean_terminated_length": 88.44900970458984, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017656524044909656, "frac_reward_zero_std": 0.95625, "grad_norm": 0.09752342104911804, "kl": 6.097280546091497, "learning_rate": 4.007222222222222e-07, "loss": 0.0061, "num_tokens": 1298346821.0, "reward": 0.3453125, "reward_std": 0.033669838309288026, "rewards/verify_chess_move/mean": 0.3453125, "rewards/verify_chess_move/std": 0.9017732381820679, "step": 19510 }, { "completion_length": 391.4, "completions/clipped_ratio": 0.00078125, "completions/max_length": 391.4, "completions/max_terminated_length": 289.8, "completions/mean_length": 93.35625, "completions/mean_terminated_length": 92.8298843383789, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017661049038257912, "frac_reward_zero_std": 0.95625, "grad_norm": 2.034660577774048, "kl": 0.5112357081845402, "learning_rate": 4.006825396825397e-07, "loss": 0.0005, "num_tokens": 1298667053.0, "reward": 0.4203125, "reward_std": 0.033669837936759, "rewards/verify_chess_move/mean": 0.4203125, "rewards/verify_chess_move/std": 0.9080496788024902, "step": 19515 }, { "completion_length": 330.6, "completions/clipped_ratio": 0.0, "completions/max_length": 330.6, "completions/max_terminated_length": 330.6, "completions/mean_length": 95.03203125, "completions/mean_terminated_length": 95.03203125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017665574031606172, "frac_reward_zero_std": 0.98125, "grad_norm": 23.108354568481445, "kl": 0.8855560444062576, "learning_rate": 4.006428571428571e-07, "loss": 0.0009, "num_tokens": 1298988158.0, "reward": 0.2828125, "reward_std": 0.01530819907784462, "rewards/verify_chess_move/mean": 0.2828125, "rewards/verify_chess_move/std": 0.9494544863700867, "step": 19520 }, { "completion_length": 405.6, "completions/clipped_ratio": 0.0, "completions/max_length": 405.6, "completions/max_terminated_length": 405.6, "completions/mean_length": 87.778125, "completions/mean_terminated_length": 87.778125, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017670099024954432, "frac_reward_zero_std": 0.9625, "grad_norm": 5.055538177490234, "kl": 1.8866420848993584, "learning_rate": 4.006031746031746e-07, "loss": 0.0019, "num_tokens": 1299297754.0, "reward": 0.3140625, "reward_std": 0.032667326182127, "rewards/verify_chess_move/mean": 0.3140625, "rewards/verify_chess_move/std": 0.9280728459358215, "step": 19525 }, { "completion_length": 323.2, "completions/clipped_ratio": 0.0, "completions/max_length": 323.2, "completions/max_terminated_length": 323.2, "completions/mean_length": 85.3796875, "completions/mean_terminated_length": 85.3796875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017674624018302693, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0020118854008615017, "kl": 0.6348035125527531, "learning_rate": 4.0056349206349205e-07, "loss": 0.0006, "num_tokens": 1299604120.0, "reward": 0.3015625, "reward_std": 0.028930898010730743, "rewards/verify_chess_move/mean": 0.3015625, "rewards/verify_chess_move/std": 0.9408122420310974, "step": 19530 }, { "completion_length": 325.4, "completions/clipped_ratio": 0.0, "completions/max_length": 325.4, "completions/max_terminated_length": 325.4, "completions/mean_length": 86.3046875, "completions/mean_terminated_length": 86.3046875, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.017679149011650953, "frac_reward_zero_std": 0.98125, "grad_norm": 16.220258712768555, "kl": 0.15613455404527485, "learning_rate": 4.005238095238095e-07, "loss": 0.0002, "num_tokens": 1299913462.0, "reward": 0.4390625, "reward_std": 0.015992168709635733, "rewards/verify_chess_move/mean": 0.4390625, "rewards/verify_chess_move/std": 0.8983725309371948, "step": 19535 }, { "completion_length": 318.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 90.8296875, "completions/mean_terminated_length": 90.8296875, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017683674004999213, "frac_reward_zero_std": 0.9625, "grad_norm": 0.792511522769928, "kl": 0.19655926993582398, "learning_rate": 4.0048412698412696e-07, "loss": 0.0002, "num_tokens": 1300229172.0, "reward": 0.271875, "reward_std": 0.03014557547867298, "rewards/verify_chess_move/mean": 0.271875, "rewards/verify_chess_move/std": 0.9544949889183044, "step": 19540 }, { "completion_length": 291.8, "completions/clipped_ratio": 0.0, "completions/max_length": 291.8, "completions/max_terminated_length": 291.8, "completions/mean_length": 90.6171875, "completions/mean_terminated_length": 90.6171875, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.017688198998347473, "frac_reward_zero_std": 0.98125, "grad_norm": 6.652283668518066, "kl": 0.15140749767888337, "learning_rate": 4.004444444444444e-07, "loss": 0.0002, "num_tokens": 1300542202.0, "reward": 0.4609375, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.4609375, "rewards/verify_chess_move/std": 0.8638861060142518, "step": 19545 }, { "completion_length": 371.8, "completions/clipped_ratio": 0.0, "completions/max_length": 371.8, "completions/max_terminated_length": 371.8, "completions/mean_length": 89.1109375, "completions/mean_terminated_length": 89.1109375, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.017692723991695733, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0018095866544172168, "kl": 0.11830674700904638, "learning_rate": 4.004047619047619e-07, "loss": 0.0001, "num_tokens": 1300854288.0, "reward": 0.38125, "reward_std": 0.010888782143592835, "rewards/verify_chess_move/mean": 0.38125, "rewards/verify_chess_move/std": 0.9156423211097717, "step": 19550 }, { "completion_length": 379.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 81.5375, "completions/mean_terminated_length": 81.5375, "completions/min_length": 30.2, "completions/min_terminated_length": 30.2, "epoch": 0.017697248985043993, "frac_reward_zero_std": 0.93125, "grad_norm": 8.639657974243164, "kl": 0.8697568628704175, "learning_rate": 4.003650793650794e-07, "loss": 0.0009, "num_tokens": 1301153168.0, "reward": 0.3890625, "reward_std": 0.06565064787864686, "rewards/verify_chess_move/mean": 0.3890625, "rewards/verify_chess_move/std": 0.9164467334747315, "step": 19555 }, { "completion_length": 321.4, "completions/clipped_ratio": 0.0, "completions/max_length": 321.4, "completions/max_terminated_length": 321.4, "completions/mean_length": 86.9296875, "completions/mean_terminated_length": 86.9296875, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.017701773978392253, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0023796719033271074, "kl": 0.12735922419233248, "learning_rate": 4.003253968253968e-07, "loss": 0.0001, "num_tokens": 1301462918.0, "reward": 0.40625, "reward_std": 0.010888781771063805, "rewards/verify_chess_move/mean": 0.40625, "rewards/verify_chess_move/std": 0.8895167827606201, "step": 19560 }, { "completion_length": 316.4, "completions/clipped_ratio": 0.0, "completions/max_length": 316.4, "completions/max_terminated_length": 316.4, "completions/mean_length": 90.04765625, "completions/mean_terminated_length": 90.04765625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.017706298971740513, "frac_reward_zero_std": 0.9875, "grad_norm": 11.71978759765625, "kl": 0.7290070130024106, "learning_rate": 4.002857142857143e-07, "loss": 0.0007, "num_tokens": 1301776963.0, "reward": 0.41875, "reward_std": 0.010888782143592835, "rewards/verify_chess_move/mean": 0.41875, "rewards/verify_chess_move/std": 0.9020823001861572, "step": 19565 }, { "completion_length": 307.6, "completions/clipped_ratio": 0.0, "completions/max_length": 307.6, "completions/max_terminated_length": 307.6, "completions/mean_length": 92.94765625, "completions/mean_terminated_length": 92.94765625, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01771082396508877, "frac_reward_zero_std": 0.9625, "grad_norm": 19.66027069091797, "kl": 1.4888814169913531, "learning_rate": 4.0024603174603174e-07, "loss": 0.0015, "num_tokens": 1302095760.0, "reward": 0.375, "reward_std": 0.028566450253129004, "rewards/verify_chess_move/mean": 0.375, "rewards/verify_chess_move/std": 0.8985618352890015, "step": 19570 }, { "completion_length": 288.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 87.98046875, "completions/mean_terminated_length": 87.98046875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01771534895843703, "frac_reward_zero_std": 0.975, "grad_norm": 11.6709623336792, "kl": 0.18739616856910288, "learning_rate": 4.002063492063492e-07, "loss": 0.0002, "num_tokens": 1302406175.0, "reward": 0.49375, "reward_std": 0.019727616384625436, "rewards/verify_chess_move/mean": 0.49375, "rewards/verify_chess_move/std": 0.8634547352790832, "step": 19575 }, { "completion_length": 274.6, "completions/clipped_ratio": 0.0, "completions/max_length": 274.6, "completions/max_terminated_length": 274.6, "completions/mean_length": 87.759375, "completions/mean_terminated_length": 87.759375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.01771987395178529, "frac_reward_zero_std": 0.95625, "grad_norm": 3.7536075115203857, "kl": 0.3550347415264696, "learning_rate": 4.0016666666666664e-07, "loss": 0.0004, "num_tokens": 1302714987.0, "reward": 0.5265625, "reward_std": 0.034564992785453795, "rewards/verify_chess_move/mean": 0.5265625, "rewards/verify_chess_move/std": 0.8383187890052796, "step": 19580 }, { "completion_length": 339.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 90.9859375, "completions/mean_terminated_length": 90.9859375, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.01772439894513355, "frac_reward_zero_std": 0.98125, "grad_norm": 0.2895953357219696, "kl": 0.2606194134335965, "learning_rate": 4.001269841269841e-07, "loss": 0.0003, "num_tokens": 1303030241.0, "reward": 0.3859375, "reward_std": 0.013258251920342445, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9205833315849304, "step": 19585 }, { "completion_length": 330.6, "completions/clipped_ratio": 0.0, "completions/max_length": 330.6, "completions/max_terminated_length": 330.6, "completions/mean_length": 90.07265625, "completions/mean_terminated_length": 90.07265625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01772892393848181, "frac_reward_zero_std": 0.95, "grad_norm": 6.637908458709717, "kl": 0.2648000199114904, "learning_rate": 4.000873015873016e-07, "loss": 0.0003, "num_tokens": 1303343878.0, "reward": 0.3859375, "reward_std": 0.03966739922761917, "rewards/verify_chess_move/mean": 0.3859375, "rewards/verify_chess_move/std": 0.9014758944511414, "step": 19590 }, { "completion_length": 363.2, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/max_terminated_length": 363.2, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.01773344893183007, "frac_reward_zero_std": 0.95625, "grad_norm": 26.252195358276367, "kl": 0.24850918969605118, "learning_rate": 4.00047619047619e-07, "loss": 0.0002, "num_tokens": 1303669790.0, "reward": 0.428125, "reward_std": 0.03913669139146805, "rewards/verify_chess_move/mean": 0.428125, "rewards/verify_chess_move/std": 0.9003048181533814, "step": 19595 }, { "completion_length": 293.6, "completions/clipped_ratio": 0.0, "completions/max_length": 293.6, "completions/max_terminated_length": 293.6, "completions/mean_length": 86.93515625, "completions/mean_terminated_length": 86.93515625, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.01773797392517833, "frac_reward_zero_std": 0.9625, "grad_norm": 4.112201690673828, "kl": 1.7533302484080195, "learning_rate": 4.000079365079365e-07, "loss": 0.0018, "num_tokens": 1303980211.0, "reward": 0.5515625, "reward_std": 0.030617379397153855, "rewards/verify_chess_move/mean": 0.5515625, "rewards/verify_chess_move/std": 0.8323888421058655, "step": 19600 } ], "logging_steps": 5, "max_steps": 70000, "num_input_tokens_seen": 1303980211, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }