diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3004 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.054, + "eval_steps": 5.0, + "global_step": 54, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "/length/completion": 9061.760416666666, + "/length/completion/max": 19510, + "/length/completion/min": 2153, + "/length/completion/std": 333.8509541397672, + "/length/context": 42230.270833333336, + "/length/context/max": 101733, + "/length/context/min": 3126, + "/length/context/std": 2205.856462898938, + "/length/forward": 42235.0, + "/length/forward/max": 101736, + "/length/forward/min": 3128, + "/length/forward/std": 2205.8737442778643, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5446428571428571, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.49800302740658364, + "/record/score/max": 1.0, + "/record/score/mean": 0.5446428571428571, + "/record/score/min": 0.0, + "/record/score/std": 0.49800302740658364, + "advantages": -0.10318419991938722, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.019792414635250263, + "entropy": 0.65704345703125, + "entropy/max": 0.87890625, + "entropy/min": 0.455078125, + "entropy/std": 0.008027744975431144, + "epoch": 0.001, + "grad_norm": 15232.0, + "learning_rate": 0.0, + "loss": -165.1345625, + "out_of_date_ratio": 0.0014260866205404454, + "out_of_date_ratio/max": 0.003501293947920203, + "out_of_date_ratio/min": 0.0001568135485285893, + "out_of_date_ratio/std": 8.450739925303703e-05, + "rewards": 0.5104166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.051019960662424764, + "sampled_at_step": 0.9999999925494194, + "sampled_at_step/max": 1.0, + "sampled_at_step/min": 0.9999999403953552, + "sampled_at_step/std": 2.011886704218841e-09, + "scores": 0.4623135832325675, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.010009663488673342, + "step": 1, + "steps": 24.84375, + "steps/max": 80, + "steps/min": 1, + "steps/std": 1.7357010149325394 + }, + { + "/length/completion": 8887.78125, + "/length/completion/max": 20317, + "/length/completion/min": 3326, + "/length/completion/std": 433.71652733755445, + "/length/context": 40515.65625, + "/length/context/max": 121680, + "/length/context/min": 4305, + "/length/context/std": 2330.0265864167477, + "/length/forward": 40519.833333333336, + "/length/forward/max": 121688, + "/length/forward/min": 4312, + "/length/forward/std": 2330.095548169546, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.592375366568915, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4913927061437305, + "/record/score/max": 1.0, + "/record/score/mean": 0.592375366568915, + "/record/score/min": 0.0, + "/record/score/std": 0.4913927061437305, + "advantages": -0.028806329974655393, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.020580049780948472, + "entropy": 0.6922200520833334, + "entropy/max": 0.8984375, + "entropy/min": 0.4921875, + "entropy/std": 0.009487675328552674, + "epoch": 0.002, + "grad_norm": 15232.0, + "learning_rate": 2e-07, + "loss": -14.253539583333334, + "out_of_date_ratio": 0.0013957185919935, + "out_of_date_ratio/max": 0.003554652677848935, + "out_of_date_ratio/min": 0.0002505480661056936, + "out_of_date_ratio/std": 7.495178299694405e-05, + "rewards": 0.6145833333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04967295748634071, + "sampled_at_step": 1.330572656666239, + "sampled_at_step/max": 2.0, + "sampled_at_step/min": 0.9999999403953552, + "sampled_at_step/std": 0.04314657295355584, + "scores": 0.574643011683254, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.010284327028776, + "step": 2, + "steps": 23.072916666666668, + "steps/max": 84, + "steps/min": 1, + "steps/std": 1.808259214570864 + }, + { + "/length/completion": 9210.4375, + "/length/completion/max": 25411, + "/length/completion/min": 2206, + "/length/completion/std": 474.7124508411544, + "/length/context": 43727.708333333336, + "/length/context/max": 122274, + "/length/context/min": 3516, + "/length/context/std": 2767.7691200088607, + "/length/forward": 43732.0, + "/length/forward/max": 122280, + "/length/forward/min": 3520, + "/length/forward/std": 2767.7861615578454, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5821205821205822, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.49321010734997783, + "/record/score/max": 1.0, + "/record/score/mean": 0.5821205821205822, + "/record/score/min": 0.0, + "/record/score/std": 0.49321010734997783, + "advantages": -0.09020842379505026, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.018848007565270594, + "entropy": 0.6811930338541666, + "entropy/max": 0.859375, + "entropy/min": 0.55078125, + "entropy/std": 0.007361056852565585, + "epoch": 0.003, + "grad_norm": 15232.0, + "learning_rate": 4e-07, + "loss": -92.92765208333333, + "out_of_date_ratio": 0.002393863861925638, + "out_of_date_ratio/max": 0.09005628526210785, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 0.0009420074751743802, + "rewards": 0.5625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05063078670631141, + "sampled_at_step": 2.32747404028972, + "sampled_at_step/max": 3.000000238418579, + "sampled_at_step/min": 0.9999999403953552, + "sampled_at_step/std": 0.07218538224091829, + "scores": 0.5509118541033434, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009695359383926561, + "step": 3, + "steps": 26.416666666666668, + "steps/max": 97, + "steps/min": 0, + "steps/std": 2.1152075570163933 + }, + { + "/length/completion": 10750.739583333334, + "/length/completion/max": 21108, + "/length/completion/min": 2193, + "/length/completion/std": 450.1033487719153, + "/length/context": 50107.604166666664, + "/length/context/max": 114001, + "/length/context/min": 6428, + "/length/context/std": 2518.8334550931213, + "/length/forward": 50112.166666666664, + "/length/forward/max": 114008, + "/length/forward/min": 6432, + "/length/forward/std": 2518.891348333757, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5820668693009119, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.49321904764835184, + "/record/score/max": 1.0, + "/record/score/mean": 0.5820668693009119, + "/record/score/min": 0.0, + "/record/score/std": 0.49321904764835184, + "advantages": -0.052586616041765684, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01657736980411449, + "entropy": 0.6615804036458334, + "entropy/max": 0.87890625, + "entropy/min": 0.4921875, + "entropy/std": 0.008354287781568109, + "epoch": 0.004, + "grad_norm": 16384.0, + "learning_rate": 6e-07, + "loss": -242.6494145833333, + "out_of_date_ratio": 0.0013956707824339294, + "out_of_date_ratio/max": 0.0036721748765558004, + "out_of_date_ratio/min": 0.00016429803508799523, + "out_of_date_ratio/std": 7.971234370627556e-05, + "rewards": 0.59375, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05012598061177124, + "sampled_at_step": 3.343302513162295, + "sampled_at_step/max": 4.0, + "sampled_at_step/min": 2.2274067401885986, + "sampled_at_step/std": 0.05714829785258465, + "scores": 0.493687707641196, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00911280642020964, + "step": 4, + "steps": 30.354166666666668, + "steps/max": 76, + "steps/min": 1, + "steps/std": 1.7908779251434763 + }, + { + "/length/completion": 11387.947916666666, + "/length/completion/max": 22277, + "/length/completion/min": 4552, + "/length/completion/std": 385.16190915095103, + "/length/context": 53574.416666666664, + "/length/context/max": 105110, + "/length/context/min": 16843, + "/length/context/std": 2121.527686130431, + "/length/forward": 53579.0, + "/length/forward/max": 105112, + "/length/forward/min": 16848, + "/length/forward/std": 2121.4906782535404, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6083832335329341, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.488111744059642, + "/record/score/max": 1.0, + "/record/score/mean": 0.6083832335329341, + "/record/score/min": 0.0, + "/record/score/std": 0.488111744059642, + "advantages": -0.07026523964634883, + "advantages/max": 2.0, + "advantages/min": -1.7142857142857142, + "advantages/std": 0.018303452579415777, + "entropy": 0.6888427734375, + "entropy/max": 0.8359375, + "entropy/min": 0.56640625, + "entropy/std": 0.00563372915583649, + "epoch": 0.005, + "grad_norm": 19456.0, + "learning_rate": 8e-07, + "loss": -275.4627083333333, + "out_of_date_ratio": 0.0016567955062782858, + "out_of_date_ratio/max": 0.020350120961666107, + "out_of_date_ratio/min": 0.0002010050229728222, + "out_of_date_ratio/std": 0.0002122593014383166, + "rewards": 0.4895833333333333, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.051019960662424785, + "sampled_at_step": 4.356270944078763, + "sampled_at_step/max": 5.0, + "sampled_at_step/min": 3.0, + "sampled_at_step/std": 0.05214663289803722, + "scores": 0.40390879478827363, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008855820888513739, + "step": 5, + "steps": 30.979166666666668, + "steps/max": 77, + "steps/min": 7, + "steps/std": 1.7108073527787282 + }, + { + "/length/completion": 8645.0625, + "/length/completion/max": 22095, + "/length/completion/min": 4112, + "/length/completion/std": 328.20605970067106, + "/length/context": 49704.583333333336, + "/length/context/max": 128783, + "/length/context/min": 11158, + "/length/context/std": 2747.831565315202, + "/length/forward": 49709.25, + "/length/forward/max": 128784, + "/length/forward/min": 11160, + "/length/forward/std": 2747.838415181319, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6016597510373444, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.48955622253120745, + "/record/score/max": 1.0, + "/record/score/mean": 0.6016597510373444, + "/record/score/min": 0.0, + "/record/score/std": 0.48955622253120745, + "advantages": -0.02579739867235099, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.018538305458941026, + "entropy": 0.6691487630208334, + "entropy/max": 0.9609375, + "entropy/min": 0.455078125, + "entropy/std": 0.010216517799197189, + "epoch": 0.006, + "grad_norm": 14656.0, + "learning_rate": 1e-06, + "loss": -45.5740875, + "out_of_date_ratio": 0.0014883852118146024, + "out_of_date_ratio/max": 0.006303992588073015, + "out_of_date_ratio/min": 0.0001845529186539352, + "out_of_date_ratio/std": 0.00010177097145203845, + "rewards": 0.53125, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05093126879064569, + "sampled_at_step": 5.492770120501518, + "sampled_at_step/max": 6.000000476837158, + "sampled_at_step/min": 4.611965656280518, + "sampled_at_step/std": 0.045385259602976326, + "scores": 0.5179448432187382, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009712100160934692, + "step": 6, + "steps": 26.572916666666668, + "steps/max": 84, + "steps/min": 4, + "steps/std": 1.638879348525962 + }, + { + "/length/completion": 10545.895833333334, + "/length/completion/max": 26415, + "/length/completion/min": 1407, + "/length/completion/std": 454.70999573960717, + "/length/context": 49447.052083333336, + "/length/context/max": 125477, + "/length/context/min": 4282, + "/length/context/std": 2438.5141942497844, + "/length/forward": 49451.416666666664, + "/length/forward/max": 125480, + "/length/forward/min": 4288, + "/length/forward/std": 2438.461217154489, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6220238095238095, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4848816246356396, + "/record/score/max": 1.0, + "/record/score/mean": 0.6030405405405406, + "/record/score/min": 0.0, + "/record/score/std": 0.4892674595812734, + "advantages": -0.025122265122264592, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01904916215825559, + "entropy": 0.6802164713541666, + "entropy/max": 0.85546875, + "entropy/min": 0.54296875, + "entropy/std": 0.006917466056080718, + "epoch": 0.007, + "grad_norm": 18304.0, + "learning_rate": 1.2e-06, + "loss": 12.00070625, + "out_of_date_ratio": 0.0014764862108525751, + "out_of_date_ratio/max": 0.004187604878097773, + "out_of_date_ratio/min": 0.00011552680371096358, + "out_of_date_ratio/std": 8.891737377791334e-05, + "rewards": 0.5208333333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05098671929023751, + "sampled_at_step": 6.518794342875481, + "sampled_at_step/max": 7.000000476837158, + "sampled_at_step/min": 5.0, + "sampled_at_step/std": 0.05228243236909025, + "scores": 0.4976576576576577, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009491475804562591, + "step": 7, + "steps": 27.90625, + "steps/max": 89, + "steps/min": 1, + "steps/std": 1.8318577232140487 + }, + { + "/length/completion": 9839.854166666666, + "/length/completion/max": 21195, + "/length/completion/min": 2998, + "/length/completion/std": 366.70528619391644, + "/length/context": 49984.9375, + "/length/context/max": 113953, + "/length/context/min": 12001, + "/length/context/std": 2039.1666751173045, + "/length/forward": 49989.416666666664, + "/length/forward/max": 113960, + "/length/forward/min": 12008, + "/length/forward/std": 2039.1574836150392, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6066037735849057, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4885034651437574, + "/record/score/max": 1.0, + "/record/score/mean": 0.5986646884272997, + "/record/score/min": 0.0, + "/record/score/std": 0.49016862328952066, + "advantages": 0.030626780626780342, + "advantages/max": 2.0, + "advantages/min": -1.7142857142857142, + "advantages/std": 0.019301630821304063, + "entropy": 0.67767333984375, + "entropy/max": 0.89453125, + "entropy/min": 0.458984375, + "entropy/std": 0.008873916231699142, + "epoch": 0.008, + "grad_norm": 17280.0, + "learning_rate": 1.4e-06, + "loss": 347.6390791666667, + "out_of_date_ratio": 0.0013641679650694034, + "out_of_date_ratio/max": 0.012770682573318481, + "out_of_date_ratio/min": 0.00014114326040726155, + "out_of_date_ratio/std": 0.00013914642752668266, + "rewards": 0.6041666666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04991130733147589, + "sampled_at_step": 7.504495506485303, + "sampled_at_step/max": 8.0, + "sampled_at_step/min": 6.0, + "sampled_at_step/std": 0.04903364618515806, + "scores": 0.6477920227920227, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009014026700098238, + "step": 8, + "steps": 28.25, + "steps/max": 68, + "steps/min": 4, + "steps/std": 1.4796067420470587 + }, + { + "/length/completion": 8776.010416666666, + "/length/completion/max": 26631, + "/length/completion/min": 982, + "/length/completion/std": 428.9860690236972, + "/length/context": 43058.010416666664, + "/length/context/max": 124668, + "/length/context/min": 2162, + "/length/context/std": 2756.5255490843815, + "/length/forward": 43062.666666666664, + "/length/forward/max": 124672, + "/length/forward/min": 2168, + "/length/forward/std": 2756.5407963672587, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.600375234521576, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4898212044151812, + "/record/score/max": 1.0, + "/record/score/mean": 0.5981308411214953, + "/record/score/min": 0.0, + "/record/score/std": 0.49027577751790646, + "advantages": -0.19077901430842445, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.018712851888668346, + "entropy": 0.6672770182291666, + "entropy/max": 0.84375, + "entropy/min": 0.5234375, + "entropy/std": 0.007357147745400299, + "epoch": 0.009, + "grad_norm": 14848.0, + "learning_rate": 1.6e-06, + "loss": -744.38655625, + "out_of_date_ratio": 0.0014761180206429951, + "out_of_date_ratio/max": 0.0039397054351866245, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 9.058071731289764e-05, + "rewards": 0.4791666666666667, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.050986719290237494, + "sampled_at_step": 8.244493653376898, + "sampled_at_step/max": 9.0, + "sampled_at_step/min": 7.0, + "sampled_at_step/std": 0.05606692134224259, + "scores": 0.34379968203497613, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009469251146552133, + "step": 9, + "steps": 25.208333333333332, + "steps/max": 96, + "steps/min": 0, + "steps/std": 2.0891022557437693 + }, + { + "/length/completion": 9919.166666666666, + "/length/completion/max": 28914, + "/length/completion/min": 1503, + "/length/completion/std": 462.08533345737334, + "/length/context": 48338.229166666664, + "/length/context/max": 116558, + "/length/context/min": 2761, + "/length/context/std": 2398.330957210102, + "/length/forward": 48342.916666666664, + "/length/forward/max": 116560, + "/length/forward/min": 2768, + "/length/forward/std": 2398.321283904732, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5992402659069326, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.49005241517894965, + "/record/score/max": 1.0, + "/record/score/mean": 0.5964912280701754, + "/record/score/min": 0.0, + "/record/score/std": 0.4906011036529667, + "advantages": 0.14579998967422325, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.017462920262873362, + "entropy": 0.66534423828125, + "entropy/max": 0.890625, + "entropy/min": 0.310546875, + "entropy/std": 0.010153132071121932, + "epoch": 0.01, + "grad_norm": 15424.0, + "learning_rate": 1.8e-06, + "loss": 576.7980958333334, + "out_of_date_ratio": 0.0012154976905852284, + "out_of_date_ratio/max": 0.0027450979687273502, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 6.903620514673011e-05, + "rewards": 0.6041666666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04991130733147588, + "sampled_at_step": 9.399776776631674, + "sampled_at_step/max": 10.0, + "sampled_at_step/min": 8.0, + "sampled_at_step/std": 0.06032417629102527, + "scores": 0.5858330321647994, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009364186993252418, + "step": 10, + "steps": 27.822916666666668, + "steps/max": 88, + "steps/min": 1, + "steps/std": 1.7334281569407148 + }, + { + "/length/completion": 10443.125, + "/length/completion/max": 27506, + "/length/completion/min": 1803, + "/length/completion/std": 543.500688424718, + "/length/context": 47232.635416666664, + "/length/context/max": 120937, + "/length/context/min": 3018, + "/length/context/std": 2898.144162085004, + "/length/forward": 47237.166666666664, + "/length/forward/max": 120944, + "/length/forward/min": 3024, + "/length/forward/std": 2898.150111563425, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5850860420650096, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4927071802254477, + "/record/score/max": 1.0, + "/record/score/mean": 0.5896174863387978, + "/record/score/min": 0.0, + "/record/score/std": 0.4919031471156853, + "advantages": -0.07608915906788609, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.018877367468057998, + "entropy": 0.7043863932291666, + "entropy/max": 0.86328125, + "entropy/min": 0.5390625, + "entropy/std": 0.007723941035489615, + "epoch": 0.011, + "grad_norm": 30080.0, + "learning_rate": 2e-06, + "loss": -558.7412104166666, + "out_of_date_ratio": 0.0014792358707988267, + "out_of_date_ratio/max": 0.005837538279592991, + "out_of_date_ratio/min": 0.00012627856631297618, + "out_of_date_ratio/std": 9.652012977910382e-05, + "rewards": 0.5104166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05101996066242478, + "sampled_at_step": 10.584631284077963, + "sampled_at_step/max": 11.0, + "sampled_at_step/min": 10.0, + "sampled_at_step/std": 0.04345596676676561, + "scores": 0.46418439716312054, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009391357929316552, + "step": 11, + "steps": 28.375, + "steps/max": 88, + "steps/min": 0, + "steps/std": 2.2854199901221657 + }, + { + "/length/completion": 11570.645833333334, + "/length/completion/max": 24767, + "/length/completion/min": 1517, + "/length/completion/std": 538.688173773276, + "/length/context": 57460.239583333336, + "/length/context/max": 116545, + "/length/context/min": 8185, + "/length/context/std": 2912.14489634308, + "/length/forward": 57465.083333333336, + "/length/forward/max": 116552, + "/length/forward/min": 8192, + "/length/forward/std": 2912.1244622454274, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.581547064305685, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4933052567154787, + "/record/score/max": 1.0, + "/record/score/mean": 0.5925740090316106, + "/record/score/min": 0.0, + "/record/score/std": 0.49135532240102475, + "advantages": 0.015989072794471607, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.015835078943366814, + "entropy": 0.682373046875, + "entropy/max": 0.84375, + "entropy/min": 0.54296875, + "entropy/std": 0.006832136883896569, + "epoch": 0.012, + "grad_norm": 24576.0, + "learning_rate": 1.9999949650055508e-06, + "loss": 381.7514645833333, + "out_of_date_ratio": 0.001392214337556652, + "out_of_date_ratio/max": 0.0036032216157764196, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 6.81944290825032e-05, + "rewards": 0.4791666666666667, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.050986719290237514, + "sampled_at_step": 11.37171138326327, + "sampled_at_step/max": 12.000000953674316, + "sampled_at_step/min": 10.790054321289062, + "sampled_at_step/std": 0.04296443813268482, + "scores": 0.4578177727784027, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008354839024517575, + "step": 12, + "steps": 36.041666666666664, + "steps/max": 98, + "steps/min": 0, + "steps/std": 2.405592292268241 + }, + { + "/length/completion": 9533.010416666666, + "/length/completion/max": 20970, + "/length/completion/min": 2845, + "/length/completion/std": 475.7710193195782, + "/length/context": 42798.520833333336, + "/length/context/max": 127818, + "/length/context/min": 5819, + "/length/context/std": 2309.1928122194336, + "/length/forward": 42803.083333333336, + "/length/forward/max": 127824, + "/length/forward/min": 5824, + "/length/forward/std": 2309.192172870657, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5896927651139743, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.49188942648344175, + "/record/score/max": 1.0, + "/record/score/mean": 0.5953379953379954, + "/record/score/min": 0.0, + "/record/score/std": 0.49082651379579373, + "advantages": 0.028693766334871362, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.019878183877504816, + "entropy": 0.6960856119791666, + "entropy/max": 0.85546875, + "entropy/min": 0.515625, + "entropy/std": 0.00751492793808001, + "epoch": 0.013, + "grad_norm": 16384.0, + "learning_rate": 1.9999798600729064e-06, + "loss": 202.53405833333332, + "out_of_date_ratio": 0.0016945011958947969, + "out_of_date_ratio/max": 0.016010673716664314, + "out_of_date_ratio/min": 0.00016920473717618734, + "out_of_date_ratio/std": 0.00017768531065253792, + "rewards": 0.5, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05103103630798287, + "sampled_at_step": 12.429851442575455, + "sampled_at_step/max": 13.000000953674316, + "sampled_at_step/min": 11.195213317871094, + "sampled_at_step/std": 0.04931637853899746, + "scores": 0.5165434021019852, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009859391293109745, + "step": 13, + "steps": 25.760416666666668, + "steps/max": 98, + "steps/min": 1, + "steps/std": 1.8281024328893596 + }, + { + "/length/completion": 9545.020833333334, + "/length/completion/max": 27580, + "/length/completion/min": 2641, + "/length/completion/std": 491.1694360666533, + "/length/context": 41898.489583333336, + "/length/context/max": 111089, + "/length/context/min": 4075, + "/length/context/std": 2197.090228116492, + "/length/forward": 41903.166666666664, + "/length/forward/max": 111096, + "/length/forward/min": 4080, + "/length/forward/std": 2197.118581736236, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5901981230448383, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.491797009546813, + "/record/score/max": 1.0, + "/record/score/mean": 0.59849157054126, + "/record/score/min": 0.0, + "/record/score/std": 0.49020343790340376, + "advantages": 0.0003391938492836318, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.020971885229945038, + "entropy": 0.66033935546875, + "entropy/max": 0.93359375, + "entropy/min": 0.470703125, + "entropy/std": 0.008022561389017911, + "epoch": 0.014, + "grad_norm": 16640.0, + "learning_rate": 1.9999546853541726e-06, + "loss": -108.18811041666667, + "out_of_date_ratio": 0.001645550712358575, + "out_of_date_ratio/max": 0.02955367974936962, + "out_of_date_ratio/min": 0.0001886258542072028, + "out_of_date_ratio/std": 0.0003018806362539357, + "rewards": 0.5416666666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05085353651346114, + "sampled_at_step": 13.120210727055868, + "sampled_at_step/max": 14.0, + "sampled_at_step/min": 11.999999046325684, + "sampled_at_step/std": 0.04944200516546299, + "scores": 0.5041551246537396, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009946090042752999, + "step": 14, + "steps": 25.322916666666668, + "steps/max": 73, + "steps/min": 1, + "steps/std": 1.6704705804045474 + }, + { + "/length/completion": 8026.5, + "/length/completion/max": 21793, + "/length/completion/min": 1298, + "/length/completion/std": 418.3677279291144, + "/length/context": 37541.177083333336, + "/length/context/max": 120264, + "/length/context/min": 2177, + "/length/context/std": 2429.7662521622196, + "/length/forward": 37545.416666666664, + "/length/forward/max": 120272, + "/length/forward/min": 2184, + "/length/forward/std": 2429.7884767185656, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.610223642172524, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.48769944505424995, + "/record/score/max": 1.0, + "/record/score/mean": 0.6011730205278593, + "/record/score/min": 0.0, + "/record/score/std": 0.48965704316109804, + "advantages": -0.028815848716794226, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.02292120980091652, + "entropy": 0.6660970052083334, + "entropy/max": 0.85546875, + "entropy/min": 0.474609375, + "entropy/std": 0.0068975977831584635, + "epoch": 0.015, + "grad_norm": 15488.0, + "learning_rate": 1.9999194411028592e-06, + "loss": -59.172912499999995, + "out_of_date_ratio": 0.0014122336497166543, + "out_of_date_ratio/max": 0.004552352242171764, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 8.94015080482922e-05, + "rewards": 0.5729166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05048547230415048, + "sampled_at_step": 14.362340231736502, + "sampled_at_step/max": 15.000000953674316, + "sampled_at_step/min": 13.0, + "sampled_at_step/std": 0.050649083833140505, + "scores": 0.5875731652408824, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.010445522162230367, + "step": 15, + "steps": 22.135416666666668, + "steps/max": 94, + "steps/min": 1, + "steps/std": 2.065213902174111 + }, + { + "/length/completion": 8805.333333333334, + "/length/completion/max": 21973, + "/length/completion/min": 1796, + "/length/completion/std": 445.0948948510345, + "/length/context": 46869.760416666664, + "/length/context/max": 120639, + "/length/context/min": 2709, + "/length/context/std": 2724.167357688649, + "/length/forward": 46874.166666666664, + "/length/forward/max": 120640, + "/length/forward/min": 2712, + "/length/forward/std": 2724.1792405113283, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6228448275862069, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4846742703458842, + "/record/score/max": 1.0, + "/record/score/mean": 0.6072555205047319, + "/record/score/min": 0.0, + "/record/score/std": 0.48836078192383503, + "advantages": -0.07705723798953784, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.019517642661302088, + "entropy": 0.6896565755208334, + "entropy/max": 0.87109375, + "entropy/min": 0.54296875, + "entropy/std": 0.007813531975228124, + "epoch": 0.016, + "grad_norm": 15744.0, + "learning_rate": 1.9998741276738752e-06, + "loss": -173.27766875, + "out_of_date_ratio": 0.0016480689225015037, + "out_of_date_ratio/max": 0.03180282190442085, + "out_of_date_ratio/min": 9.623712685424834e-05, + "out_of_date_ratio/std": 0.0003251372886183483, + "rewards": 0.5208333333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.050986719290237514, + "sampled_at_step": 15.466753671566645, + "sampled_at_step/max": 16.0, + "sampled_at_step/min": 13.999999046325684, + "sampled_at_step/std": 0.05769970522461167, + "scores": 0.4565297817240104, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009580747823909712, + "step": 16, + "steps": 27.15625, + "steps/max": 89, + "steps/min": 1, + "steps/std": 1.9363419548250318 + }, + { + "/length/completion": 10960.5, + "/length/completion/max": 25811, + "/length/completion/min": 5331, + "/length/completion/std": 404.2495592615958, + "/length/context": 58679.833333333336, + "/length/context/max": 128199, + "/length/context/min": 15882, + "/length/context/std": 2293.188648558825, + "/length/forward": 58684.416666666664, + "/length/forward/max": 128200, + "/length/forward/min": 15888, + "/length/forward/std": 2293.1736586545526, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.655958549222798, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.47505466098578625, + "/record/score/max": 1.0, + "/record/score/mean": 0.6157798165137615, + "/record/score/min": 0.0, + "/record/score/std": 0.4864103556546057, + "advantages": 0.06352010768971889, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.014668813291095996, + "entropy": 0.6750081380208334, + "entropy/max": 0.85546875, + "entropy/min": 0.49609375, + "entropy/std": 0.00851281709713449, + "epoch": 0.017, + "grad_norm": 15808.0, + "learning_rate": 1.9998187455235257e-06, + "loss": 523.4619145833334, + "out_of_date_ratio": 0.0012592377715918701, + "out_of_date_ratio/max": 0.003246423788368702, + "out_of_date_ratio/min": 0.000278965977486223, + "out_of_date_ratio/std": 7.217478768110386e-05, + "rewards": 0.7083333333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04639024033294229, + "sampled_at_step": 16.395306944847107, + "sampled_at_step/max": 17.0, + "sampled_at_step/min": 15.0, + "sampled_at_step/std": 0.05894587906922644, + "scores": 0.6763839811542992, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00802837817144003, + "step": 17, + "steps": 34.375, + "steps/max": 87, + "steps/min": 4, + "steps/std": 1.7676595333079401 + }, + { + "/length/completion": 7872.354166666667, + "/length/completion/max": 21558, + "/length/completion/min": 1382, + "/length/completion/std": 426.2353171293944, + "/length/context": 44733.21875, + "/length/context/max": 127878, + "/length/context/min": 2521, + "/length/context/std": 3317.4042700867667, + "/length/forward": 44737.75, + "/length/forward/max": 127880, + "/length/forward/min": 2528, + "/length/forward/std": 3317.3828607032756, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6842684268426843, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4648065693048262, + "/record/score/max": 1.0, + "/record/score/mean": 0.6210896309314587, + "/record/score/min": 0.0, + "/record/score/std": 0.48511576070138474, + "advantages": 0.11858076563958492, + "advantages/max": 1.4285714285714286, + "advantages/min": -2.0, + "advantages/std": 0.015536615058260218, + "entropy": 0.67474365234375, + "entropy/max": 0.92578125, + "entropy/min": 0.478515625, + "entropy/std": 0.009013613661301047, + "epoch": 0.018, + "grad_norm": 12096.0, + "learning_rate": 1.999753295209509e-06, + "loss": 204.49255416666665, + "out_of_date_ratio": 0.0012250137936765289, + "out_of_date_ratio/max": 0.003539822995662689, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 7.54957539457252e-05, + "rewards": 0.7395833333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.0447911619803609, + "sampled_at_step": 17.16051246722539, + "sampled_at_step/max": 18.0, + "sampled_at_step/min": 16.0, + "sampled_at_step/std": 0.04216236406924461, + "scores": 0.8149509803921569, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.007848803038037575, + "step": 18, + "steps": 24.5, + "steps/max": 90, + "steps/min": 1, + "steps/std": 2.258039264371735 + }, + { + "/length/completion": 10017.989583333334, + "/length/completion/max": 22287, + "/length/completion/min": 3320, + "/length/completion/std": 440.913435562424, + "/length/context": 51859.25, + "/length/context/max": 127071, + "/length/context/min": 7556, + "/length/context/std": 2528.4204074337463, + "/length/forward": 51863.833333333336, + "/length/forward/max": 127072, + "/length/forward/min": 7560, + "/length/forward/std": 2528.3746782467865, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.7072892938496583, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.45500675671390867, + "/record/score/max": 1.0, + "/record/score/mean": 0.6281186783546865, + "/record/score/min": 0.0, + "/record/score/std": 0.4833069462118742, + "advantages": -0.01376936316695331, + "advantages/max": 1.1428571428571428, + "advantages/min": -2.0, + "advantages/std": 0.01817736219182349, + "entropy": 0.66009521484375, + "entropy/max": 0.94140625, + "entropy/min": 0.44921875, + "entropy/std": 0.009324537253803052, + "epoch": 0.019, + "grad_norm": 17152.0, + "learning_rate": 1.999677777390909e-06, + "loss": 84.7324875, + "out_of_date_ratio": 0.0011100193546553783, + "out_of_date_ratio/max": 0.0025269172620028257, + "out_of_date_ratio/min": 6.997410673648119e-05, + "out_of_date_ratio/std": 6.765312539943454e-05, + "rewards": 0.6979166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04686294212198703, + "sampled_at_step": 18.135198891162872, + "sampled_at_step/max": 19.0, + "sampled_at_step/min": 17.0, + "sampled_at_step/std": 0.04848014928364492, + "scores": 0.6662650602409639, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008183805815967112, + "step": 19, + "steps": 33.583333333333336, + "steps/max": 98, + "steps/min": 2, + "steps/std": 2.2649803520936715 + }, + { + "/length/completion": 11512.09375, + "/length/completion/max": 27309, + "/length/completion/min": 4980, + "/length/completion/std": 501.45766494340864, + "/length/context": 56020.770833333336, + "/length/context/max": 126836, + "/length/context/min": 11030, + "/length/context/std": 2774.8540662281084, + "/length/forward": 56025.083333333336, + "/length/forward/max": 126840, + "/length/forward/min": 11032, + "/length/forward/std": 2774.8575054000307, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.7183257918552036, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.44981534946108553, + "/record/score/max": 1.0, + "/record/score/mean": 0.632258064516129, + "/record/score/min": 0.0, + "/record/score/std": 0.4821906307368982, + "advantages": 0.07681134654818979, + "advantages/max": 1.1428571428571428, + "advantages/min": -2.0, + "advantages/std": 0.01614613564061393, + "entropy": 0.6991780598958334, + "entropy/max": 0.87109375, + "entropy/min": 0.5390625, + "entropy/std": 0.007499462124219176, + "epoch": 0.02, + "grad_norm": 17280.0, + "learning_rate": 1.999592192828189e-06, + "loss": 827.3307291666666, + "out_of_date_ratio": 0.0011428044173650658, + "out_of_date_ratio/max": 0.003580619813874364, + "out_of_date_ratio/min": 0.00019168104336131364, + "out_of_date_ratio/std": 6.266466155041168e-05, + "rewards": 0.75, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.044194173824159216, + "sampled_at_step": 19.341610689957935, + "sampled_at_step/max": 20.0, + "sampled_at_step/min": 18.123659133911133, + "sampled_at_step/std": 0.05884703632870817, + "scores": 0.7580741626794258, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.007405661196030622, + "step": 20, + "steps": 33.833333333333336, + "steps/max": 99, + "steps/min": 2, + "steps/std": 2.107704671314623 + }, + { + "/length/completion": 11401.34375, + "/length/completion/max": 21933, + "/length/completion/min": 4242, + "/length/completion/std": 396.7083745875666, + "/length/context": 64273.041666666664, + "/length/context/max": 124766, + "/length/context/min": 13984, + "/length/context/std": 2506.1907459968925, + "/length/forward": 64277.916666666664, + "/length/forward/max": 124768, + "/length/forward/min": 13992, + "/length/forward/std": 2506.190649740026, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.7045951859956237, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.45622451695126653, + "/record/score/max": 1.0, + "/record/score/mean": 0.6270718232044199, + "/record/score/min": 0.0, + "/record/score/std": 0.48358324179762907, + "advantages": 0.0018951597619684107, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.016091829848799057, + "entropy": 0.6525065104166666, + "entropy/max": 0.84375, + "entropy/min": 0.458984375, + "entropy/std": 0.009489987290893497, + "epoch": 0.021, + "grad_norm": 18432.0, + "learning_rate": 1.999496542383185e-06, + "loss": 121.03529791666666, + "out_of_date_ratio": 0.0013320353434664867, + "out_of_date_ratio/max": 0.003433594247326255, + "out_of_date_ratio/min": 0.00010756158008007333, + "out_of_date_ratio/std": 7.833091810820446e-05, + "rewards": 0.6145833333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04967295748634071, + "sampled_at_step": 20.23511741558711, + "sampled_at_step/max": 21.0, + "sampled_at_step/min": 19.0, + "sampled_at_step/std": 0.05663429886634042, + "scores": 0.5839745290527991, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008028674375231538, + "step": 21, + "steps": 38.260416666666664, + "steps/max": 86, + "steps/min": 7, + "steps/std": 2.0449720687775765 + }, + { + "/length/completion": 9401.072916666666, + "/length/completion/max": 24161, + "/length/completion/min": 1460, + "/length/completion/std": 499.54914422142616, + "/length/context": 47311.802083333336, + "/length/context/max": 122394, + "/length/context/min": 2396, + "/length/context/std": 2575.039869424853, + "/length/forward": 47316.333333333336, + "/length/forward/max": 122400, + "/length/forward/min": 2400, + "/length/forward/std": 2575.044503166312, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6977272727272728, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.45924277416180914, + "/record/score/max": 1.0, + "/record/score/mean": 0.6279691211401425, + "/record/score/min": 0.0, + "/record/score/std": 0.4833465672109604, + "advantages": -0.012113324657122618, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.019199719746732787, + "entropy": 0.6626383463541666, + "entropy/max": 0.89453125, + "entropy/min": 0.494140625, + "entropy/std": 0.007544228380313726, + "epoch": 0.022, + "grad_norm": 232448.0, + "learning_rate": 1.9993908270190957e-06, + "loss": 111.84316041666666, + "out_of_date_ratio": 0.0016641596797247378, + "out_of_date_ratio/max": 0.020300446078181267, + "out_of_date_ratio/min": 0.00035029338323511183, + "out_of_date_ratio/std": 0.00023182703826180737, + "rewards": 0.5729166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.050485472304150465, + "sampled_at_step": 21.164862950642902, + "sampled_at_step/max": 22.0, + "sampled_at_step/min": 20.999998092651367, + "sampled_at_step/std": 0.03211382577428944, + "scores": 0.49159074982480727, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009357968706678136, + "step": 22, + "steps": 28.729166666666668, + "steps/max": 89, + "steps/min": 0, + "steps/std": 1.9640032706013673 + }, + { + "/length/completion": 9214.354166666666, + "/length/completion/max": 23003, + "/length/completion/min": 1919, + "/length/completion/std": 387.5235336657432, + "/length/context": 48108.802083333336, + "/length/context/max": 114013, + "/length/context/min": 3056, + "/length/context/std": 2795.0509071366123, + "/length/forward": 48113.416666666664, + "/length/forward/max": 114016, + "/length/forward/min": 3064, + "/length/forward/std": 2795.03867741772, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.7045177045177046, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4562592558390556, + "/record/score/max": 1.0, + "/record/score/mean": 0.6330749354005168, + "/record/score/min": 0.0, + "/record/score/std": 0.4819658302910577, + "advantages": 0.023602484472047516, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.018105340055766753, + "entropy": 0.6816813151041666, + "entropy/max": 0.96875, + "entropy/min": 0.52734375, + "entropy/std": 0.009203881162625499, + "epoch": 0.023, + "grad_norm": 15232.0, + "learning_rate": 1.9992750478004735e-06, + "loss": 9.265197916666667, + "out_of_date_ratio": 0.0013355615161951089, + "out_of_date_ratio/max": 0.00573215214535594, + "out_of_date_ratio/min": 0.00025896672741509974, + "out_of_date_ratio/std": 8.832910798917201e-05, + "rewards": 0.65625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.048475287679651785, + "sampled_at_step": 22.168105483055115, + "sampled_at_step/max": 23.0, + "sampled_at_step/min": 21.0, + "sampled_at_step/std": 0.050861245309274106, + "scores": 0.6652173913043479, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00898273734163259, + "step": 23, + "steps": 27.75, + "steps/max": 81, + "steps/min": 1, + "steps/std": 1.9785635045827228 + }, + { + "/length/completion": 9490.1875, + "/length/completion/max": 29269, + "/length/completion/min": 2814, + "/length/completion/std": 472.25405473316374, + "/length/context": 48921.354166666664, + "/length/context/max": 126739, + "/length/context/min": 11459, + "/length/context/std": 2698.3990532783187, + "/length/forward": 48925.833333333336, + "/length/forward/max": 126744, + "/length/forward/min": 11464, + "/length/forward/std": 2698.425859230934, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.7132701421800948, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.45223428270585836, + "/record/score/max": 1.0, + "/record/score/mean": 0.6399560922063666, + "/record/score/min": 0.0, + "/record/score/std": 0.4800128042608059, + "advantages": -0.13765028874372895, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.01947662744072505, + "entropy": 0.6754150390625, + "entropy/max": 0.85546875, + "entropy/min": 0.48046875, + "entropy/std": 0.007579306748481353, + "epoch": 0.024, + "grad_norm": 16000.0, + "learning_rate": 1.999149205893214e-06, + "loss": -90.01123125, + "out_of_date_ratio": 0.001372865597204509, + "out_of_date_ratio/max": 0.010023866780102253, + "out_of_date_ratio/min": 0.00016106950351968408, + "out_of_date_ratio/std": 0.00013162758332460245, + "rewards": 0.625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04941058844013093, + "sampled_at_step": 23.256375809510548, + "sampled_at_step/max": 24.0, + "sampled_at_step/min": 21.6400089263916, + "sampled_at_step/std": 0.06531165671866788, + "scores": 0.5569913850231941, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009042129178310435, + "step": 24, + "steps": 30.4375, + "steps/max": 99, + "steps/min": 3, + "steps/std": 2.303464541344009 + }, + { + "/length/completion": 9030.229166666666, + "/length/completion/max": 20017, + "/length/completion/min": 2032, + "/length/completion/std": 404.27926977346874, + "/length/context": 43733.875, + "/length/context/max": 115111, + "/length/context/min": 2969, + "/length/context/std": 2356.429606294356, + "/length/forward": 43737.833333333336, + "/length/forward/max": 115112, + "/length/forward/min": 2976, + "/length/forward/std": 2356.3995413175926, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.673055242390079, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4690968802724136, + "/record/score/max": 1.0, + "/record/score/mean": 0.6356711321250328, + "/record/score/min": 0.0, + "/record/score/std": 0.4812414611272727, + "advantages": -0.14799107142857285, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.020134242058078504, + "entropy": 0.6791585286458334, + "entropy/max": 0.9296875, + "entropy/min": 0.4921875, + "entropy/std": 0.009211186287749336, + "epoch": 0.025, + "grad_norm": 16512.0, + "learning_rate": 1.9990133025645437e-06, + "loss": -228.33545, + "out_of_date_ratio": 0.0014518716141841044, + "out_of_date_ratio/max": 0.003765060333535075, + "out_of_date_ratio/min": 0.00016504373343195766, + "out_of_date_ratio/std": 8.420615578605796e-05, + "rewards": 0.5208333333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.050986719290237514, + "sampled_at_step": 24.447250723838806, + "sampled_at_step/max": 25.000001907348633, + "sampled_at_step/min": 23.619524002075195, + "sampled_at_step/std": 0.04587732273306496, + "scores": 0.4421875, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009815837705506364, + "step": 25, + "steps": 25.666666666666668, + "steps/max": 79, + "steps/min": 1, + "steps/std": 1.8250087201719623 + }, + { + "/length/completion": 10101.09375, + "/length/completion/max": 20788, + "/length/completion/min": 3039, + "/length/completion/std": 403.0559760239524, + "/length/context": 48021.208333333336, + "/length/context/max": 112424, + "/length/context/min": 7596, + "/length/context/std": 2270.2870376868336, + "/length/forward": 48025.833333333336, + "/length/forward/max": 112432, + "/length/forward/min": 7600, + "/length/forward/std": 2270.3245285615944, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6428571428571429, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4791574237499546, + "/record/score/max": 1.0, + "/record/score/mean": 0.6338742393509128, + "/record/score/min": 0.0, + "/record/score/std": 0.48174442190669386, + "advantages": -0.05486008836524312, + "advantages/max": 2.0, + "advantages/min": -1.4285714285714286, + "advantages/std": 0.018416687139217066, + "entropy": 0.6940511067708334, + "entropy/max": 0.89453125, + "entropy/min": 0.51171875, + "entropy/std": 0.008278710396076156, + "epoch": 0.026, + "grad_norm": 19840.0, + "learning_rate": 1.998867339183008e-06, + "loss": -140.20670625, + "out_of_date_ratio": 0.001547872148269865, + "out_of_date_ratio/max": 0.004394118674099445, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 8.020284871100495e-05, + "rewards": 0.3229166666666667, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.047723322942373116, + "sampled_at_step": 25.289051393667858, + "sampled_at_step/max": 26.000001907348633, + "sampled_at_step/min": 24.999998092651367, + "sampled_at_step/std": 0.037862932441932125, + "scores": 0.3286082474226804, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00843075316638251, + "step": 26, + "steps": 31.333333333333332, + "steps/max": 89, + "steps/min": 1, + "steps/std": 1.9080232341449717 + }, + { + "/length/completion": 8978.75, + "/length/completion/max": 18034, + "/length/completion/min": 3624, + "/length/completion/std": 311.56472994576995, + "/length/context": 52919.604166666664, + "/length/context/max": 103118, + "/length/context/min": 11282, + "/length/context/std": 2021.9312112484567, + "/length/forward": 52924.333333333336, + "/length/forward/max": 103120, + "/length/forward/min": 11288, + "/length/forward/std": 2021.9383608195633, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6522222222222223, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4762650470711957, + "/record/score/max": 1.0, + "/record/score/mean": 0.6329268292682927, + "/record/score/min": 0.0, + "/record/score/std": 0.4820066991865132, + "advantages": -0.07285397529300003, + "advantages/max": 2.0, + "advantages/min": -1.7142857142857142, + "advantages/std": 0.017689748039436834, + "entropy": 0.6405843098958334, + "entropy/max": 0.81640625, + "entropy/min": 0.478515625, + "entropy/std": 0.008094466537248381, + "epoch": 0.027, + "grad_norm": 15616.0, + "learning_rate": 1.998711317218456e-06, + "loss": -217.17899583333335, + "out_of_date_ratio": 0.0014654934244996791, + "out_of_date_ratio/max": 0.003957169596105814, + "out_of_date_ratio/min": 0.00015121730393730104, + "out_of_date_ratio/std": 8.927867169875872e-05, + "rewards": 0.4270833333333333, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.050485472304150465, + "sampled_at_step": 26.36866702636083, + "sampled_at_step/max": 27.000001907348633, + "sampled_at_step/min": 25.0, + "sampled_at_step/std": 0.050479192137442085, + "scores": 0.3801076971808679, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00863921262874711, + "step": 27, + "steps": 31.885416666666668, + "steps/max": 71, + "steps/min": 5, + "steps/std": 1.6309233516920185 + }, + { + "/length/completion": 10679.104166666666, + "/length/completion/max": 26624, + "/length/completion/min": 2763, + "/length/completion/std": 485.3186727086443, + "/length/context": 54714.947916666664, + "/length/context/max": 115639, + "/length/context/min": 3676, + "/length/context/std": 2660.0730577165136, + "/length/forward": 54719.416666666664, + "/length/forward/max": 115640, + "/length/forward/min": 3680, + "/length/forward/std": 2660.049047283798, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.677765843179377, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4673321142386053, + "/record/score/max": 1.0, + "/record/score/mean": 0.6375442739079102, + "/record/score/min": 0.0, + "/record/score/std": 0.48070944729133946, + "advantages": -0.017792985457656552, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.018213078624609743, + "entropy": 0.6812744140625, + "entropy/max": 0.859375, + "entropy/min": 0.47265625, + "entropy/std": 0.007308055562462717, + "epoch": 0.028, + "grad_norm": 17920.0, + "learning_rate": 1.9985452382420274e-06, + "loss": -186.76735416666668, + "out_of_date_ratio": 0.001270884770140886, + "out_of_date_ratio/max": 0.0031341412104666233, + "out_of_date_ratio/min": 0.00020686801872216165, + "out_of_date_ratio/std": 7.350484083591433e-05, + "rewards": 0.6041666666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04991130733147589, + "sampled_at_step": 27.260556002457935, + "sampled_at_step/max": 28.000001907348633, + "sampled_at_step/min": 26.0, + "sampled_at_step/std": 0.05604123740619327, + "scores": 0.5955089820359282, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00849230074604125, + "step": 28, + "steps": 33.791666666666664, + "steps/max": 92, + "steps/min": 1, + "steps/std": 2.219846006034937 + }, + { + "/length/completion": 9437.427083333334, + "/length/completion/max": 28132, + "/length/completion/min": 2891, + "/length/completion/std": 436.3822586208799, + "/length/context": 51917.46875, + "/length/context/max": 118962, + "/length/context/min": 10854, + "/length/context/std": 2628.8098323681484, + "/length/forward": 51922.166666666664, + "/length/forward/max": 118968, + "/length/forward/min": 10856, + "/length/forward/std": 2628.812668929029, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6684901531728665, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.47075584784873803, + "/record/score/max": 1.0, + "/record/score/mean": 0.6378043178686266, + "/record/score/min": 0.0, + "/record/score/std": 0.48063496541217493, + "advantages": 0.01068174677976811, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.01657294113911273, + "entropy": 0.6856689453125, + "entropy/max": 0.87890625, + "entropy/min": 0.4765625, + "entropy/std": 0.006581478913246546, + "epoch": 0.029, + "grad_norm": 16768.0, + "learning_rate": 1.9983691039261353e-06, + "loss": 0.014404166666666668, + "out_of_date_ratio": 0.001233935588516033, + "out_of_date_ratio/max": 0.003753588069230318, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 7.756263237153342e-05, + "rewards": 0.6770833333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.047723322942373116, + "sampled_at_step": 28.41019606590271, + "sampled_at_step/max": 29.000001907348633, + "sampled_at_step/min": 27.322860717773438, + "sampled_at_step/std": 0.0516098577435189, + "scores": 0.6760917373546969, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008294601161650362, + "step": 29, + "steps": 32.15625, + "steps/max": 99, + "steps/min": 4, + "steps/std": 2.146709374423526 + }, + { + "/length/completion": 11369.114583333334, + "/length/completion/max": 23314, + "/length/completion/min": 3318, + "/length/completion/std": 507.260502736684, + "/length/context": 58615.427083333336, + "/length/context/max": 113119, + "/length/context/min": 16362, + "/length/context/std": 2548.0394785382373, + "/length/forward": 58619.833333333336, + "/length/forward/max": 113120, + "/length/forward/min": 16368, + "/length/forward/std": 2548.016881319358, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6605691056910569, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.47351616899275406, + "/record/score/max": 1.0, + "/record/score/mean": 0.6413898601398601, + "/record/score/min": 0.0, + "/record/score/std": 0.47959243889956243, + "advantages": -0.00934836403629328, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.016053681878628597, + "entropy": 0.6816813151041666, + "entropy/max": 0.86328125, + "entropy/min": 0.49609375, + "entropy/std": 0.007637391596894437, + "epoch": 0.03, + "grad_norm": 17408.0, + "learning_rate": 1.998182916044451e-06, + "loss": 372.35253958333334, + "out_of_date_ratio": 0.0014495693473387898, + "out_of_date_ratio/max": 0.010763758793473244, + "out_of_date_ratio/min": 0.00022534365416504443, + "out_of_date_ratio/std": 0.0001220856333759772, + "rewards": 0.5104166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05101996066242478, + "sampled_at_step": 29.722339312235516, + "sampled_at_step/max": 30.000001907348633, + "sampled_at_step/min": 28.63516616821289, + "sampled_at_step/std": 0.04449124005690786, + "scores": 0.46989276876546604, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008275792529581869, + "step": 30, + "steps": 36.885416666666664, + "steps/max": 91, + "steps/min": 6, + "steps/std": 2.0705791778804565 + }, + { + "/length/completion": 9677.302083333334, + "/length/completion/max": 20464, + "/length/completion/min": 3520, + "/length/completion/std": 360.3757061020706, + "/length/context": 47705.333333333336, + "/length/context/max": 114634, + "/length/context/min": 10643, + "/length/context/std": 2297.2822418484816, + "/length/forward": 47709.666666666664, + "/length/forward/max": 114640, + "/length/forward/min": 10648, + "/length/forward/std": 2297.297518331373, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6553147574819401, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.47526553221154866, + "/record/score/max": 1.0, + "/record/score/mean": 0.6407540775259479, + "/record/score/min": 0.0, + "/record/score/std": 0.47977941771174426, + "advantages": 0.04816326530612251, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01873621505921223, + "entropy": 0.6687418619791666, + "entropy/max": 0.84375, + "entropy/min": 0.546875, + "entropy/std": 0.0069627937479380154, + "epoch": 0.031, + "grad_norm": 16512.0, + "learning_rate": 1.9979866764718843e-06, + "loss": 245.54707916666666, + "out_of_date_ratio": 0.0014398050516319927, + "out_of_date_ratio/max": 0.0034001213498413563, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 8.200511865324555e-05, + "rewards": 0.5625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05063078670631141, + "sampled_at_step": 30.396397809187572, + "sampled_at_step/max": 31.000001907348633, + "sampled_at_step/min": 29.999998092651367, + "sampled_at_step/std": 0.040759151687636004, + "scores": 0.5892857142857143, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00929723578782738, + "step": 31, + "steps": 28.166666666666668, + "steps/max": 81, + "steps/min": 3, + "steps/std": 1.9046080512020584 + }, + { + "/length/completion": 11456.354166666666, + "/length/completion/max": 23502, + "/length/completion/min": 3792, + "/length/completion/std": 482.2487134353623, + "/length/context": 56354.708333333336, + "/length/context/max": 97717, + "/length/context/min": 12262, + "/length/context/std": 2432.4953052653914, + "/length/forward": 56359.333333333336, + "/length/forward/max": 97720, + "/length/forward/min": 12264, + "/length/forward/std": 2432.503741765612, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6494845360824743, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.47713140063530446, + "/record/score/max": 1.0, + "/record/score/mean": 0.6368864243171083, + "/record/score/min": 0.0, + "/record/score/std": 0.4808971894674339, + "advantages": 0.07540518464382524, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.016593084064504177, + "entropy": 0.6540120442708334, + "entropy/max": 0.82421875, + "entropy/min": 0.4921875, + "entropy/std": 0.0068052894061513365, + "epoch": 0.032, + "grad_norm": 18560.0, + "learning_rate": 1.997780387184565e-06, + "loss": 354.41247500000003, + "out_of_date_ratio": 0.0013360527542924199, + "out_of_date_ratio/max": 0.0034993954468518496, + "out_of_date_ratio/min": 0.0001190476177725941, + "out_of_date_ratio/std": 7.784003677692855e-05, + "rewards": 0.5625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05063078670631141, + "sampled_at_step": 31.47506093978882, + "sampled_at_step/max": 32.0, + "sampled_at_step/min": 30.20079231262207, + "sampled_at_step/std": 0.046957284692903785, + "scores": 0.5853144748721035, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008546511861260894, + "step": 32, + "steps": 33.614583333333336, + "steps/max": 84, + "steps/min": 4, + "steps/std": 2.050542451925803 + }, + { + "/length/completion": 10748.010416666666, + "/length/completion/max": 28022, + "/length/completion/min": 2958, + "/length/completion/std": 467.90537643482224, + "/length/context": 52605.020833333336, + "/length/context/max": 119246, + "/length/context/min": 6469, + "/length/context/std": 2694.21027027666, + "/length/forward": 52609.5, + "/length/forward/max": 119248, + "/length/forward/min": 6472, + "/length/forward/std": 2694.183390793798, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.675701839303001, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.46811202042411, + "/record/score/max": 1.0, + "/record/score/mean": 0.6410408042578356, + "/record/score/min": 0.0, + "/record/score/std": 0.47969520691195516, + "advantages": 0.035493036471189295, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01854459908350241, + "entropy": 0.6657511393229166, + "entropy/max": 0.94921875, + "entropy/min": 0.474609375, + "entropy/std": 0.00920301048279462, + "epoch": 0.033, + "grad_norm": 18944.0, + "learning_rate": 1.997564050259824e-06, + "loss": 32.570058333333336, + "out_of_date_ratio": 0.0016721839908010832, + "out_of_date_ratio/max": 0.03893996775150299, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 0.00039718575078297327, + "rewards": 0.6041666666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04991130733147588, + "sampled_at_step": 32.382101813952126, + "sampled_at_step/max": 33.0, + "sampled_at_step/min": 31.0, + "sampled_at_step/std": 0.05291823048647063, + "scores": 0.5575480925986306, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008968448426516548, + "step": 33, + "steps": 30.947916666666668, + "steps/max": 83, + "steps/min": 0, + "steps/std": 2.1503476882781585 + }, + { + "/length/completion": 9198.875, + "/length/completion/max": 21159, + "/length/completion/min": 1864, + "/length/completion/std": 383.6682451283641, + "/length/context": 47677.3125, + "/length/context/max": 127958, + "/length/context/min": 2721, + "/length/context/std": 2562.6787387829513, + "/length/forward": 47681.916666666664, + "/length/forward/max": 127960, + "/length/forward/min": 2728, + "/length/forward/std": 2562.679914401653, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6801579466929911, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4664151736847391, + "/record/score/max": 1.0, + "/record/score/mean": 0.6436405618626131, + "/record/score/min": 0.0, + "/record/score/std": 0.47892315561872006, + "advantages": -0.0682769007642093, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01756966769979579, + "entropy": 0.6900634765625, + "entropy/max": 0.8828125, + "entropy/min": 0.43359375, + "entropy/std": 0.008183432081512754, + "epoch": 0.034, + "grad_norm": 15040.0, + "learning_rate": 1.997337667876172e-06, + "loss": -356.5561125, + "out_of_date_ratio": 0.001257536003322457, + "out_of_date_ratio/max": 0.005396825261414051, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 9.732487820129926e-05, + "rewards": 0.6875, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04730703678277331, + "sampled_at_step": 33.30234805742899, + "sampled_at_step/max": 34.0, + "sampled_at_step/min": 33.0, + "sampled_at_step/std": 0.03829985984963102, + "scores": 0.5903943771964076, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009717381918579165, + "step": 34, + "steps": 25.677083333333332, + "steps/max": 92, + "steps/min": 1, + "steps/std": 1.7743528011009773 + }, + { + "/length/completion": 9827.28125, + "/length/completion/max": 22478, + "/length/completion/min": 1481, + "/length/completion/std": 426.35430911083745, + "/length/context": 52099.104166666664, + "/length/context/max": 121980, + "/length/context/min": 4481, + "/length/context/std": 2602.116649626552, + "/length/forward": 52103.75, + "/length/forward/max": 121984, + "/length/forward/min": 4488, + "/length/forward/std": 2602.108545255828, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6769078295341923, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.46765758825181203, + "/record/score/max": 1.0, + "/record/score/mean": 0.6439947536068953, + "/record/score/min": 0.0, + "/record/score/std": 0.478816782218094, + "advantages": -0.11816995990144273, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01787321571590539, + "entropy": 0.6913859049479166, + "entropy/max": 0.92578125, + "entropy/min": 0.462890625, + "entropy/std": 0.009919730611026047, + "epoch": 0.035, + "grad_norm": 16064.0, + "learning_rate": 1.9971012423132772e-06, + "loss": -428.37520416666666, + "out_of_date_ratio": 0.0012809614311966773, + "out_of_date_ratio/max": 0.004535594489425421, + "out_of_date_ratio/min": 0.00010249052138533443, + "out_of_date_ratio/std": 8.340348975318018e-05, + "rewards": 0.625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04941058844013093, + "sampled_at_step": 34.23181116580963, + "sampled_at_step/max": 35.0, + "sampled_at_step/min": 33.0, + "sampled_at_step/std": 0.05921732305282789, + "scores": 0.5302671626648631, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009177981231588788, + "step": 35, + "steps": 29.802083333333332, + "steps/max": 77, + "steps/min": 0, + "steps/std": 1.7894620515550024 + }, + { + "/length/completion": 9422.875, + "/length/completion/max": 23769, + "/length/completion/min": 2900, + "/length/completion/std": 459.94317547239297, + "/length/context": 49770.354166666664, + "/length/context/max": 126792, + "/length/context/min": 6995, + "/length/context/std": 2917.8340353953995, + "/length/forward": 49774.916666666664, + "/length/forward/max": 126800, + "/length/forward/min": 7000, + "/length/forward/std": 2917.8420979566768, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.678646934460888, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4669960094130649, + "/record/score/max": 1.0, + "/record/score/mean": 0.6463280964559737, + "/record/score/min": 0.0, + "/record/score/std": 0.4781088664599017, + "advantages": -0.01185733257446363, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01525715744572812, + "entropy": 0.7012125651041666, + "entropy/max": 1.0078125, + "entropy/min": 0.5234375, + "entropy/std": 0.008509193891243078, + "epoch": 0.036, + "grad_norm": 13440.0, + "learning_rate": 1.9968547759519425e-06, + "loss": -72.75817916666666, + "out_of_date_ratio": 0.0014415098589779518, + "out_of_date_ratio/max": 0.004487856291234493, + "out_of_date_ratio/min": 0.00010764262697193772, + "out_of_date_ratio/std": 9.46737329751352e-05, + "rewards": 0.5833333333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05031728036871333, + "sampled_at_step": 35.245263735453285, + "sampled_at_step/max": 36.000003814697266, + "sampled_at_step/min": 34.0, + "sampled_at_step/std": 0.051642521621436184, + "scores": 0.5703851261620186, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009019786682503346, + "step": 36, + "steps": 30.375, + "steps/max": 96, + "steps/min": 2, + "steps/std": 2.3679193611046903 + }, + { + "/length/completion": 9100.052083333334, + "/length/completion/max": 24602, + "/length/completion/min": 2334, + "/length/completion/std": 462.026403863143, + "/length/context": 48322.8125, + "/length/context/max": 104864, + "/length/context/min": 3297, + "/length/context/std": 2566.345017526433, + "/length/forward": 48327.333333333336, + "/length/forward/max": 104872, + "/length/forward/min": 3304, + "/length/forward/std": 2566.355532188125, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6878363832077503, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4633761896595802, + "/record/score/max": 1.0, + "/record/score/mean": 0.6477414747366542, + "/record/score/min": 0.0, + "/record/score/std": 0.47767400666420845, + "advantages": 0.008175892166610221, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.017455894236115342, + "entropy": 0.7001139322916666, + "entropy/max": 0.9375, + "entropy/min": 0.52734375, + "entropy/std": 0.00808525356253917, + "epoch": 0.037, + "grad_norm": 30336.0, + "learning_rate": 1.9965982712740806e-06, + "loss": 11.875345833333334, + "out_of_date_ratio": 0.0015731908315501641, + "out_of_date_ratio/max": 0.015670742839574814, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 0.00018833875065268633, + "rewards": 0.6354166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04912381533653095, + "sampled_at_step": 36.266119639078774, + "sampled_at_step/max": 37.0, + "sampled_at_step/min": 35.0, + "sampled_at_step/std": 0.062039810694395646, + "scores": 0.654292343387471, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009352466428458224, + "step": 37, + "steps": 25.9375, + "steps/max": 72, + "steps/min": 1, + "steps/std": 1.6376253994075451 + }, + { + "/length/completion": 10135.427083333334, + "/length/completion/max": 30915, + "/length/completion/min": 3327, + "/length/completion/std": 468.35185469585446, + "/length/context": 53150.572916666664, + "/length/context/max": 122558, + "/length/context/min": 15804, + "/length/context/std": 2433.6580449971475, + "/length/forward": 53155.25, + "/length/forward/max": 122560, + "/length/forward/min": 15808, + "/length/forward/std": 2433.6683033657837, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6797385620915033, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4665769489562024, + "/record/score/max": 1.0, + "/record/score/mean": 0.6460006985679357, + "/record/score/min": 0.0, + "/record/score/std": 0.47820894598248015, + "advantages": -0.1376781704019445, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01651928713975243, + "entropy": 0.6912027994791666, + "entropy/max": 0.83203125, + "entropy/min": 0.53125, + "entropy/std": 0.00708367145695894, + "epoch": 0.038, + "grad_norm": 15680.0, + "learning_rate": 1.996331730862691e-06, + "loss": -134.66802916666668, + "out_of_date_ratio": 0.0013084049907471733, + "out_of_date_ratio/max": 0.003387369913980365, + "out_of_date_ratio/min": 0.0002214839478256181, + "out_of_date_ratio/std": 7.499232801296063e-05, + "rewards": 0.53125, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05093126879064569, + "sampled_at_step": 37.08955097198486, + "sampled_at_step/max": 38.000003814697266, + "sampled_at_step/min": 36.19251251220703, + "sampled_at_step/std": 0.041960562771199195, + "scores": 0.4007058068655759, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008777370925423156, + "step": 38, + "steps": 31.46875, + "steps/max": 98, + "steps/min": 4, + "steps/std": 1.941804868393939 + }, + { + "/length/completion": 9828.083333333334, + "/length/completion/max": 23061, + "/length/completion/min": 1558, + "/length/completion/std": 470.9235785716673, + "/length/context": 51597.614583333336, + "/length/context/max": 109570, + "/length/context/min": 4229, + "/length/context/std": 2691.8637708957626, + "/length/forward": 51602.166666666664, + "/length/forward/max": 109576, + "/length/forward/min": 4232, + "/length/forward/std": 2691.8928374513284, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6431095406360424, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.47908210087514325, + "/record/score/max": 1.0, + "/record/score/mean": 0.6410693001872978, + "/record/score/min": 0.0, + "/record/score/std": 0.47968682757051556, + "advantages": -0.13596059113300493, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.018862637046954452, + "entropy": 0.6741943359375, + "entropy/max": 0.91015625, + "entropy/min": 0.4609375, + "entropy/std": 0.00861796389963598, + "epoch": 0.039, + "grad_norm": 16384.0, + "learning_rate": 1.996055157401834e-06, + "loss": -246.81274374999998, + "out_of_date_ratio": 0.0014630949255357943, + "out_of_date_ratio/max": 0.004159239586442709, + "out_of_date_ratio/min": 0.0001836041483329609, + "out_of_date_ratio/std": 9.1284843466261e-05, + "rewards": 0.5625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05063078670631141, + "sampled_at_step": 38.16589645544688, + "sampled_at_step/max": 39.0, + "sampled_at_step/min": 37.30094528198242, + "sampled_at_step/std": 0.036066542168659795, + "scores": 0.503448275862069, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009284546103208795, + "step": 39, + "steps": 29.208333333333332, + "steps/max": 89, + "steps/min": 1, + "steps/std": 1.9946715404793218 + }, + { + "/length/completion": 10498.177083333334, + "/length/completion/max": 20891, + "/length/completion/min": 2779, + "/length/completion/std": 468.1356710498424, + "/length/context": 56444.9375, + "/length/context/max": 124660, + "/length/context/min": 9841, + "/length/context/std": 2764.127233626301, + "/length/forward": 56449.5, + "/length/forward/max": 124664, + "/length/forward/min": 9848, + "/length/forward/std": 2764.099841701785, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6257744733581165, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4839222890624985, + "/record/score/max": 1.0, + "/record/score/mean": 0.6392202991093934, + "/record/score/min": 0.0, + "/record/score/std": 0.4802267259492034, + "advantages": -0.047900650502660895, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.015531604552016694, + "entropy": 0.6410319010416666, + "entropy/max": 0.796875, + "entropy/min": 0.478515625, + "entropy/std": 0.00667534252894688, + "epoch": 0.04, + "grad_norm": 16512.0, + "learning_rate": 1.9957685536765995e-06, + "loss": -490.44510833333334, + "out_of_date_ratio": 0.0016274743311441853, + "out_of_date_ratio/max": 0.009337756782770157, + "out_of_date_ratio/min": 0.00029226948390714824, + "out_of_date_ratio/std": 0.00011572550944232628, + "rewards": 0.4166666666666667, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05031728036871333, + "sampled_at_step": 39.08427309989929, + "sampled_at_step/max": 39.60692596435547, + "sampled_at_step/min": 38.14115524291992, + "sampled_at_step/std": 0.02244150888420229, + "scores": 0.4544648137196925, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008561989957449728, + "step": 40, + "steps": 34.229166666666664, + "steps/max": 98, + "steps/min": 3, + "steps/std": 2.1958411985285036 + }, + { + "/length/completion": 8468.854166666666, + "/length/completion/max": 16865, + "/length/completion/min": 3451, + "/length/completion/std": 290.9413056124056, + "/length/context": 45148.833333333336, + "/length/context/max": 94495, + "/length/context/min": 17906, + "/length/context/std": 1885.4234569782568, + "/length/forward": 45153.0, + "/length/forward/max": 94496, + "/length/forward/min": 17912, + "/length/forward/std": 1885.4056712386564, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6025316455696202, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4893743573756043, + "/record/score/max": 1.0, + "/record/score/mean": 0.6380216891225764, + "/record/score/min": 0.0, + "/record/score/std": 0.48057258903494593, + "advantages": -0.1638573108584894, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.020575259221953697, + "entropy": 0.6600748697916666, + "entropy/max": 0.8203125, + "entropy/min": 0.515625, + "entropy/std": 0.005811703397960358, + "epoch": 0.041, + "grad_norm": 14784.0, + "learning_rate": 1.9954719225730845e-06, + "loss": -400.2380375, + "out_of_date_ratio": 0.0013619511713235017, + "out_of_date_ratio/max": 0.01208761241286993, + "out_of_date_ratio/min": 0.00017220595327671617, + "out_of_date_ratio/std": 0.00013614273060853817, + "rewards": 0.625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.04941058844013093, + "sampled_at_step": 40.14164388179779, + "sampled_at_step/max": 41.0, + "sampled_at_step/min": 39.0, + "sampled_at_step/std": 0.059141493917955895, + "scores": 0.5049000392003136, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00989905914015969, + "step": 41, + "steps": 25.572916666666668, + "steps/max": 82, + "steps/min": 6, + "steps/std": 1.6592115026998668 + }, + { + "/length/completion": 9004.208333333334, + "/length/completion/max": 22447, + "/length/completion/min": 3307, + "/length/completion/std": 401.7670371646414, + "/length/context": 48468.375, + "/length/context/max": 118308, + "/length/context/min": 4344, + "/length/context/std": 2498.548039093242, + "/length/forward": 48472.75, + "/length/forward/max": 118312, + "/length/forward/min": 4352, + "/length/forward/std": 2498.565013379694, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5725190839694656, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.49471303041281367, + "/record/score/max": 1.0, + "/record/score/mean": 0.6363929146537842, + "/record/score/min": 0.0, + "/record/score/std": 0.4810373923430952, + "advantages": -0.150188230632059, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.01863469859831098, + "entropy": 0.6724446614583334, + "entropy/max": 0.88671875, + "entropy/min": 0.5078125, + "entropy/std": 0.007728973447664365, + "epoch": 0.042, + "grad_norm": 15872.0, + "learning_rate": 1.995165267078361e-06, + "loss": -295.17856875, + "out_of_date_ratio": 0.0015124180954444455, + "out_of_date_ratio/max": 0.012281018309295177, + "out_of_date_ratio/min": 0.0002090737980324775, + "out_of_date_ratio/std": 0.0001382285172342395, + "rewards": 0.5833333333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05031728036871333, + "sampled_at_step": 41.327045361200966, + "sampled_at_step/max": 42.0, + "sampled_at_step/min": 40.344154357910156, + "sampled_at_step/std": 0.04198771784926251, + "scores": 0.478502080443828, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009301876674003574, + "step": 42, + "steps": 29.041666666666668, + "steps/max": 87, + "steps/min": 1, + "steps/std": 1.926881558505675 + }, + { + "/length/completion": 10251.0625, + "/length/completion/max": 23711, + "/length/completion/min": 3195, + "/length/completion/std": 406.6798169120462, + "/length/context": 57392.572916666664, + "/length/context/max": 128658, + "/length/context/min": 6731, + "/length/context/std": 2686.198047917464, + "/length/forward": 57397.416666666664, + "/length/forward/max": 128664, + "/length/forward/min": 6736, + "/length/forward/std": 2686.221671415675, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5697399527186762, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.49511245085818306, + "/record/score/max": 1.0, + "/record/score/mean": 0.6366197183098592, + "/record/score/min": 0.0, + "/record/score/std": 0.48097302686214627, + "advantages": -0.040922190201730727, + "advantages/max": 2.0, + "advantages/min": -1.7142857142857142, + "advantages/std": 0.018565406605664785, + "entropy": 0.7135823567708334, + "entropy/max": 0.9140625, + "entropy/min": 0.53515625, + "entropy/std": 0.008849231390132668, + "epoch": 0.043, + "grad_norm": 19584.0, + "learning_rate": 1.994848590280447e-06, + "loss": -46.936462500000005, + "out_of_date_ratio": 0.0016592781415359543, + "out_of_date_ratio/max": 0.005419677589088678, + "out_of_date_ratio/min": 0.00015295197954401374, + "out_of_date_ratio/std": 9.289293381818487e-05, + "rewards": 0.4375, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05063078670631141, + "sampled_at_step": 42.43688189983368, + "sampled_at_step/max": 43.000003814697266, + "sampled_at_step/min": 41.0, + "sampled_at_step/std": 0.04539542797800991, + "scores": 0.39020172910662826, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00828081250988518, + "step": 43, + "steps": 35.145833333333336, + "steps/max": 97, + "steps/min": 1, + "steps/std": 2.2647557799388003 + }, + { + "/length/completion": 9909.958333333334, + "/length/completion/max": 25309, + "/length/completion/min": 2894, + "/length/completion/std": 413.7526306239452, + "/length/context": 50159.177083333336, + "/length/context/max": 119338, + "/length/context/min": 9507, + "/length/context/std": 2976.311486937006, + "/length/forward": 50163.583333333336, + "/length/forward/max": 119344, + "/length/forward/min": 9512, + "/length/forward/std": 2976.3120073100904, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5692137320044297, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.49518628747373306, + "/record/score/max": 1.0, + "/record/score/mean": 0.6355764848853106, + "/record/score/min": 0.0, + "/record/score/std": 0.4812681339400569, + "advantages": -0.030305856023870745, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.016289078510691973, + "entropy": 0.6901448567708334, + "entropy/max": 0.9140625, + "entropy/min": 0.48046875, + "entropy/std": 0.008722974198219498, + "epoch": 0.044, + "grad_norm": 15232.0, + "learning_rate": 1.994521895368273e-06, + "loss": -118.76766041666667, + "out_of_date_ratio": 0.0013302226734595024, + "out_of_date_ratio/max": 0.005487493705004454, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 8.94447647063914e-05, + "rewards": 0.59375, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05012598061177124, + "sampled_at_step": 43.42779644330343, + "sampled_at_step/max": 44.000003814697266, + "sampled_at_step/min": 42.6025276184082, + "sampled_at_step/std": 0.046298369511212654, + "scores": 0.48204960835509136, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009027044213490274, + "step": 44, + "steps": 30.916666666666668, + "steps/max": 91, + "steps/min": 2, + "steps/std": 2.3740251410569284 + }, + { + "/length/completion": 9256.635416666666, + "/length/completion/max": 22972, + "/length/completion/min": 3043, + "/length/completion/std": 451.1958094288279, + "/length/context": 46203.208333333336, + "/length/context/max": 112819, + "/length/context/min": 12488, + "/length/context/std": 2378.974874873011, + "/length/forward": 46207.833333333336, + "/length/forward/max": 112824, + "/length/forward/min": 12496, + "/length/forward/std": 2378.9563187198955, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6008537886872999, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4897228944080695, + "/record/score/max": 1.0, + "/record/score/mean": 0.6373756865073474, + "/record/score/min": 0.0, + "/record/score/std": 0.4807576528321059, + "advantages": -0.0014897579143391263, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.018824250703849507, + "entropy": 0.6787516276041666, + "entropy/max": 0.8515625, + "entropy/min": 0.53125, + "entropy/std": 0.007416265674992091, + "epoch": 0.045, + "grad_norm": 16512.0, + "learning_rate": 1.9941851856316543e-06, + "loss": -221.58431041666665, + "out_of_date_ratio": 0.001474499657585208, + "out_of_date_ratio/max": 0.003614853834733367, + "out_of_date_ratio/min": 0.000163612567121163, + "out_of_date_ratio/std": 8.391956347657486e-05, + "rewards": 0.5208333333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05098671929023751, + "sampled_at_step": 44.238667726516724, + "sampled_at_step/max": 45.0, + "sampled_at_step/min": 43.0, + "sampled_at_step/std": 0.04742509657311229, + "scores": 0.539292364990689, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009619504357267591, + "step": 45, + "steps": 26.96875, + "steps/max": 75, + "steps/min": 4, + "steps/std": 1.8648133659897332 + }, + { + "/length/completion": 10171.645833333334, + "/length/completion/max": 18897, + "/length/completion/min": 1419, + "/length/completion/std": 372.87753867676923, + "/length/context": 46847.260416666664, + "/length/context/max": 118579, + "/length/context/min": 12678, + "/length/context/std": 2387.7613746383, + "/length/forward": 46851.5, + "/length/forward/max": 118584, + "/length/forward/min": 12680, + "/length/forward/std": 2387.756631967953, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.602711157455683, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4893367124323641, + "/record/score/max": 1.0, + "/record/score/mean": 0.6342740762292697, + "/record/score/min": 0.0, + "/record/score/std": 0.48163313055974066, + "advantages": 0.002143392991105286, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.020591038312953034, + "entropy": 0.71630859375, + "entropy/max": 0.9765625, + "entropy/min": 0.5703125, + "entropy/std": 0.008231126334650838, + "epoch": 0.046, + "grad_norm": 17664.0, + "learning_rate": 1.993838464461254e-06, + "loss": -98.02665208333333, + "out_of_date_ratio": 0.00150063132878131, + "out_of_date_ratio/max": 0.014094432815909386, + "out_of_date_ratio/min": 0.00014930944598745555, + "out_of_date_ratio/std": 0.00016047740447908465, + "rewards": 0.5520833333333334, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05075342008066314, + "sampled_at_step": 45.25676174958547, + "sampled_at_step/max": 46.000003814697266, + "sampled_at_step/min": 44.0, + "sampled_at_step/std": 0.05679606363792777, + "scores": 0.5375093773443361, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009656381511994647, + "step": 46, + "steps": 26.770833333333332, + "steps/max": 99, + "steps/min": 1, + "steps/std": 2.0040366797041256 + }, + { + "/length/completion": 11373.427083333334, + "/length/completion/max": 27384, + "/length/completion/min": 3119, + "/length/completion/std": 528.5598256687088, + "/length/context": 53539.96875, + "/length/context/max": 128783, + "/length/context/min": 4073, + "/length/context/std": 3093.0401978765053, + "/length/forward": 53544.416666666664, + "/length/forward/max": 128784, + "/length/forward/min": 4080, + "/length/forward/std": 3093.016143871078, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5946775844421699, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4909543308742636, + "/record/score/max": 1.0, + "/record/score/mean": 0.6324810741322668, + "/record/score/min": 0.0, + "/record/score/std": 0.4821294068989783, + "advantages": -0.06990488503093467, + "advantages/max": 2.0, + "advantages/min": -1.7142857142857142, + "advantages/std": 0.017079460019094654, + "entropy": 0.7022705078125, + "entropy/max": 0.9921875, + "entropy/min": 0.51953125, + "entropy/std": 0.008319350894964734, + "epoch": 0.047, + "grad_norm": 18432.0, + "learning_rate": 1.9934817353485502e-06, + "loss": -573.1879875, + "out_of_date_ratio": 0.0016289489382567506, + "out_of_date_ratio/max": 0.00349434744566679, + "out_of_date_ratio/min": 0.00033151003299281, + "out_of_date_ratio/std": 7.278455981428102e-05, + "rewards": 0.40625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05012598061177124, + "sampled_at_step": 46.28057046731313, + "sampled_at_step/max": 47.000003814697266, + "sampled_at_step/min": 45.32149124145508, + "sampled_at_step/std": 0.041052582981975456, + "scores": 0.3610213316095669, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008634743818926866, + "step": 47, + "steps": 31.229166666666668, + "steps/max": 89, + "steps/min": 1, + "steps/std": 2.263054296677534 + }, + { + "/length/completion": 12594.364583333334, + "/length/completion/max": 27909, + "/length/completion/min": 4662, + "/length/completion/std": 465.0296403803164, + "/length/context": 62327.614583333336, + "/length/context/max": 128086, + "/length/context/min": 11295, + "/length/context/std": 2814.866780361495, + "/length/forward": 62332.166666666664, + "/length/forward/max": 128088, + "/length/forward/min": 11296, + "/length/forward/std": 2814.867033516383, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6015779092702169, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4895732104070766, + "/record/score/max": 1.0, + "/record/score/mean": 0.631578947368421, + "/record/score/min": 0.0, + "/record/score/std": 0.4823763889427196, + "advantages": 0.07370510176285243, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.017063208956500202, + "entropy": 0.6767171223958334, + "entropy/max": 0.875, + "entropy/min": 0.462890625, + "entropy/std": 0.008830032891094176, + "epoch": 0.048, + "grad_norm": 20352.0, + "learning_rate": 1.993115001885801e-06, + "loss": 524.9152833333334, + "out_of_date_ratio": 0.0015428930751113512, + "out_of_date_ratio/max": 0.010222065262496471, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 0.00012055963656145637, + "rewards": 0.4895833333333333, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05101996066242478, + "sampled_at_step": 47.23998463153839, + "sampled_at_step/max": 48.0, + "sampled_at_step/min": 46.0, + "sampled_at_step/std": 0.05458848832887572, + "scores": 0.4832470716426042, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.008247719765965627, + "step": 48, + "steps": 37.239583333333336, + "steps/max": 93, + "steps/min": 3, + "steps/std": 2.3124880251035043 + }, + { + "/length/completion": 9643.739583333334, + "/length/completion/max": 25244, + "/length/completion/min": 2278, + "/length/completion/std": 581.9111320393498, + "/length/context": 45923.697916666664, + "/length/context/max": 115489, + "/length/context/min": 10414, + "/length/context/std": 2683.867219109207, + "/length/forward": 45928.166666666664, + "/length/forward/max": 115496, + "/length/forward/min": 10416, + "/length/forward/std": 2683.864334052588, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6012861736334405, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.48963364981452884, + "/record/score/max": 1.0, + "/record/score/mean": 0.6317237584261934, + "/record/score/min": 0.0, + "/record/score/std": 0.482336865132739, + "advantages": -0.010672928117828497, + "advantages/max": 2.0, + "advantages/min": -1.7142857142857142, + "advantages/std": 0.01988726664862735, + "entropy": 0.688232421875, + "entropy/max": 0.8671875, + "entropy/min": 0.458984375, + "entropy/std": 0.008185300181255748, + "epoch": 0.049, + "grad_norm": 16512.0, + "learning_rate": 1.9927382677660083e-06, + "loss": 149.77001875, + "out_of_date_ratio": 0.001472773932164273, + "out_of_date_ratio/max": 0.003977461252361536, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 9.025339034445814e-05, + "rewards": 0.5104166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05101996066242478, + "sampled_at_step": 48.107348243395485, + "sampled_at_step/max": 49.0, + "sampled_at_step/min": 47.46670913696289, + "sampled_at_step/std": 0.03662542993500882, + "scores": 0.5666791184161375, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009577435331342692, + "step": 49, + "steps": 26.885416666666668, + "steps/max": 90, + "steps/min": 4, + "steps/std": 2.237565456816576 + }, + { + "/length/completion": 9139.177083333334, + "/length/completion/max": 17365, + "/length/completion/min": 2460, + "/length/completion/std": 388.9433452061105, + "/length/context": 42846.822916666664, + "/length/context/max": 128316, + "/length/context/min": 9876, + "/length/context/std": 2074.138494759108, + "/length/forward": 42851.333333333336, + "/length/forward/max": 128320, + "/length/forward/min": 9880, + "/length/forward/std": 2074.1248980214746, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6177437020810514, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.48593870047594334, + "/record/score/max": 1.0, + "/record/score/mean": 0.6335445597950654, + "/record/score/min": 0.0, + "/record/score/std": 0.48183591662426184, + "advantages": -0.1057565883496267, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.021120893835498165, + "entropy": 0.67254638671875, + "entropy/max": 0.8671875, + "entropy/min": 0.482421875, + "entropy/std": 0.00796678719028089, + "epoch": 0.05, + "grad_norm": 15488.0, + "learning_rate": 1.992351536782881e-06, + "loss": -225.88562083333332, + "out_of_date_ratio": 0.0011801550923943676, + "out_of_date_ratio/max": 0.0036585365887731314, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 7.948650594591365e-05, + "rewards": 0.65625, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.048475287679651785, + "sampled_at_step": 49.28301433722178, + "sampled_at_step/max": 50.000003814697266, + "sampled_at_step/min": 47.999996185302734, + "sampled_at_step/std": 0.07208558500765914, + "scores": 0.5746298519407763, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009889958472670886, + "step": 50, + "steps": 25.03125, + "steps/max": 96, + "steps/min": 4, + "steps/std": 1.8162947061031707 + }, + { + "/length/completion": 9401.333333333334, + "/length/completion/max": 21765, + "/length/completion/min": 3213, + "/length/completion/std": 379.3273187781392, + "/length/context": 52463.145833333336, + "/length/context/max": 125646, + "/length/context/min": 7832, + "/length/context/std": 2379.3423480716424, + "/length/forward": 52467.5, + "/length/forward/max": 125648, + "/length/forward/min": 7840, + "/length/forward/std": 2379.32203174122, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.5954022988505747, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4908140191294725, + "/record/score/max": 1.0, + "/record/score/mean": 0.6323256430654999, + "/record/score/min": 0.0, + "/record/score/std": 0.4821720898053945, + "advantages": 0.021790943139257633, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.017091281548428414, + "entropy": 0.6608072916666666, + "entropy/max": 0.84375, + "entropy/min": 0.515625, + "entropy/std": 0.007203795776861115, + "epoch": 0.051, + "grad_norm": 14400.0, + "learning_rate": 1.991954812830795e-06, + "loss": 254.4839875, + "out_of_date_ratio": 0.0011251259766898631, + "out_of_date_ratio/max": 0.003933531232178211, + "out_of_date_ratio/min": 0.0, + "out_of_date_ratio/std": 7.5655512414866e-05, + "rewards": 0.75, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.044194173824159216, + "sampled_at_step": 50.112096428871155, + "sampled_at_step/max": 51.000003814697266, + "sampled_at_step/min": 48.0, + "sampled_at_step/std": 0.054218936651792234, + "scores": 0.7402110997616616, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.00809163192240093, + "step": 51, + "steps": 29.59375, + "steps/max": 89, + "steps/min": 2, + "steps/std": 1.7793043787575034 + }, + { + "/length/completion": 12228.010416666666, + "/length/completion/max": 22997, + "/length/completion/min": 3270, + "/length/completion/std": 500.05629641725665, + "/length/context": 51091.479166666664, + "/length/context/max": 115132, + "/length/context/min": 6674, + "/length/context/std": 2259.578293306198, + "/length/forward": 51096.0, + "/length/forward/max": 115136, + "/length/forward/min": 6680, + "/length/forward/std": 2259.560576749382, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6038374717832957, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4890989464859371, + "/record/score/max": 1.0, + "/record/score/mean": 0.6317769130998703, + "/record/score/min": 0.0, + "/record/score/std": 0.4823223457127706, + "advantages": -0.08217029951387875, + "advantages/max": 1.7142857142857144, + "advantages/min": -2.0, + "advantages/std": 0.020094432251289217, + "entropy": 0.7004801432291666, + "entropy/max": 0.89453125, + "entropy/min": 0.5546875, + "entropy/std": 0.007632295410241296, + "epoch": 0.052, + "grad_norm": 21760.0, + "learning_rate": 1.991548099904757e-06, + "loss": 52.64125625, + "out_of_date_ratio": 0.001651031008426192, + "out_of_date_ratio/max": 0.03822629898786545, + "out_of_date_ratio/min": 0.00011681560863507912, + "out_of_date_ratio/std": 0.0003895121769529702, + "rewards": 0.4583333333333333, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05085353651346116, + "sampled_at_step": 51.44491422176361, + "sampled_at_step/max": 52.000003814697266, + "sampled_at_step/min": 50.419376373291016, + "sampled_at_step/std": 0.04813901003363954, + "scores": 0.4376143432125869, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009489494447882852, + "step": 52, + "steps": 27.46875, + "steps/max": 77, + "steps/min": 2, + "steps/std": 1.643389081957216 + }, + { + "/length/completion": 8772.604166666666, + "/length/completion/max": 21070, + "/length/completion/min": 1752, + "/length/completion/std": 421.9746077057603, + "/length/context": 43895.510416666664, + "/length/context/max": 119424, + "/length/context/min": 3176, + "/length/context/std": 2951.941704876966, + "/length/forward": 43899.833333333336, + "/length/forward/max": 119432, + "/length/forward/min": 3184, + "/length/forward/std": 2951.9432746807433, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6261491317671093, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4838247581040109, + "/record/score/max": 1.0, + "/record/score/mean": 0.6313137674770122, + "/record/score/min": 0.0, + "/record/score/std": 0.48244864438714447, + "advantages": 0.014319014319014235, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.02023509036850466, + "entropy": 0.6823933919270834, + "entropy/max": 0.90234375, + "entropy/min": 0.455078125, + "entropy/std": 0.009506664852205228, + "epoch": 0.053, + "grad_norm": 15296.0, + "learning_rate": 1.991131402100361e-06, + "loss": 316.06665, + "out_of_date_ratio": 0.0015899745600715203, + "out_of_date_ratio/max": 0.005576208233833313, + "out_of_date_ratio/min": 0.0003133813734166324, + "out_of_date_ratio/std": 0.00010189227208870029, + "rewards": 0.5104166666666666, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05101996066242478, + "sampled_at_step": 52.58958820501963, + "sampled_at_step/max": 53.000003814697266, + "sampled_at_step/min": 51.999996185302734, + "sampled_at_step/std": 0.0446586193431428, + "scores": 0.4829059829059829, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009849445426304237, + "step": 53, + "steps": 25.8125, + "steps/max": 92, + "steps/min": 1, + "steps/std": 2.317763927235868 + }, + { + "/length/completion": 10806.510416666666, + "/length/completion/max": 26634, + "/length/completion/min": 900, + "/length/completion/std": 557.6911313642544, + "/length/context": 48856.885416666664, + "/length/context/max": 118755, + "/length/context/min": 3450, + "/length/context/std": 3002.7807003203775, + "/length/forward": 48861.666666666664, + "/length/forward/max": 118760, + "/length/forward/min": 3456, + "/length/forward/std": 3002.77547056392, + "/record/score/last_5_max": 1.0, + "/record/score/last_5_mean": 0.6335952848722987, + "/record/score/last_5_min": 0.0, + "/record/score/last_5_std": 0.4818218549006356, + "/record/score/max": 1.0, + "/record/score/mean": 0.6321173280749323, + "/record/score/min": 0.0, + "/record/score/std": 0.48222921066889, + "advantages": -0.001383604289172752, + "advantages/max": 2.0, + "advantages/min": -2.0, + "advantages/std": 0.0180340178972355, + "entropy": 0.6672566731770834, + "entropy/max": 0.91015625, + "entropy/min": 0.45703125, + "entropy/std": 0.00915473630294931, + "epoch": 0.054, + "grad_norm": 15808.0, + "learning_rate": 1.9907047236137496e-06, + "loss": -10.589589583333334, + "out_of_date_ratio": 0.0016308887469070517, + "out_of_date_ratio/max": 0.027313625440001488, + "out_of_date_ratio/min": 0.0001468213158659637, + "out_of_date_ratio/std": 0.00028130080240651613, + "rewards": 0.53125, + "rewards/max": 1.0, + "rewards/min": 0.0, + "rewards/std": 0.05093126879064569, + "sampled_at_step": 53.38630406061808, + "sampled_at_step/max": 54.000003814697266, + "sampled_at_step/min": 52.30179977416992, + "sampled_at_step/std": 0.05020286577148773, + "scores": 0.45693531649948116, + "scores/max": 1.0, + "scores/min": 0.0, + "scores/std": 0.009264651713448353, + "step": 54, + "steps": 29.114583333333332, + "steps/max": 95, + "steps/min": 0, + "steps/std": 2.370375147567457 + } + ], + "logging_steps": 1.0, + "max_steps": 1000, + "num_input_tokens_seen": 2062430400, + "num_train_epochs": 9223372036854775807, + "save_steps": 3, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.977636301701382e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}