{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 565, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 218.2890625, "completions/mean_terminated_length": 218.2890625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.055932467337697744, "epoch": 0.0017699115044247787, "frac_reward_zero_std": 0.5, "grad_norm": 1.4016180957891362, "learning_rate": 0.0, "loss": 0.0025, "num_tokens": 458314.0, "reward": 0.5624843835830688, "reward_std": 0.10818969458341599, "rewards/qatch_small_update_with_fm/mean": 0.5624843835830688, "rewards/qatch_small_update_with_fm/std": 0.3901517689228058, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9868041276931763, "sampling/importance_sampling_ratio/min": 0.00016218787641264498, "sampling/sampling_logp_difference/max": 8.726755142211914, "sampling/sampling_logp_difference/mean": 0.07559221237897873, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 191.8125, "completions/mean_terminated_length": 191.8125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.0502051068469882, "epoch": 0.0035398230088495575, "frac_reward_zero_std": 0.6875, "grad_norm": 0.8868732918382652, "learning_rate": 1.7543859649122805e-08, "loss": 0.0027, "num_tokens": 970794.0, "reward": 0.7046874761581421, "reward_std": 0.04274485632777214, "rewards/qatch_small_update_with_fm/mean": 0.7046874761581421, "rewards/qatch_small_update_with_fm/std": 0.3915046751499176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925431609153748, "sampling/importance_sampling_ratio/min": 0.0015312157338485122, "sampling/sampling_logp_difference/max": 6.481693267822266, "sampling/sampling_logp_difference/mean": 0.06322439014911652, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 217.28125, "completions/mean_terminated_length": 217.28125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.053691909182816744, "epoch": 0.005309734513274336, "frac_reward_zero_std": 0.5625, "grad_norm": 1.6731495484912082, "learning_rate": 3.508771929824561e-08, "loss": -0.0221, "num_tokens": 1395426.0, "reward": 0.6182304620742798, "reward_std": 0.09917416423559189, "rewards/qatch_small_update_with_fm/mean": 0.6182304620742798, "rewards/qatch_small_update_with_fm/std": 0.4379144310951233, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990143895149231, "sampling/importance_sampling_ratio/min": 0.00016410346142947674, "sampling/sampling_logp_difference/max": 8.71501350402832, "sampling/sampling_logp_difference/mean": 0.06847122311592102, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 220.12109375, "completions/mean_terminated_length": 220.12109375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.060406394302845, "epoch": 0.007079646017699115, "frac_reward_zero_std": 0.625, "grad_norm": 1.4683462720727918, "learning_rate": 5.2631578947368416e-08, "loss": 0.0207, "num_tokens": 2003425.0, "reward": 0.5585194826126099, "reward_std": 0.053804244846105576, "rewards/qatch_small_update_with_fm/mean": 0.5585194826126099, "rewards/qatch_small_update_with_fm/std": 0.39273256063461304, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9875953197479248, "sampling/importance_sampling_ratio/min": 0.004114419687539339, "sampling/sampling_logp_difference/max": 5.493257522583008, "sampling/sampling_logp_difference/mean": 0.0788843035697937, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 207.2421875, "completions/mean_terminated_length": 207.2421875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.06038828194141388, "epoch": 0.008849557522123894, "frac_reward_zero_std": 0.625, "grad_norm": 1.815000760491036, "learning_rate": 7.017543859649122e-08, "loss": -0.0129, "num_tokens": 2555599.0, "reward": 0.7544023394584656, "reward_std": 0.10639214515686035, "rewards/qatch_small_update_with_fm/mean": 0.7544023394584656, "rewards/qatch_small_update_with_fm/std": 0.3464336395263672, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911973476409912, "sampling/importance_sampling_ratio/min": 7.211915090010734e-06, "sampling/sampling_logp_difference/max": 11.839776039123535, "sampling/sampling_logp_difference/mean": 0.07425904273986816, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 198.359375, "completions/mean_terminated_length": 198.359375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.05508757056668401, "epoch": 0.010619469026548672, "frac_reward_zero_std": 0.6875, "grad_norm": 1.3618814755274247, "learning_rate": 8.771929824561403e-08, "loss": 0.0008, "num_tokens": 3007131.0, "reward": 0.6679218411445618, "reward_std": 0.0737442895770073, "rewards/qatch_small_update_with_fm/mean": 0.6679218411445618, "rewards/qatch_small_update_with_fm/std": 0.3787311017513275, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9878405332565308, "sampling/importance_sampling_ratio/min": 0.006647423375397921, "sampling/sampling_logp_difference/max": 5.01352596282959, "sampling/sampling_logp_difference/mean": 0.07456570863723755, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 215.375, "completions/mean_terminated_length": 215.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.0570991188287735, "epoch": 0.012389380530973451, "frac_reward_zero_std": 0.5, "grad_norm": 1.5094600148969153, "learning_rate": 1.0526315789473683e-07, "loss": 0.0034, "num_tokens": 3563819.0, "reward": 0.7576835751533508, "reward_std": 0.12967899441719055, "rewards/qatch_small_update_with_fm/mean": 0.7576836347579956, "rewards/qatch_small_update_with_fm/std": 0.35834866762161255, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900656938552856, "sampling/importance_sampling_ratio/min": 0.004104791674762964, "sampling/sampling_logp_difference/max": 5.49560022354126, "sampling/sampling_logp_difference/mean": 0.07453933358192444, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 193.29296875, "completions/mean_terminated_length": 193.29296875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.048482123762369156, "epoch": 0.01415929203539823, "frac_reward_zero_std": 0.5, "grad_norm": 2.5818561179335906, "learning_rate": 1.2280701754385964e-07, "loss": -0.009, "num_tokens": 3890038.0, "reward": 0.6313203573226929, "reward_std": 0.14092831313610077, "rewards/qatch_small_update_with_fm/mean": 0.6313203573226929, "rewards/qatch_small_update_with_fm/std": 0.39382705092430115, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898822903633118, "sampling/importance_sampling_ratio/min": 0.008772030472755432, "sampling/sampling_logp_difference/max": 4.736186981201172, "sampling/sampling_logp_difference/mean": 0.061118610203266144, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 197.39453125, "completions/mean_terminated_length": 197.39453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.04963156068697572, "epoch": 0.01592920353982301, "frac_reward_zero_std": 0.5625, "grad_norm": 1.5521690922872657, "learning_rate": 1.4035087719298244e-07, "loss": 0.0169, "num_tokens": 4266731.0, "reward": 0.770031213760376, "reward_std": 0.10407255589962006, "rewards/qatch_small_update_with_fm/mean": 0.770031213760376, "rewards/qatch_small_update_with_fm/std": 0.3665200173854828, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9872828722000122, "sampling/importance_sampling_ratio/min": 0.004101686645299196, "sampling/sampling_logp_difference/max": 5.496356964111328, "sampling/sampling_logp_difference/mean": 0.07008753716945648, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 201.2109375, "completions/mean_terminated_length": 201.2109375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.051307815592736006, "epoch": 0.017699115044247787, "frac_reward_zero_std": 0.5, "grad_norm": 1.6061306069741945, "learning_rate": 1.5789473684210525e-07, "loss": -0.0103, "num_tokens": 4638561.0, "reward": 0.682628870010376, "reward_std": 0.09211964905261993, "rewards/qatch_small_update_with_fm/mean": 0.6826289296150208, "rewards/qatch_small_update_with_fm/std": 0.36384159326553345, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911330342292786, "sampling/importance_sampling_ratio/min": 0.0031929106917232275, "sampling/sampling_logp_difference/max": 5.746822357177734, "sampling/sampling_logp_difference/mean": 0.06979094445705414, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 208.46484375, "completions/mean_terminated_length": 208.46484375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.05418810294941068, "epoch": 0.019469026548672566, "frac_reward_zero_std": 0.5625, "grad_norm": 1.2675987379108877, "learning_rate": 1.7543859649122805e-07, "loss": 0.0089, "num_tokens": 5074904.0, "reward": 0.6214921474456787, "reward_std": 0.09628939628601074, "rewards/qatch_small_update_with_fm/mean": 0.6214921474456787, "rewards/qatch_small_update_with_fm/std": 0.36368274688720703, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906713962554932, "sampling/importance_sampling_ratio/min": 0.004233733285218477, "sampling/sampling_logp_difference/max": 5.4646711349487305, "sampling/sampling_logp_difference/mean": 0.06758482754230499, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 205.19921875, "completions/mean_terminated_length": 205.19921875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.05194803886115551, "epoch": 0.021238938053097345, "frac_reward_zero_std": 0.5, "grad_norm": 1.1957367434837758, "learning_rate": 1.9298245614035086e-07, "loss": 0.0113, "num_tokens": 5549051.0, "reward": 0.6895078420639038, "reward_std": 0.08644171059131622, "rewards/qatch_small_update_with_fm/mean": 0.6895078420639038, "rewards/qatch_small_update_with_fm/std": 0.36041754484176636, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990691065788269, "sampling/importance_sampling_ratio/min": 0.008775557391345501, "sampling/sampling_logp_difference/max": 4.735785007476807, "sampling/sampling_logp_difference/mean": 0.06334304809570312, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 228.0078125, "completions/mean_terminated_length": 228.0078125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06115304958075285, "epoch": 0.023008849557522124, "frac_reward_zero_std": 0.5625, "grad_norm": 1.8474801364773754, "learning_rate": 2.1052631578947366e-07, "loss": 0.0014, "num_tokens": 5991261.0, "reward": 0.5757031440734863, "reward_std": 0.09528271108865738, "rewards/qatch_small_update_with_fm/mean": 0.5757031440734863, "rewards/qatch_small_update_with_fm/std": 0.35785242915153503, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899276494979858, "sampling/importance_sampling_ratio/min": 0.0010690669296309352, "sampling/sampling_logp_difference/max": 6.840969085693359, "sampling/sampling_logp_difference/mean": 0.07720615714788437, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 231.671875, "completions/mean_terminated_length": 231.671875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.059437486343085766, "epoch": 0.024778761061946902, "frac_reward_zero_std": 0.4375, "grad_norm": 1.153686768788144, "learning_rate": 2.2807017543859647e-07, "loss": 0.0145, "num_tokens": 6339033.0, "reward": 0.5981484651565552, "reward_std": 0.11867053806781769, "rewards/qatch_small_update_with_fm/mean": 0.5981484055519104, "rewards/qatch_small_update_with_fm/std": 0.40236470103263855, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9874120950698853, "sampling/importance_sampling_ratio/min": 0.0067546553909778595, "sampling/sampling_logp_difference/max": 4.997523307800293, "sampling/sampling_logp_difference/mean": 0.07262717187404633, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 184.72265625, "completions/mean_terminated_length": 184.72265625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.05293445521965623, "epoch": 0.02654867256637168, "frac_reward_zero_std": 0.5, "grad_norm": 1.7947302002840908, "learning_rate": 2.456140350877193e-07, "loss": 0.0262, "num_tokens": 6822914.0, "reward": 0.7774726152420044, "reward_std": 0.1253722608089447, "rewards/qatch_small_update_with_fm/mean": 0.7774726748466492, "rewards/qatch_small_update_with_fm/std": 0.3277592360973358, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9928814172744751, "sampling/importance_sampling_ratio/min": 0.0005699428147636354, "sampling/sampling_logp_difference/max": 7.469974517822266, "sampling/sampling_logp_difference/mean": 0.06256245821714401, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 180.90625, "completions/mean_terminated_length": 180.90625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.047853834461420774, "epoch": 0.02831858407079646, "frac_reward_zero_std": 0.625, "grad_norm": 1.2589380786159132, "learning_rate": 2.631578947368421e-07, "loss": -0.0078, "num_tokens": 7144698.0, "reward": 0.6949101686477661, "reward_std": 0.0378408208489418, "rewards/qatch_small_update_with_fm/mean": 0.6949101686477661, "rewards/qatch_small_update_with_fm/std": 0.3930208086967468, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918819665908813, "sampling/importance_sampling_ratio/min": 0.0031865073833614588, "sampling/sampling_logp_difference/max": 5.7488298416137695, "sampling/sampling_logp_difference/mean": 0.05979344993829727, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 204.32421875, "completions/mean_terminated_length": 204.32421875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.04873772710561752, "epoch": 0.03008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 1.1936460246963316, "learning_rate": 2.807017543859649e-07, "loss": 0.0036, "num_tokens": 7572637.0, "reward": 0.8436406254768372, "reward_std": 0.05956996604800224, "rewards/qatch_small_update_with_fm/mean": 0.8436405658721924, "rewards/qatch_small_update_with_fm/std": 0.26569095253944397, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906154274940491, "sampling/importance_sampling_ratio/min": 0.007214278448373079, "sampling/sampling_logp_difference/max": 4.931693077087402, "sampling/sampling_logp_difference/mean": 0.06251002848148346, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 232.6796875, "completions/mean_terminated_length": 232.6796875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.05695153260603547, "epoch": 0.03185840707964602, "frac_reward_zero_std": 0.5625, "grad_norm": 1.2761295767158813, "learning_rate": 2.982456140350877e-07, "loss": -0.0071, "num_tokens": 8194859.0, "reward": 0.5700898170471191, "reward_std": 0.10193304717540741, "rewards/qatch_small_update_with_fm/mean": 0.5700898766517639, "rewards/qatch_small_update_with_fm/std": 0.4049544334411621, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990411639213562, "sampling/importance_sampling_ratio/min": 0.004173677880316973, "sampling/sampling_logp_difference/max": 5.478957653045654, "sampling/sampling_logp_difference/mean": 0.07318311184644699, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 199.42578125, "completions/mean_terminated_length": 199.42578125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.04951566876843572, "epoch": 0.033628318584070796, "frac_reward_zero_std": 0.75, "grad_norm": 1.6725073395426633, "learning_rate": 3.157894736842105e-07, "loss": -0.0055, "num_tokens": 8922456.0, "reward": 0.7591953277587891, "reward_std": 0.0754041075706482, "rewards/qatch_small_update_with_fm/mean": 0.7591953277587891, "rewards/qatch_small_update_with_fm/std": 0.36782652139663696, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9886845350265503, "sampling/importance_sampling_ratio/min": 0.008016136474907398, "sampling/sampling_logp_difference/max": 4.826298713684082, "sampling/sampling_logp_difference/mean": 0.06480049341917038, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 222.52734375, "completions/mean_terminated_length": 222.52734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.053389983251690865, "epoch": 0.035398230088495575, "frac_reward_zero_std": 0.4375, "grad_norm": 1.5080008629099697, "learning_rate": 3.333333333333333e-07, "loss": -0.0087, "num_tokens": 9381903.0, "reward": 0.6970039010047913, "reward_std": 0.11117272078990936, "rewards/qatch_small_update_with_fm/mean": 0.6970039010047913, "rewards/qatch_small_update_with_fm/std": 0.33124446868896484, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989010751247406, "sampling/importance_sampling_ratio/min": 3.631289473560173e-06, "sampling/sampling_logp_difference/max": 12.525922775268555, "sampling/sampling_logp_difference/mean": 0.06789924949407578, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 225.8046875, "completions/mean_terminated_length": 225.8046875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06629037857055664, "epoch": 0.03716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 1.073903450177758, "learning_rate": 3.508771929824561e-07, "loss": -0.0026, "num_tokens": 9875389.0, "reward": 0.5116796493530273, "reward_std": 0.0645594596862793, "rewards/qatch_small_update_with_fm/mean": 0.5116796493530273, "rewards/qatch_small_update_with_fm/std": 0.40706780552864075, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882779121398926, "sampling/importance_sampling_ratio/min": 0.0041917734779417515, "sampling/sampling_logp_difference/max": 5.474631309509277, "sampling/sampling_logp_difference/mean": 0.08192314207553864, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 215.5390625, "completions/mean_terminated_length": 215.5390625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.05642617912963033, "epoch": 0.03893805309734513, "frac_reward_zero_std": 0.5, "grad_norm": 1.7536814431934094, "learning_rate": 3.684210526315789e-07, "loss": -0.0187, "num_tokens": 10343815.0, "reward": 0.6426757574081421, "reward_std": 0.16063649952411652, "rewards/qatch_small_update_with_fm/mean": 0.6426757574081421, "rewards/qatch_small_update_with_fm/std": 0.3499335050582886, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988250732421875, "sampling/importance_sampling_ratio/min": 0.0041191368363797665, "sampling/sampling_logp_difference/max": 5.492111682891846, "sampling/sampling_logp_difference/mean": 0.07272202521562576, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 205.4765625, "completions/mean_terminated_length": 205.4765625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.050218963995575905, "epoch": 0.04070796460176991, "frac_reward_zero_std": 0.625, "grad_norm": 1.1734990449788383, "learning_rate": 3.859649122807017e-07, "loss": 0.0112, "num_tokens": 10905281.0, "reward": 0.5514531135559082, "reward_std": 0.07454925775527954, "rewards/qatch_small_update_with_fm/mean": 0.5514531135559082, "rewards/qatch_small_update_with_fm/std": 0.40859004855155945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.992061972618103, "sampling/importance_sampling_ratio/min": 0.0002468197199050337, "sampling/sampling_logp_difference/max": 8.306852340698242, "sampling/sampling_logp_difference/mean": 0.06509463489055634, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 186.2734375, "completions/mean_terminated_length": 186.2734375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.051339670550078154, "epoch": 0.04247787610619469, "frac_reward_zero_std": 0.5625, "grad_norm": 1.472579374696243, "learning_rate": 4.035087719298245e-07, "loss": 0.0049, "num_tokens": 11301367.0, "reward": 0.5480742454528809, "reward_std": 0.09668620675802231, "rewards/qatch_small_update_with_fm/mean": 0.5480742454528809, "rewards/qatch_small_update_with_fm/std": 0.4017762839794159, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989937424659729, "sampling/importance_sampling_ratio/min": 0.0032162086572498083, "sampling/sampling_logp_difference/max": 5.739552021026611, "sampling/sampling_logp_difference/mean": 0.06827802211046219, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.05917324684560299, "epoch": 0.04424778761061947, "frac_reward_zero_std": 0.6875, "grad_norm": 0.9749562804449026, "learning_rate": 4.2105263157894733e-07, "loss": 0.0068, "num_tokens": 11719255.0, "reward": 0.5420546531677246, "reward_std": 0.08317986130714417, "rewards/qatch_small_update_with_fm/mean": 0.5420546531677246, "rewards/qatch_small_update_with_fm/std": 0.40023666620254517, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898385405540466, "sampling/importance_sampling_ratio/min": 0.004096901509910822, "sampling/sampling_logp_difference/max": 5.497524261474609, "sampling/sampling_logp_difference/mean": 0.07788807153701782, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 239.76171875, "completions/mean_terminated_length": 239.76171875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.058990718331187963, "epoch": 0.04601769911504425, "frac_reward_zero_std": 0.5625, "grad_norm": 1.3396032197988952, "learning_rate": 4.3859649122807013e-07, "loss": 0.0042, "num_tokens": 12233770.0, "reward": 0.7668046951293945, "reward_std": 0.0936145931482315, "rewards/qatch_small_update_with_fm/mean": 0.7668046951293945, "rewards/qatch_small_update_with_fm/std": 0.3399920165538788, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896319508552551, "sampling/importance_sampling_ratio/min": 0.0025338244158774614, "sampling/sampling_logp_difference/max": 5.978025436401367, "sampling/sampling_logp_difference/mean": 0.07602351158857346, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 212.66015625, "completions/mean_terminated_length": 212.66015625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.05781093053519726, "epoch": 0.047787610619469026, "frac_reward_zero_std": 0.5625, "grad_norm": 1.1795909655364405, "learning_rate": 4.5614035087719294e-07, "loss": -0.0148, "num_tokens": 12676883.0, "reward": 0.7312656044960022, "reward_std": 0.06833778321743011, "rewards/qatch_small_update_with_fm/mean": 0.731265664100647, "rewards/qatch_small_update_with_fm/std": 0.3687576353549957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896270036697388, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.07338601350784302, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 196.3671875, "completions/mean_terminated_length": 196.3671875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.0522055234760046, "epoch": 0.049557522123893805, "frac_reward_zero_std": 0.6875, "grad_norm": 2.061190562198567, "learning_rate": 4.7368421052631574e-07, "loss": -0.0102, "num_tokens": 13023505.0, "reward": 0.6650546789169312, "reward_std": 0.035377923399209976, "rewards/qatch_small_update_with_fm/mean": 0.6650546789169312, "rewards/qatch_small_update_with_fm/std": 0.375143438577652, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9883463382720947, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.07111047208309174, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 227.6875, "completions/mean_terminated_length": 227.6875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.05623852554708719, "epoch": 0.05132743362831858, "frac_reward_zero_std": 0.625, "grad_norm": 1.6294023746334783, "learning_rate": 4.912280701754385e-07, "loss": -0.0018, "num_tokens": 13443697.0, "reward": 0.6568242311477661, "reward_std": 0.11119887232780457, "rewards/qatch_small_update_with_fm/mean": 0.6568242311477661, "rewards/qatch_small_update_with_fm/std": 0.40569937229156494, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989427924156189, "sampling/importance_sampling_ratio/min": 0.0035574494395405054, "sampling/sampling_logp_difference/max": 5.638711452484131, "sampling/sampling_logp_difference/mean": 0.07333913445472717, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 219.7109375, "completions/mean_terminated_length": 219.7109375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.05887019541114569, "epoch": 0.05309734513274336, "frac_reward_zero_std": 0.625, "grad_norm": 1.4462954987854064, "learning_rate": 5.087719298245614e-07, "loss": 0.0056, "num_tokens": 13799319.0, "reward": 0.5096250176429749, "reward_std": 0.08224877715110779, "rewards/qatch_small_update_with_fm/mean": 0.5096250176429749, "rewards/qatch_small_update_with_fm/std": 0.4135892689228058, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896410703659058, "sampling/importance_sampling_ratio/min": 0.00046472682151943445, "sampling/sampling_logp_difference/max": 7.674060821533203, "sampling/sampling_logp_difference/mean": 0.07146239280700684, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 225.9296875, "completions/mean_terminated_length": 225.9296875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.0572858308441937, "epoch": 0.05486725663716814, "frac_reward_zero_std": 0.375, "grad_norm": 1.4963962240758701, "learning_rate": 5.263157894736842e-07, "loss": 0.0013, "num_tokens": 14207637.0, "reward": 0.47812891006469727, "reward_std": 0.10722576826810837, "rewards/qatch_small_update_with_fm/mean": 0.47812891006469727, "rewards/qatch_small_update_with_fm/std": 0.41087815165519714, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9885352253913879, "sampling/importance_sampling_ratio/min": 0.002572331577539444, "sampling/sampling_logp_difference/max": 5.962942600250244, "sampling/sampling_logp_difference/mean": 0.07356339693069458, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 215.8203125, "completions/mean_terminated_length": 215.8203125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.05347989918664098, "epoch": 0.05663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 1.460487515109523, "learning_rate": 5.43859649122807e-07, "loss": 0.0011, "num_tokens": 14733943.0, "reward": 0.6709609031677246, "reward_std": 0.0812995433807373, "rewards/qatch_small_update_with_fm/mean": 0.6709609031677246, "rewards/qatch_small_update_with_fm/std": 0.35092541575431824, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877209663391113, "sampling/importance_sampling_ratio/min": 0.006773304659873247, "sampling/sampling_logp_difference/max": 4.9947662353515625, "sampling/sampling_logp_difference/mean": 0.07288801670074463, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 216.15234375, "completions/mean_terminated_length": 216.15234375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.06382587924599648, "epoch": 0.0584070796460177, "frac_reward_zero_std": 0.375, "grad_norm": 1.6806617925432, "learning_rate": 5.614035087719298e-07, "loss": -0.0059, "num_tokens": 15201870.0, "reward": 0.6259765625, "reward_std": 0.1354515552520752, "rewards/qatch_small_update_with_fm/mean": 0.6259765625, "rewards/qatch_small_update_with_fm/std": 0.4338729977607727, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890233278274536, "sampling/importance_sampling_ratio/min": 0.004090497270226479, "sampling/sampling_logp_difference/max": 5.499088764190674, "sampling/sampling_logp_difference/mean": 0.07785800844430923, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 216.140625, "completions/mean_terminated_length": 216.140625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.06205499917268753, "epoch": 0.06017699115044248, "frac_reward_zero_std": 0.625, "grad_norm": 1.1188125302112715, "learning_rate": 5.789473684210526e-07, "loss": -0.0006, "num_tokens": 15670674.0, "reward": 0.7029336094856262, "reward_std": 0.09102556109428406, "rewards/qatch_small_update_with_fm/mean": 0.7029336094856262, "rewards/qatch_small_update_with_fm/std": 0.36313775181770325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9894707202911377, "sampling/importance_sampling_ratio/min": 0.00528290867805481, "sampling/sampling_logp_difference/max": 5.243278503417969, "sampling/sampling_logp_difference/mean": 0.07377147674560547, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 215.26953125, "completions/mean_terminated_length": 215.26953125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.055985859129577875, "epoch": 0.061946902654867256, "frac_reward_zero_std": 0.75, "grad_norm": 0.9421523296988241, "learning_rate": 5.964912280701754e-07, "loss": -0.0083, "num_tokens": 16030199.0, "reward": 0.662277340888977, "reward_std": 0.05062306672334671, "rewards/qatch_small_update_with_fm/mean": 0.662277340888977, "rewards/qatch_small_update_with_fm/std": 0.4001128077507019, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9888499975204468, "sampling/importance_sampling_ratio/min": 0.011136544868350029, "sampling/sampling_logp_difference/max": 4.497523307800293, "sampling/sampling_logp_difference/mean": 0.07431173324584961, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 219.87109375, "completions/mean_terminated_length": 219.87109375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.055227352771908045, "epoch": 0.06371681415929203, "frac_reward_zero_std": 0.625, "grad_norm": 1.135079001459155, "learning_rate": 6.140350877192982e-07, "loss": -0.0032, "num_tokens": 16377286.0, "reward": 0.6459922194480896, "reward_std": 0.08503326773643494, "rewards/qatch_small_update_with_fm/mean": 0.6459922194480896, "rewards/qatch_small_update_with_fm/std": 0.353148490190506, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892193675041199, "sampling/importance_sampling_ratio/min": 9.36559445108287e-05, "sampling/sampling_logp_difference/max": 9.275882720947266, "sampling/sampling_logp_difference/mean": 0.0699758380651474, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 204.6953125, "completions/mean_terminated_length": 204.6953125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.05541439028456807, "epoch": 0.06548672566371681, "frac_reward_zero_std": 0.6875, "grad_norm": 0.9683166323193955, "learning_rate": 6.31578947368421e-07, "loss": 0.0055, "num_tokens": 16893240.0, "reward": 0.6555585861206055, "reward_std": 0.034975819289684296, "rewards/qatch_small_update_with_fm/mean": 0.6555585861206055, "rewards/qatch_small_update_with_fm/std": 0.3547699749469757, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.98728346824646, "sampling/importance_sampling_ratio/min": 0.001210095826536417, "sampling/sampling_logp_difference/max": 6.717055797576904, "sampling/sampling_logp_difference/mean": 0.07098366320133209, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 225.51171875, "completions/mean_terminated_length": 225.51171875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.06327511603012681, "epoch": 0.06725663716814159, "frac_reward_zero_std": 0.3125, "grad_norm": 2.1783519764095556, "learning_rate": 6.491228070175438e-07, "loss": -0.0082, "num_tokens": 17361579.0, "reward": 0.5263632535934448, "reward_std": 0.12323468178510666, "rewards/qatch_small_update_with_fm/mean": 0.5263632535934448, "rewards/qatch_small_update_with_fm/std": 0.38143190741539, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990209698677063, "sampling/importance_sampling_ratio/min": 0.004103474784642458, "sampling/sampling_logp_difference/max": 5.4959211349487305, "sampling/sampling_logp_difference/mean": 0.07122161984443665, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 214.296875, "completions/mean_terminated_length": 214.296875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.05618205713108182, "epoch": 0.06902654867256637, "frac_reward_zero_std": 0.5, "grad_norm": 1.3419509213878904, "learning_rate": 6.666666666666666e-07, "loss": -0.006, "num_tokens": 17898951.0, "reward": 0.6506953239440918, "reward_std": 0.08842222392559052, "rewards/qatch_small_update_with_fm/mean": 0.6506953239440918, "rewards/qatch_small_update_with_fm/std": 0.396701455116272, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990181028842926, "sampling/importance_sampling_ratio/min": 0.005266575142741203, "sampling/sampling_logp_difference/max": 5.24637508392334, "sampling/sampling_logp_difference/mean": 0.06922811269760132, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 226.23046875, "completions/mean_terminated_length": 226.23046875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.05959279276430607, "epoch": 0.07079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.9617272834762555, "learning_rate": 6.842105263157895e-07, "loss": 0.0026, "num_tokens": 18242850.0, "reward": 0.7116601467132568, "reward_std": 0.06126481294631958, "rewards/qatch_small_update_with_fm/mean": 0.7116601467132568, "rewards/qatch_small_update_with_fm/std": 0.39202579855918884, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9881985187530518, "sampling/importance_sampling_ratio/min": 0.004140384495258331, "sampling/sampling_logp_difference/max": 5.486966609954834, "sampling/sampling_logp_difference/mean": 0.07950674742460251, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 211.18359375, "completions/mean_terminated_length": 211.18359375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06127061927691102, "epoch": 0.07256637168141593, "frac_reward_zero_std": 0.375, "grad_norm": 2.3274345780809282, "learning_rate": 7.017543859649122e-07, "loss": 0.0226, "num_tokens": 18923697.0, "reward": 0.6969023942947388, "reward_std": 0.1744055151939392, "rewards/qatch_small_update_with_fm/mean": 0.6969023942947388, "rewards/qatch_small_update_with_fm/std": 0.3728826940059662, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9888622164726257, "sampling/importance_sampling_ratio/min": 0.00045951042557135224, "sampling/sampling_logp_difference/max": 7.685348987579346, "sampling/sampling_logp_difference/mean": 0.0754728838801384, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 210.484375, "completions/mean_terminated_length": 210.484375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.05614142771810293, "epoch": 0.0743362831858407, "frac_reward_zero_std": 0.625, "grad_norm": 1.2632017942317626, "learning_rate": 7.192982456140351e-07, "loss": -0.0006, "num_tokens": 19296861.0, "reward": 0.6874765157699585, "reward_std": 0.06331270933151245, "rewards/qatch_small_update_with_fm/mean": 0.6874765157699585, "rewards/qatch_small_update_with_fm/std": 0.3579365611076355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9885727167129517, "sampling/importance_sampling_ratio/min": 0.005275054834783077, "sampling/sampling_logp_difference/max": 5.2447662353515625, "sampling/sampling_logp_difference/mean": 0.07586228102445602, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 227.88671875, "completions/mean_terminated_length": 227.88671875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.057134306989610195, "epoch": 0.07610619469026549, "frac_reward_zero_std": 0.5625, "grad_norm": 1.3882623306559656, "learning_rate": 7.368421052631578e-07, "loss": -0.0083, "num_tokens": 19771536.0, "reward": 0.6067304611206055, "reward_std": 0.13148462772369385, "rewards/qatch_small_update_with_fm/mean": 0.6067304611206055, "rewards/qatch_small_update_with_fm/std": 0.35363802313804626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.987711489200592, "sampling/importance_sampling_ratio/min": 0.0002641607716213912, "sampling/sampling_logp_difference/max": 8.23895263671875, "sampling/sampling_logp_difference/mean": 0.07562234252691269, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 200.41015625, "completions/mean_terminated_length": 200.41015625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.05608399631455541, "epoch": 0.07787610619469026, "frac_reward_zero_std": 0.6875, "grad_norm": 1.2534604392810913, "learning_rate": 7.543859649122807e-07, "loss": 0.0167, "num_tokens": 20281977.0, "reward": 0.7539414167404175, "reward_std": 0.09053537249565125, "rewards/qatch_small_update_with_fm/mean": 0.7539414167404175, "rewards/qatch_small_update_with_fm/std": 0.37756431102752686, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906572103500366, "sampling/importance_sampling_ratio/min": 0.005265289451926947, "sampling/sampling_logp_difference/max": 5.24661922454834, "sampling/sampling_logp_difference/mean": 0.07270737737417221, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 267.09375, "completions/mean_terminated_length": 267.09375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.07773672044277191, "epoch": 0.07964601769911504, "frac_reward_zero_std": 0.4375, "grad_norm": 1.3404809735788332, "learning_rate": 7.719298245614034e-07, "loss": 0.007, "num_tokens": 20790817.0, "reward": 0.5390859842300415, "reward_std": 0.09994053095579147, "rewards/qatch_small_update_with_fm/mean": 0.5390859246253967, "rewards/qatch_small_update_with_fm/std": 0.39160552620887756, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9873398542404175, "sampling/importance_sampling_ratio/min": 0.0007115455227904022, "sampling/sampling_logp_difference/max": 7.248071193695068, "sampling/sampling_logp_difference/mean": 0.09083493053913116, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 221.546875, "completions/mean_terminated_length": 221.546875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.0625874325633049, "epoch": 0.08141592920353982, "frac_reward_zero_std": 0.375, "grad_norm": 1.5966431213050403, "learning_rate": 7.894736842105263e-07, "loss": -0.0201, "num_tokens": 21361965.0, "reward": 0.6787070035934448, "reward_std": 0.13535821437835693, "rewards/qatch_small_update_with_fm/mean": 0.6787070035934448, "rewards/qatch_small_update_with_fm/std": 0.3745642304420471, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9909084439277649, "sampling/importance_sampling_ratio/min": 0.004959781188517809, "sampling/sampling_logp_difference/max": 5.306393623352051, "sampling/sampling_logp_difference/mean": 0.07631758600473404, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.055308199021965265, "epoch": 0.0831858407079646, "frac_reward_zero_std": 0.625, "grad_norm": 1.4518930219563386, "learning_rate": 8.07017543859649e-07, "loss": -0.0101, "num_tokens": 21874157.0, "reward": 0.6671679615974426, "reward_std": 0.10809326171875, "rewards/qatch_small_update_with_fm/mean": 0.6671679615974426, "rewards/qatch_small_update_with_fm/std": 0.39586684107780457, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9907793998718262, "sampling/importance_sampling_ratio/min": 0.00869713630527258, "sampling/sampling_logp_difference/max": 4.7447614669799805, "sampling/sampling_logp_difference/mean": 0.06782685220241547, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 233.37109375, "completions/mean_terminated_length": 233.37109375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.06507633905857801, "epoch": 0.08495575221238938, "frac_reward_zero_std": 0.5, "grad_norm": 1.6383109347734452, "learning_rate": 8.245614035087719e-07, "loss": -0.0068, "num_tokens": 22366332.0, "reward": 0.6874726414680481, "reward_std": 0.14611409604549408, "rewards/qatch_small_update_with_fm/mean": 0.6874727010726929, "rewards/qatch_small_update_with_fm/std": 0.3691793978214264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.987760603427887, "sampling/importance_sampling_ratio/min": 0.0025241519324481487, "sampling/sampling_logp_difference/max": 5.9818501472473145, "sampling/sampling_logp_difference/mean": 0.07947517931461334, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 220.91015625, "completions/mean_terminated_length": 220.91015625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.05196545785292983, "epoch": 0.08672566371681416, "frac_reward_zero_std": 0.625, "grad_norm": 1.3498543785058719, "learning_rate": 8.421052631578947e-07, "loss": -0.0054, "num_tokens": 22712309.0, "reward": 0.6685937643051147, "reward_std": 0.10685192048549652, "rewards/qatch_small_update_with_fm/mean": 0.6685937643051147, "rewards/qatch_small_update_with_fm/std": 0.3972224295139313, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896861910820007, "sampling/importance_sampling_ratio/min": 0.0031929106917232275, "sampling/sampling_logp_difference/max": 5.746822357177734, "sampling/sampling_logp_difference/mean": 0.06827843189239502, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 234.8046875, "completions/mean_terminated_length": 234.8046875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.06737958360463381, "epoch": 0.08849557522123894, "frac_reward_zero_std": 0.4375, "grad_norm": 1.7923238515597124, "learning_rate": 8.596491228070175e-07, "loss": -0.0134, "num_tokens": 23186147.0, "reward": 0.5033125281333923, "reward_std": 0.12631133198738098, "rewards/qatch_small_update_with_fm/mean": 0.5033124685287476, "rewards/qatch_small_update_with_fm/std": 0.4238218069076538, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9885417222976685, "sampling/importance_sampling_ratio/min": 0.00029693765100091696, "sampling/sampling_logp_difference/max": 8.121988296508789, "sampling/sampling_logp_difference/mean": 0.07968786358833313, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 197.2734375, "completions/mean_terminated_length": 197.2734375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.04840736137703061, "epoch": 0.09026548672566372, "frac_reward_zero_std": 0.6875, "grad_norm": 1.0586044702380508, "learning_rate": 8.771929824561403e-07, "loss": -0.0009, "num_tokens": 23535817.0, "reward": 0.8437227010726929, "reward_std": 0.07556959986686707, "rewards/qatch_small_update_with_fm/mean": 0.8437227010726929, "rewards/qatch_small_update_with_fm/std": 0.296781986951828, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988394021987915, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.06788826733827591, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 231.86328125, "completions/mean_terminated_length": 231.86328125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.06255824631080031, "epoch": 0.0920353982300885, "frac_reward_zero_std": 0.4375, "grad_norm": 1.6045971303696716, "learning_rate": 8.947368421052631e-07, "loss": 0.0134, "num_tokens": 24069702.0, "reward": 0.8138632774353027, "reward_std": 0.1399797797203064, "rewards/qatch_small_update_with_fm/mean": 0.8138632774353027, "rewards/qatch_small_update_with_fm/std": 0.32255634665489197, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9859397411346436, "sampling/importance_sampling_ratio/min": 0.0026753947604447603, "sampling/sampling_logp_difference/max": 5.92365837097168, "sampling/sampling_logp_difference/mean": 0.08180845528841019, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 220.4609375, "completions/mean_terminated_length": 220.4609375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.062050150241702795, "epoch": 0.09380530973451327, "frac_reward_zero_std": 0.4375, "grad_norm": 1.4117919010554631, "learning_rate": 9.122807017543859e-07, "loss": -0.0052, "num_tokens": 24479100.0, "reward": 0.7222539186477661, "reward_std": 0.12479500472545624, "rewards/qatch_small_update_with_fm/mean": 0.7222539186477661, "rewards/qatch_small_update_with_fm/std": 0.35847076773643494, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9871389865875244, "sampling/importance_sampling_ratio/min": 0.004096901509910822, "sampling/sampling_logp_difference/max": 5.497524261474609, "sampling/sampling_logp_difference/mean": 0.07965491712093353, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 213.265625, "completions/mean_terminated_length": 213.265625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.05208406038582325, "epoch": 0.09557522123893805, "frac_reward_zero_std": 0.625, "grad_norm": 1.2749853584316917, "learning_rate": 9.298245614035087e-07, "loss": 0.0086, "num_tokens": 24906176.0, "reward": 0.7578281164169312, "reward_std": 0.09011317789554596, "rewards/qatch_small_update_with_fm/mean": 0.7578281164169312, "rewards/qatch_small_update_with_fm/std": 0.36998218297958374, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9876996278762817, "sampling/importance_sampling_ratio/min": 0.002484902273863554, "sampling/sampling_logp_difference/max": 5.997521877288818, "sampling/sampling_logp_difference/mean": 0.06985066086053848, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 222.65234375, "completions/mean_terminated_length": 222.65234375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.05924999574199319, "epoch": 0.09734513274336283, "frac_reward_zero_std": 0.625, "grad_norm": 1.1662708219703708, "learning_rate": 9.473684210526315e-07, "loss": -0.0068, "num_tokens": 25522151.0, "reward": 0.6106094121932983, "reward_std": 0.09099166095256805, "rewards/qatch_small_update_with_fm/mean": 0.6106094121932983, "rewards/qatch_small_update_with_fm/std": 0.42094501852989197, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9885672330856323, "sampling/importance_sampling_ratio/min": 0.0005121615249663591, "sampling/sampling_logp_difference/max": 7.576870441436768, "sampling/sampling_logp_difference/mean": 0.07843571156263351, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 230.015625, "completions/mean_terminated_length": 230.015625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.06128214206546545, "epoch": 0.09911504424778761, "frac_reward_zero_std": 0.6875, "grad_norm": 1.1814665152848656, "learning_rate": 9.649122807017545e-07, "loss": 0.0099, "num_tokens": 25926219.0, "reward": 0.5389687418937683, "reward_std": 0.0760495662689209, "rewards/qatch_small_update_with_fm/mean": 0.5389687418937683, "rewards/qatch_small_update_with_fm/std": 0.44913288950920105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9883639216423035, "sampling/importance_sampling_ratio/min": 0.005285531282424927, "sampling/sampling_logp_difference/max": 5.242782115936279, "sampling/sampling_logp_difference/mean": 0.076011061668396, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 231.01953125, "completions/mean_terminated_length": 231.01953125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06122827250510454, "epoch": 0.10088495575221239, "frac_reward_zero_std": 0.4375, "grad_norm": 1.701525809843503, "learning_rate": 9.82456140350877e-07, "loss": -0.0012, "num_tokens": 26508288.0, "reward": 0.5856757760047913, "reward_std": 0.10319986939430237, "rewards/qatch_small_update_with_fm/mean": 0.5856757760047913, "rewards/qatch_small_update_with_fm/std": 0.3761707544326782, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9905126094818115, "sampling/importance_sampling_ratio/min": 0.0008454708731733263, "sampling/sampling_logp_difference/max": 7.075616836547852, "sampling/sampling_logp_difference/mean": 0.07275321334600449, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 228.78125, "completions/mean_terminated_length": 228.78125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.05939402291551232, "epoch": 0.10265486725663717, "frac_reward_zero_std": 0.3125, "grad_norm": 1.6087452343972617, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 27004360.0, "reward": 0.7414413690567017, "reward_std": 0.16082945466041565, "rewards/qatch_small_update_with_fm/mean": 0.7414413690567017, "rewards/qatch_small_update_with_fm/std": 0.36743295192718506, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9876971244812012, "sampling/importance_sampling_ratio/min": 0.0087041687220335, "sampling/sampling_logp_difference/max": 4.743953227996826, "sampling/sampling_logp_difference/mean": 0.07908263057470322, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 232.23046875, "completions/mean_terminated_length": 232.23046875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06470445403829217, "epoch": 0.10442477876106195, "frac_reward_zero_std": 0.5625, "grad_norm": 1.2989246279889253, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 27654163.0, "reward": 0.6483319997787476, "reward_std": 0.07241523265838623, "rewards/qatch_small_update_with_fm/mean": 0.6483319997787476, "rewards/qatch_small_update_with_fm/std": 0.3903117775917053, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9866316318511963, "sampling/importance_sampling_ratio/min": 0.0015347807202488184, "sampling/sampling_logp_difference/max": 6.479367733001709, "sampling/sampling_logp_difference/mean": 0.08551231771707535, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 203.109375, "completions/mean_terminated_length": 203.109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.05414479412138462, "epoch": 0.10619469026548672, "frac_reward_zero_std": 0.625, "grad_norm": 1.3547680159242048, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 28066927.0, "reward": 0.8256796598434448, "reward_std": 0.12319210171699524, "rewards/qatch_small_update_with_fm/mean": 0.8256796598434448, "rewards/qatch_small_update_with_fm/std": 0.3297073543071747, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990422248840332, "sampling/importance_sampling_ratio/min": 0.008775134570896626, "sampling/sampling_logp_difference/max": 4.735833168029785, "sampling/sampling_logp_difference/mean": 0.06953281909227371, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 210.90625, "completions/mean_terminated_length": 210.90625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.05672915978357196, "epoch": 0.1079646017699115, "frac_reward_zero_std": 0.6875, "grad_norm": 1.1260978796261878, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 28441015.0, "reward": 0.5399414300918579, "reward_std": 0.08482731878757477, "rewards/qatch_small_update_with_fm/mean": 0.5399413704872131, "rewards/qatch_small_update_with_fm/std": 0.3924369812011719, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898838400840759, "sampling/importance_sampling_ratio/min": 0.00044970287126488984, "sampling/sampling_logp_difference/max": 7.706923484802246, "sampling/sampling_logp_difference/mean": 0.07207824289798737, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 221.19140625, "completions/mean_terminated_length": 221.19140625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.060915837064385414, "epoch": 0.10973451327433628, "frac_reward_zero_std": 0.5625, "grad_norm": 1.1258665241146926, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 28913320.0, "reward": 0.6639999747276306, "reward_std": 0.10557354986667633, "rewards/qatch_small_update_with_fm/mean": 0.6640000343322754, "rewards/qatch_small_update_with_fm/std": 0.37992992997169495, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889802932739258, "sampling/importance_sampling_ratio/min": 0.004290364682674408, "sampling/sampling_logp_difference/max": 5.451383590698242, "sampling/sampling_logp_difference/mean": 0.07521848380565643, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 211.0703125, "completions/mean_terminated_length": 211.0703125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.057312820106744766, "epoch": 0.11150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 1.3806557267663577, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 29287546.0, "reward": 0.8031054735183716, "reward_std": 0.13054558634757996, "rewards/qatch_small_update_with_fm/mean": 0.8031054735183716, "rewards/qatch_small_update_with_fm/std": 0.3409731090068817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9891703128814697, "sampling/importance_sampling_ratio/min": 0.0009145242511294782, "sampling/sampling_logp_difference/max": 6.997106552124023, "sampling/sampling_logp_difference/mean": 0.07143960893154144, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 209.265625, "completions/mean_terminated_length": 209.265625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.05227735638618469, "epoch": 0.11327433628318584, "frac_reward_zero_std": 0.5625, "grad_norm": 1.1583707369088834, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 29605390.0, "reward": 0.7770195007324219, "reward_std": 0.0829365998506546, "rewards/qatch_small_update_with_fm/mean": 0.7770195007324219, "rewards/qatch_small_update_with_fm/std": 0.3401285409927368, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9910151958465576, "sampling/importance_sampling_ratio/min": 9.103809134103358e-05, "sampling/sampling_logp_difference/max": 9.304232597351074, "sampling/sampling_logp_difference/mean": 0.06772209703922272, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 211.19140625, "completions/mean_terminated_length": 211.19140625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.0573471263051033, "epoch": 0.11504424778761062, "frac_reward_zero_std": 0.625, "grad_norm": 1.6612352617651236, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 30237231.0, "reward": 0.6868828535079956, "reward_std": 0.12947605550289154, "rewards/qatch_small_update_with_fm/mean": 0.6868828535079956, "rewards/qatch_small_update_with_fm/std": 0.38128235936164856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901007413864136, "sampling/importance_sampling_ratio/min": 0.0007929722778499126, "sampling/sampling_logp_difference/max": 7.1397223472595215, "sampling/sampling_logp_difference/mean": 0.0746212974190712, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 196.890625, "completions/mean_terminated_length": 196.890625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.06002651434391737, "epoch": 0.1168141592920354, "frac_reward_zero_std": 0.625, "grad_norm": 1.2459924657963664, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 30586531.0, "reward": 0.8590390682220459, "reward_std": 0.10891761630773544, "rewards/qatch_small_update_with_fm/mean": 0.8590390682220459, "rewards/qatch_small_update_with_fm/std": 0.29756253957748413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9895268678665161, "sampling/importance_sampling_ratio/min": 0.008679230697453022, "sampling/sampling_logp_difference/max": 4.746822357177734, "sampling/sampling_logp_difference/mean": 0.0752851814031601, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 213.6484375, "completions/mean_terminated_length": 213.6484375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.05108651565387845, "epoch": 0.11858407079646018, "frac_reward_zero_std": 0.5625, "grad_norm": 1.6786719907991305, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 30921465.0, "reward": 0.7343359589576721, "reward_std": 0.0922800824046135, "rewards/qatch_small_update_with_fm/mean": 0.7343359589576721, "rewards/qatch_small_update_with_fm/std": 0.3966872990131378, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9894901514053345, "sampling/importance_sampling_ratio/min": 0.0053325011394917965, "sampling/sampling_logp_difference/max": 5.2339348793029785, "sampling/sampling_logp_difference/mean": 0.06567675620317459, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 191.671875, "completions/mean_terminated_length": 191.671875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.049398843199014664, "epoch": 0.12035398230088495, "frac_reward_zero_std": 0.625, "grad_norm": 1.3480569196341743, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 31321605.0, "reward": 0.7132851481437683, "reward_std": 0.10275396704673767, "rewards/qatch_small_update_with_fm/mean": 0.7132851481437683, "rewards/qatch_small_update_with_fm/std": 0.36657455563545227, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901378750801086, "sampling/importance_sampling_ratio/min": 0.00142034946475178, "sampling/sampling_logp_difference/max": 6.556852340698242, "sampling/sampling_logp_difference/mean": 0.06882262229919434, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 226.60546875, "completions/mean_terminated_length": 226.60546875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06086216261610389, "epoch": 0.12212389380530973, "frac_reward_zero_std": 0.4375, "grad_norm": 1.3171490055496433, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 31961792.0, "reward": 0.5173789262771606, "reward_std": 0.10575304925441742, "rewards/qatch_small_update_with_fm/mean": 0.5173789262771606, "rewards/qatch_small_update_with_fm/std": 0.3906656801700592, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989037811756134, "sampling/importance_sampling_ratio/min": 0.006783433724194765, "sampling/sampling_logp_difference/max": 4.993271827697754, "sampling/sampling_logp_difference/mean": 0.07524209469556808, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 212.49609375, "completions/mean_terminated_length": 212.49609375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.06138837058097124, "epoch": 0.12389380530973451, "frac_reward_zero_std": 0.5, "grad_norm": 1.5948358740892794, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 32319615.0, "reward": 0.7251406311988831, "reward_std": 0.14819201827049255, "rewards/qatch_small_update_with_fm/mean": 0.7251405715942383, "rewards/qatch_small_update_with_fm/std": 0.4064193665981293, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.987440824508667, "sampling/importance_sampling_ratio/min": 1.3609868787511914e-08, "sampling/sampling_logp_difference/max": 18.112470626831055, "sampling/sampling_logp_difference/mean": 0.07786168903112411, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 204.21875, "completions/mean_terminated_length": 204.21875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05273952102288604, "epoch": 0.1256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.1148325278657547, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 32774903.0, "reward": 0.6098905801773071, "reward_std": 0.07109032571315765, "rewards/qatch_small_update_with_fm/mean": 0.6098905801773071, "rewards/qatch_small_update_with_fm/std": 0.3966805040836334, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.992613673210144, "sampling/importance_sampling_ratio/min": 0.006861389148980379, "sampling/sampling_logp_difference/max": 4.981845378875732, "sampling/sampling_logp_difference/mean": 0.06364965438842773, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 228.99609375, "completions/mean_terminated_length": 228.99609375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06161496136337519, "epoch": 0.12743362831858407, "frac_reward_zero_std": 0.625, "grad_norm": 1.2833428215317622, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 33585750.0, "reward": 0.6821328401565552, "reward_std": 0.11582638323307037, "rewards/qatch_small_update_with_fm/mean": 0.6821328401565552, "rewards/qatch_small_update_with_fm/std": 0.4021257758140564, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9876558780670166, "sampling/importance_sampling_ratio/min": 0.003892954671755433, "sampling/sampling_logp_difference/max": 5.548586845397949, "sampling/sampling_logp_difference/mean": 0.08207723498344421, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 211.3359375, "completions/mean_terminated_length": 211.3359375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05988115398213267, "epoch": 0.12920353982300886, "frac_reward_zero_std": 0.75, "grad_norm": 1.398151631377601, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 34136700.0, "reward": 0.5716406106948853, "reward_std": 0.04969751462340355, "rewards/qatch_small_update_with_fm/mean": 0.5716406106948853, "rewards/qatch_small_update_with_fm/std": 0.40344637632369995, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906647205352783, "sampling/importance_sampling_ratio/min": 0.005279851146042347, "sampling/sampling_logp_difference/max": 5.243857383728027, "sampling/sampling_logp_difference/mean": 0.07489632815122604, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 235.5859375, "completions/mean_terminated_length": 235.5859375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.06329893041402102, "epoch": 0.13097345132743363, "frac_reward_zero_std": 0.6875, "grad_norm": 1.0823255201111395, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 34670914.0, "reward": 0.6543593406677246, "reward_std": 0.0780901163816452, "rewards/qatch_small_update_with_fm/mean": 0.6543593406677246, "rewards/qatch_small_update_with_fm/std": 0.4128835201263428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9857140779495239, "sampling/importance_sampling_ratio/min": 0.0030068783089518547, "sampling/sampling_logp_difference/max": 5.8068528175354, "sampling/sampling_logp_difference/mean": 0.0846657007932663, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 244.5546875, "completions/mean_terminated_length": 244.5546875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07185739278793335, "epoch": 0.13274336283185842, "frac_reward_zero_std": 0.4375, "grad_norm": 1.378086589282812, "learning_rate": 1e-06, "loss": -0.0363, "num_tokens": 35206464.0, "reward": 0.6239843964576721, "reward_std": 0.14069399237632751, "rewards/qatch_small_update_with_fm/mean": 0.6239843368530273, "rewards/qatch_small_update_with_fm/std": 0.42358607053756714, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9863853454589844, "sampling/importance_sampling_ratio/min": 0.006657592952251434, "sampling/sampling_logp_difference/max": 5.011997222900391, "sampling/sampling_logp_difference/mean": 0.08819466829299927, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 220.5390625, "completions/mean_terminated_length": 220.5390625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06628159014508128, "epoch": 0.13451327433628318, "frac_reward_zero_std": 0.625, "grad_norm": 1.1875335710855202, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 35660042.0, "reward": 0.7418711185455322, "reward_std": 0.08845853805541992, "rewards/qatch_small_update_with_fm/mean": 0.7418710589408875, "rewards/qatch_small_update_with_fm/std": 0.32926684617996216, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9878979921340942, "sampling/importance_sampling_ratio/min": 0.0010934383608400822, "sampling/sampling_logp_difference/max": 6.818428039550781, "sampling/sampling_logp_difference/mean": 0.0823526680469513, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 210.45703125, "completions/mean_terminated_length": 210.45703125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0547793940640986, "epoch": 0.13628318584070798, "frac_reward_zero_std": 0.5, "grad_norm": 1.2820133241614284, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 36098735.0, "reward": 0.7723632454872131, "reward_std": 0.10817845165729523, "rewards/qatch_small_update_with_fm/mean": 0.7723632454872131, "rewards/qatch_small_update_with_fm/std": 0.32957103848457336, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900933504104614, "sampling/importance_sampling_ratio/min": 0.0019473416032269597, "sampling/sampling_logp_difference/max": 6.241290092468262, "sampling/sampling_logp_difference/mean": 0.06941035389900208, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 199.92578125, "completions/mean_terminated_length": 199.92578125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.059035807847976685, "epoch": 0.13805309734513274, "frac_reward_zero_std": 0.3125, "grad_norm": 1.810166027187809, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 36648908.0, "reward": 0.666699230670929, "reward_std": 0.1434335559606552, "rewards/qatch_small_update_with_fm/mean": 0.666699230670929, "rewards/qatch_small_update_with_fm/std": 0.36849868297576904, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9866025447845459, "sampling/importance_sampling_ratio/min": 0.006941504310816526, "sampling/sampling_logp_difference/max": 4.970236778259277, "sampling/sampling_logp_difference/mean": 0.07928824424743652, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 209.16796875, "completions/mean_terminated_length": 209.16796875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06461150664836168, "epoch": 0.13982300884955753, "frac_reward_zero_std": 0.4375, "grad_norm": 1.5742182581772457, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 37106087.0, "reward": 0.6654453277587891, "reward_std": 0.13180303573608398, "rewards/qatch_small_update_with_fm/mean": 0.6654453277587891, "rewards/qatch_small_update_with_fm/std": 0.39904341101646423, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9861378073692322, "sampling/importance_sampling_ratio/min": 0.006765482947230339, "sampling/sampling_logp_difference/max": 4.995921611785889, "sampling/sampling_logp_difference/mean": 0.08057506382465363, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 224.44921875, "completions/mean_terminated_length": 224.44921875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06497358391061425, "epoch": 0.1415929203539823, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0691186894798947, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 37761642.0, "reward": 0.7223789691925049, "reward_std": 0.03682992607355118, "rewards/qatch_small_update_with_fm/mean": 0.7223789691925049, "rewards/qatch_small_update_with_fm/std": 0.4028269946575165, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9870048761367798, "sampling/importance_sampling_ratio/min": 0.00475488742813468, "sampling/sampling_logp_difference/max": 5.3485822677612305, "sampling/sampling_logp_difference/mean": 0.08557099103927612, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 209.359375, "completions/mean_terminated_length": 209.359375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07113357819616795, "epoch": 0.1433628318584071, "frac_reward_zero_std": 0.5, "grad_norm": 1.1430097586030235, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 38304758.0, "reward": 0.664121150970459, "reward_std": 0.07573066651821136, "rewards/qatch_small_update_with_fm/mean": 0.6641210913658142, "rewards/qatch_small_update_with_fm/std": 0.3896544873714447, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9852210879325867, "sampling/importance_sampling_ratio/min": 0.004114307928830385, "sampling/sampling_logp_difference/max": 5.493284702301025, "sampling/sampling_logp_difference/mean": 0.09156718850135803, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 198.05859375, "completions/mean_terminated_length": 198.05859375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.060767903458327055, "epoch": 0.14513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 1.064463118851441, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 38814533.0, "reward": 0.5577499866485596, "reward_std": 0.03734627366065979, "rewards/qatch_small_update_with_fm/mean": 0.5577499866485596, "rewards/qatch_small_update_with_fm/std": 0.4213722348213196, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9857168197631836, "sampling/importance_sampling_ratio/min": 1.6835397076420122e-08, "sampling/sampling_logp_difference/max": 17.899782180786133, "sampling/sampling_logp_difference/mean": 0.0847933366894722, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 195.48046875, "completions/mean_terminated_length": 195.48046875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.05940087651833892, "epoch": 0.14690265486725665, "frac_reward_zero_std": 0.625, "grad_norm": 1.5926033700931235, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 39296656.0, "reward": 0.6463242173194885, "reward_std": 0.1061515137553215, "rewards/qatch_small_update_with_fm/mean": 0.6463241577148438, "rewards/qatch_small_update_with_fm/std": 0.3937905728816986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877705574035645, "sampling/importance_sampling_ratio/min": 0.004423520993441343, "sampling/sampling_logp_difference/max": 5.420819282531738, "sampling/sampling_logp_difference/mean": 0.07931964099407196, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 207.98046875, "completions/mean_terminated_length": 207.98046875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.05249483743682504, "epoch": 0.1486725663716814, "frac_reward_zero_std": 0.5625, "grad_norm": 0.8908889978726945, "learning_rate": 1e-06, "loss": -0.0215, "num_tokens": 40012699.0, "reward": 0.8066601157188416, "reward_std": 0.04902618005871773, "rewards/qatch_small_update_with_fm/mean": 0.8066601753234863, "rewards/qatch_small_update_with_fm/std": 0.3067781925201416, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9874978065490723, "sampling/importance_sampling_ratio/min": 0.0017582516884431243, "sampling/sampling_logp_difference/max": 6.343435287475586, "sampling/sampling_logp_difference/mean": 0.07647126913070679, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 211.04296875, "completions/mean_terminated_length": 211.04296875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.06565155554562807, "epoch": 0.1504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 1.3694550387992506, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 40324774.0, "reward": 0.7276562452316284, "reward_std": 0.12753424048423767, "rewards/qatch_small_update_with_fm/mean": 0.7276562452316284, "rewards/qatch_small_update_with_fm/std": 0.3714563846588135, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896517395973206, "sampling/importance_sampling_ratio/min": 0.0007163186674006283, "sampling/sampling_logp_difference/max": 7.241385459899902, "sampling/sampling_logp_difference/mean": 0.07456763833761215, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 201.8203125, "completions/mean_terminated_length": 201.8203125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.051434017252177, "epoch": 0.15221238938053097, "frac_reward_zero_std": 0.6875, "grad_norm": 1.075833278684105, "learning_rate": 1e-06, "loss": -0.0122, "num_tokens": 40738408.0, "reward": 0.7893867492675781, "reward_std": 0.07746925950050354, "rewards/qatch_small_update_with_fm/mean": 0.7893867492675781, "rewards/qatch_small_update_with_fm/std": 0.305320680141449, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9909459948539734, "sampling/importance_sampling_ratio/min": 0.00526427524164319, "sampling/sampling_logp_difference/max": 5.246811866760254, "sampling/sampling_logp_difference/mean": 0.06327448785305023, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 188.52734375, "completions/mean_terminated_length": 188.52734375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.05542505206540227, "epoch": 0.15398230088495576, "frac_reward_zero_std": 0.75, "grad_norm": 1.4599249891402826, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 41215151.0, "reward": 0.7529062032699585, "reward_std": 0.04203649237751961, "rewards/qatch_small_update_with_fm/mean": 0.7529062032699585, "rewards/qatch_small_update_with_fm/std": 0.3346794545650482, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9897748231887817, "sampling/importance_sampling_ratio/min": 9.50306002778234e-07, "sampling/sampling_logp_difference/max": 13.86648178100586, "sampling/sampling_logp_difference/mean": 0.06705738604068756, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 224.00390625, "completions/mean_terminated_length": 224.00390625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06550072925165296, "epoch": 0.15575221238938053, "frac_reward_zero_std": 0.375, "grad_norm": 1.5855353981192564, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 41742432.0, "reward": 0.6274609565734863, "reward_std": 0.15894006192684174, "rewards/qatch_small_update_with_fm/mean": 0.6274609565734863, "rewards/qatch_small_update_with_fm/std": 0.36873817443847656, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9874089360237122, "sampling/importance_sampling_ratio/min": 0.0012522448087111115, "sampling/sampling_logp_difference/max": 6.682817459106445, "sampling/sampling_logp_difference/mean": 0.08358839154243469, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 219.96484375, "completions/mean_terminated_length": 219.96484375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.06327277980744839, "epoch": 0.15752212389380532, "frac_reward_zero_std": 0.5625, "grad_norm": 2.3868624400474925, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 42285063.0, "reward": 0.6643476486206055, "reward_std": 0.09053190052509308, "rewards/qatch_small_update_with_fm/mean": 0.6643476486206055, "rewards/qatch_small_update_with_fm/std": 0.36776453256607056, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9868210554122925, "sampling/importance_sampling_ratio/min": 0.0032281808089464903, "sampling/sampling_logp_difference/max": 5.735836505889893, "sampling/sampling_logp_difference/mean": 0.08308855444192886, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 221.921875, "completions/mean_terminated_length": 221.921875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06209539761766791, "epoch": 0.1592920353982301, "frac_reward_zero_std": 0.8125, "grad_norm": 0.9604749244588173, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 42771827.0, "reward": 0.624679684638977, "reward_std": 0.05232584848999977, "rewards/qatch_small_update_with_fm/mean": 0.624679684638977, "rewards/qatch_small_update_with_fm/std": 0.4224699139595032, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900439977645874, "sampling/importance_sampling_ratio/min": 0.006829948164522648, "sampling/sampling_logp_difference/max": 4.986438274383545, "sampling/sampling_logp_difference/mean": 0.0733073502779007, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 215.45703125, "completions/mean_terminated_length": 215.45703125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.059518329333513975, "epoch": 0.16106194690265488, "frac_reward_zero_std": 0.5625, "grad_norm": 1.363715543188874, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 43140760.0, "reward": 0.6687187552452087, "reward_std": 0.0947665125131607, "rewards/qatch_small_update_with_fm/mean": 0.6687187552452087, "rewards/qatch_small_update_with_fm/std": 0.39010441303253174, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9891262650489807, "sampling/importance_sampling_ratio/min": 0.000560973712708801, "sampling/sampling_logp_difference/max": 7.485836505889893, "sampling/sampling_logp_difference/mean": 0.07687044888734818, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 238.33203125, "completions/mean_terminated_length": 238.33203125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06875412631779909, "epoch": 0.16283185840707964, "frac_reward_zero_std": 0.5625, "grad_norm": 1.2344610418561328, "learning_rate": 1e-06, "loss": -0.0178, "num_tokens": 43687517.0, "reward": 0.621164083480835, "reward_std": 0.07217143476009369, "rewards/qatch_small_update_with_fm/mean": 0.621164083480835, "rewards/qatch_small_update_with_fm/std": 0.40286827087402344, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9864582419395447, "sampling/importance_sampling_ratio/min": 0.0018237647600471973, "sampling/sampling_logp_difference/max": 6.306852340698242, "sampling/sampling_logp_difference/mean": 0.08633337914943695, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 182.59765625, "completions/mean_terminated_length": 182.59765625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.048110376577824354, "epoch": 0.16460176991150444, "frac_reward_zero_std": 0.75, "grad_norm": 1.3812354569394656, "learning_rate": 1e-06, "loss": -0.0125, "num_tokens": 44077782.0, "reward": 0.7503007650375366, "reward_std": 0.05579577013850212, "rewards/qatch_small_update_with_fm/mean": 0.7503007650375366, "rewards/qatch_small_update_with_fm/std": 0.3701190650463104, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9926257133483887, "sampling/importance_sampling_ratio/min": 0.008679235354065895, "sampling/sampling_logp_difference/max": 4.746821880340576, "sampling/sampling_logp_difference/mean": 0.06102976202964783, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 226.34765625, "completions/mean_terminated_length": 226.34765625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06354533648118377, "epoch": 0.1663716814159292, "frac_reward_zero_std": 0.4375, "grad_norm": 2.0330279818529817, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 44597039.0, "reward": 0.6143515110015869, "reward_std": 0.09292621165513992, "rewards/qatch_small_update_with_fm/mean": 0.6143515706062317, "rewards/qatch_small_update_with_fm/std": 0.4134713113307953, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9902607798576355, "sampling/importance_sampling_ratio/min": 0.0037183836102485657, "sampling/sampling_logp_difference/max": 5.594466209411621, "sampling/sampling_logp_difference/mean": 0.07333210110664368, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 216.32421875, "completions/mean_terminated_length": 216.32421875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.0570914801210165, "epoch": 0.168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 0.9805195877495109, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 45176658.0, "reward": 0.7462930083274841, "reward_std": 0.05266830697655678, "rewards/qatch_small_update_with_fm/mean": 0.7462930083274841, "rewards/qatch_small_update_with_fm/std": 0.3875497579574585, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896615743637085, "sampling/importance_sampling_ratio/min": 0.0019235039362683892, "sampling/sampling_logp_difference/max": 6.253606796264648, "sampling/sampling_logp_difference/mean": 0.07471055537462234, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 222.78125, "completions/mean_terminated_length": 222.78125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.0618698401376605, "epoch": 0.16991150442477876, "frac_reward_zero_std": 0.3125, "grad_norm": 1.2423456493533636, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 45682122.0, "reward": 0.5794335603713989, "reward_std": 0.16376078128814697, "rewards/qatch_small_update_with_fm/mean": 0.5794335603713989, "rewards/qatch_small_update_with_fm/std": 0.37759649753570557, "sampling/importance_sampling_ratio/max": 1.9816300868988037, "sampling/importance_sampling_ratio/mean": 0.9911603927612305, "sampling/importance_sampling_ratio/min": 0.005275054834783077, "sampling/sampling_logp_difference/max": 5.2447662353515625, "sampling/sampling_logp_difference/mean": 0.06948962807655334, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 199.75390625, "completions/mean_terminated_length": 199.75390625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.05948884226381779, "epoch": 0.17168141592920355, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0621083166267602, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 46077291.0, "reward": 0.8319296836853027, "reward_std": 0.10714203119277954, "rewards/qatch_small_update_with_fm/mean": 0.8319296836853027, "rewards/qatch_small_update_with_fm/std": 0.32574471831321716, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9903090000152588, "sampling/importance_sampling_ratio/min": 0.0067833466455340385, "sampling/sampling_logp_difference/max": 4.993284702301025, "sampling/sampling_logp_difference/mean": 0.07050412893295288, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 231.16796875, "completions/mean_terminated_length": 231.16796875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.055903279688209295, "epoch": 0.17345132743362832, "frac_reward_zero_std": 0.375, "grad_norm": 1.498916659342343, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 46600134.0, "reward": 0.5468984246253967, "reward_std": 0.11932238936424255, "rewards/qatch_small_update_with_fm/mean": 0.5468984246253967, "rewards/qatch_small_update_with_fm/std": 0.3713155686855316, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9923816919326782, "sampling/importance_sampling_ratio/min": 0.0067833466455340385, "sampling/sampling_logp_difference/max": 4.993284702301025, "sampling/sampling_logp_difference/mean": 0.06638407707214355, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 217.890625, "completions/mean_terminated_length": 217.890625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06640621554106474, "epoch": 0.1752212389380531, "frac_reward_zero_std": 0.4375, "grad_norm": 1.5718249727687301, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 47070490.0, "reward": 0.7290234565734863, "reward_std": 0.15270094573497772, "rewards/qatch_small_update_with_fm/mean": 0.7290234565734863, "rewards/qatch_small_update_with_fm/std": 0.3411417007446289, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9883034229278564, "sampling/importance_sampling_ratio/min": 0.005264220293611288, "sampling/sampling_logp_difference/max": 5.246822357177734, "sampling/sampling_logp_difference/mean": 0.07885263860225677, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 201.0859375, "completions/mean_terminated_length": 201.0859375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.05885794572532177, "epoch": 0.17699115044247787, "frac_reward_zero_std": 0.625, "grad_norm": 1.3345730020684947, "learning_rate": 1e-06, "loss": -0.0168, "num_tokens": 47546912.0, "reward": 0.6619570255279541, "reward_std": 0.09077616035938263, "rewards/qatch_small_update_with_fm/mean": 0.6619570255279541, "rewards/qatch_small_update_with_fm/std": 0.424406498670578, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890620112419128, "sampling/importance_sampling_ratio/min": 0.0067546493373811245, "sampling/sampling_logp_difference/max": 4.997524261474609, "sampling/sampling_logp_difference/mean": 0.07268444448709488, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 239.95703125, "completions/mean_terminated_length": 239.95703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.0687459222972393, "epoch": 0.17876106194690267, "frac_reward_zero_std": 0.625, "grad_norm": 0.8181741928189153, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 48250181.0, "reward": 0.6181562542915344, "reward_std": 0.06346184015274048, "rewards/qatch_small_update_with_fm/mean": 0.6181562542915344, "rewards/qatch_small_update_with_fm/std": 0.393032044172287, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9872247576713562, "sampling/importance_sampling_ratio/min": 0.0052973320707678795, "sampling/sampling_logp_difference/max": 5.240551948547363, "sampling/sampling_logp_difference/mean": 0.07996334880590439, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 229.55078125, "completions/mean_terminated_length": 229.55078125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.076118397526443, "epoch": 0.18053097345132743, "frac_reward_zero_std": 0.625, "grad_norm": 0.8489205627398348, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 48699602.0, "reward": 0.7388476133346558, "reward_std": 0.08579277992248535, "rewards/qatch_small_update_with_fm/mean": 0.7388476133346558, "rewards/qatch_small_update_with_fm/std": 0.3763107657432556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9902989864349365, "sampling/importance_sampling_ratio/min": 0.011183848604559898, "sampling/sampling_logp_difference/max": 4.493284702301025, "sampling/sampling_logp_difference/mean": 0.07630578428506851, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 234.83203125, "completions/mean_terminated_length": 234.83203125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06544858030974865, "epoch": 0.18230088495575222, "frac_reward_zero_std": 0.6875, "grad_norm": 1.0495184915762261, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 49279447.0, "reward": 0.7172539234161377, "reward_std": 0.08092990517616272, "rewards/qatch_small_update_with_fm/mean": 0.7172539234161377, "rewards/qatch_small_update_with_fm/std": 0.36648428440093994, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9909118413925171, "sampling/importance_sampling_ratio/min": 0.0002085514715872705, "sampling/sampling_logp_difference/max": 8.475324630737305, "sampling/sampling_logp_difference/mean": 0.07229933142662048, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 201.5234375, "completions/mean_terminated_length": 201.5234375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06334317708387971, "epoch": 0.184070796460177, "frac_reward_zero_std": 0.6875, "grad_norm": 0.9710610492311486, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 49833373.0, "reward": 0.7955077886581421, "reward_std": 0.08076589554548264, "rewards/qatch_small_update_with_fm/mean": 0.7955077886581421, "rewards/qatch_small_update_with_fm/std": 0.2992088496685028, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912827014923096, "sampling/importance_sampling_ratio/min": 0.002524232491850853, "sampling/sampling_logp_difference/max": 5.981818199157715, "sampling/sampling_logp_difference/mean": 0.07269036024808884, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 251.88671875, "completions/mean_terminated_length": 251.88671875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.07081401254981756, "epoch": 0.18584070796460178, "frac_reward_zero_std": 0.5625, "grad_norm": 1.2405682870240118, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 50403440.0, "reward": 0.7778749465942383, "reward_std": 0.07802677154541016, "rewards/qatch_small_update_with_fm/mean": 0.7778750061988831, "rewards/qatch_small_update_with_fm/std": 0.32360637187957764, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9902708530426025, "sampling/importance_sampling_ratio/min": 0.003205658169463277, "sampling/sampling_logp_difference/max": 5.742837905883789, "sampling/sampling_logp_difference/mean": 0.07808385789394379, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 206.82421875, "completions/mean_terminated_length": 206.82421875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.0713232234120369, "epoch": 0.18761061946902655, "frac_reward_zero_std": 0.5625, "grad_norm": 1.2524565483157322, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 50824931.0, "reward": 0.7011171579360962, "reward_std": 0.0960366427898407, "rewards/qatch_small_update_with_fm/mean": 0.7011171579360962, "rewards/qatch_small_update_with_fm/std": 0.37654486298561096, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9864037036895752, "sampling/importance_sampling_ratio/min": 0.004132171627134085, "sampling/sampling_logp_difference/max": 5.488952159881592, "sampling/sampling_logp_difference/mean": 0.08674513548612595, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 204.5234375, "completions/mean_terminated_length": 204.5234375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06823413353413343, "epoch": 0.18938053097345134, "frac_reward_zero_std": 0.5625, "grad_norm": 1.2203515906399123, "learning_rate": 1e-06, "loss": -0.0181, "num_tokens": 51307657.0, "reward": 0.7907344102859497, "reward_std": 0.11454281955957413, "rewards/qatch_small_update_with_fm/mean": 0.7907344102859497, "rewards/qatch_small_update_with_fm/std": 0.3722200095653534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9859263896942139, "sampling/importance_sampling_ratio/min": 0.00414169579744339, "sampling/sampling_logp_difference/max": 5.486649990081787, "sampling/sampling_logp_difference/mean": 0.0840449258685112, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 216.546875, "completions/mean_terminated_length": 216.546875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07362686283886433, "epoch": 0.1911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.1727916966743837, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 51739557.0, "reward": 0.7079882621765137, "reward_std": 0.1126692146062851, "rewards/qatch_small_update_with_fm/mean": 0.7079882621765137, "rewards/qatch_small_update_with_fm/std": 0.3950938880443573, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9893667697906494, "sampling/importance_sampling_ratio/min": 0.0005116977845318615, "sampling/sampling_logp_difference/max": 7.5777764320373535, "sampling/sampling_logp_difference/mean": 0.0835602730512619, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 235.3046875, "completions/mean_terminated_length": 235.3046875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.069594438187778, "epoch": 0.1929203539823009, "frac_reward_zero_std": 0.5625, "grad_norm": 1.1717602880661633, "learning_rate": 1e-06, "loss": -0.0122, "num_tokens": 52304035.0, "reward": 0.7542539238929749, "reward_std": 0.10196835547685623, "rewards/qatch_small_update_with_fm/mean": 0.7542538642883301, "rewards/qatch_small_update_with_fm/std": 0.3392114043235779, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988386332988739, "sampling/importance_sampling_ratio/min": 0.003735895035788417, "sampling/sampling_logp_difference/max": 5.589767932891846, "sampling/sampling_logp_difference/mean": 0.07965923845767975, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 237.87109375, "completions/mean_terminated_length": 237.87109375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.08200910221785307, "epoch": 0.19469026548672566, "frac_reward_zero_std": 0.5, "grad_norm": 1.1239702239825442, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 52823218.0, "reward": 0.7591797113418579, "reward_std": 0.13066166639328003, "rewards/qatch_small_update_with_fm/mean": 0.7591797113418579, "rewards/qatch_small_update_with_fm/std": 0.3666918873786926, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904977679252625, "sampling/importance_sampling_ratio/min": 0.008987288922071457, "sampling/sampling_logp_difference/max": 4.711944103240967, "sampling/sampling_logp_difference/mean": 0.08705293387174606, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 248.390625, "completions/mean_terminated_length": 248.390625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.07323302794247866, "epoch": 0.19646017699115045, "frac_reward_zero_std": 0.5, "grad_norm": 0.9989215025851239, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 53404518.0, "reward": 0.7107539176940918, "reward_std": 0.14668454229831696, "rewards/qatch_small_update_with_fm/mean": 0.7107539176940918, "rewards/qatch_small_update_with_fm/std": 0.3824143707752228, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9871342182159424, "sampling/importance_sampling_ratio/min": 0.004103472921997309, "sampling/sampling_logp_difference/max": 5.495921611785889, "sampling/sampling_logp_difference/mean": 0.08473586291074753, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 226.14453125, "completions/mean_terminated_length": 226.14453125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07057367824018002, "epoch": 0.19823008849557522, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0350790346928784, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 53895739.0, "reward": 0.6831991672515869, "reward_std": 0.0855279192328453, "rewards/qatch_small_update_with_fm/mean": 0.6831991672515869, "rewards/qatch_small_update_with_fm/std": 0.402239054441452, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9895448684692383, "sampling/importance_sampling_ratio/min": 0.005957757122814655, "sampling/sampling_logp_difference/max": 5.123061180114746, "sampling/sampling_logp_difference/mean": 0.08003868162631989, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 228.67578125, "completions/mean_terminated_length": 228.67578125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.08122638612985611, "epoch": 0.2, "frac_reward_zero_std": 0.6875, "grad_norm": 0.849733979561118, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 54381624.0, "reward": 0.7690976858139038, "reward_std": 0.08409841358661652, "rewards/qatch_small_update_with_fm/mean": 0.7690976858139038, "rewards/qatch_small_update_with_fm/std": 0.3112572729587555, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9858420491218567, "sampling/importance_sampling_ratio/min": 0.0054420898668468, "sampling/sampling_logp_difference/max": 5.213592052459717, "sampling/sampling_logp_difference/mean": 0.09284666180610657, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 237.3203125, "completions/mean_terminated_length": 237.3203125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.07597249001264572, "epoch": 0.20176991150442478, "frac_reward_zero_std": 0.5, "grad_norm": 0.7586615609792853, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 54832682.0, "reward": 0.6945820450782776, "reward_std": 0.09893607348203659, "rewards/qatch_small_update_with_fm/mean": 0.6945819854736328, "rewards/qatch_small_update_with_fm/std": 0.36656326055526733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901285767555237, "sampling/importance_sampling_ratio/min": 0.014312805607914925, "sampling/sampling_logp_difference/max": 4.24660062789917, "sampling/sampling_logp_difference/mean": 0.0770685225725174, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 257.66796875, "completions/mean_terminated_length": 257.66796875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.0925935534760356, "epoch": 0.20353982300884957, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0435651562227988, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 55348565.0, "reward": 0.7382968664169312, "reward_std": 0.10771472752094269, "rewards/qatch_small_update_with_fm/mean": 0.7382968664169312, "rewards/qatch_small_update_with_fm/std": 0.358004629611969, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990604043006897, "sampling/importance_sampling_ratio/min": 0.014600342139601707, "sampling/sampling_logp_difference/max": 4.226710319519043, "sampling/sampling_logp_difference/mean": 0.09014023840427399, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 251.6875, "completions/mean_terminated_length": 251.6875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.08259568270295858, "epoch": 0.20530973451327433, "frac_reward_zero_std": 0.625, "grad_norm": 0.8880503095846881, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 55758325.0, "reward": 0.7133241891860962, "reward_std": 0.09667318314313889, "rewards/qatch_small_update_with_fm/mean": 0.7133241891860962, "rewards/qatch_small_update_with_fm/std": 0.37226539850234985, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913509488105774, "sampling/importance_sampling_ratio/min": 0.004103474784642458, "sampling/sampling_logp_difference/max": 5.4959211349487305, "sampling/sampling_logp_difference/mean": 0.08229538798332214, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 254.046875, "completions/mean_terminated_length": 254.046875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.09230261761695147, "epoch": 0.20707964601769913, "frac_reward_zero_std": 0.625, "grad_norm": 1.1159689278172942, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 56245665.0, "reward": 0.782953143119812, "reward_std": 0.08451966941356659, "rewards/qatch_small_update_with_fm/mean": 0.782953143119812, "rewards/qatch_small_update_with_fm/std": 0.33307263255119324, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882062673568726, "sampling/importance_sampling_ratio/min": 0.011158311739563942, "sampling/sampling_logp_difference/max": 4.495570659637451, "sampling/sampling_logp_difference/mean": 0.09717138111591339, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 267.03515625, "completions/mean_terminated_length": 267.03515625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1017747251316905, "epoch": 0.2088495575221239, "frac_reward_zero_std": 0.625, "grad_norm": 0.8300344236250922, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 56741018.0, "reward": 0.6730078458786011, "reward_std": 0.0929047018289566, "rewards/qatch_small_update_with_fm/mean": 0.6730078458786011, "rewards/qatch_small_update_with_fm/std": 0.37552404403686523, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898161292076111, "sampling/importance_sampling_ratio/min": 0.0010080569190904498, "sampling/sampling_logp_difference/max": 6.899730682373047, "sampling/sampling_logp_difference/mean": 0.09620655328035355, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 255.59765625, "completions/mean_terminated_length": 255.59765625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.08545802906155586, "epoch": 0.21061946902654868, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0319217136508039, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 57145571.0, "reward": 0.6751953363418579, "reward_std": 0.0626310482621193, "rewards/qatch_small_update_with_fm/mean": 0.6751953363418579, "rewards/qatch_small_update_with_fm/std": 0.38396772742271423, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9910754561424255, "sampling/importance_sampling_ratio/min": 0.006784905679523945, "sampling/sampling_logp_difference/max": 4.9930548667907715, "sampling/sampling_logp_difference/mean": 0.08353398740291595, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 283.390625, "completions/mean_terminated_length": 283.390625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.09278723411262035, "epoch": 0.21238938053097345, "frac_reward_zero_std": 0.875, "grad_norm": 0.40765157194615936, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 57686151.0, "reward": 0.7513476610183716, "reward_std": 0.042032234370708466, "rewards/qatch_small_update_with_fm/mean": 0.7513476610183716, "rewards/qatch_small_update_with_fm/std": 0.33298224210739136, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9905245304107666, "sampling/importance_sampling_ratio/min": 7.85573092798586e-07, "sampling/sampling_logp_difference/max": 14.056852340698242, "sampling/sampling_logp_difference/mean": 0.09131969511508942, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 253.28125, "completions/mean_terminated_length": 253.28125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.08711250498890877, "epoch": 0.21415929203539824, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7822434137440697, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 58111615.0, "reward": 0.7699062824249268, "reward_std": 0.14036858081817627, "rewards/qatch_small_update_with_fm/mean": 0.7699062824249268, "rewards/qatch_small_update_with_fm/std": 0.3514271378517151, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898660182952881, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.09157545864582062, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 259.3515625, "completions/mean_terminated_length": 259.3515625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.091193076223135, "epoch": 0.215929203539823, "frac_reward_zero_std": 0.625, "grad_norm": 0.7754221166721602, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 58597945.0, "reward": 0.6951679587364197, "reward_std": 0.11067484319210052, "rewards/qatch_small_update_with_fm/mean": 0.6951679587364197, "rewards/qatch_small_update_with_fm/std": 0.38288044929504395, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879680871963501, "sampling/importance_sampling_ratio/min": 0.005322370678186417, "sampling/sampling_logp_difference/max": 5.235836505889893, "sampling/sampling_logp_difference/mean": 0.09452269226312637, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 279.49609375, "completions/mean_terminated_length": 279.49609375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.09114830754697323, "epoch": 0.2176991150442478, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6514437850174619, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 58993496.0, "reward": 0.6201406121253967, "reward_std": 0.09873516857624054, "rewards/qatch_small_update_with_fm/mean": 0.6201406121253967, "rewards/qatch_small_update_with_fm/std": 0.4052762985229492, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9893493056297302, "sampling/importance_sampling_ratio/min": 0.0015082587487995625, "sampling/sampling_logp_difference/max": 6.496799468994141, "sampling/sampling_logp_difference/mean": 0.08865205943584442, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 268.82421875, "completions/mean_terminated_length": 268.82421875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.09484461508691311, "epoch": 0.21946902654867256, "frac_reward_zero_std": 0.5625, "grad_norm": 0.934234614851166, "learning_rate": 1e-06, "loss": -0.021, "num_tokens": 59620395.0, "reward": 0.6767265200614929, "reward_std": 0.10116851329803467, "rewards/qatch_small_update_with_fm/mean": 0.6767265796661377, "rewards/qatch_small_update_with_fm/std": 0.3683614432811737, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9874666333198547, "sampling/importance_sampling_ratio/min": 0.011243586428463459, "sampling/sampling_logp_difference/max": 4.48795747756958, "sampling/sampling_logp_difference/mean": 0.09557405114173889, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 262.40234375, "completions/mean_terminated_length": 262.40234375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.09813018050044775, "epoch": 0.22123893805309736, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5368316481587041, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 59913058.0, "reward": 0.7477929592132568, "reward_std": 0.032144203782081604, "rewards/qatch_small_update_with_fm/mean": 0.7477929592132568, "rewards/qatch_small_update_with_fm/std": 0.3281362056732178, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9916006326675415, "sampling/importance_sampling_ratio/min": 0.018390489742159843, "sampling/sampling_logp_difference/max": 3.9959216117858887, "sampling/sampling_logp_difference/mean": 0.08756014704704285, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 276.3359375, "completions/mean_terminated_length": 276.3359375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.10066359303891659, "epoch": 0.22300884955752212, "frac_reward_zero_std": 0.4375, "grad_norm": 0.838232937849372, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 60417352.0, "reward": 0.6837852001190186, "reward_std": 0.15973472595214844, "rewards/qatch_small_update_with_fm/mean": 0.6837852001190186, "rewards/qatch_small_update_with_fm/std": 0.37046951055526733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911990165710449, "sampling/importance_sampling_ratio/min": 0.011186771094799042, "sampling/sampling_logp_difference/max": 4.49302339553833, "sampling/sampling_logp_difference/mean": 0.09084731340408325, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 259.4140625, "completions/mean_terminated_length": 259.4140625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.08994897082448006, "epoch": 0.2247787610619469, "frac_reward_zero_std": 0.5, "grad_norm": 0.7653161293513729, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 60800850.0, "reward": 0.6803281307220459, "reward_std": 0.14784881472587585, "rewards/qatch_small_update_with_fm/mean": 0.6803281307220459, "rewards/qatch_small_update_with_fm/std": 0.40771958231925964, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9874090552330017, "sampling/importance_sampling_ratio/min": 0.002029245253652334, "sampling/sampling_logp_difference/max": 6.200091361999512, "sampling/sampling_logp_difference/mean": 0.09263262152671814, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 280.67578125, "completions/mean_terminated_length": 280.67578125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.1118616834282875, "epoch": 0.22654867256637168, "frac_reward_zero_std": 0.5625, "grad_norm": 0.8858334262845288, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 61328671.0, "reward": 0.6529296636581421, "reward_std": 0.09167185425758362, "rewards/qatch_small_update_with_fm/mean": 0.6529296636581421, "rewards/qatch_small_update_with_fm/std": 0.41027647256851196, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9881945848464966, "sampling/importance_sampling_ratio/min": 0.0045516896061599255, "sampling/sampling_logp_difference/max": 5.392256736755371, "sampling/sampling_logp_difference/mean": 0.10528907924890518, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 302.046875, "completions/mean_terminated_length": 302.046875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.10696916934102774, "epoch": 0.22831858407079647, "frac_reward_zero_std": 0.4375, "grad_norm": 0.9604443746123174, "learning_rate": 1e-06, "loss": -0.006, "num_tokens": 61709419.0, "reward": 0.6226562261581421, "reward_std": 0.1739925742149353, "rewards/qatch_small_update_with_fm/mean": 0.6226562261581421, "rewards/qatch_small_update_with_fm/std": 0.39967477321624756, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892038106918335, "sampling/importance_sampling_ratio/min": 0.0011153517989441752, "sampling/sampling_logp_difference/max": 6.798585414886475, "sampling/sampling_logp_difference/mean": 0.09998883306980133, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 305.88671875, "completions/mean_terminated_length": 305.88671875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.09376539383083582, "epoch": 0.23008849557522124, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7862261535767899, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 62132238.0, "reward": 0.6461484432220459, "reward_std": 0.15436768531799316, "rewards/qatch_small_update_with_fm/mean": 0.6461484432220459, "rewards/qatch_small_update_with_fm/std": 0.40689408779144287, "sampling/importance_sampling_ratio/max": 1.9475082159042358, "sampling/importance_sampling_ratio/mean": 0.9900339841842651, "sampling/importance_sampling_ratio/min": 0.012612488120794296, "sampling/sampling_logp_difference/max": 4.373067855834961, "sampling/sampling_logp_difference/mean": 0.08664417266845703, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 298.4453125, "completions/mean_terminated_length": 298.4453125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.10303333774209023, "epoch": 0.23185840707964603, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0219844010417432, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 62922736.0, "reward": 0.7904140949249268, "reward_std": 0.09890885651111603, "rewards/qatch_small_update_with_fm/mean": 0.7904140949249268, "rewards/qatch_small_update_with_fm/std": 0.3384302258491516, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9887751340866089, "sampling/importance_sampling_ratio/min": 0.006941414438188076, "sampling/sampling_logp_difference/max": 4.970249652862549, "sampling/sampling_logp_difference/mean": 0.1004166379570961, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 272.5390625, "completions/mean_terminated_length": 272.5390625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.10145942401140928, "epoch": 0.2336283185840708, "frac_reward_zero_std": 0.625, "grad_norm": 1.0066570470039222, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 63381930.0, "reward": 0.7727656364440918, "reward_std": 0.04941270500421524, "rewards/qatch_small_update_with_fm/mean": 0.7727656364440918, "rewards/qatch_small_update_with_fm/std": 0.3472408652305603, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879018068313599, "sampling/importance_sampling_ratio/min": 0.005326686892658472, "sampling/sampling_logp_difference/max": 5.235025882720947, "sampling/sampling_logp_difference/mean": 0.10085846483707428, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 281.359375, "completions/mean_terminated_length": 281.359375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.0962858721613884, "epoch": 0.23539823008849559, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8718101101072838, "learning_rate": 1e-06, "loss": -0.0134, "num_tokens": 63788758.0, "reward": 0.6256601810455322, "reward_std": 0.11285711824893951, "rewards/qatch_small_update_with_fm/mean": 0.6256601810455322, "rewards/qatch_small_update_with_fm/std": 0.39782294631004333, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9886201620101929, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.09229514002799988, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 313.6015625, "completions/mean_terminated_length": 313.6015625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.10426320135593414, "epoch": 0.23716814159292035, "frac_reward_zero_std": 0.625, "grad_norm": 0.620772979902059, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 64252768.0, "reward": 0.7330859899520874, "reward_std": 0.10503730177879333, "rewards/qatch_small_update_with_fm/mean": 0.7330859899520874, "rewards/qatch_small_update_with_fm/std": 0.35654518008232117, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911209344863892, "sampling/importance_sampling_ratio/min": 0.009995303116738796, "sampling/sampling_logp_difference/max": 4.605639934539795, "sampling/sampling_logp_difference/mean": 0.094190314412117, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 317.60546875, "completions/mean_terminated_length": 317.60546875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.10367918200790882, "epoch": 0.23893805309734514, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6907269231345502, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 64625051.0, "reward": 0.5382304787635803, "reward_std": 0.14928914606571198, "rewards/qatch_small_update_with_fm/mean": 0.5382304191589355, "rewards/qatch_small_update_with_fm/std": 0.3985642194747925, "sampling/importance_sampling_ratio/max": 1.9336779117584229, "sampling/importance_sampling_ratio/mean": 0.9892072677612305, "sampling/importance_sampling_ratio/min": 0.01847412809729576, "sampling/sampling_logp_difference/max": 3.9913840293884277, "sampling/sampling_logp_difference/mean": 0.09558893740177155, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 372.515625, "completions/mean_terminated_length": 313.4127197265625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.10493994690477848, "epoch": 0.2407079646017699, "frac_reward_zero_std": 0.625, "grad_norm": 0.6150747718251923, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 65142431.0, "reward": 0.5792617201805115, "reward_std": 0.07002231478691101, "rewards/qatch_small_update_with_fm/mean": 0.5792617201805115, "rewards/qatch_small_update_with_fm/std": 0.4229722023010254, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899119734764099, "sampling/importance_sampling_ratio/min": 0.011183901689946651, "sampling/sampling_logp_difference/max": 4.493279933929443, "sampling/sampling_logp_difference/mean": 0.10026513040065765, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 328.55859375, "completions/mean_terminated_length": 328.55859375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.12148185446858406, "epoch": 0.2424778761061947, "frac_reward_zero_std": 0.375, "grad_norm": 0.8001348116132405, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 65636302.0, "reward": 0.5703476667404175, "reward_std": 0.13366132974624634, "rewards/qatch_small_update_with_fm/mean": 0.5703476667404175, "rewards/qatch_small_update_with_fm/std": 0.3910647928714752, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913745522499084, "sampling/importance_sampling_ratio/min": 0.014454490505158901, "sampling/sampling_logp_difference/max": 4.23675012588501, "sampling/sampling_logp_difference/mean": 0.10473871231079102, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 331.17578125, "completions/mean_terminated_length": 331.17578125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.12022528424859047, "epoch": 0.24424778761061947, "frac_reward_zero_std": 0.5, "grad_norm": 0.7779054985874481, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 66158427.0, "reward": 0.6807109713554382, "reward_std": 0.09024008363485336, "rewards/qatch_small_update_with_fm/mean": 0.6807109713554382, "rewards/qatch_small_update_with_fm/std": 0.37375155091285706, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9935262799263, "sampling/importance_sampling_ratio/min": 0.0002641607716213912, "sampling/sampling_logp_difference/max": 8.23895263671875, "sampling/sampling_logp_difference/mean": 0.1035434752702713, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 315.984375, "completions/mean_terminated_length": 315.984375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.1255191322416067, "epoch": 0.24601769911504426, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6235699081867757, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 66667463.0, "reward": 0.6528828144073486, "reward_std": 0.10219062864780426, "rewards/qatch_small_update_with_fm/mean": 0.6528828144073486, "rewards/qatch_small_update_with_fm/std": 0.3693510890007019, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911524653434753, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.10624203085899353, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 322.84375, "completions/mean_terminated_length": 322.84375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.11273289285600185, "epoch": 0.24778761061946902, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8694494197292071, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 67065663.0, "reward": 0.6909765005111694, "reward_std": 0.15022653341293335, "rewards/qatch_small_update_with_fm/mean": 0.6909765005111694, "rewards/qatch_small_update_with_fm/std": 0.4188796579837799, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911254644393921, "sampling/importance_sampling_ratio/min": 0.018390489742159843, "sampling/sampling_logp_difference/max": 3.9959216117858887, "sampling/sampling_logp_difference/mean": 0.09885202348232269, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 307.7734375, "completions/mean_terminated_length": 307.7734375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.10854166280478239, "epoch": 0.24955752212389382, "frac_reward_zero_std": 0.625, "grad_norm": 0.6582767132558051, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 67630293.0, "reward": 0.7654882669448853, "reward_std": 0.13209347426891327, "rewards/qatch_small_update_with_fm/mean": 0.7654882669448853, "rewards/qatch_small_update_with_fm/std": 0.3466072976589203, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9894741773605347, "sampling/importance_sampling_ratio/min": 0.008697095327079296, "sampling/sampling_logp_difference/max": 4.7447662353515625, "sampling/sampling_logp_difference/mean": 0.10250886529684067, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 330.546875, "completions/mean_terminated_length": 330.546875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.11137775052338839, "epoch": 0.2513274336283186, "frac_reward_zero_std": 0.6875, "grad_norm": 0.39969463474896644, "learning_rate": 1e-06, "loss": -0.013, "num_tokens": 68206081.0, "reward": 0.6641288995742798, "reward_std": 0.08044249564409256, "rewards/qatch_small_update_with_fm/mean": 0.6641288995742798, "rewards/qatch_small_update_with_fm/std": 0.40250152349472046, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914456605911255, "sampling/importance_sampling_ratio/min": 0.01118385884910822, "sampling/sampling_logp_difference/max": 4.493283748626709, "sampling/sampling_logp_difference/mean": 0.09565605968236923, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 282.18359375, "completions/mean_terminated_length": 282.18359375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.11050462163984776, "epoch": 0.25309734513274335, "frac_reward_zero_std": 0.75, "grad_norm": 0.5152218599616128, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 68596064.0, "reward": 0.7529257535934448, "reward_std": 0.06564027070999146, "rewards/qatch_small_update_with_fm/mean": 0.7529257535934448, "rewards/qatch_small_update_with_fm/std": 0.34699419140815735, "sampling/importance_sampling_ratio/max": 1.8420151472091675, "sampling/importance_sampling_ratio/mean": 0.9903613328933716, "sampling/importance_sampling_ratio/min": 0.011400715447962284, "sampling/sampling_logp_difference/max": 4.474079132080078, "sampling/sampling_logp_difference/mean": 0.09959550946950912, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 281.70703125, "completions/mean_terminated_length": 281.70703125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.11399852950125933, "epoch": 0.25486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 0.5059618551556871, "learning_rate": 1e-06, "loss": -0.0185, "num_tokens": 68975989.0, "reward": 0.8365429639816284, "reward_std": 0.06033072993159294, "rewards/qatch_small_update_with_fm/mean": 0.8365429639816284, "rewards/qatch_small_update_with_fm/std": 0.3398571312427521, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9893470406532288, "sampling/importance_sampling_ratio/min": 0.014339085668325424, "sampling/sampling_logp_difference/max": 4.2447662353515625, "sampling/sampling_logp_difference/mean": 0.10417570918798447, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 311.2265625, "completions/mean_terminated_length": 311.2265625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.11174062639474869, "epoch": 0.25663716814159293, "frac_reward_zero_std": 0.6875, "grad_norm": 0.50898681753391, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 69318207.0, "reward": 0.6523163914680481, "reward_std": 0.058695483952760696, "rewards/qatch_small_update_with_fm/mean": 0.6523163914680481, "rewards/qatch_small_update_with_fm/std": 0.4232395589351654, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892643690109253, "sampling/importance_sampling_ratio/min": 0.01455187052488327, "sampling/sampling_logp_difference/max": 4.230035781860352, "sampling/sampling_logp_difference/mean": 0.10058675706386566, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 283.91015625, "completions/mean_terminated_length": 283.91015625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.09814571868628263, "epoch": 0.2584070796460177, "frac_reward_zero_std": 0.9375, "grad_norm": 0.21386235041296422, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 69840936.0, "reward": 0.8148593902587891, "reward_std": 0.004824659787118435, "rewards/qatch_small_update_with_fm/mean": 0.8148593902587891, "rewards/qatch_small_update_with_fm/std": 0.30982932448387146, "sampling/importance_sampling_ratio/max": 1.8642206192016602, "sampling/importance_sampling_ratio/mean": 0.9873814582824707, "sampling/importance_sampling_ratio/min": 0.014291773550212383, "sampling/sampling_logp_difference/max": 4.248071193695068, "sampling/sampling_logp_difference/mean": 0.09649167209863663, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 318.7734375, "completions/mean_terminated_length": 318.7734375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.10984945390373468, "epoch": 0.26017699115044246, "frac_reward_zero_std": 0.5, "grad_norm": 0.7232177387743879, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 70217950.0, "reward": 0.7357734441757202, "reward_std": 0.10225854814052582, "rewards/qatch_small_update_with_fm/mean": 0.7357734441757202, "rewards/qatch_small_update_with_fm/std": 0.3676723837852478, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927917718887329, "sampling/importance_sampling_ratio/min": 0.00675980793312192, "sampling/sampling_logp_difference/max": 4.996760845184326, "sampling/sampling_logp_difference/mean": 0.09779518842697144, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 327.26171875, "completions/mean_terminated_length": 327.26171875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.12225450947880745, "epoch": 0.26194690265486725, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6635589210066514, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 70701761.0, "reward": 0.7450859546661377, "reward_std": 0.13423468172550201, "rewards/qatch_small_update_with_fm/mean": 0.7450859546661377, "rewards/qatch_small_update_with_fm/std": 0.35765355825424194, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927431344985962, "sampling/importance_sampling_ratio/min": 0.008697601035237312, "sampling/sampling_logp_difference/max": 4.744708061218262, "sampling/sampling_logp_difference/mean": 0.10236505419015884, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 304.6171875, "completions/mean_terminated_length": 304.6171875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.10852374881505966, "epoch": 0.26371681415929205, "frac_reward_zero_std": 0.75, "grad_norm": 0.5372910231979201, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 71125407.0, "reward": 0.7528905868530273, "reward_std": 0.046875, "rewards/qatch_small_update_with_fm/mean": 0.7528906464576721, "rewards/qatch_small_update_with_fm/std": 0.31242799758911133, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9933427572250366, "sampling/importance_sampling_ratio/min": 0.003236274002119899, "sampling/sampling_logp_difference/max": 5.733332633972168, "sampling/sampling_logp_difference/mean": 0.0950489193201065, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 327.58984375, "completions/mean_terminated_length": 327.58984375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.106517244130373, "epoch": 0.26548672566371684, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3782993611564517, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 71482758.0, "reward": 0.746874988079071, "reward_std": 0.030320392921566963, "rewards/qatch_small_update_with_fm/mean": 0.746874988079071, "rewards/qatch_small_update_with_fm/std": 0.31360676884651184, "sampling/importance_sampling_ratio/max": 1.9069268703460693, "sampling/importance_sampling_ratio/mean": 0.9913297891616821, "sampling/importance_sampling_ratio/min": 0.014322609640657902, "sampling/sampling_logp_difference/max": 4.24591588973999, "sampling/sampling_logp_difference/mean": 0.09537242352962494, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 275.15625, "completions/mean_terminated_length": 275.15625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.10477493796497583, "epoch": 0.2672566371681416, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7161847098933055, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 72037950.0, "reward": 0.8379726409912109, "reward_std": 0.07797817140817642, "rewards/qatch_small_update_with_fm/mean": 0.8379726409912109, "rewards/qatch_small_update_with_fm/std": 0.29647859930992126, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899912476539612, "sampling/importance_sampling_ratio/min": 0.0015248883282765746, "sampling/sampling_logp_difference/max": 6.485834121704102, "sampling/sampling_logp_difference/mean": 0.10141263902187347, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 310.3984375, "completions/mean_terminated_length": 310.3984375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.11613454110920429, "epoch": 0.26902654867256637, "frac_reward_zero_std": 0.625, "grad_norm": 0.6891517872451888, "learning_rate": 1e-06, "loss": 0.0335, "num_tokens": 72599940.0, "reward": 0.7354961037635803, "reward_std": 0.1144690066576004, "rewards/qatch_small_update_with_fm/mean": 0.7354961037635803, "rewards/qatch_small_update_with_fm/std": 0.33235809206962585, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911872148513794, "sampling/importance_sampling_ratio/min": 0.0007149595185182989, "sampling/sampling_logp_difference/max": 7.243284702301025, "sampling/sampling_logp_difference/mean": 0.10409478843212128, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 341.36328125, "completions/mean_terminated_length": 341.36328125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.11662444099783897, "epoch": 0.27079646017699116, "frac_reward_zero_std": 0.25, "grad_norm": 0.8534737634982954, "learning_rate": 1e-06, "loss": -0.0059, "num_tokens": 73316417.0, "reward": 0.5821914672851562, "reward_std": 0.20425722002983093, "rewards/qatch_small_update_with_fm/mean": 0.5821914076805115, "rewards/qatch_small_update_with_fm/std": 0.4198920726776123, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9909311532974243, "sampling/importance_sampling_ratio/min": 0.014549053274095058, "sampling/sampling_logp_difference/max": 4.230229377746582, "sampling/sampling_logp_difference/mean": 0.10373535752296448, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 340.15234375, "completions/mean_terminated_length": 340.15234375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.11665886919945478, "epoch": 0.27256637168141595, "frac_reward_zero_std": 0.375, "grad_norm": 0.9040651908852855, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 73947096.0, "reward": 0.7290664315223694, "reward_std": 0.12613099813461304, "rewards/qatch_small_update_with_fm/mean": 0.7290664315223694, "rewards/qatch_small_update_with_fm/std": 0.38859787583351135, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9903141856193542, "sampling/importance_sampling_ratio/min": 0.011161153204739094, "sampling/sampling_logp_difference/max": 4.495316028594971, "sampling/sampling_logp_difference/mean": 0.10388841480016708, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 311.44921875, "completions/mean_terminated_length": 311.44921875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.10327374562621117, "epoch": 0.2743362831858407, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6888872039396743, "learning_rate": 1e-06, "loss": -0.0406, "num_tokens": 74477547.0, "reward": 0.6128828525543213, "reward_std": 0.10392679274082184, "rewards/qatch_small_update_with_fm/mean": 0.6128827929496765, "rewards/qatch_small_update_with_fm/std": 0.3436572849750519, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906870126724243, "sampling/importance_sampling_ratio/min": 0.005150408949702978, "sampling/sampling_logp_difference/max": 5.268679141998291, "sampling/sampling_logp_difference/mean": 0.09823443740606308, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 322.6484375, "completions/mean_terminated_length": 322.6484375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.11082449927926064, "epoch": 0.2761061946902655, "frac_reward_zero_std": 0.625, "grad_norm": 0.7015851796183483, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 75135905.0, "reward": 0.7353945374488831, "reward_std": 0.09259958565235138, "rewards/qatch_small_update_with_fm/mean": 0.7353945374488831, "rewards/qatch_small_update_with_fm/std": 0.35178375244140625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904823899269104, "sampling/importance_sampling_ratio/min": 0.0068324264138937, "sampling/sampling_logp_difference/max": 4.986075401306152, "sampling/sampling_logp_difference/mean": 0.09970168769359589, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 340.32421875, "completions/mean_terminated_length": 340.32421875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.12680564727634192, "epoch": 0.2778761061946903, "frac_reward_zero_std": 0.5, "grad_norm": 0.719573916814778, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 75662164.0, "reward": 0.7470703125, "reward_std": 0.09985685348510742, "rewards/qatch_small_update_with_fm/mean": 0.7470703125, "rewards/qatch_small_update_with_fm/std": 0.3765318989753723, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9909191131591797, "sampling/importance_sampling_ratio/min": 0.011168654076755047, "sampling/sampling_logp_difference/max": 4.4946441650390625, "sampling/sampling_logp_difference/mean": 0.1114622950553894, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 320.4140625, "completions/mean_terminated_length": 320.4140625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.10564819350838661, "epoch": 0.27964601769911507, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6426424897825909, "learning_rate": 1e-06, "loss": -0.0181, "num_tokens": 76192926.0, "reward": 0.6906836032867432, "reward_std": 0.1533028781414032, "rewards/qatch_small_update_with_fm/mean": 0.6906836032867432, "rewards/qatch_small_update_with_fm/std": 0.39270642399787903, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899740815162659, "sampling/importance_sampling_ratio/min": 0.0009147842065431178, "sampling/sampling_logp_difference/max": 6.996822357177734, "sampling/sampling_logp_difference/mean": 0.10140345990657806, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 316.74609375, "completions/mean_terminated_length": 316.74609375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.11091221682727337, "epoch": 0.2814159292035398, "frac_reward_zero_std": 0.5, "grad_norm": 0.6797052787098364, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 76756973.0, "reward": 0.8056992292404175, "reward_std": 0.11062506586313248, "rewards/qatch_small_update_with_fm/mean": 0.8056992292404175, "rewards/qatch_small_update_with_fm/std": 0.32723745703697205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991296648979187, "sampling/importance_sampling_ratio/min": 0.006754668429493904, "sampling/sampling_logp_difference/max": 4.99752140045166, "sampling/sampling_logp_difference/mean": 0.10075265914201736, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 336.171875, "completions/mean_terminated_length": 336.171875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.11384789273142815, "epoch": 0.2831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 0.8797097409635665, "learning_rate": 1e-06, "loss": -0.0094, "num_tokens": 77260409.0, "reward": 0.4690038859844208, "reward_std": 0.10574622452259064, "rewards/qatch_small_update_with_fm/mean": 0.46900391578674316, "rewards/qatch_small_update_with_fm/std": 0.41652530431747437, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927676916122437, "sampling/importance_sampling_ratio/min": 0.009872177615761757, "sampling/sampling_logp_difference/max": 4.618034839630127, "sampling/sampling_logp_difference/mean": 0.1024760752916336, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 346.1171875, "completions/mean_terminated_length": 346.1171875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.11178999207913876, "epoch": 0.2849557522123894, "frac_reward_zero_std": 0.625, "grad_norm": 0.6811957283219384, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 77883415.0, "reward": 0.7185937166213989, "reward_std": 0.13636133074760437, "rewards/qatch_small_update_with_fm/mean": 0.7185937762260437, "rewards/qatch_small_update_with_fm/std": 0.3618468940258026, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9907670021057129, "sampling/importance_sampling_ratio/min": 0.0018389016622677445, "sampling/sampling_logp_difference/max": 6.298586845397949, "sampling/sampling_logp_difference/mean": 0.10185998678207397, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 338.296875, "completions/mean_terminated_length": 338.296875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.10594448260962963, "epoch": 0.2867256637168142, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7268038126045898, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 78273699.0, "reward": 0.6265742182731628, "reward_std": 0.13848204910755157, "rewards/qatch_small_update_with_fm/mean": 0.6265742182731628, "rewards/qatch_small_update_with_fm/std": 0.433919221162796, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990743100643158, "sampling/importance_sampling_ratio/min": 0.0035162202548235655, "sampling/sampling_logp_difference/max": 5.650368690490723, "sampling/sampling_logp_difference/mean": 0.09895285218954086, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 362.01953125, "completions/mean_terminated_length": 362.01953125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.11295005213469267, "epoch": 0.2884955752212389, "frac_reward_zero_std": 0.4375, "grad_norm": 0.591131789029338, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 78820680.0, "reward": 0.734011709690094, "reward_std": 0.11754102259874344, "rewards/qatch_small_update_with_fm/mean": 0.7340116500854492, "rewards/qatch_small_update_with_fm/std": 0.32903358340263367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918836355209351, "sampling/importance_sampling_ratio/min": 8.4981948020868e-05, "sampling/sampling_logp_difference/max": 9.373071670532227, "sampling/sampling_logp_difference/mean": 0.10325956344604492, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 345.74609375, "completions/mean_terminated_length": 345.74609375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.1067540692165494, "epoch": 0.2902654867256637, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6919553844927818, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 79378007.0, "reward": 0.7622460722923279, "reward_std": 0.11750834435224533, "rewards/qatch_small_update_with_fm/mean": 0.7622460722923279, "rewards/qatch_small_update_with_fm/std": 0.38261905312538147, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9902242422103882, "sampling/importance_sampling_ratio/min": 0.010629362426698208, "sampling/sampling_logp_difference/max": 4.544135093688965, "sampling/sampling_logp_difference/mean": 0.09993548691272736, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 289.91015625, "completions/mean_terminated_length": 289.91015625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.09920257981866598, "epoch": 0.2920353982300885, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6278776429791636, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 79901744.0, "reward": 0.9076640605926514, "reward_std": 0.07271681725978851, "rewards/qatch_small_update_with_fm/mean": 0.9076640605926514, "rewards/qatch_small_update_with_fm/std": 0.2249380499124527, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918192625045776, "sampling/importance_sampling_ratio/min": 0.011175889521837234, "sampling/sampling_logp_difference/max": 4.493996620178223, "sampling/sampling_logp_difference/mean": 0.09208320826292038, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 293.078125, "completions/mean_terminated_length": 293.078125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.09951941296458244, "epoch": 0.2938053097345133, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7492559799835081, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 80436596.0, "reward": 0.7066914439201355, "reward_std": 0.11800296604633331, "rewards/qatch_small_update_with_fm/mean": 0.7066914439201355, "rewards/qatch_small_update_with_fm/std": 0.357426255941391, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898148775100708, "sampling/importance_sampling_ratio/min": 0.0004474441520869732, "sampling/sampling_logp_difference/max": 7.711958885192871, "sampling/sampling_logp_difference/mean": 0.09365606307983398, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 317.6875, "completions/mean_terminated_length": 317.6875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.11481165885925293, "epoch": 0.29557522123893804, "frac_reward_zero_std": 0.625, "grad_norm": 0.5643059743627443, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 80971908.0, "reward": 0.5545429587364197, "reward_std": 0.05453858524560928, "rewards/qatch_small_update_with_fm/mean": 0.5545429587364197, "rewards/qatch_small_update_with_fm/std": 0.4164023995399475, "sampling/importance_sampling_ratio/max": 1.9052268266677856, "sampling/importance_sampling_ratio/mean": 0.9890497922897339, "sampling/importance_sampling_ratio/min": 0.012715059332549572, "sampling/sampling_logp_difference/max": 4.364968299865723, "sampling/sampling_logp_difference/mean": 0.10033901780843735, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 324.82421875, "completions/mean_terminated_length": 324.82421875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.11254716478288174, "epoch": 0.2973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 0.5113955054646178, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 81429943.0, "reward": 0.7119725942611694, "reward_std": 0.05836210772395134, "rewards/qatch_small_update_with_fm/mean": 0.7119725942611694, "rewards/qatch_small_update_with_fm/std": 0.3997737765312195, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912292957305908, "sampling/importance_sampling_ratio/min": 0.00725527573376894, "sampling/sampling_logp_difference/max": 4.926026344299316, "sampling/sampling_logp_difference/mean": 0.10253892838954926, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 324.08203125, "completions/mean_terminated_length": 324.08203125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.09713317174464464, "epoch": 0.2991150442477876, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8651796151859013, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 81927804.0, "reward": 0.6655507683753967, "reward_std": 0.122608982026577, "rewards/qatch_small_update_with_fm/mean": 0.6655508279800415, "rewards/qatch_small_update_with_fm/std": 0.3819872736930847, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991803765296936, "sampling/importance_sampling_ratio/min": 2.927790774265304e-06, "sampling/sampling_logp_difference/max": 12.741262435913086, "sampling/sampling_logp_difference/mean": 0.090723916888237, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 307.00390625, "completions/mean_terminated_length": 307.00390625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.09481226559728384, "epoch": 0.3008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 0.7662300408531278, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 82321245.0, "reward": 0.7684023380279541, "reward_std": 0.1258222609758377, "rewards/qatch_small_update_with_fm/mean": 0.7684023380279541, "rewards/qatch_small_update_with_fm/std": 0.3740701675415039, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988567590713501, "sampling/importance_sampling_ratio/min": 0.01430974155664444, "sampling/sampling_logp_difference/max": 4.246814727783203, "sampling/sampling_logp_difference/mean": 0.09349850565195084, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 313.04296875, "completions/mean_terminated_length": 313.04296875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.10745582170784473, "epoch": 0.30265486725663715, "frac_reward_zero_std": 0.6875, "grad_norm": 1.121831443907111, "learning_rate": 1e-06, "loss": 0.032, "num_tokens": 82698712.0, "reward": 0.7548906207084656, "reward_std": 0.09169437736272812, "rewards/qatch_small_update_with_fm/mean": 0.7548906803131104, "rewards/qatch_small_update_with_fm/std": 0.3765425384044647, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882317185401917, "sampling/importance_sampling_ratio/min": 0.014339085668325424, "sampling/sampling_logp_difference/max": 4.2447662353515625, "sampling/sampling_logp_difference/mean": 0.10578665137290955, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 305.1640625, "completions/mean_terminated_length": 305.1640625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.09699310921132565, "epoch": 0.30442477876106194, "frac_reward_zero_std": 0.5625, "grad_norm": 0.623628416847649, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 83212354.0, "reward": 0.8881992101669312, "reward_std": 0.10021714866161346, "rewards/qatch_small_update_with_fm/mean": 0.8881992101669312, "rewards/qatch_small_update_with_fm/std": 0.2322145402431488, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9884635210037231, "sampling/importance_sampling_ratio/min": 0.011157982051372528, "sampling/sampling_logp_difference/max": 4.49560022354126, "sampling/sampling_logp_difference/mean": 0.09806492924690247, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 306.546875, "completions/mean_terminated_length": 306.546875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.09263870492577553, "epoch": 0.30619469026548674, "frac_reward_zero_std": 0.4375, "grad_norm": 0.9500327430114363, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 83867950.0, "reward": 0.7660039067268372, "reward_std": 0.08824647963047028, "rewards/qatch_small_update_with_fm/mean": 0.7660039067268372, "rewards/qatch_small_update_with_fm/std": 0.3496793508529663, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896693229675293, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.09181556105613708, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 308.359375, "completions/mean_terminated_length": 308.359375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.08988738339394331, "epoch": 0.30796460176991153, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6961920769439389, "learning_rate": 1e-06, "loss": -0.0211, "num_tokens": 84357978.0, "reward": 0.8017382621765137, "reward_std": 0.08179794251918793, "rewards/qatch_small_update_with_fm/mean": 0.8017382621765137, "rewards/qatch_small_update_with_fm/std": 0.32338473200798035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890596270561218, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.09492075443267822, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 354.05078125, "completions/mean_terminated_length": 354.05078125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.08568741753697395, "epoch": 0.30973451327433627, "frac_reward_zero_std": 0.625, "grad_norm": 0.6020283387826877, "learning_rate": 1e-06, "loss": -0.0232, "num_tokens": 84880967.0, "reward": 0.776605486869812, "reward_std": 0.0926983505487442, "rewards/qatch_small_update_with_fm/mean": 0.776605486869812, "rewards/qatch_small_update_with_fm/std": 0.33691495656967163, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912744760513306, "sampling/importance_sampling_ratio/min": 0.0015349813038483262, "sampling/sampling_logp_difference/max": 6.479237079620361, "sampling/sampling_logp_difference/mean": 0.08575327694416046, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 350.390625, "completions/mean_terminated_length": 350.390625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.11490303184837103, "epoch": 0.31150442477876106, "frac_reward_zero_std": 0.625, "grad_norm": 0.6079318288634915, "learning_rate": 1e-06, "loss": -0.0335, "num_tokens": 85434347.0, "reward": 0.6241093873977661, "reward_std": 0.10749362409114838, "rewards/qatch_small_update_with_fm/mean": 0.6241093873977661, "rewards/qatch_small_update_with_fm/std": 0.36939021944999695, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918155670166016, "sampling/importance_sampling_ratio/min": 0.0059577059000730515, "sampling/sampling_logp_difference/max": 5.123069763183594, "sampling/sampling_logp_difference/mean": 0.10410696268081665, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 355.9609375, "completions/mean_terminated_length": 355.9609375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.11659033037722111, "epoch": 0.31327433628318585, "frac_reward_zero_std": 0.375, "grad_norm": 0.8230828183609293, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 86127505.0, "reward": 0.6114218235015869, "reward_std": 0.11610874533653259, "rewards/qatch_small_update_with_fm/mean": 0.6114218235015869, "rewards/qatch_small_update_with_fm/std": 0.3745664954185486, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9907248616218567, "sampling/importance_sampling_ratio/min": 0.008668403141200542, "sampling/sampling_logp_difference/max": 4.74807071685791, "sampling/sampling_logp_difference/mean": 0.10726846754550934, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 316.46875, "completions/mean_terminated_length": 316.46875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.10505182296037674, "epoch": 0.31504424778761064, "frac_reward_zero_std": 0.5, "grad_norm": 0.8719774383428669, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 86662585.0, "reward": 0.8082695603370667, "reward_std": 0.11230713874101639, "rewards/qatch_small_update_with_fm/mean": 0.8082695603370667, "rewards/qatch_small_update_with_fm/std": 0.33685362339019775, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990057647228241, "sampling/importance_sampling_ratio/min": 0.005370930302888155, "sampling/sampling_logp_difference/max": 5.226754188537598, "sampling/sampling_logp_difference/mean": 0.09755245596170425, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 379.71875, "completions/mean_terminated_length": 379.71875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.10993960406631231, "epoch": 0.3168141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 0.7456707224921653, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 87261601.0, "reward": 0.5580117106437683, "reward_std": 0.133323535323143, "rewards/qatch_small_update_with_fm/mean": 0.5580117106437683, "rewards/qatch_small_update_with_fm/std": 0.3953360915184021, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9920111298561096, "sampling/importance_sampling_ratio/min": 0.008692318573594093, "sampling/sampling_logp_difference/max": 4.7453155517578125, "sampling/sampling_logp_difference/mean": 0.10081157088279724, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 290.55859375, "completions/mean_terminated_length": 290.55859375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.0883013503625989, "epoch": 0.3185840707964602, "frac_reward_zero_std": 0.5625, "grad_norm": 0.762903167341592, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 87937168.0, "reward": 0.7398515939712524, "reward_std": 0.11943519115447998, "rewards/qatch_small_update_with_fm/mean": 0.7398515939712524, "rewards/qatch_small_update_with_fm/std": 0.34509414434432983, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9878397583961487, "sampling/importance_sampling_ratio/min": 0.005517044570297003, "sampling/sampling_logp_difference/max": 5.199913024902344, "sampling/sampling_logp_difference/mean": 0.09428344666957855, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 254.7421875, "completions/mean_terminated_length": 254.7421875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07716769725084305, "epoch": 0.32035398230088497, "frac_reward_zero_std": 0.6875, "grad_norm": 0.7686144020436141, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 88278398.0, "reward": 0.7153867483139038, "reward_std": 0.07736422121524811, "rewards/qatch_small_update_with_fm/mean": 0.7153867483139038, "rewards/qatch_small_update_with_fm/std": 0.3409058749675751, "sampling/importance_sampling_ratio/max": 1.9639513492584229, "sampling/importance_sampling_ratio/mean": 0.9903814792633057, "sampling/importance_sampling_ratio/min": 0.012658960185945034, "sampling/sampling_logp_difference/max": 4.36939001083374, "sampling/sampling_logp_difference/mean": 0.08092033863067627, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 320.12109375, "completions/mean_terminated_length": 320.12109375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.08729387260973454, "epoch": 0.32212389380530976, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6571255809518773, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 88914173.0, "reward": 0.7904921770095825, "reward_std": 0.0689883828163147, "rewards/qatch_small_update_with_fm/mean": 0.7904921770095825, "rewards/qatch_small_update_with_fm/std": 0.3290969133377075, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904789328575134, "sampling/importance_sampling_ratio/min": 0.005257649812847376, "sampling/sampling_logp_difference/max": 5.248071193695068, "sampling/sampling_logp_difference/mean": 0.08582089096307755, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 287.02734375, "completions/mean_terminated_length": 287.02734375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.08130458928644657, "epoch": 0.3238938053097345, "frac_reward_zero_std": 0.5625, "grad_norm": 0.676653673402814, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 89271364.0, "reward": 0.7791132926940918, "reward_std": 0.09577371925115585, "rewards/qatch_small_update_with_fm/mean": 0.7791132926940918, "rewards/qatch_small_update_with_fm/std": 0.34913262724876404, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9928947687149048, "sampling/importance_sampling_ratio/min": 0.011154402047395706, "sampling/sampling_logp_difference/max": 4.4959211349487305, "sampling/sampling_logp_difference/mean": 0.08006507158279419, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1594.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 421.66015625, "completions/mean_terminated_length": 421.66015625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.12242889311164618, "epoch": 0.3256637168141593, "frac_reward_zero_std": 0.25, "grad_norm": 0.6934224953981509, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 89887709.0, "reward": 0.7145429849624634, "reward_std": 0.18574464321136475, "rewards/qatch_small_update_with_fm/mean": 0.7145429849624634, "rewards/qatch_small_update_with_fm/std": 0.37374573945999146, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9917437434196472, "sampling/importance_sampling_ratio/min": 0.0025099029298871756, "sampling/sampling_logp_difference/max": 5.987511157989502, "sampling/sampling_logp_difference/mean": 0.10476350039243698, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 354.5703125, "completions/mean_terminated_length": 354.5703125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.11489906162023544, "epoch": 0.3274336283185841, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5499845071184065, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 90365359.0, "reward": 0.665109395980835, "reward_std": 0.0919097363948822, "rewards/qatch_small_update_with_fm/mean": 0.665109395980835, "rewards/qatch_small_update_with_fm/std": 0.402885764837265, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922910928726196, "sampling/importance_sampling_ratio/min": 0.012095731683075428, "sampling/sampling_logp_difference/max": 4.414902687072754, "sampling/sampling_logp_difference/mean": 0.10020913183689117, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 355.58203125, "completions/mean_terminated_length": 355.58203125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.09928950481116772, "epoch": 0.3292035398230089, "frac_reward_zero_std": 0.5, "grad_norm": 0.6232041508744968, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 90929524.0, "reward": 0.6242695450782776, "reward_std": 0.1135488897562027, "rewards/qatch_small_update_with_fm/mean": 0.6242695450782776, "rewards/qatch_small_update_with_fm/std": 0.3536941707134247, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9942116737365723, "sampling/importance_sampling_ratio/min": 0.00818202830851078, "sampling/sampling_logp_difference/max": 4.80581521987915, "sampling/sampling_logp_difference/mean": 0.08945226669311523, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 327.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.09778167959302664, "epoch": 0.3309734513274336, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6736731693665866, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 91402596.0, "reward": 0.6589140892028809, "reward_std": 0.07543458044528961, "rewards/qatch_small_update_with_fm/mean": 0.6589140892028809, "rewards/qatch_small_update_with_fm/std": 0.37273266911506653, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901614189147949, "sampling/importance_sampling_ratio/min": 0.008699359372258186, "sampling/sampling_logp_difference/max": 4.744505882263184, "sampling/sampling_logp_difference/mean": 0.09529487788677216, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 315.40625, "completions/mean_terminated_length": 315.40625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.09347141813486814, "epoch": 0.3327433628318584, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6242793766687501, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 92142220.0, "reward": 0.8259179592132568, "reward_std": 0.10921245813369751, "rewards/qatch_small_update_with_fm/mean": 0.8259179592132568, "rewards/qatch_small_update_with_fm/std": 0.3004448115825653, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913510680198669, "sampling/importance_sampling_ratio/min": 0.00676548620685935, "sampling/sampling_logp_difference/max": 4.9959211349487305, "sampling/sampling_logp_difference/mean": 0.09079431742429733, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 361.1484375, "completions/mean_terminated_length": 361.1484375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.0986938551068306, "epoch": 0.3345132743362832, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7453656730193217, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 92721058.0, "reward": 0.6616289019584656, "reward_std": 0.12159581482410431, "rewards/qatch_small_update_with_fm/mean": 0.6616289615631104, "rewards/qatch_small_update_with_fm/std": 0.395707368850708, "sampling/importance_sampling_ratio/max": 1.9929343461990356, "sampling/importance_sampling_ratio/mean": 0.9904531240463257, "sampling/importance_sampling_ratio/min": 0.014309660531580448, "sampling/sampling_logp_difference/max": 4.246820449829102, "sampling/sampling_logp_difference/mean": 0.09572584927082062, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 345.2421875, "completions/mean_terminated_length": 345.2421875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.11141256336122751, "epoch": 0.336283185840708, "frac_reward_zero_std": 0.5, "grad_norm": 0.7423535257929594, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 93106320.0, "reward": 0.7088867425918579, "reward_std": 0.0872776210308075, "rewards/qatch_small_update_with_fm/mean": 0.7088867425918579, "rewards/qatch_small_update_with_fm/std": 0.33193743228912354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9905740022659302, "sampling/importance_sampling_ratio/min": 0.014339085668325424, "sampling/sampling_logp_difference/max": 4.2447662353515625, "sampling/sampling_logp_difference/mean": 0.10014653205871582, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 369.8515625, "completions/mean_terminated_length": 369.8515625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.10245176684111357, "epoch": 0.3380530973451327, "frac_reward_zero_std": 0.5, "grad_norm": 0.7634695119879356, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 93606074.0, "reward": 0.6076210737228394, "reward_std": 0.1056838408112526, "rewards/qatch_small_update_with_fm/mean": 0.6076210737228394, "rewards/qatch_small_update_with_fm/std": 0.369706392288208, "sampling/importance_sampling_ratio/max": 1.9917839765548706, "sampling/importance_sampling_ratio/mean": 0.9923558235168457, "sampling/importance_sampling_ratio/min": 0.010142802260816097, "sampling/sampling_logp_difference/max": 4.590991020202637, "sampling/sampling_logp_difference/mean": 0.09241395443677902, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 341.4921875, "completions/mean_terminated_length": 341.4921875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.10239611659199, "epoch": 0.3398230088495575, "frac_reward_zero_std": 0.8125, "grad_norm": 0.49375392600444185, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 94055960.0, "reward": 0.7815742492675781, "reward_std": 0.029848366975784302, "rewards/qatch_small_update_with_fm/mean": 0.7815742492675781, "rewards/qatch_small_update_with_fm/std": 0.3153725266456604, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914690256118774, "sampling/importance_sampling_ratio/min": 0.008748124353587627, "sampling/sampling_logp_difference/max": 4.738915920257568, "sampling/sampling_logp_difference/mean": 0.09125423431396484, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 337.89453125, "completions/mean_terminated_length": 337.89453125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.10480010323226452, "epoch": 0.3415929203539823, "frac_reward_zero_std": 0.625, "grad_norm": 0.6433223060296351, "learning_rate": 1e-06, "loss": -0.0301, "num_tokens": 94600109.0, "reward": 0.6715664267539978, "reward_std": 0.09592887759208679, "rewards/qatch_small_update_with_fm/mean": 0.6715664267539978, "rewards/qatch_small_update_with_fm/std": 0.3534429371356964, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9928864240646362, "sampling/importance_sampling_ratio/min": 0.0027512013912200928, "sampling/sampling_logp_difference/max": 5.895717620849609, "sampling/sampling_logp_difference/mean": 0.09365631639957428, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 312.56640625, "completions/mean_terminated_length": 312.56640625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.0991691155359149, "epoch": 0.3433628318584071, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3817737532536764, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 95152462.0, "reward": 0.9008671641349792, "reward_std": 0.04832971841096878, "rewards/qatch_small_update_with_fm/mean": 0.9008671641349792, "rewards/qatch_small_update_with_fm/std": 0.2433462291955948, "sampling/importance_sampling_ratio/max": 1.8678128719329834, "sampling/importance_sampling_ratio/mean": 0.9926869869232178, "sampling/importance_sampling_ratio/min": 0.018439065665006638, "sampling/sampling_logp_difference/max": 3.993283748626709, "sampling/sampling_logp_difference/mean": 0.088732048869133, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 347.703125, "completions/mean_terminated_length": 347.703125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.10633239150047302, "epoch": 0.34513274336283184, "frac_reward_zero_std": 0.625, "grad_norm": 0.9229843567916687, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 95595426.0, "reward": 0.7062265872955322, "reward_std": 0.11547653377056122, "rewards/qatch_small_update_with_fm/mean": 0.7062265872955322, "rewards/qatch_small_update_with_fm/std": 0.3614107668399811, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913710355758667, "sampling/importance_sampling_ratio/min": 0.004114307928830385, "sampling/sampling_logp_difference/max": 5.493284702301025, "sampling/sampling_logp_difference/mean": 0.0934092104434967, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 325.4296875, "completions/mean_terminated_length": 325.4296875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.11451444402337074, "epoch": 0.34690265486725663, "frac_reward_zero_std": 0.75, "grad_norm": 0.45549074468231604, "learning_rate": 1e-06, "loss": -0.0234, "num_tokens": 96067856.0, "reward": 0.6925156116485596, "reward_std": 0.051358893513679504, "rewards/qatch_small_update_with_fm/mean": 0.6925156116485596, "rewards/qatch_small_update_with_fm/std": 0.3611532151699066, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914189577102661, "sampling/importance_sampling_ratio/min": 0.014309939928352833, "sampling/sampling_logp_difference/max": 4.246800899505615, "sampling/sampling_logp_difference/mean": 0.10335803031921387, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 424.375, "completions/mean_terminated_length": 424.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.12794983014464378, "epoch": 0.3486725663716814, "frac_reward_zero_std": 0.5, "grad_norm": 0.5880200744423812, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 96755152.0, "reward": 0.58935546875, "reward_std": 0.12563852965831757, "rewards/qatch_small_update_with_fm/mean": 0.58935546875, "rewards/qatch_small_update_with_fm/std": 0.3818955421447754, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.994789719581604, "sampling/importance_sampling_ratio/min": 0.0004323141765780747, "sampling/sampling_logp_difference/max": 7.7463579177856445, "sampling/sampling_logp_difference/mean": 0.1069219559431076, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 335.69921875, "completions/mean_terminated_length": 335.69921875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.09195560961961746, "epoch": 0.3504424778761062, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6362032916923682, "learning_rate": 1e-06, "loss": -0.0251, "num_tokens": 97429299.0, "reward": 0.7082499861717224, "reward_std": 0.05375829339027405, "rewards/qatch_small_update_with_fm/mean": 0.7082499861717224, "rewards/qatch_small_update_with_fm/std": 0.3975197374820709, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904249310493469, "sampling/importance_sampling_ratio/min": 0.0031889788806438446, "sampling/sampling_logp_difference/max": 5.748054504394531, "sampling/sampling_logp_difference/mean": 0.09103560447692871, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 365.4921875, "completions/mean_terminated_length": 365.4921875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.11913414020091295, "epoch": 0.35221238938053095, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6131941301175466, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 98008113.0, "reward": 0.5971132516860962, "reward_std": 0.10834619402885437, "rewards/qatch_small_update_with_fm/mean": 0.5971132516860962, "rewards/qatch_small_update_with_fm/std": 0.3694043457508087, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9916287660598755, "sampling/importance_sampling_ratio/min": 0.01233906950801611, "sampling/sampling_logp_difference/max": 4.394984722137451, "sampling/sampling_logp_difference/mean": 0.10478620231151581, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 357.05859375, "completions/mean_terminated_length": 357.05859375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.10899745859205723, "epoch": 0.35398230088495575, "frac_reward_zero_std": 0.375, "grad_norm": 0.7218593602759523, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 98531936.0, "reward": 0.6973046660423279, "reward_std": 0.11880404502153397, "rewards/qatch_small_update_with_fm/mean": 0.6973047256469727, "rewards/qatch_small_update_with_fm/std": 0.4068034291267395, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918450117111206, "sampling/importance_sampling_ratio/min": 0.0111839659512043, "sampling/sampling_logp_difference/max": 4.493274211883545, "sampling/sampling_logp_difference/mean": 0.09498186409473419, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 360.96484375, "completions/mean_terminated_length": 360.96484375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.10720335505902767, "epoch": 0.35575221238938054, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6080464868328257, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 99023719.0, "reward": 0.6945273876190186, "reward_std": 0.0998416393995285, "rewards/qatch_small_update_with_fm/mean": 0.6945273876190186, "rewards/qatch_small_update_with_fm/std": 0.36767905950546265, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906712770462036, "sampling/importance_sampling_ratio/min": 0.003982192371040583, "sampling/sampling_logp_difference/max": 5.525922775268555, "sampling/sampling_logp_difference/mean": 0.10087412595748901, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 340.1015625, "completions/mean_terminated_length": 340.1015625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.12118315137922764, "epoch": 0.35752212389380533, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7410996586697325, "learning_rate": 1e-06, "loss": -0.0125, "num_tokens": 99510897.0, "reward": 0.698015570640564, "reward_std": 0.1116129457950592, "rewards/qatch_small_update_with_fm/mean": 0.698015570640564, "rewards/qatch_small_update_with_fm/std": 0.3681219816207886, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892449975013733, "sampling/importance_sampling_ratio/min": 0.011232408694922924, "sampling/sampling_logp_difference/max": 4.488952159881592, "sampling/sampling_logp_difference/mean": 0.10859468579292297, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 348.39453125, "completions/mean_terminated_length": 348.39453125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.1027478277683258, "epoch": 0.35929203539823007, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7150909605607257, "learning_rate": 1e-06, "loss": 0.0356, "num_tokens": 99994646.0, "reward": 0.7400000095367432, "reward_std": 0.10706169903278351, "rewards/qatch_small_update_with_fm/mean": 0.7400000095367432, "rewards/qatch_small_update_with_fm/std": 0.35928434133529663, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9894899129867554, "sampling/importance_sampling_ratio/min": 0.018361039459705353, "sampling/sampling_logp_difference/max": 3.9975242614746094, "sampling/sampling_logp_difference/mean": 0.09557086229324341, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 372.43359375, "completions/mean_terminated_length": 372.43359375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.1205616993829608, "epoch": 0.36106194690265486, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8901426137212775, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 100514229.0, "reward": 0.7741680145263672, "reward_std": 0.13178735971450806, "rewards/qatch_small_update_with_fm/mean": 0.7741679549217224, "rewards/qatch_small_update_with_fm/std": 0.33109039068222046, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9924964904785156, "sampling/importance_sampling_ratio/min": 0.009109980426728725, "sampling/sampling_logp_difference/max": 4.698384761810303, "sampling/sampling_logp_difference/mean": 0.10391943156719208, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 355.00390625, "completions/mean_terminated_length": 355.00390625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.10279853083193302, "epoch": 0.36283185840707965, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5888127900166, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 100845366.0, "reward": 0.7348710894584656, "reward_std": 0.13863445818424225, "rewards/qatch_small_update_with_fm/mean": 0.7348710894584656, "rewards/qatch_small_update_with_fm/std": 0.3723523020744324, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934222102165222, "sampling/importance_sampling_ratio/min": 0.011168073862791061, "sampling/sampling_logp_difference/max": 4.494696140289307, "sampling/sampling_logp_difference/mean": 0.09341467916965485, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 390.84765625, "completions/mean_terminated_length": 390.84765625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.10497349500656128, "epoch": 0.36460176991150445, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5545867533478495, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 101277951.0, "reward": 0.7697421908378601, "reward_std": 0.09470079839229584, "rewards/qatch_small_update_with_fm/mean": 0.7697421908378601, "rewards/qatch_small_update_with_fm/std": 0.3705956041812897, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9931164383888245, "sampling/importance_sampling_ratio/min": 0.011154407635331154, "sampling/sampling_logp_difference/max": 4.495920658111572, "sampling/sampling_logp_difference/mean": 0.09605337679386139, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 360.8046875, "completions/mean_terminated_length": 360.8046875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.10272657312452793, "epoch": 0.3663716814159292, "frac_reward_zero_std": 0.8125, "grad_norm": 0.403005864270568, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 101892333.0, "reward": 0.7502422332763672, "reward_std": 0.02443435788154602, "rewards/qatch_small_update_with_fm/mean": 0.7502422332763672, "rewards/qatch_small_update_with_fm/std": 0.31904786825180054, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899274110794067, "sampling/importance_sampling_ratio/min": 0.005282875616103411, "sampling/sampling_logp_difference/max": 5.243284702301025, "sampling/sampling_logp_difference/mean": 0.09682901948690414, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 396.0078125, "completions/mean_terminated_length": 381.498046875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.11194436810910702, "epoch": 0.368141592920354, "frac_reward_zero_std": 0.625, "grad_norm": 0.4911640066793156, "learning_rate": 1e-06, "loss": -0.0317, "num_tokens": 102291583.0, "reward": 0.6277070045471191, "reward_std": 0.09390310943126678, "rewards/qatch_small_update_with_fm/mean": 0.6277070045471191, "rewards/qatch_small_update_with_fm/std": 0.38415640592575073, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9944193363189697, "sampling/importance_sampling_ratio/min": 0.01850791648030281, "sampling/sampling_logp_difference/max": 3.9895567893981934, "sampling/sampling_logp_difference/mean": 0.09457258880138397, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 384.65625, "completions/mean_terminated_length": 384.65625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.11076440289616585, "epoch": 0.36991150442477877, "frac_reward_zero_std": 0.375, "grad_norm": 0.8724792452949058, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 102807735.0, "reward": 0.7210546731948853, "reward_std": 0.1516134887933731, "rewards/qatch_small_update_with_fm/mean": 0.7210546731948853, "rewards/qatch_small_update_with_fm/std": 0.3743986189365387, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922000765800476, "sampling/importance_sampling_ratio/min": 0.011169740930199623, "sampling/sampling_logp_difference/max": 4.494546890258789, "sampling/sampling_logp_difference/mean": 0.09591890871524811, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 371.18359375, "completions/mean_terminated_length": 371.18359375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.10253117606043816, "epoch": 0.37168141592920356, "frac_reward_zero_std": 0.625, "grad_norm": 0.542561326203597, "learning_rate": 1e-06, "loss": -0.0165, "num_tokens": 103127958.0, "reward": 0.727777361869812, "reward_std": 0.08403307944536209, "rewards/qatch_small_update_with_fm/mean": 0.727777361869812, "rewards/qatch_small_update_with_fm/std": 0.3928605914115906, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9942893981933594, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.09067496657371521, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 427.49609375, "completions/mean_terminated_length": 413.1098327636719, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.11693217884749174, "epoch": 0.3734513274336283, "frac_reward_zero_std": 0.5, "grad_norm": 0.48834549093132984, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 103627781.0, "reward": 0.7900313138961792, "reward_std": 0.08779511600732803, "rewards/qatch_small_update_with_fm/mean": 0.7900311946868896, "rewards/qatch_small_update_with_fm/std": 0.360713392496109, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922540187835693, "sampling/importance_sampling_ratio/min": 0.008556932210922241, "sampling/sampling_logp_difference/max": 4.761013507843018, "sampling/sampling_logp_difference/mean": 0.10390772670507431, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 412.34375, "completions/mean_terminated_length": 412.34375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.1140274703502655, "epoch": 0.3752212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 0.6180322534177476, "learning_rate": 1e-06, "loss": -0.0252, "num_tokens": 104254797.0, "reward": 0.6544452905654907, "reward_std": 0.12845413386821747, "rewards/qatch_small_update_with_fm/mean": 0.6544452905654907, "rewards/qatch_small_update_with_fm/std": 0.40956881642341614, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898783564567566, "sampling/importance_sampling_ratio/min": 4.610423047779477e-07, "sampling/sampling_logp_difference/max": 14.589776039123535, "sampling/sampling_logp_difference/mean": 0.10311967879533768, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 363.5234375, "completions/mean_terminated_length": 363.5234375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.09243700839579105, "epoch": 0.3769911504424779, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7249564433829758, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 104963187.0, "reward": 0.6249492168426514, "reward_std": 0.1054215282201767, "rewards/qatch_small_update_with_fm/mean": 0.6249492168426514, "rewards/qatch_small_update_with_fm/std": 0.32009294629096985, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896044731140137, "sampling/importance_sampling_ratio/min": 0.0032624199520796537, "sampling/sampling_logp_difference/max": 5.72528600692749, "sampling/sampling_logp_difference/mean": 0.09415632486343384, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 418.625, "completions/mean_terminated_length": 418.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.10987964924424887, "epoch": 0.3787610619469027, "frac_reward_zero_std": 0.625, "grad_norm": 0.5524975720746137, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 105315203.0, "reward": 0.625613272190094, "reward_std": 0.07790683209896088, "rewards/qatch_small_update_with_fm/mean": 0.625613272190094, "rewards/qatch_small_update_with_fm/std": 0.4087977409362793, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906055331230164, "sampling/importance_sampling_ratio/min": 0.006783362478017807, "sampling/sampling_logp_difference/max": 4.993282318115234, "sampling/sampling_logp_difference/mean": 0.0996527373790741, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 383.6875, "completions/mean_terminated_length": 383.6875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.10452158376574516, "epoch": 0.3805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 0.38716645681649037, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 105755875.0, "reward": 0.8351406455039978, "reward_std": 0.06556863337755203, "rewards/qatch_small_update_with_fm/mean": 0.8351406455039978, "rewards/qatch_small_update_with_fm/std": 0.2928430736064911, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922690391540527, "sampling/importance_sampling_ratio/min": 0.011539791710674763, "sampling/sampling_logp_difference/max": 4.461954116821289, "sampling/sampling_logp_difference/mean": 0.09711340069770813, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 415.6484375, "completions/mean_terminated_length": 415.6484375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.12065678741782904, "epoch": 0.3823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 0.6352222097447021, "learning_rate": 1e-06, "loss": -0.0247, "num_tokens": 106177929.0, "reward": 0.7687656283378601, "reward_std": 0.13633453845977783, "rewards/qatch_small_update_with_fm/mean": 0.7687656283378601, "rewards/qatch_small_update_with_fm/std": 0.33637431263923645, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927363395690918, "sampling/importance_sampling_ratio/min": 0.0111549012362957, "sampling/sampling_logp_difference/max": 4.495876312255859, "sampling/sampling_logp_difference/mean": 0.10593421012163162, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 403.5, "completions/mean_terminated_length": 403.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.10500549431890249, "epoch": 0.384070796460177, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6852083526445841, "learning_rate": 1e-06, "loss": 0.015, "num_tokens": 106730537.0, "reward": 0.8187460899353027, "reward_std": 0.1587405502796173, "rewards/qatch_small_update_with_fm/mean": 0.8187460899353027, "rewards/qatch_small_update_with_fm/std": 0.3261018395423889, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9908450841903687, "sampling/importance_sampling_ratio/min": 0.0032403613440692425, "sampling/sampling_logp_difference/max": 5.732070446014404, "sampling/sampling_logp_difference/mean": 0.09662536531686783, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 414.6875, "completions/mean_terminated_length": 414.6875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.10810738615691662, "epoch": 0.3858407079646018, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6813898746761639, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 107296617.0, "reward": 0.7636758089065552, "reward_std": 0.11631257832050323, "rewards/qatch_small_update_with_fm/mean": 0.7636758089065552, "rewards/qatch_small_update_with_fm/std": 0.3674674928188324, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918367862701416, "sampling/importance_sampling_ratio/min": 7.124587864382192e-05, "sampling/sampling_logp_difference/max": 9.549373626708984, "sampling/sampling_logp_difference/mean": 0.09496808052062988, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 340.33203125, "completions/mean_terminated_length": 340.33203125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.10157356224954128, "epoch": 0.38761061946902653, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5941120949615057, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 107702590.0, "reward": 0.8051445484161377, "reward_std": 0.09416786581277847, "rewards/qatch_small_update_with_fm/mean": 0.8051445484161377, "rewards/qatch_small_update_with_fm/std": 0.32420217990875244, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9893560409545898, "sampling/importance_sampling_ratio/min": 0.007196484599262476, "sampling/sampling_logp_difference/max": 4.934162616729736, "sampling/sampling_logp_difference/mean": 0.09752357006072998, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 431.86328125, "completions/mean_terminated_length": 431.86328125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.1107902517542243, "epoch": 0.3893805309734513, "frac_reward_zero_std": 0.625, "grad_norm": 0.5457807080706949, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 108195003.0, "reward": 0.6913124918937683, "reward_std": 0.10084883123636246, "rewards/qatch_small_update_with_fm/mean": 0.6913124918937683, "rewards/qatch_small_update_with_fm/std": 0.3513505458831787, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9931973218917847, "sampling/importance_sampling_ratio/min": 0.011205301620066166, "sampling/sampling_logp_difference/max": 4.491368293762207, "sampling/sampling_logp_difference/mean": 0.09411786496639252, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 371.265625, "completions/mean_terminated_length": 371.265625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.1083214282989502, "epoch": 0.3911504424778761, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7784986805139263, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 108672607.0, "reward": 0.6840741634368896, "reward_std": 0.14675581455230713, "rewards/qatch_small_update_with_fm/mean": 0.6840741634368896, "rewards/qatch_small_update_with_fm/std": 0.4099890887737274, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.994238018989563, "sampling/importance_sampling_ratio/min": 0.008668428286910057, "sampling/sampling_logp_difference/max": 4.748067855834961, "sampling/sampling_logp_difference/mean": 0.0962483137845993, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.10230957623571157, "epoch": 0.3929203539823009, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5490231460475042, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 109189583.0, "reward": 0.7061171531677246, "reward_std": 0.10651902854442596, "rewards/qatch_small_update_with_fm/mean": 0.7061171531677246, "rewards/qatch_small_update_with_fm/std": 0.3541206419467926, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989014744758606, "sampling/importance_sampling_ratio/min": 0.008726480416953564, "sampling/sampling_logp_difference/max": 4.741393089294434, "sampling/sampling_logp_difference/mean": 0.09948454052209854, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 389.9453125, "completions/mean_terminated_length": 389.9453125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.10472445003688335, "epoch": 0.39469026548672564, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6190601116973689, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 109798977.0, "reward": 0.6156679391860962, "reward_std": 0.09265945851802826, "rewards/qatch_small_update_with_fm/mean": 0.6156679391860962, "rewards/qatch_small_update_with_fm/std": 0.41817304491996765, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946981072425842, "sampling/importance_sampling_ratio/min": 0.008702251128852367, "sampling/sampling_logp_difference/max": 4.744173526763916, "sampling/sampling_logp_difference/mean": 0.08968104422092438, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1940.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 448.02734375, "completions/mean_terminated_length": 448.02734375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.10507409740239382, "epoch": 0.39646017699115044, "frac_reward_zero_std": 0.6875, "grad_norm": 0.49039506090276497, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 110339976.0, "reward": 0.6631484031677246, "reward_std": 0.06115978956222534, "rewards/qatch_small_update_with_fm/mean": 0.6631484031677246, "rewards/qatch_small_update_with_fm/std": 0.3594338297843933, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901099801063538, "sampling/importance_sampling_ratio/min": 0.0068128579296171665, "sampling/sampling_logp_difference/max": 4.988943576812744, "sampling/sampling_logp_difference/mean": 0.09931102395057678, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 433.8046875, "completions/mean_terminated_length": 433.8046875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.1009874390438199, "epoch": 0.39823008849557523, "frac_reward_zero_std": 0.5625, "grad_norm": 0.44278668660350046, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 110778358.0, "reward": 0.6486484408378601, "reward_std": 0.09495560824871063, "rewards/qatch_small_update_with_fm/mean": 0.6486484408378601, "rewards/qatch_small_update_with_fm/std": 0.432005912065506, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9933207035064697, "sampling/importance_sampling_ratio/min": 0.014339202083647251, "sampling/sampling_logp_difference/max": 4.244758129119873, "sampling/sampling_logp_difference/mean": 0.08835351467132568, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 341.43359375, "completions/mean_terminated_length": 341.43359375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.10271453391760588, "epoch": 0.4, "frac_reward_zero_std": 0.625, "grad_norm": 0.5733446606675858, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 111123429.0, "reward": 0.5843750238418579, "reward_std": 0.07411018013954163, "rewards/qatch_small_update_with_fm/mean": 0.5843749642372131, "rewards/qatch_small_update_with_fm/std": 0.31206896901130676, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9935798645019531, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.09060897678136826, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 447.81640625, "completions/mean_terminated_length": 447.81640625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.12403890583664179, "epoch": 0.40176991150442476, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5250232034467316, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 111573878.0, "reward": 0.7477734088897705, "reward_std": 0.1547674536705017, "rewards/qatch_small_update_with_fm/mean": 0.7477734088897705, "rewards/qatch_small_update_with_fm/std": 0.3546452224254608, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9935329556465149, "sampling/importance_sampling_ratio/min": 0.007649845909327269, "sampling/sampling_logp_difference/max": 4.873069763183594, "sampling/sampling_logp_difference/mean": 0.10479405522346497, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 389.6484375, "completions/mean_terminated_length": 389.6484375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.10224001575261354, "epoch": 0.40353982300884955, "frac_reward_zero_std": 0.625, "grad_norm": 0.5131191563549943, "learning_rate": 1e-06, "loss": -0.0165, "num_tokens": 112305164.0, "reward": 0.8435273766517639, "reward_std": 0.09165941178798676, "rewards/qatch_small_update_with_fm/mean": 0.8435273170471191, "rewards/qatch_small_update_with_fm/std": 0.2752784788608551, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918692708015442, "sampling/importance_sampling_ratio/min": 0.005490469746291637, "sampling/sampling_logp_difference/max": 5.204741477966309, "sampling/sampling_logp_difference/mean": 0.09232448041439056, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 415.18359375, "completions/mean_terminated_length": 415.18359375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.12820047978311777, "epoch": 0.40530973451327434, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5694927094518497, "learning_rate": 1e-06, "loss": -0.0349, "num_tokens": 112805947.0, "reward": 0.7082968354225159, "reward_std": 0.0801946148276329, "rewards/qatch_small_update_with_fm/mean": 0.7082968950271606, "rewards/qatch_small_update_with_fm/std": 0.40650320053100586, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9939711093902588, "sampling/importance_sampling_ratio/min": 3.631289473560173e-06, "sampling/sampling_logp_difference/max": 12.525922775268555, "sampling/sampling_logp_difference/mean": 0.10653382539749146, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 398.5859375, "completions/mean_terminated_length": 398.5859375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.1140748867765069, "epoch": 0.40707964601769914, "frac_reward_zero_std": 0.5, "grad_norm": 0.6880249325585247, "learning_rate": 1e-06, "loss": -0.0121, "num_tokens": 113518561.0, "reward": 0.6640703082084656, "reward_std": 0.15750177204608917, "rewards/qatch_small_update_with_fm/mean": 0.6640703082084656, "rewards/qatch_small_update_with_fm/std": 0.40480488538742065, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946224093437195, "sampling/importance_sampling_ratio/min": 0.002527002478018403, "sampling/sampling_logp_difference/max": 5.980721473693848, "sampling/sampling_logp_difference/mean": 0.09498130530118942, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 405.1328125, "completions/mean_terminated_length": 405.1328125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.12539631128311157, "epoch": 0.4088495575221239, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5651489326765903, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 114094515.0, "reward": 0.7878320217132568, "reward_std": 0.12009721994400024, "rewards/qatch_small_update_with_fm/mean": 0.7878320217132568, "rewards/qatch_small_update_with_fm/std": 0.3283204436302185, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9924635887145996, "sampling/importance_sampling_ratio/min": 0.01837773434817791, "sampling/sampling_logp_difference/max": 3.996615409851074, "sampling/sampling_logp_difference/mean": 0.10449755936861038, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 466.86328125, "completions/mean_terminated_length": 452.63140869140625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.12363697215914726, "epoch": 0.41061946902654867, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4751429731610685, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 114570384.0, "reward": 0.6649336218833923, "reward_std": 0.09007216989994049, "rewards/qatch_small_update_with_fm/mean": 0.6649336218833923, "rewards/qatch_small_update_with_fm/std": 0.3947446942329407, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9944989681243896, "sampling/importance_sampling_ratio/min": 0.016750473529100418, "sampling/sampling_logp_difference/max": 4.089328765869141, "sampling/sampling_logp_difference/mean": 0.10331812500953674, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 369.34765625, "completions/mean_terminated_length": 369.34765625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.10589554626494646, "epoch": 0.41238938053097346, "frac_reward_zero_std": 0.375, "grad_norm": 0.897996865066677, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 115134841.0, "reward": 0.6671562194824219, "reward_std": 0.11132475733757019, "rewards/qatch_small_update_with_fm/mean": 0.6671562194824219, "rewards/qatch_small_update_with_fm/std": 0.3616568148136139, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922637343406677, "sampling/importance_sampling_ratio/min": 0.004132173955440521, "sampling/sampling_logp_difference/max": 5.488951683044434, "sampling/sampling_logp_difference/mean": 0.09396759420633316, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 404.7734375, "completions/mean_terminated_length": 404.7734375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.12250016257166862, "epoch": 0.41415929203539825, "frac_reward_zero_std": 0.625, "grad_norm": 0.495571008102404, "learning_rate": 1e-06, "loss": -0.0284, "num_tokens": 115632399.0, "reward": 0.634265661239624, "reward_std": 0.06549917906522751, "rewards/qatch_small_update_with_fm/mean": 0.634265661239624, "rewards/qatch_small_update_with_fm/std": 0.35203248262405396, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9956581592559814, "sampling/importance_sampling_ratio/min": 0.0105217844247818, "sampling/sampling_logp_difference/max": 4.554307460784912, "sampling/sampling_logp_difference/mean": 0.09811676293611526, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 399.984375, "completions/mean_terminated_length": 399.984375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.1299695922061801, "epoch": 0.415929203539823, "frac_reward_zero_std": 0.625, "grad_norm": 0.4991884830918963, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 116033147.0, "reward": 0.8004375100135803, "reward_std": 0.11684393882751465, "rewards/qatch_small_update_with_fm/mean": 0.8004375100135803, "rewards/qatch_small_update_with_fm/std": 0.35567614436149597, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9916921854019165, "sampling/importance_sampling_ratio/min": 0.0009987365920096636, "sampling/sampling_logp_difference/max": 6.909019470214844, "sampling/sampling_logp_difference/mean": 0.1101202666759491, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 416.85546875, "completions/mean_terminated_length": 416.85546875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.10699698887765408, "epoch": 0.4176991150442478, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4578399404909556, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 116528534.0, "reward": 0.6235507726669312, "reward_std": 0.037941787391901016, "rewards/qatch_small_update_with_fm/mean": 0.6235507726669312, "rewards/qatch_small_update_with_fm/std": 0.3784492611885071, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.993585467338562, "sampling/importance_sampling_ratio/min": 0.008174315094947815, "sampling/sampling_logp_difference/max": 4.806758403778076, "sampling/sampling_logp_difference/mean": 0.09373586624860764, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 394.4140625, "completions/mean_terminated_length": 394.4140625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.12796230241656303, "epoch": 0.4194690265486726, "frac_reward_zero_std": 0.5, "grad_norm": 0.5872021743640824, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 117202528.0, "reward": 0.7495508193969727, "reward_std": 0.09856027364730835, "rewards/qatch_small_update_with_fm/mean": 0.7495508193969727, "rewards/qatch_small_update_with_fm/std": 0.34960198402404785, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9907758831977844, "sampling/importance_sampling_ratio/min": 0.014935529790818691, "sampling/sampling_logp_difference/max": 4.204012393951416, "sampling/sampling_logp_difference/mean": 0.10940699279308319, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 470.66015625, "completions/mean_terminated_length": 470.66015625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.13090031780302525, "epoch": 0.42123893805309737, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5546880762353853, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 117707193.0, "reward": 0.7711952924728394, "reward_std": 0.11917905509471893, "rewards/qatch_small_update_with_fm/mean": 0.7711952924728394, "rewards/qatch_small_update_with_fm/std": 0.33702993392944336, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.996447741985321, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.10174047946929932, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 443.375, "completions/mean_terminated_length": 443.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.11677948199212551, "epoch": 0.4230088495575221, "frac_reward_zero_std": 0.5, "grad_norm": 0.5266583505158154, "learning_rate": 1e-06, "loss": -0.0112, "num_tokens": 118125289.0, "reward": 0.6743515729904175, "reward_std": 0.06367573887109756, "rewards/qatch_small_update_with_fm/mean": 0.6743515729904175, "rewards/qatch_small_update_with_fm/std": 0.3398094177246094, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922307729721069, "sampling/importance_sampling_ratio/min": 0.0005226434441283345, "sampling/sampling_logp_difference/max": 7.556611061096191, "sampling/sampling_logp_difference/mean": 0.09814269840717316, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 326.51953125, "completions/mean_terminated_length": 326.51953125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.10765422880649567, "epoch": 0.4247787610619469, "frac_reward_zero_std": 0.8125, "grad_norm": 0.35797878094122443, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 118683294.0, "reward": 0.7800507545471191, "reward_std": 0.04426509886980057, "rewards/qatch_small_update_with_fm/mean": 0.7800507545471191, "rewards/qatch_small_update_with_fm/std": 0.29425278306007385, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9905130863189697, "sampling/importance_sampling_ratio/min": 0.007289279717952013, "sampling/sampling_logp_difference/max": 4.921350479125977, "sampling/sampling_logp_difference/mean": 0.09998884797096252, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 420.17578125, "completions/mean_terminated_length": 420.17578125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.11644061841070652, "epoch": 0.4265486725663717, "frac_reward_zero_std": 0.5, "grad_norm": 0.5613929910697356, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 119158715.0, "reward": 0.6946874856948853, "reward_std": 0.1308887004852295, "rewards/qatch_small_update_with_fm/mean": 0.6946874856948853, "rewards/qatch_small_update_with_fm/std": 0.35073021054267883, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946391582489014, "sampling/importance_sampling_ratio/min": 0.004145145416259766, "sampling/sampling_logp_difference/max": 5.4858174324035645, "sampling/sampling_logp_difference/mean": 0.09869328886270523, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 369.453125, "completions/mean_terminated_length": 369.453125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.12437109090387821, "epoch": 0.4283185840707965, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5206241947060207, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 119700559.0, "reward": 0.7600976228713989, "reward_std": 0.06152753531932831, "rewards/qatch_small_update_with_fm/mean": 0.7600976228713989, "rewards/qatch_small_update_with_fm/std": 0.33996260166168213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9919652342796326, "sampling/importance_sampling_ratio/min": 0.0002818721404764801, "sampling/sampling_logp_difference/max": 8.174057006835938, "sampling/sampling_logp_difference/mean": 0.10920348763465881, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 459.359375, "completions/mean_terminated_length": 445.0980529785156, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.14293739199638367, "epoch": 0.4300884955752212, "frac_reward_zero_std": 0.5, "grad_norm": 0.6945266374462524, "learning_rate": 1e-06, "loss": -0.0358, "num_tokens": 120191611.0, "reward": 0.7203359603881836, "reward_std": 0.1005784347653389, "rewards/qatch_small_update_with_fm/mean": 0.7203359603881836, "rewards/qatch_small_update_with_fm/std": 0.37025392055511475, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946791529655457, "sampling/importance_sampling_ratio/min": 0.006813199259340763, "sampling/sampling_logp_difference/max": 4.988893508911133, "sampling/sampling_logp_difference/mean": 0.11522375792264938, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 451.54296875, "completions/mean_terminated_length": 451.54296875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.13328593783080578, "epoch": 0.431858407079646, "frac_reward_zero_std": 0.375, "grad_norm": 0.6196257477102209, "learning_rate": 1e-06, "loss": 0.0426, "num_tokens": 120759766.0, "reward": 0.7439101934432983, "reward_std": 0.1378568708896637, "rewards/qatch_small_update_with_fm/mean": 0.7439101934432983, "rewards/qatch_small_update_with_fm/std": 0.3241375982761383, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925317764282227, "sampling/importance_sampling_ratio/min": 0.02285437099635601, "sampling/sampling_logp_difference/max": 3.7786128520965576, "sampling/sampling_logp_difference/mean": 0.11157552152872086, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 468.84375, "completions/mean_terminated_length": 468.84375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.14039788208901882, "epoch": 0.4336283185840708, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5102000070988705, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 121329870.0, "reward": 0.771414041519165, "reward_std": 0.10335954278707504, "rewards/qatch_small_update_with_fm/mean": 0.771414041519165, "rewards/qatch_small_update_with_fm/std": 0.33871808648109436, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99262535572052, "sampling/importance_sampling_ratio/min": 0.014456496573984623, "sampling/sampling_logp_difference/max": 4.236611366271973, "sampling/sampling_logp_difference/mean": 0.11720272153615952, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 415.3359375, "completions/mean_terminated_length": 415.3359375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.13042385503649712, "epoch": 0.4353982300884956, "frac_reward_zero_std": 0.8125, "grad_norm": 0.31843325314582177, "learning_rate": 1e-06, "loss": -0.0266, "num_tokens": 121774628.0, "reward": 0.8900351524353027, "reward_std": 0.07186062633991241, "rewards/qatch_small_update_with_fm/mean": 0.8900351524353027, "rewards/qatch_small_update_with_fm/std": 0.2635286748409271, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9919847249984741, "sampling/importance_sampling_ratio/min": 0.013568556867539883, "sampling/sampling_logp_difference/max": 4.300000190734863, "sampling/sampling_logp_difference/mean": 0.1108667254447937, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3521.0, "completions/max_terminated_length": 3521.0, "completions/mean_length": 509.6640625, "completions/mean_terminated_length": 509.6640625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.138479420915246, "epoch": 0.43716814159292033, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4156690061559561, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 122309406.0, "reward": 0.6351015567779541, "reward_std": 0.048537224531173706, "rewards/qatch_small_update_with_fm/mean": 0.6351015567779541, "rewards/qatch_small_update_with_fm/std": 0.33746272325515747, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9944608211517334, "sampling/importance_sampling_ratio/min": 0.006842448841780424, "sampling/sampling_logp_difference/max": 4.984609603881836, "sampling/sampling_logp_difference/mean": 0.10972237586975098, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 476.61328125, "completions/mean_terminated_length": 476.61328125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.12279136944562197, "epoch": 0.4389380530973451, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4630336672015149, "learning_rate": 1e-06, "loss": -0.0139, "num_tokens": 122840091.0, "reward": 0.6818008422851562, "reward_std": 0.09493610262870789, "rewards/qatch_small_update_with_fm/mean": 0.6818008422851562, "rewards/qatch_small_update_with_fm/std": 0.39343327283859253, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934309720993042, "sampling/importance_sampling_ratio/min": 0.0012457328848540783, "sampling/sampling_logp_difference/max": 6.688031196594238, "sampling/sampling_logp_difference/mean": 0.10279334336519241, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 548.1796875, "completions/mean_terminated_length": 548.1796875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.12996913958340883, "epoch": 0.4407079646017699, "frac_reward_zero_std": 0.3125, "grad_norm": 0.5700199575774423, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 123465225.0, "reward": 0.488394558429718, "reward_std": 0.1281006932258606, "rewards/qatch_small_update_with_fm/mean": 0.488394558429718, "rewards/qatch_small_update_with_fm/std": 0.35512784123420715, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9943488836288452, "sampling/importance_sampling_ratio/min": 0.011242197826504707, "sampling/sampling_logp_difference/max": 4.488080978393555, "sampling/sampling_logp_difference/mean": 0.10481943935155869, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 384.2421875, "completions/mean_terminated_length": 384.2421875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.11992082837969065, "epoch": 0.4424778761061947, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5498360675010315, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 123982887.0, "reward": 0.8174804449081421, "reward_std": 0.06691622734069824, "rewards/qatch_small_update_with_fm/mean": 0.8174804449081421, "rewards/qatch_small_update_with_fm/std": 0.29479968547821045, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904511570930481, "sampling/importance_sampling_ratio/min": 0.011172862723469734, "sampling/sampling_logp_difference/max": 4.494267463684082, "sampling/sampling_logp_difference/mean": 0.10939574986696243, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 448.39453125, "completions/mean_terminated_length": 448.39453125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.13190590497106314, "epoch": 0.44424778761061945, "frac_reward_zero_std": 0.5, "grad_norm": 0.4586587220281425, "learning_rate": 1e-06, "loss": -0.0197, "num_tokens": 124570860.0, "reward": 0.7742382884025574, "reward_std": 0.08533629775047302, "rewards/qatch_small_update_with_fm/mean": 0.7742382884025574, "rewards/qatch_small_update_with_fm/std": 0.3210985064506531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922869205474854, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.11073958873748779, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 488.015625, "completions/mean_terminated_length": 488.015625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.12504940200597048, "epoch": 0.44601769911504424, "frac_reward_zero_std": 0.4375, "grad_norm": 0.582915685918618, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 125148784.0, "reward": 0.7658984661102295, "reward_std": 0.11392462998628616, "rewards/qatch_small_update_with_fm/mean": 0.7658984661102295, "rewards/qatch_small_update_with_fm/std": 0.3386352062225342, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.992237389087677, "sampling/importance_sampling_ratio/min": 6.2567501117882784e-06, "sampling/sampling_logp_difference/max": 11.981849670410156, "sampling/sampling_logp_difference/mean": 0.10734772682189941, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3040.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 562.68359375, "completions/mean_terminated_length": 562.68359375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1231586579233408, "epoch": 0.44778761061946903, "frac_reward_zero_std": 0.625, "grad_norm": 0.4012765932095851, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 125544399.0, "reward": 0.6832812428474426, "reward_std": 0.07998022437095642, "rewards/qatch_small_update_with_fm/mean": 0.6832812428474426, "rewards/qatch_small_update_with_fm/std": 0.38533180952072144, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9948018789291382, "sampling/importance_sampling_ratio/min": 0.003951632417738438, "sampling/sampling_logp_difference/max": 5.533626556396484, "sampling/sampling_logp_difference/mean": 0.10125906020402908, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1840.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 462.59375, "completions/mean_terminated_length": 462.59375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.12078492902219296, "epoch": 0.4495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.4699545761158711, "learning_rate": 1e-06, "loss": -0.0169, "num_tokens": 126110535.0, "reward": 0.7769765853881836, "reward_std": 0.04606456309556961, "rewards/qatch_small_update_with_fm/mean": 0.7769765853881836, "rewards/qatch_small_update_with_fm/std": 0.30436578392982483, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918253421783447, "sampling/importance_sampling_ratio/min": 0.007992015220224857, "sampling/sampling_logp_difference/max": 4.829312324523926, "sampling/sampling_logp_difference/mean": 0.10492375493049622, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 403.21484375, "completions/mean_terminated_length": 403.21484375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.10851260460913181, "epoch": 0.45132743362831856, "frac_reward_zero_std": 0.75, "grad_norm": 0.38739447303368746, "learning_rate": 1e-06, "loss": 0.0309, "num_tokens": 126501230.0, "reward": 0.6241718530654907, "reward_std": 0.0250605009496212, "rewards/qatch_small_update_with_fm/mean": 0.6241718530654907, "rewards/qatch_small_update_with_fm/std": 0.4063698649406433, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904232025146484, "sampling/importance_sampling_ratio/min": 0.009276127442717552, "sampling/sampling_logp_difference/max": 4.68031120300293, "sampling/sampling_logp_difference/mean": 0.1012159138917923, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 450.74609375, "completions/mean_terminated_length": 450.74609375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.12568799499422312, "epoch": 0.45309734513274336, "frac_reward_zero_std": 0.5, "grad_norm": 0.6091856792011116, "learning_rate": 1e-06, "loss": 0.0295, "num_tokens": 126960237.0, "reward": 0.7116171717643738, "reward_std": 0.10217170417308807, "rewards/qatch_small_update_with_fm/mean": 0.7116171717643738, "rewards/qatch_small_update_with_fm/std": 0.35243433713912964, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9938164949417114, "sampling/importance_sampling_ratio/min": 0.016684038564562798, "sampling/sampling_logp_difference/max": 4.0933027267456055, "sampling/sampling_logp_difference/mean": 0.10752049088478088, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 411.72265625, "completions/mean_terminated_length": 411.72265625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.10305561777204275, "epoch": 0.45486725663716815, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5717465117807924, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 127592246.0, "reward": 0.7968710660934448, "reward_std": 0.09067166596651077, "rewards/qatch_small_update_with_fm/mean": 0.7968710660934448, "rewards/qatch_small_update_with_fm/std": 0.3336501121520996, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914082288742065, "sampling/importance_sampling_ratio/min": 0.0041133444756269455, "sampling/sampling_logp_difference/max": 5.493518829345703, "sampling/sampling_logp_difference/mean": 0.09590524435043335, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 505.96875, "completions/mean_terminated_length": 505.96875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.11262097675353289, "epoch": 0.45663716814159294, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5447749013706079, "learning_rate": 1e-06, "loss": -0.0507, "num_tokens": 128076878.0, "reward": 0.8263515830039978, "reward_std": 0.13293085992336273, "rewards/qatch_small_update_with_fm/mean": 0.8263515830039978, "rewards/qatch_small_update_with_fm/std": 0.29542577266693115, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913504123687744, "sampling/importance_sampling_ratio/min": 0.014473767951130867, "sampling/sampling_logp_difference/max": 4.235417366027832, "sampling/sampling_logp_difference/mean": 0.0993494763970375, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2215.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 473.1640625, "completions/mean_terminated_length": 473.1640625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.12198252696543932, "epoch": 0.4584070796460177, "frac_reward_zero_std": 0.625, "grad_norm": 2.0757714283775988, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 128524184.0, "reward": 0.6998945474624634, "reward_std": 0.07172062247991562, "rewards/qatch_small_update_with_fm/mean": 0.6998945474624634, "rewards/qatch_small_update_with_fm/std": 0.3736403286457062, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9921177625656128, "sampling/importance_sampling_ratio/min": 0.0024047421757131815, "sampling/sampling_logp_difference/max": 6.030312538146973, "sampling/sampling_logp_difference/mean": 0.10670118033885956, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 546.09765625, "completions/mean_terminated_length": 546.09765625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.12327743601053953, "epoch": 0.46017699115044247, "frac_reward_zero_std": 0.5, "grad_norm": 0.6147719138897032, "learning_rate": 1e-06, "loss": 0.0429, "num_tokens": 129136881.0, "reward": 0.6720781326293945, "reward_std": 0.13617882132530212, "rewards/qatch_small_update_with_fm/mean": 0.6720781326293945, "rewards/qatch_small_update_with_fm/std": 0.4061374366283417, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9919106960296631, "sampling/importance_sampling_ratio/min": 0.00174518465064466, "sampling/sampling_logp_difference/max": 6.350894927978516, "sampling/sampling_logp_difference/mean": 0.10320325940847397, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 455.74609375, "completions/mean_terminated_length": 455.74609375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.11595601961016655, "epoch": 0.46194690265486726, "frac_reward_zero_std": 0.5, "grad_norm": 0.518100506416996, "learning_rate": 1e-06, "loss": -0.0265, "num_tokens": 129576512.0, "reward": 0.7429882287979126, "reward_std": 0.14759953320026398, "rewards/qatch_small_update_with_fm/mean": 0.7429882287979126, "rewards/qatch_small_update_with_fm/std": 0.34472936391830444, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9928544759750366, "sampling/importance_sampling_ratio/min": 0.008397025987505913, "sampling/sampling_logp_difference/max": 4.779877662658691, "sampling/sampling_logp_difference/mean": 0.09997648000717163, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 424.14453125, "completions/mean_terminated_length": 424.14453125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.10993783641606569, "epoch": 0.46371681415929206, "frac_reward_zero_std": 0.75, "grad_norm": 0.5021034588733652, "learning_rate": 1e-06, "loss": -0.0265, "num_tokens": 129969141.0, "reward": 0.8230313062667847, "reward_std": 0.06805883347988129, "rewards/qatch_small_update_with_fm/mean": 0.8230313062667847, "rewards/qatch_small_update_with_fm/std": 0.3318270444869995, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901431202888489, "sampling/importance_sampling_ratio/min": 0.006941431201994419, "sampling/sampling_logp_difference/max": 4.970247268676758, "sampling/sampling_logp_difference/mean": 0.1020810455083847, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 537.18359375, "completions/mean_terminated_length": 537.18359375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.14144710265100002, "epoch": 0.4654867256637168, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5124333861678522, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 130444916.0, "reward": 0.6506445407867432, "reward_std": 0.09026811271905899, "rewards/qatch_small_update_with_fm/mean": 0.6506445407867432, "rewards/qatch_small_update_with_fm/std": 0.34012213349342346, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9988116025924683, "sampling/importance_sampling_ratio/min": 0.0017341957427561283, "sampling/sampling_logp_difference/max": 6.357211589813232, "sampling/sampling_logp_difference/mean": 0.11083206534385681, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 304.046875, "completions/mean_terminated_length": 304.046875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.09478252567350864, "epoch": 0.4672566371681416, "frac_reward_zero_std": 0.75, "grad_norm": 0.619049798770197, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 130971360.0, "reward": 0.8115742206573486, "reward_std": 0.0725603997707367, "rewards/qatch_small_update_with_fm/mean": 0.8115742206573486, "rewards/qatch_small_update_with_fm/std": 0.31686973571777344, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867129325866699, "sampling/importance_sampling_ratio/min": 0.005328531377017498, "sampling/sampling_logp_difference/max": 5.234679698944092, "sampling/sampling_logp_difference/mean": 0.09353677183389664, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1857.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 545.1484375, "completions/mean_terminated_length": 545.1484375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.1286120619624853, "epoch": 0.4690265486725664, "frac_reward_zero_std": 0.625, "grad_norm": 0.45949231122851514, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 131422790.0, "reward": 0.7897851467132568, "reward_std": 0.08896966278553009, "rewards/qatch_small_update_with_fm/mean": 0.7897851467132568, "rewards/qatch_small_update_with_fm/std": 0.321509450674057, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9942489266395569, "sampling/importance_sampling_ratio/min": 0.00868771132081747, "sampling/sampling_logp_difference/max": 4.745845794677734, "sampling/sampling_logp_difference/mean": 0.10412135720252991, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 564.22265625, "completions/mean_terminated_length": 564.22265625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.14587272237986326, "epoch": 0.47079646017699117, "frac_reward_zero_std": 0.5, "grad_norm": 0.5004787008761891, "learning_rate": 1e-06, "loss": -0.0357, "num_tokens": 131942543.0, "reward": 0.7513867616653442, "reward_std": 0.1140788346529007, "rewards/qatch_small_update_with_fm/mean": 0.7513867616653442, "rewards/qatch_small_update_with_fm/std": 0.36217227578163147, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9981526136398315, "sampling/importance_sampling_ratio/min": 0.004260185174643993, "sampling/sampling_logp_difference/max": 5.458442687988281, "sampling/sampling_logp_difference/mean": 0.11560261994600296, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 555.22265625, "completions/mean_terminated_length": 555.22265625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.1373251285403967, "epoch": 0.4725663716814159, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6125441902125374, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 132468712.0, "reward": 0.5675820112228394, "reward_std": 0.14589762687683105, "rewards/qatch_small_update_with_fm/mean": 0.5675820112228394, "rewards/qatch_small_update_with_fm/std": 0.418068528175354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9948241710662842, "sampling/importance_sampling_ratio/min": 0.0018865078454837203, "sampling/sampling_logp_difference/max": 6.2730278968811035, "sampling/sampling_logp_difference/mean": 0.11048072576522827, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 462.8984375, "completions/mean_terminated_length": 462.8984375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.12027736380696297, "epoch": 0.4743362831858407, "frac_reward_zero_std": 0.625, "grad_norm": 0.5504667215266769, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 132848686.0, "reward": 0.7035663723945618, "reward_std": 0.05683054029941559, "rewards/qatch_small_update_with_fm/mean": 0.7035663723945618, "rewards/qatch_small_update_with_fm/std": 0.38923537731170654, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.993371844291687, "sampling/importance_sampling_ratio/min": 0.002918687416240573, "sampling/sampling_logp_difference/max": 5.836621284484863, "sampling/sampling_logp_difference/mean": 0.10290797799825668, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 500.42578125, "completions/mean_terminated_length": 500.42578125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.13534568157047033, "epoch": 0.4761061946902655, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6007433735308465, "learning_rate": 1e-06, "loss": -0.0258, "num_tokens": 133368459.0, "reward": 0.6969257593154907, "reward_std": 0.09634049981832504, "rewards/qatch_small_update_with_fm/mean": 0.6969257593154907, "rewards/qatch_small_update_with_fm/std": 0.3994755148887634, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934263229370117, "sampling/importance_sampling_ratio/min": 0.006295106839388609, "sampling/sampling_logp_difference/max": 5.0679826736450195, "sampling/sampling_logp_difference/mean": 0.11658207327127457, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 447.66796875, "completions/mean_terminated_length": 447.66796875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.12615801114588976, "epoch": 0.4778761061946903, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6000587456169815, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 133839734.0, "reward": 0.7735468745231628, "reward_std": 0.14530882239341736, "rewards/qatch_small_update_with_fm/mean": 0.7735468149185181, "rewards/qatch_small_update_with_fm/std": 0.3402929902076721, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9957665205001831, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.10050618648529053, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2592.0, "completions/max_terminated_length": 2592.0, "completions/mean_length": 470.07421875, "completions/mean_terminated_length": 470.07421875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.11290043592453003, "epoch": 0.479646017699115, "frac_reward_zero_std": 0.625, "grad_norm": 0.5939769909746901, "learning_rate": 1e-06, "loss": 0.0399, "num_tokens": 134480953.0, "reward": 0.6712266206741333, "reward_std": 0.08824647217988968, "rewards/qatch_small_update_with_fm/mean": 0.6712265610694885, "rewards/qatch_small_update_with_fm/std": 0.38758161664009094, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9950553774833679, "sampling/importance_sampling_ratio/min": 0.011183853261172771, "sampling/sampling_logp_difference/max": 4.493284225463867, "sampling/sampling_logp_difference/mean": 0.09266015887260437, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3094.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 468.84765625, "completions/mean_terminated_length": 468.84765625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.13117850199341774, "epoch": 0.4814159292035398, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6585905752020277, "learning_rate": 1e-06, "loss": -0.0277, "num_tokens": 135080066.0, "reward": 0.7433632612228394, "reward_std": 0.05850284546613693, "rewards/qatch_small_update_with_fm/mean": 0.7433632612228394, "rewards/qatch_small_update_with_fm/std": 0.3301638960838318, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9974746704101562, "sampling/importance_sampling_ratio/min": 0.009958724491298199, "sampling/sampling_logp_difference/max": 4.609306335449219, "sampling/sampling_logp_difference/mean": 0.10478514432907104, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3146.0, "completions/max_terminated_length": 3146.0, "completions/mean_length": 576.86328125, "completions/mean_terminated_length": 576.86328125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.14908631797879934, "epoch": 0.4831858407079646, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4628001801800674, "learning_rate": 1e-06, "loss": 0.0384, "num_tokens": 135766959.0, "reward": 0.6367383003234863, "reward_std": 0.07259754836559296, "rewards/qatch_small_update_with_fm/mean": 0.6367383003234863, "rewards/qatch_small_update_with_fm/std": 0.3717433512210846, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9974573254585266, "sampling/importance_sampling_ratio/min": 0.003483326407149434, "sampling/sampling_logp_difference/max": 5.6597676277160645, "sampling/sampling_logp_difference/mean": 0.11377645283937454, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3232.0, "completions/max_terminated_length": 3232.0, "completions/mean_length": 455.0546875, "completions/mean_terminated_length": 455.0546875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1027538888156414, "epoch": 0.4849557522123894, "frac_reward_zero_std": 0.8125, "grad_norm": 0.38737609564787473, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 136341533.0, "reward": 0.7330039143562317, "reward_std": 0.05588027834892273, "rewards/qatch_small_update_with_fm/mean": 0.7330039143562317, "rewards/qatch_small_update_with_fm/std": 0.38519182801246643, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9948368668556213, "sampling/importance_sampling_ratio/min": 0.0019187896978110075, "sampling/sampling_logp_difference/max": 6.256060600280762, "sampling/sampling_logp_difference/mean": 0.08965164422988892, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 472.0390625, "completions/mean_terminated_length": 472.0390625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.13030268996953964, "epoch": 0.48672566371681414, "frac_reward_zero_std": 0.625, "grad_norm": 0.5194915442984417, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 136887495.0, "reward": 0.8584296703338623, "reward_std": 0.06279782950878143, "rewards/qatch_small_update_with_fm/mean": 0.8584296703338623, "rewards/qatch_small_update_with_fm/std": 0.2424517422914505, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9960412979125977, "sampling/importance_sampling_ratio/min": 0.0067655895836651325, "sampling/sampling_logp_difference/max": 4.995905876159668, "sampling/sampling_logp_difference/mean": 0.10526451468467712, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1889.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 359.8671875, "completions/mean_terminated_length": 359.8671875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.09691363852471113, "epoch": 0.48849557522123893, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7103623049986925, "learning_rate": 1e-06, "loss": 0.0297, "num_tokens": 137366693.0, "reward": 0.7432616949081421, "reward_std": 0.12966401875019073, "rewards/qatch_small_update_with_fm/mean": 0.7432616949081421, "rewards/qatch_small_update_with_fm/std": 0.35516712069511414, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898815751075745, "sampling/importance_sampling_ratio/min": 0.008864426985383034, "sampling/sampling_logp_difference/max": 4.725708961486816, "sampling/sampling_logp_difference/mean": 0.09437304735183716, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 423.5078125, "completions/mean_terminated_length": 409.10589599609375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.1079849824309349, "epoch": 0.4902654867256637, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4566326790804489, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 137883335.0, "reward": 0.8503007888793945, "reward_std": 0.0626184344291687, "rewards/qatch_small_update_with_fm/mean": 0.8503007888793945, "rewards/qatch_small_update_with_fm/std": 0.26566144824028015, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9902591109275818, "sampling/importance_sampling_ratio/min": 0.008726668544113636, "sampling/sampling_logp_difference/max": 4.7413716316223145, "sampling/sampling_logp_difference/mean": 0.09983956068754196, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 469.0859375, "completions/mean_terminated_length": 469.0859375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.1033999565988779, "epoch": 0.4920353982300885, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4812629100597929, "learning_rate": 1e-06, "loss": 0.015, "num_tokens": 138490221.0, "reward": 0.7183789014816284, "reward_std": 0.12236850708723068, "rewards/qatch_small_update_with_fm/mean": 0.7183789014816284, "rewards/qatch_small_update_with_fm/std": 0.37549513578414917, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927672147750854, "sampling/importance_sampling_ratio/min": 0.003263342659920454, "sampling/sampling_logp_difference/max": 5.725003242492676, "sampling/sampling_logp_difference/mean": 0.09193219989538193, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 452.9453125, "completions/mean_terminated_length": 452.9453125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.11535468604415655, "epoch": 0.49380530973451325, "frac_reward_zero_std": 0.625, "grad_norm": 0.5075709751174705, "learning_rate": 1e-06, "loss": -0.0522, "num_tokens": 139050143.0, "reward": 0.7936406135559082, "reward_std": 0.06707090139389038, "rewards/qatch_small_update_with_fm/mean": 0.7936406135559082, "rewards/qatch_small_update_with_fm/std": 0.3406023383140564, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9915643930435181, "sampling/importance_sampling_ratio/min": 0.001045701210387051, "sampling/sampling_logp_difference/max": 6.863067626953125, "sampling/sampling_logp_difference/mean": 0.10395675897598267, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 460.359375, "completions/mean_terminated_length": 460.359375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.11615493427962065, "epoch": 0.49557522123893805, "frac_reward_zero_std": 0.5, "grad_norm": 0.7188042550526477, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 139657435.0, "reward": 0.7461133003234863, "reward_std": 0.13950800895690918, "rewards/qatch_small_update_with_fm/mean": 0.7461133003234863, "rewards/qatch_small_update_with_fm/std": 0.38162434101104736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991545557975769, "sampling/importance_sampling_ratio/min": 0.00027837336529046297, "sampling/sampling_logp_difference/max": 8.18654727935791, "sampling/sampling_logp_difference/mean": 0.10376756638288498, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 416.4921875, "completions/mean_terminated_length": 402.0627746582031, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.10716897528618574, "epoch": 0.49734513274336284, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5756784364509373, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 140091705.0, "reward": 0.7761679291725159, "reward_std": 0.10864467173814774, "rewards/qatch_small_update_with_fm/mean": 0.7761679291725159, "rewards/qatch_small_update_with_fm/std": 0.3267165720462799, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911130666732788, "sampling/importance_sampling_ratio/min": 3.5366774682188407e-05, "sampling/sampling_logp_difference/max": 10.249737739562988, "sampling/sampling_logp_difference/mean": 0.10210423916578293, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3154.0, "completions/max_terminated_length": 3154.0, "completions/mean_length": 418.12109375, "completions/mean_terminated_length": 418.12109375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.09531951136887074, "epoch": 0.49911504424778763, "frac_reward_zero_std": 0.875, "grad_norm": 0.4381018145886672, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 140536440.0, "reward": 0.8234609365463257, "reward_std": 0.005906249396502972, "rewards/qatch_small_update_with_fm/mean": 0.8234609365463257, "rewards/qatch_small_update_with_fm/std": 0.2951415479183197, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9938584566116333, "sampling/importance_sampling_ratio/min": 0.008785802870988846, "sampling/sampling_logp_difference/max": 4.734618186950684, "sampling/sampling_logp_difference/mean": 0.08802133053541183, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 429.8984375, "completions/mean_terminated_length": 429.8984375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.10019801463931799, "epoch": 0.5008849557522124, "frac_reward_zero_std": 0.6875, "grad_norm": 0.499022198107272, "learning_rate": 1e-06, "loss": 0.0493, "num_tokens": 141033998.0, "reward": 0.6048046350479126, "reward_std": 0.050070300698280334, "rewards/qatch_small_update_with_fm/mean": 0.6048046350479126, "rewards/qatch_small_update_with_fm/std": 0.3651096522808075, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922409653663635, "sampling/importance_sampling_ratio/min": 0.01044392678886652, "sampling/sampling_logp_difference/max": 4.561734676361084, "sampling/sampling_logp_difference/mean": 0.0922735333442688, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 542.42578125, "completions/mean_terminated_length": 542.42578125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.12973017618060112, "epoch": 0.5026548672566372, "frac_reward_zero_std": 0.5625, "grad_norm": 0.48294227298363385, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 141533211.0, "reward": 0.621808648109436, "reward_std": 0.14083924889564514, "rewards/qatch_small_update_with_fm/mean": 0.621808648109436, "rewards/qatch_small_update_with_fm/std": 0.42469820380210876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9957568049430847, "sampling/importance_sampling_ratio/min": 0.005370942875742912, "sampling/sampling_logp_difference/max": 5.226751804351807, "sampling/sampling_logp_difference/mean": 0.10498703271150589, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 314.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0781171815469861, "epoch": 0.504424778761062, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4365562931344378, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 142053163.0, "reward": 0.7447031736373901, "reward_std": 0.03437906131148338, "rewards/qatch_small_update_with_fm/mean": 0.7447031736373901, "rewards/qatch_small_update_with_fm/std": 0.382269024848938, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9909601211547852, "sampling/importance_sampling_ratio/min": 0.005499903112649918, "sampling/sampling_logp_difference/max": 5.203024864196777, "sampling/sampling_logp_difference/mean": 0.07915640622377396, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 367.796875, "completions/mean_terminated_length": 367.796875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.09002258535474539, "epoch": 0.5061946902654867, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6698101164123688, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 142605319.0, "reward": 0.753304660320282, "reward_std": 0.06268875300884247, "rewards/qatch_small_update_with_fm/mean": 0.753304660320282, "rewards/qatch_small_update_with_fm/std": 0.35868948698043823, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9894113540649414, "sampling/importance_sampling_ratio/min": 0.0004474441520869732, "sampling/sampling_logp_difference/max": 7.711958885192871, "sampling/sampling_logp_difference/mean": 0.09195245802402496, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 506.75, "completions/mean_terminated_length": 506.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.11118471156805754, "epoch": 0.5079646017699115, "frac_reward_zero_std": 0.625, "grad_norm": 0.4948140819153, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 143225463.0, "reward": 0.7221835851669312, "reward_std": 0.06406918913125992, "rewards/qatch_small_update_with_fm/mean": 0.7221835851669312, "rewards/qatch_small_update_with_fm/std": 0.3668988347053528, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9937881231307983, "sampling/importance_sampling_ratio/min": 0.003267764812335372, "sampling/sampling_logp_difference/max": 5.723649024963379, "sampling/sampling_logp_difference/mean": 0.09986858069896698, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 419.1796875, "completions/mean_terminated_length": 419.1796875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.10201077908277512, "epoch": 0.5097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 0.4042234506059746, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 143610229.0, "reward": 0.8555507659912109, "reward_std": 0.031004978343844414, "rewards/qatch_small_update_with_fm/mean": 0.8555507659912109, "rewards/qatch_small_update_with_fm/std": 0.25457900762557983, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900668859481812, "sampling/importance_sampling_ratio/min": 0.014422724954783916, "sampling/sampling_logp_difference/max": 4.238950252532959, "sampling/sampling_logp_difference/mean": 0.09820079803466797, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 458.41015625, "completions/mean_terminated_length": 458.41015625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.10596211347728968, "epoch": 0.511504424778761, "frac_reward_zero_std": 0.625, "grad_norm": 0.5988557634553278, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 144313518.0, "reward": 0.7774180173873901, "reward_std": 0.08853039145469666, "rewards/qatch_small_update_with_fm/mean": 0.7774179577827454, "rewards/qatch_small_update_with_fm/std": 0.35354262590408325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990810751914978, "sampling/importance_sampling_ratio/min": 0.006848180666565895, "sampling/sampling_logp_difference/max": 4.983772277832031, "sampling/sampling_logp_difference/mean": 0.09928888082504272, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 521.95703125, "completions/mean_terminated_length": 521.95703125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.11423090100288391, "epoch": 0.5132743362831859, "frac_reward_zero_std": 0.5, "grad_norm": 0.6741792027668885, "learning_rate": 1e-06, "loss": 0.0429, "num_tokens": 145116995.0, "reward": 0.6444180011749268, "reward_std": 0.12592115998268127, "rewards/qatch_small_update_with_fm/mean": 0.6444180011749268, "rewards/qatch_small_update_with_fm/std": 0.3701937198638916, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9935187101364136, "sampling/importance_sampling_ratio/min": 2.6497457383811707e-07, "sampling/sampling_logp_difference/max": 15.143631935119629, "sampling/sampling_logp_difference/mean": 0.10046621412038803, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3026.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 535.671875, "completions/mean_terminated_length": 535.671875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.10938840173184872, "epoch": 0.5150442477876106, "frac_reward_zero_std": 0.5, "grad_norm": 0.5835952264770478, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 145952959.0, "reward": 0.5978906154632568, "reward_std": 0.13176509737968445, "rewards/qatch_small_update_with_fm/mean": 0.5978906154632568, "rewards/qatch_small_update_with_fm/std": 0.4031720459461212, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9926491975784302, "sampling/importance_sampling_ratio/min": 0.004141896963119507, "sampling/sampling_logp_difference/max": 5.48660135269165, "sampling/sampling_logp_difference/mean": 0.1000325083732605, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4052.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 551.72265625, "completions/mean_terminated_length": 551.72265625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.11647769901901484, "epoch": 0.5168141592920354, "frac_reward_zero_std": 0.625, "grad_norm": 0.6162799567231806, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 146521208.0, "reward": 0.7887499928474426, "reward_std": 0.051286663860082626, "rewards/qatch_small_update_with_fm/mean": 0.7887499928474426, "rewards/qatch_small_update_with_fm/std": 0.32047656178474426, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9951774477958679, "sampling/importance_sampling_ratio/min": 0.006804880686104298, "sampling/sampling_logp_difference/max": 4.990115165710449, "sampling/sampling_logp_difference/mean": 0.09928067028522491, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3249.0, "completions/max_terminated_length": 3249.0, "completions/mean_length": 419.51171875, "completions/mean_terminated_length": 419.51171875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.10076918825507164, "epoch": 0.5185840707964602, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6215460006820153, "learning_rate": 1e-06, "loss": 0.0632, "num_tokens": 147232107.0, "reward": 0.7407070398330688, "reward_std": 0.12173819541931152, "rewards/qatch_small_update_with_fm/mean": 0.7407070398330688, "rewards/qatch_small_update_with_fm/std": 0.3596765995025635, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914222359657288, "sampling/importance_sampling_ratio/min": 0.004096901509910822, "sampling/sampling_logp_difference/max": 5.497524261474609, "sampling/sampling_logp_difference/mean": 0.09699016809463501, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 557.6328125, "completions/mean_terminated_length": 515.6759033203125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.11698231659829617, "epoch": 0.5203539823008849, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5086059922780932, "learning_rate": 1e-06, "loss": -0.0372, "num_tokens": 147777197.0, "reward": 0.5674062371253967, "reward_std": 0.12426469475030899, "rewards/qatch_small_update_with_fm/mean": 0.5674062371253967, "rewards/qatch_small_update_with_fm/std": 0.408805787563324, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9938693642616272, "sampling/importance_sampling_ratio/min": 0.008697181940078735, "sampling/sampling_logp_difference/max": 4.74475622177124, "sampling/sampling_logp_difference/mean": 0.10387073457241058, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 368.49609375, "completions/mean_terminated_length": 368.49609375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.08287282753735781, "epoch": 0.5221238938053098, "frac_reward_zero_std": 0.75, "grad_norm": 0.3754465043261356, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 148209756.0, "reward": 0.7729140520095825, "reward_std": 0.07160256057977676, "rewards/qatch_small_update_with_fm/mean": 0.7729140520095825, "rewards/qatch_small_update_with_fm/std": 0.3413429260253906, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9904618859291077, "sampling/importance_sampling_ratio/min": 0.0031984634697437286, "sampling/sampling_logp_difference/max": 5.745084762573242, "sampling/sampling_logp_difference/mean": 0.08396244049072266, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 466.6328125, "completions/mean_terminated_length": 466.6328125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.11806033086031675, "epoch": 0.5238938053097345, "frac_reward_zero_std": 0.5, "grad_norm": 0.6080798073663735, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 148634110.0, "reward": 0.7674218416213989, "reward_std": 0.11307354271411896, "rewards/qatch_small_update_with_fm/mean": 0.7674218416213989, "rewards/qatch_small_update_with_fm/std": 0.3313625454902649, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990534782409668, "sampling/importance_sampling_ratio/min": 0.011155885644257069, "sampling/sampling_logp_difference/max": 4.495788097381592, "sampling/sampling_logp_difference/mean": 0.10630348324775696, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 376.91796875, "completions/mean_terminated_length": 376.91796875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.08900775015354156, "epoch": 0.5256637168141592, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4087395121892628, "learning_rate": 1e-06, "loss": -0.0103, "num_tokens": 149203097.0, "reward": 0.7409297227859497, "reward_std": 0.052058301866054535, "rewards/qatch_small_update_with_fm/mean": 0.7409297227859497, "rewards/qatch_small_update_with_fm/std": 0.3449692130088806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9903227686882019, "sampling/importance_sampling_ratio/min": 0.00527548510581255, "sampling/sampling_logp_difference/max": 5.24468469619751, "sampling/sampling_logp_difference/mean": 0.09000498056411743, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 364.77734375, "completions/mean_terminated_length": 364.77734375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.10976507421582937, "epoch": 0.5274336283185841, "frac_reward_zero_std": 0.5, "grad_norm": 0.6729616231094641, "learning_rate": 1e-06, "loss": -0.0497, "num_tokens": 149537952.0, "reward": 0.5636405944824219, "reward_std": 0.10702510178089142, "rewards/qatch_small_update_with_fm/mean": 0.5636405944824219, "rewards/qatch_small_update_with_fm/std": 0.3598759174346924, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9937114119529724, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.09804245829582214, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 500.36328125, "completions/mean_terminated_length": 472.0511779785156, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.12263609375804663, "epoch": 0.5292035398230088, "frac_reward_zero_std": 0.625, "grad_norm": 0.4428215756207369, "learning_rate": 1e-06, "loss": -0.0397, "num_tokens": 150078141.0, "reward": 0.6905312538146973, "reward_std": 0.05177067592740059, "rewards/qatch_small_update_with_fm/mean": 0.6905312538146973, "rewards/qatch_small_update_with_fm/std": 0.3768806457519531, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9940615892410278, "sampling/importance_sampling_ratio/min": 0.001947162440046668, "sampling/sampling_logp_difference/max": 6.241382122039795, "sampling/sampling_logp_difference/mean": 0.10560339689254761, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 312.921875, "completions/mean_terminated_length": 312.921875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.07294352725148201, "epoch": 0.5309734513274337, "frac_reward_zero_std": 0.875, "grad_norm": 0.49703986559817703, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 150464585.0, "reward": 0.8377617001533508, "reward_std": 0.028467310592532158, "rewards/qatch_small_update_with_fm/mean": 0.8377617001533508, "rewards/qatch_small_update_with_fm/std": 0.26436394453048706, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.98967045545578, "sampling/importance_sampling_ratio/min": 0.0009882025187835097, "sampling/sampling_logp_difference/max": 6.919622898101807, "sampling/sampling_logp_difference/mean": 0.07985977083444595, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 455.06640625, "completions/mean_terminated_length": 455.06640625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.11215604841709137, "epoch": 0.5327433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 0.543172696038466, "learning_rate": 1e-06, "loss": -0.0341, "num_tokens": 151168426.0, "reward": 0.7523554563522339, "reward_std": 0.11810556054115295, "rewards/qatch_small_update_with_fm/mean": 0.7523554563522339, "rewards/qatch_small_update_with_fm/std": 0.32776567339897156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9945645332336426, "sampling/importance_sampling_ratio/min": 0.007378770504146814, "sampling/sampling_logp_difference/max": 4.909148216247559, "sampling/sampling_logp_difference/mean": 0.09771260619163513, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 350.12109375, "completions/mean_terminated_length": 350.12109375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.08310944028198719, "epoch": 0.5345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 0.5927217882498504, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 151669849.0, "reward": 0.7303671836853027, "reward_std": 0.05779767408967018, "rewards/qatch_small_update_with_fm/mean": 0.7303671836853027, "rewards/qatch_small_update_with_fm/std": 0.3890523314476013, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9917393922805786, "sampling/importance_sampling_ratio/min": 0.0041347043588757515, "sampling/sampling_logp_difference/max": 5.488339424133301, "sampling/sampling_logp_difference/mean": 0.08336856961250305, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 455.5, "completions/mean_terminated_length": 455.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1370784854516387, "epoch": 0.536283185840708, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5876568831658732, "learning_rate": 1e-06, "loss": 0.0535, "num_tokens": 152257385.0, "reward": 0.7178086042404175, "reward_std": 0.07202421128749847, "rewards/qatch_small_update_with_fm/mean": 0.7178086042404175, "rewards/qatch_small_update_with_fm/std": 0.33369919657707214, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9918363094329834, "sampling/importance_sampling_ratio/min": 0.0042452337220311165, "sampling/sampling_logp_difference/max": 5.461958408355713, "sampling/sampling_logp_difference/mean": 0.11546655744314194, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 434.015625, "completions/mean_terminated_length": 434.015625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.09734268207103014, "epoch": 0.5380530973451327, "frac_reward_zero_std": 0.625, "grad_norm": 0.6656086614188875, "learning_rate": 1e-06, "loss": -0.0074, "num_tokens": 152871565.0, "reward": 0.8256992101669312, "reward_std": 0.07486239075660706, "rewards/qatch_small_update_with_fm/mean": 0.8256992101669312, "rewards/qatch_small_update_with_fm/std": 0.2639926075935364, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946987628936768, "sampling/importance_sampling_ratio/min": 0.0032207805197685957, "sampling/sampling_logp_difference/max": 5.738131523132324, "sampling/sampling_logp_difference/mean": 0.08409319072961807, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 337.96875, "completions/mean_terminated_length": 337.96875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.08514279685914516, "epoch": 0.5398230088495575, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3134650372865385, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 153296357.0, "reward": 0.7113710641860962, "reward_std": 0.05351608246564865, "rewards/qatch_small_update_with_fm/mean": 0.7113710641860962, "rewards/qatch_small_update_with_fm/std": 0.3926786184310913, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9917937517166138, "sampling/importance_sampling_ratio/min": 0.008679499849677086, "sampling/sampling_logp_difference/max": 4.746791362762451, "sampling/sampling_logp_difference/mean": 0.08514834940433502, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 434.58984375, "completions/mean_terminated_length": 434.58984375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.1213516229763627, "epoch": 0.5415929203539823, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6627001328299444, "learning_rate": 1e-06, "loss": -0.0688, "num_tokens": 153921036.0, "reward": 0.7097538709640503, "reward_std": 0.14472410082817078, "rewards/qatch_small_update_with_fm/mean": 0.7097538709640503, "rewards/qatch_small_update_with_fm/std": 0.37847304344177246, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988747239112854, "sampling/importance_sampling_ratio/min": 0.0025193877518177032, "sampling/sampling_logp_difference/max": 5.983739376068115, "sampling/sampling_logp_difference/mean": 0.11412816494703293, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 465.35546875, "completions/mean_terminated_length": 465.35546875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.12086922209709883, "epoch": 0.5433628318584071, "frac_reward_zero_std": 0.5625, "grad_norm": 0.64253124673734, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 154509207.0, "reward": 0.8136289119720459, "reward_std": 0.10231994092464447, "rewards/qatch_small_update_with_fm/mean": 0.8136289119720459, "rewards/qatch_small_update_with_fm/std": 0.34205469489097595, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925813674926758, "sampling/importance_sampling_ratio/min": 0.011445293202996254, "sampling/sampling_logp_difference/max": 4.470176696777344, "sampling/sampling_logp_difference/mean": 0.10290302336215973, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 498.5859375, "completions/mean_terminated_length": 498.5859375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.12368575483560562, "epoch": 0.5451327433628319, "frac_reward_zero_std": 0.5, "grad_norm": 0.5271043034237322, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 155121037.0, "reward": 0.7560000419616699, "reward_std": 0.12761875987052917, "rewards/qatch_small_update_with_fm/mean": 0.7560000419616699, "rewards/qatch_small_update_with_fm/std": 0.35406094789505005, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9949927926063538, "sampling/importance_sampling_ratio/min": 0.008679255843162537, "sampling/sampling_logp_difference/max": 4.746819496154785, "sampling/sampling_logp_difference/mean": 0.1031837910413742, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3090.0, "completions/max_terminated_length": 3090.0, "completions/mean_length": 502.54296875, "completions/mean_terminated_length": 502.54296875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.1271666856482625, "epoch": 0.5469026548672566, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6756247756423795, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 155601320.0, "reward": 0.6903945207595825, "reward_std": 0.057952310889959335, "rewards/qatch_small_update_with_fm/mean": 0.6903945207595825, "rewards/qatch_small_update_with_fm/std": 0.3617255687713623, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.997051477432251, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.10001347959041595, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 417.25390625, "completions/mean_terminated_length": 417.25390625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.11392358969897032, "epoch": 0.5486725663716814, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7202387252863824, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 156132649.0, "reward": 0.7270429730415344, "reward_std": 0.1510547250509262, "rewards/qatch_small_update_with_fm/mean": 0.7270429730415344, "rewards/qatch_small_update_with_fm/std": 0.38717004656791687, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9941946268081665, "sampling/importance_sampling_ratio/min": 0.007314593065530062, "sampling/sampling_logp_difference/max": 4.91788387298584, "sampling/sampling_logp_difference/mean": 0.09563559293746948, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 441.7734375, "completions/mean_terminated_length": 427.44317626953125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.11410268303006887, "epoch": 0.5504424778761062, "frac_reward_zero_std": 0.625, "grad_norm": 0.5607862617554898, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 156621343.0, "reward": 0.8368203043937683, "reward_std": 0.10923705250024796, "rewards/qatch_small_update_with_fm/mean": 0.8368203043937683, "rewards/qatch_small_update_with_fm/std": 0.28346678614616394, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9933780431747437, "sampling/importance_sampling_ratio/min": 0.011173656210303307, "sampling/sampling_logp_difference/max": 4.49419641494751, "sampling/sampling_logp_difference/mean": 0.09727410227060318, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 433.5078125, "completions/mean_terminated_length": 433.5078125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.11591598391532898, "epoch": 0.552212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 0.5845116761011263, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 157069281.0, "reward": 0.8090898394584656, "reward_std": 0.0635581985116005, "rewards/qatch_small_update_with_fm/mean": 0.8090898394584656, "rewards/qatch_small_update_with_fm/std": 0.3227374851703644, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9951672554016113, "sampling/importance_sampling_ratio/min": 0.006770285312086344, "sampling/sampling_logp_difference/max": 4.995212078094482, "sampling/sampling_logp_difference/mean": 0.09658536314964294, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 446.53125, "completions/mean_terminated_length": 446.53125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1346312901005149, "epoch": 0.5539823008849557, "frac_reward_zero_std": 0.5, "grad_norm": 0.5379089439997566, "learning_rate": 1e-06, "loss": 0.0546, "num_tokens": 157421465.0, "reward": 0.7145351767539978, "reward_std": 0.0909833312034607, "rewards/qatch_small_update_with_fm/mean": 0.714535117149353, "rewards/qatch_small_update_with_fm/std": 0.37304043769836426, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946333169937134, "sampling/importance_sampling_ratio/min": 0.012340717017650604, "sampling/sampling_logp_difference/max": 4.394851207733154, "sampling/sampling_logp_difference/mean": 0.10755407065153122, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 378.3203125, "completions/mean_terminated_length": 378.3203125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.1214052801951766, "epoch": 0.5557522123893806, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6558845846189386, "learning_rate": 1e-06, "loss": -0.0369, "num_tokens": 157899307.0, "reward": 0.7183281183242798, "reward_std": 0.053716663271188736, "rewards/qatch_small_update_with_fm/mean": 0.7183281183242798, "rewards/qatch_small_update_with_fm/std": 0.340137779712677, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9941397309303284, "sampling/importance_sampling_ratio/min": 0.0022659660317003727, "sampling/sampling_logp_difference/max": 6.089754104614258, "sampling/sampling_logp_difference/mean": 0.10012153536081314, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 384.48828125, "completions/mean_terminated_length": 384.48828125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.11504952795803547, "epoch": 0.5575221238938053, "frac_reward_zero_std": 0.5, "grad_norm": 0.6495225945273754, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 158448664.0, "reward": 0.8267968893051147, "reward_std": 0.11277540028095245, "rewards/qatch_small_update_with_fm/mean": 0.8267968893051147, "rewards/qatch_small_update_with_fm/std": 0.2775137722492218, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9939453601837158, "sampling/importance_sampling_ratio/min": 0.006800443399697542, "sampling/sampling_logp_difference/max": 4.990767478942871, "sampling/sampling_logp_difference/mean": 0.0972718894481659, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 391.35546875, "completions/mean_terminated_length": 391.35546875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.12797002494335175, "epoch": 0.5592920353982301, "frac_reward_zero_std": 0.5, "grad_norm": 0.66959273180804, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 159075987.0, "reward": 0.7723633050918579, "reward_std": 0.09683510661125183, "rewards/qatch_small_update_with_fm/mean": 0.7723633050918579, "rewards/qatch_small_update_with_fm/std": 0.31531158089637756, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9960798025131226, "sampling/importance_sampling_ratio/min": 0.0006279381923377514, "sampling/sampling_logp_difference/max": 7.373068809509277, "sampling/sampling_logp_difference/mean": 0.10363449156284332, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1853.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 418.4375, "completions/mean_terminated_length": 418.4375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.12645841762423515, "epoch": 0.5610619469026549, "frac_reward_zero_std": 0.8125, "grad_norm": 0.43227915668177896, "learning_rate": 1e-06, "loss": -0.0158, "num_tokens": 159643251.0, "reward": 0.8227812647819519, "reward_std": 0.049158066511154175, "rewards/qatch_small_update_with_fm/mean": 0.8227812647819519, "rewards/qatch_small_update_with_fm/std": 0.31986841559410095, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9950623512268066, "sampling/importance_sampling_ratio/min": 0.008687077090144157, "sampling/sampling_logp_difference/max": 4.7459187507629395, "sampling/sampling_logp_difference/mean": 0.10484028607606888, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 489.4453125, "completions/mean_terminated_length": 475.302001953125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.13590082712471485, "epoch": 0.5628318584070796, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5538138640244875, "learning_rate": 1e-06, "loss": -0.0412, "num_tokens": 160179285.0, "reward": 0.7630391120910645, "reward_std": 0.09547914564609528, "rewards/qatch_small_update_with_fm/mean": 0.7630391120910645, "rewards/qatch_small_update_with_fm/std": 0.35770267248153687, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000463724136353, "sampling/importance_sampling_ratio/min": 0.005264458246529102, "sampling/sampling_logp_difference/max": 5.246777057647705, "sampling/sampling_logp_difference/mean": 0.10617281496524811, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2257.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 539.88671875, "completions/mean_terminated_length": 539.88671875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.13833912461996078, "epoch": 0.5646017699115045, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6595395533621345, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 160704968.0, "reward": 0.6066679954528809, "reward_std": 0.08918474614620209, "rewards/qatch_small_update_with_fm/mean": 0.6066679954528809, "rewards/qatch_small_update_with_fm/std": 0.37395772337913513, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9956793785095215, "sampling/importance_sampling_ratio/min": 0.0030901748687028885, "sampling/sampling_logp_difference/max": 5.77952766418457, "sampling/sampling_logp_difference/mean": 0.10660675168037415, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 320.67578125, "completions/mean_terminated_length": 320.67578125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.09042697865515947, "epoch": 0.5663716814159292, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6116439453305005, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 161201829.0, "reward": 0.884960949420929, "reward_std": 0.04503811523318291, "rewards/qatch_small_update_with_fm/mean": 0.884960949420929, "rewards/qatch_small_update_with_fm/std": 0.288003534078598, "sampling/importance_sampling_ratio/max": 1.8726849555969238, "sampling/importance_sampling_ratio/mean": 0.9918411374092102, "sampling/importance_sampling_ratio/min": 0.00045137188863009214, "sampling/sampling_logp_difference/max": 7.703218936920166, "sampling/sampling_logp_difference/mean": 0.08345195651054382, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2957.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 560.984375, "completions/mean_terminated_length": 560.984375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.1479995846748352, "epoch": 0.5681415929203539, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4996635931610896, "learning_rate": 1e-06, "loss": 0.0588, "num_tokens": 161929473.0, "reward": 0.6543202996253967, "reward_std": 0.1047203540802002, "rewards/qatch_small_update_with_fm/mean": 0.6543203592300415, "rewards/qatch_small_update_with_fm/std": 0.39747729897499084, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9977031946182251, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.11383165419101715, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 440.65234375, "completions/mean_terminated_length": 426.3176574707031, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.11984299309551716, "epoch": 0.5699115044247788, "frac_reward_zero_std": 0.5, "grad_norm": 0.5934492923234808, "learning_rate": 1e-06, "loss": -0.0495, "num_tokens": 162480056.0, "reward": 0.7722031474113464, "reward_std": 0.11908012628555298, "rewards/qatch_small_update_with_fm/mean": 0.7722031474113464, "rewards/qatch_small_update_with_fm/std": 0.3386456072330475, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946287870407104, "sampling/importance_sampling_ratio/min": 0.01118414755910635, "sampling/sampling_logp_difference/max": 4.493257999420166, "sampling/sampling_logp_difference/mean": 0.0980626791715622, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 461.82421875, "completions/mean_terminated_length": 461.82421875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.136813223361969, "epoch": 0.5716814159292035, "frac_reward_zero_std": 0.625, "grad_norm": 0.5608859874498612, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 163002811.0, "reward": 0.8026015758514404, "reward_std": 0.05060504376888275, "rewards/qatch_small_update_with_fm/mean": 0.8026015758514404, "rewards/qatch_small_update_with_fm/std": 0.33220797777175903, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9989163875579834, "sampling/importance_sampling_ratio/min": 2.4591246983618475e-05, "sampling/sampling_logp_difference/max": 10.613120079040527, "sampling/sampling_logp_difference/mean": 0.10465744137763977, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 368.53515625, "completions/mean_terminated_length": 368.53515625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.10137500800192356, "epoch": 0.5734513274336284, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5441647449845531, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 163449396.0, "reward": 0.821164071559906, "reward_std": 0.044571854174137115, "rewards/qatch_small_update_with_fm/mean": 0.821164071559906, "rewards/qatch_small_update_with_fm/std": 0.31781241297721863, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9920123815536499, "sampling/importance_sampling_ratio/min": 4.6803124860161915e-05, "sampling/sampling_logp_difference/max": 9.969560623168945, "sampling/sampling_logp_difference/mean": 0.09228494018316269, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 453.5234375, "completions/mean_terminated_length": 439.2392272949219, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.14200774021446705, "epoch": 0.5752212389380531, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5266961980954568, "learning_rate": 1e-06, "loss": -0.0445, "num_tokens": 163966026.0, "reward": 0.7711563110351562, "reward_std": 0.09890992194414139, "rewards/qatch_small_update_with_fm/mean": 0.7711562514305115, "rewards/qatch_small_update_with_fm/std": 0.3402455151081085, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9953124523162842, "sampling/importance_sampling_ratio/min": 0.0011746156960725784, "sampling/sampling_logp_difference/max": 6.746814250946045, "sampling/sampling_logp_difference/mean": 0.11260481923818588, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 398.9296875, "completions/mean_terminated_length": 398.9296875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.12197424191981554, "epoch": 0.5769911504424778, "frac_reward_zero_std": 0.8125, "grad_norm": 0.35093369253212775, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 164455704.0, "reward": 0.6438866853713989, "reward_std": 0.04379892349243164, "rewards/qatch_small_update_with_fm/mean": 0.6438866853713989, "rewards/qatch_small_update_with_fm/std": 0.3767320513725281, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9938642382621765, "sampling/importance_sampling_ratio/min": 0.012557579204440117, "sampling/sampling_logp_difference/max": 4.3774309158325195, "sampling/sampling_logp_difference/mean": 0.10210156440734863, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 487.40625, "completions/mean_terminated_length": 487.40625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.12392527237534523, "epoch": 0.5787610619469027, "frac_reward_zero_std": 0.625, "grad_norm": 0.4301567888730242, "learning_rate": 1e-06, "loss": -0.017, "num_tokens": 165043888.0, "reward": 0.6832578182220459, "reward_std": 0.07006269693374634, "rewards/qatch_small_update_with_fm/mean": 0.6832578182220459, "rewards/qatch_small_update_with_fm/std": 0.3451367914676666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9974828958511353, "sampling/importance_sampling_ratio/min": 0.003241075435653329, "sampling/sampling_logp_difference/max": 5.7318501472473145, "sampling/sampling_logp_difference/mean": 0.09849071502685547, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3916.0, "completions/max_terminated_length": 3916.0, "completions/mean_length": 562.7265625, "completions/mean_terminated_length": 562.7265625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.12135390192270279, "epoch": 0.5805309734513274, "frac_reward_zero_std": 0.5, "grad_norm": 0.46934700215857733, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 165710922.0, "reward": 0.6851679682731628, "reward_std": 0.09362904727458954, "rewards/qatch_small_update_with_fm/mean": 0.6851679682731628, "rewards/qatch_small_update_with_fm/std": 0.3390476405620575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9955009818077087, "sampling/importance_sampling_ratio/min": 0.00046856250264681876, "sampling/sampling_logp_difference/max": 7.665841102600098, "sampling/sampling_logp_difference/mean": 0.0948438048362732, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 388.74609375, "completions/mean_terminated_length": 388.74609375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.11189944110810757, "epoch": 0.5823008849557522, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6534054957023212, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 166287353.0, "reward": 0.8831367492675781, "reward_std": 0.04500850662589073, "rewards/qatch_small_update_with_fm/mean": 0.8831367492675781, "rewards/qatch_small_update_with_fm/std": 0.2723407745361328, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9937008619308472, "sampling/importance_sampling_ratio/min": 0.01839052513241768, "sampling/sampling_logp_difference/max": 3.995919704437256, "sampling/sampling_logp_difference/mean": 0.09178143739700317, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 481.04296875, "completions/mean_terminated_length": 481.04296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.13939405418932438, "epoch": 0.584070796460177, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6137833889894624, "learning_rate": 1e-06, "loss": 0.0451, "num_tokens": 166844948.0, "reward": 0.6526992321014404, "reward_std": 0.10534843057394028, "rewards/qatch_small_update_with_fm/mean": 0.6526992321014404, "rewards/qatch_small_update_with_fm/std": 0.3459371030330658, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9961899518966675, "sampling/importance_sampling_ratio/min": 0.0015783263370394707, "sampling/sampling_logp_difference/max": 6.451390266418457, "sampling/sampling_logp_difference/mean": 0.11320600658655167, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 351.69140625, "completions/mean_terminated_length": 351.69140625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.10876735299825668, "epoch": 0.5858407079646017, "frac_reward_zero_std": 0.625, "grad_norm": 0.5578310759986786, "learning_rate": 1e-06, "loss": 0.0591, "num_tokens": 167356421.0, "reward": 0.7452616691589355, "reward_std": 0.10783276706933975, "rewards/qatch_small_update_with_fm/mean": 0.7452616691589355, "rewards/qatch_small_update_with_fm/std": 0.32912948727607727, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9922351241111755, "sampling/importance_sampling_ratio/min": 0.009870722889900208, "sampling/sampling_logp_difference/max": 4.618182182312012, "sampling/sampling_logp_difference/mean": 0.09559471905231476, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 375.47265625, "completions/mean_terminated_length": 375.47265625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.12615725863724947, "epoch": 0.5876106194690266, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5005707404307401, "learning_rate": 1e-06, "loss": -0.0222, "num_tokens": 167926062.0, "reward": 0.6700546741485596, "reward_std": 0.07864837348461151, "rewards/qatch_small_update_with_fm/mean": 0.6700546741485596, "rewards/qatch_small_update_with_fm/std": 0.4005128741264343, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9930409789085388, "sampling/importance_sampling_ratio/min": 0.0007163186674006283, "sampling/sampling_logp_difference/max": 7.241385459899902, "sampling/sampling_logp_difference/mean": 0.10649150609970093, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 492.69921875, "completions/mean_terminated_length": 492.69921875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.12994778994470835, "epoch": 0.5893805309734513, "frac_reward_zero_std": 0.5, "grad_norm": 0.462068641561627, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 168363249.0, "reward": 0.6787070035934448, "reward_std": 0.12104913592338562, "rewards/qatch_small_update_with_fm/mean": 0.6787070035934448, "rewards/qatch_small_update_with_fm/std": 0.3600393533706665, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9959803223609924, "sampling/importance_sampling_ratio/min": 0.014395722188055515, "sampling/sampling_logp_difference/max": 4.240824222564697, "sampling/sampling_logp_difference/mean": 0.10011019557714462, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3668.0, "completions/max_terminated_length": 3668.0, "completions/mean_length": 610.2109375, "completions/mean_terminated_length": 610.2109375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.14918566308915615, "epoch": 0.5911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 0.4833571863040555, "learning_rate": 1e-06, "loss": -0.0226, "num_tokens": 168936967.0, "reward": 0.6901054978370667, "reward_std": 0.12324898689985275, "rewards/qatch_small_update_with_fm/mean": 0.6901054978370667, "rewards/qatch_small_update_with_fm/std": 0.4031791090965271, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998165369033813, "sampling/importance_sampling_ratio/min": 0.018390534445643425, "sampling/sampling_logp_difference/max": 3.9959192276000977, "sampling/sampling_logp_difference/mean": 0.10862861573696136, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 476.22265625, "completions/mean_terminated_length": 476.22265625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.14073671028017998, "epoch": 0.5929203539823009, "frac_reward_zero_std": 0.625, "grad_norm": 0.4452770871121166, "learning_rate": 1e-06, "loss": -0.0116, "num_tokens": 169395120.0, "reward": 0.8413281440734863, "reward_std": 0.054899152368307114, "rewards/qatch_small_update_with_fm/mean": 0.8413281440734863, "rewards/qatch_small_update_with_fm/std": 0.2942917048931122, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9945122599601746, "sampling/importance_sampling_ratio/min": 0.001602180185727775, "sampling/sampling_logp_difference/max": 6.436389923095703, "sampling/sampling_logp_difference/mean": 0.11215078830718994, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2302.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 470.0546875, "completions/mean_terminated_length": 470.0546875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.13123032171279192, "epoch": 0.5946902654867257, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4413668770939599, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 169960782.0, "reward": 0.7925976514816284, "reward_std": 0.06153729930520058, "rewards/qatch_small_update_with_fm/mean": 0.7925976514816284, "rewards/qatch_small_update_with_fm/std": 0.32960665225982666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9963173866271973, "sampling/importance_sampling_ratio/min": 5.2583640353986993e-05, "sampling/sampling_logp_difference/max": 9.853105545043945, "sampling/sampling_logp_difference/mean": 0.1045021116733551, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 375.79296875, "completions/mean_terminated_length": 375.79296875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.12677610013633966, "epoch": 0.5964601769911504, "frac_reward_zero_std": 0.75, "grad_norm": 0.5472979749360979, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 170411641.0, "reward": 0.70751953125, "reward_std": 0.0590687096118927, "rewards/qatch_small_update_with_fm/mean": 0.70751953125, "rewards/qatch_small_update_with_fm/std": 0.3998953700065613, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946392774581909, "sampling/importance_sampling_ratio/min": 0.005264220293611288, "sampling/sampling_logp_difference/max": 5.246822357177734, "sampling/sampling_logp_difference/mean": 0.10403802245855331, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 440.74609375, "completions/mean_terminated_length": 426.41180419921875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.11007914505898952, "epoch": 0.5982300884955752, "frac_reward_zero_std": 0.625, "grad_norm": 0.4332959024599203, "learning_rate": 1e-06, "loss": -0.0356, "num_tokens": 170886616.0, "reward": 0.7666601538658142, "reward_std": 0.030690979212522507, "rewards/qatch_small_update_with_fm/mean": 0.7666601538658142, "rewards/qatch_small_update_with_fm/std": 0.3420563340187073, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9921419024467468, "sampling/importance_sampling_ratio/min": 0.004177570343017578, "sampling/sampling_logp_difference/max": 5.478025436401367, "sampling/sampling_logp_difference/mean": 0.09689275920391083, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 480.60546875, "completions/mean_terminated_length": 466.427490234375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.13167188875377178, "epoch": 0.6, "frac_reward_zero_std": 0.625, "grad_norm": 0.41572721343676217, "learning_rate": 1e-06, "loss": -0.0408, "num_tokens": 171562403.0, "reward": 0.7634413838386536, "reward_std": 0.06147729605436325, "rewards/qatch_small_update_with_fm/mean": 0.7634413838386536, "rewards/qatch_small_update_with_fm/std": 0.30900147557258606, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9971693754196167, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.10109194368124008, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1451.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 443.70703125, "completions/mean_terminated_length": 443.70703125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.12869115360081196, "epoch": 0.6017699115044248, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4337358298816994, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 172183448.0, "reward": 0.6481367349624634, "reward_std": 0.09045610576868057, "rewards/qatch_small_update_with_fm/mean": 0.6481367349624634, "rewards/qatch_small_update_with_fm/std": 0.3557817041873932, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.993698239326477, "sampling/importance_sampling_ratio/min": 0.011156954802572727, "sampling/sampling_logp_difference/max": 4.495692253112793, "sampling/sampling_logp_difference/mean": 0.10702698677778244, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2873.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 449.98046875, "completions/mean_terminated_length": 449.98046875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.13718800246715546, "epoch": 0.6035398230088496, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3991429375228678, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 172828595.0, "reward": 0.8131992220878601, "reward_std": 0.03861326724290848, "rewards/qatch_small_update_with_fm/mean": 0.8131991624832153, "rewards/qatch_small_update_with_fm/std": 0.3113223612308502, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9982845783233643, "sampling/importance_sampling_ratio/min": 0.011312464252114296, "sampling/sampling_logp_difference/max": 4.4818501472473145, "sampling/sampling_logp_difference/mean": 0.10661160200834274, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3680.0, "completions/max_terminated_length": 3680.0, "completions/mean_length": 553.42578125, "completions/mean_terminated_length": 553.42578125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.13830072805285454, "epoch": 0.6053097345132743, "frac_reward_zero_std": 0.5, "grad_norm": 0.5307300561999599, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 173347328.0, "reward": 0.7857421636581421, "reward_std": 0.12376996129751205, "rewards/qatch_small_update_with_fm/mean": 0.7857421636581421, "rewards/qatch_small_update_with_fm/std": 0.3143363893032074, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.998528003692627, "sampling/importance_sampling_ratio/min": 0.008697294630110264, "sampling/sampling_logp_difference/max": 4.744743347167969, "sampling/sampling_logp_difference/mean": 0.10505607724189758, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 583.9375, "completions/mean_terminated_length": 570.1647338867188, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.13550461642444134, "epoch": 0.6070796460176991, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6604555736712372, "learning_rate": 1e-06, "loss": -0.0578, "num_tokens": 173950400.0, "reward": 0.710085928440094, "reward_std": 0.14106522500514984, "rewards/qatch_small_update_with_fm/mean": 0.710085928440094, "rewards/qatch_small_update_with_fm/std": 0.34384334087371826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9940401315689087, "sampling/importance_sampling_ratio/min": 0.0029973003547638655, "sampling/sampling_logp_difference/max": 5.8100433349609375, "sampling/sampling_logp_difference/mean": 0.10811758041381836, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3297.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 553.1171875, "completions/mean_terminated_length": 553.1171875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.12964780256152153, "epoch": 0.6088495575221239, "frac_reward_zero_std": 0.625, "grad_norm": 0.4294082866965215, "learning_rate": 1e-06, "loss": 0.0217, "num_tokens": 174738366.0, "reward": 0.669113278388977, "reward_std": 0.07132252305746078, "rewards/qatch_small_update_with_fm/mean": 0.669113278388977, "rewards/qatch_small_update_with_fm/std": 0.3960440456867218, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9974240064620972, "sampling/importance_sampling_ratio/min": 0.011154402047395706, "sampling/sampling_logp_difference/max": 4.4959211349487305, "sampling/sampling_logp_difference/mean": 0.0989176407456398, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 496.5703125, "completions/mean_terminated_length": 496.5703125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.1358927357941866, "epoch": 0.6106194690265486, "frac_reward_zero_std": 0.5, "grad_norm": 0.5797395111181823, "learning_rate": 1e-06, "loss": -0.0251, "num_tokens": 175206480.0, "reward": 0.7895351648330688, "reward_std": 0.13384278118610382, "rewards/qatch_small_update_with_fm/mean": 0.7895351648330688, "rewards/qatch_small_update_with_fm/std": 0.3367822766304016, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9962368011474609, "sampling/importance_sampling_ratio/min": 0.003333539701998234, "sampling/sampling_logp_difference/max": 5.703720569610596, "sampling/sampling_logp_difference/mean": 0.10676106810569763, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3889.0, "completions/mean_length": 659.578125, "completions/mean_terminated_length": 605.0317993164062, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.16065522283315659, "epoch": 0.6123893805309735, "frac_reward_zero_std": 0.6875, "grad_norm": 0.36081780370740074, "learning_rate": 1e-06, "loss": -0.0681, "num_tokens": 175694948.0, "reward": 0.8195507526397705, "reward_std": 0.07761920243501663, "rewards/qatch_small_update_with_fm/mean": 0.8195507526397705, "rewards/qatch_small_update_with_fm/std": 0.339427649974823, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0015735626220703, "sampling/importance_sampling_ratio/min": 0.005800637882202864, "sampling/sampling_logp_difference/max": 5.149787425994873, "sampling/sampling_logp_difference/mean": 0.1185375228524208, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 381.00390625, "completions/mean_terminated_length": 381.00390625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.11013178713619709, "epoch": 0.6141592920353982, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5357087422630683, "learning_rate": 1e-06, "loss": 0.0389, "num_tokens": 176272869.0, "reward": 0.8548632860183716, "reward_std": 0.0601215660572052, "rewards/qatch_small_update_with_fm/mean": 0.8548632860183716, "rewards/qatch_small_update_with_fm/std": 0.3249365985393524, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.992617666721344, "sampling/importance_sampling_ratio/min": 0.011489858850836754, "sampling/sampling_logp_difference/max": 4.466290473937988, "sampling/sampling_logp_difference/mean": 0.09646125137805939, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4025.0, "completions/mean_length": 812.734375, "completions/mean_terminated_length": 799.85888671875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.20528638921678066, "epoch": 0.6159292035398231, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4169959713473217, "learning_rate": 1e-06, "loss": 0.0103, "num_tokens": 176961297.0, "reward": 0.5849062204360962, "reward_std": 0.1079559475183487, "rewards/qatch_small_update_with_fm/mean": 0.5849062204360962, "rewards/qatch_small_update_with_fm/std": 0.3221876919269562, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.005646824836731, "sampling/importance_sampling_ratio/min": 4.014266960439272e-05, "sampling/sampling_logp_difference/max": 10.12307071685791, "sampling/sampling_logp_difference/mean": 0.13561221957206726, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 563.3828125, "completions/mean_terminated_length": 549.5294189453125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.137452632188797, "epoch": 0.6176991150442478, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5080168937055006, "learning_rate": 1e-06, "loss": -0.0059, "num_tokens": 177646515.0, "reward": 0.7648085951805115, "reward_std": 0.14244134724140167, "rewards/qatch_small_update_with_fm/mean": 0.7648086547851562, "rewards/qatch_small_update_with_fm/std": 0.31815099716186523, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9965197443962097, "sampling/importance_sampling_ratio/min": 0.010515945963561535, "sampling/sampling_logp_difference/max": 4.5548624992370605, "sampling/sampling_logp_difference/mean": 0.10763923823833466, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 504.24609375, "completions/mean_terminated_length": 504.24609375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.14669188857078552, "epoch": 0.6194690265486725, "frac_reward_zero_std": 0.6875, "grad_norm": 0.46991361682403926, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 178274386.0, "reward": 0.7633749842643738, "reward_std": 0.0714079737663269, "rewards/qatch_small_update_with_fm/mean": 0.7633749842643738, "rewards/qatch_small_update_with_fm/std": 0.3283040523529053, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9949438571929932, "sampling/importance_sampling_ratio/min": 0.005583168473094702, "sampling/sampling_logp_difference/max": 5.1879987716674805, "sampling/sampling_logp_difference/mean": 0.11546455323696136, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 597.47265625, "completions/mean_terminated_length": 597.47265625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.15416298992931843, "epoch": 0.6212389380530974, "frac_reward_zero_std": 0.625, "grad_norm": 0.4550888098901701, "learning_rate": 1e-06, "loss": -0.0181, "num_tokens": 178902139.0, "reward": 0.7233320474624634, "reward_std": 0.09315317869186401, "rewards/qatch_small_update_with_fm/mean": 0.7233320474624634, "rewards/qatch_small_update_with_fm/std": 0.38610121607780457, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9959432482719421, "sampling/importance_sampling_ratio/min": 0.0018164917128160596, "sampling/sampling_logp_difference/max": 6.310848236083984, "sampling/sampling_logp_difference/mean": 0.11840873956680298, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3230.0, "completions/max_terminated_length": 3230.0, "completions/mean_length": 491.67578125, "completions/mean_terminated_length": 491.67578125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.17086774297058582, "epoch": 0.6230088495575221, "frac_reward_zero_std": 0.625, "grad_norm": 0.41309780113542105, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 179407928.0, "reward": 0.8434453010559082, "reward_std": 0.09375521540641785, "rewards/qatch_small_update_with_fm/mean": 0.8434453010559082, "rewards/qatch_small_update_with_fm/std": 0.25720974802970886, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9986539483070374, "sampling/importance_sampling_ratio/min": 0.0020053053740411997, "sampling/sampling_logp_difference/max": 6.211958885192871, "sampling/sampling_logp_difference/mean": 0.12576082348823547, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 548.05078125, "completions/mean_terminated_length": 548.05078125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.15240298677235842, "epoch": 0.6247787610619469, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4873828861345403, "learning_rate": 1e-06, "loss": 0.026, "num_tokens": 179888837.0, "reward": 0.6651484370231628, "reward_std": 0.07561331987380981, "rewards/qatch_small_update_with_fm/mean": 0.6651483774185181, "rewards/qatch_small_update_with_fm/std": 0.39603519439697266, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9961607456207275, "sampling/importance_sampling_ratio/min": 0.002665693871676922, "sampling/sampling_logp_difference/max": 5.927290916442871, "sampling/sampling_logp_difference/mean": 0.11823346465826035, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 412.3203125, "completions/mean_terminated_length": 412.3203125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.12977661192417145, "epoch": 0.6265486725663717, "frac_reward_zero_std": 0.875, "grad_norm": 0.320145515397183, "learning_rate": 1e-06, "loss": -0.0142, "num_tokens": 180399383.0, "reward": 0.9061718583106995, "reward_std": 0.0416874997317791, "rewards/qatch_small_update_with_fm/mean": 0.9061718583106995, "rewards/qatch_small_update_with_fm/std": 0.20859557390213013, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925155639648438, "sampling/importance_sampling_ratio/min": 0.01432258915156126, "sampling/sampling_logp_difference/max": 4.245917320251465, "sampling/sampling_logp_difference/mean": 0.11004288494586945, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 414.60546875, "completions/mean_terminated_length": 414.60546875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.1346904393285513, "epoch": 0.6283185840707964, "frac_reward_zero_std": 0.6875, "grad_norm": 0.47310823738569224, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 180899346.0, "reward": 0.8244647979736328, "reward_std": 0.08044957369565964, "rewards/qatch_small_update_with_fm/mean": 0.8244647979736328, "rewards/qatch_small_update_with_fm/std": 0.3035234212875366, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934406876564026, "sampling/importance_sampling_ratio/min": 0.012023287825286388, "sampling/sampling_logp_difference/max": 4.420909881591797, "sampling/sampling_logp_difference/mean": 0.10992369055747986, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1923.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 498.35546875, "completions/mean_terminated_length": 498.35546875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.13894802145659924, "epoch": 0.6300884955752213, "frac_reward_zero_std": 0.625, "grad_norm": 0.5291274570939486, "learning_rate": 1e-06, "loss": 0.0486, "num_tokens": 181416381.0, "reward": 0.8441094160079956, "reward_std": 0.10105187445878983, "rewards/qatch_small_update_with_fm/mean": 0.8441094160079956, "rewards/qatch_small_update_with_fm/std": 0.28807127475738525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9959750175476074, "sampling/importance_sampling_ratio/min": 0.00495093734934926, "sampling/sampling_logp_difference/max": 5.308178424835205, "sampling/sampling_logp_difference/mean": 0.10719551891088486, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 503.2890625, "completions/mean_terminated_length": 489.2000427246094, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.13633199036121368, "epoch": 0.631858407079646, "frac_reward_zero_std": 0.625, "grad_norm": 0.42358638576867164, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 181996903.0, "reward": 0.8239570260047913, "reward_std": 0.07967139035463333, "rewards/qatch_small_update_with_fm/mean": 0.8239570260047913, "rewards/qatch_small_update_with_fm/std": 0.296528160572052, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9955064058303833, "sampling/importance_sampling_ratio/min": 0.004409489221870899, "sampling/sampling_logp_difference/max": 5.423996448516846, "sampling/sampling_logp_difference/mean": 0.1080026775598526, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2529.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 482.41015625, "completions/mean_terminated_length": 482.41015625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.1289900867268443, "epoch": 0.6336283185840708, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5076216355009244, "learning_rate": 1e-06, "loss": 0.038, "num_tokens": 182562896.0, "reward": 0.7308008074760437, "reward_std": 0.08840906620025635, "rewards/qatch_small_update_with_fm/mean": 0.7308008074760437, "rewards/qatch_small_update_with_fm/std": 0.3640395700931549, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9929482936859131, "sampling/importance_sampling_ratio/min": 0.006888894364237785, "sampling/sampling_logp_difference/max": 4.977844715118408, "sampling/sampling_logp_difference/mean": 0.1100352555513382, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 491.66796875, "completions/mean_terminated_length": 491.66796875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1540286112576723, "epoch": 0.6353982300884956, "frac_reward_zero_std": 0.625, "grad_norm": 0.5294897566023161, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 183366587.0, "reward": 0.8129062652587891, "reward_std": 0.08059161901473999, "rewards/qatch_small_update_with_fm/mean": 0.8129062652587891, "rewards/qatch_small_update_with_fm/std": 0.2822462320327759, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9948251247406006, "sampling/importance_sampling_ratio/min": 0.0018911922816187143, "sampling/sampling_logp_difference/max": 6.270547866821289, "sampling/sampling_logp_difference/mean": 0.12115992605686188, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 364.6171875, "completions/mean_terminated_length": 364.6171875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.10692374687641859, "epoch": 0.6371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 0.47798261049620744, "learning_rate": 1e-06, "loss": -0.0321, "num_tokens": 183851897.0, "reward": 0.8864530920982361, "reward_std": 0.05340360850095749, "rewards/qatch_small_update_with_fm/mean": 0.8864530920982361, "rewards/qatch_small_update_with_fm/std": 0.2327709197998047, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.992672324180603, "sampling/importance_sampling_ratio/min": 0.011662163771688938, "sampling/sampling_logp_difference/max": 4.4514055252075195, "sampling/sampling_logp_difference/mean": 0.09532105177640915, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 574.52734375, "completions/mean_terminated_length": 574.52734375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.1734640970826149, "epoch": 0.6389380530973451, "frac_reward_zero_std": 0.5, "grad_norm": 0.5221913862946522, "learning_rate": 1e-06, "loss": -0.0231, "num_tokens": 184408144.0, "reward": 0.7384335994720459, "reward_std": 0.11242429167032242, "rewards/qatch_small_update_with_fm/mean": 0.7384335398674011, "rewards/qatch_small_update_with_fm/std": 0.3646511435508728, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003238916397095, "sampling/importance_sampling_ratio/min": 0.00335796014405787, "sampling/sampling_logp_difference/max": 5.6964216232299805, "sampling/sampling_logp_difference/mean": 0.12429113686084747, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 401.9921875, "completions/mean_terminated_length": 401.9921875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.1182346660643816, "epoch": 0.6407079646017699, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4427870385280626, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 184909566.0, "reward": 0.8384257555007935, "reward_std": 0.05273023992776871, "rewards/qatch_small_update_with_fm/mean": 0.8384257555007935, "rewards/qatch_small_update_with_fm/std": 0.2808656394481659, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9951916933059692, "sampling/importance_sampling_ratio/min": 0.004122166894376278, "sampling/sampling_logp_difference/max": 5.4913763999938965, "sampling/sampling_logp_difference/mean": 0.0995698869228363, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3443.0, "completions/mean_length": 689.7421875, "completions/mean_terminated_length": 662.9212646484375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.18941612169146538, "epoch": 0.6424778761061947, "frac_reward_zero_std": 0.625, "grad_norm": 0.39168740182060785, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 185508332.0, "reward": 0.7280429601669312, "reward_std": 0.10673657059669495, "rewards/qatch_small_update_with_fm/mean": 0.7280429601669312, "rewards/qatch_small_update_with_fm/std": 0.37747400999069214, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0072745084762573, "sampling/importance_sampling_ratio/min": 0.00017696816939860582, "sampling/sampling_logp_difference/max": 8.639540672302246, "sampling/sampling_logp_difference/mean": 0.12284526973962784, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 395.875, "completions/mean_terminated_length": 395.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.13422376941889524, "epoch": 0.6442477876106195, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5339106021835213, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 186070620.0, "reward": 0.8596680164337158, "reward_std": 0.05402027443051338, "rewards/qatch_small_update_with_fm/mean": 0.8596680164337158, "rewards/qatch_small_update_with_fm/std": 0.27827054262161255, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9923632144927979, "sampling/importance_sampling_ratio/min": 0.005285664927214384, "sampling/sampling_logp_difference/max": 5.2427568435668945, "sampling/sampling_logp_difference/mean": 0.11187142133712769, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 563.1640625, "completions/mean_terminated_length": 563.1640625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.1712171956896782, "epoch": 0.6460176991150443, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4925582892442974, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 186725974.0, "reward": 0.7571640014648438, "reward_std": 0.11262641847133636, "rewards/qatch_small_update_with_fm/mean": 0.7571640014648438, "rewards/qatch_small_update_with_fm/std": 0.3311219811439514, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9971981048583984, "sampling/importance_sampling_ratio/min": 0.0032281808089464903, "sampling/sampling_logp_difference/max": 5.735836505889893, "sampling/sampling_logp_difference/mean": 0.1241505891084671, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3207.0, "completions/max_terminated_length": 3207.0, "completions/mean_length": 530.6640625, "completions/mean_terminated_length": 530.6640625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.13621144648641348, "epoch": 0.647787610619469, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5985888632848992, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 187224176.0, "reward": 0.7640078067779541, "reward_std": 0.10024788975715637, "rewards/qatch_small_update_with_fm/mean": 0.7640078067779541, "rewards/qatch_small_update_with_fm/std": 0.3289325535297394, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9985411167144775, "sampling/importance_sampling_ratio/min": 0.011255462653934956, "sampling/sampling_logp_difference/max": 4.486901760101318, "sampling/sampling_logp_difference/mean": 0.10325411707162857, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3778.0, "completions/mean_length": 547.9375, "completions/mean_terminated_length": 491.61907958984375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.12220047693699598, "epoch": 0.6495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.32769796643092136, "learning_rate": 1e-06, "loss": -0.0868, "num_tokens": 187659680.0, "reward": 0.7751250267028809, "reward_std": 0.06956201791763306, "rewards/qatch_small_update_with_fm/mean": 0.7751250267028809, "rewards/qatch_small_update_with_fm/std": 0.36153918504714966, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9963778257369995, "sampling/importance_sampling_ratio/min": 0.018411830067634583, "sampling/sampling_logp_difference/max": 3.9947619438171387, "sampling/sampling_logp_difference/mean": 0.10011971741914749, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3122.0, "completions/mean_length": 734.4375, "completions/mean_terminated_length": 667.47412109375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.16884157061576843, "epoch": 0.6513274336283186, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4418356024268668, "learning_rate": 1e-06, "loss": -0.0543, "num_tokens": 188319328.0, "reward": 0.7149375081062317, "reward_std": 0.09334467351436615, "rewards/qatch_small_update_with_fm/mean": 0.7149375081062317, "rewards/qatch_small_update_with_fm/std": 0.3463698923587799, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000972747802734, "sampling/importance_sampling_ratio/min": 0.01852019503712654, "sampling/sampling_logp_difference/max": 3.988893508911133, "sampling/sampling_logp_difference/mean": 0.11699914187192917, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2181.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 502.859375, "completions/mean_terminated_length": 502.859375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.12519878055900335, "epoch": 0.6530973451327433, "frac_reward_zero_std": 0.375, "grad_norm": 0.6438590645763488, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 188984524.0, "reward": 0.8615351319313049, "reward_std": 0.11218338459730148, "rewards/qatch_small_update_with_fm/mean": 0.8615351319313049, "rewards/qatch_small_update_with_fm/std": 0.24655072391033173, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9942902326583862, "sampling/importance_sampling_ratio/min": 0.01430963259190321, "sampling/sampling_logp_difference/max": 4.246822357177734, "sampling/sampling_logp_difference/mean": 0.09988274425268173, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3375.0, "completions/max_terminated_length": 3375.0, "completions/mean_length": 547.31640625, "completions/mean_terminated_length": 547.31640625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.14965487271547318, "epoch": 0.6548672566371682, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4595165561544715, "learning_rate": 1e-06, "loss": 0.0728, "num_tokens": 189394541.0, "reward": 0.715039074420929, "reward_std": 0.044145166873931885, "rewards/qatch_small_update_with_fm/mean": 0.715039074420929, "rewards/qatch_small_update_with_fm/std": 0.35467761754989624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9968997240066528, "sampling/importance_sampling_ratio/min": 0.001756817102432251, "sampling/sampling_logp_difference/max": 6.34425163269043, "sampling/sampling_logp_difference/mean": 0.11401140689849854, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2255.0, "completions/max_terminated_length": 2255.0, "completions/mean_length": 495.32421875, "completions/mean_terminated_length": 495.32421875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.14836543146520853, "epoch": 0.6566371681415929, "frac_reward_zero_std": 0.625, "grad_norm": 0.4925656414759849, "learning_rate": 1e-06, "loss": 0.0385, "num_tokens": 189856128.0, "reward": 0.6013593673706055, "reward_std": 0.09321607649326324, "rewards/qatch_small_update_with_fm/mean": 0.6013593673706055, "rewards/qatch_small_update_with_fm/std": 0.39479532837867737, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9953794479370117, "sampling/importance_sampling_ratio/min": 0.001509586232714355, "sampling/sampling_logp_difference/max": 6.495919704437256, "sampling/sampling_logp_difference/mean": 0.11514382064342499, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 617.25, "completions/mean_terminated_length": 617.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.16911724209785461, "epoch": 0.6584070796460177, "frac_reward_zero_std": 0.625, "grad_norm": 0.420032439150815, "learning_rate": 1e-06, "loss": -0.0094, "num_tokens": 190641648.0, "reward": 0.7167304754257202, "reward_std": 0.07554842531681061, "rewards/qatch_small_update_with_fm/mean": 0.7167304754257202, "rewards/qatch_small_update_with_fm/std": 0.32102170586586, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9969842433929443, "sampling/importance_sampling_ratio/min": 0.0024064192548394203, "sampling/sampling_logp_difference/max": 6.02961540222168, "sampling/sampling_logp_difference/mean": 0.12394626438617706, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2548.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 746.92578125, "completions/mean_terminated_length": 746.92578125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.18326103128492832, "epoch": 0.6601769911504425, "frac_reward_zero_std": 0.5625, "grad_norm": 0.34790702562535814, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 191207677.0, "reward": 0.6764101386070251, "reward_std": 0.1258085072040558, "rewards/qatch_small_update_with_fm/mean": 0.6764101386070251, "rewards/qatch_small_update_with_fm/std": 0.3671666085720062, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0023778676986694, "sampling/importance_sampling_ratio/min": 0.008668403141200542, "sampling/sampling_logp_difference/max": 4.74807071685791, "sampling/sampling_logp_difference/mean": 0.12471143901348114, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 511.9453125, "completions/mean_terminated_length": 511.9453125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.13981572352349758, "epoch": 0.6619469026548672, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4099991695151121, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 191821791.0, "reward": 0.8004882335662842, "reward_std": 0.050791338086128235, "rewards/qatch_small_update_with_fm/mean": 0.8004882335662842, "rewards/qatch_small_update_with_fm/std": 0.3645688593387604, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9944133758544922, "sampling/importance_sampling_ratio/min": 0.008987173438072205, "sampling/sampling_logp_difference/max": 4.711956977844238, "sampling/sampling_logp_difference/mean": 0.11034908890724182, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 566.328125, "completions/mean_terminated_length": 566.328125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.15874271467328072, "epoch": 0.6637168141592921, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4275311076393409, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 192456643.0, "reward": 0.7895312309265137, "reward_std": 0.08289177715778351, "rewards/qatch_small_update_with_fm/mean": 0.7895312309265137, "rewards/qatch_small_update_with_fm/std": 0.3061412274837494, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9964075684547424, "sampling/importance_sampling_ratio/min": 0.018361065536737442, "sampling/sampling_logp_difference/max": 3.9975228309631348, "sampling/sampling_logp_difference/mean": 0.11977240443229675, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4086.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 692.05859375, "completions/mean_terminated_length": 692.05859375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.17270477302372456, "epoch": 0.6654867256637168, "frac_reward_zero_std": 0.625, "grad_norm": 0.38419500050306277, "learning_rate": 1e-06, "loss": 0.0907, "num_tokens": 193035442.0, "reward": 0.6908007860183716, "reward_std": 0.07150629162788391, "rewards/qatch_small_update_with_fm/mean": 0.6908007860183716, "rewards/qatch_small_update_with_fm/std": 0.34826797246932983, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000948905944824, "sampling/importance_sampling_ratio/min": 4.4799104870207884e-08, "sampling/sampling_logp_difference/max": 16.921077728271484, "sampling/sampling_logp_difference/mean": 0.12002436816692352, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1786.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 507.24609375, "completions/mean_terminated_length": 507.24609375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.12251328956335783, "epoch": 0.6672566371681415, "frac_reward_zero_std": 0.625, "grad_norm": 0.34979100036660665, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 193707233.0, "reward": 0.8660585880279541, "reward_std": 0.09433520585298538, "rewards/qatch_small_update_with_fm/mean": 0.8660585880279541, "rewards/qatch_small_update_with_fm/std": 0.25607553124427795, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9955108761787415, "sampling/importance_sampling_ratio/min": 0.008815412409603596, "sampling/sampling_logp_difference/max": 4.731253623962402, "sampling/sampling_logp_difference/mean": 0.09934522956609726, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2020.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 620.8125, "completions/mean_terminated_length": 620.8125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.1559175457805395, "epoch": 0.6690265486725664, "frac_reward_zero_std": 0.6875, "grad_norm": 0.33073547774063494, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 194216241.0, "reward": 0.7509804964065552, "reward_std": 0.0672738254070282, "rewards/qatch_small_update_with_fm/mean": 0.7509804368019104, "rewards/qatch_small_update_with_fm/std": 0.3491855263710022, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9985901117324829, "sampling/importance_sampling_ratio/min": 0.00598114961758256, "sampling/sampling_logp_difference/max": 5.119142532348633, "sampling/sampling_logp_difference/mean": 0.11180153489112854, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2756.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 540.3046875, "completions/mean_terminated_length": 540.3046875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.1661166436970234, "epoch": 0.6707964601769911, "frac_reward_zero_std": 0.8125, "grad_norm": 0.33195648875070566, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 194831391.0, "reward": 0.7810038924217224, "reward_std": 0.04923711344599724, "rewards/qatch_small_update_with_fm/mean": 0.7810038924217224, "rewards/qatch_small_update_with_fm/std": 0.33204519748687744, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9990555047988892, "sampling/importance_sampling_ratio/min": 0.009403667412698269, "sampling/sampling_logp_difference/max": 4.666655540466309, "sampling/sampling_logp_difference/mean": 0.11909811943769455, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 474.10546875, "completions/mean_terminated_length": 474.10546875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.14898390136659145, "epoch": 0.672566371681416, "frac_reward_zero_std": 0.6875, "grad_norm": 0.40024858962846094, "learning_rate": 1e-06, "loss": -0.0093, "num_tokens": 195477002.0, "reward": 0.8072617053985596, "reward_std": 0.06418918073177338, "rewards/qatch_small_update_with_fm/mean": 0.8072617053985596, "rewards/qatch_small_update_with_fm/std": 0.3275093734264374, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9951503276824951, "sampling/importance_sampling_ratio/min": 0.018497345969080925, "sampling/sampling_logp_difference/max": 3.9901280403137207, "sampling/sampling_logp_difference/mean": 0.11574701964855194, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 454.40234375, "completions/mean_terminated_length": 454.40234375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.1413791086524725, "epoch": 0.6743362831858407, "frac_reward_zero_std": 0.6875, "grad_norm": 0.38007691311421254, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 195954881.0, "reward": 0.744070291519165, "reward_std": 0.05227509140968323, "rewards/qatch_small_update_with_fm/mean": 0.744070291519165, "rewards/qatch_small_update_with_fm/std": 0.3730511963367462, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9953719973564148, "sampling/importance_sampling_ratio/min": 0.0025141651276499033, "sampling/sampling_logp_difference/max": 5.985814571380615, "sampling/sampling_logp_difference/mean": 0.11235985904932022, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 510.7578125, "completions/mean_terminated_length": 510.7578125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.14344297163188457, "epoch": 0.6761061946902654, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4625180428801349, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 196464291.0, "reward": 0.816574215888977, "reward_std": 0.15176716446876526, "rewards/qatch_small_update_with_fm/mean": 0.816574215888977, "rewards/qatch_small_update_with_fm/std": 0.3127204179763794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9949458837509155, "sampling/importance_sampling_ratio/min": 0.006565525196492672, "sampling/sampling_logp_difference/max": 5.025922775268555, "sampling/sampling_logp_difference/mean": 0.11291942000389099, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2107.0, "completions/max_terminated_length": 2107.0, "completions/mean_length": 583.234375, "completions/mean_terminated_length": 583.234375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.15638727508485317, "epoch": 0.6778761061946903, "frac_reward_zero_std": 0.5, "grad_norm": 0.45746226742929946, "learning_rate": 1e-06, "loss": -0.0288, "num_tokens": 197120607.0, "reward": 0.6001679301261902, "reward_std": 0.12643268704414368, "rewards/qatch_small_update_with_fm/mean": 0.600167989730835, "rewards/qatch_small_update_with_fm/std": 0.3939620852470398, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9973058700561523, "sampling/importance_sampling_ratio/min": 0.011168883182108402, "sampling/sampling_logp_difference/max": 4.49462366104126, "sampling/sampling_logp_difference/mean": 0.1140265166759491, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 478.71875, "completions/mean_terminated_length": 478.71875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.14543859846889973, "epoch": 0.679646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.3538203767911081, "learning_rate": 1e-06, "loss": -0.012, "num_tokens": 197602663.0, "reward": 0.7854375243186951, "reward_std": 0.028378454968333244, "rewards/qatch_small_update_with_fm/mean": 0.7854375243186951, "rewards/qatch_small_update_with_fm/std": 0.32659393548965454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9973599314689636, "sampling/importance_sampling_ratio/min": 1.096164487535134e-05, "sampling/sampling_logp_difference/max": 11.42110824584961, "sampling/sampling_logp_difference/mean": 0.11048808693885803, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2174.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 486.6171875, "completions/mean_terminated_length": 486.6171875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.14200040139257908, "epoch": 0.6814159292035398, "frac_reward_zero_std": 0.8125, "grad_norm": 0.44083436912740653, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 198074565.0, "reward": 0.808886706829071, "reward_std": 0.026976868510246277, "rewards/qatch_small_update_with_fm/mean": 0.808886706829071, "rewards/qatch_small_update_with_fm/std": 0.3425396680831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9965760707855225, "sampling/importance_sampling_ratio/min": 0.008726546540856361, "sampling/sampling_logp_difference/max": 4.741385459899902, "sampling/sampling_logp_difference/mean": 0.10850890725851059, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 455.9921875, "completions/mean_terminated_length": 455.9921875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.1441742181777954, "epoch": 0.6831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 0.5614946533696555, "learning_rate": 1e-06, "loss": 0.0259, "num_tokens": 198597955.0, "reward": 0.8295273780822754, "reward_std": 0.09427762776613235, "rewards/qatch_small_update_with_fm/mean": 0.8295273780822754, "rewards/qatch_small_update_with_fm/std": 0.2936692535877228, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9962972402572632, "sampling/importance_sampling_ratio/min": 0.013299347832798958, "sampling/sampling_logp_difference/max": 4.320040225982666, "sampling/sampling_logp_difference/mean": 0.1088903546333313, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3817.0, "completions/max_terminated_length": 3817.0, "completions/mean_length": 589.7578125, "completions/mean_terminated_length": 589.7578125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.15512034017592669, "epoch": 0.6849557522123894, "frac_reward_zero_std": 0.625, "grad_norm": 0.439384354195262, "learning_rate": 1e-06, "loss": -0.0228, "num_tokens": 198996725.0, "reward": 0.7426366806030273, "reward_std": 0.030658302828669548, "rewards/qatch_small_update_with_fm/mean": 0.7426366806030273, "rewards/qatch_small_update_with_fm/std": 0.35217705368995667, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001318097114563, "sampling/importance_sampling_ratio/min": 0.004440457094460726, "sampling/sampling_logp_difference/max": 5.416997909545898, "sampling/sampling_logp_difference/mean": 0.11081845313310623, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 464.1015625, "completions/mean_terminated_length": 464.1015625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.1471645962446928, "epoch": 0.6867256637168142, "frac_reward_zero_std": 0.625, "grad_norm": 0.530064088610342, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 199847519.0, "reward": 0.7159297466278076, "reward_std": 0.1021965891122818, "rewards/qatch_small_update_with_fm/mean": 0.7159296870231628, "rewards/qatch_small_update_with_fm/std": 0.36042875051498413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9948940277099609, "sampling/importance_sampling_ratio/min": 0.006773307919502258, "sampling/sampling_logp_difference/max": 4.994765758514404, "sampling/sampling_logp_difference/mean": 0.11302639544010162, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 392.51953125, "completions/mean_terminated_length": 392.51953125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.12604860868304968, "epoch": 0.6884955752212389, "frac_reward_zero_std": 0.5, "grad_norm": 0.5616242030778377, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 200272052.0, "reward": 0.8972305059432983, "reward_std": 0.11663605272769928, "rewards/qatch_small_update_with_fm/mean": 0.8972305059432983, "rewards/qatch_small_update_with_fm/std": 0.24549192190170288, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9938252568244934, "sampling/importance_sampling_ratio/min": 0.023648742586374283, "sampling/sampling_logp_difference/max": 3.744445323944092, "sampling/sampling_logp_difference/mean": 0.10341814160346985, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 519.9296875, "completions/mean_terminated_length": 519.9296875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.17424634657800198, "epoch": 0.6902654867256637, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5942977436585406, "learning_rate": 1e-06, "loss": -0.0269, "num_tokens": 200742274.0, "reward": 0.8007890582084656, "reward_std": 0.07194599509239197, "rewards/qatch_small_update_with_fm/mean": 0.8007890582084656, "rewards/qatch_small_update_with_fm/std": 0.29960644245147705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002765655517578, "sampling/importance_sampling_ratio/min": 0.008807619102299213, "sampling/sampling_logp_difference/max": 4.732138156890869, "sampling/sampling_logp_difference/mean": 0.12190736830234528, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 522.78515625, "completions/mean_terminated_length": 522.78515625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.16362570971250534, "epoch": 0.6920353982300885, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5138882733479488, "learning_rate": 1e-06, "loss": 0.0653, "num_tokens": 201239675.0, "reward": 0.76220703125, "reward_std": 0.05660145357251167, "rewards/qatch_small_update_with_fm/mean": 0.76220703125, "rewards/qatch_small_update_with_fm/std": 0.3293953239917755, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004456043243408, "sampling/importance_sampling_ratio/min": 0.014311004430055618, "sampling/sampling_logp_difference/max": 4.2467265129089355, "sampling/sampling_logp_difference/mean": 0.11852040886878967, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 642.67578125, "completions/mean_terminated_length": 642.67578125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.17715238966047764, "epoch": 0.6938053097345133, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3121151290906744, "learning_rate": 1e-06, "loss": -0.0272, "num_tokens": 201780376.0, "reward": 0.684402346611023, "reward_std": 0.05445664003491402, "rewards/qatch_small_update_with_fm/mean": 0.684402346611023, "rewards/qatch_small_update_with_fm/std": 0.3712097406387329, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992333650588989, "sampling/importance_sampling_ratio/min": 0.0003269147127866745, "sampling/sampling_logp_difference/max": 8.025811195373535, "sampling/sampling_logp_difference/mean": 0.1236954778432846, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 452.3359375, "completions/mean_terminated_length": 452.3359375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.1450560800731182, "epoch": 0.695575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.34599328815634517, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 202230654.0, "reward": 0.8244608640670776, "reward_std": 0.06429094076156616, "rewards/qatch_small_update_with_fm/mean": 0.8244609236717224, "rewards/qatch_small_update_with_fm/std": 0.2881889343261719, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9949817061424255, "sampling/importance_sampling_ratio/min": 0.0028936576563864946, "sampling/sampling_logp_difference/max": 5.845233917236328, "sampling/sampling_logp_difference/mean": 0.11141933500766754, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 703.48828125, "completions/mean_terminated_length": 608.116455078125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1723710596561432, "epoch": 0.6973451327433628, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5065433606281433, "learning_rate": 1e-06, "loss": -0.051, "num_tokens": 202864043.0, "reward": 0.6551952958106995, "reward_std": 0.12935110926628113, "rewards/qatch_small_update_with_fm/mean": 0.6551952958106995, "rewards/qatch_small_update_with_fm/std": 0.41460785269737244, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006225109100342, "sampling/importance_sampling_ratio/min": 0.008855162188410759, "sampling/sampling_logp_difference/max": 4.726754665374756, "sampling/sampling_logp_difference/mean": 0.12057724595069885, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 631.3828125, "completions/mean_terminated_length": 631.3828125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.2066683303564787, "epoch": 0.6991150442477876, "frac_reward_zero_std": 0.5625, "grad_norm": 0.43477121030532406, "learning_rate": 1e-06, "loss": 0.0352, "num_tokens": 203379229.0, "reward": 0.6926288604736328, "reward_std": 0.0851493626832962, "rewards/qatch_small_update_with_fm/mean": 0.6926288604736328, "rewards/qatch_small_update_with_fm/std": 0.39662235975265503, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0038108825683594, "sampling/importance_sampling_ratio/min": 0.005343807861208916, "sampling/sampling_logp_difference/max": 5.23181676864624, "sampling/sampling_logp_difference/mean": 0.13774800300598145, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 419.71484375, "completions/mean_terminated_length": 419.71484375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.13268938660621643, "epoch": 0.7008849557522124, "frac_reward_zero_std": 0.625, "grad_norm": 0.4484007539037055, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 203883556.0, "reward": 0.8229648470878601, "reward_std": 0.05420804023742676, "rewards/qatch_small_update_with_fm/mean": 0.8229647874832153, "rewards/qatch_small_update_with_fm/std": 0.3085273504257202, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9944854378700256, "sampling/importance_sampling_ratio/min": 0.004351816605776548, "sampling/sampling_logp_difference/max": 5.437161922454834, "sampling/sampling_logp_difference/mean": 0.10334019362926483, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 561.34375, "completions/mean_terminated_length": 561.34375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.1825061459094286, "epoch": 0.7026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 0.3502005182299795, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 204624508.0, "reward": 0.6012461185455322, "reward_std": 0.07029339671134949, "rewards/qatch_small_update_with_fm/mean": 0.6012461185455322, "rewards/qatch_small_update_with_fm/std": 0.3837004601955414, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9966017603874207, "sampling/importance_sampling_ratio/min": 0.00334866507910192, "sampling/sampling_logp_difference/max": 5.699193477630615, "sampling/sampling_logp_difference/mean": 0.12955331802368164, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 466.14453125, "completions/mean_terminated_length": 466.14453125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.15828338265419006, "epoch": 0.7044247787610619, "frac_reward_zero_std": 0.625, "grad_norm": 0.4978733228091175, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 205080401.0, "reward": 0.8767499923706055, "reward_std": 0.0694652646780014, "rewards/qatch_small_update_with_fm/mean": 0.8767499923706055, "rewards/qatch_small_update_with_fm/std": 0.2577313780784607, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.998377799987793, "sampling/importance_sampling_ratio/min": 0.01460467278957367, "sampling/sampling_logp_difference/max": 4.226413726806641, "sampling/sampling_logp_difference/mean": 0.11512663960456848, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 522.0546875, "completions/mean_terminated_length": 522.0546875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.16699490696191788, "epoch": 0.7061946902654868, "frac_reward_zero_std": 0.625, "grad_norm": 0.34895092944716055, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 205534671.0, "reward": 0.7786523103713989, "reward_std": 0.07300121337175369, "rewards/qatch_small_update_with_fm/mean": 0.7786523103713989, "rewards/qatch_small_update_with_fm/std": 0.33300358057022095, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9985975027084351, "sampling/importance_sampling_ratio/min": 0.004132211208343506, "sampling/sampling_logp_difference/max": 5.488942623138428, "sampling/sampling_logp_difference/mean": 0.1197887659072876, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 557.84765625, "completions/mean_terminated_length": 557.84765625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.1648202035576105, "epoch": 0.7079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.2764157663335144, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 205939896.0, "reward": 0.6907851696014404, "reward_std": 0.03865916654467583, "rewards/qatch_small_update_with_fm/mean": 0.6907851696014404, "rewards/qatch_small_update_with_fm/std": 0.4175146222114563, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005991458892822, "sampling/importance_sampling_ratio/min": 4.522010840446455e-06, "sampling/sampling_logp_difference/max": 12.306553840637207, "sampling/sampling_logp_difference/mean": 0.11411407589912415, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 544.4453125, "completions/mean_terminated_length": 544.4453125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.20310404524207115, "epoch": 0.7097345132743362, "frac_reward_zero_std": 0.5, "grad_norm": 0.5061570416127074, "learning_rate": 1e-06, "loss": -0.0461, "num_tokens": 206369930.0, "reward": 0.675488293170929, "reward_std": 0.10496611893177032, "rewards/qatch_small_update_with_fm/mean": 0.675488293170929, "rewards/qatch_small_update_with_fm/std": 0.40421876311302185, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0022008419036865, "sampling/importance_sampling_ratio/min": 0.01850472204387188, "sampling/sampling_logp_difference/max": 3.989729404449463, "sampling/sampling_logp_difference/mean": 0.13742220401763916, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 614.6796875, "completions/mean_terminated_length": 614.6796875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.19880390912294388, "epoch": 0.7115044247787611, "frac_reward_zero_std": 0.5, "grad_norm": 0.4435035882354106, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 206997288.0, "reward": 0.6815507411956787, "reward_std": 0.11047512292861938, "rewards/qatch_small_update_with_fm/mean": 0.6815507411956787, "rewards/qatch_small_update_with_fm/std": 0.3761551082134247, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006957054138184, "sampling/importance_sampling_ratio/min": 0.010709195397794247, "sampling/sampling_logp_difference/max": 4.536652565002441, "sampling/sampling_logp_difference/mean": 0.1332876980304718, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 486.46875, "completions/mean_terminated_length": 486.46875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.18742970563471317, "epoch": 0.7132743362831858, "frac_reward_zero_std": 0.625, "grad_norm": 0.42034996504613936, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 207693248.0, "reward": 0.7741523385047913, "reward_std": 0.07138805091381073, "rewards/qatch_small_update_with_fm/mean": 0.7741523385047913, "rewards/qatch_small_update_with_fm/std": 0.31449058651924133, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010027885437012, "sampling/importance_sampling_ratio/min": 0.008775109425187111, "sampling/sampling_logp_difference/max": 4.735836029052734, "sampling/sampling_logp_difference/mean": 0.12943994998931885, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 451.75390625, "completions/mean_terminated_length": 451.75390625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1542245950549841, "epoch": 0.7150442477876107, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2923557605448827, "learning_rate": 1e-06, "loss": -0.0452, "num_tokens": 208325889.0, "reward": 0.8891288638114929, "reward_std": 0.01680476777255535, "rewards/qatch_small_update_with_fm/mean": 0.8891288638114929, "rewards/qatch_small_update_with_fm/std": 0.26092061400413513, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9982110857963562, "sampling/importance_sampling_ratio/min": 0.003301323391497135, "sampling/sampling_logp_difference/max": 5.7134318351745605, "sampling/sampling_logp_difference/mean": 0.113829106092453, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 400.30078125, "completions/mean_terminated_length": 400.30078125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.1461954489350319, "epoch": 0.7168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 0.3594464319887048, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 208893774.0, "reward": 0.7378515601158142, "reward_std": 0.06936902552843094, "rewards/qatch_small_update_with_fm/mean": 0.7378515601158142, "rewards/qatch_small_update_with_fm/std": 0.3432586193084717, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9973443746566772, "sampling/importance_sampling_ratio/min": 0.0028301975689828396, "sampling/sampling_logp_difference/max": 5.867408752441406, "sampling/sampling_logp_difference/mean": 0.11105867475271225, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3721.0, "completions/max_terminated_length": 3721.0, "completions/mean_length": 761.70703125, "completions/mean_terminated_length": 761.70703125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.21504069492220879, "epoch": 0.7185840707964601, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4598716404374558, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 209545459.0, "reward": 0.6908632516860962, "reward_std": 0.11959480494260788, "rewards/qatch_small_update_with_fm/mean": 0.690863311290741, "rewards/qatch_small_update_with_fm/std": 0.36190682649612427, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0028066635131836, "sampling/importance_sampling_ratio/min": 0.00274001806974411, "sampling/sampling_logp_difference/max": 5.8997907638549805, "sampling/sampling_logp_difference/mean": 0.13885968923568726, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 592.12109375, "completions/mean_terminated_length": 592.12109375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.19392023608088493, "epoch": 0.720353982300885, "frac_reward_zero_std": 0.75, "grad_norm": 0.2783516577547617, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 210185858.0, "reward": 0.6728867292404175, "reward_std": 0.036109741777181625, "rewards/qatch_small_update_with_fm/mean": 0.6728867292404175, "rewards/qatch_small_update_with_fm/std": 0.354897141456604, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001744031906128, "sampling/importance_sampling_ratio/min": 0.0009166671079583466, "sampling/sampling_logp_difference/max": 6.9947662353515625, "sampling/sampling_logp_difference/mean": 0.13026288151741028, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3979.0, "completions/max_terminated_length": 3979.0, "completions/mean_length": 727.93359375, "completions/mean_terminated_length": 727.93359375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.2095312885940075, "epoch": 0.7221238938053097, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5232028757397078, "learning_rate": 1e-06, "loss": 0.0376, "num_tokens": 210972561.0, "reward": 0.6885937452316284, "reward_std": 0.09573820233345032, "rewards/qatch_small_update_with_fm/mean": 0.6885937452316284, "rewards/qatch_small_update_with_fm/std": 0.3826073408126831, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0056464672088623, "sampling/importance_sampling_ratio/min": 0.006897236220538616, "sampling/sampling_logp_difference/max": 4.976634502410889, "sampling/sampling_logp_difference/mean": 0.13450539112091064, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 508.99609375, "completions/mean_terminated_length": 508.99609375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.16254041530191898, "epoch": 0.7238938053097345, "frac_reward_zero_std": 0.625, "grad_norm": 0.545358972866689, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 211426640.0, "reward": 0.8075742125511169, "reward_std": 0.07480335235595703, "rewards/qatch_small_update_with_fm/mean": 0.8075742125511169, "rewards/qatch_small_update_with_fm/std": 0.3394266963005066, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000327229499817, "sampling/importance_sampling_ratio/min": 0.006418498232960701, "sampling/sampling_logp_difference/max": 5.0485711097717285, "sampling/sampling_logp_difference/mean": 0.11342006921768188, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 615.09375, "completions/mean_terminated_length": 615.09375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.19291920214891434, "epoch": 0.7256637168141593, "frac_reward_zero_std": 0.5, "grad_norm": 0.47680289996018316, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 212253944.0, "reward": 0.6174297332763672, "reward_std": 0.11888127028942108, "rewards/qatch_small_update_with_fm/mean": 0.6174297332763672, "rewards/qatch_small_update_with_fm/std": 0.37394073605537415, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998745918273926, "sampling/importance_sampling_ratio/min": 0.005661542061716318, "sampling/sampling_logp_difference/max": 5.17405891418457, "sampling/sampling_logp_difference/mean": 0.13196709752082825, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3931.0, "completions/max_terminated_length": 3931.0, "completions/mean_length": 716.828125, "completions/mean_terminated_length": 716.828125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.17363397032022476, "epoch": 0.727433628318584, "frac_reward_zero_std": 0.625, "grad_norm": 0.32502024170474697, "learning_rate": 1e-06, "loss": 0.0393, "num_tokens": 212929100.0, "reward": 0.7465429306030273, "reward_std": 0.0992249995470047, "rewards/qatch_small_update_with_fm/mean": 0.7465429306030273, "rewards/qatch_small_update_with_fm/std": 0.381736159324646, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001173496246338, "sampling/importance_sampling_ratio/min": 0.0036354686599224806, "sampling/sampling_logp_difference/max": 5.6170172691345215, "sampling/sampling_logp_difference/mean": 0.11454272270202637, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4093.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 686.29296875, "completions/mean_terminated_length": 686.29296875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.1912785768508911, "epoch": 0.7292035398230089, "frac_reward_zero_std": 0.625, "grad_norm": 0.36200051116897225, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 213543863.0, "reward": 0.7139023542404175, "reward_std": 0.07025236636400223, "rewards/qatch_small_update_with_fm/mean": 0.7139023542404175, "rewards/qatch_small_update_with_fm/std": 0.33821868896484375, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0010260343551636, "sampling/importance_sampling_ratio/min": 0.01126747578382492, "sampling/sampling_logp_difference/max": 4.485835075378418, "sampling/sampling_logp_difference/mean": 0.12891894578933716, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 482.08984375, "completions/mean_terminated_length": 482.08984375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.15765495225787163, "epoch": 0.7309734513274336, "frac_reward_zero_std": 0.625, "grad_norm": 0.4196500993343117, "learning_rate": 1e-06, "loss": -0.0244, "num_tokens": 214201166.0, "reward": 0.8524882793426514, "reward_std": 0.09491947293281555, "rewards/qatch_small_update_with_fm/mean": 0.8524882793426514, "rewards/qatch_small_update_with_fm/std": 0.2842613160610199, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9969896674156189, "sampling/importance_sampling_ratio/min": 2.4591103283455595e-05, "sampling/sampling_logp_difference/max": 10.613125801086426, "sampling/sampling_logp_difference/mean": 0.11501606553792953, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 425.8828125, "completions/mean_terminated_length": 411.490234375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.16947471164166927, "epoch": 0.7327433628318584, "frac_reward_zero_std": 0.875, "grad_norm": 0.35258047954675187, "learning_rate": 1e-06, "loss": -0.022, "num_tokens": 214648096.0, "reward": 0.7841289043426514, "reward_std": 0.04797707498073578, "rewards/qatch_small_update_with_fm/mean": 0.7841289043426514, "rewards/qatch_small_update_with_fm/std": 0.3436765670776367, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.996971845626831, "sampling/importance_sampling_ratio/min": 0.008898786269128323, "sampling/sampling_logp_difference/max": 4.7218403816223145, "sampling/sampling_logp_difference/mean": 0.12509727478027344, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 473.265625, "completions/mean_terminated_length": 473.265625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.16774693876504898, "epoch": 0.7345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 0.39832537332698686, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 215187780.0, "reward": 0.8978593349456787, "reward_std": 0.03775494545698166, "rewards/qatch_small_update_with_fm/mean": 0.8978593349456787, "rewards/qatch_small_update_with_fm/std": 0.21952234208583832, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9979634881019592, "sampling/importance_sampling_ratio/min": 0.0004939264617860317, "sampling/sampling_logp_difference/max": 7.613123893737793, "sampling/sampling_logp_difference/mean": 0.12119480222463608, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 493.36328125, "completions/mean_terminated_length": 493.36328125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.15931341610848904, "epoch": 0.736283185840708, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5004576326100599, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 215705697.0, "reward": 0.6353867053985596, "reward_std": 0.11076454818248749, "rewards/qatch_small_update_with_fm/mean": 0.6353867053985596, "rewards/qatch_small_update_with_fm/std": 0.3819371461868286, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9973691701889038, "sampling/importance_sampling_ratio/min": 0.023676568642258644, "sampling/sampling_logp_difference/max": 3.743269443511963, "sampling/sampling_logp_difference/mean": 0.11513706296682358, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 451.29296875, "completions/mean_terminated_length": 451.29296875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.1622842587530613, "epoch": 0.7380530973451327, "frac_reward_zero_std": 0.625, "grad_norm": 0.3899241092283991, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 216250860.0, "reward": 0.785335898399353, "reward_std": 0.08137451857328415, "rewards/qatch_small_update_with_fm/mean": 0.785335898399353, "rewards/qatch_small_update_with_fm/std": 0.35125818848609924, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9958220720291138, "sampling/importance_sampling_ratio/min": 0.006848800927400589, "sampling/sampling_logp_difference/max": 4.983681678771973, "sampling/sampling_logp_difference/mean": 0.12142086774110794, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 623.05078125, "completions/mean_terminated_length": 623.05078125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.18198287673294544, "epoch": 0.7398230088495575, "frac_reward_zero_std": 0.4375, "grad_norm": 0.409413930464498, "learning_rate": 1e-06, "loss": -0.0106, "num_tokens": 216796345.0, "reward": 0.7311093807220459, "reward_std": 0.1406518816947937, "rewards/qatch_small_update_with_fm/mean": 0.7311093807220459, "rewards/qatch_small_update_with_fm/std": 0.3903703987598419, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007123947143555, "sampling/importance_sampling_ratio/min": 0.00420353701338172, "sampling/sampling_logp_difference/max": 5.471828937530518, "sampling/sampling_logp_difference/mean": 0.12437430024147034, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 474.4375, "completions/mean_terminated_length": 474.4375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.13196693547070026, "epoch": 0.7415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.41445779960836726, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 217478441.0, "reward": 0.8417499661445618, "reward_std": 0.04443129152059555, "rewards/qatch_small_update_with_fm/mean": 0.8417499661445618, "rewards/qatch_small_update_with_fm/std": 0.28878355026245117, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9945211410522461, "sampling/importance_sampling_ratio/min": 0.00021754797489847988, "sampling/sampling_logp_difference/max": 8.433091163635254, "sampling/sampling_logp_difference/mean": 0.10465686023235321, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 734.046875, "completions/mean_terminated_length": 734.046875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.19972126185894012, "epoch": 0.7433628318584071, "frac_reward_zero_std": 0.625, "grad_norm": 0.3870550794541803, "learning_rate": 1e-06, "loss": 0.0529, "num_tokens": 218280533.0, "reward": 0.6786133050918579, "reward_std": 0.09401129186153412, "rewards/qatch_small_update_with_fm/mean": 0.6786133050918579, "rewards/qatch_small_update_with_fm/std": 0.3757462203502655, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000883936882019, "sampling/importance_sampling_ratio/min": 0.007178118452429771, "sampling/sampling_logp_difference/max": 4.936717987060547, "sampling/sampling_logp_difference/mean": 0.1316712498664856, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2604.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 564.26171875, "completions/mean_terminated_length": 564.26171875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.16319572925567627, "epoch": 0.7451327433628319, "frac_reward_zero_std": 0.625, "grad_norm": 0.46633898980222643, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 218783080.0, "reward": 0.6869062781333923, "reward_std": 0.0881059467792511, "rewards/qatch_small_update_with_fm/mean": 0.6869062781333923, "rewards/qatch_small_update_with_fm/std": 0.40433627367019653, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000537395477295, "sampling/importance_sampling_ratio/min": 0.014322554692626, "sampling/sampling_logp_difference/max": 4.245919704437256, "sampling/sampling_logp_difference/mean": 0.11453516036272049, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 604.390625, "completions/mean_terminated_length": 604.390625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.1689841579645872, "epoch": 0.7469026548672566, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3544339756613165, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 219359580.0, "reward": 0.7998437881469727, "reward_std": 0.08399610966444016, "rewards/qatch_small_update_with_fm/mean": 0.7998437881469727, "rewards/qatch_small_update_with_fm/std": 0.30130448937416077, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9989594221115112, "sampling/importance_sampling_ratio/min": 0.0011994243832305074, "sampling/sampling_logp_difference/max": 6.7259135246276855, "sampling/sampling_logp_difference/mean": 0.12029863148927689, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2248.0, "completions/mean_length": 568.7265625, "completions/mean_terminated_length": 554.8941650390625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.16228130646049976, "epoch": 0.7486725663716814, "frac_reward_zero_std": 0.875, "grad_norm": 0.22356471074436193, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 219916454.0, "reward": 0.7088984251022339, "reward_std": 0.026240184903144836, "rewards/qatch_small_update_with_fm/mean": 0.7088984251022339, "rewards/qatch_small_update_with_fm/std": 0.3714582026004791, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999340772628784, "sampling/importance_sampling_ratio/min": 0.023729214444756508, "sampling/sampling_logp_difference/max": 3.7410483360290527, "sampling/sampling_logp_difference/mean": 0.11432121694087982, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 664.03515625, "completions/mean_terminated_length": 664.03515625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.1923084706068039, "epoch": 0.7504424778761062, "frac_reward_zero_std": 0.3125, "grad_norm": 0.5461092747329064, "learning_rate": 1e-06, "loss": 0.0583, "num_tokens": 220427167.0, "reward": 0.8140429258346558, "reward_std": 0.18169225752353668, "rewards/qatch_small_update_with_fm/mean": 0.8140429258346558, "rewards/qatch_small_update_with_fm/std": 0.3105939030647278, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0016264915466309, "sampling/importance_sampling_ratio/min": 0.00532237533479929, "sampling/sampling_logp_difference/max": 5.235835552215576, "sampling/sampling_logp_difference/mean": 0.12810061872005463, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3639.0, "completions/mean_length": 772.30859375, "completions/mean_terminated_length": 719.5516357421875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.18962455727159977, "epoch": 0.7522123893805309, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3160575103154446, "learning_rate": 1e-06, "loss": -0.0781, "num_tokens": 221069566.0, "reward": 0.7325546741485596, "reward_std": 0.053682636469602585, "rewards/qatch_small_update_with_fm/mean": 0.7325546741485596, "rewards/qatch_small_update_with_fm/std": 0.37810027599334717, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.003647804260254, "sampling/importance_sampling_ratio/min": 0.004103474784642458, "sampling/sampling_logp_difference/max": 5.4959211349487305, "sampling/sampling_logp_difference/mean": 0.12572765350341797, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 660.265625, "completions/mean_terminated_length": 646.7921752929688, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.18477561697363853, "epoch": 0.7539823008849558, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4090449823566527, "learning_rate": 1e-06, "loss": -0.0218, "num_tokens": 221587810.0, "reward": 0.7701483964920044, "reward_std": 0.10543603450059891, "rewards/qatch_small_update_with_fm/mean": 0.7701483964920044, "rewards/qatch_small_update_with_fm/std": 0.3279249668121338, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0023423433303833, "sampling/importance_sampling_ratio/min": 0.001957992557436228, "sampling/sampling_logp_difference/max": 6.235835552215576, "sampling/sampling_logp_difference/mean": 0.12211276590824127, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3233.0, "completions/mean_length": 622.453125, "completions/mean_terminated_length": 608.8314208984375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.15044170431792736, "epoch": 0.7557522123893805, "frac_reward_zero_std": 0.6875, "grad_norm": 0.42226060366299967, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 222141878.0, "reward": 0.7257421612739563, "reward_std": 0.06865538656711578, "rewards/qatch_small_update_with_fm/mean": 0.7257421612739563, "rewards/qatch_small_update_with_fm/std": 0.35420534014701843, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9960782527923584, "sampling/importance_sampling_ratio/min": 0.01118391752243042, "sampling/sampling_logp_difference/max": 4.493278503417969, "sampling/sampling_logp_difference/mean": 0.11394952237606049, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 801.06640625, "completions/mean_terminated_length": 801.06640625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.17895109206438065, "epoch": 0.7575221238938054, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3815108439775098, "learning_rate": 1e-06, "loss": -0.0198, "num_tokens": 222826775.0, "reward": 0.6168085336685181, "reward_std": 0.08496352285146713, "rewards/qatch_small_update_with_fm/mean": 0.6168085336685181, "rewards/qatch_small_update_with_fm/std": 0.40082257986068726, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0023446083068848, "sampling/importance_sampling_ratio/min": 0.011175249703228474, "sampling/sampling_logp_difference/max": 4.494053840637207, "sampling/sampling_logp_difference/mean": 0.12193140387535095, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2660.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 631.9453125, "completions/mean_terminated_length": 631.9453125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1826272513717413, "epoch": 0.7592920353982301, "frac_reward_zero_std": 0.5625, "grad_norm": 0.41904123544273625, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 223359449.0, "reward": 0.775890588760376, "reward_std": 0.11549194902181625, "rewards/qatch_small_update_with_fm/mean": 0.7758906483650208, "rewards/qatch_small_update_with_fm/std": 0.3709191679954529, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9977415204048157, "sampling/importance_sampling_ratio/min": 0.014440981671214104, "sampling/sampling_logp_difference/max": 4.237685203552246, "sampling/sampling_logp_difference/mean": 0.13103239238262177, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 635.03125, "completions/mean_terminated_length": 635.03125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.1812898088246584, "epoch": 0.7610619469026548, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5466435269344176, "learning_rate": 1e-06, "loss": 0.1014, "num_tokens": 224151553.0, "reward": 0.7688593864440918, "reward_std": 0.10703786462545395, "rewards/qatch_small_update_with_fm/mean": 0.7688593864440918, "rewards/qatch_small_update_with_fm/std": 0.31848737597465515, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9988529086112976, "sampling/importance_sampling_ratio/min": 0.006796242203563452, "sampling/sampling_logp_difference/max": 4.991385459899902, "sampling/sampling_logp_difference/mean": 0.13133785128593445, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3099.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 635.1484375, "completions/mean_terminated_length": 635.1484375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.17052887007594109, "epoch": 0.7628318584070797, "frac_reward_zero_std": 0.75, "grad_norm": 0.42027461892419476, "learning_rate": 1e-06, "loss": -0.0535, "num_tokens": 224847815.0, "reward": 0.8387616872787476, "reward_std": 0.03548329323530197, "rewards/qatch_small_update_with_fm/mean": 0.8387616872787476, "rewards/qatch_small_update_with_fm/std": 0.2991366684436798, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9974981546401978, "sampling/importance_sampling_ratio/min": 0.008679375983774662, "sampling/sampling_logp_difference/max": 4.746805667877197, "sampling/sampling_logp_difference/mean": 0.12578247487545013, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 506.56640625, "completions/mean_terminated_length": 506.56640625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.12694681715220213, "epoch": 0.7646017699115044, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5130403131834611, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 225484312.0, "reward": 0.8006914258003235, "reward_std": 0.08573571592569351, "rewards/qatch_small_update_with_fm/mean": 0.8006914258003235, "rewards/qatch_small_update_with_fm/std": 0.29757291078567505, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.994242787361145, "sampling/importance_sampling_ratio/min": 0.008697095327079296, "sampling/sampling_logp_difference/max": 4.7447662353515625, "sampling/sampling_logp_difference/mean": 0.10106033831834793, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 692.78125, "completions/mean_terminated_length": 692.78125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.13791646622121334, "epoch": 0.7663716814159292, "frac_reward_zero_std": 0.8125, "grad_norm": 1.4941877942190622, "learning_rate": 1e-06, "loss": 0.0259, "num_tokens": 225999760.0, "reward": 0.8496835827827454, "reward_std": 0.046133317053318024, "rewards/qatch_small_update_with_fm/mean": 0.8496835827827454, "rewards/qatch_small_update_with_fm/std": 0.3013724684715271, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9952928423881531, "sampling/importance_sampling_ratio/min": 0.006896561942994595, "sampling/sampling_logp_difference/max": 4.97673225402832, "sampling/sampling_logp_difference/mean": 0.10778751224279404, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 602.625, "completions/mean_terminated_length": 602.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.15351567789912224, "epoch": 0.768141592920354, "frac_reward_zero_std": 0.625, "grad_norm": 0.42487288101888443, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 226594368.0, "reward": 0.7664843797683716, "reward_std": 0.07009606808423996, "rewards/qatch_small_update_with_fm/mean": 0.7664843797683716, "rewards/qatch_small_update_with_fm/std": 0.3558588922023773, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9956225752830505, "sampling/importance_sampling_ratio/min": 0.0011039423989132047, "sampling/sampling_logp_difference/max": 6.808867454528809, "sampling/sampling_logp_difference/mean": 0.11667559295892715, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2625.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 748.1171875, "completions/mean_terminated_length": 748.1171875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.168431606143713, "epoch": 0.7699115044247787, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3854349475784843, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 227160990.0, "reward": 0.7121405601501465, "reward_std": 0.10320340842008591, "rewards/qatch_small_update_with_fm/mean": 0.7121405601501465, "rewards/qatch_small_update_with_fm/std": 0.3460690677165985, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9979861974716187, "sampling/importance_sampling_ratio/min": 0.0012341516558080912, "sampling/sampling_logp_difference/max": 6.697371482849121, "sampling/sampling_logp_difference/mean": 0.12207724899053574, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2455.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 581.72265625, "completions/mean_terminated_length": 581.72265625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.15434116125106812, "epoch": 0.7716814159292036, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3569690705345467, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 227766583.0, "reward": 0.7684921622276306, "reward_std": 0.07930755615234375, "rewards/qatch_small_update_with_fm/mean": 0.7684921622276306, "rewards/qatch_small_update_with_fm/std": 0.3264910876750946, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9944840669631958, "sampling/importance_sampling_ratio/min": 0.00047306326450780034, "sampling/sampling_logp_difference/max": 7.656281471252441, "sampling/sampling_logp_difference/mean": 0.11814532428979874, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 723.5625, "completions/mean_terminated_length": 723.5625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.15825031697750092, "epoch": 0.7734513274336283, "frac_reward_zero_std": 0.625, "grad_norm": 0.34000322678139233, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 228371063.0, "reward": 0.7408750057220459, "reward_std": 0.09346934407949448, "rewards/qatch_small_update_with_fm/mean": 0.7408750057220459, "rewards/qatch_small_update_with_fm/std": 0.3187641203403473, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9984309077262878, "sampling/importance_sampling_ratio/min": 0.008697095327079296, "sampling/sampling_logp_difference/max": 4.7447662353515625, "sampling/sampling_logp_difference/mean": 0.11309655755758286, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2843.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 711.328125, "completions/mean_terminated_length": 711.328125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.18015681765973568, "epoch": 0.7752212389380531, "frac_reward_zero_std": 0.6875, "grad_norm": 0.45693622408807605, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 229134907.0, "reward": 0.6116093397140503, "reward_std": 0.062218718230724335, "rewards/qatch_small_update_with_fm/mean": 0.6116093397140503, "rewards/qatch_small_update_with_fm/std": 0.357837438583374, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9985137581825256, "sampling/importance_sampling_ratio/min": 0.0051135290414094925, "sampling/sampling_logp_difference/max": 5.27586555480957, "sampling/sampling_logp_difference/mean": 0.1284247636795044, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 600.83203125, "completions/mean_terminated_length": 600.83203125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.1643862072378397, "epoch": 0.7769911504424779, "frac_reward_zero_std": 0.875, "grad_norm": 0.4528087649562646, "learning_rate": 1e-06, "loss": -0.0204, "num_tokens": 229853200.0, "reward": 0.7506523132324219, "reward_std": 0.02189062535762787, "rewards/qatch_small_update_with_fm/mean": 0.7506523132324219, "rewards/qatch_small_update_with_fm/std": 0.3145546019077301, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.997706949710846, "sampling/importance_sampling_ratio/min": 0.001599552109837532, "sampling/sampling_logp_difference/max": 6.4380316734313965, "sampling/sampling_logp_difference/mean": 0.12278648465871811, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 586.2578125, "completions/mean_terminated_length": 586.2578125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.1549396775662899, "epoch": 0.7787610619469026, "frac_reward_zero_std": 0.8125, "grad_norm": 0.31433711549478927, "learning_rate": 1e-06, "loss": -0.0224, "num_tokens": 230362770.0, "reward": 0.7373007535934448, "reward_std": 0.05242425575852394, "rewards/qatch_small_update_with_fm/mean": 0.7373007535934448, "rewards/qatch_small_update_with_fm/std": 0.3278573751449585, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9971117377281189, "sampling/importance_sampling_ratio/min": 0.011141431517899036, "sampling/sampling_logp_difference/max": 4.497084617614746, "sampling/sampling_logp_difference/mean": 0.12081331014633179, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3951.0, "completions/max_terminated_length": 3951.0, "completions/mean_length": 858.46484375, "completions/mean_terminated_length": 858.46484375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.16271107271313667, "epoch": 0.7805309734513274, "frac_reward_zero_std": 0.5625, "grad_norm": 0.39440167191771214, "learning_rate": 1e-06, "loss": -0.0644, "num_tokens": 231013257.0, "reward": 0.7417617440223694, "reward_std": 0.07942794263362885, "rewards/qatch_small_update_with_fm/mean": 0.7417617440223694, "rewards/qatch_small_update_with_fm/std": 0.37872886657714844, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0037524700164795, "sampling/importance_sampling_ratio/min": 0.0020053647458553314, "sampling/sampling_logp_difference/max": 6.2119293212890625, "sampling/sampling_logp_difference/mean": 0.11337009072303772, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3282.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 593.32421875, "completions/mean_terminated_length": 593.32421875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1659252755343914, "epoch": 0.7823008849557522, "frac_reward_zero_std": 0.75, "grad_norm": 0.36212202748645383, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 231473228.0, "reward": 0.7991992235183716, "reward_std": 0.033742353320121765, "rewards/qatch_small_update_with_fm/mean": 0.7991992235183716, "rewards/qatch_small_update_with_fm/std": 0.3366776406764984, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9989630579948425, "sampling/importance_sampling_ratio/min": 0.0038688918575644493, "sampling/sampling_logp_difference/max": 5.5547871589660645, "sampling/sampling_logp_difference/mean": 0.12536245584487915, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3265.0, "completions/max_terminated_length": 3265.0, "completions/mean_length": 694.28515625, "completions/mean_terminated_length": 694.28515625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1803117860108614, "epoch": 0.784070796460177, "frac_reward_zero_std": 0.5625, "grad_norm": 0.419844471085865, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 232015333.0, "reward": 0.8482460975646973, "reward_std": 0.09589515626430511, "rewards/qatch_small_update_with_fm/mean": 0.8482460975646973, "rewards/qatch_small_update_with_fm/std": 0.30093497037887573, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9970134496688843, "sampling/importance_sampling_ratio/min": 0.011528928764164448, "sampling/sampling_logp_difference/max": 4.46289587020874, "sampling/sampling_logp_difference/mean": 0.1318548023700714, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 829.99609375, "completions/mean_terminated_length": 829.99609375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.1749946866184473, "epoch": 0.7858407079646018, "frac_reward_zero_std": 0.4375, "grad_norm": 0.39484374074074674, "learning_rate": 1e-06, "loss": -0.0219, "num_tokens": 232805252.0, "reward": 0.686453104019165, "reward_std": 0.11683434247970581, "rewards/qatch_small_update_with_fm/mean": 0.686453104019165, "rewards/qatch_small_update_with_fm/std": 0.3472793996334076, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9994871616363525, "sampling/importance_sampling_ratio/min": 0.0013426319928839803, "sampling/sampling_logp_difference/max": 6.613123416900635, "sampling/sampling_logp_difference/mean": 0.12524878978729248, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 704.53515625, "completions/mean_terminated_length": 664.3201904296875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.1625970732420683, "epoch": 0.7876106194690266, "frac_reward_zero_std": 0.625, "grad_norm": 0.4301157411953366, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 233457437.0, "reward": 0.6297616958618164, "reward_std": 0.09737332165241241, "rewards/qatch_small_update_with_fm/mean": 0.6297616958618164, "rewards/qatch_small_update_with_fm/std": 0.41093724966049194, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9979434609413147, "sampling/importance_sampling_ratio/min": 0.0008534918888472021, "sampling/sampling_logp_difference/max": 7.066174507141113, "sampling/sampling_logp_difference/mean": 0.12096916139125824, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2229.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 596.2890625, "completions/mean_terminated_length": 596.2890625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.14597140066325665, "epoch": 0.7893805309734513, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2835604822547674, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 234065047.0, "reward": 0.8153945207595825, "reward_std": 0.03509760648012161, "rewards/qatch_small_update_with_fm/mean": 0.8153945207595825, "rewards/qatch_small_update_with_fm/std": 0.3443441689014435, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9938744306564331, "sampling/importance_sampling_ratio/min": 0.0015488486969843507, "sampling/sampling_logp_difference/max": 6.470243453979492, "sampling/sampling_logp_difference/mean": 0.11343678086996078, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3454.0, "completions/mean_length": 780.5234375, "completions/mean_terminated_length": 767.5216064453125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.15503480285406113, "epoch": 0.7911504424778761, "frac_reward_zero_std": 0.625, "grad_norm": 0.3740383952215832, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 234562269.0, "reward": 0.7261210680007935, "reward_std": 0.0933132916688919, "rewards/qatch_small_update_with_fm/mean": 0.7261210680007935, "rewards/qatch_small_update_with_fm/std": 0.3724234700202942, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.996291995048523, "sampling/importance_sampling_ratio/min": 0.005370934959501028, "sampling/sampling_logp_difference/max": 5.226753234863281, "sampling/sampling_logp_difference/mean": 0.11553680151700974, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 659.67578125, "completions/mean_terminated_length": 659.67578125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.15476826392114162, "epoch": 0.7929203539823009, "frac_reward_zero_std": 0.625, "grad_norm": 0.42675635170846243, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 235043658.0, "reward": 0.6756054759025574, "reward_std": 0.09046542644500732, "rewards/qatch_small_update_with_fm/mean": 0.6756054759025574, "rewards/qatch_small_update_with_fm/std": 0.3835858702659607, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9972091913223267, "sampling/importance_sampling_ratio/min": 0.008706767112016678, "sampling/sampling_logp_difference/max": 4.743654727935791, "sampling/sampling_logp_difference/mean": 0.12006603181362152, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 550.0390625, "completions/mean_terminated_length": 550.0390625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.12986405473202467, "epoch": 0.7946902654867256, "frac_reward_zero_std": 0.5, "grad_norm": 0.5427483756820398, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 235464196.0, "reward": 0.7087421417236328, "reward_std": 0.09299471974372864, "rewards/qatch_small_update_with_fm/mean": 0.7087421417236328, "rewards/qatch_small_update_with_fm/std": 0.3667365312576294, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9917939901351929, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.11037851870059967, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 698.49609375, "completions/mean_terminated_length": 698.49609375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.1488830018788576, "epoch": 0.7964601769911505, "frac_reward_zero_std": 0.5, "grad_norm": 0.4951429798091371, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 236066467.0, "reward": 0.7882499694824219, "reward_std": 0.13462623953819275, "rewards/qatch_small_update_with_fm/mean": 0.7882499694824219, "rewards/qatch_small_update_with_fm/std": 0.3175378739833832, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9960341453552246, "sampling/importance_sampling_ratio/min": 1.961273119377438e-05, "sampling/sampling_logp_difference/max": 10.83933162689209, "sampling/sampling_logp_difference/mean": 0.11680779606103897, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 509.01171875, "completions/mean_terminated_length": 509.01171875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.11211834196001291, "epoch": 0.7982300884955752, "frac_reward_zero_std": 0.75, "grad_norm": 0.3849702455218366, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 236867798.0, "reward": 0.9428945183753967, "reward_std": 0.028621939942240715, "rewards/qatch_small_update_with_fm/mean": 0.9428945183753967, "rewards/qatch_small_update_with_fm/std": 0.14791765809059143, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9909011721611023, "sampling/importance_sampling_ratio/min": 0.005266137886792421, "sampling/sampling_logp_difference/max": 5.246458053588867, "sampling/sampling_logp_difference/mean": 0.10161980986595154, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3160.0, "completions/max_terminated_length": 3160.0, "completions/mean_length": 702.078125, "completions/mean_terminated_length": 702.078125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.15019656997174025, "epoch": 0.8, "frac_reward_zero_std": 0.8125, "grad_norm": 0.325804060936362, "learning_rate": 1e-06, "loss": 0.0638, "num_tokens": 237550042.0, "reward": 0.816601574420929, "reward_std": 0.027011848986148834, "rewards/qatch_small_update_with_fm/mean": 0.816601574420929, "rewards/qatch_small_update_with_fm/std": 0.2908731698989868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9971194267272949, "sampling/importance_sampling_ratio/min": 0.008687051944434643, "sampling/sampling_logp_difference/max": 4.745921611785889, "sampling/sampling_logp_difference/mean": 0.11530216038227081, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 515.734375, "completions/mean_terminated_length": 515.734375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.12139484006911516, "epoch": 0.8017699115044248, "frac_reward_zero_std": 0.875, "grad_norm": 0.2689081659869354, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 238204646.0, "reward": 0.7734726667404175, "reward_std": 0.032330580055713654, "rewards/qatch_small_update_with_fm/mean": 0.7734726667404175, "rewards/qatch_small_update_with_fm/std": 0.3623669147491455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991873562335968, "sampling/importance_sampling_ratio/min": 0.0018496569246053696, "sampling/sampling_logp_difference/max": 6.292755126953125, "sampling/sampling_logp_difference/mean": 0.1044083833694458, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3267.0, "completions/max_terminated_length": 3267.0, "completions/mean_length": 661.6796875, "completions/mean_terminated_length": 661.6796875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.13479878939688206, "epoch": 0.8035398230088495, "frac_reward_zero_std": 0.4375, "grad_norm": 0.49470886264254876, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 238715492.0, "reward": 0.6919882297515869, "reward_std": 0.10859642922878265, "rewards/qatch_small_update_with_fm/mean": 0.6919882297515869, "rewards/qatch_small_update_with_fm/std": 0.4031483829021454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9940096139907837, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.11178075522184372, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3160.0, "completions/max_terminated_length": 3160.0, "completions/mean_length": 636.25390625, "completions/mean_terminated_length": 636.25390625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.12827913463115692, "epoch": 0.8053097345132744, "frac_reward_zero_std": 0.75, "grad_norm": 0.8735075316746723, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 239250309.0, "reward": 0.7329335808753967, "reward_std": 0.06027635186910629, "rewards/qatch_small_update_with_fm/mean": 0.7329335808753967, "rewards/qatch_small_update_with_fm/std": 0.3722755014896393, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9950513243675232, "sampling/importance_sampling_ratio/min": 0.004170369356870651, "sampling/sampling_logp_difference/max": 5.479750633239746, "sampling/sampling_logp_difference/mean": 0.10352346301078796, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2681.0, "completions/max_terminated_length": 2681.0, "completions/mean_length": 547.953125, "completions/mean_terminated_length": 547.953125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.1058797063305974, "epoch": 0.8070796460176991, "frac_reward_zero_std": 0.75, "grad_norm": 0.4273494684096733, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 239704009.0, "reward": 0.8358047008514404, "reward_std": 0.04802708327770233, "rewards/qatch_small_update_with_fm/mean": 0.8358047008514404, "rewards/qatch_small_update_with_fm/std": 0.2909993529319763, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.992448091506958, "sampling/importance_sampling_ratio/min": 0.0026839813217520714, "sampling/sampling_logp_difference/max": 5.920454025268555, "sampling/sampling_logp_difference/mean": 0.09336571395397186, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 617.75390625, "completions/mean_terminated_length": 617.75390625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.13106741197407246, "epoch": 0.8088495575221238, "frac_reward_zero_std": 0.75, "grad_norm": 0.34818347973080715, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 240374890.0, "reward": 0.693066418170929, "reward_std": 0.05205351859331131, "rewards/qatch_small_update_with_fm/mean": 0.693066418170929, "rewards/qatch_small_update_with_fm/std": 0.37017884850502014, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9947883486747742, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.10878457874059677, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3826.0, "completions/max_terminated_length": 3826.0, "completions/mean_length": 671.5, "completions/mean_terminated_length": 671.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.14847872778773308, "epoch": 0.8106194690265487, "frac_reward_zero_std": 0.875, "grad_norm": 0.2268928508210983, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 240994906.0, "reward": 0.8564844131469727, "reward_std": 0.034672707319259644, "rewards/qatch_small_update_with_fm/mean": 0.8564844131469727, "rewards/qatch_small_update_with_fm/std": 0.28987762331962585, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9986143112182617, "sampling/importance_sampling_ratio/min": 0.008972340263426304, "sampling/sampling_logp_difference/max": 4.713608741760254, "sampling/sampling_logp_difference/mean": 0.11376264691352844, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3260.0, "completions/max_terminated_length": 3260.0, "completions/mean_length": 511.375, "completions/mean_terminated_length": 511.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.14136963710188866, "epoch": 0.8123893805309734, "frac_reward_zero_std": 0.75, "grad_norm": 0.4322927842698917, "learning_rate": 1e-06, "loss": -0.0082, "num_tokens": 241443450.0, "reward": 0.8052030801773071, "reward_std": 0.04568236321210861, "rewards/qatch_small_update_with_fm/mean": 0.8052030801773071, "rewards/qatch_small_update_with_fm/std": 0.29775679111480713, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.992789626121521, "sampling/importance_sampling_ratio/min": 0.011154704727232456, "sampling/sampling_logp_difference/max": 4.495893955230713, "sampling/sampling_logp_difference/mean": 0.11692950874567032, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 471.6953125, "completions/mean_terminated_length": 471.6953125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.10994228906929493, "epoch": 0.8141592920353983, "frac_reward_zero_std": 0.8125, "grad_norm": 0.272719066624188, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 242050972.0, "reward": 0.8606210947036743, "reward_std": 0.03434634208679199, "rewards/qatch_small_update_with_fm/mean": 0.8606210947036743, "rewards/qatch_small_update_with_fm/std": 0.30872756242752075, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9928982257843018, "sampling/importance_sampling_ratio/min": 0.002553700003772974, "sampling/sampling_logp_difference/max": 5.970211982727051, "sampling/sampling_logp_difference/mean": 0.09547945111989975, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3529.0, "completions/max_terminated_length": 3529.0, "completions/mean_length": 808.0859375, "completions/mean_terminated_length": 808.0859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.16546104848384857, "epoch": 0.815929203539823, "frac_reward_zero_std": 0.6875, "grad_norm": 0.32918008336972193, "learning_rate": 1e-06, "loss": 0.0306, "num_tokens": 242787538.0, "reward": 0.7358086109161377, "reward_std": 0.05101485177874565, "rewards/qatch_small_update_with_fm/mean": 0.7358086109161377, "rewards/qatch_small_update_with_fm/std": 0.34538453817367554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9978460669517517, "sampling/importance_sampling_ratio/min": 0.0003619363415054977, "sampling/sampling_logp_difference/max": 7.924042224884033, "sampling/sampling_logp_difference/mean": 0.12252495437860489, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 503.17578125, "completions/mean_terminated_length": 503.17578125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.1253802226856351, "epoch": 0.8176991150442477, "frac_reward_zero_std": 0.75, "grad_norm": 0.35180005875154, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 243200367.0, "reward": 0.8208436965942383, "reward_std": 0.053170062601566315, "rewards/qatch_small_update_with_fm/mean": 0.8208437561988831, "rewards/qatch_small_update_with_fm/std": 0.33666518330574036, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9941401481628418, "sampling/importance_sampling_ratio/min": 0.005322411190718412, "sampling/sampling_logp_difference/max": 5.235828876495361, "sampling/sampling_logp_difference/mean": 0.10240937769412994, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3923.0, "completions/mean_length": 744.49609375, "completions/mean_terminated_length": 718.1063232421875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.16998208314180374, "epoch": 0.8194690265486726, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2983180012105374, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 243699182.0, "reward": 0.728640615940094, "reward_std": 0.05621249973773956, "rewards/qatch_small_update_with_fm/mean": 0.728640615940094, "rewards/qatch_small_update_with_fm/std": 0.3335835039615631, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001490116119385, "sampling/importance_sampling_ratio/min": 0.011265814304351807, "sampling/sampling_logp_difference/max": 4.485982418060303, "sampling/sampling_logp_difference/mean": 0.12198029458522797, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 556.328125, "completions/mean_terminated_length": 556.328125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1414416767656803, "epoch": 0.8212389380530973, "frac_reward_zero_std": 0.75, "grad_norm": 0.4023449772591073, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 244336994.0, "reward": 0.6802421808242798, "reward_std": 0.042659495025873184, "rewards/qatch_small_update_with_fm/mean": 0.6802421808242798, "rewards/qatch_small_update_with_fm/std": 0.3406652808189392, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9933852553367615, "sampling/importance_sampling_ratio/min": 0.00175619893707335, "sampling/sampling_logp_difference/max": 6.344603538513184, "sampling/sampling_logp_difference/mean": 0.11338303238153458, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 496.59765625, "completions/mean_terminated_length": 496.59765625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.15152422711253166, "epoch": 0.8230088495575221, "frac_reward_zero_std": 0.875, "grad_norm": 0.3084270165573425, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 244833451.0, "reward": 0.717523455619812, "reward_std": 0.01150607131421566, "rewards/qatch_small_update_with_fm/mean": 0.717523455619812, "rewards/qatch_small_update_with_fm/std": 0.3694573640823364, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9948652982711792, "sampling/importance_sampling_ratio/min": 0.011154529638588428, "sampling/sampling_logp_difference/max": 4.495909690856934, "sampling/sampling_logp_difference/mean": 0.1199587881565094, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 964.40625, "completions/mean_terminated_length": 876.3694458007812, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.1926432903856039, "epoch": 0.8247787610619469, "frac_reward_zero_std": 0.5625, "grad_norm": 0.32375511279508756, "learning_rate": 1e-06, "loss": -0.1064, "num_tokens": 245471555.0, "reward": 0.6633554697036743, "reward_std": 0.0848885253071785, "rewards/qatch_small_update_with_fm/mean": 0.6633554697036743, "rewards/qatch_small_update_with_fm/std": 0.3811967074871063, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0064408779144287, "sampling/importance_sampling_ratio/min": 0.0007988452562130988, "sampling/sampling_logp_difference/max": 7.132343292236328, "sampling/sampling_logp_difference/mean": 0.13031846284866333, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3352.0, "completions/max_terminated_length": 3352.0, "completions/mean_length": 754.55078125, "completions/mean_terminated_length": 754.55078125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.17175315879285336, "epoch": 0.8265486725663717, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3004368041198786, "learning_rate": 1e-06, "loss": -0.0468, "num_tokens": 246164080.0, "reward": 0.8258593678474426, "reward_std": 0.04333872348070145, "rewards/qatch_small_update_with_fm/mean": 0.8258593678474426, "rewards/qatch_small_update_with_fm/std": 0.28225964307785034, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9985412955284119, "sampling/importance_sampling_ratio/min": 0.0001251505163963884, "sampling/sampling_logp_difference/max": 8.985993385314941, "sampling/sampling_logp_difference/mean": 0.12510454654693604, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3902.0, "completions/max_terminated_length": 3902.0, "completions/mean_length": 689.5234375, "completions/mean_terminated_length": 689.5234375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.15941564738750458, "epoch": 0.8283185840707965, "frac_reward_zero_std": 0.625, "grad_norm": 0.46909349336641437, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 246863030.0, "reward": 0.8296366930007935, "reward_std": 0.0837571993470192, "rewards/qatch_small_update_with_fm/mean": 0.8296366930007935, "rewards/qatch_small_update_with_fm/std": 0.2871522605419159, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9992177486419678, "sampling/importance_sampling_ratio/min": 0.0001529164583189413, "sampling/sampling_logp_difference/max": 8.785618782043457, "sampling/sampling_logp_difference/mean": 0.11667613685131073, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 747.42578125, "completions/mean_terminated_length": 734.294189453125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.16841256991028786, "epoch": 0.8300884955752212, "frac_reward_zero_std": 0.6875, "grad_norm": 0.34838615373771503, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 247468851.0, "reward": 0.6867148876190186, "reward_std": 0.056690070778131485, "rewards/qatch_small_update_with_fm/mean": 0.6867148876190186, "rewards/qatch_small_update_with_fm/std": 0.3688320219516754, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9969513416290283, "sampling/importance_sampling_ratio/min": 0.006885254755616188, "sampling/sampling_logp_difference/max": 4.978373050689697, "sampling/sampling_logp_difference/mean": 0.1274685263633728, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 638.0859375, "completions/mean_terminated_length": 638.0859375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.1450775759294629, "epoch": 0.831858407079646, "frac_reward_zero_std": 0.625, "grad_norm": 0.4692889831796996, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 248138985.0, "reward": 0.7625468969345093, "reward_std": 0.08190645277500153, "rewards/qatch_small_update_with_fm/mean": 0.7625468969345093, "rewards/qatch_small_update_with_fm/std": 0.3101102411746979, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9959828853607178, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.11293405294418335, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3411.0, "completions/max_terminated_length": 3411.0, "completions/mean_length": 471.22265625, "completions/mean_terminated_length": 471.22265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1280396282672882, "epoch": 0.8336283185840708, "frac_reward_zero_std": 0.6875, "grad_norm": 0.46987885082001113, "learning_rate": 1e-06, "loss": -0.0515, "num_tokens": 248581938.0, "reward": 0.8769687414169312, "reward_std": 0.09168442338705063, "rewards/qatch_small_update_with_fm/mean": 0.8769687414169312, "rewards/qatch_small_update_with_fm/std": 0.25800442695617676, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934540390968323, "sampling/importance_sampling_ratio/min": 0.009367694146931171, "sampling/sampling_logp_difference/max": 4.670488357543945, "sampling/sampling_logp_difference/mean": 0.10664229094982147, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 584.125, "completions/mean_terminated_length": 528.3809814453125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.10968455206602812, "epoch": 0.8353982300884956, "frac_reward_zero_std": 0.8125, "grad_norm": 0.28465633487894076, "learning_rate": 1e-06, "loss": -0.144, "num_tokens": 249136658.0, "reward": 0.7801679372787476, "reward_std": 0.06691081821918488, "rewards/qatch_small_update_with_fm/mean": 0.7801679372787476, "rewards/qatch_small_update_with_fm/std": 0.3578769862651825, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9958081245422363, "sampling/importance_sampling_ratio/min": 0.0017069082241505384, "sampling/sampling_logp_difference/max": 6.373071670532227, "sampling/sampling_logp_difference/mean": 0.09798172861337662, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 773.7734375, "completions/mean_terminated_length": 747.6141967773438, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.1615205705165863, "epoch": 0.8371681415929203, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3232061583745438, "learning_rate": 1e-06, "loss": -0.0545, "num_tokens": 249666472.0, "reward": 0.6544336080551147, "reward_std": 0.07753826677799225, "rewards/qatch_small_update_with_fm/mean": 0.65443354845047, "rewards/qatch_small_update_with_fm/std": 0.33462199568748474, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9986249208450317, "sampling/importance_sampling_ratio/min": 0.01115453988313675, "sampling/sampling_logp_difference/max": 4.495908737182617, "sampling/sampling_logp_difference/mean": 0.11811242997646332, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3852.0, "completions/mean_length": 936.25390625, "completions/mean_terminated_length": 873.310791015625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.17724050767719746, "epoch": 0.8389380530973451, "frac_reward_zero_std": 0.375, "grad_norm": 0.38820766688732716, "learning_rate": 1e-06, "loss": -0.0682, "num_tokens": 250401577.0, "reward": 0.6836913824081421, "reward_std": 0.1162552535533905, "rewards/qatch_small_update_with_fm/mean": 0.6836913824081421, "rewards/qatch_small_update_with_fm/std": 0.3393937647342682, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9975631237030029, "sampling/importance_sampling_ratio/min": 0.008690698072314262, "sampling/sampling_logp_difference/max": 4.74550199508667, "sampling/sampling_logp_difference/mean": 0.1296575367450714, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 527.55859375, "completions/mean_terminated_length": 527.55859375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.12861198373138905, "epoch": 0.8407079646017699, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5101103556172623, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 251068408.0, "reward": 0.929367184638977, "reward_std": 0.0932401642203331, "rewards/qatch_small_update_with_fm/mean": 0.929367184638977, "rewards/qatch_small_update_with_fm/std": 0.1822948455810547, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9920750856399536, "sampling/importance_sampling_ratio/min": 0.008755389600992203, "sampling/sampling_logp_difference/max": 4.738085746765137, "sampling/sampling_logp_difference/mean": 0.11195670813322067, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3323.0, "completions/mean_length": 659.09765625, "completions/mean_terminated_length": 645.61962890625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.1493313405662775, "epoch": 0.8424778761061947, "frac_reward_zero_std": 0.625, "grad_norm": 0.3765353734239925, "learning_rate": 1e-06, "loss": -0.0242, "num_tokens": 251657729.0, "reward": 0.8344101905822754, "reward_std": 0.09548807889223099, "rewards/qatch_small_update_with_fm/mean": 0.8344101905822754, "rewards/qatch_small_update_with_fm/std": 0.2909124791622162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9970696568489075, "sampling/importance_sampling_ratio/min": 0.011136996559798717, "sampling/sampling_logp_difference/max": 4.497482776641846, "sampling/sampling_logp_difference/mean": 0.1136137917637825, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3756.0, "completions/max_terminated_length": 3756.0, "completions/mean_length": 669.21875, "completions/mean_terminated_length": 669.21875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.13360167015343904, "epoch": 0.8442477876106195, "frac_reward_zero_std": 0.625, "grad_norm": 0.3526060990407319, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 252104105.0, "reward": 0.7368632555007935, "reward_std": 0.05987680330872536, "rewards/qatch_small_update_with_fm/mean": 0.7368632555007935, "rewards/qatch_small_update_with_fm/std": 0.3386927843093872, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9987338781356812, "sampling/importance_sampling_ratio/min": 0.008173546753823757, "sampling/sampling_logp_difference/max": 4.806852340698242, "sampling/sampling_logp_difference/mean": 0.10058818012475967, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3016.0, "completions/mean_length": 849.1875, "completions/mean_terminated_length": 823.6220703125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.15353283658623695, "epoch": 0.8460176991150442, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4516144346284262, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 252697289.0, "reward": 0.6796093583106995, "reward_std": 0.1720023900270462, "rewards/qatch_small_update_with_fm/mean": 0.6796093583106995, "rewards/qatch_small_update_with_fm/std": 0.3603024184703827, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9981167912483215, "sampling/importance_sampling_ratio/min": 0.004114686511456966, "sampling/sampling_logp_difference/max": 5.493192672729492, "sampling/sampling_logp_difference/mean": 0.11910088360309601, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3387.0, "completions/max_terminated_length": 3387.0, "completions/mean_length": 628.56640625, "completions/mean_terminated_length": 628.56640625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.13713755458593369, "epoch": 0.8477876106194691, "frac_reward_zero_std": 0.625, "grad_norm": 0.5442060091729809, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 253342426.0, "reward": 0.7485117316246033, "reward_std": 0.09448330849409103, "rewards/qatch_small_update_with_fm/mean": 0.7485117316246033, "rewards/qatch_small_update_with_fm/std": 0.3188386857509613, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9971814751625061, "sampling/importance_sampling_ratio/min": 0.004977615084499121, "sampling/sampling_logp_difference/max": 5.302804470062256, "sampling/sampling_logp_difference/mean": 0.11223170161247253, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 737.57421875, "completions/mean_terminated_length": 724.4039916992188, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.16004723869264126, "epoch": 0.8495575221238938, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3865325051659031, "learning_rate": 1e-06, "loss": -0.037, "num_tokens": 253793261.0, "reward": 0.6448359489440918, "reward_std": 0.15111446380615234, "rewards/qatch_small_update_with_fm/mean": 0.6448359489440918, "rewards/qatch_small_update_with_fm/std": 0.3562937080860138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002942085266113, "sampling/importance_sampling_ratio/min": 0.008727042004466057, "sampling/sampling_logp_difference/max": 4.741328716278076, "sampling/sampling_logp_difference/mean": 0.12233758717775345, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 547.390625, "completions/mean_terminated_length": 533.4745483398438, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.111550472676754, "epoch": 0.8513274336283185, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2075705475708058, "learning_rate": 1e-06, "loss": -0.0327, "num_tokens": 254333697.0, "reward": 0.6903281211853027, "reward_std": 0.03593749552965164, "rewards/qatch_small_update_with_fm/mean": 0.6903281211853027, "rewards/qatch_small_update_with_fm/std": 0.31454235315322876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991868793964386, "sampling/importance_sampling_ratio/min": 1.0412995834485628e-05, "sampling/sampling_logp_difference/max": 11.472455978393555, "sampling/sampling_logp_difference/mean": 0.10045738518238068, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3397.0, "completions/mean_length": 843.91015625, "completions/mean_terminated_length": 779.1275024414062, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.17988201044499874, "epoch": 0.8530973451327434, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3950648815701397, "learning_rate": 1e-06, "loss": -0.0401, "num_tokens": 254895770.0, "reward": 0.7115039229393005, "reward_std": 0.05014953017234802, "rewards/qatch_small_update_with_fm/mean": 0.7115039229393005, "rewards/qatch_small_update_with_fm/std": 0.3483102321624756, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0065269470214844, "sampling/importance_sampling_ratio/min": 0.00866839848458767, "sampling/sampling_logp_difference/max": 4.748071193695068, "sampling/sampling_logp_difference/mean": 0.12747487425804138, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3370.0, "completions/max_terminated_length": 3370.0, "completions/mean_length": 718.48828125, "completions/mean_terminated_length": 718.48828125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.13500451855361462, "epoch": 0.8548672566371681, "frac_reward_zero_std": 0.625, "grad_norm": 0.43031337259879043, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 255571047.0, "reward": 0.6986562609672546, "reward_std": 0.07628892362117767, "rewards/qatch_small_update_with_fm/mean": 0.6986562609672546, "rewards/qatch_small_update_with_fm/std": 0.35163816809654236, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9929754137992859, "sampling/importance_sampling_ratio/min": 0.002005315152928233, "sampling/sampling_logp_difference/max": 6.211954116821289, "sampling/sampling_logp_difference/mean": 0.11147291958332062, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3851.0, "completions/mean_length": 643.24609375, "completions/mean_terminated_length": 588.4404907226562, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.11379838734865189, "epoch": 0.856637168141593, "frac_reward_zero_std": 0.8125, "grad_norm": 0.33428081990047404, "learning_rate": 1e-06, "loss": -0.1222, "num_tokens": 256008854.0, "reward": 0.7900702953338623, "reward_std": 0.04124121367931366, "rewards/qatch_small_update_with_fm/mean": 0.7900702953338623, "rewards/qatch_small_update_with_fm/std": 0.31325802206993103, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9924777746200562, "sampling/importance_sampling_ratio/min": 5.922396303503774e-05, "sampling/sampling_logp_difference/max": 9.734184265136719, "sampling/sampling_logp_difference/mean": 0.10147938132286072, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3578.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 783.12109375, "completions/mean_terminated_length": 783.12109375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.11473686061799526, "epoch": 0.8584070796460177, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3716733908405323, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 256643029.0, "reward": 0.7510117292404175, "reward_std": 0.05420318618416786, "rewards/qatch_small_update_with_fm/mean": 0.7510117292404175, "rewards/qatch_small_update_with_fm/std": 0.3284763693809509, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.993644118309021, "sampling/importance_sampling_ratio/min": 0.0015391077613458037, "sampling/sampling_logp_difference/max": 6.476552486419678, "sampling/sampling_logp_difference/mean": 0.09778095036745071, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 506.46484375, "completions/mean_terminated_length": 506.46484375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.1107774619013071, "epoch": 0.8601769911504424, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4364102557318908, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 257068300.0, "reward": 0.7705351710319519, "reward_std": 0.07204098999500275, "rewards/qatch_small_update_with_fm/mean": 0.7705351710319519, "rewards/qatch_small_update_with_fm/std": 0.3419608771800995, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9939799904823303, "sampling/importance_sampling_ratio/min": 0.0054782782681286335, "sampling/sampling_logp_difference/max": 5.206964492797852, "sampling/sampling_logp_difference/mean": 0.09605875611305237, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3576.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 663.84375, "completions/mean_terminated_length": 663.84375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.14957253821194172, "epoch": 0.8619469026548673, "frac_reward_zero_std": 0.625, "grad_norm": 0.48289123247994914, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 257611332.0, "reward": 0.8433437347412109, "reward_std": 0.11846810579299927, "rewards/qatch_small_update_with_fm/mean": 0.8433437347412109, "rewards/qatch_small_update_with_fm/std": 0.31830912828445435, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99498450756073, "sampling/importance_sampling_ratio/min": 0.005322446580976248, "sampling/sampling_logp_difference/max": 5.2358222007751465, "sampling/sampling_logp_difference/mean": 0.11823485791683197, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3639.0, "completions/max_terminated_length": 3639.0, "completions/mean_length": 532.5, "completions/mean_terminated_length": 532.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.1004238873720169, "epoch": 0.863716814159292, "frac_reward_zero_std": 0.875, "grad_norm": 0.34170294454283023, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 258201076.0, "reward": 0.8699023127555847, "reward_std": 0.008277655579149723, "rewards/qatch_small_update_with_fm/mean": 0.8699023127555847, "rewards/qatch_small_update_with_fm/std": 0.24368809163570404, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896460771560669, "sampling/importance_sampling_ratio/min": 0.005287018604576588, "sampling/sampling_logp_difference/max": 5.2425007820129395, "sampling/sampling_logp_difference/mean": 0.09384314715862274, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3694.0, "completions/max_terminated_length": 3694.0, "completions/mean_length": 638.52734375, "completions/mean_terminated_length": 638.52734375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.10675976052880287, "epoch": 0.8654867256637168, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4477106407992075, "learning_rate": 1e-06, "loss": -0.031, "num_tokens": 258868619.0, "reward": 0.827468752861023, "reward_std": 0.06248141825199127, "rewards/qatch_small_update_with_fm/mean": 0.827468752861023, "rewards/qatch_small_update_with_fm/std": 0.31693241000175476, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.993117094039917, "sampling/importance_sampling_ratio/min": 0.002081583719700575, "sampling/sampling_logp_difference/max": 6.174626350402832, "sampling/sampling_logp_difference/mean": 0.09679339826107025, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3630.0, "completions/max_terminated_length": 3630.0, "completions/mean_length": 555.64453125, "completions/mean_terminated_length": 555.64453125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.11295890063047409, "epoch": 0.8672566371681416, "frac_reward_zero_std": 0.6875, "grad_norm": 0.47637855017594344, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 259404736.0, "reward": 0.8470624685287476, "reward_std": 0.08143269270658493, "rewards/qatch_small_update_with_fm/mean": 0.8470624685287476, "rewards/qatch_small_update_with_fm/std": 0.28385287523269653, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990898847579956, "sampling/importance_sampling_ratio/min": 0.005345887504518032, "sampling/sampling_logp_difference/max": 5.2314276695251465, "sampling/sampling_logp_difference/mean": 0.10513126850128174, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 525.6875, "completions/mean_terminated_length": 525.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.11057206615805626, "epoch": 0.8690265486725663, "frac_reward_zero_std": 0.75, "grad_norm": 0.49316013842711826, "learning_rate": 1e-06, "loss": -0.0172, "num_tokens": 259886576.0, "reward": 0.6682304739952087, "reward_std": 0.06963515281677246, "rewards/qatch_small_update_with_fm/mean": 0.6682304739952087, "rewards/qatch_small_update_with_fm/std": 0.41227987408638, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.994171142578125, "sampling/importance_sampling_ratio/min": 5.919911814089573e-07, "sampling/sampling_logp_difference/max": 14.339774131774902, "sampling/sampling_logp_difference/mean": 0.09741106629371643, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3209.0, "completions/max_terminated_length": 3209.0, "completions/mean_length": 544.3125, "completions/mean_terminated_length": 544.3125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.10203938465565443, "epoch": 0.8707964601769912, "frac_reward_zero_std": 0.8125, "grad_norm": 0.40395896479505305, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 260393056.0, "reward": 0.825976550579071, "reward_std": 0.07064475864171982, "rewards/qatch_small_update_with_fm/mean": 0.825976550579071, "rewards/qatch_small_update_with_fm/std": 0.3124232292175293, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9923082590103149, "sampling/importance_sampling_ratio/min": 0.011136534623801708, "sampling/sampling_logp_difference/max": 4.497524261474609, "sampling/sampling_logp_difference/mean": 0.09712864458560944, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3687.0, "completions/mean_length": 740.96875, "completions/mean_terminated_length": 727.8118286132812, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.13502811640501022, "epoch": 0.8725663716814159, "frac_reward_zero_std": 0.75, "grad_norm": 0.7541222795830347, "learning_rate": 1e-06, "loss": -0.0134, "num_tokens": 261195416.0, "reward": 0.6253867149353027, "reward_std": 0.036423392593860626, "rewards/qatch_small_update_with_fm/mean": 0.6253867149353027, "rewards/qatch_small_update_with_fm/std": 0.39507678151130676, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9943881034851074, "sampling/importance_sampling_ratio/min": 0.0019406526116654277, "sampling/sampling_logp_difference/max": 6.2447309494018555, "sampling/sampling_logp_difference/mean": 0.11299841105937958, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3588.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 626.94140625, "completions/mean_terminated_length": 626.94140625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1206303657963872, "epoch": 0.8743362831858407, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4655702140113037, "learning_rate": 1e-06, "loss": 0.0335, "num_tokens": 261846201.0, "reward": 0.6357265710830688, "reward_std": 0.05497089400887489, "rewards/qatch_small_update_with_fm/mean": 0.6357265710830688, "rewards/qatch_small_update_with_fm/std": 0.380975604057312, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99620121717453, "sampling/importance_sampling_ratio/min": 0.006773304659873247, "sampling/sampling_logp_difference/max": 4.9947662353515625, "sampling/sampling_logp_difference/mean": 0.10108514130115509, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3938.0, "completions/max_terminated_length": 3938.0, "completions/mean_length": 853.140625, "completions/mean_terminated_length": 853.140625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.15600799396634102, "epoch": 0.8761061946902655, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2990253421074993, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 262474925.0, "reward": 0.7554570436477661, "reward_std": 0.0371590256690979, "rewards/qatch_small_update_with_fm/mean": 0.7554570436477661, "rewards/qatch_small_update_with_fm/std": 0.3164856433868408, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999498128890991, "sampling/importance_sampling_ratio/min": 0.006773614324629307, "sampling/sampling_logp_difference/max": 4.994720458984375, "sampling/sampling_logp_difference/mean": 0.11973873525857925, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3271.0, "completions/max_terminated_length": 3271.0, "completions/mean_length": 620.87890625, "completions/mean_terminated_length": 620.87890625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1256363745778799, "epoch": 0.8778761061946903, "frac_reward_zero_std": 0.375, "grad_norm": 0.695171336012073, "learning_rate": 1e-06, "loss": 0.0355, "num_tokens": 263142830.0, "reward": 0.7039062976837158, "reward_std": 0.12556229531764984, "rewards/qatch_small_update_with_fm/mean": 0.7039062976837158, "rewards/qatch_small_update_with_fm/std": 0.37397584319114685, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914224147796631, "sampling/importance_sampling_ratio/min": 0.0022153332829475403, "sampling/sampling_logp_difference/max": 6.11235237121582, "sampling/sampling_logp_difference/mean": 0.11125008016824722, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3627.0, "completions/max_terminated_length": 3627.0, "completions/mean_length": 599.80078125, "completions/mean_terminated_length": 599.80078125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.11637549288570881, "epoch": 0.879646017699115, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1743804157983891, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 263689803.0, "reward": 0.7390859127044678, "reward_std": 0.009406249970197678, "rewards/qatch_small_update_with_fm/mean": 0.7390859127044678, "rewards/qatch_small_update_with_fm/std": 0.3409428298473358, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9943405389785767, "sampling/importance_sampling_ratio/min": 0.008762093260884285, "sampling/sampling_logp_difference/max": 4.737320423126221, "sampling/sampling_logp_difference/mean": 0.09983810782432556, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 592.25, "completions/mean_terminated_length": 592.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.12224038224667311, "epoch": 0.8814159292035398, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4241928294714878, "learning_rate": 1e-06, "loss": -0.0232, "num_tokens": 264256907.0, "reward": 0.7852305173873901, "reward_std": 0.04839783534407616, "rewards/qatch_small_update_with_fm/mean": 0.7852305173873901, "rewards/qatch_small_update_with_fm/std": 0.32617098093032837, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9915899634361267, "sampling/importance_sampling_ratio/min": 0.00750912819057703, "sampling/sampling_logp_difference/max": 4.891635894775391, "sampling/sampling_logp_difference/mean": 0.10896101593971252, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3446.0, "completions/mean_length": 680.08203125, "completions/mean_terminated_length": 653.18505859375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.1548197865486145, "epoch": 0.8831858407079646, "frac_reward_zero_std": 0.6875, "grad_norm": 0.42449434564075406, "learning_rate": 1e-06, "loss": -0.0168, "num_tokens": 264778464.0, "reward": 0.7460390329360962, "reward_std": 0.04268177971243858, "rewards/qatch_small_update_with_fm/mean": 0.7460390329360962, "rewards/qatch_small_update_with_fm/std": 0.3574458062648773, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997272491455078, "sampling/importance_sampling_ratio/min": 0.00681761559098959, "sampling/sampling_logp_difference/max": 4.988245487213135, "sampling/sampling_logp_difference/mean": 0.12331503629684448, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3910.0, "completions/mean_length": 621.04296875, "completions/mean_terminated_length": 607.4157104492188, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.12380017153918743, "epoch": 0.8849557522123894, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5938474878995735, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 265333899.0, "reward": 0.6871640682220459, "reward_std": 0.11829064786434174, "rewards/qatch_small_update_with_fm/mean": 0.6871640682220459, "rewards/qatch_small_update_with_fm/std": 0.4201640784740448, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9962127208709717, "sampling/importance_sampling_ratio/min": 0.00790239404886961, "sampling/sampling_logp_difference/max": 4.84058952331543, "sampling/sampling_logp_difference/mean": 0.10077165812253952, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3528.0, "completions/mean_length": 649.7109375, "completions/mean_terminated_length": 636.1961059570312, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.13394583947956562, "epoch": 0.8867256637168142, "frac_reward_zero_std": 0.5, "grad_norm": 0.5901939553261971, "learning_rate": 1e-06, "loss": -0.0195, "num_tokens": 265955729.0, "reward": 0.7199413776397705, "reward_std": 0.13469067215919495, "rewards/qatch_small_update_with_fm/mean": 0.7199413776397705, "rewards/qatch_small_update_with_fm/std": 0.358967661857605, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9926297068595886, "sampling/importance_sampling_ratio/min": 0.0005967216566205025, "sampling/sampling_logp_difference/max": 7.424059867858887, "sampling/sampling_logp_difference/mean": 0.1124434769153595, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3276.0, "completions/max_terminated_length": 3276.0, "completions/mean_length": 795.90625, "completions/mean_terminated_length": 795.90625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.13532480970025063, "epoch": 0.8884955752212389, "frac_reward_zero_std": 0.5, "grad_norm": 0.47390615928516006, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 266872601.0, "reward": 0.6929843425750732, "reward_std": 0.128580704331398, "rewards/qatch_small_update_with_fm/mean": 0.6929843425750732, "rewards/qatch_small_update_with_fm/std": 0.3396168649196625, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927940964698792, "sampling/importance_sampling_ratio/min": 0.0025748631451278925, "sampling/sampling_logp_difference/max": 5.961958885192871, "sampling/sampling_logp_difference/mean": 0.11729906499385834, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3900.0, "completions/mean_length": 747.88671875, "completions/mean_terminated_length": 625.8906860351562, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.13497674651443958, "epoch": 0.8902654867256637, "frac_reward_zero_std": 0.625, "grad_norm": 0.397347167414158, "learning_rate": 1e-06, "loss": -0.1661, "num_tokens": 267346108.0, "reward": 0.7580468654632568, "reward_std": 0.09373210370540619, "rewards/qatch_small_update_with_fm/mean": 0.7580468654632568, "rewards/qatch_small_update_with_fm/std": 0.3393411636352539, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9984617829322815, "sampling/importance_sampling_ratio/min": 0.0005745299276895821, "sampling/sampling_logp_difference/max": 7.461958408355713, "sampling/sampling_logp_difference/mean": 0.10664564371109009, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3773.0, "completions/max_terminated_length": 3773.0, "completions/mean_length": 794.94140625, "completions/mean_terminated_length": 794.94140625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.14703031815588474, "epoch": 0.8920353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 0.47300725939173244, "learning_rate": 1e-06, "loss": -0.038, "num_tokens": 268114605.0, "reward": 0.746777355670929, "reward_std": 0.1325729787349701, "rewards/qatch_small_update_with_fm/mean": 0.746777355670929, "rewards/qatch_small_update_with_fm/std": 0.3324522376060486, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9946229457855225, "sampling/importance_sampling_ratio/min": 0.004103472921997309, "sampling/sampling_logp_difference/max": 5.495921611785889, "sampling/sampling_logp_difference/mean": 0.11896717548370361, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 607.3125, "completions/mean_terminated_length": 607.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1080400925129652, "epoch": 0.8938053097345132, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5307241600276866, "learning_rate": 1e-06, "loss": 0.04, "num_tokens": 268747581.0, "reward": 0.7987890243530273, "reward_std": 0.045952875167131424, "rewards/qatch_small_update_with_fm/mean": 0.7987890243530273, "rewards/qatch_small_update_with_fm/std": 0.3247639536857605, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9940429925918579, "sampling/importance_sampling_ratio/min": 0.003386986441910267, "sampling/sampling_logp_difference/max": 5.687814712524414, "sampling/sampling_logp_difference/mean": 0.09250658750534058, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 516.34375, "completions/mean_terminated_length": 516.34375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.10784084629267454, "epoch": 0.8955752212389381, "frac_reward_zero_std": 0.6875, "grad_norm": 0.44363434212922453, "learning_rate": 1e-06, "loss": -0.0223, "num_tokens": 269317493.0, "reward": 0.7939062714576721, "reward_std": 0.05867087468504906, "rewards/qatch_small_update_with_fm/mean": 0.7939062714576721, "rewards/qatch_small_update_with_fm/std": 0.31158387660980225, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9887971878051758, "sampling/importance_sampling_ratio/min": 0.006596569903194904, "sampling/sampling_logp_difference/max": 5.021205425262451, "sampling/sampling_logp_difference/mean": 0.10122236609458923, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 547.87890625, "completions/mean_terminated_length": 533.9647216796875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.10982772801071405, "epoch": 0.8973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 0.5602113323442856, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 270047254.0, "reward": 0.8025702834129333, "reward_std": 0.07300128787755966, "rewards/qatch_small_update_with_fm/mean": 0.8025702834129333, "rewards/qatch_small_update_with_fm/std": 0.36065641045570374, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890502691268921, "sampling/importance_sampling_ratio/min": 0.008726593106985092, "sampling/sampling_logp_difference/max": 4.741380214691162, "sampling/sampling_logp_difference/mean": 0.10416848212480545, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 555.31640625, "completions/mean_terminated_length": 555.31640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.10554197616875172, "epoch": 0.8991150442477877, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6791503270002681, "learning_rate": 1e-06, "loss": -0.0123, "num_tokens": 270632903.0, "reward": 0.7361679673194885, "reward_std": 0.06678478419780731, "rewards/qatch_small_update_with_fm/mean": 0.7361679673194885, "rewards/qatch_small_update_with_fm/std": 0.3236234784126282, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9916872382164001, "sampling/importance_sampling_ratio/min": 0.006751068867743015, "sampling/sampling_logp_difference/max": 4.998054504394531, "sampling/sampling_logp_difference/mean": 0.0991954579949379, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3049.0, "completions/mean_length": 520.3515625, "completions/mean_terminated_length": 492.19683837890625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.10747648775577545, "epoch": 0.9008849557522124, "frac_reward_zero_std": 0.8125, "grad_norm": 0.31376394668547153, "learning_rate": 1e-06, "loss": -0.0331, "num_tokens": 271390833.0, "reward": 0.8341054916381836, "reward_std": 0.05029167979955673, "rewards/qatch_small_update_with_fm/mean": 0.8341054916381836, "rewards/qatch_small_update_with_fm/std": 0.33065176010131836, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912950396537781, "sampling/importance_sampling_ratio/min": 0.000440972886281088, "sampling/sampling_logp_difference/max": 7.726527214050293, "sampling/sampling_logp_difference/mean": 0.10213141143321991, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 475.76171875, "completions/mean_terminated_length": 475.76171875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.101709870621562, "epoch": 0.9026548672566371, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5136908947485258, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 271914180.0, "reward": 0.7939648628234863, "reward_std": 0.10082687437534332, "rewards/qatch_small_update_with_fm/mean": 0.7939648032188416, "rewards/qatch_small_update_with_fm/std": 0.3312971591949463, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9915147423744202, "sampling/importance_sampling_ratio/min": 0.00526475952938199, "sampling/sampling_logp_difference/max": 5.246719837188721, "sampling/sampling_logp_difference/mean": 0.09459386765956879, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3733.0, "completions/mean_length": 709.73828125, "completions/mean_terminated_length": 683.0748291015625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.11102820746600628, "epoch": 0.904424778761062, "frac_reward_zero_std": 0.8125, "grad_norm": 1.0245789709175026, "learning_rate": 1e-06, "loss": -0.0864, "num_tokens": 272625921.0, "reward": 0.8580273389816284, "reward_std": 0.0242561474442482, "rewards/qatch_small_update_with_fm/mean": 0.8580273389816284, "rewards/qatch_small_update_with_fm/std": 0.2759164571762085, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9949482679367065, "sampling/importance_sampling_ratio/min": 0.005282989237457514, "sampling/sampling_logp_difference/max": 5.243263244628906, "sampling/sampling_logp_difference/mean": 0.09105080366134644, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 379.58984375, "completions/mean_terminated_length": 379.58984375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.09121392387896776, "epoch": 0.9061946902654867, "frac_reward_zero_std": 0.875, "grad_norm": 0.2937055756246945, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 273184376.0, "reward": 0.960460901260376, "reward_std": 0.02830149233341217, "rewards/qatch_small_update_with_fm/mean": 0.960460901260376, "rewards/qatch_small_update_with_fm/std": 0.14264100790023804, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898667931556702, "sampling/importance_sampling_ratio/min": 0.0067546553909778595, "sampling/sampling_logp_difference/max": 4.997523307800293, "sampling/sampling_logp_difference/mean": 0.0861605778336525, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3332.0, "completions/max_terminated_length": 3332.0, "completions/mean_length": 550.4609375, "completions/mean_terminated_length": 550.4609375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.12689280044287443, "epoch": 0.9079646017699115, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5932380148617874, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 273674846.0, "reward": 0.7879921793937683, "reward_std": 0.14545322954654694, "rewards/qatch_small_update_with_fm/mean": 0.7879921793937683, "rewards/qatch_small_update_with_fm/std": 0.3656655251979828, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9960757493972778, "sampling/importance_sampling_ratio/min": 0.011154396459460258, "sampling/sampling_logp_difference/max": 4.495921611785889, "sampling/sampling_logp_difference/mean": 0.10277114808559418, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 491.1015625, "completions/mean_terminated_length": 476.9647216796875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.10514062270522118, "epoch": 0.9097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 0.4467300183498017, "learning_rate": 1e-06, "loss": -0.0923, "num_tokens": 274116648.0, "reward": 0.872878909111023, "reward_std": 0.04924742877483368, "rewards/qatch_small_update_with_fm/mean": 0.872878909111023, "rewards/qatch_small_update_with_fm/std": 0.22971169650554657, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.994683027267456, "sampling/importance_sampling_ratio/min": 0.0010823322227224708, "sampling/sampling_logp_difference/max": 6.82863712310791, "sampling/sampling_logp_difference/mean": 0.09253266453742981, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 489.421875, "completions/mean_terminated_length": 489.421875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.10272311139851809, "epoch": 0.911504424778761, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6825855255982989, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 274671252.0, "reward": 0.7494022846221924, "reward_std": 0.08151744306087494, "rewards/qatch_small_update_with_fm/mean": 0.7494022846221924, "rewards/qatch_small_update_with_fm/std": 0.3152708411216736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9923495054244995, "sampling/importance_sampling_ratio/min": 0.000986733939498663, "sampling/sampling_logp_difference/max": 6.921110153198242, "sampling/sampling_logp_difference/mean": 0.09436982870101929, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3437.0, "completions/mean_length": 709.078125, "completions/mean_terminated_length": 695.796142578125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.1157941734418273, "epoch": 0.9132743362831859, "frac_reward_zero_std": 0.5625, "grad_norm": 0.44786462791495824, "learning_rate": 1e-06, "loss": -0.0207, "num_tokens": 275267848.0, "reward": 0.7720664143562317, "reward_std": 0.09372272342443466, "rewards/qatch_small_update_with_fm/mean": 0.7720664143562317, "rewards/qatch_small_update_with_fm/std": 0.31302616000175476, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927377700805664, "sampling/importance_sampling_ratio/min": 0.004789196886122227, "sampling/sampling_logp_difference/max": 5.341392517089844, "sampling/sampling_logp_difference/mean": 0.09881985187530518, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3991.0, "completions/mean_length": 756.34375, "completions/mean_terminated_length": 716.7431030273438, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.11825973261147738, "epoch": 0.9150442477876106, "frac_reward_zero_std": 0.625, "grad_norm": 0.6220394658984534, "learning_rate": 1e-06, "loss": -0.078, "num_tokens": 276139968.0, "reward": 0.6929961442947388, "reward_std": 0.07409294694662094, "rewards/qatch_small_update_with_fm/mean": 0.6929961442947388, "rewards/qatch_small_update_with_fm/std": 0.38647279143333435, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.995445966720581, "sampling/importance_sampling_ratio/min": 0.00030024454463273287, "sampling/sampling_logp_difference/max": 8.110913276672363, "sampling/sampling_logp_difference/mean": 0.09951721131801605, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 383.0859375, "completions/mean_terminated_length": 383.0859375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.0746139083057642, "epoch": 0.9168141592920354, "frac_reward_zero_std": 0.875, "grad_norm": 0.4217912624034053, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 276603734.0, "reward": 0.7472773194313049, "reward_std": 0.04199932515621185, "rewards/qatch_small_update_with_fm/mean": 0.7472773194313049, "rewards/qatch_small_update_with_fm/std": 0.3478599488735199, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900175333023071, "sampling/importance_sampling_ratio/min": 7.775412814226002e-05, "sampling/sampling_logp_difference/max": 9.461958885192871, "sampling/sampling_logp_difference/mean": 0.07847994565963745, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3543.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 668.703125, "completions/mean_terminated_length": 668.703125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.11498175282031298, "epoch": 0.9185840707964602, "frac_reward_zero_std": 0.6875, "grad_norm": 0.45490108367204685, "learning_rate": 1e-06, "loss": 0.0702, "num_tokens": 277088282.0, "reward": 0.7890429496765137, "reward_std": 0.05803219974040985, "rewards/qatch_small_update_with_fm/mean": 0.7890429496765137, "rewards/qatch_small_update_with_fm/std": 0.2938583195209503, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.992883563041687, "sampling/importance_sampling_ratio/min": 0.0026743346825242043, "sampling/sampling_logp_difference/max": 5.9240546226501465, "sampling/sampling_logp_difference/mean": 0.10016723722219467, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 570.6796875, "completions/mean_terminated_length": 570.6796875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.12316263653337955, "epoch": 0.9203539823008849, "frac_reward_zero_std": 0.75, "grad_norm": 0.5428826382650482, "learning_rate": 1e-06, "loss": 0.0333, "num_tokens": 277591640.0, "reward": 0.8450508117675781, "reward_std": 0.0795350894331932, "rewards/qatch_small_update_with_fm/mean": 0.8450508117675781, "rewards/qatch_small_update_with_fm/std": 0.2786359190940857, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9909742474555969, "sampling/importance_sampling_ratio/min": 0.0041517033241689205, "sampling/sampling_logp_difference/max": 5.484236717224121, "sampling/sampling_logp_difference/mean": 0.11202063411474228, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 851.640625, "completions/mean_terminated_length": 838.917724609375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.1328129693865776, "epoch": 0.9221238938053097, "frac_reward_zero_std": 0.625, "grad_norm": 0.6348699462205718, "learning_rate": 1e-06, "loss": -0.0749, "num_tokens": 278204700.0, "reward": 0.6865078210830688, "reward_std": 0.0656687542796135, "rewards/qatch_small_update_with_fm/mean": 0.6865078210830688, "rewards/qatch_small_update_with_fm/std": 0.35934215784072876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9983229041099548, "sampling/importance_sampling_ratio/min": 0.0016617655055597425, "sampling/sampling_logp_difference/max": 6.399874687194824, "sampling/sampling_logp_difference/mean": 0.10541870445013046, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 466.08984375, "completions/mean_terminated_length": 466.08984375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.09466620348393917, "epoch": 0.9238938053097345, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5389918379739942, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 278721795.0, "reward": 0.7020820379257202, "reward_std": 0.04175513610243797, "rewards/qatch_small_update_with_fm/mean": 0.7020820379257202, "rewards/qatch_small_update_with_fm/std": 0.2982487976551056, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9895206689834595, "sampling/importance_sampling_ratio/min": 0.0014664140762761235, "sampling/sampling_logp_difference/max": 6.524935245513916, "sampling/sampling_logp_difference/mean": 0.0922519713640213, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2794.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 697.234375, "completions/mean_terminated_length": 697.234375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.12420686054974794, "epoch": 0.9256637168141593, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4637277192811646, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 279378175.0, "reward": 0.907964825630188, "reward_std": 0.060234054923057556, "rewards/qatch_small_update_with_fm/mean": 0.907964825630188, "rewards/qatch_small_update_with_fm/std": 0.2710384428501129, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9939944744110107, "sampling/importance_sampling_ratio/min": 3.092254246439552e-06, "sampling/sampling_logp_difference/max": 12.686610221862793, "sampling/sampling_logp_difference/mean": 0.10830149054527283, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3774.0, "completions/max_terminated_length": 3774.0, "completions/mean_length": 592.01953125, "completions/mean_terminated_length": 592.01953125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.11691632680594921, "epoch": 0.9274336283185841, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4683143629081742, "learning_rate": 1e-06, "loss": 0.0532, "num_tokens": 279822276.0, "reward": 0.7248905897140503, "reward_std": 0.0623093843460083, "rewards/qatch_small_update_with_fm/mean": 0.7248905897140503, "rewards/qatch_small_update_with_fm/std": 0.3484153151512146, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925761222839355, "sampling/importance_sampling_ratio/min": 0.008074809797108173, "sampling/sampling_logp_difference/max": 4.819005966186523, "sampling/sampling_logp_difference/mean": 0.1028011366724968, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 483.20703125, "completions/mean_terminated_length": 483.20703125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.08595089986920357, "epoch": 0.9292035398230089, "frac_reward_zero_std": 0.75, "grad_norm": 0.5232236333613473, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 280248649.0, "reward": 0.800000011920929, "reward_std": 0.05487021803855896, "rewards/qatch_small_update_with_fm/mean": 0.800000011920929, "rewards/qatch_small_update_with_fm/std": 0.3535082936286926, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9883502721786499, "sampling/importance_sampling_ratio/min": 0.004996729549020529, "sampling/sampling_logp_difference/max": 5.298971652984619, "sampling/sampling_logp_difference/mean": 0.09205498546361923, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2210.0, "completions/max_terminated_length": 2210.0, "completions/mean_length": 597.28515625, "completions/mean_terminated_length": 597.28515625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.11841507069766521, "epoch": 0.9309734513274336, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5666674244929343, "learning_rate": 1e-06, "loss": 0.0301, "num_tokens": 280657026.0, "reward": 0.8406132459640503, "reward_std": 0.0987255647778511, "rewards/qatch_small_update_with_fm/mean": 0.8406132459640503, "rewards/qatch_small_update_with_fm/std": 0.2870708405971527, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896042943000793, "sampling/importance_sampling_ratio/min": 0.002490278333425522, "sampling/sampling_logp_difference/max": 5.995360851287842, "sampling/sampling_logp_difference/mean": 0.1075456291437149, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1692.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 487.5078125, "completions/mean_terminated_length": 487.5078125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.08723330218344927, "epoch": 0.9327433628318584, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5266721054678201, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 281301316.0, "reward": 0.7623125314712524, "reward_std": 0.029279010370373726, "rewards/qatch_small_update_with_fm/mean": 0.7623125314712524, "rewards/qatch_small_update_with_fm/std": 0.37368521094322205, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890673160552979, "sampling/importance_sampling_ratio/min": 2.2507217067868623e-07, "sampling/sampling_logp_difference/max": 15.306844711303711, "sampling/sampling_logp_difference/mean": 0.08900699019432068, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3234.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 670.80078125, "completions/mean_terminated_length": 670.80078125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.10302252415567636, "epoch": 0.9345132743362832, "frac_reward_zero_std": 0.6875, "grad_norm": 0.48289001580005797, "learning_rate": 1e-06, "loss": -0.033, "num_tokens": 281832577.0, "reward": 0.714773416519165, "reward_std": 0.07669016718864441, "rewards/qatch_small_update_with_fm/mean": 0.714773416519165, "rewards/qatch_small_update_with_fm/std": 0.334821879863739, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9901472926139832, "sampling/importance_sampling_ratio/min": 0.004114319570362568, "sampling/sampling_logp_difference/max": 5.493281841278076, "sampling/sampling_logp_difference/mean": 0.10036074370145798, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3983.0, "completions/mean_length": 1048.1640625, "completions/mean_terminated_length": 962.4818725585938, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.12826403323560953, "epoch": 0.9362831858407079, "frac_reward_zero_std": 0.625, "grad_norm": 0.4273381754299511, "learning_rate": 1e-06, "loss": -0.0511, "num_tokens": 282545243.0, "reward": 0.6557812690734863, "reward_std": 0.11316819489002228, "rewards/qatch_small_update_with_fm/mean": 0.6557812690734863, "rewards/qatch_small_update_with_fm/std": 0.37949860095977783, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934544563293457, "sampling/importance_sampling_ratio/min": 0.00034160760696977377, "sampling/sampling_logp_difference/max": 7.981847763061523, "sampling/sampling_logp_difference/mean": 0.10794238746166229, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 870.19921875, "completions/mean_terminated_length": 857.549072265625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.14097079075872898, "epoch": 0.9380530973451328, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4239509379502484, "learning_rate": 1e-06, "loss": -0.0112, "num_tokens": 283138702.0, "reward": 0.6905586123466492, "reward_std": 0.06804914027452469, "rewards/qatch_small_update_with_fm/mean": 0.6905585527420044, "rewards/qatch_small_update_with_fm/std": 0.3696425259113312, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9920858144760132, "sampling/importance_sampling_ratio/min": 0.003613534849137068, "sampling/sampling_logp_difference/max": 5.623068809509277, "sampling/sampling_logp_difference/mean": 0.11941371113061905, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 528.30078125, "completions/mean_terminated_length": 528.30078125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.09164797514677048, "epoch": 0.9398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 0.5307769550601885, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 283613819.0, "reward": 0.8188437819480896, "reward_std": 0.06766097992658615, "rewards/qatch_small_update_with_fm/mean": 0.8188437819480896, "rewards/qatch_small_update_with_fm/std": 0.3401625156402588, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9893657565116882, "sampling/importance_sampling_ratio/min": 0.0008690520189702511, "sampling/sampling_logp_difference/max": 7.048107624053955, "sampling/sampling_logp_difference/mean": 0.09450694918632507, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3533.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 565.26171875, "completions/mean_terminated_length": 565.26171875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.08598773088306189, "epoch": 0.9415929203539823, "frac_reward_zero_std": 0.875, "grad_norm": 0.38821236574145923, "learning_rate": 1e-06, "loss": -0.011, "num_tokens": 284038894.0, "reward": 0.7724023461341858, "reward_std": 0.042911797761917114, "rewards/qatch_small_update_with_fm/mean": 0.7724023461341858, "rewards/qatch_small_update_with_fm/std": 0.337915301322937, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913812875747681, "sampling/importance_sampling_ratio/min": 0.004855873994529247, "sampling/sampling_logp_difference/max": 5.327566146850586, "sampling/sampling_logp_difference/mean": 0.0843484178185463, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3197.0, "completions/max_terminated_length": 3197.0, "completions/mean_length": 405.58984375, "completions/mean_terminated_length": 405.58984375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.0662353434599936, "epoch": 0.9433628318584071, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6172104659359773, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 284502357.0, "reward": 0.7266562581062317, "reward_std": 0.07175854593515396, "rewards/qatch_small_update_with_fm/mean": 0.7266561985015869, "rewards/qatch_small_update_with_fm/std": 0.40244054794311523, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914896488189697, "sampling/importance_sampling_ratio/min": 0.0015248889103531837, "sampling/sampling_logp_difference/max": 6.485833644866943, "sampling/sampling_logp_difference/mean": 0.07165195047855377, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 402.88671875, "completions/mean_terminated_length": 402.88671875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06576312193647027, "epoch": 0.9451327433628318, "frac_reward_zero_std": 0.9375, "grad_norm": 0.41788947304605045, "learning_rate": 1e-06, "loss": -0.0167, "num_tokens": 284956248.0, "reward": 0.8745741844177246, "reward_std": 0.0049965716898441315, "rewards/qatch_small_update_with_fm/mean": 0.8745741844177246, "rewards/qatch_small_update_with_fm/std": 0.2458096295595169, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9895414113998413, "sampling/importance_sampling_ratio/min": 0.006748078390955925, "sampling/sampling_logp_difference/max": 4.998497486114502, "sampling/sampling_logp_difference/mean": 0.07585783302783966, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3813.0, "completions/max_terminated_length": 3813.0, "completions/mean_length": 575.8125, "completions/mean_terminated_length": 575.8125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.07056690799072385, "epoch": 0.9469026548672567, "frac_reward_zero_std": 0.6875, "grad_norm": 0.7378731304713649, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 285410296.0, "reward": 0.7944101691246033, "reward_std": 0.06454658508300781, "rewards/qatch_small_update_with_fm/mean": 0.7944101691246033, "rewards/qatch_small_update_with_fm/std": 0.34657639265060425, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912918210029602, "sampling/importance_sampling_ratio/min": 0.004212359432131052, "sampling/sampling_logp_difference/max": 5.469732284545898, "sampling/sampling_logp_difference/mean": 0.07724401354789734, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2376.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 515.69921875, "completions/mean_terminated_length": 515.69921875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.08022209769114852, "epoch": 0.9486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 0.5696256068461455, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 286059707.0, "reward": 0.7645195126533508, "reward_std": 0.06664008647203445, "rewards/qatch_small_update_with_fm/mean": 0.7645195126533508, "rewards/qatch_small_update_with_fm/std": 0.33420178294181824, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892451763153076, "sampling/importance_sampling_ratio/min": 0.0020269453525543213, "sampling/sampling_logp_difference/max": 6.201225280761719, "sampling/sampling_logp_difference/mean": 0.08961813151836395, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 601.76171875, "completions/mean_terminated_length": 588.058837890625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.0777341304346919, "epoch": 0.9504424778761061, "frac_reward_zero_std": 0.75, "grad_norm": 0.5403558957446779, "learning_rate": 1e-06, "loss": 0.039, "num_tokens": 286713822.0, "reward": 0.8262421488761902, "reward_std": 0.10278799384832382, "rewards/qatch_small_update_with_fm/mean": 0.8262421488761902, "rewards/qatch_small_update_with_fm/std": 0.32409027218818665, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906113147735596, "sampling/importance_sampling_ratio/min": 0.000726877769920975, "sampling/sampling_logp_difference/max": 7.226752281188965, "sampling/sampling_logp_difference/mean": 0.08289214968681335, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 534.79296875, "completions/mean_terminated_length": 534.79296875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07284298213198781, "epoch": 0.952212389380531, "frac_reward_zero_std": 0.8125, "grad_norm": 0.407306536990088, "learning_rate": 1e-06, "loss": 0.0103, "num_tokens": 287406777.0, "reward": 0.7659531235694885, "reward_std": 0.03347664326429367, "rewards/qatch_small_update_with_fm/mean": 0.7659531235694885, "rewards/qatch_small_update_with_fm/std": 0.31723061203956604, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9882594347000122, "sampling/importance_sampling_ratio/min": 4.673008646705057e-08, "sampling/sampling_logp_difference/max": 16.878877639770508, "sampling/sampling_logp_difference/mean": 0.08553198724985123, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3659.0, "completions/max_terminated_length": 3659.0, "completions/mean_length": 560.9609375, "completions/mean_terminated_length": 560.9609375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.063908651471138, "epoch": 0.9539823008849557, "frac_reward_zero_std": 0.875, "grad_norm": 0.49161049554807584, "learning_rate": 1e-06, "loss": 0.0278, "num_tokens": 287984415.0, "reward": 0.7458593845367432, "reward_std": 0.035589832812547684, "rewards/qatch_small_update_with_fm/mean": 0.7458593845367432, "rewards/qatch_small_update_with_fm/std": 0.35967662930488586, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9910004138946533, "sampling/importance_sampling_ratio/min": 2.653115188877564e-06, "sampling/sampling_logp_difference/max": 12.839776039123535, "sampling/sampling_logp_difference/mean": 0.07102929055690765, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3777.0, "completions/mean_length": 657.8671875, "completions/mean_terminated_length": 644.3843383789062, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.07635938376188278, "epoch": 0.9557522123893806, "frac_reward_zero_std": 0.75, "grad_norm": 0.5452335209582936, "learning_rate": 1e-06, "loss": -0.0607, "num_tokens": 288796429.0, "reward": 0.5647109746932983, "reward_std": 0.06827317178249359, "rewards/qatch_small_update_with_fm/mean": 0.5647109746932983, "rewards/qatch_small_update_with_fm/std": 0.38985252380371094, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9927794933319092, "sampling/importance_sampling_ratio/min": 0.00033576894202269614, "sampling/sampling_logp_difference/max": 7.999087333679199, "sampling/sampling_logp_difference/mean": 0.07802629470825195, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2996.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 544.43359375, "completions/mean_terminated_length": 544.43359375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06596293672919273, "epoch": 0.9575221238938053, "frac_reward_zero_std": 0.9375, "grad_norm": 0.27206970564743294, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 289291676.0, "reward": 0.7545312643051147, "reward_std": 0.015625, "rewards/qatch_small_update_with_fm/mean": 0.7545312643051147, "rewards/qatch_small_update_with_fm/std": 0.37506264448165894, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9873992204666138, "sampling/importance_sampling_ratio/min": 0.0026083062402904034, "sampling/sampling_logp_difference/max": 5.94905424118042, "sampling/sampling_logp_difference/mean": 0.08176256716251373, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3901.0, "completions/mean_length": 934.8359375, "completions/mean_terminated_length": 922.4392700195312, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.1001374926418066, "epoch": 0.95929203539823, "frac_reward_zero_std": 0.625, "grad_norm": 0.559960616971841, "learning_rate": 1e-06, "loss": -0.0294, "num_tokens": 289976722.0, "reward": 0.6559218764305115, "reward_std": 0.08523955941200256, "rewards/qatch_small_update_with_fm/mean": 0.6559218168258667, "rewards/qatch_small_update_with_fm/std": 0.3872259855270386, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9958049654960632, "sampling/importance_sampling_ratio/min": 0.0012593659339472651, "sampling/sampling_logp_difference/max": 6.677146911621094, "sampling/sampling_logp_difference/mean": 0.09016713500022888, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 498.71484375, "completions/mean_terminated_length": 498.71484375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.0649999501183629, "epoch": 0.9610619469026549, "frac_reward_zero_std": 0.8125, "grad_norm": 0.47685171312494107, "learning_rate": 1e-06, "loss": -0.0217, "num_tokens": 290494377.0, "reward": 0.745410144329071, "reward_std": 0.04396291822195053, "rewards/qatch_small_update_with_fm/mean": 0.745410144329071, "rewards/qatch_small_update_with_fm/std": 0.3141641318798065, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892974495887756, "sampling/importance_sampling_ratio/min": 0.0004503573873080313, "sampling/sampling_logp_difference/max": 7.705469131469727, "sampling/sampling_logp_difference/mean": 0.07702207565307617, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1027.33984375, "completions/mean_terminated_length": 990.95263671875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.09254028089344501, "epoch": 0.9628318584070796, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5142745856968639, "learning_rate": 1e-06, "loss": -0.0922, "num_tokens": 291223952.0, "reward": 0.657976508140564, "reward_std": 0.1279950886964798, "rewards/qatch_small_update_with_fm/mean": 0.657976508140564, "rewards/qatch_small_update_with_fm/std": 0.3692423105239868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9931938648223877, "sampling/importance_sampling_ratio/min": 0.0009468301432207227, "sampling/sampling_logp_difference/max": 6.962390899658203, "sampling/sampling_logp_difference/mean": 0.0876588374376297, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2998.0, "completions/max_terminated_length": 2998.0, "completions/mean_length": 573.875, "completions/mean_terminated_length": 573.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06328198779374361, "epoch": 0.9646017699115044, "frac_reward_zero_std": 0.875, "grad_norm": 0.41556654472228777, "learning_rate": 1e-06, "loss": -0.0308, "num_tokens": 291649248.0, "reward": 0.7210820317268372, "reward_std": 0.022427227348089218, "rewards/qatch_small_update_with_fm/mean": 0.7210820317268372, "rewards/qatch_small_update_with_fm/std": 0.3797529935836792, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914613962173462, "sampling/importance_sampling_ratio/min": 0.0003364813746884465, "sampling/sampling_logp_difference/max": 7.996967792510986, "sampling/sampling_logp_difference/mean": 0.0714886337518692, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3946.0, "completions/mean_length": 674.890625, "completions/mean_terminated_length": 647.9527587890625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07149964012205601, "epoch": 0.9663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 0.4074940853826009, "learning_rate": 1e-06, "loss": -0.0965, "num_tokens": 292226948.0, "reward": 0.7326640486717224, "reward_std": 0.05338604003190994, "rewards/qatch_small_update_with_fm/mean": 0.7326640486717224, "rewards/qatch_small_update_with_fm/std": 0.35138407349586487, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9919158220291138, "sampling/importance_sampling_ratio/min": 0.0015048105269670486, "sampling/sampling_logp_difference/max": 6.499088287353516, "sampling/sampling_logp_difference/mean": 0.07774293422698975, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 783.1640625, "completions/mean_terminated_length": 770.172607421875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.07332196040078998, "epoch": 0.968141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 0.2833815598679686, "learning_rate": 1e-06, "loss": -0.028, "num_tokens": 292919086.0, "reward": 0.6988593339920044, "reward_std": 0.03941141813993454, "rewards/qatch_small_update_with_fm/mean": 0.6988593339920044, "rewards/qatch_small_update_with_fm/std": 0.346405953168869, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990686297416687, "sampling/importance_sampling_ratio/min": 9.723559196572751e-05, "sampling/sampling_logp_difference/max": 9.238373756408691, "sampling/sampling_logp_difference/mean": 0.08024582266807556, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 1048.33203125, "completions/mean_terminated_length": 1024.3345947265625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.10439629666507244, "epoch": 0.9699115044247788, "frac_reward_zero_std": 0.6875, "grad_norm": 0.38819391093077565, "learning_rate": 1e-06, "loss": -0.0542, "num_tokens": 293700723.0, "reward": 0.7764492034912109, "reward_std": 0.057538390159606934, "rewards/qatch_small_update_with_fm/mean": 0.7764492630958557, "rewards/qatch_small_update_with_fm/std": 0.35874632000923157, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9915471076965332, "sampling/importance_sampling_ratio/min": 0.0003894827968906611, "sampling/sampling_logp_difference/max": 7.850690841674805, "sampling/sampling_logp_difference/mean": 0.10121684521436691, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3337.0, "completions/max_terminated_length": 3337.0, "completions/mean_length": 514.9296875, "completions/mean_terminated_length": 514.9296875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06488791201263666, "epoch": 0.9716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 0.49346829936262865, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 294261521.0, "reward": 0.8128906488418579, "reward_std": 0.05527229979634285, "rewards/qatch_small_update_with_fm/mean": 0.8128906488418579, "rewards/qatch_small_update_with_fm/std": 0.3675207793712616, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900767207145691, "sampling/importance_sampling_ratio/min": 0.002415587892755866, "sampling/sampling_logp_difference/max": 6.02581262588501, "sampling/sampling_logp_difference/mean": 0.07517706602811813, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3724.0, "completions/max_terminated_length": 3724.0, "completions/mean_length": 569.7109375, "completions/mean_terminated_length": 569.7109375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.0651154387742281, "epoch": 0.9734513274336283, "frac_reward_zero_std": 0.75, "grad_norm": 0.5618206950409255, "learning_rate": 1e-06, "loss": -0.0255, "num_tokens": 294822679.0, "reward": 0.7361406087875366, "reward_std": 0.06598907709121704, "rewards/qatch_small_update_with_fm/mean": 0.7361406087875366, "rewards/qatch_small_update_with_fm/std": 0.3569088876247406, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9903668165206909, "sampling/importance_sampling_ratio/min": 0.001626877929084003, "sampling/sampling_logp_difference/max": 6.421092510223389, "sampling/sampling_logp_difference/mean": 0.07287141680717468, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 693.64453125, "completions/mean_terminated_length": 693.64453125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.07973913569003344, "epoch": 0.9752212389380531, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4333880222785954, "learning_rate": 1e-06, "loss": 0.0287, "num_tokens": 295314780.0, "reward": 0.6914101839065552, "reward_std": 0.0544731467962265, "rewards/qatch_small_update_with_fm/mean": 0.6914101839065552, "rewards/qatch_small_update_with_fm/std": 0.35723307728767395, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877251982688904, "sampling/importance_sampling_ratio/min": 0.005544170271605253, "sampling/sampling_logp_difference/max": 5.195008277893066, "sampling/sampling_logp_difference/mean": 0.08873097598552704, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 467.06640625, "completions/mean_terminated_length": 467.06640625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.05367149272933602, "epoch": 0.9769911504424779, "frac_reward_zero_std": 0.6875, "grad_norm": 0.9635481879647013, "learning_rate": 1e-06, "loss": 0.0675, "num_tokens": 295878157.0, "reward": 0.7614219188690186, "reward_std": 0.07371485978364944, "rewards/qatch_small_update_with_fm/mean": 0.7614219188690186, "rewards/qatch_small_update_with_fm/std": 0.3245559334754944, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889311790466309, "sampling/importance_sampling_ratio/min": 0.0019758539274334908, "sampling/sampling_logp_difference/max": 6.226754665374756, "sampling/sampling_logp_difference/mean": 0.06883668154478073, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3886.0, "completions/mean_length": 799.53515625, "completions/mean_terminated_length": 773.5787353515625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.09158620424568653, "epoch": 0.9787610619469026, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5187698393124252, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 296637702.0, "reward": 0.7982265949249268, "reward_std": 0.05513294041156769, "rewards/qatch_small_update_with_fm/mean": 0.7982265949249268, "rewards/qatch_small_update_with_fm/std": 0.33354005217552185, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.991692304611206, "sampling/importance_sampling_ratio/min": 0.0009394215303473175, "sampling/sampling_logp_difference/max": 6.970246315002441, "sampling/sampling_logp_difference/mean": 0.09343910217285156, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 656.17578125, "completions/mean_terminated_length": 656.17578125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.07529762573540211, "epoch": 0.9805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 0.46503478854107794, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 297062307.0, "reward": 0.8423788547515869, "reward_std": 0.04574102163314819, "rewards/qatch_small_update_with_fm/mean": 0.8423788547515869, "rewards/qatch_small_update_with_fm/std": 0.272524356842041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9878612160682678, "sampling/importance_sampling_ratio/min": 0.005275082774460316, "sampling/sampling_logp_difference/max": 5.244760990142822, "sampling/sampling_logp_difference/mean": 0.08667662739753723, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3507.0, "completions/max_terminated_length": 3507.0, "completions/mean_length": 720.953125, "completions/mean_terminated_length": 720.953125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.0724447788670659, "epoch": 0.9823008849557522, "frac_reward_zero_std": 0.6875, "grad_norm": 0.47734244313632035, "learning_rate": 1e-06, "loss": 0.0308, "num_tokens": 297617959.0, "reward": 0.611578106880188, "reward_std": 0.08688230812549591, "rewards/qatch_small_update_with_fm/mean": 0.611578106880188, "rewards/qatch_small_update_with_fm/std": 0.3563700318336487, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.988653302192688, "sampling/importance_sampling_ratio/min": 0.00012823805445805192, "sampling/sampling_logp_difference/max": 8.96162223815918, "sampling/sampling_logp_difference/mean": 0.08443759381771088, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 762.35546875, "completions/mean_terminated_length": 722.8261108398438, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.07419738546013832, "epoch": 0.984070796460177, "frac_reward_zero_std": 0.625, "grad_norm": 0.5607787784340242, "learning_rate": 1e-06, "loss": -0.0576, "num_tokens": 298193794.0, "reward": 0.729687511920929, "reward_std": 0.14253780245780945, "rewards/qatch_small_update_with_fm/mean": 0.729687511920929, "rewards/qatch_small_update_with_fm/std": 0.3643515110015869, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896014928817749, "sampling/importance_sampling_ratio/min": 0.0019406072096899152, "sampling/sampling_logp_difference/max": 6.244754314422607, "sampling/sampling_logp_difference/mean": 0.08009640872478485, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 517.08984375, "completions/mean_terminated_length": 517.08984375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06529176607728004, "epoch": 0.9858407079646018, "frac_reward_zero_std": 0.875, "grad_norm": 0.5883079496784032, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 298870505.0, "reward": 0.7845039367675781, "reward_std": 0.037567272782325745, "rewards/qatch_small_update_with_fm/mean": 0.7845039367675781, "rewards/qatch_small_update_with_fm/std": 0.36427584290504456, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9908521771430969, "sampling/importance_sampling_ratio/min": 0.0007565406849607825, "sampling/sampling_logp_difference/max": 7.18675422668457, "sampling/sampling_logp_difference/mean": 0.07576609402894974, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3902.0, "completions/mean_length": 881.76171875, "completions/mean_terminated_length": 804.6200561523438, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.0811238419264555, "epoch": 0.9876106194690265, "frac_reward_zero_std": 0.5, "grad_norm": 0.5896053283682656, "learning_rate": 1e-06, "loss": -0.1348, "num_tokens": 299473212.0, "reward": 0.741335928440094, "reward_std": 0.16152629256248474, "rewards/qatch_small_update_with_fm/mean": 0.741335928440094, "rewards/qatch_small_update_with_fm/std": 0.37875017523765564, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9893544912338257, "sampling/importance_sampling_ratio/min": 0.0012162798084318638, "sampling/sampling_logp_difference/max": 6.711958408355713, "sampling/sampling_logp_difference/mean": 0.08723506331443787, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2388.0, "completions/max_terminated_length": 2388.0, "completions/mean_length": 556.34375, "completions/mean_terminated_length": 556.34375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07279634848237038, "epoch": 0.9893805309734514, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6346360887575442, "learning_rate": 1e-06, "loss": 0.0392, "num_tokens": 299892036.0, "reward": 0.8744804859161377, "reward_std": 0.07507223635911942, "rewards/qatch_small_update_with_fm/mean": 0.8744804859161377, "rewards/qatch_small_update_with_fm/std": 0.2712794840335846, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9875009059906006, "sampling/importance_sampling_ratio/min": 0.0007305411854758859, "sampling/sampling_logp_difference/max": 7.221724987030029, "sampling/sampling_logp_difference/mean": 0.08305776119232178, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3367.0, "completions/max_terminated_length": 3367.0, "completions/mean_length": 1021.33984375, "completions/mean_terminated_length": 1021.33984375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.09527640696614981, "epoch": 0.9911504424778761, "frac_reward_zero_std": 0.375, "grad_norm": 0.768685757865519, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 300635739.0, "reward": 0.6343789100646973, "reward_std": 0.13254612684249878, "rewards/qatch_small_update_with_fm/mean": 0.6343789100646973, "rewards/qatch_small_update_with_fm/std": 0.3772452473640442, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9884824156761169, "sampling/importance_sampling_ratio/min": 1.0130537702934816e-05, "sampling/sampling_logp_difference/max": 11.499956130981445, "sampling/sampling_logp_difference/mean": 0.09710503369569778, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3505.0, "completions/max_terminated_length": 3505.0, "completions/mean_length": 566.98046875, "completions/mean_terminated_length": 566.98046875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.09003073815256357, "epoch": 0.9929203539823008, "frac_reward_zero_std": 0.75, "grad_norm": 0.665274763420313, "learning_rate": 1e-06, "loss": -0.047, "num_tokens": 300981334.0, "reward": 0.8392460942268372, "reward_std": 0.05475308746099472, "rewards/qatch_small_update_with_fm/mean": 0.8392460942268372, "rewards/qatch_small_update_with_fm/std": 0.3062317967414856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896265268325806, "sampling/importance_sampling_ratio/min": 4.6086139349199584e-08, "sampling/sampling_logp_difference/max": 16.89275360107422, "sampling/sampling_logp_difference/mean": 0.09289674460887909, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3772.0, "completions/max_terminated_length": 3772.0, "completions/mean_length": 804.1796875, "completions/mean_terminated_length": 804.1796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.10023855697363615, "epoch": 0.9946902654867257, "frac_reward_zero_std": 0.5, "grad_norm": 0.5995308303865646, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 301536276.0, "reward": 0.7380585670471191, "reward_std": 0.0959700495004654, "rewards/qatch_small_update_with_fm/mean": 0.7380585670471191, "rewards/qatch_small_update_with_fm/std": 0.34974250197410583, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9933926463127136, "sampling/importance_sampling_ratio/min": 0.00023100488760974258, "sampling/sampling_logp_difference/max": 8.373071670532227, "sampling/sampling_logp_difference/mean": 0.0933670848608017, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 606.3984375, "completions/mean_terminated_length": 606.3984375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07056841254234314, "epoch": 0.9964601769911504, "frac_reward_zero_std": 0.625, "grad_norm": 0.6327470025212618, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 301984378.0, "reward": 0.7754648327827454, "reward_std": 0.12288480997085571, "rewards/qatch_small_update_with_fm/mean": 0.7754648327827454, "rewards/qatch_small_update_with_fm/std": 0.3759419918060303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867947101593018, "sampling/importance_sampling_ratio/min": 8.739297072679619e-07, "sampling/sampling_logp_difference/max": 13.950265884399414, "sampling/sampling_logp_difference/mean": 0.08524052798748016, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3921.0, "completions/mean_length": 824.3203125, "completions/mean_terminated_length": 772.388916015625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.08141087833791971, "epoch": 0.9982300884955753, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4522442359272215, "learning_rate": 1e-06, "loss": -0.0234, "num_tokens": 302708316.0, "reward": 0.6576171517372131, "reward_std": 0.08948611468076706, "rewards/qatch_small_update_with_fm/mean": 0.6576172113418579, "rewards/qatch_small_update_with_fm/std": 0.413068950176239, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.990230917930603, "sampling/importance_sampling_ratio/min": 3.224205258334223e-08, "sampling/sampling_logp_difference/max": 17.2499942779541, "sampling/sampling_logp_difference/mean": 0.085847407579422, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3438.0, "completions/max_terminated_length": 3438.0, "completions/mean_length": 632.109375, "completions/mean_terminated_length": 632.109375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.07970152795314789, "epoch": 1.0, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4195401770571529, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 303171576.0, "reward": 0.8823828101158142, "reward_std": 0.05790891870856285, "rewards/qatch_small_update_with_fm/mean": 0.8823828101158142, "rewards/qatch_small_update_with_fm/std": 0.2736479640007019, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889679551124573, "sampling/importance_sampling_ratio/min": 0.00027138934819959104, "sampling/sampling_logp_difference/max": 8.211956024169922, "sampling/sampling_logp_difference/mean": 0.08676078915596008, "step": 565 }, { "epoch": 1.0, "step": 565, "total_flos": 0.0, "train_loss": -0.0013111333199982754, "train_runtime": 50378.8497, "train_samples_per_second": 0.18, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 565, "num_input_tokens_seen": 303171576, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }