diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17558 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 565, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 218.2890625, + "completions/mean_terminated_length": 218.2890625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.055932467337697744, + "epoch": 0.0017699115044247787, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4016180957891362, + "learning_rate": 0.0, + "loss": 0.0025, + "num_tokens": 458314.0, + "reward": 0.5624843835830688, + "reward_std": 0.10818969458341599, + "rewards/qatch_small_update_with_fm/mean": 0.5624843835830688, + "rewards/qatch_small_update_with_fm/std": 0.3901517689228058, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9868041276931763, + "sampling/importance_sampling_ratio/min": 0.00016218787641264498, + "sampling/sampling_logp_difference/max": 8.726755142211914, + "sampling/sampling_logp_difference/mean": 0.07559221237897873, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 191.8125, + "completions/mean_terminated_length": 191.8125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.0502051068469882, + "epoch": 0.0035398230088495575, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8868732918382652, + "learning_rate": 1.7543859649122805e-08, + "loss": 0.0027, + "num_tokens": 970794.0, + "reward": 0.7046874761581421, + "reward_std": 0.04274485632777214, + "rewards/qatch_small_update_with_fm/mean": 0.7046874761581421, + "rewards/qatch_small_update_with_fm/std": 0.3915046751499176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925431609153748, + "sampling/importance_sampling_ratio/min": 0.0015312157338485122, + "sampling/sampling_logp_difference/max": 6.481693267822266, + "sampling/sampling_logp_difference/mean": 0.06322439014911652, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 217.28125, + "completions/mean_terminated_length": 217.28125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.053691909182816744, + "epoch": 0.005309734513274336, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.6731495484912082, + "learning_rate": 3.508771929824561e-08, + "loss": -0.0221, + "num_tokens": 1395426.0, + "reward": 0.6182304620742798, + "reward_std": 0.09917416423559189, + "rewards/qatch_small_update_with_fm/mean": 0.6182304620742798, + "rewards/qatch_small_update_with_fm/std": 0.4379144310951233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990143895149231, + "sampling/importance_sampling_ratio/min": 0.00016410346142947674, + "sampling/sampling_logp_difference/max": 8.71501350402832, + "sampling/sampling_logp_difference/mean": 0.06847122311592102, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 220.12109375, + "completions/mean_terminated_length": 220.12109375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.060406394302845, + "epoch": 0.007079646017699115, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.4683462720727918, + "learning_rate": 5.2631578947368416e-08, + "loss": 0.0207, + "num_tokens": 2003425.0, + "reward": 0.5585194826126099, + "reward_std": 0.053804244846105576, + "rewards/qatch_small_update_with_fm/mean": 0.5585194826126099, + "rewards/qatch_small_update_with_fm/std": 0.39273256063461304, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9875953197479248, + "sampling/importance_sampling_ratio/min": 0.004114419687539339, + "sampling/sampling_logp_difference/max": 5.493257522583008, + "sampling/sampling_logp_difference/mean": 0.0788843035697937, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 207.2421875, + "completions/mean_terminated_length": 207.2421875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.06038828194141388, + "epoch": 0.008849557522123894, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.815000760491036, + "learning_rate": 7.017543859649122e-08, + "loss": -0.0129, + "num_tokens": 2555599.0, + "reward": 0.7544023394584656, + "reward_std": 0.10639214515686035, + "rewards/qatch_small_update_with_fm/mean": 0.7544023394584656, + "rewards/qatch_small_update_with_fm/std": 0.3464336395263672, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911973476409912, + "sampling/importance_sampling_ratio/min": 7.211915090010734e-06, + "sampling/sampling_logp_difference/max": 11.839776039123535, + "sampling/sampling_logp_difference/mean": 0.07425904273986816, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 198.359375, + "completions/mean_terminated_length": 198.359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.05508757056668401, + "epoch": 0.010619469026548672, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.3618814755274247, + "learning_rate": 8.771929824561403e-08, + "loss": 0.0008, + "num_tokens": 3007131.0, + "reward": 0.6679218411445618, + "reward_std": 0.0737442895770073, + "rewards/qatch_small_update_with_fm/mean": 0.6679218411445618, + "rewards/qatch_small_update_with_fm/std": 0.3787311017513275, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9878405332565308, + "sampling/importance_sampling_ratio/min": 0.006647423375397921, + "sampling/sampling_logp_difference/max": 5.01352596282959, + "sampling/sampling_logp_difference/mean": 0.07456570863723755, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 215.375, + "completions/mean_terminated_length": 215.375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.0570991188287735, + "epoch": 0.012389380530973451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5094600148969153, + "learning_rate": 1.0526315789473683e-07, + "loss": 0.0034, + "num_tokens": 3563819.0, + "reward": 0.7576835751533508, + "reward_std": 0.12967899441719055, + "rewards/qatch_small_update_with_fm/mean": 0.7576836347579956, + "rewards/qatch_small_update_with_fm/std": 0.35834866762161255, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900656938552856, + "sampling/importance_sampling_ratio/min": 0.004104791674762964, + "sampling/sampling_logp_difference/max": 5.49560022354126, + "sampling/sampling_logp_difference/mean": 0.07453933358192444, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 193.29296875, + "completions/mean_terminated_length": 193.29296875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.048482123762369156, + "epoch": 0.01415929203539823, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.5818561179335906, + "learning_rate": 1.2280701754385964e-07, + "loss": -0.009, + "num_tokens": 3890038.0, + "reward": 0.6313203573226929, + "reward_std": 0.14092831313610077, + "rewards/qatch_small_update_with_fm/mean": 0.6313203573226929, + "rewards/qatch_small_update_with_fm/std": 0.39382705092430115, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898822903633118, + "sampling/importance_sampling_ratio/min": 0.008772030472755432, + "sampling/sampling_logp_difference/max": 4.736186981201172, + "sampling/sampling_logp_difference/mean": 0.061118610203266144, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 197.39453125, + "completions/mean_terminated_length": 197.39453125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.04963156068697572, + "epoch": 0.01592920353982301, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.5521690922872657, + "learning_rate": 1.4035087719298244e-07, + "loss": 0.0169, + "num_tokens": 4266731.0, + "reward": 0.770031213760376, + "reward_std": 0.10407255589962006, + "rewards/qatch_small_update_with_fm/mean": 0.770031213760376, + "rewards/qatch_small_update_with_fm/std": 0.3665200173854828, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9872828722000122, + "sampling/importance_sampling_ratio/min": 0.004101686645299196, + "sampling/sampling_logp_difference/max": 5.496356964111328, + "sampling/sampling_logp_difference/mean": 0.07008753716945648, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 201.2109375, + "completions/mean_terminated_length": 201.2109375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.051307815592736006, + "epoch": 0.017699115044247787, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6061306069741945, + "learning_rate": 1.5789473684210525e-07, + "loss": -0.0103, + "num_tokens": 4638561.0, + "reward": 0.682628870010376, + "reward_std": 0.09211964905261993, + "rewards/qatch_small_update_with_fm/mean": 0.6826289296150208, + "rewards/qatch_small_update_with_fm/std": 0.36384159326553345, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911330342292786, + "sampling/importance_sampling_ratio/min": 0.0031929106917232275, + "sampling/sampling_logp_difference/max": 5.746822357177734, + "sampling/sampling_logp_difference/mean": 0.06979094445705414, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 208.46484375, + "completions/mean_terminated_length": 208.46484375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.05418810294941068, + "epoch": 0.019469026548672566, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.2675987379108877, + "learning_rate": 1.7543859649122805e-07, + "loss": 0.0089, + "num_tokens": 5074904.0, + "reward": 0.6214921474456787, + "reward_std": 0.09628939628601074, + "rewards/qatch_small_update_with_fm/mean": 0.6214921474456787, + "rewards/qatch_small_update_with_fm/std": 0.36368274688720703, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906713962554932, + "sampling/importance_sampling_ratio/min": 0.004233733285218477, + "sampling/sampling_logp_difference/max": 5.4646711349487305, + "sampling/sampling_logp_difference/mean": 0.06758482754230499, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 205.19921875, + "completions/mean_terminated_length": 205.19921875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.05194803886115551, + "epoch": 0.021238938053097345, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1957367434837758, + "learning_rate": 1.9298245614035086e-07, + "loss": 0.0113, + "num_tokens": 5549051.0, + "reward": 0.6895078420639038, + "reward_std": 0.08644171059131622, + "rewards/qatch_small_update_with_fm/mean": 0.6895078420639038, + "rewards/qatch_small_update_with_fm/std": 0.36041754484176636, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990691065788269, + "sampling/importance_sampling_ratio/min": 0.008775557391345501, + "sampling/sampling_logp_difference/max": 4.735785007476807, + "sampling/sampling_logp_difference/mean": 0.06334304809570312, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 228.0078125, + "completions/mean_terminated_length": 228.0078125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.06115304958075285, + "epoch": 0.023008849557522124, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.8474801364773754, + "learning_rate": 2.1052631578947366e-07, + "loss": 0.0014, + "num_tokens": 5991261.0, + "reward": 0.5757031440734863, + "reward_std": 0.09528271108865738, + "rewards/qatch_small_update_with_fm/mean": 0.5757031440734863, + "rewards/qatch_small_update_with_fm/std": 0.35785242915153503, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899276494979858, + "sampling/importance_sampling_ratio/min": 0.0010690669296309352, + "sampling/sampling_logp_difference/max": 6.840969085693359, + "sampling/sampling_logp_difference/mean": 0.07720615714788437, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 231.671875, + "completions/mean_terminated_length": 231.671875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.059437486343085766, + "epoch": 0.024778761061946902, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.153686768788144, + "learning_rate": 2.2807017543859647e-07, + "loss": 0.0145, + "num_tokens": 6339033.0, + "reward": 0.5981484651565552, + "reward_std": 0.11867053806781769, + "rewards/qatch_small_update_with_fm/mean": 0.5981484055519104, + "rewards/qatch_small_update_with_fm/std": 0.40236470103263855, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874120950698853, + "sampling/importance_sampling_ratio/min": 0.0067546553909778595, + "sampling/sampling_logp_difference/max": 4.997523307800293, + "sampling/sampling_logp_difference/mean": 0.07262717187404633, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 184.72265625, + "completions/mean_terminated_length": 184.72265625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.05293445521965623, + "epoch": 0.02654867256637168, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7947302002840908, + "learning_rate": 2.456140350877193e-07, + "loss": 0.0262, + "num_tokens": 6822914.0, + "reward": 0.7774726152420044, + "reward_std": 0.1253722608089447, + "rewards/qatch_small_update_with_fm/mean": 0.7774726748466492, + "rewards/qatch_small_update_with_fm/std": 0.3277592360973358, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9928814172744751, + "sampling/importance_sampling_ratio/min": 0.0005699428147636354, + "sampling/sampling_logp_difference/max": 7.469974517822266, + "sampling/sampling_logp_difference/mean": 0.06256245821714401, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 180.90625, + "completions/mean_terminated_length": 180.90625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.047853834461420774, + "epoch": 0.02831858407079646, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2589380786159132, + "learning_rate": 2.631578947368421e-07, + "loss": -0.0078, + "num_tokens": 7144698.0, + "reward": 0.6949101686477661, + "reward_std": 0.0378408208489418, + "rewards/qatch_small_update_with_fm/mean": 0.6949101686477661, + "rewards/qatch_small_update_with_fm/std": 0.3930208086967468, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918819665908813, + "sampling/importance_sampling_ratio/min": 0.0031865073833614588, + "sampling/sampling_logp_difference/max": 5.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.05979344993829727, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 204.32421875, + "completions/mean_terminated_length": 204.32421875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.04873772710561752, + "epoch": 0.03008849557522124, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1936460246963316, + "learning_rate": 2.807017543859649e-07, + "loss": 0.0036, + "num_tokens": 7572637.0, + "reward": 0.8436406254768372, + "reward_std": 0.05956996604800224, + "rewards/qatch_small_update_with_fm/mean": 0.8436405658721924, + "rewards/qatch_small_update_with_fm/std": 0.26569095253944397, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906154274940491, + "sampling/importance_sampling_ratio/min": 0.007214278448373079, + "sampling/sampling_logp_difference/max": 4.931693077087402, + "sampling/sampling_logp_difference/mean": 0.06251002848148346, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 232.6796875, + "completions/mean_terminated_length": 232.6796875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.05695153260603547, + "epoch": 0.03185840707964602, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.2761295767158813, + "learning_rate": 2.982456140350877e-07, + "loss": -0.0071, + "num_tokens": 8194859.0, + "reward": 0.5700898170471191, + "reward_std": 0.10193304717540741, + "rewards/qatch_small_update_with_fm/mean": 0.5700898766517639, + "rewards/qatch_small_update_with_fm/std": 0.4049544334411621, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990411639213562, + "sampling/importance_sampling_ratio/min": 0.004173677880316973, + "sampling/sampling_logp_difference/max": 5.478957653045654, + "sampling/sampling_logp_difference/mean": 0.07318311184644699, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 199.42578125, + "completions/mean_terminated_length": 199.42578125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.04951566876843572, + "epoch": 0.033628318584070796, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6725073395426633, + "learning_rate": 3.157894736842105e-07, + "loss": -0.0055, + "num_tokens": 8922456.0, + "reward": 0.7591953277587891, + "reward_std": 0.0754041075706482, + "rewards/qatch_small_update_with_fm/mean": 0.7591953277587891, + "rewards/qatch_small_update_with_fm/std": 0.36782652139663696, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9886845350265503, + "sampling/importance_sampling_ratio/min": 0.008016136474907398, + "sampling/sampling_logp_difference/max": 4.826298713684082, + "sampling/sampling_logp_difference/mean": 0.06480049341917038, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 222.52734375, + "completions/mean_terminated_length": 222.52734375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.053389983251690865, + "epoch": 0.035398230088495575, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.5080008629099697, + "learning_rate": 3.333333333333333e-07, + "loss": -0.0087, + "num_tokens": 9381903.0, + "reward": 0.6970039010047913, + "reward_std": 0.11117272078990936, + "rewards/qatch_small_update_with_fm/mean": 0.6970039010047913, + "rewards/qatch_small_update_with_fm/std": 0.33124446868896484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989010751247406, + "sampling/importance_sampling_ratio/min": 3.631289473560173e-06, + "sampling/sampling_logp_difference/max": 12.525922775268555, + "sampling/sampling_logp_difference/mean": 0.06789924949407578, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 225.8046875, + "completions/mean_terminated_length": 225.8046875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.06629037857055664, + "epoch": 0.03716814159292035, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.073903450177758, + "learning_rate": 3.508771929824561e-07, + "loss": -0.0026, + "num_tokens": 9875389.0, + "reward": 0.5116796493530273, + "reward_std": 0.0645594596862793, + "rewards/qatch_small_update_with_fm/mean": 0.5116796493530273, + "rewards/qatch_small_update_with_fm/std": 0.40706780552864075, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882779121398926, + "sampling/importance_sampling_ratio/min": 0.0041917734779417515, + "sampling/sampling_logp_difference/max": 5.474631309509277, + "sampling/sampling_logp_difference/mean": 0.08192314207553864, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 215.5390625, + "completions/mean_terminated_length": 215.5390625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.05642617912963033, + "epoch": 0.03893805309734513, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7536814431934094, + "learning_rate": 3.684210526315789e-07, + "loss": -0.0187, + "num_tokens": 10343815.0, + "reward": 0.6426757574081421, + "reward_std": 0.16063649952411652, + "rewards/qatch_small_update_with_fm/mean": 0.6426757574081421, + "rewards/qatch_small_update_with_fm/std": 0.3499335050582886, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988250732421875, + "sampling/importance_sampling_ratio/min": 0.0041191368363797665, + "sampling/sampling_logp_difference/max": 5.492111682891846, + "sampling/sampling_logp_difference/mean": 0.07272202521562576, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 205.4765625, + "completions/mean_terminated_length": 205.4765625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.050218963995575905, + "epoch": 0.04070796460176991, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1734990449788383, + "learning_rate": 3.859649122807017e-07, + "loss": 0.0112, + "num_tokens": 10905281.0, + "reward": 0.5514531135559082, + "reward_std": 0.07454925775527954, + "rewards/qatch_small_update_with_fm/mean": 0.5514531135559082, + "rewards/qatch_small_update_with_fm/std": 0.40859004855155945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992061972618103, + "sampling/importance_sampling_ratio/min": 0.0002468197199050337, + "sampling/sampling_logp_difference/max": 8.306852340698242, + "sampling/sampling_logp_difference/mean": 0.06509463489055634, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 186.2734375, + "completions/mean_terminated_length": 186.2734375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.051339670550078154, + "epoch": 0.04247787610619469, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.472579374696243, + "learning_rate": 4.035087719298245e-07, + "loss": 0.0049, + "num_tokens": 11301367.0, + "reward": 0.5480742454528809, + "reward_std": 0.09668620675802231, + "rewards/qatch_small_update_with_fm/mean": 0.5480742454528809, + "rewards/qatch_small_update_with_fm/std": 0.4017762839794159, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989937424659729, + "sampling/importance_sampling_ratio/min": 0.0032162086572498083, + "sampling/sampling_logp_difference/max": 5.739552021026611, + "sampling/sampling_logp_difference/mean": 0.06827802211046219, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 221.875, + "completions/mean_terminated_length": 221.875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.05917324684560299, + "epoch": 0.04424778761061947, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9749562804449026, + "learning_rate": 4.2105263157894733e-07, + "loss": 0.0068, + "num_tokens": 11719255.0, + "reward": 0.5420546531677246, + "reward_std": 0.08317986130714417, + "rewards/qatch_small_update_with_fm/mean": 0.5420546531677246, + "rewards/qatch_small_update_with_fm/std": 0.40023666620254517, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898385405540466, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.07788807153701782, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 239.76171875, + "completions/mean_terminated_length": 239.76171875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.058990718331187963, + "epoch": 0.04601769911504425, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.3396032197988952, + "learning_rate": 4.3859649122807013e-07, + "loss": 0.0042, + "num_tokens": 12233770.0, + "reward": 0.7668046951293945, + "reward_std": 0.0936145931482315, + "rewards/qatch_small_update_with_fm/mean": 0.7668046951293945, + "rewards/qatch_small_update_with_fm/std": 0.3399920165538788, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896319508552551, + "sampling/importance_sampling_ratio/min": 0.0025338244158774614, + "sampling/sampling_logp_difference/max": 5.978025436401367, + "sampling/sampling_logp_difference/mean": 0.07602351158857346, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 212.66015625, + "completions/mean_terminated_length": 212.66015625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.05781093053519726, + "epoch": 0.047787610619469026, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.1795909655364405, + "learning_rate": 4.5614035087719294e-07, + "loss": -0.0148, + "num_tokens": 12676883.0, + "reward": 0.7312656044960022, + "reward_std": 0.06833778321743011, + "rewards/qatch_small_update_with_fm/mean": 0.731265664100647, + "rewards/qatch_small_update_with_fm/std": 0.3687576353549957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896270036697388, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.07338601350784302, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 196.3671875, + "completions/mean_terminated_length": 196.3671875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.0522055234760046, + "epoch": 0.049557522123893805, + "frac_reward_zero_std": 0.6875, + "grad_norm": 2.061190562198567, + "learning_rate": 4.7368421052631574e-07, + "loss": -0.0102, + "num_tokens": 13023505.0, + "reward": 0.6650546789169312, + "reward_std": 0.035377923399209976, + "rewards/qatch_small_update_with_fm/mean": 0.6650546789169312, + "rewards/qatch_small_update_with_fm/std": 0.375143438577652, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9883463382720947, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.07111047208309174, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 227.6875, + "completions/mean_terminated_length": 227.6875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.05623852554708719, + "epoch": 0.05132743362831858, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.6294023746334783, + "learning_rate": 4.912280701754385e-07, + "loss": -0.0018, + "num_tokens": 13443697.0, + "reward": 0.6568242311477661, + "reward_std": 0.11119887232780457, + "rewards/qatch_small_update_with_fm/mean": 0.6568242311477661, + "rewards/qatch_small_update_with_fm/std": 0.40569937229156494, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989427924156189, + "sampling/importance_sampling_ratio/min": 0.0035574494395405054, + "sampling/sampling_logp_difference/max": 5.638711452484131, + "sampling/sampling_logp_difference/mean": 0.07333913445472717, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 219.7109375, + "completions/mean_terminated_length": 219.7109375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.05887019541114569, + "epoch": 0.05309734513274336, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.4462954987854064, + "learning_rate": 5.087719298245614e-07, + "loss": 0.0056, + "num_tokens": 13799319.0, + "reward": 0.5096250176429749, + "reward_std": 0.08224877715110779, + "rewards/qatch_small_update_with_fm/mean": 0.5096250176429749, + "rewards/qatch_small_update_with_fm/std": 0.4135892689228058, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896410703659058, + "sampling/importance_sampling_ratio/min": 0.00046472682151943445, + "sampling/sampling_logp_difference/max": 7.674060821533203, + "sampling/sampling_logp_difference/mean": 0.07146239280700684, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 225.9296875, + "completions/mean_terminated_length": 225.9296875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.0572858308441937, + "epoch": 0.05486725663716814, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.4963962240758701, + "learning_rate": 5.263157894736842e-07, + "loss": 0.0013, + "num_tokens": 14207637.0, + "reward": 0.47812891006469727, + "reward_std": 0.10722576826810837, + "rewards/qatch_small_update_with_fm/mean": 0.47812891006469727, + "rewards/qatch_small_update_with_fm/std": 0.41087815165519714, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9885352253913879, + "sampling/importance_sampling_ratio/min": 0.002572331577539444, + "sampling/sampling_logp_difference/max": 5.962942600250244, + "sampling/sampling_logp_difference/mean": 0.07356339693069458, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 215.8203125, + "completions/mean_terminated_length": 215.8203125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.05347989918664098, + "epoch": 0.05663716814159292, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.460487515109523, + "learning_rate": 5.43859649122807e-07, + "loss": 0.0011, + "num_tokens": 14733943.0, + "reward": 0.6709609031677246, + "reward_std": 0.0812995433807373, + "rewards/qatch_small_update_with_fm/mean": 0.6709609031677246, + "rewards/qatch_small_update_with_fm/std": 0.35092541575431824, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877209663391113, + "sampling/importance_sampling_ratio/min": 0.006773304659873247, + "sampling/sampling_logp_difference/max": 4.9947662353515625, + "sampling/sampling_logp_difference/mean": 0.07288801670074463, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 216.15234375, + "completions/mean_terminated_length": 216.15234375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.06382587924599648, + "epoch": 0.0584070796460177, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.6806617925432, + "learning_rate": 5.614035087719298e-07, + "loss": -0.0059, + "num_tokens": 15201870.0, + "reward": 0.6259765625, + "reward_std": 0.1354515552520752, + "rewards/qatch_small_update_with_fm/mean": 0.6259765625, + "rewards/qatch_small_update_with_fm/std": 0.4338729977607727, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890233278274536, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.07785800844430923, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 216.140625, + "completions/mean_terminated_length": 216.140625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.06205499917268753, + "epoch": 0.06017699115044248, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1188125302112715, + "learning_rate": 5.789473684210526e-07, + "loss": -0.0006, + "num_tokens": 15670674.0, + "reward": 0.7029336094856262, + "reward_std": 0.09102556109428406, + "rewards/qatch_small_update_with_fm/mean": 0.7029336094856262, + "rewards/qatch_small_update_with_fm/std": 0.36313775181770325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894707202911377, + "sampling/importance_sampling_ratio/min": 0.00528290867805481, + "sampling/sampling_logp_difference/max": 5.243278503417969, + "sampling/sampling_logp_difference/mean": 0.07377147674560547, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 215.26953125, + "completions/mean_terminated_length": 215.26953125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.055985859129577875, + "epoch": 0.061946902654867256, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9421523296988241, + "learning_rate": 5.964912280701754e-07, + "loss": -0.0083, + "num_tokens": 16030199.0, + "reward": 0.662277340888977, + "reward_std": 0.05062306672334671, + "rewards/qatch_small_update_with_fm/mean": 0.662277340888977, + "rewards/qatch_small_update_with_fm/std": 0.4001128077507019, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9888499975204468, + "sampling/importance_sampling_ratio/min": 0.011136544868350029, + "sampling/sampling_logp_difference/max": 4.497523307800293, + "sampling/sampling_logp_difference/mean": 0.07431173324584961, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 219.87109375, + "completions/mean_terminated_length": 219.87109375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.055227352771908045, + "epoch": 0.06371681415929203, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.135079001459155, + "learning_rate": 6.140350877192982e-07, + "loss": -0.0032, + "num_tokens": 16377286.0, + "reward": 0.6459922194480896, + "reward_std": 0.08503326773643494, + "rewards/qatch_small_update_with_fm/mean": 0.6459922194480896, + "rewards/qatch_small_update_with_fm/std": 0.353148490190506, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892193675041199, + "sampling/importance_sampling_ratio/min": 9.36559445108287e-05, + "sampling/sampling_logp_difference/max": 9.275882720947266, + "sampling/sampling_logp_difference/mean": 0.0699758380651474, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 204.6953125, + "completions/mean_terminated_length": 204.6953125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.05541439028456807, + "epoch": 0.06548672566371681, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9683166323193955, + "learning_rate": 6.31578947368421e-07, + "loss": 0.0055, + "num_tokens": 16893240.0, + "reward": 0.6555585861206055, + "reward_std": 0.034975819289684296, + "rewards/qatch_small_update_with_fm/mean": 0.6555585861206055, + "rewards/qatch_small_update_with_fm/std": 0.3547699749469757, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98728346824646, + "sampling/importance_sampling_ratio/min": 0.001210095826536417, + "sampling/sampling_logp_difference/max": 6.717055797576904, + "sampling/sampling_logp_difference/mean": 0.07098366320133209, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 225.51171875, + "completions/mean_terminated_length": 225.51171875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.06327511603012681, + "epoch": 0.06725663716814159, + "frac_reward_zero_std": 0.3125, + "grad_norm": 2.1783519764095556, + "learning_rate": 6.491228070175438e-07, + "loss": -0.0082, + "num_tokens": 17361579.0, + "reward": 0.5263632535934448, + "reward_std": 0.12323468178510666, + "rewards/qatch_small_update_with_fm/mean": 0.5263632535934448, + "rewards/qatch_small_update_with_fm/std": 0.38143190741539, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990209698677063, + "sampling/importance_sampling_ratio/min": 0.004103474784642458, + "sampling/sampling_logp_difference/max": 5.4959211349487305, + "sampling/sampling_logp_difference/mean": 0.07122161984443665, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 214.296875, + "completions/mean_terminated_length": 214.296875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.05618205713108182, + "epoch": 0.06902654867256637, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3419509213878904, + "learning_rate": 6.666666666666666e-07, + "loss": -0.006, + "num_tokens": 17898951.0, + "reward": 0.6506953239440918, + "reward_std": 0.08842222392559052, + "rewards/qatch_small_update_with_fm/mean": 0.6506953239440918, + "rewards/qatch_small_update_with_fm/std": 0.396701455116272, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990181028842926, + "sampling/importance_sampling_ratio/min": 0.005266575142741203, + "sampling/sampling_logp_difference/max": 5.24637508392334, + "sampling/sampling_logp_difference/mean": 0.06922811269760132, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 226.23046875, + "completions/mean_terminated_length": 226.23046875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.05959279276430607, + "epoch": 0.07079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9617272834762555, + "learning_rate": 6.842105263157895e-07, + "loss": 0.0026, + "num_tokens": 18242850.0, + "reward": 0.7116601467132568, + "reward_std": 0.06126481294631958, + "rewards/qatch_small_update_with_fm/mean": 0.7116601467132568, + "rewards/qatch_small_update_with_fm/std": 0.39202579855918884, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9881985187530518, + "sampling/importance_sampling_ratio/min": 0.004140384495258331, + "sampling/sampling_logp_difference/max": 5.486966609954834, + "sampling/sampling_logp_difference/mean": 0.07950674742460251, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 211.18359375, + "completions/mean_terminated_length": 211.18359375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.06127061927691102, + "epoch": 0.07256637168141593, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.3274345780809282, + "learning_rate": 7.017543859649122e-07, + "loss": 0.0226, + "num_tokens": 18923697.0, + "reward": 0.6969023942947388, + "reward_std": 0.1744055151939392, + "rewards/qatch_small_update_with_fm/mean": 0.6969023942947388, + "rewards/qatch_small_update_with_fm/std": 0.3728826940059662, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9888622164726257, + "sampling/importance_sampling_ratio/min": 0.00045951042557135224, + "sampling/sampling_logp_difference/max": 7.685348987579346, + "sampling/sampling_logp_difference/mean": 0.0754728838801384, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 210.484375, + "completions/mean_terminated_length": 210.484375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.05614142771810293, + "epoch": 0.0743362831858407, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2632017942317626, + "learning_rate": 7.192982456140351e-07, + "loss": -0.0006, + "num_tokens": 19296861.0, + "reward": 0.6874765157699585, + "reward_std": 0.06331270933151245, + "rewards/qatch_small_update_with_fm/mean": 0.6874765157699585, + "rewards/qatch_small_update_with_fm/std": 0.3579365611076355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9885727167129517, + "sampling/importance_sampling_ratio/min": 0.005275054834783077, + "sampling/sampling_logp_difference/max": 5.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.07586228102445602, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 227.88671875, + "completions/mean_terminated_length": 227.88671875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.057134306989610195, + "epoch": 0.07610619469026549, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.3882623306559656, + "learning_rate": 7.368421052631578e-07, + "loss": -0.0083, + "num_tokens": 19771536.0, + "reward": 0.6067304611206055, + "reward_std": 0.13148462772369385, + "rewards/qatch_small_update_with_fm/mean": 0.6067304611206055, + "rewards/qatch_small_update_with_fm/std": 0.35363802313804626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.987711489200592, + "sampling/importance_sampling_ratio/min": 0.0002641607716213912, + "sampling/sampling_logp_difference/max": 8.23895263671875, + "sampling/sampling_logp_difference/mean": 0.07562234252691269, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 200.41015625, + "completions/mean_terminated_length": 200.41015625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.05608399631455541, + "epoch": 0.07787610619469026, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.2534604392810913, + "learning_rate": 7.543859649122807e-07, + "loss": 0.0167, + "num_tokens": 20281977.0, + "reward": 0.7539414167404175, + "reward_std": 0.09053537249565125, + "rewards/qatch_small_update_with_fm/mean": 0.7539414167404175, + "rewards/qatch_small_update_with_fm/std": 0.37756431102752686, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906572103500366, + "sampling/importance_sampling_ratio/min": 0.005265289451926947, + "sampling/sampling_logp_difference/max": 5.24661922454834, + "sampling/sampling_logp_difference/mean": 0.07270737737417221, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 267.09375, + "completions/mean_terminated_length": 267.09375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.07773672044277191, + "epoch": 0.07964601769911504, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.3404809735788332, + "learning_rate": 7.719298245614034e-07, + "loss": 0.007, + "num_tokens": 20790817.0, + "reward": 0.5390859842300415, + "reward_std": 0.09994053095579147, + "rewards/qatch_small_update_with_fm/mean": 0.5390859246253967, + "rewards/qatch_small_update_with_fm/std": 0.39160552620887756, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9873398542404175, + "sampling/importance_sampling_ratio/min": 0.0007115455227904022, + "sampling/sampling_logp_difference/max": 7.248071193695068, + "sampling/sampling_logp_difference/mean": 0.09083493053913116, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 221.546875, + "completions/mean_terminated_length": 221.546875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.0625874325633049, + "epoch": 0.08141592920353982, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.5966431213050403, + "learning_rate": 7.894736842105263e-07, + "loss": -0.0201, + "num_tokens": 21361965.0, + "reward": 0.6787070035934448, + "reward_std": 0.13535821437835693, + "rewards/qatch_small_update_with_fm/mean": 0.6787070035934448, + "rewards/qatch_small_update_with_fm/std": 0.3745642304420471, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909084439277649, + "sampling/importance_sampling_ratio/min": 0.004959781188517809, + "sampling/sampling_logp_difference/max": 5.306393623352051, + "sampling/sampling_logp_difference/mean": 0.07631758600473404, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 208.375, + "completions/mean_terminated_length": 208.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.055308199021965265, + "epoch": 0.0831858407079646, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.4518930219563386, + "learning_rate": 8.07017543859649e-07, + "loss": -0.0101, + "num_tokens": 21874157.0, + "reward": 0.6671679615974426, + "reward_std": 0.10809326171875, + "rewards/qatch_small_update_with_fm/mean": 0.6671679615974426, + "rewards/qatch_small_update_with_fm/std": 0.39586684107780457, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907793998718262, + "sampling/importance_sampling_ratio/min": 0.00869713630527258, + "sampling/sampling_logp_difference/max": 4.7447614669799805, + "sampling/sampling_logp_difference/mean": 0.06782685220241547, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 233.37109375, + "completions/mean_terminated_length": 233.37109375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.06507633905857801, + "epoch": 0.08495575221238938, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6383109347734452, + "learning_rate": 8.245614035087719e-07, + "loss": -0.0068, + "num_tokens": 22366332.0, + "reward": 0.6874726414680481, + "reward_std": 0.14611409604549408, + "rewards/qatch_small_update_with_fm/mean": 0.6874727010726929, + "rewards/qatch_small_update_with_fm/std": 0.3691793978214264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.987760603427887, + "sampling/importance_sampling_ratio/min": 0.0025241519324481487, + "sampling/sampling_logp_difference/max": 5.9818501472473145, + "sampling/sampling_logp_difference/mean": 0.07947517931461334, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 220.91015625, + "completions/mean_terminated_length": 220.91015625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.05196545785292983, + "epoch": 0.08672566371681416, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3498543785058719, + "learning_rate": 8.421052631578947e-07, + "loss": -0.0054, + "num_tokens": 22712309.0, + "reward": 0.6685937643051147, + "reward_std": 0.10685192048549652, + "rewards/qatch_small_update_with_fm/mean": 0.6685937643051147, + "rewards/qatch_small_update_with_fm/std": 0.3972224295139313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896861910820007, + "sampling/importance_sampling_ratio/min": 0.0031929106917232275, + "sampling/sampling_logp_difference/max": 5.746822357177734, + "sampling/sampling_logp_difference/mean": 0.06827843189239502, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 234.8046875, + "completions/mean_terminated_length": 234.8046875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.06737958360463381, + "epoch": 0.08849557522123894, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.7923238515597124, + "learning_rate": 8.596491228070175e-07, + "loss": -0.0134, + "num_tokens": 23186147.0, + "reward": 0.5033125281333923, + "reward_std": 0.12631133198738098, + "rewards/qatch_small_update_with_fm/mean": 0.5033124685287476, + "rewards/qatch_small_update_with_fm/std": 0.4238218069076538, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9885417222976685, + "sampling/importance_sampling_ratio/min": 0.00029693765100091696, + "sampling/sampling_logp_difference/max": 8.121988296508789, + "sampling/sampling_logp_difference/mean": 0.07968786358833313, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 197.2734375, + "completions/mean_terminated_length": 197.2734375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.04840736137703061, + "epoch": 0.09026548672566372, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.0586044702380508, + "learning_rate": 8.771929824561403e-07, + "loss": -0.0009, + "num_tokens": 23535817.0, + "reward": 0.8437227010726929, + "reward_std": 0.07556959986686707, + "rewards/qatch_small_update_with_fm/mean": 0.8437227010726929, + "rewards/qatch_small_update_with_fm/std": 0.296781986951828, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988394021987915, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.06788826733827591, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 231.86328125, + "completions/mean_terminated_length": 231.86328125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.06255824631080031, + "epoch": 0.0920353982300885, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.6045971303696716, + "learning_rate": 8.947368421052631e-07, + "loss": 0.0134, + "num_tokens": 24069702.0, + "reward": 0.8138632774353027, + "reward_std": 0.1399797797203064, + "rewards/qatch_small_update_with_fm/mean": 0.8138632774353027, + "rewards/qatch_small_update_with_fm/std": 0.32255634665489197, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9859397411346436, + "sampling/importance_sampling_ratio/min": 0.0026753947604447603, + "sampling/sampling_logp_difference/max": 5.92365837097168, + "sampling/sampling_logp_difference/mean": 0.08180845528841019, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 220.4609375, + "completions/mean_terminated_length": 220.4609375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.062050150241702795, + "epoch": 0.09380530973451327, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.4117919010554631, + "learning_rate": 9.122807017543859e-07, + "loss": -0.0052, + "num_tokens": 24479100.0, + "reward": 0.7222539186477661, + "reward_std": 0.12479500472545624, + "rewards/qatch_small_update_with_fm/mean": 0.7222539186477661, + "rewards/qatch_small_update_with_fm/std": 0.35847076773643494, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9871389865875244, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.07965491712093353, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 213.265625, + "completions/mean_terminated_length": 213.265625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.05208406038582325, + "epoch": 0.09557522123893805, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2749853584316917, + "learning_rate": 9.298245614035087e-07, + "loss": 0.0086, + "num_tokens": 24906176.0, + "reward": 0.7578281164169312, + "reward_std": 0.09011317789554596, + "rewards/qatch_small_update_with_fm/mean": 0.7578281164169312, + "rewards/qatch_small_update_with_fm/std": 0.36998218297958374, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876996278762817, + "sampling/importance_sampling_ratio/min": 0.002484902273863554, + "sampling/sampling_logp_difference/max": 5.997521877288818, + "sampling/sampling_logp_difference/mean": 0.06985066086053848, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 222.65234375, + "completions/mean_terminated_length": 222.65234375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.05924999574199319, + "epoch": 0.09734513274336283, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1662708219703708, + "learning_rate": 9.473684210526315e-07, + "loss": -0.0068, + "num_tokens": 25522151.0, + "reward": 0.6106094121932983, + "reward_std": 0.09099166095256805, + "rewards/qatch_small_update_with_fm/mean": 0.6106094121932983, + "rewards/qatch_small_update_with_fm/std": 0.42094501852989197, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9885672330856323, + "sampling/importance_sampling_ratio/min": 0.0005121615249663591, + "sampling/sampling_logp_difference/max": 7.576870441436768, + "sampling/sampling_logp_difference/mean": 0.07843571156263351, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 230.015625, + "completions/mean_terminated_length": 230.015625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.06128214206546545, + "epoch": 0.09911504424778761, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.1814665152848656, + "learning_rate": 9.649122807017545e-07, + "loss": 0.0099, + "num_tokens": 25926219.0, + "reward": 0.5389687418937683, + "reward_std": 0.0760495662689209, + "rewards/qatch_small_update_with_fm/mean": 0.5389687418937683, + "rewards/qatch_small_update_with_fm/std": 0.44913288950920105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9883639216423035, + "sampling/importance_sampling_ratio/min": 0.005285531282424927, + "sampling/sampling_logp_difference/max": 5.242782115936279, + "sampling/sampling_logp_difference/mean": 0.076011061668396, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 231.01953125, + "completions/mean_terminated_length": 231.01953125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.06122827250510454, + "epoch": 0.10088495575221239, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.701525809843503, + "learning_rate": 9.82456140350877e-07, + "loss": -0.0012, + "num_tokens": 26508288.0, + "reward": 0.5856757760047913, + "reward_std": 0.10319986939430237, + "rewards/qatch_small_update_with_fm/mean": 0.5856757760047913, + "rewards/qatch_small_update_with_fm/std": 0.3761707544326782, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9905126094818115, + "sampling/importance_sampling_ratio/min": 0.0008454708731733263, + "sampling/sampling_logp_difference/max": 7.075616836547852, + "sampling/sampling_logp_difference/mean": 0.07275321334600449, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 228.78125, + "completions/mean_terminated_length": 228.78125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.05939402291551232, + "epoch": 0.10265486725663717, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1.6087452343972617, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 27004360.0, + "reward": 0.7414413690567017, + "reward_std": 0.16082945466041565, + "rewards/qatch_small_update_with_fm/mean": 0.7414413690567017, + "rewards/qatch_small_update_with_fm/std": 0.36743295192718506, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876971244812012, + "sampling/importance_sampling_ratio/min": 0.0087041687220335, + "sampling/sampling_logp_difference/max": 4.743953227996826, + "sampling/sampling_logp_difference/mean": 0.07908263057470322, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 232.23046875, + "completions/mean_terminated_length": 232.23046875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.06470445403829217, + "epoch": 0.10442477876106195, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.2989246279889253, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 27654163.0, + "reward": 0.6483319997787476, + "reward_std": 0.07241523265838623, + "rewards/qatch_small_update_with_fm/mean": 0.6483319997787476, + "rewards/qatch_small_update_with_fm/std": 0.3903117775917053, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9866316318511963, + "sampling/importance_sampling_ratio/min": 0.0015347807202488184, + "sampling/sampling_logp_difference/max": 6.479367733001709, + "sampling/sampling_logp_difference/mean": 0.08551231771707535, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 203.109375, + "completions/mean_terminated_length": 203.109375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.05414479412138462, + "epoch": 0.10619469026548672, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3547680159242048, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 28066927.0, + "reward": 0.8256796598434448, + "reward_std": 0.12319210171699524, + "rewards/qatch_small_update_with_fm/mean": 0.8256796598434448, + "rewards/qatch_small_update_with_fm/std": 0.3297073543071747, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990422248840332, + "sampling/importance_sampling_ratio/min": 0.008775134570896626, + "sampling/sampling_logp_difference/max": 4.735833168029785, + "sampling/sampling_logp_difference/mean": 0.06953281909227371, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 210.90625, + "completions/mean_terminated_length": 210.90625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.05672915978357196, + "epoch": 0.1079646017699115, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.1260978796261878, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 28441015.0, + "reward": 0.5399414300918579, + "reward_std": 0.08482731878757477, + "rewards/qatch_small_update_with_fm/mean": 0.5399413704872131, + "rewards/qatch_small_update_with_fm/std": 0.3924369812011719, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898838400840759, + "sampling/importance_sampling_ratio/min": 0.00044970287126488984, + "sampling/sampling_logp_difference/max": 7.706923484802246, + "sampling/sampling_logp_difference/mean": 0.07207824289798737, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 221.19140625, + "completions/mean_terminated_length": 221.19140625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.060915837064385414, + "epoch": 0.10973451327433628, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.1258665241146926, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 28913320.0, + "reward": 0.6639999747276306, + "reward_std": 0.10557354986667633, + "rewards/qatch_small_update_with_fm/mean": 0.6640000343322754, + "rewards/qatch_small_update_with_fm/std": 0.37992992997169495, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889802932739258, + "sampling/importance_sampling_ratio/min": 0.004290364682674408, + "sampling/sampling_logp_difference/max": 5.451383590698242, + "sampling/sampling_logp_difference/mean": 0.07521848380565643, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 211.0703125, + "completions/mean_terminated_length": 211.0703125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.057312820106744766, + "epoch": 0.11150442477876106, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3806557267663577, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 29287546.0, + "reward": 0.8031054735183716, + "reward_std": 0.13054558634757996, + "rewards/qatch_small_update_with_fm/mean": 0.8031054735183716, + "rewards/qatch_small_update_with_fm/std": 0.3409731090068817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9891703128814697, + "sampling/importance_sampling_ratio/min": 0.0009145242511294782, + "sampling/sampling_logp_difference/max": 6.997106552124023, + "sampling/sampling_logp_difference/mean": 0.07143960893154144, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 209.265625, + "completions/mean_terminated_length": 209.265625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.05227735638618469, + "epoch": 0.11327433628318584, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.1583707369088834, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 29605390.0, + "reward": 0.7770195007324219, + "reward_std": 0.0829365998506546, + "rewards/qatch_small_update_with_fm/mean": 0.7770195007324219, + "rewards/qatch_small_update_with_fm/std": 0.3401285409927368, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9910151958465576, + "sampling/importance_sampling_ratio/min": 9.103809134103358e-05, + "sampling/sampling_logp_difference/max": 9.304232597351074, + "sampling/sampling_logp_difference/mean": 0.06772209703922272, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 211.19140625, + "completions/mean_terminated_length": 211.19140625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.0573471263051033, + "epoch": 0.11504424778761062, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.6612352617651236, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 30237231.0, + "reward": 0.6868828535079956, + "reward_std": 0.12947605550289154, + "rewards/qatch_small_update_with_fm/mean": 0.6868828535079956, + "rewards/qatch_small_update_with_fm/std": 0.38128235936164856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901007413864136, + "sampling/importance_sampling_ratio/min": 0.0007929722778499126, + "sampling/sampling_logp_difference/max": 7.1397223472595215, + "sampling/sampling_logp_difference/mean": 0.0746212974190712, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 196.890625, + "completions/mean_terminated_length": 196.890625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.06002651434391737, + "epoch": 0.1168141592920354, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2459924657963664, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 30586531.0, + "reward": 0.8590390682220459, + "reward_std": 0.10891761630773544, + "rewards/qatch_small_update_with_fm/mean": 0.8590390682220459, + "rewards/qatch_small_update_with_fm/std": 0.29756253957748413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9895268678665161, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.0752851814031601, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 213.6484375, + "completions/mean_terminated_length": 213.6484375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.05108651565387845, + "epoch": 0.11858407079646018, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.6786719907991305, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 30921465.0, + "reward": 0.7343359589576721, + "reward_std": 0.0922800824046135, + "rewards/qatch_small_update_with_fm/mean": 0.7343359589576721, + "rewards/qatch_small_update_with_fm/std": 0.3966872990131378, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894901514053345, + "sampling/importance_sampling_ratio/min": 0.0053325011394917965, + "sampling/sampling_logp_difference/max": 5.2339348793029785, + "sampling/sampling_logp_difference/mean": 0.06567675620317459, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 191.671875, + "completions/mean_terminated_length": 191.671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.049398843199014664, + "epoch": 0.12035398230088495, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3480569196341743, + "learning_rate": 1e-06, + "loss": 0.0104, + "num_tokens": 31321605.0, + "reward": 0.7132851481437683, + "reward_std": 0.10275396704673767, + "rewards/qatch_small_update_with_fm/mean": 0.7132851481437683, + "rewards/qatch_small_update_with_fm/std": 0.36657455563545227, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901378750801086, + "sampling/importance_sampling_ratio/min": 0.00142034946475178, + "sampling/sampling_logp_difference/max": 6.556852340698242, + "sampling/sampling_logp_difference/mean": 0.06882262229919434, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 226.60546875, + "completions/mean_terminated_length": 226.60546875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.06086216261610389, + "epoch": 0.12212389380530973, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.3171490055496433, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 31961792.0, + "reward": 0.5173789262771606, + "reward_std": 0.10575304925441742, + "rewards/qatch_small_update_with_fm/mean": 0.5173789262771606, + "rewards/qatch_small_update_with_fm/std": 0.3906656801700592, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989037811756134, + "sampling/importance_sampling_ratio/min": 0.006783433724194765, + "sampling/sampling_logp_difference/max": 4.993271827697754, + "sampling/sampling_logp_difference/mean": 0.07524209469556808, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 212.49609375, + "completions/mean_terminated_length": 212.49609375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.06138837058097124, + "epoch": 0.12389380530973451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5948358740892794, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 32319615.0, + "reward": 0.7251406311988831, + "reward_std": 0.14819201827049255, + "rewards/qatch_small_update_with_fm/mean": 0.7251405715942383, + "rewards/qatch_small_update_with_fm/std": 0.4064193665981293, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.987440824508667, + "sampling/importance_sampling_ratio/min": 1.3609868787511914e-08, + "sampling/sampling_logp_difference/max": 18.112470626831055, + "sampling/sampling_logp_difference/mean": 0.07786168903112411, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 204.21875, + "completions/mean_terminated_length": 204.21875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.05273952102288604, + "epoch": 0.1256637168141593, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1148325278657547, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 32774903.0, + "reward": 0.6098905801773071, + "reward_std": 0.07109032571315765, + "rewards/qatch_small_update_with_fm/mean": 0.6098905801773071, + "rewards/qatch_small_update_with_fm/std": 0.3966805040836334, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992613673210144, + "sampling/importance_sampling_ratio/min": 0.006861389148980379, + "sampling/sampling_logp_difference/max": 4.981845378875732, + "sampling/sampling_logp_difference/mean": 0.06364965438842773, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 228.99609375, + "completions/mean_terminated_length": 228.99609375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.06161496136337519, + "epoch": 0.12743362831858407, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2833428215317622, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 33585750.0, + "reward": 0.6821328401565552, + "reward_std": 0.11582638323307037, + "rewards/qatch_small_update_with_fm/mean": 0.6821328401565552, + "rewards/qatch_small_update_with_fm/std": 0.4021257758140564, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876558780670166, + "sampling/importance_sampling_ratio/min": 0.003892954671755433, + "sampling/sampling_logp_difference/max": 5.548586845397949, + "sampling/sampling_logp_difference/mean": 0.08207723498344421, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 211.3359375, + "completions/mean_terminated_length": 211.3359375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.05988115398213267, + "epoch": 0.12920353982300886, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.398151631377601, + "learning_rate": 1e-06, + "loss": 0.0176, + "num_tokens": 34136700.0, + "reward": 0.5716406106948853, + "reward_std": 0.04969751462340355, + "rewards/qatch_small_update_with_fm/mean": 0.5716406106948853, + "rewards/qatch_small_update_with_fm/std": 0.40344637632369995, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906647205352783, + "sampling/importance_sampling_ratio/min": 0.005279851146042347, + "sampling/sampling_logp_difference/max": 5.243857383728027, + "sampling/sampling_logp_difference/mean": 0.07489632815122604, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 235.5859375, + "completions/mean_terminated_length": 235.5859375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.06329893041402102, + "epoch": 0.13097345132743363, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.0823255201111395, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 34670914.0, + "reward": 0.6543593406677246, + "reward_std": 0.0780901163816452, + "rewards/qatch_small_update_with_fm/mean": 0.6543593406677246, + "rewards/qatch_small_update_with_fm/std": 0.4128835201263428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9857140779495239, + "sampling/importance_sampling_ratio/min": 0.0030068783089518547, + "sampling/sampling_logp_difference/max": 5.8068528175354, + "sampling/sampling_logp_difference/mean": 0.0846657007932663, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 244.5546875, + "completions/mean_terminated_length": 244.5546875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.07185739278793335, + "epoch": 0.13274336283185842, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.378086589282812, + "learning_rate": 1e-06, + "loss": -0.0363, + "num_tokens": 35206464.0, + "reward": 0.6239843964576721, + "reward_std": 0.14069399237632751, + "rewards/qatch_small_update_with_fm/mean": 0.6239843368530273, + "rewards/qatch_small_update_with_fm/std": 0.42358607053756714, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9863853454589844, + "sampling/importance_sampling_ratio/min": 0.006657592952251434, + "sampling/sampling_logp_difference/max": 5.011997222900391, + "sampling/sampling_logp_difference/mean": 0.08819466829299927, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 220.5390625, + "completions/mean_terminated_length": 220.5390625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.06628159014508128, + "epoch": 0.13451327433628318, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1875335710855202, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 35660042.0, + "reward": 0.7418711185455322, + "reward_std": 0.08845853805541992, + "rewards/qatch_small_update_with_fm/mean": 0.7418710589408875, + "rewards/qatch_small_update_with_fm/std": 0.32926684617996216, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9878979921340942, + "sampling/importance_sampling_ratio/min": 0.0010934383608400822, + "sampling/sampling_logp_difference/max": 6.818428039550781, + "sampling/sampling_logp_difference/mean": 0.0823526680469513, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 210.45703125, + "completions/mean_terminated_length": 210.45703125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.0547793940640986, + "epoch": 0.13628318584070798, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2820133241614284, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 36098735.0, + "reward": 0.7723632454872131, + "reward_std": 0.10817845165729523, + "rewards/qatch_small_update_with_fm/mean": 0.7723632454872131, + "rewards/qatch_small_update_with_fm/std": 0.32957103848457336, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900933504104614, + "sampling/importance_sampling_ratio/min": 0.0019473416032269597, + "sampling/sampling_logp_difference/max": 6.241290092468262, + "sampling/sampling_logp_difference/mean": 0.06941035389900208, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 199.92578125, + "completions/mean_terminated_length": 199.92578125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.059035807847976685, + "epoch": 0.13805309734513274, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1.810166027187809, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 36648908.0, + "reward": 0.666699230670929, + "reward_std": 0.1434335559606552, + "rewards/qatch_small_update_with_fm/mean": 0.666699230670929, + "rewards/qatch_small_update_with_fm/std": 0.36849868297576904, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9866025447845459, + "sampling/importance_sampling_ratio/min": 0.006941504310816526, + "sampling/sampling_logp_difference/max": 4.970236778259277, + "sampling/sampling_logp_difference/mean": 0.07928824424743652, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 209.16796875, + "completions/mean_terminated_length": 209.16796875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.06461150664836168, + "epoch": 0.13982300884955753, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.5742182581772457, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 37106087.0, + "reward": 0.6654453277587891, + "reward_std": 0.13180303573608398, + "rewards/qatch_small_update_with_fm/mean": 0.6654453277587891, + "rewards/qatch_small_update_with_fm/std": 0.39904341101646423, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9861378073692322, + "sampling/importance_sampling_ratio/min": 0.006765482947230339, + "sampling/sampling_logp_difference/max": 4.995921611785889, + "sampling/sampling_logp_difference/mean": 0.08057506382465363, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 224.44921875, + "completions/mean_terminated_length": 224.44921875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.06497358391061425, + "epoch": 0.1415929203539823, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.0691186894798947, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 37761642.0, + "reward": 0.7223789691925049, + "reward_std": 0.03682992607355118, + "rewards/qatch_small_update_with_fm/mean": 0.7223789691925049, + "rewards/qatch_small_update_with_fm/std": 0.4028269946575165, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9870048761367798, + "sampling/importance_sampling_ratio/min": 0.00475488742813468, + "sampling/sampling_logp_difference/max": 5.3485822677612305, + "sampling/sampling_logp_difference/mean": 0.08557099103927612, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 209.359375, + "completions/mean_terminated_length": 209.359375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.07113357819616795, + "epoch": 0.1433628318584071, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1430097586030235, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 38304758.0, + "reward": 0.664121150970459, + "reward_std": 0.07573066651821136, + "rewards/qatch_small_update_with_fm/mean": 0.6641210913658142, + "rewards/qatch_small_update_with_fm/std": 0.3896544873714447, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9852210879325867, + "sampling/importance_sampling_ratio/min": 0.004114307928830385, + "sampling/sampling_logp_difference/max": 5.493284702301025, + "sampling/sampling_logp_difference/mean": 0.09156718850135803, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 198.05859375, + "completions/mean_terminated_length": 198.05859375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.060767903458327055, + "epoch": 0.14513274336283186, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.064463118851441, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 38814533.0, + "reward": 0.5577499866485596, + "reward_std": 0.03734627366065979, + "rewards/qatch_small_update_with_fm/mean": 0.5577499866485596, + "rewards/qatch_small_update_with_fm/std": 0.4213722348213196, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9857168197631836, + "sampling/importance_sampling_ratio/min": 1.6835397076420122e-08, + "sampling/sampling_logp_difference/max": 17.899782180786133, + "sampling/sampling_logp_difference/mean": 0.0847933366894722, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 195.48046875, + "completions/mean_terminated_length": 195.48046875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.05940087651833892, + "epoch": 0.14690265486725665, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.5926033700931235, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 39296656.0, + "reward": 0.6463242173194885, + "reward_std": 0.1061515137553215, + "rewards/qatch_small_update_with_fm/mean": 0.6463241577148438, + "rewards/qatch_small_update_with_fm/std": 0.3937905728816986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877705574035645, + "sampling/importance_sampling_ratio/min": 0.004423520993441343, + "sampling/sampling_logp_difference/max": 5.420819282531738, + "sampling/sampling_logp_difference/mean": 0.07931964099407196, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 207.98046875, + "completions/mean_terminated_length": 207.98046875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.05249483743682504, + "epoch": 0.1486725663716814, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.8908889978726945, + "learning_rate": 1e-06, + "loss": -0.0215, + "num_tokens": 40012699.0, + "reward": 0.8066601157188416, + "reward_std": 0.04902618005871773, + "rewards/qatch_small_update_with_fm/mean": 0.8066601753234863, + "rewards/qatch_small_update_with_fm/std": 0.3067781925201416, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874978065490723, + "sampling/importance_sampling_ratio/min": 0.0017582516884431243, + "sampling/sampling_logp_difference/max": 6.343435287475586, + "sampling/sampling_logp_difference/mean": 0.07647126913070679, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 211.04296875, + "completions/mean_terminated_length": 211.04296875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.06565155554562807, + "epoch": 0.1504424778761062, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3694550387992506, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 40324774.0, + "reward": 0.7276562452316284, + "reward_std": 0.12753424048423767, + "rewards/qatch_small_update_with_fm/mean": 0.7276562452316284, + "rewards/qatch_small_update_with_fm/std": 0.3714563846588135, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896517395973206, + "sampling/importance_sampling_ratio/min": 0.0007163186674006283, + "sampling/sampling_logp_difference/max": 7.241385459899902, + "sampling/sampling_logp_difference/mean": 0.07456763833761215, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 201.8203125, + "completions/mean_terminated_length": 201.8203125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.051434017252177, + "epoch": 0.15221238938053097, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.075833278684105, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 40738408.0, + "reward": 0.7893867492675781, + "reward_std": 0.07746925950050354, + "rewards/qatch_small_update_with_fm/mean": 0.7893867492675781, + "rewards/qatch_small_update_with_fm/std": 0.305320680141449, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909459948539734, + "sampling/importance_sampling_ratio/min": 0.00526427524164319, + "sampling/sampling_logp_difference/max": 5.246811866760254, + "sampling/sampling_logp_difference/mean": 0.06327448785305023, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 188.52734375, + "completions/mean_terminated_length": 188.52734375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.05542505206540227, + "epoch": 0.15398230088495576, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4599249891402826, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 41215151.0, + "reward": 0.7529062032699585, + "reward_std": 0.04203649237751961, + "rewards/qatch_small_update_with_fm/mean": 0.7529062032699585, + "rewards/qatch_small_update_with_fm/std": 0.3346794545650482, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9897748231887817, + "sampling/importance_sampling_ratio/min": 9.50306002778234e-07, + "sampling/sampling_logp_difference/max": 13.86648178100586, + "sampling/sampling_logp_difference/mean": 0.06705738604068756, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 224.00390625, + "completions/mean_terminated_length": 224.00390625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.06550072925165296, + "epoch": 0.15575221238938053, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.5855353981192564, + "learning_rate": 1e-06, + "loss": 0.0179, + "num_tokens": 41742432.0, + "reward": 0.6274609565734863, + "reward_std": 0.15894006192684174, + "rewards/qatch_small_update_with_fm/mean": 0.6274609565734863, + "rewards/qatch_small_update_with_fm/std": 0.36873817443847656, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874089360237122, + "sampling/importance_sampling_ratio/min": 0.0012522448087111115, + "sampling/sampling_logp_difference/max": 6.682817459106445, + "sampling/sampling_logp_difference/mean": 0.08358839154243469, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 219.96484375, + "completions/mean_terminated_length": 219.96484375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.06327277980744839, + "epoch": 0.15752212389380532, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.3868624400474925, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 42285063.0, + "reward": 0.6643476486206055, + "reward_std": 0.09053190052509308, + "rewards/qatch_small_update_with_fm/mean": 0.6643476486206055, + "rewards/qatch_small_update_with_fm/std": 0.36776453256607056, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9868210554122925, + "sampling/importance_sampling_ratio/min": 0.0032281808089464903, + "sampling/sampling_logp_difference/max": 5.735836505889893, + "sampling/sampling_logp_difference/mean": 0.08308855444192886, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 221.921875, + "completions/mean_terminated_length": 221.921875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.06209539761766791, + "epoch": 0.1592920353982301, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9604749244588173, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 42771827.0, + "reward": 0.624679684638977, + "reward_std": 0.05232584848999977, + "rewards/qatch_small_update_with_fm/mean": 0.624679684638977, + "rewards/qatch_small_update_with_fm/std": 0.4224699139595032, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900439977645874, + "sampling/importance_sampling_ratio/min": 0.006829948164522648, + "sampling/sampling_logp_difference/max": 4.986438274383545, + "sampling/sampling_logp_difference/mean": 0.0733073502779007, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 215.45703125, + "completions/mean_terminated_length": 215.45703125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.059518329333513975, + "epoch": 0.16106194690265488, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.363715543188874, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 43140760.0, + "reward": 0.6687187552452087, + "reward_std": 0.0947665125131607, + "rewards/qatch_small_update_with_fm/mean": 0.6687187552452087, + "rewards/qatch_small_update_with_fm/std": 0.39010441303253174, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9891262650489807, + "sampling/importance_sampling_ratio/min": 0.000560973712708801, + "sampling/sampling_logp_difference/max": 7.485836505889893, + "sampling/sampling_logp_difference/mean": 0.07687044888734818, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 238.33203125, + "completions/mean_terminated_length": 238.33203125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.06875412631779909, + "epoch": 0.16283185840707964, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.2344610418561328, + "learning_rate": 1e-06, + "loss": -0.0178, + "num_tokens": 43687517.0, + "reward": 0.621164083480835, + "reward_std": 0.07217143476009369, + "rewards/qatch_small_update_with_fm/mean": 0.621164083480835, + "rewards/qatch_small_update_with_fm/std": 0.40286827087402344, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9864582419395447, + "sampling/importance_sampling_ratio/min": 0.0018237647600471973, + "sampling/sampling_logp_difference/max": 6.306852340698242, + "sampling/sampling_logp_difference/mean": 0.08633337914943695, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 182.59765625, + "completions/mean_terminated_length": 182.59765625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.048110376577824354, + "epoch": 0.16460176991150444, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3812354569394656, + "learning_rate": 1e-06, + "loss": -0.0125, + "num_tokens": 44077782.0, + "reward": 0.7503007650375366, + "reward_std": 0.05579577013850212, + "rewards/qatch_small_update_with_fm/mean": 0.7503007650375366, + "rewards/qatch_small_update_with_fm/std": 0.3701190650463104, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9926257133483887, + "sampling/importance_sampling_ratio/min": 0.008679235354065895, + "sampling/sampling_logp_difference/max": 4.746821880340576, + "sampling/sampling_logp_difference/mean": 0.06102976202964783, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 226.34765625, + "completions/mean_terminated_length": 226.34765625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.06354533648118377, + "epoch": 0.1663716814159292, + "frac_reward_zero_std": 0.4375, + "grad_norm": 2.0330279818529817, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 44597039.0, + "reward": 0.6143515110015869, + "reward_std": 0.09292621165513992, + "rewards/qatch_small_update_with_fm/mean": 0.6143515706062317, + "rewards/qatch_small_update_with_fm/std": 0.4134713113307953, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9902607798576355, + "sampling/importance_sampling_ratio/min": 0.0037183836102485657, + "sampling/sampling_logp_difference/max": 5.594466209411621, + "sampling/sampling_logp_difference/mean": 0.07333210110664368, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 216.32421875, + "completions/mean_terminated_length": 216.32421875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.0570914801210165, + "epoch": 0.168141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9805195877495109, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 45176658.0, + "reward": 0.7462930083274841, + "reward_std": 0.05266830697655678, + "rewards/qatch_small_update_with_fm/mean": 0.7462930083274841, + "rewards/qatch_small_update_with_fm/std": 0.3875497579574585, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896615743637085, + "sampling/importance_sampling_ratio/min": 0.0019235039362683892, + "sampling/sampling_logp_difference/max": 6.253606796264648, + "sampling/sampling_logp_difference/mean": 0.07471055537462234, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 222.78125, + "completions/mean_terminated_length": 222.78125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.0618698401376605, + "epoch": 0.16991150442477876, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1.2423456493533636, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 45682122.0, + "reward": 0.5794335603713989, + "reward_std": 0.16376078128814697, + "rewards/qatch_small_update_with_fm/mean": 0.5794335603713989, + "rewards/qatch_small_update_with_fm/std": 0.37759649753570557, + "sampling/importance_sampling_ratio/max": 1.9816300868988037, + "sampling/importance_sampling_ratio/mean": 0.9911603927612305, + "sampling/importance_sampling_ratio/min": 0.005275054834783077, + "sampling/sampling_logp_difference/max": 5.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.06948962807655334, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 199.75390625, + "completions/mean_terminated_length": 199.75390625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.05948884226381779, + "epoch": 0.17168141592920355, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.0621083166267602, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 46077291.0, + "reward": 0.8319296836853027, + "reward_std": 0.10714203119277954, + "rewards/qatch_small_update_with_fm/mean": 0.8319296836853027, + "rewards/qatch_small_update_with_fm/std": 0.32574471831321716, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9903090000152588, + "sampling/importance_sampling_ratio/min": 0.0067833466455340385, + "sampling/sampling_logp_difference/max": 4.993284702301025, + "sampling/sampling_logp_difference/mean": 0.07050412893295288, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 231.16796875, + "completions/mean_terminated_length": 231.16796875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.055903279688209295, + "epoch": 0.17345132743362832, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.498916659342343, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 46600134.0, + "reward": 0.5468984246253967, + "reward_std": 0.11932238936424255, + "rewards/qatch_small_update_with_fm/mean": 0.5468984246253967, + "rewards/qatch_small_update_with_fm/std": 0.3713155686855316, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9923816919326782, + "sampling/importance_sampling_ratio/min": 0.0067833466455340385, + "sampling/sampling_logp_difference/max": 4.993284702301025, + "sampling/sampling_logp_difference/mean": 0.06638407707214355, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 217.890625, + "completions/mean_terminated_length": 217.890625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.06640621554106474, + "epoch": 0.1752212389380531, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.5718249727687301, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 47070490.0, + "reward": 0.7290234565734863, + "reward_std": 0.15270094573497772, + "rewards/qatch_small_update_with_fm/mean": 0.7290234565734863, + "rewards/qatch_small_update_with_fm/std": 0.3411417007446289, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9883034229278564, + "sampling/importance_sampling_ratio/min": 0.005264220293611288, + "sampling/sampling_logp_difference/max": 5.246822357177734, + "sampling/sampling_logp_difference/mean": 0.07885263860225677, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 201.0859375, + "completions/mean_terminated_length": 201.0859375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.05885794572532177, + "epoch": 0.17699115044247787, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3345730020684947, + "learning_rate": 1e-06, + "loss": -0.0168, + "num_tokens": 47546912.0, + "reward": 0.6619570255279541, + "reward_std": 0.09077616035938263, + "rewards/qatch_small_update_with_fm/mean": 0.6619570255279541, + "rewards/qatch_small_update_with_fm/std": 0.424406498670578, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890620112419128, + "sampling/importance_sampling_ratio/min": 0.0067546493373811245, + "sampling/sampling_logp_difference/max": 4.997524261474609, + "sampling/sampling_logp_difference/mean": 0.07268444448709488, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 239.95703125, + "completions/mean_terminated_length": 239.95703125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.0687459222972393, + "epoch": 0.17876106194690267, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.8181741928189153, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 48250181.0, + "reward": 0.6181562542915344, + "reward_std": 0.06346184015274048, + "rewards/qatch_small_update_with_fm/mean": 0.6181562542915344, + "rewards/qatch_small_update_with_fm/std": 0.393032044172287, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9872247576713562, + "sampling/importance_sampling_ratio/min": 0.0052973320707678795, + "sampling/sampling_logp_difference/max": 5.240551948547363, + "sampling/sampling_logp_difference/mean": 0.07996334880590439, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 229.55078125, + "completions/mean_terminated_length": 229.55078125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.076118397526443, + "epoch": 0.18053097345132743, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.8489205627398348, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 48699602.0, + "reward": 0.7388476133346558, + "reward_std": 0.08579277992248535, + "rewards/qatch_small_update_with_fm/mean": 0.7388476133346558, + "rewards/qatch_small_update_with_fm/std": 0.3763107657432556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9902989864349365, + "sampling/importance_sampling_ratio/min": 0.011183848604559898, + "sampling/sampling_logp_difference/max": 4.493284702301025, + "sampling/sampling_logp_difference/mean": 0.07630578428506851, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 234.83203125, + "completions/mean_terminated_length": 234.83203125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.06544858030974865, + "epoch": 0.18230088495575222, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.0495184915762261, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 49279447.0, + "reward": 0.7172539234161377, + "reward_std": 0.08092990517616272, + "rewards/qatch_small_update_with_fm/mean": 0.7172539234161377, + "rewards/qatch_small_update_with_fm/std": 0.36648428440093994, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909118413925171, + "sampling/importance_sampling_ratio/min": 0.0002085514715872705, + "sampling/sampling_logp_difference/max": 8.475324630737305, + "sampling/sampling_logp_difference/mean": 0.07229933142662048, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 201.5234375, + "completions/mean_terminated_length": 201.5234375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.06334317708387971, + "epoch": 0.184070796460177, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9710610492311486, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 49833373.0, + "reward": 0.7955077886581421, + "reward_std": 0.08076589554548264, + "rewards/qatch_small_update_with_fm/mean": 0.7955077886581421, + "rewards/qatch_small_update_with_fm/std": 0.2992088496685028, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912827014923096, + "sampling/importance_sampling_ratio/min": 0.002524232491850853, + "sampling/sampling_logp_difference/max": 5.981818199157715, + "sampling/sampling_logp_difference/mean": 0.07269036024808884, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 251.88671875, + "completions/mean_terminated_length": 251.88671875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.07081401254981756, + "epoch": 0.18584070796460178, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.2405682870240118, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 50403440.0, + "reward": 0.7778749465942383, + "reward_std": 0.07802677154541016, + "rewards/qatch_small_update_with_fm/mean": 0.7778750061988831, + "rewards/qatch_small_update_with_fm/std": 0.32360637187957764, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9902708530426025, + "sampling/importance_sampling_ratio/min": 0.003205658169463277, + "sampling/sampling_logp_difference/max": 5.742837905883789, + "sampling/sampling_logp_difference/mean": 0.07808385789394379, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 206.82421875, + "completions/mean_terminated_length": 206.82421875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.0713232234120369, + "epoch": 0.18761061946902655, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.2524565483157322, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 50824931.0, + "reward": 0.7011171579360962, + "reward_std": 0.0960366427898407, + "rewards/qatch_small_update_with_fm/mean": 0.7011171579360962, + "rewards/qatch_small_update_with_fm/std": 0.37654486298561096, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9864037036895752, + "sampling/importance_sampling_ratio/min": 0.004132171627134085, + "sampling/sampling_logp_difference/max": 5.488952159881592, + "sampling/sampling_logp_difference/mean": 0.08674513548612595, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 204.5234375, + "completions/mean_terminated_length": 204.5234375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.06823413353413343, + "epoch": 0.18938053097345134, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.2203515906399123, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 51307657.0, + "reward": 0.7907344102859497, + "reward_std": 0.11454281955957413, + "rewards/qatch_small_update_with_fm/mean": 0.7907344102859497, + "rewards/qatch_small_update_with_fm/std": 0.3722200095653534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9859263896942139, + "sampling/importance_sampling_ratio/min": 0.00414169579744339, + "sampling/sampling_logp_difference/max": 5.486649990081787, + "sampling/sampling_logp_difference/mean": 0.0840449258685112, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 216.546875, + "completions/mean_terminated_length": 216.546875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.07362686283886433, + "epoch": 0.1911504424778761, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1727916966743837, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 51739557.0, + "reward": 0.7079882621765137, + "reward_std": 0.1126692146062851, + "rewards/qatch_small_update_with_fm/mean": 0.7079882621765137, + "rewards/qatch_small_update_with_fm/std": 0.3950938880443573, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9893667697906494, + "sampling/importance_sampling_ratio/min": 0.0005116977845318615, + "sampling/sampling_logp_difference/max": 7.5777764320373535, + "sampling/sampling_logp_difference/mean": 0.0835602730512619, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 235.3046875, + "completions/mean_terminated_length": 235.3046875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.069594438187778, + "epoch": 0.1929203539823009, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.1717602880661633, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 52304035.0, + "reward": 0.7542539238929749, + "reward_std": 0.10196835547685623, + "rewards/qatch_small_update_with_fm/mean": 0.7542538642883301, + "rewards/qatch_small_update_with_fm/std": 0.3392114043235779, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988386332988739, + "sampling/importance_sampling_ratio/min": 0.003735895035788417, + "sampling/sampling_logp_difference/max": 5.589767932891846, + "sampling/sampling_logp_difference/mean": 0.07965923845767975, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 237.87109375, + "completions/mean_terminated_length": 237.87109375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.08200910221785307, + "epoch": 0.19469026548672566, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1239702239825442, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 52823218.0, + "reward": 0.7591797113418579, + "reward_std": 0.13066166639328003, + "rewards/qatch_small_update_with_fm/mean": 0.7591797113418579, + "rewards/qatch_small_update_with_fm/std": 0.3666918873786926, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904977679252625, + "sampling/importance_sampling_ratio/min": 0.008987288922071457, + "sampling/sampling_logp_difference/max": 4.711944103240967, + "sampling/sampling_logp_difference/mean": 0.08705293387174606, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 248.390625, + "completions/mean_terminated_length": 248.390625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.07323302794247866, + "epoch": 0.19646017699115045, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9989215025851239, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 53404518.0, + "reward": 0.7107539176940918, + "reward_std": 0.14668454229831696, + "rewards/qatch_small_update_with_fm/mean": 0.7107539176940918, + "rewards/qatch_small_update_with_fm/std": 0.3824143707752228, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9871342182159424, + "sampling/importance_sampling_ratio/min": 0.004103472921997309, + "sampling/sampling_logp_difference/max": 5.495921611785889, + "sampling/sampling_logp_difference/mean": 0.08473586291074753, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 226.14453125, + "completions/mean_terminated_length": 226.14453125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.07057367824018002, + "epoch": 0.19823008849557522, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.0350790346928784, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 53895739.0, + "reward": 0.6831991672515869, + "reward_std": 0.0855279192328453, + "rewards/qatch_small_update_with_fm/mean": 0.6831991672515869, + "rewards/qatch_small_update_with_fm/std": 0.402239054441452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9895448684692383, + "sampling/importance_sampling_ratio/min": 0.005957757122814655, + "sampling/sampling_logp_difference/max": 5.123061180114746, + "sampling/sampling_logp_difference/mean": 0.08003868162631989, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 228.67578125, + "completions/mean_terminated_length": 228.67578125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.08122638612985611, + "epoch": 0.2, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.849733979561118, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 54381624.0, + "reward": 0.7690976858139038, + "reward_std": 0.08409841358661652, + "rewards/qatch_small_update_with_fm/mean": 0.7690976858139038, + "rewards/qatch_small_update_with_fm/std": 0.3112572729587555, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9858420491218567, + "sampling/importance_sampling_ratio/min": 0.0054420898668468, + "sampling/sampling_logp_difference/max": 5.213592052459717, + "sampling/sampling_logp_difference/mean": 0.09284666180610657, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 237.3203125, + "completions/mean_terminated_length": 237.3203125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.07597249001264572, + "epoch": 0.20176991150442478, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7586615609792853, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 54832682.0, + "reward": 0.6945820450782776, + "reward_std": 0.09893607348203659, + "rewards/qatch_small_update_with_fm/mean": 0.6945819854736328, + "rewards/qatch_small_update_with_fm/std": 0.36656326055526733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901285767555237, + "sampling/importance_sampling_ratio/min": 0.014312805607914925, + "sampling/sampling_logp_difference/max": 4.24660062789917, + "sampling/sampling_logp_difference/mean": 0.0770685225725174, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 257.66796875, + "completions/mean_terminated_length": 257.66796875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.0925935534760356, + "epoch": 0.20353982300884957, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.0435651562227988, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 55348565.0, + "reward": 0.7382968664169312, + "reward_std": 0.10771472752094269, + "rewards/qatch_small_update_with_fm/mean": 0.7382968664169312, + "rewards/qatch_small_update_with_fm/std": 0.358004629611969, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990604043006897, + "sampling/importance_sampling_ratio/min": 0.014600342139601707, + "sampling/sampling_logp_difference/max": 4.226710319519043, + "sampling/sampling_logp_difference/mean": 0.09014023840427399, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 251.6875, + "completions/mean_terminated_length": 251.6875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.08259568270295858, + "epoch": 0.20530973451327433, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.8880503095846881, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 55758325.0, + "reward": 0.7133241891860962, + "reward_std": 0.09667318314313889, + "rewards/qatch_small_update_with_fm/mean": 0.7133241891860962, + "rewards/qatch_small_update_with_fm/std": 0.37226539850234985, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913509488105774, + "sampling/importance_sampling_ratio/min": 0.004103474784642458, + "sampling/sampling_logp_difference/max": 5.4959211349487305, + "sampling/sampling_logp_difference/mean": 0.08229538798332214, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 254.046875, + "completions/mean_terminated_length": 254.046875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.09230261761695147, + "epoch": 0.20707964601769913, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1159689278172942, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 56245665.0, + "reward": 0.782953143119812, + "reward_std": 0.08451966941356659, + "rewards/qatch_small_update_with_fm/mean": 0.782953143119812, + "rewards/qatch_small_update_with_fm/std": 0.33307263255119324, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882062673568726, + "sampling/importance_sampling_ratio/min": 0.011158311739563942, + "sampling/sampling_logp_difference/max": 4.495570659637451, + "sampling/sampling_logp_difference/mean": 0.09717138111591339, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 267.03515625, + "completions/mean_terminated_length": 267.03515625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.1017747251316905, + "epoch": 0.2088495575221239, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.8300344236250922, + "learning_rate": 1e-06, + "loss": 0.0337, + "num_tokens": 56741018.0, + "reward": 0.6730078458786011, + "reward_std": 0.0929047018289566, + "rewards/qatch_small_update_with_fm/mean": 0.6730078458786011, + "rewards/qatch_small_update_with_fm/std": 0.37552404403686523, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898161292076111, + "sampling/importance_sampling_ratio/min": 0.0010080569190904498, + "sampling/sampling_logp_difference/max": 6.899730682373047, + "sampling/sampling_logp_difference/mean": 0.09620655328035355, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 255.59765625, + "completions/mean_terminated_length": 255.59765625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.08545802906155586, + "epoch": 0.21061946902654868, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.0319217136508039, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 57145571.0, + "reward": 0.6751953363418579, + "reward_std": 0.0626310482621193, + "rewards/qatch_small_update_with_fm/mean": 0.6751953363418579, + "rewards/qatch_small_update_with_fm/std": 0.38396772742271423, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9910754561424255, + "sampling/importance_sampling_ratio/min": 0.006784905679523945, + "sampling/sampling_logp_difference/max": 4.9930548667907715, + "sampling/sampling_logp_difference/mean": 0.08353398740291595, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 283.390625, + "completions/mean_terminated_length": 283.390625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.09278723411262035, + "epoch": 0.21238938053097345, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.40765157194615936, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 57686151.0, + "reward": 0.7513476610183716, + "reward_std": 0.042032234370708466, + "rewards/qatch_small_update_with_fm/mean": 0.7513476610183716, + "rewards/qatch_small_update_with_fm/std": 0.33298224210739136, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9905245304107666, + "sampling/importance_sampling_ratio/min": 7.85573092798586e-07, + "sampling/sampling_logp_difference/max": 14.056852340698242, + "sampling/sampling_logp_difference/mean": 0.09131969511508942, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 253.28125, + "completions/mean_terminated_length": 253.28125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.08711250498890877, + "epoch": 0.21415929203539824, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.7822434137440697, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 58111615.0, + "reward": 0.7699062824249268, + "reward_std": 0.14036858081817627, + "rewards/qatch_small_update_with_fm/mean": 0.7699062824249268, + "rewards/qatch_small_update_with_fm/std": 0.3514271378517151, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898660182952881, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.09157545864582062, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 259.3515625, + "completions/mean_terminated_length": 259.3515625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.091193076223135, + "epoch": 0.215929203539823, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.7754221166721602, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 58597945.0, + "reward": 0.6951679587364197, + "reward_std": 0.11067484319210052, + "rewards/qatch_small_update_with_fm/mean": 0.6951679587364197, + "rewards/qatch_small_update_with_fm/std": 0.38288044929504395, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879680871963501, + "sampling/importance_sampling_ratio/min": 0.005322370678186417, + "sampling/sampling_logp_difference/max": 5.235836505889893, + "sampling/sampling_logp_difference/mean": 0.09452269226312637, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 279.49609375, + "completions/mean_terminated_length": 279.49609375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.09114830754697323, + "epoch": 0.2176991150442478, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6514437850174619, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 58993496.0, + "reward": 0.6201406121253967, + "reward_std": 0.09873516857624054, + "rewards/qatch_small_update_with_fm/mean": 0.6201406121253967, + "rewards/qatch_small_update_with_fm/std": 0.4052762985229492, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9893493056297302, + "sampling/importance_sampling_ratio/min": 0.0015082587487995625, + "sampling/sampling_logp_difference/max": 6.496799468994141, + "sampling/sampling_logp_difference/mean": 0.08865205943584442, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 268.82421875, + "completions/mean_terminated_length": 268.82421875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.09484461508691311, + "epoch": 0.21946902654867256, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.934234614851166, + "learning_rate": 1e-06, + "loss": -0.021, + "num_tokens": 59620395.0, + "reward": 0.6767265200614929, + "reward_std": 0.10116851329803467, + "rewards/qatch_small_update_with_fm/mean": 0.6767265796661377, + "rewards/qatch_small_update_with_fm/std": 0.3683614432811737, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874666333198547, + "sampling/importance_sampling_ratio/min": 0.011243586428463459, + "sampling/sampling_logp_difference/max": 4.48795747756958, + "sampling/sampling_logp_difference/mean": 0.09557405114173889, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 262.40234375, + "completions/mean_terminated_length": 262.40234375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.09813018050044775, + "epoch": 0.22123893805309736, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5368316481587041, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 59913058.0, + "reward": 0.7477929592132568, + "reward_std": 0.032144203782081604, + "rewards/qatch_small_update_with_fm/mean": 0.7477929592132568, + "rewards/qatch_small_update_with_fm/std": 0.3281362056732178, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916006326675415, + "sampling/importance_sampling_ratio/min": 0.018390489742159843, + "sampling/sampling_logp_difference/max": 3.9959216117858887, + "sampling/sampling_logp_difference/mean": 0.08756014704704285, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 276.3359375, + "completions/mean_terminated_length": 276.3359375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.10066359303891659, + "epoch": 0.22300884955752212, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.838232937849372, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 60417352.0, + "reward": 0.6837852001190186, + "reward_std": 0.15973472595214844, + "rewards/qatch_small_update_with_fm/mean": 0.6837852001190186, + "rewards/qatch_small_update_with_fm/std": 0.37046951055526733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911990165710449, + "sampling/importance_sampling_ratio/min": 0.011186771094799042, + "sampling/sampling_logp_difference/max": 4.49302339553833, + "sampling/sampling_logp_difference/mean": 0.09084731340408325, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 259.4140625, + "completions/mean_terminated_length": 259.4140625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.08994897082448006, + "epoch": 0.2247787610619469, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7653161293513729, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 60800850.0, + "reward": 0.6803281307220459, + "reward_std": 0.14784881472587585, + "rewards/qatch_small_update_with_fm/mean": 0.6803281307220459, + "rewards/qatch_small_update_with_fm/std": 0.40771958231925964, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874090552330017, + "sampling/importance_sampling_ratio/min": 0.002029245253652334, + "sampling/sampling_logp_difference/max": 6.200091361999512, + "sampling/sampling_logp_difference/mean": 0.09263262152671814, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 280.67578125, + "completions/mean_terminated_length": 280.67578125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.1118616834282875, + "epoch": 0.22654867256637168, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.8858334262845288, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 61328671.0, + "reward": 0.6529296636581421, + "reward_std": 0.09167185425758362, + "rewards/qatch_small_update_with_fm/mean": 0.6529296636581421, + "rewards/qatch_small_update_with_fm/std": 0.41027647256851196, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9881945848464966, + "sampling/importance_sampling_ratio/min": 0.0045516896061599255, + "sampling/sampling_logp_difference/max": 5.392256736755371, + "sampling/sampling_logp_difference/mean": 0.10528907924890518, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 302.046875, + "completions/mean_terminated_length": 302.046875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.10696916934102774, + "epoch": 0.22831858407079647, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.9604443746123174, + "learning_rate": 1e-06, + "loss": -0.006, + "num_tokens": 61709419.0, + "reward": 0.6226562261581421, + "reward_std": 0.1739925742149353, + "rewards/qatch_small_update_with_fm/mean": 0.6226562261581421, + "rewards/qatch_small_update_with_fm/std": 0.39967477321624756, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892038106918335, + "sampling/importance_sampling_ratio/min": 0.0011153517989441752, + "sampling/sampling_logp_difference/max": 6.798585414886475, + "sampling/sampling_logp_difference/mean": 0.09998883306980133, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 305.88671875, + "completions/mean_terminated_length": 305.88671875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.09376539383083582, + "epoch": 0.23008849557522124, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.7862261535767899, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 62132238.0, + "reward": 0.6461484432220459, + "reward_std": 0.15436768531799316, + "rewards/qatch_small_update_with_fm/mean": 0.6461484432220459, + "rewards/qatch_small_update_with_fm/std": 0.40689408779144287, + "sampling/importance_sampling_ratio/max": 1.9475082159042358, + "sampling/importance_sampling_ratio/mean": 0.9900339841842651, + "sampling/importance_sampling_ratio/min": 0.012612488120794296, + "sampling/sampling_logp_difference/max": 4.373067855834961, + "sampling/sampling_logp_difference/mean": 0.08664417266845703, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 298.4453125, + "completions/mean_terminated_length": 298.4453125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.10303333774209023, + "epoch": 0.23185840707964603, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.0219844010417432, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 62922736.0, + "reward": 0.7904140949249268, + "reward_std": 0.09890885651111603, + "rewards/qatch_small_update_with_fm/mean": 0.7904140949249268, + "rewards/qatch_small_update_with_fm/std": 0.3384302258491516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9887751340866089, + "sampling/importance_sampling_ratio/min": 0.006941414438188076, + "sampling/sampling_logp_difference/max": 4.970249652862549, + "sampling/sampling_logp_difference/mean": 0.1004166379570961, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 272.5390625, + "completions/mean_terminated_length": 272.5390625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.10145942401140928, + "epoch": 0.2336283185840708, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.0066570470039222, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 63381930.0, + "reward": 0.7727656364440918, + "reward_std": 0.04941270500421524, + "rewards/qatch_small_update_with_fm/mean": 0.7727656364440918, + "rewards/qatch_small_update_with_fm/std": 0.3472408652305603, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879018068313599, + "sampling/importance_sampling_ratio/min": 0.005326686892658472, + "sampling/sampling_logp_difference/max": 5.235025882720947, + "sampling/sampling_logp_difference/mean": 0.10085846483707428, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 281.359375, + "completions/mean_terminated_length": 281.359375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.0962858721613884, + "epoch": 0.23539823008849559, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.8718101101072838, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 63788758.0, + "reward": 0.6256601810455322, + "reward_std": 0.11285711824893951, + "rewards/qatch_small_update_with_fm/mean": 0.6256601810455322, + "rewards/qatch_small_update_with_fm/std": 0.39782294631004333, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9886201620101929, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.09229514002799988, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 313.6015625, + "completions/mean_terminated_length": 313.6015625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.10426320135593414, + "epoch": 0.23716814159292035, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.620772979902059, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 64252768.0, + "reward": 0.7330859899520874, + "reward_std": 0.10503730177879333, + "rewards/qatch_small_update_with_fm/mean": 0.7330859899520874, + "rewards/qatch_small_update_with_fm/std": 0.35654518008232117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911209344863892, + "sampling/importance_sampling_ratio/min": 0.009995303116738796, + "sampling/sampling_logp_difference/max": 4.605639934539795, + "sampling/sampling_logp_difference/mean": 0.094190314412117, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 317.60546875, + "completions/mean_terminated_length": 317.60546875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.10367918200790882, + "epoch": 0.23893805309734514, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6907269231345502, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 64625051.0, + "reward": 0.5382304787635803, + "reward_std": 0.14928914606571198, + "rewards/qatch_small_update_with_fm/mean": 0.5382304191589355, + "rewards/qatch_small_update_with_fm/std": 0.3985642194747925, + "sampling/importance_sampling_ratio/max": 1.9336779117584229, + "sampling/importance_sampling_ratio/mean": 0.9892072677612305, + "sampling/importance_sampling_ratio/min": 0.01847412809729576, + "sampling/sampling_logp_difference/max": 3.9913840293884277, + "sampling/sampling_logp_difference/mean": 0.09558893740177155, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 372.515625, + "completions/mean_terminated_length": 313.4127197265625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.10493994690477848, + "epoch": 0.2407079646017699, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6150747718251923, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 65142431.0, + "reward": 0.5792617201805115, + "reward_std": 0.07002231478691101, + "rewards/qatch_small_update_with_fm/mean": 0.5792617201805115, + "rewards/qatch_small_update_with_fm/std": 0.4229722023010254, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899119734764099, + "sampling/importance_sampling_ratio/min": 0.011183901689946651, + "sampling/sampling_logp_difference/max": 4.493279933929443, + "sampling/sampling_logp_difference/mean": 0.10026513040065765, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 328.55859375, + "completions/mean_terminated_length": 328.55859375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.12148185446858406, + "epoch": 0.2424778761061947, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.8001348116132405, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 65636302.0, + "reward": 0.5703476667404175, + "reward_std": 0.13366132974624634, + "rewards/qatch_small_update_with_fm/mean": 0.5703476667404175, + "rewards/qatch_small_update_with_fm/std": 0.3910647928714752, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913745522499084, + "sampling/importance_sampling_ratio/min": 0.014454490505158901, + "sampling/sampling_logp_difference/max": 4.23675012588501, + "sampling/sampling_logp_difference/mean": 0.10473871231079102, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 331.17578125, + "completions/mean_terminated_length": 331.17578125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.12022528424859047, + "epoch": 0.24424778761061947, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7779054985874481, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 66158427.0, + "reward": 0.6807109713554382, + "reward_std": 0.09024008363485336, + "rewards/qatch_small_update_with_fm/mean": 0.6807109713554382, + "rewards/qatch_small_update_with_fm/std": 0.37375155091285706, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9935262799263, + "sampling/importance_sampling_ratio/min": 0.0002641607716213912, + "sampling/sampling_logp_difference/max": 8.23895263671875, + "sampling/sampling_logp_difference/mean": 0.1035434752702713, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 315.984375, + "completions/mean_terminated_length": 315.984375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.1255191322416067, + "epoch": 0.24601769911504426, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6235699081867757, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 66667463.0, + "reward": 0.6528828144073486, + "reward_std": 0.10219062864780426, + "rewards/qatch_small_update_with_fm/mean": 0.6528828144073486, + "rewards/qatch_small_update_with_fm/std": 0.3693510890007019, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911524653434753, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.10624203085899353, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 322.84375, + "completions/mean_terminated_length": 322.84375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.11273289285600185, + "epoch": 0.24778761061946902, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.8694494197292071, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 67065663.0, + "reward": 0.6909765005111694, + "reward_std": 0.15022653341293335, + "rewards/qatch_small_update_with_fm/mean": 0.6909765005111694, + "rewards/qatch_small_update_with_fm/std": 0.4188796579837799, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911254644393921, + "sampling/importance_sampling_ratio/min": 0.018390489742159843, + "sampling/sampling_logp_difference/max": 3.9959216117858887, + "sampling/sampling_logp_difference/mean": 0.09885202348232269, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 307.7734375, + "completions/mean_terminated_length": 307.7734375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.10854166280478239, + "epoch": 0.24955752212389382, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6582767132558051, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 67630293.0, + "reward": 0.7654882669448853, + "reward_std": 0.13209347426891327, + "rewards/qatch_small_update_with_fm/mean": 0.7654882669448853, + "rewards/qatch_small_update_with_fm/std": 0.3466072976589203, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894741773605347, + "sampling/importance_sampling_ratio/min": 0.008697095327079296, + "sampling/sampling_logp_difference/max": 4.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.10250886529684067, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 330.546875, + "completions/mean_terminated_length": 330.546875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.11137775052338839, + "epoch": 0.2513274336283186, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.39969463474896644, + "learning_rate": 1e-06, + "loss": -0.013, + "num_tokens": 68206081.0, + "reward": 0.6641288995742798, + "reward_std": 0.08044249564409256, + "rewards/qatch_small_update_with_fm/mean": 0.6641288995742798, + "rewards/qatch_small_update_with_fm/std": 0.40250152349472046, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914456605911255, + "sampling/importance_sampling_ratio/min": 0.01118385884910822, + "sampling/sampling_logp_difference/max": 4.493283748626709, + "sampling/sampling_logp_difference/mean": 0.09565605968236923, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 282.18359375, + "completions/mean_terminated_length": 282.18359375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.11050462163984776, + "epoch": 0.25309734513274335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5152218599616128, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 68596064.0, + "reward": 0.7529257535934448, + "reward_std": 0.06564027070999146, + "rewards/qatch_small_update_with_fm/mean": 0.7529257535934448, + "rewards/qatch_small_update_with_fm/std": 0.34699419140815735, + "sampling/importance_sampling_ratio/max": 1.8420151472091675, + "sampling/importance_sampling_ratio/mean": 0.9903613328933716, + "sampling/importance_sampling_ratio/min": 0.011400715447962284, + "sampling/sampling_logp_difference/max": 4.474079132080078, + "sampling/sampling_logp_difference/mean": 0.09959550946950912, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 281.70703125, + "completions/mean_terminated_length": 281.70703125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.11399852950125933, + "epoch": 0.25486725663716814, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5059618551556871, + "learning_rate": 1e-06, + "loss": -0.0185, + "num_tokens": 68975989.0, + "reward": 0.8365429639816284, + "reward_std": 0.06033072993159294, + "rewards/qatch_small_update_with_fm/mean": 0.8365429639816284, + "rewards/qatch_small_update_with_fm/std": 0.3398571312427521, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9893470406532288, + "sampling/importance_sampling_ratio/min": 0.014339085668325424, + "sampling/sampling_logp_difference/max": 4.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.10417570918798447, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 311.2265625, + "completions/mean_terminated_length": 311.2265625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.11174062639474869, + "epoch": 0.25663716814159293, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.50898681753391, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 69318207.0, + "reward": 0.6523163914680481, + "reward_std": 0.058695483952760696, + "rewards/qatch_small_update_with_fm/mean": 0.6523163914680481, + "rewards/qatch_small_update_with_fm/std": 0.4232395589351654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892643690109253, + "sampling/importance_sampling_ratio/min": 0.01455187052488327, + "sampling/sampling_logp_difference/max": 4.230035781860352, + "sampling/sampling_logp_difference/mean": 0.10058675706386566, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 283.91015625, + "completions/mean_terminated_length": 283.91015625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.09814571868628263, + "epoch": 0.2584070796460177, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.21386235041296422, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 69840936.0, + "reward": 0.8148593902587891, + "reward_std": 0.004824659787118435, + "rewards/qatch_small_update_with_fm/mean": 0.8148593902587891, + "rewards/qatch_small_update_with_fm/std": 0.30982932448387146, + "sampling/importance_sampling_ratio/max": 1.8642206192016602, + "sampling/importance_sampling_ratio/mean": 0.9873814582824707, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.09649167209863663, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 318.7734375, + "completions/mean_terminated_length": 318.7734375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.10984945390373468, + "epoch": 0.26017699115044246, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7232177387743879, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 70217950.0, + "reward": 0.7357734441757202, + "reward_std": 0.10225854814052582, + "rewards/qatch_small_update_with_fm/mean": 0.7357734441757202, + "rewards/qatch_small_update_with_fm/std": 0.3676723837852478, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927917718887329, + "sampling/importance_sampling_ratio/min": 0.00675980793312192, + "sampling/sampling_logp_difference/max": 4.996760845184326, + "sampling/sampling_logp_difference/mean": 0.09779518842697144, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 327.26171875, + "completions/mean_terminated_length": 327.26171875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.12225450947880745, + "epoch": 0.26194690265486725, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6635589210066514, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 70701761.0, + "reward": 0.7450859546661377, + "reward_std": 0.13423468172550201, + "rewards/qatch_small_update_with_fm/mean": 0.7450859546661377, + "rewards/qatch_small_update_with_fm/std": 0.35765355825424194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927431344985962, + "sampling/importance_sampling_ratio/min": 0.008697601035237312, + "sampling/sampling_logp_difference/max": 4.744708061218262, + "sampling/sampling_logp_difference/mean": 0.10236505419015884, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 304.6171875, + "completions/mean_terminated_length": 304.6171875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.10852374881505966, + "epoch": 0.26371681415929205, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5372910231979201, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 71125407.0, + "reward": 0.7528905868530273, + "reward_std": 0.046875, + "rewards/qatch_small_update_with_fm/mean": 0.7528906464576721, + "rewards/qatch_small_update_with_fm/std": 0.31242799758911133, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9933427572250366, + "sampling/importance_sampling_ratio/min": 0.003236274002119899, + "sampling/sampling_logp_difference/max": 5.733332633972168, + "sampling/sampling_logp_difference/mean": 0.0950489193201065, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 327.58984375, + "completions/mean_terminated_length": 327.58984375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.106517244130373, + "epoch": 0.26548672566371684, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3782993611564517, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 71482758.0, + "reward": 0.746874988079071, + "reward_std": 0.030320392921566963, + "rewards/qatch_small_update_with_fm/mean": 0.746874988079071, + "rewards/qatch_small_update_with_fm/std": 0.31360676884651184, + "sampling/importance_sampling_ratio/max": 1.9069268703460693, + "sampling/importance_sampling_ratio/mean": 0.9913297891616821, + "sampling/importance_sampling_ratio/min": 0.014322609640657902, + "sampling/sampling_logp_difference/max": 4.24591588973999, + "sampling/sampling_logp_difference/mean": 0.09537242352962494, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 275.15625, + "completions/mean_terminated_length": 275.15625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.10477493796497583, + "epoch": 0.2672566371681416, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.7161847098933055, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 72037950.0, + "reward": 0.8379726409912109, + "reward_std": 0.07797817140817642, + "rewards/qatch_small_update_with_fm/mean": 0.8379726409912109, + "rewards/qatch_small_update_with_fm/std": 0.29647859930992126, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899912476539612, + "sampling/importance_sampling_ratio/min": 0.0015248883282765746, + "sampling/sampling_logp_difference/max": 6.485834121704102, + "sampling/sampling_logp_difference/mean": 0.10141263902187347, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 310.3984375, + "completions/mean_terminated_length": 310.3984375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.11613454110920429, + "epoch": 0.26902654867256637, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6891517872451888, + "learning_rate": 1e-06, + "loss": 0.0335, + "num_tokens": 72599940.0, + "reward": 0.7354961037635803, + "reward_std": 0.1144690066576004, + "rewards/qatch_small_update_with_fm/mean": 0.7354961037635803, + "rewards/qatch_small_update_with_fm/std": 0.33235809206962585, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911872148513794, + "sampling/importance_sampling_ratio/min": 0.0007149595185182989, + "sampling/sampling_logp_difference/max": 7.243284702301025, + "sampling/sampling_logp_difference/mean": 0.10409478843212128, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 341.36328125, + "completions/mean_terminated_length": 341.36328125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.11662444099783897, + "epoch": 0.27079646017699116, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8534737634982954, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 73316417.0, + "reward": 0.5821914672851562, + "reward_std": 0.20425722002983093, + "rewards/qatch_small_update_with_fm/mean": 0.5821914076805115, + "rewards/qatch_small_update_with_fm/std": 0.4198920726776123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909311532974243, + "sampling/importance_sampling_ratio/min": 0.014549053274095058, + "sampling/sampling_logp_difference/max": 4.230229377746582, + "sampling/sampling_logp_difference/mean": 0.10373535752296448, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 340.15234375, + "completions/mean_terminated_length": 340.15234375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.11665886919945478, + "epoch": 0.27256637168141595, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.9040651908852855, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 73947096.0, + "reward": 0.7290664315223694, + "reward_std": 0.12613099813461304, + "rewards/qatch_small_update_with_fm/mean": 0.7290664315223694, + "rewards/qatch_small_update_with_fm/std": 0.38859787583351135, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9903141856193542, + "sampling/importance_sampling_ratio/min": 0.011161153204739094, + "sampling/sampling_logp_difference/max": 4.495316028594971, + "sampling/sampling_logp_difference/mean": 0.10388841480016708, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 311.44921875, + "completions/mean_terminated_length": 311.44921875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.10327374562621117, + "epoch": 0.2743362831858407, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6888872039396743, + "learning_rate": 1e-06, + "loss": -0.0406, + "num_tokens": 74477547.0, + "reward": 0.6128828525543213, + "reward_std": 0.10392679274082184, + "rewards/qatch_small_update_with_fm/mean": 0.6128827929496765, + "rewards/qatch_small_update_with_fm/std": 0.3436572849750519, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906870126724243, + "sampling/importance_sampling_ratio/min": 0.005150408949702978, + "sampling/sampling_logp_difference/max": 5.268679141998291, + "sampling/sampling_logp_difference/mean": 0.09823443740606308, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 322.6484375, + "completions/mean_terminated_length": 322.6484375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.11082449927926064, + "epoch": 0.2761061946902655, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.7015851796183483, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 75135905.0, + "reward": 0.7353945374488831, + "reward_std": 0.09259958565235138, + "rewards/qatch_small_update_with_fm/mean": 0.7353945374488831, + "rewards/qatch_small_update_with_fm/std": 0.35178375244140625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904823899269104, + "sampling/importance_sampling_ratio/min": 0.0068324264138937, + "sampling/sampling_logp_difference/max": 4.986075401306152, + "sampling/sampling_logp_difference/mean": 0.09970168769359589, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 340.32421875, + "completions/mean_terminated_length": 340.32421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.12680564727634192, + "epoch": 0.2778761061946903, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.719573916814778, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 75662164.0, + "reward": 0.7470703125, + "reward_std": 0.09985685348510742, + "rewards/qatch_small_update_with_fm/mean": 0.7470703125, + "rewards/qatch_small_update_with_fm/std": 0.3765318989753723, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909191131591797, + "sampling/importance_sampling_ratio/min": 0.011168654076755047, + "sampling/sampling_logp_difference/max": 4.4946441650390625, + "sampling/sampling_logp_difference/mean": 0.1114622950553894, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 320.4140625, + "completions/mean_terminated_length": 320.4140625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.10564819350838661, + "epoch": 0.27964601769911507, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6426424897825909, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 76192926.0, + "reward": 0.6906836032867432, + "reward_std": 0.1533028781414032, + "rewards/qatch_small_update_with_fm/mean": 0.6906836032867432, + "rewards/qatch_small_update_with_fm/std": 0.39270642399787903, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899740815162659, + "sampling/importance_sampling_ratio/min": 0.0009147842065431178, + "sampling/sampling_logp_difference/max": 6.996822357177734, + "sampling/sampling_logp_difference/mean": 0.10140345990657806, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 316.74609375, + "completions/mean_terminated_length": 316.74609375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.11091221682727337, + "epoch": 0.2814159292035398, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6797052787098364, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 76756973.0, + "reward": 0.8056992292404175, + "reward_std": 0.11062506586313248, + "rewards/qatch_small_update_with_fm/mean": 0.8056992292404175, + "rewards/qatch_small_update_with_fm/std": 0.32723745703697205, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991296648979187, + "sampling/importance_sampling_ratio/min": 0.006754668429493904, + "sampling/sampling_logp_difference/max": 4.99752140045166, + "sampling/sampling_logp_difference/mean": 0.10075265914201736, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 336.171875, + "completions/mean_terminated_length": 336.171875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.11384789273142815, + "epoch": 0.2831858407079646, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8797097409635665, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 77260409.0, + "reward": 0.4690038859844208, + "reward_std": 0.10574622452259064, + "rewards/qatch_small_update_with_fm/mean": 0.46900391578674316, + "rewards/qatch_small_update_with_fm/std": 0.41652530431747437, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927676916122437, + "sampling/importance_sampling_ratio/min": 0.009872177615761757, + "sampling/sampling_logp_difference/max": 4.618034839630127, + "sampling/sampling_logp_difference/mean": 0.1024760752916336, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 346.1171875, + "completions/mean_terminated_length": 346.1171875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.11178999207913876, + "epoch": 0.2849557522123894, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6811957283219384, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 77883415.0, + "reward": 0.7185937166213989, + "reward_std": 0.13636133074760437, + "rewards/qatch_small_update_with_fm/mean": 0.7185937762260437, + "rewards/qatch_small_update_with_fm/std": 0.3618468940258026, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907670021057129, + "sampling/importance_sampling_ratio/min": 0.0018389016622677445, + "sampling/sampling_logp_difference/max": 6.298586845397949, + "sampling/sampling_logp_difference/mean": 0.10185998678207397, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 338.296875, + "completions/mean_terminated_length": 338.296875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.10594448260962963, + "epoch": 0.2867256637168142, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.7268038126045898, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 78273699.0, + "reward": 0.6265742182731628, + "reward_std": 0.13848204910755157, + "rewards/qatch_small_update_with_fm/mean": 0.6265742182731628, + "rewards/qatch_small_update_with_fm/std": 0.433919221162796, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990743100643158, + "sampling/importance_sampling_ratio/min": 0.0035162202548235655, + "sampling/sampling_logp_difference/max": 5.650368690490723, + "sampling/sampling_logp_difference/mean": 0.09895285218954086, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 362.01953125, + "completions/mean_terminated_length": 362.01953125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.11295005213469267, + "epoch": 0.2884955752212389, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.591131789029338, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 78820680.0, + "reward": 0.734011709690094, + "reward_std": 0.11754102259874344, + "rewards/qatch_small_update_with_fm/mean": 0.7340116500854492, + "rewards/qatch_small_update_with_fm/std": 0.32903358340263367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918836355209351, + "sampling/importance_sampling_ratio/min": 8.4981948020868e-05, + "sampling/sampling_logp_difference/max": 9.373071670532227, + "sampling/sampling_logp_difference/mean": 0.10325956344604492, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 345.74609375, + "completions/mean_terminated_length": 345.74609375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.1067540692165494, + "epoch": 0.2902654867256637, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6919553844927818, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 79378007.0, + "reward": 0.7622460722923279, + "reward_std": 0.11750834435224533, + "rewards/qatch_small_update_with_fm/mean": 0.7622460722923279, + "rewards/qatch_small_update_with_fm/std": 0.38261905312538147, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9902242422103882, + "sampling/importance_sampling_ratio/min": 0.010629362426698208, + "sampling/sampling_logp_difference/max": 4.544135093688965, + "sampling/sampling_logp_difference/mean": 0.09993548691272736, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 289.91015625, + "completions/mean_terminated_length": 289.91015625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.09920257981866598, + "epoch": 0.2920353982300885, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6278776429791636, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 79901744.0, + "reward": 0.9076640605926514, + "reward_std": 0.07271681725978851, + "rewards/qatch_small_update_with_fm/mean": 0.9076640605926514, + "rewards/qatch_small_update_with_fm/std": 0.2249380499124527, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918192625045776, + "sampling/importance_sampling_ratio/min": 0.011175889521837234, + "sampling/sampling_logp_difference/max": 4.493996620178223, + "sampling/sampling_logp_difference/mean": 0.09208320826292038, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 293.078125, + "completions/mean_terminated_length": 293.078125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.09951941296458244, + "epoch": 0.2938053097345133, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.7492559799835081, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 80436596.0, + "reward": 0.7066914439201355, + "reward_std": 0.11800296604633331, + "rewards/qatch_small_update_with_fm/mean": 0.7066914439201355, + "rewards/qatch_small_update_with_fm/std": 0.357426255941391, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898148775100708, + "sampling/importance_sampling_ratio/min": 0.0004474441520869732, + "sampling/sampling_logp_difference/max": 7.711958885192871, + "sampling/sampling_logp_difference/mean": 0.09365606307983398, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 317.6875, + "completions/mean_terminated_length": 317.6875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.11481165885925293, + "epoch": 0.29557522123893804, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5643059743627443, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 80971908.0, + "reward": 0.5545429587364197, + "reward_std": 0.05453858524560928, + "rewards/qatch_small_update_with_fm/mean": 0.5545429587364197, + "rewards/qatch_small_update_with_fm/std": 0.4164023995399475, + "sampling/importance_sampling_ratio/max": 1.9052268266677856, + "sampling/importance_sampling_ratio/mean": 0.9890497922897339, + "sampling/importance_sampling_ratio/min": 0.012715059332549572, + "sampling/sampling_logp_difference/max": 4.364968299865723, + "sampling/sampling_logp_difference/mean": 0.10033901780843735, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1082.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 324.82421875, + "completions/mean_terminated_length": 324.82421875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.11254716478288174, + "epoch": 0.2973451327433628, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5113955054646178, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 81429943.0, + "reward": 0.7119725942611694, + "reward_std": 0.05836210772395134, + "rewards/qatch_small_update_with_fm/mean": 0.7119725942611694, + "rewards/qatch_small_update_with_fm/std": 0.3997737765312195, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912292957305908, + "sampling/importance_sampling_ratio/min": 0.00725527573376894, + "sampling/sampling_logp_difference/max": 4.926026344299316, + "sampling/sampling_logp_difference/mean": 0.10253892838954926, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 324.08203125, + "completions/mean_terminated_length": 324.08203125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.09713317174464464, + "epoch": 0.2991150442477876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.8651796151859013, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 81927804.0, + "reward": 0.6655507683753967, + "reward_std": 0.122608982026577, + "rewards/qatch_small_update_with_fm/mean": 0.6655508279800415, + "rewards/qatch_small_update_with_fm/std": 0.3819872736930847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991803765296936, + "sampling/importance_sampling_ratio/min": 2.927790774265304e-06, + "sampling/sampling_logp_difference/max": 12.741262435913086, + "sampling/sampling_logp_difference/mean": 0.090723916888237, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 307.00390625, + "completions/mean_terminated_length": 307.00390625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.09481226559728384, + "epoch": 0.3008849557522124, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7662300408531278, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 82321245.0, + "reward": 0.7684023380279541, + "reward_std": 0.1258222609758377, + "rewards/qatch_small_update_with_fm/mean": 0.7684023380279541, + "rewards/qatch_small_update_with_fm/std": 0.3740701675415039, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988567590713501, + "sampling/importance_sampling_ratio/min": 0.01430974155664444, + "sampling/sampling_logp_difference/max": 4.246814727783203, + "sampling/sampling_logp_difference/mean": 0.09349850565195084, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 313.04296875, + "completions/mean_terminated_length": 313.04296875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.10745582170784473, + "epoch": 0.30265486725663715, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.121831443907111, + "learning_rate": 1e-06, + "loss": 0.032, + "num_tokens": 82698712.0, + "reward": 0.7548906207084656, + "reward_std": 0.09169437736272812, + "rewards/qatch_small_update_with_fm/mean": 0.7548906803131104, + "rewards/qatch_small_update_with_fm/std": 0.3765425384044647, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882317185401917, + "sampling/importance_sampling_ratio/min": 0.014339085668325424, + "sampling/sampling_logp_difference/max": 4.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.10578665137290955, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 305.1640625, + "completions/mean_terminated_length": 305.1640625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.09699310921132565, + "epoch": 0.30442477876106194, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.623628416847649, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 83212354.0, + "reward": 0.8881992101669312, + "reward_std": 0.10021714866161346, + "rewards/qatch_small_update_with_fm/mean": 0.8881992101669312, + "rewards/qatch_small_update_with_fm/std": 0.2322145402431488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9884635210037231, + "sampling/importance_sampling_ratio/min": 0.011157982051372528, + "sampling/sampling_logp_difference/max": 4.49560022354126, + "sampling/sampling_logp_difference/mean": 0.09806492924690247, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 306.546875, + "completions/mean_terminated_length": 306.546875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.09263870492577553, + "epoch": 0.30619469026548674, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.9500327430114363, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 83867950.0, + "reward": 0.7660039067268372, + "reward_std": 0.08824647963047028, + "rewards/qatch_small_update_with_fm/mean": 0.7660039067268372, + "rewards/qatch_small_update_with_fm/std": 0.3496793508529663, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896693229675293, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.09181556105613708, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 308.359375, + "completions/mean_terminated_length": 308.359375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.08988738339394331, + "epoch": 0.30796460176991153, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6961920769439389, + "learning_rate": 1e-06, + "loss": -0.0211, + "num_tokens": 84357978.0, + "reward": 0.8017382621765137, + "reward_std": 0.08179794251918793, + "rewards/qatch_small_update_with_fm/mean": 0.8017382621765137, + "rewards/qatch_small_update_with_fm/std": 0.32338473200798035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890596270561218, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.09492075443267822, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 998.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 354.05078125, + "completions/mean_terminated_length": 354.05078125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.08568741753697395, + "epoch": 0.30973451327433627, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6020283387826877, + "learning_rate": 1e-06, + "loss": -0.0232, + "num_tokens": 84880967.0, + "reward": 0.776605486869812, + "reward_std": 0.0926983505487442, + "rewards/qatch_small_update_with_fm/mean": 0.776605486869812, + "rewards/qatch_small_update_with_fm/std": 0.33691495656967163, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912744760513306, + "sampling/importance_sampling_ratio/min": 0.0015349813038483262, + "sampling/sampling_logp_difference/max": 6.479237079620361, + "sampling/sampling_logp_difference/mean": 0.08575327694416046, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1039.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 350.390625, + "completions/mean_terminated_length": 350.390625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.11490303184837103, + "epoch": 0.31150442477876106, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6079318288634915, + "learning_rate": 1e-06, + "loss": -0.0335, + "num_tokens": 85434347.0, + "reward": 0.6241093873977661, + "reward_std": 0.10749362409114838, + "rewards/qatch_small_update_with_fm/mean": 0.6241093873977661, + "rewards/qatch_small_update_with_fm/std": 0.36939021944999695, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918155670166016, + "sampling/importance_sampling_ratio/min": 0.0059577059000730515, + "sampling/sampling_logp_difference/max": 5.123069763183594, + "sampling/sampling_logp_difference/mean": 0.10410696268081665, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1110.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 355.9609375, + "completions/mean_terminated_length": 355.9609375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.11659033037722111, + "epoch": 0.31327433628318585, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.8230828183609293, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 86127505.0, + "reward": 0.6114218235015869, + "reward_std": 0.11610874533653259, + "rewards/qatch_small_update_with_fm/mean": 0.6114218235015869, + "rewards/qatch_small_update_with_fm/std": 0.3745664954185486, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907248616218567, + "sampling/importance_sampling_ratio/min": 0.008668403141200542, + "sampling/sampling_logp_difference/max": 4.74807071685791, + "sampling/sampling_logp_difference/mean": 0.10726846754550934, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 316.46875, + "completions/mean_terminated_length": 316.46875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.10505182296037674, + "epoch": 0.31504424778761064, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8719774383428669, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 86662585.0, + "reward": 0.8082695603370667, + "reward_std": 0.11230713874101639, + "rewards/qatch_small_update_with_fm/mean": 0.8082695603370667, + "rewards/qatch_small_update_with_fm/std": 0.33685362339019775, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990057647228241, + "sampling/importance_sampling_ratio/min": 0.005370930302888155, + "sampling/sampling_logp_difference/max": 5.226754188537598, + "sampling/sampling_logp_difference/mean": 0.09755245596170425, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1042.0, + "completions/max_terminated_length": 1042.0, + "completions/mean_length": 379.71875, + "completions/mean_terminated_length": 379.71875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.10993960406631231, + "epoch": 0.3168141592920354, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7456707224921653, + "learning_rate": 1e-06, + "loss": 0.028, + "num_tokens": 87261601.0, + "reward": 0.5580117106437683, + "reward_std": 0.133323535323143, + "rewards/qatch_small_update_with_fm/mean": 0.5580117106437683, + "rewards/qatch_small_update_with_fm/std": 0.3953360915184021, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9920111298561096, + "sampling/importance_sampling_ratio/min": 0.008692318573594093, + "sampling/sampling_logp_difference/max": 4.7453155517578125, + "sampling/sampling_logp_difference/mean": 0.10081157088279724, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 290.55859375, + "completions/mean_terminated_length": 290.55859375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.0883013503625989, + "epoch": 0.3185840707964602, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.762903167341592, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 87937168.0, + "reward": 0.7398515939712524, + "reward_std": 0.11943519115447998, + "rewards/qatch_small_update_with_fm/mean": 0.7398515939712524, + "rewards/qatch_small_update_with_fm/std": 0.34509414434432983, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9878397583961487, + "sampling/importance_sampling_ratio/min": 0.005517044570297003, + "sampling/sampling_logp_difference/max": 5.199913024902344, + "sampling/sampling_logp_difference/mean": 0.09428344666957855, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 254.7421875, + "completions/mean_terminated_length": 254.7421875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.07716769725084305, + "epoch": 0.32035398230088497, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7686144020436141, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 88278398.0, + "reward": 0.7153867483139038, + "reward_std": 0.07736422121524811, + "rewards/qatch_small_update_with_fm/mean": 0.7153867483139038, + "rewards/qatch_small_update_with_fm/std": 0.3409058749675751, + "sampling/importance_sampling_ratio/max": 1.9639513492584229, + "sampling/importance_sampling_ratio/mean": 0.9903814792633057, + "sampling/importance_sampling_ratio/min": 0.012658960185945034, + "sampling/sampling_logp_difference/max": 4.36939001083374, + "sampling/sampling_logp_difference/mean": 0.08092033863067627, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 320.12109375, + "completions/mean_terminated_length": 320.12109375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.08729387260973454, + "epoch": 0.32212389380530976, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6571255809518773, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 88914173.0, + "reward": 0.7904921770095825, + "reward_std": 0.0689883828163147, + "rewards/qatch_small_update_with_fm/mean": 0.7904921770095825, + "rewards/qatch_small_update_with_fm/std": 0.3290969133377075, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904789328575134, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.08582089096307755, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 287.02734375, + "completions/mean_terminated_length": 287.02734375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.08130458928644657, + "epoch": 0.3238938053097345, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.676653673402814, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 89271364.0, + "reward": 0.7791132926940918, + "reward_std": 0.09577371925115585, + "rewards/qatch_small_update_with_fm/mean": 0.7791132926940918, + "rewards/qatch_small_update_with_fm/std": 0.34913262724876404, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9928947687149048, + "sampling/importance_sampling_ratio/min": 0.011154402047395706, + "sampling/sampling_logp_difference/max": 4.4959211349487305, + "sampling/sampling_logp_difference/mean": 0.08006507158279419, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1594.0, + "completions/max_terminated_length": 1594.0, + "completions/mean_length": 421.66015625, + "completions/mean_terminated_length": 421.66015625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.12242889311164618, + "epoch": 0.3256637168141593, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6934224953981509, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 89887709.0, + "reward": 0.7145429849624634, + "reward_std": 0.18574464321136475, + "rewards/qatch_small_update_with_fm/mean": 0.7145429849624634, + "rewards/qatch_small_update_with_fm/std": 0.37374573945999146, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9917437434196472, + "sampling/importance_sampling_ratio/min": 0.0025099029298871756, + "sampling/sampling_logp_difference/max": 5.987511157989502, + "sampling/sampling_logp_difference/mean": 0.10476350039243698, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 354.5703125, + "completions/mean_terminated_length": 354.5703125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.11489906162023544, + "epoch": 0.3274336283185841, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5499845071184065, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 90365359.0, + "reward": 0.665109395980835, + "reward_std": 0.0919097363948822, + "rewards/qatch_small_update_with_fm/mean": 0.665109395980835, + "rewards/qatch_small_update_with_fm/std": 0.402885764837265, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922910928726196, + "sampling/importance_sampling_ratio/min": 0.012095731683075428, + "sampling/sampling_logp_difference/max": 4.414902687072754, + "sampling/sampling_logp_difference/mean": 0.10020913183689117, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 355.58203125, + "completions/mean_terminated_length": 355.58203125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.09928950481116772, + "epoch": 0.3292035398230089, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6232041508744968, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 90929524.0, + "reward": 0.6242695450782776, + "reward_std": 0.1135488897562027, + "rewards/qatch_small_update_with_fm/mean": 0.6242695450782776, + "rewards/qatch_small_update_with_fm/std": 0.3536941707134247, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9942116737365723, + "sampling/importance_sampling_ratio/min": 0.00818202830851078, + "sampling/sampling_logp_difference/max": 4.80581521987915, + "sampling/sampling_logp_difference/mean": 0.08945226669311523, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 327.125, + "completions/mean_terminated_length": 327.125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.09778167959302664, + "epoch": 0.3309734513274336, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6736731693665866, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 91402596.0, + "reward": 0.6589140892028809, + "reward_std": 0.07543458044528961, + "rewards/qatch_small_update_with_fm/mean": 0.6589140892028809, + "rewards/qatch_small_update_with_fm/std": 0.37273266911506653, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901614189147949, + "sampling/importance_sampling_ratio/min": 0.008699359372258186, + "sampling/sampling_logp_difference/max": 4.744505882263184, + "sampling/sampling_logp_difference/mean": 0.09529487788677216, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 315.40625, + "completions/mean_terminated_length": 315.40625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.09347141813486814, + "epoch": 0.3327433628318584, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6242793766687501, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 92142220.0, + "reward": 0.8259179592132568, + "reward_std": 0.10921245813369751, + "rewards/qatch_small_update_with_fm/mean": 0.8259179592132568, + "rewards/qatch_small_update_with_fm/std": 0.3004448115825653, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913510680198669, + "sampling/importance_sampling_ratio/min": 0.00676548620685935, + "sampling/sampling_logp_difference/max": 4.9959211349487305, + "sampling/sampling_logp_difference/mean": 0.09079431742429733, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 361.1484375, + "completions/mean_terminated_length": 361.1484375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.0986938551068306, + "epoch": 0.3345132743362832, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.7453656730193217, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 92721058.0, + "reward": 0.6616289019584656, + "reward_std": 0.12159581482410431, + "rewards/qatch_small_update_with_fm/mean": 0.6616289615631104, + "rewards/qatch_small_update_with_fm/std": 0.395707368850708, + "sampling/importance_sampling_ratio/max": 1.9929343461990356, + "sampling/importance_sampling_ratio/mean": 0.9904531240463257, + "sampling/importance_sampling_ratio/min": 0.014309660531580448, + "sampling/sampling_logp_difference/max": 4.246820449829102, + "sampling/sampling_logp_difference/mean": 0.09572584927082062, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 345.2421875, + "completions/mean_terminated_length": 345.2421875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.11141256336122751, + "epoch": 0.336283185840708, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7423535257929594, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 93106320.0, + "reward": 0.7088867425918579, + "reward_std": 0.0872776210308075, + "rewards/qatch_small_update_with_fm/mean": 0.7088867425918579, + "rewards/qatch_small_update_with_fm/std": 0.33193743228912354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9905740022659302, + "sampling/importance_sampling_ratio/min": 0.014339085668325424, + "sampling/sampling_logp_difference/max": 4.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.10014653205871582, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 369.8515625, + "completions/mean_terminated_length": 369.8515625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.10245176684111357, + "epoch": 0.3380530973451327, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7634695119879356, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 93606074.0, + "reward": 0.6076210737228394, + "reward_std": 0.1056838408112526, + "rewards/qatch_small_update_with_fm/mean": 0.6076210737228394, + "rewards/qatch_small_update_with_fm/std": 0.369706392288208, + "sampling/importance_sampling_ratio/max": 1.9917839765548706, + "sampling/importance_sampling_ratio/mean": 0.9923558235168457, + "sampling/importance_sampling_ratio/min": 0.010142802260816097, + "sampling/sampling_logp_difference/max": 4.590991020202637, + "sampling/sampling_logp_difference/mean": 0.09241395443677902, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 341.4921875, + "completions/mean_terminated_length": 341.4921875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.10239611659199, + "epoch": 0.3398230088495575, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.49375392600444185, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 94055960.0, + "reward": 0.7815742492675781, + "reward_std": 0.029848366975784302, + "rewards/qatch_small_update_with_fm/mean": 0.7815742492675781, + "rewards/qatch_small_update_with_fm/std": 0.3153725266456604, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914690256118774, + "sampling/importance_sampling_ratio/min": 0.008748124353587627, + "sampling/sampling_logp_difference/max": 4.738915920257568, + "sampling/sampling_logp_difference/mean": 0.09125423431396484, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 337.89453125, + "completions/mean_terminated_length": 337.89453125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.10480010323226452, + "epoch": 0.3415929203539823, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6433223060296351, + "learning_rate": 1e-06, + "loss": -0.0301, + "num_tokens": 94600109.0, + "reward": 0.6715664267539978, + "reward_std": 0.09592887759208679, + "rewards/qatch_small_update_with_fm/mean": 0.6715664267539978, + "rewards/qatch_small_update_with_fm/std": 0.3534429371356964, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9928864240646362, + "sampling/importance_sampling_ratio/min": 0.0027512013912200928, + "sampling/sampling_logp_difference/max": 5.895717620849609, + "sampling/sampling_logp_difference/mean": 0.09365631639957428, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 312.56640625, + "completions/mean_terminated_length": 312.56640625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.0991691155359149, + "epoch": 0.3433628318584071, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3817737532536764, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 95152462.0, + "reward": 0.9008671641349792, + "reward_std": 0.04832971841096878, + "rewards/qatch_small_update_with_fm/mean": 0.9008671641349792, + "rewards/qatch_small_update_with_fm/std": 0.2433462291955948, + "sampling/importance_sampling_ratio/max": 1.8678128719329834, + "sampling/importance_sampling_ratio/mean": 0.9926869869232178, + "sampling/importance_sampling_ratio/min": 0.018439065665006638, + "sampling/sampling_logp_difference/max": 3.993283748626709, + "sampling/sampling_logp_difference/mean": 0.088732048869133, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 347.703125, + "completions/mean_terminated_length": 347.703125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.10633239150047302, + "epoch": 0.34513274336283184, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.9229843567916687, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 95595426.0, + "reward": 0.7062265872955322, + "reward_std": 0.11547653377056122, + "rewards/qatch_small_update_with_fm/mean": 0.7062265872955322, + "rewards/qatch_small_update_with_fm/std": 0.3614107668399811, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913710355758667, + "sampling/importance_sampling_ratio/min": 0.004114307928830385, + "sampling/sampling_logp_difference/max": 5.493284702301025, + "sampling/sampling_logp_difference/mean": 0.0934092104434967, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1061.0, + "completions/max_terminated_length": 1061.0, + "completions/mean_length": 325.4296875, + "completions/mean_terminated_length": 325.4296875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.11451444402337074, + "epoch": 0.34690265486725663, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.45549074468231604, + "learning_rate": 1e-06, + "loss": -0.0234, + "num_tokens": 96067856.0, + "reward": 0.6925156116485596, + "reward_std": 0.051358893513679504, + "rewards/qatch_small_update_with_fm/mean": 0.6925156116485596, + "rewards/qatch_small_update_with_fm/std": 0.3611532151699066, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914189577102661, + "sampling/importance_sampling_ratio/min": 0.014309939928352833, + "sampling/sampling_logp_difference/max": 4.246800899505615, + "sampling/sampling_logp_difference/mean": 0.10335803031921387, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1440.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 424.375, + "completions/mean_terminated_length": 424.375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.12794983014464378, + "epoch": 0.3486725663716814, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5880200744423812, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 96755152.0, + "reward": 0.58935546875, + "reward_std": 0.12563852965831757, + "rewards/qatch_small_update_with_fm/mean": 0.58935546875, + "rewards/qatch_small_update_with_fm/std": 0.3818955421447754, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.994789719581604, + "sampling/importance_sampling_ratio/min": 0.0004323141765780747, + "sampling/sampling_logp_difference/max": 7.7463579177856445, + "sampling/sampling_logp_difference/mean": 0.1069219559431076, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 335.69921875, + "completions/mean_terminated_length": 335.69921875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.09195560961961746, + "epoch": 0.3504424778761062, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6362032916923682, + "learning_rate": 1e-06, + "loss": -0.0251, + "num_tokens": 97429299.0, + "reward": 0.7082499861717224, + "reward_std": 0.05375829339027405, + "rewards/qatch_small_update_with_fm/mean": 0.7082499861717224, + "rewards/qatch_small_update_with_fm/std": 0.3975197374820709, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904249310493469, + "sampling/importance_sampling_ratio/min": 0.0031889788806438446, + "sampling/sampling_logp_difference/max": 5.748054504394531, + "sampling/sampling_logp_difference/mean": 0.09103560447692871, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 365.4921875, + "completions/mean_terminated_length": 365.4921875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.11913414020091295, + "epoch": 0.35221238938053095, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6131941301175466, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 98008113.0, + "reward": 0.5971132516860962, + "reward_std": 0.10834619402885437, + "rewards/qatch_small_update_with_fm/mean": 0.5971132516860962, + "rewards/qatch_small_update_with_fm/std": 0.3694043457508087, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916287660598755, + "sampling/importance_sampling_ratio/min": 0.01233906950801611, + "sampling/sampling_logp_difference/max": 4.394984722137451, + "sampling/sampling_logp_difference/mean": 0.10478620231151581, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 357.05859375, + "completions/mean_terminated_length": 357.05859375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.10899745859205723, + "epoch": 0.35398230088495575, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.7218593602759523, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 98531936.0, + "reward": 0.6973046660423279, + "reward_std": 0.11880404502153397, + "rewards/qatch_small_update_with_fm/mean": 0.6973047256469727, + "rewards/qatch_small_update_with_fm/std": 0.4068034291267395, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918450117111206, + "sampling/importance_sampling_ratio/min": 0.0111839659512043, + "sampling/sampling_logp_difference/max": 4.493274211883545, + "sampling/sampling_logp_difference/mean": 0.09498186409473419, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1477.0, + "completions/max_terminated_length": 1477.0, + "completions/mean_length": 360.96484375, + "completions/mean_terminated_length": 360.96484375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.10720335505902767, + "epoch": 0.35575221238938054, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6080464868328257, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 99023719.0, + "reward": 0.6945273876190186, + "reward_std": 0.0998416393995285, + "rewards/qatch_small_update_with_fm/mean": 0.6945273876190186, + "rewards/qatch_small_update_with_fm/std": 0.36767905950546265, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906712770462036, + "sampling/importance_sampling_ratio/min": 0.003982192371040583, + "sampling/sampling_logp_difference/max": 5.525922775268555, + "sampling/sampling_logp_difference/mean": 0.10087412595748901, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 340.1015625, + "completions/mean_terminated_length": 340.1015625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.12118315137922764, + "epoch": 0.35752212389380533, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.7410996586697325, + "learning_rate": 1e-06, + "loss": -0.0125, + "num_tokens": 99510897.0, + "reward": 0.698015570640564, + "reward_std": 0.1116129457950592, + "rewards/qatch_small_update_with_fm/mean": 0.698015570640564, + "rewards/qatch_small_update_with_fm/std": 0.3681219816207886, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892449975013733, + "sampling/importance_sampling_ratio/min": 0.011232408694922924, + "sampling/sampling_logp_difference/max": 4.488952159881592, + "sampling/sampling_logp_difference/mean": 0.10859468579292297, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1082.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 348.39453125, + "completions/mean_terminated_length": 348.39453125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.1027478277683258, + "epoch": 0.35929203539823007, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.7150909605607257, + "learning_rate": 1e-06, + "loss": 0.0356, + "num_tokens": 99994646.0, + "reward": 0.7400000095367432, + "reward_std": 0.10706169903278351, + "rewards/qatch_small_update_with_fm/mean": 0.7400000095367432, + "rewards/qatch_small_update_with_fm/std": 0.35928434133529663, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894899129867554, + "sampling/importance_sampling_ratio/min": 0.018361039459705353, + "sampling/sampling_logp_difference/max": 3.9975242614746094, + "sampling/sampling_logp_difference/mean": 0.09557086229324341, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 372.43359375, + "completions/mean_terminated_length": 372.43359375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.1205616993829608, + "epoch": 0.36106194690265486, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.8901426137212775, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 100514229.0, + "reward": 0.7741680145263672, + "reward_std": 0.13178735971450806, + "rewards/qatch_small_update_with_fm/mean": 0.7741679549217224, + "rewards/qatch_small_update_with_fm/std": 0.33109039068222046, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9924964904785156, + "sampling/importance_sampling_ratio/min": 0.009109980426728725, + "sampling/sampling_logp_difference/max": 4.698384761810303, + "sampling/sampling_logp_difference/mean": 0.10391943156719208, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 355.00390625, + "completions/mean_terminated_length": 355.00390625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.10279853083193302, + "epoch": 0.36283185840707965, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5888127900166, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 100845366.0, + "reward": 0.7348710894584656, + "reward_std": 0.13863445818424225, + "rewards/qatch_small_update_with_fm/mean": 0.7348710894584656, + "rewards/qatch_small_update_with_fm/std": 0.3723523020744324, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934222102165222, + "sampling/importance_sampling_ratio/min": 0.011168073862791061, + "sampling/sampling_logp_difference/max": 4.494696140289307, + "sampling/sampling_logp_difference/mean": 0.09341467916965485, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1447.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 390.84765625, + "completions/mean_terminated_length": 390.84765625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.10497349500656128, + "epoch": 0.36460176991150445, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5545867533478495, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 101277951.0, + "reward": 0.7697421908378601, + "reward_std": 0.09470079839229584, + "rewards/qatch_small_update_with_fm/mean": 0.7697421908378601, + "rewards/qatch_small_update_with_fm/std": 0.3705956041812897, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9931164383888245, + "sampling/importance_sampling_ratio/min": 0.011154407635331154, + "sampling/sampling_logp_difference/max": 4.495920658111572, + "sampling/sampling_logp_difference/mean": 0.09605337679386139, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1580.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 360.8046875, + "completions/mean_terminated_length": 360.8046875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.10272657312452793, + "epoch": 0.3663716814159292, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.403005864270568, + "learning_rate": 1e-06, + "loss": 0.0141, + "num_tokens": 101892333.0, + "reward": 0.7502422332763672, + "reward_std": 0.02443435788154602, + "rewards/qatch_small_update_with_fm/mean": 0.7502422332763672, + "rewards/qatch_small_update_with_fm/std": 0.31904786825180054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899274110794067, + "sampling/importance_sampling_ratio/min": 0.005282875616103411, + "sampling/sampling_logp_difference/max": 5.243284702301025, + "sampling/sampling_logp_difference/mean": 0.09682901948690414, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1296.0, + "completions/mean_length": 396.0078125, + "completions/mean_terminated_length": 381.498046875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.11194436810910702, + "epoch": 0.368141592920354, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4911640066793156, + "learning_rate": 1e-06, + "loss": -0.0317, + "num_tokens": 102291583.0, + "reward": 0.6277070045471191, + "reward_std": 0.09390310943126678, + "rewards/qatch_small_update_with_fm/mean": 0.6277070045471191, + "rewards/qatch_small_update_with_fm/std": 0.38415640592575073, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9944193363189697, + "sampling/importance_sampling_ratio/min": 0.01850791648030281, + "sampling/sampling_logp_difference/max": 3.9895567893981934, + "sampling/sampling_logp_difference/mean": 0.09457258880138397, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1166.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 384.65625, + "completions/mean_terminated_length": 384.65625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.11076440289616585, + "epoch": 0.36991150442477877, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.8724792452949058, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 102807735.0, + "reward": 0.7210546731948853, + "reward_std": 0.1516134887933731, + "rewards/qatch_small_update_with_fm/mean": 0.7210546731948853, + "rewards/qatch_small_update_with_fm/std": 0.3743986189365387, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922000765800476, + "sampling/importance_sampling_ratio/min": 0.011169740930199623, + "sampling/sampling_logp_difference/max": 4.494546890258789, + "sampling/sampling_logp_difference/mean": 0.09591890871524811, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 371.18359375, + "completions/mean_terminated_length": 371.18359375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.10253117606043816, + "epoch": 0.37168141592920356, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.542561326203597, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 103127958.0, + "reward": 0.727777361869812, + "reward_std": 0.08403307944536209, + "rewards/qatch_small_update_with_fm/mean": 0.727777361869812, + "rewards/qatch_small_update_with_fm/std": 0.3928605914115906, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9942893981933594, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.09067496657371521, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1697.0, + "completions/mean_length": 427.49609375, + "completions/mean_terminated_length": 413.1098327636719, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.11693217884749174, + "epoch": 0.3734513274336283, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.48834549093132984, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 103627781.0, + "reward": 0.7900313138961792, + "reward_std": 0.08779511600732803, + "rewards/qatch_small_update_with_fm/mean": 0.7900311946868896, + "rewards/qatch_small_update_with_fm/std": 0.360713392496109, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922540187835693, + "sampling/importance_sampling_ratio/min": 0.008556932210922241, + "sampling/sampling_logp_difference/max": 4.761013507843018, + "sampling/sampling_logp_difference/mean": 0.10390772670507431, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 412.34375, + "completions/mean_terminated_length": 412.34375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.1140274703502655, + "epoch": 0.3752212389380531, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6180322534177476, + "learning_rate": 1e-06, + "loss": -0.0252, + "num_tokens": 104254797.0, + "reward": 0.6544452905654907, + "reward_std": 0.12845413386821747, + "rewards/qatch_small_update_with_fm/mean": 0.6544452905654907, + "rewards/qatch_small_update_with_fm/std": 0.40956881642341614, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898783564567566, + "sampling/importance_sampling_ratio/min": 4.610423047779477e-07, + "sampling/sampling_logp_difference/max": 14.589776039123535, + "sampling/sampling_logp_difference/mean": 0.10311967879533768, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1077.0, + "completions/max_terminated_length": 1077.0, + "completions/mean_length": 363.5234375, + "completions/mean_terminated_length": 363.5234375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.09243700839579105, + "epoch": 0.3769911504424779, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.7249564433829758, + "learning_rate": 1e-06, + "loss": 0.028, + "num_tokens": 104963187.0, + "reward": 0.6249492168426514, + "reward_std": 0.1054215282201767, + "rewards/qatch_small_update_with_fm/mean": 0.6249492168426514, + "rewards/qatch_small_update_with_fm/std": 0.32009294629096985, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896044731140137, + "sampling/importance_sampling_ratio/min": 0.0032624199520796537, + "sampling/sampling_logp_difference/max": 5.72528600692749, + "sampling/sampling_logp_difference/mean": 0.09415632486343384, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1793.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 418.625, + "completions/mean_terminated_length": 418.625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.10987964924424887, + "epoch": 0.3787610619469027, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5524975720746137, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 105315203.0, + "reward": 0.625613272190094, + "reward_std": 0.07790683209896088, + "rewards/qatch_small_update_with_fm/mean": 0.625613272190094, + "rewards/qatch_small_update_with_fm/std": 0.4087977409362793, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906055331230164, + "sampling/importance_sampling_ratio/min": 0.006783362478017807, + "sampling/sampling_logp_difference/max": 4.993282318115234, + "sampling/sampling_logp_difference/mean": 0.0996527373790741, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1294.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 383.6875, + "completions/mean_terminated_length": 383.6875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.10452158376574516, + "epoch": 0.3805309734513274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.38716645681649037, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 105755875.0, + "reward": 0.8351406455039978, + "reward_std": 0.06556863337755203, + "rewards/qatch_small_update_with_fm/mean": 0.8351406455039978, + "rewards/qatch_small_update_with_fm/std": 0.2928430736064911, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922690391540527, + "sampling/importance_sampling_ratio/min": 0.011539791710674763, + "sampling/sampling_logp_difference/max": 4.461954116821289, + "sampling/sampling_logp_difference/mean": 0.09711340069770813, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 415.6484375, + "completions/mean_terminated_length": 415.6484375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.12065678741782904, + "epoch": 0.3823008849557522, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6352222097447021, + "learning_rate": 1e-06, + "loss": -0.0247, + "num_tokens": 106177929.0, + "reward": 0.7687656283378601, + "reward_std": 0.13633453845977783, + "rewards/qatch_small_update_with_fm/mean": 0.7687656283378601, + "rewards/qatch_small_update_with_fm/std": 0.33637431263923645, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927363395690918, + "sampling/importance_sampling_ratio/min": 0.0111549012362957, + "sampling/sampling_logp_difference/max": 4.495876312255859, + "sampling/sampling_logp_difference/mean": 0.10593421012163162, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 403.5, + "completions/mean_terminated_length": 403.5, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.10500549431890249, + "epoch": 0.384070796460177, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6852083526445841, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 106730537.0, + "reward": 0.8187460899353027, + "reward_std": 0.1587405502796173, + "rewards/qatch_small_update_with_fm/mean": 0.8187460899353027, + "rewards/qatch_small_update_with_fm/std": 0.3261018395423889, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9908450841903687, + "sampling/importance_sampling_ratio/min": 0.0032403613440692425, + "sampling/sampling_logp_difference/max": 5.732070446014404, + "sampling/sampling_logp_difference/mean": 0.09662536531686783, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2469.0, + "completions/max_terminated_length": 2469.0, + "completions/mean_length": 414.6875, + "completions/mean_terminated_length": 414.6875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.10810738615691662, + "epoch": 0.3858407079646018, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6813898746761639, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 107296617.0, + "reward": 0.7636758089065552, + "reward_std": 0.11631257832050323, + "rewards/qatch_small_update_with_fm/mean": 0.7636758089065552, + "rewards/qatch_small_update_with_fm/std": 0.3674674928188324, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918367862701416, + "sampling/importance_sampling_ratio/min": 7.124587864382192e-05, + "sampling/sampling_logp_difference/max": 9.549373626708984, + "sampling/sampling_logp_difference/mean": 0.09496808052062988, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 340.33203125, + "completions/mean_terminated_length": 340.33203125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.10157356224954128, + "epoch": 0.38761061946902653, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5941120949615057, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 107702590.0, + "reward": 0.8051445484161377, + "reward_std": 0.09416786581277847, + "rewards/qatch_small_update_with_fm/mean": 0.8051445484161377, + "rewards/qatch_small_update_with_fm/std": 0.32420217990875244, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9893560409545898, + "sampling/importance_sampling_ratio/min": 0.007196484599262476, + "sampling/sampling_logp_difference/max": 4.934162616729736, + "sampling/sampling_logp_difference/mean": 0.09752357006072998, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1293.0, + "completions/max_terminated_length": 1293.0, + "completions/mean_length": 431.86328125, + "completions/mean_terminated_length": 431.86328125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.1107902517542243, + "epoch": 0.3893805309734513, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5457807080706949, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 108195003.0, + "reward": 0.6913124918937683, + "reward_std": 0.10084883123636246, + "rewards/qatch_small_update_with_fm/mean": 0.6913124918937683, + "rewards/qatch_small_update_with_fm/std": 0.3513505458831787, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9931973218917847, + "sampling/importance_sampling_ratio/min": 0.011205301620066166, + "sampling/sampling_logp_difference/max": 4.491368293762207, + "sampling/sampling_logp_difference/mean": 0.09411786496639252, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1267.0, + "completions/max_terminated_length": 1267.0, + "completions/mean_length": 371.265625, + "completions/mean_terminated_length": 371.265625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.1083214282989502, + "epoch": 0.3911504424778761, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.7784986805139263, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 108672607.0, + "reward": 0.6840741634368896, + "reward_std": 0.14675581455230713, + "rewards/qatch_small_update_with_fm/mean": 0.6840741634368896, + "rewards/qatch_small_update_with_fm/std": 0.4099890887737274, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.994238018989563, + "sampling/importance_sampling_ratio/min": 0.008668428286910057, + "sampling/sampling_logp_difference/max": 4.748067855834961, + "sampling/sampling_logp_difference/mean": 0.0962483137845993, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 359.0, + "completions/mean_terminated_length": 359.0, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.10230957623571157, + "epoch": 0.3929203539823009, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5490231460475042, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 109189583.0, + "reward": 0.7061171531677246, + "reward_std": 0.10651902854442596, + "rewards/qatch_small_update_with_fm/mean": 0.7061171531677246, + "rewards/qatch_small_update_with_fm/std": 0.3541206419467926, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989014744758606, + "sampling/importance_sampling_ratio/min": 0.008726480416953564, + "sampling/sampling_logp_difference/max": 4.741393089294434, + "sampling/sampling_logp_difference/mean": 0.09948454052209854, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1578.0, + "completions/max_terminated_length": 1578.0, + "completions/mean_length": 389.9453125, + "completions/mean_terminated_length": 389.9453125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.10472445003688335, + "epoch": 0.39469026548672564, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6190601116973689, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 109798977.0, + "reward": 0.6156679391860962, + "reward_std": 0.09265945851802826, + "rewards/qatch_small_update_with_fm/mean": 0.6156679391860962, + "rewards/qatch_small_update_with_fm/std": 0.41817304491996765, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946981072425842, + "sampling/importance_sampling_ratio/min": 0.008702251128852367, + "sampling/sampling_logp_difference/max": 4.744173526763916, + "sampling/sampling_logp_difference/mean": 0.08968104422092438, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1940.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 448.02734375, + "completions/mean_terminated_length": 448.02734375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.10507409740239382, + "epoch": 0.39646017699115044, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.49039506090276497, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 110339976.0, + "reward": 0.6631484031677246, + "reward_std": 0.06115978956222534, + "rewards/qatch_small_update_with_fm/mean": 0.6631484031677246, + "rewards/qatch_small_update_with_fm/std": 0.3594338297843933, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901099801063538, + "sampling/importance_sampling_ratio/min": 0.0068128579296171665, + "sampling/sampling_logp_difference/max": 4.988943576812744, + "sampling/sampling_logp_difference/mean": 0.09931102395057678, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1178.0, + "completions/max_terminated_length": 1178.0, + "completions/mean_length": 433.8046875, + "completions/mean_terminated_length": 433.8046875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.1009874390438199, + "epoch": 0.39823008849557523, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.44278668660350046, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 110778358.0, + "reward": 0.6486484408378601, + "reward_std": 0.09495560824871063, + "rewards/qatch_small_update_with_fm/mean": 0.6486484408378601, + "rewards/qatch_small_update_with_fm/std": 0.432005912065506, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9933207035064697, + "sampling/importance_sampling_ratio/min": 0.014339202083647251, + "sampling/sampling_logp_difference/max": 4.244758129119873, + "sampling/sampling_logp_difference/mean": 0.08835351467132568, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 341.43359375, + "completions/mean_terminated_length": 341.43359375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.10271453391760588, + "epoch": 0.4, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5733446606675858, + "learning_rate": 1e-06, + "loss": 0.0323, + "num_tokens": 111123429.0, + "reward": 0.5843750238418579, + "reward_std": 0.07411018013954163, + "rewards/qatch_small_update_with_fm/mean": 0.5843749642372131, + "rewards/qatch_small_update_with_fm/std": 0.31206896901130676, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9935798645019531, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.09060897678136826, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 447.81640625, + "completions/mean_terminated_length": 447.81640625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.12403890583664179, + "epoch": 0.40176991150442476, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5250232034467316, + "learning_rate": 1e-06, + "loss": 0.0244, + "num_tokens": 111573878.0, + "reward": 0.7477734088897705, + "reward_std": 0.1547674536705017, + "rewards/qatch_small_update_with_fm/mean": 0.7477734088897705, + "rewards/qatch_small_update_with_fm/std": 0.3546452224254608, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9935329556465149, + "sampling/importance_sampling_ratio/min": 0.007649845909327269, + "sampling/sampling_logp_difference/max": 4.873069763183594, + "sampling/sampling_logp_difference/mean": 0.10479405522346497, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1293.0, + "completions/max_terminated_length": 1293.0, + "completions/mean_length": 389.6484375, + "completions/mean_terminated_length": 389.6484375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.10224001575261354, + "epoch": 0.40353982300884955, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5131191563549943, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 112305164.0, + "reward": 0.8435273766517639, + "reward_std": 0.09165941178798676, + "rewards/qatch_small_update_with_fm/mean": 0.8435273170471191, + "rewards/qatch_small_update_with_fm/std": 0.2752784788608551, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918692708015442, + "sampling/importance_sampling_ratio/min": 0.005490469746291637, + "sampling/sampling_logp_difference/max": 5.204741477966309, + "sampling/sampling_logp_difference/mean": 0.09232448041439056, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1323.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 415.18359375, + "completions/mean_terminated_length": 415.18359375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.12820047978311777, + "epoch": 0.40530973451327434, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5694927094518497, + "learning_rate": 1e-06, + "loss": -0.0349, + "num_tokens": 112805947.0, + "reward": 0.7082968354225159, + "reward_std": 0.0801946148276329, + "rewards/qatch_small_update_with_fm/mean": 0.7082968950271606, + "rewards/qatch_small_update_with_fm/std": 0.40650320053100586, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9939711093902588, + "sampling/importance_sampling_ratio/min": 3.631289473560173e-06, + "sampling/sampling_logp_difference/max": 12.525922775268555, + "sampling/sampling_logp_difference/mean": 0.10653382539749146, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1303.0, + "completions/max_terminated_length": 1303.0, + "completions/mean_length": 398.5859375, + "completions/mean_terminated_length": 398.5859375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.1140748867765069, + "epoch": 0.40707964601769914, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6880249325585247, + "learning_rate": 1e-06, + "loss": -0.0121, + "num_tokens": 113518561.0, + "reward": 0.6640703082084656, + "reward_std": 0.15750177204608917, + "rewards/qatch_small_update_with_fm/mean": 0.6640703082084656, + "rewards/qatch_small_update_with_fm/std": 0.40480488538742065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946224093437195, + "sampling/importance_sampling_ratio/min": 0.002527002478018403, + "sampling/sampling_logp_difference/max": 5.980721473693848, + "sampling/sampling_logp_difference/mean": 0.09498130530118942, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1249.0, + "completions/max_terminated_length": 1249.0, + "completions/mean_length": 405.1328125, + "completions/mean_terminated_length": 405.1328125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.12539631128311157, + "epoch": 0.4088495575221239, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5651489326765903, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 114094515.0, + "reward": 0.7878320217132568, + "reward_std": 0.12009721994400024, + "rewards/qatch_small_update_with_fm/mean": 0.7878320217132568, + "rewards/qatch_small_update_with_fm/std": 0.3283204436302185, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9924635887145996, + "sampling/importance_sampling_ratio/min": 0.01837773434817791, + "sampling/sampling_logp_difference/max": 3.996615409851074, + "sampling/sampling_logp_difference/mean": 0.10449755936861038, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3024.0, + "completions/mean_length": 466.86328125, + "completions/mean_terminated_length": 452.63140869140625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.12363697215914726, + "epoch": 0.41061946902654867, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4751429731610685, + "learning_rate": 1e-06, + "loss": 0.012, + "num_tokens": 114570384.0, + "reward": 0.6649336218833923, + "reward_std": 0.09007216989994049, + "rewards/qatch_small_update_with_fm/mean": 0.6649336218833923, + "rewards/qatch_small_update_with_fm/std": 0.3947446942329407, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9944989681243896, + "sampling/importance_sampling_ratio/min": 0.016750473529100418, + "sampling/sampling_logp_difference/max": 4.089328765869141, + "sampling/sampling_logp_difference/mean": 0.10331812500953674, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 369.34765625, + "completions/mean_terminated_length": 369.34765625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.10589554626494646, + "epoch": 0.41238938053097346, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.897996865066677, + "learning_rate": 1e-06, + "loss": 0.0158, + "num_tokens": 115134841.0, + "reward": 0.6671562194824219, + "reward_std": 0.11132475733757019, + "rewards/qatch_small_update_with_fm/mean": 0.6671562194824219, + "rewards/qatch_small_update_with_fm/std": 0.3616568148136139, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922637343406677, + "sampling/importance_sampling_ratio/min": 0.004132173955440521, + "sampling/sampling_logp_difference/max": 5.488951683044434, + "sampling/sampling_logp_difference/mean": 0.09396759420633316, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2038.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 404.7734375, + "completions/mean_terminated_length": 404.7734375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.12250016257166862, + "epoch": 0.41415929203539825, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.495571008102404, + "learning_rate": 1e-06, + "loss": -0.0284, + "num_tokens": 115632399.0, + "reward": 0.634265661239624, + "reward_std": 0.06549917906522751, + "rewards/qatch_small_update_with_fm/mean": 0.634265661239624, + "rewards/qatch_small_update_with_fm/std": 0.35203248262405396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9956581592559814, + "sampling/importance_sampling_ratio/min": 0.0105217844247818, + "sampling/sampling_logp_difference/max": 4.554307460784912, + "sampling/sampling_logp_difference/mean": 0.09811676293611526, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 399.984375, + "completions/mean_terminated_length": 399.984375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.1299695922061801, + "epoch": 0.415929203539823, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4991884830918963, + "learning_rate": 1e-06, + "loss": 0.0279, + "num_tokens": 116033147.0, + "reward": 0.8004375100135803, + "reward_std": 0.11684393882751465, + "rewards/qatch_small_update_with_fm/mean": 0.8004375100135803, + "rewards/qatch_small_update_with_fm/std": 0.35567614436149597, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916921854019165, + "sampling/importance_sampling_ratio/min": 0.0009987365920096636, + "sampling/sampling_logp_difference/max": 6.909019470214844, + "sampling/sampling_logp_difference/mean": 0.1101202666759491, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 416.85546875, + "completions/mean_terminated_length": 416.85546875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.10699698887765408, + "epoch": 0.4176991150442478, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4578399404909556, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 116528534.0, + "reward": 0.6235507726669312, + "reward_std": 0.037941787391901016, + "rewards/qatch_small_update_with_fm/mean": 0.6235507726669312, + "rewards/qatch_small_update_with_fm/std": 0.3784492611885071, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.993585467338562, + "sampling/importance_sampling_ratio/min": 0.008174315094947815, + "sampling/sampling_logp_difference/max": 4.806758403778076, + "sampling/sampling_logp_difference/mean": 0.09373586624860764, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 394.4140625, + "completions/mean_terminated_length": 394.4140625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.12796230241656303, + "epoch": 0.4194690265486726, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5872021743640824, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 117202528.0, + "reward": 0.7495508193969727, + "reward_std": 0.09856027364730835, + "rewards/qatch_small_update_with_fm/mean": 0.7495508193969727, + "rewards/qatch_small_update_with_fm/std": 0.34960198402404785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907758831977844, + "sampling/importance_sampling_ratio/min": 0.014935529790818691, + "sampling/sampling_logp_difference/max": 4.204012393951416, + "sampling/sampling_logp_difference/mean": 0.10940699279308319, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1716.0, + "completions/max_terminated_length": 1716.0, + "completions/mean_length": 470.66015625, + "completions/mean_terminated_length": 470.66015625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.13090031780302525, + "epoch": 0.42123893805309737, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5546880762353853, + "learning_rate": 1e-06, + "loss": 0.0165, + "num_tokens": 117707193.0, + "reward": 0.7711952924728394, + "reward_std": 0.11917905509471893, + "rewards/qatch_small_update_with_fm/mean": 0.7711952924728394, + "rewards/qatch_small_update_with_fm/std": 0.33702993392944336, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.996447741985321, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.10174047946929932, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1410.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 443.375, + "completions/mean_terminated_length": 443.375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.11677948199212551, + "epoch": 0.4230088495575221, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5266583505158154, + "learning_rate": 1e-06, + "loss": -0.0112, + "num_tokens": 118125289.0, + "reward": 0.6743515729904175, + "reward_std": 0.06367573887109756, + "rewards/qatch_small_update_with_fm/mean": 0.6743515729904175, + "rewards/qatch_small_update_with_fm/std": 0.3398094177246094, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922307729721069, + "sampling/importance_sampling_ratio/min": 0.0005226434441283345, + "sampling/sampling_logp_difference/max": 7.556611061096191, + "sampling/sampling_logp_difference/mean": 0.09814269840717316, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 326.51953125, + "completions/mean_terminated_length": 326.51953125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.10765422880649567, + "epoch": 0.4247787610619469, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.35797878094122443, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 118683294.0, + "reward": 0.7800507545471191, + "reward_std": 0.04426509886980057, + "rewards/qatch_small_update_with_fm/mean": 0.7800507545471191, + "rewards/qatch_small_update_with_fm/std": 0.29425278306007385, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9905130863189697, + "sampling/importance_sampling_ratio/min": 0.007289279717952013, + "sampling/sampling_logp_difference/max": 4.921350479125977, + "sampling/sampling_logp_difference/mean": 0.09998884797096252, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1628.0, + "completions/max_terminated_length": 1628.0, + "completions/mean_length": 420.17578125, + "completions/mean_terminated_length": 420.17578125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.11644061841070652, + "epoch": 0.4265486725663717, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5613929910697356, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 119158715.0, + "reward": 0.6946874856948853, + "reward_std": 0.1308887004852295, + "rewards/qatch_small_update_with_fm/mean": 0.6946874856948853, + "rewards/qatch_small_update_with_fm/std": 0.35073021054267883, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946391582489014, + "sampling/importance_sampling_ratio/min": 0.004145145416259766, + "sampling/sampling_logp_difference/max": 5.4858174324035645, + "sampling/sampling_logp_difference/mean": 0.09869328886270523, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1554.0, + "completions/max_terminated_length": 1554.0, + "completions/mean_length": 369.453125, + "completions/mean_terminated_length": 369.453125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.12437109090387821, + "epoch": 0.4283185840707965, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5206241947060207, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 119700559.0, + "reward": 0.7600976228713989, + "reward_std": 0.06152753531932831, + "rewards/qatch_small_update_with_fm/mean": 0.7600976228713989, + "rewards/qatch_small_update_with_fm/std": 0.33996260166168213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9919652342796326, + "sampling/importance_sampling_ratio/min": 0.0002818721404764801, + "sampling/sampling_logp_difference/max": 8.174057006835938, + "sampling/sampling_logp_difference/mean": 0.10920348763465881, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 459.359375, + "completions/mean_terminated_length": 445.0980529785156, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.14293739199638367, + "epoch": 0.4300884955752212, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6945266374462524, + "learning_rate": 1e-06, + "loss": -0.0358, + "num_tokens": 120191611.0, + "reward": 0.7203359603881836, + "reward_std": 0.1005784347653389, + "rewards/qatch_small_update_with_fm/mean": 0.7203359603881836, + "rewards/qatch_small_update_with_fm/std": 0.37025392055511475, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946791529655457, + "sampling/importance_sampling_ratio/min": 0.006813199259340763, + "sampling/sampling_logp_difference/max": 4.988893508911133, + "sampling/sampling_logp_difference/mean": 0.11522375792264938, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 451.54296875, + "completions/mean_terminated_length": 451.54296875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.13328593783080578, + "epoch": 0.431858407079646, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.6196257477102209, + "learning_rate": 1e-06, + "loss": 0.0426, + "num_tokens": 120759766.0, + "reward": 0.7439101934432983, + "reward_std": 0.1378568708896637, + "rewards/qatch_small_update_with_fm/mean": 0.7439101934432983, + "rewards/qatch_small_update_with_fm/std": 0.3241375982761383, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925317764282227, + "sampling/importance_sampling_ratio/min": 0.02285437099635601, + "sampling/sampling_logp_difference/max": 3.7786128520965576, + "sampling/sampling_logp_difference/mean": 0.11157552152872086, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1783.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 468.84375, + "completions/mean_terminated_length": 468.84375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.14039788208901882, + "epoch": 0.4336283185840708, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5102000070988705, + "learning_rate": 1e-06, + "loss": 0.0272, + "num_tokens": 121329870.0, + "reward": 0.771414041519165, + "reward_std": 0.10335954278707504, + "rewards/qatch_small_update_with_fm/mean": 0.771414041519165, + "rewards/qatch_small_update_with_fm/std": 0.33871808648109436, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99262535572052, + "sampling/importance_sampling_ratio/min": 0.014456496573984623, + "sampling/sampling_logp_difference/max": 4.236611366271973, + "sampling/sampling_logp_difference/mean": 0.11720272153615952, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1608.0, + "completions/max_terminated_length": 1608.0, + "completions/mean_length": 415.3359375, + "completions/mean_terminated_length": 415.3359375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.13042385503649712, + "epoch": 0.4353982300884956, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.31843325314582177, + "learning_rate": 1e-06, + "loss": -0.0266, + "num_tokens": 121774628.0, + "reward": 0.8900351524353027, + "reward_std": 0.07186062633991241, + "rewards/qatch_small_update_with_fm/mean": 0.8900351524353027, + "rewards/qatch_small_update_with_fm/std": 0.2635286748409271, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9919847249984741, + "sampling/importance_sampling_ratio/min": 0.013568556867539883, + "sampling/sampling_logp_difference/max": 4.300000190734863, + "sampling/sampling_logp_difference/mean": 0.1108667254447937, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3521.0, + "completions/max_terminated_length": 3521.0, + "completions/mean_length": 509.6640625, + "completions/mean_terminated_length": 509.6640625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.138479420915246, + "epoch": 0.43716814159292033, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4156690061559561, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 122309406.0, + "reward": 0.6351015567779541, + "reward_std": 0.048537224531173706, + "rewards/qatch_small_update_with_fm/mean": 0.6351015567779541, + "rewards/qatch_small_update_with_fm/std": 0.33746272325515747, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9944608211517334, + "sampling/importance_sampling_ratio/min": 0.006842448841780424, + "sampling/sampling_logp_difference/max": 4.984609603881836, + "sampling/sampling_logp_difference/mean": 0.10972237586975098, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1806.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 476.61328125, + "completions/mean_terminated_length": 476.61328125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.12279136944562197, + "epoch": 0.4389380530973451, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4630336672015149, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 122840091.0, + "reward": 0.6818008422851562, + "reward_std": 0.09493610262870789, + "rewards/qatch_small_update_with_fm/mean": 0.6818008422851562, + "rewards/qatch_small_update_with_fm/std": 0.39343327283859253, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934309720993042, + "sampling/importance_sampling_ratio/min": 0.0012457328848540783, + "sampling/sampling_logp_difference/max": 6.688031196594238, + "sampling/sampling_logp_difference/mean": 0.10279334336519241, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1962.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 548.1796875, + "completions/mean_terminated_length": 548.1796875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.12996913958340883, + "epoch": 0.4407079646017699, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.5700199575774423, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 123465225.0, + "reward": 0.488394558429718, + "reward_std": 0.1281006932258606, + "rewards/qatch_small_update_with_fm/mean": 0.488394558429718, + "rewards/qatch_small_update_with_fm/std": 0.35512784123420715, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9943488836288452, + "sampling/importance_sampling_ratio/min": 0.011242197826504707, + "sampling/sampling_logp_difference/max": 4.488080978393555, + "sampling/sampling_logp_difference/mean": 0.10481943935155869, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1456.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 384.2421875, + "completions/mean_terminated_length": 384.2421875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.11992082837969065, + "epoch": 0.4424778761061947, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5498360675010315, + "learning_rate": 1e-06, + "loss": 0.0244, + "num_tokens": 123982887.0, + "reward": 0.8174804449081421, + "reward_std": 0.06691622734069824, + "rewards/qatch_small_update_with_fm/mean": 0.8174804449081421, + "rewards/qatch_small_update_with_fm/std": 0.29479968547821045, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904511570930481, + "sampling/importance_sampling_ratio/min": 0.011172862723469734, + "sampling/sampling_logp_difference/max": 4.494267463684082, + "sampling/sampling_logp_difference/mean": 0.10939574986696243, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1657.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 448.39453125, + "completions/mean_terminated_length": 448.39453125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.13190590497106314, + "epoch": 0.44424778761061945, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4586587220281425, + "learning_rate": 1e-06, + "loss": -0.0197, + "num_tokens": 124570860.0, + "reward": 0.7742382884025574, + "reward_std": 0.08533629775047302, + "rewards/qatch_small_update_with_fm/mean": 0.7742382884025574, + "rewards/qatch_small_update_with_fm/std": 0.3210985064506531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922869205474854, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.11073958873748779, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1989.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 488.015625, + "completions/mean_terminated_length": 488.015625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.12504940200597048, + "epoch": 0.44601769911504424, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.582915685918618, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 125148784.0, + "reward": 0.7658984661102295, + "reward_std": 0.11392462998628616, + "rewards/qatch_small_update_with_fm/mean": 0.7658984661102295, + "rewards/qatch_small_update_with_fm/std": 0.3386352062225342, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992237389087677, + "sampling/importance_sampling_ratio/min": 6.2567501117882784e-06, + "sampling/sampling_logp_difference/max": 11.981849670410156, + "sampling/sampling_logp_difference/mean": 0.10734772682189941, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3040.0, + "completions/max_terminated_length": 3040.0, + "completions/mean_length": 562.68359375, + "completions/mean_terminated_length": 562.68359375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.1231586579233408, + "epoch": 0.44778761061946903, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4012765932095851, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 125544399.0, + "reward": 0.6832812428474426, + "reward_std": 0.07998022437095642, + "rewards/qatch_small_update_with_fm/mean": 0.6832812428474426, + "rewards/qatch_small_update_with_fm/std": 0.38533180952072144, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9948018789291382, + "sampling/importance_sampling_ratio/min": 0.003951632417738438, + "sampling/sampling_logp_difference/max": 5.533626556396484, + "sampling/sampling_logp_difference/mean": 0.10125906020402908, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1840.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 462.59375, + "completions/mean_terminated_length": 462.59375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.12078492902219296, + "epoch": 0.4495575221238938, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4699545761158711, + "learning_rate": 1e-06, + "loss": -0.0169, + "num_tokens": 126110535.0, + "reward": 0.7769765853881836, + "reward_std": 0.04606456309556961, + "rewards/qatch_small_update_with_fm/mean": 0.7769765853881836, + "rewards/qatch_small_update_with_fm/std": 0.30436578392982483, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918253421783447, + "sampling/importance_sampling_ratio/min": 0.007992015220224857, + "sampling/sampling_logp_difference/max": 4.829312324523926, + "sampling/sampling_logp_difference/mean": 0.10492375493049622, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1973.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 403.21484375, + "completions/mean_terminated_length": 403.21484375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.10851260460913181, + "epoch": 0.45132743362831856, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.38739447303368746, + "learning_rate": 1e-06, + "loss": 0.0309, + "num_tokens": 126501230.0, + "reward": 0.6241718530654907, + "reward_std": 0.0250605009496212, + "rewards/qatch_small_update_with_fm/mean": 0.6241718530654907, + "rewards/qatch_small_update_with_fm/std": 0.4063698649406433, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904232025146484, + "sampling/importance_sampling_ratio/min": 0.009276127442717552, + "sampling/sampling_logp_difference/max": 4.68031120300293, + "sampling/sampling_logp_difference/mean": 0.1012159138917923, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1522.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 450.74609375, + "completions/mean_terminated_length": 450.74609375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.12568799499422312, + "epoch": 0.45309734513274336, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6091856792011116, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 126960237.0, + "reward": 0.7116171717643738, + "reward_std": 0.10217170417308807, + "rewards/qatch_small_update_with_fm/mean": 0.7116171717643738, + "rewards/qatch_small_update_with_fm/std": 0.35243433713912964, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9938164949417114, + "sampling/importance_sampling_ratio/min": 0.016684038564562798, + "sampling/sampling_logp_difference/max": 4.0933027267456055, + "sampling/sampling_logp_difference/mean": 0.10752049088478088, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1419.0, + "completions/max_terminated_length": 1419.0, + "completions/mean_length": 411.72265625, + "completions/mean_terminated_length": 411.72265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.10305561777204275, + "epoch": 0.45486725663716815, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5717465117807924, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 127592246.0, + "reward": 0.7968710660934448, + "reward_std": 0.09067166596651077, + "rewards/qatch_small_update_with_fm/mean": 0.7968710660934448, + "rewards/qatch_small_update_with_fm/std": 0.3336501121520996, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914082288742065, + "sampling/importance_sampling_ratio/min": 0.0041133444756269455, + "sampling/sampling_logp_difference/max": 5.493518829345703, + "sampling/sampling_logp_difference/mean": 0.09590524435043335, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2049.0, + "completions/max_terminated_length": 2049.0, + "completions/mean_length": 505.96875, + "completions/mean_terminated_length": 505.96875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.11262097675353289, + "epoch": 0.45663716814159294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5447749013706079, + "learning_rate": 1e-06, + "loss": -0.0507, + "num_tokens": 128076878.0, + "reward": 0.8263515830039978, + "reward_std": 0.13293085992336273, + "rewards/qatch_small_update_with_fm/mean": 0.8263515830039978, + "rewards/qatch_small_update_with_fm/std": 0.29542577266693115, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913504123687744, + "sampling/importance_sampling_ratio/min": 0.014473767951130867, + "sampling/sampling_logp_difference/max": 4.235417366027832, + "sampling/sampling_logp_difference/mean": 0.0993494763970375, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2215.0, + "completions/max_terminated_length": 2215.0, + "completions/mean_length": 473.1640625, + "completions/mean_terminated_length": 473.1640625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.12198252696543932, + "epoch": 0.4584070796460177, + "frac_reward_zero_std": 0.625, + "grad_norm": 2.0757714283775988, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 128524184.0, + "reward": 0.6998945474624634, + "reward_std": 0.07172062247991562, + "rewards/qatch_small_update_with_fm/mean": 0.6998945474624634, + "rewards/qatch_small_update_with_fm/std": 0.3736403286457062, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9921177625656128, + "sampling/importance_sampling_ratio/min": 0.0024047421757131815, + "sampling/sampling_logp_difference/max": 6.030312538146973, + "sampling/sampling_logp_difference/mean": 0.10670118033885956, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1472.0, + "completions/max_terminated_length": 1472.0, + "completions/mean_length": 546.09765625, + "completions/mean_terminated_length": 546.09765625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.12327743601053953, + "epoch": 0.46017699115044247, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6147719138897032, + "learning_rate": 1e-06, + "loss": 0.0429, + "num_tokens": 129136881.0, + "reward": 0.6720781326293945, + "reward_std": 0.13617882132530212, + "rewards/qatch_small_update_with_fm/mean": 0.6720781326293945, + "rewards/qatch_small_update_with_fm/std": 0.4061374366283417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9919106960296631, + "sampling/importance_sampling_ratio/min": 0.00174518465064466, + "sampling/sampling_logp_difference/max": 6.350894927978516, + "sampling/sampling_logp_difference/mean": 0.10320325940847397, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2097.0, + "completions/max_terminated_length": 2097.0, + "completions/mean_length": 455.74609375, + "completions/mean_terminated_length": 455.74609375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.11595601961016655, + "epoch": 0.46194690265486726, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.518100506416996, + "learning_rate": 1e-06, + "loss": -0.0265, + "num_tokens": 129576512.0, + "reward": 0.7429882287979126, + "reward_std": 0.14759953320026398, + "rewards/qatch_small_update_with_fm/mean": 0.7429882287979126, + "rewards/qatch_small_update_with_fm/std": 0.34472936391830444, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9928544759750366, + "sampling/importance_sampling_ratio/min": 0.008397025987505913, + "sampling/sampling_logp_difference/max": 4.779877662658691, + "sampling/sampling_logp_difference/mean": 0.09997648000717163, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1890.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 424.14453125, + "completions/mean_terminated_length": 424.14453125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.10993783641606569, + "epoch": 0.46371681415929206, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5021034588733652, + "learning_rate": 1e-06, + "loss": -0.0265, + "num_tokens": 129969141.0, + "reward": 0.8230313062667847, + "reward_std": 0.06805883347988129, + "rewards/qatch_small_update_with_fm/mean": 0.8230313062667847, + "rewards/qatch_small_update_with_fm/std": 0.3318270444869995, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901431202888489, + "sampling/importance_sampling_ratio/min": 0.006941431201994419, + "sampling/sampling_logp_difference/max": 4.970247268676758, + "sampling/sampling_logp_difference/mean": 0.1020810455083847, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2369.0, + "completions/max_terminated_length": 2369.0, + "completions/mean_length": 537.18359375, + "completions/mean_terminated_length": 537.18359375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.14144710265100002, + "epoch": 0.4654867256637168, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5124333861678522, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 130444916.0, + "reward": 0.6506445407867432, + "reward_std": 0.09026811271905899, + "rewards/qatch_small_update_with_fm/mean": 0.6506445407867432, + "rewards/qatch_small_update_with_fm/std": 0.34012213349342346, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9988116025924683, + "sampling/importance_sampling_ratio/min": 0.0017341957427561283, + "sampling/sampling_logp_difference/max": 6.357211589813232, + "sampling/sampling_logp_difference/mean": 0.11083206534385681, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 304.046875, + "completions/mean_terminated_length": 304.046875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.09478252567350864, + "epoch": 0.4672566371681416, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.619049798770197, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 130971360.0, + "reward": 0.8115742206573486, + "reward_std": 0.0725603997707367, + "rewards/qatch_small_update_with_fm/mean": 0.8115742206573486, + "rewards/qatch_small_update_with_fm/std": 0.31686973571777344, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867129325866699, + "sampling/importance_sampling_ratio/min": 0.005328531377017498, + "sampling/sampling_logp_difference/max": 5.234679698944092, + "sampling/sampling_logp_difference/mean": 0.09353677183389664, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1857.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 545.1484375, + "completions/mean_terminated_length": 545.1484375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.1286120619624853, + "epoch": 0.4690265486725664, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.45949231122851514, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 131422790.0, + "reward": 0.7897851467132568, + "reward_std": 0.08896966278553009, + "rewards/qatch_small_update_with_fm/mean": 0.7897851467132568, + "rewards/qatch_small_update_with_fm/std": 0.321509450674057, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9942489266395569, + "sampling/importance_sampling_ratio/min": 0.00868771132081747, + "sampling/sampling_logp_difference/max": 4.745845794677734, + "sampling/sampling_logp_difference/mean": 0.10412135720252991, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2429.0, + "completions/max_terminated_length": 2429.0, + "completions/mean_length": 564.22265625, + "completions/mean_terminated_length": 564.22265625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.14587272237986326, + "epoch": 0.47079646017699117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5004787008761891, + "learning_rate": 1e-06, + "loss": -0.0357, + "num_tokens": 131942543.0, + "reward": 0.7513867616653442, + "reward_std": 0.1140788346529007, + "rewards/qatch_small_update_with_fm/mean": 0.7513867616653442, + "rewards/qatch_small_update_with_fm/std": 0.36217227578163147, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9981526136398315, + "sampling/importance_sampling_ratio/min": 0.004260185174643993, + "sampling/sampling_logp_difference/max": 5.458442687988281, + "sampling/sampling_logp_difference/mean": 0.11560261994600296, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1697.0, + "completions/max_terminated_length": 1697.0, + "completions/mean_length": 555.22265625, + "completions/mean_terminated_length": 555.22265625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1373251285403967, + "epoch": 0.4725663716814159, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6125441902125374, + "learning_rate": 1e-06, + "loss": 0.0188, + "num_tokens": 132468712.0, + "reward": 0.5675820112228394, + "reward_std": 0.14589762687683105, + "rewards/qatch_small_update_with_fm/mean": 0.5675820112228394, + "rewards/qatch_small_update_with_fm/std": 0.418068528175354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9948241710662842, + "sampling/importance_sampling_ratio/min": 0.0018865078454837203, + "sampling/sampling_logp_difference/max": 6.2730278968811035, + "sampling/sampling_logp_difference/mean": 0.11048072576522827, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2317.0, + "completions/max_terminated_length": 2317.0, + "completions/mean_length": 462.8984375, + "completions/mean_terminated_length": 462.8984375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.12027736380696297, + "epoch": 0.4743362831858407, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5504667215266769, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 132848686.0, + "reward": 0.7035663723945618, + "reward_std": 0.05683054029941559, + "rewards/qatch_small_update_with_fm/mean": 0.7035663723945618, + "rewards/qatch_small_update_with_fm/std": 0.38923537731170654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.993371844291687, + "sampling/importance_sampling_ratio/min": 0.002918687416240573, + "sampling/sampling_logp_difference/max": 5.836621284484863, + "sampling/sampling_logp_difference/mean": 0.10290797799825668, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2692.0, + "completions/max_terminated_length": 2692.0, + "completions/mean_length": 500.42578125, + "completions/mean_terminated_length": 500.42578125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.13534568157047033, + "epoch": 0.4761061946902655, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6007433735308465, + "learning_rate": 1e-06, + "loss": -0.0258, + "num_tokens": 133368459.0, + "reward": 0.6969257593154907, + "reward_std": 0.09634049981832504, + "rewards/qatch_small_update_with_fm/mean": 0.6969257593154907, + "rewards/qatch_small_update_with_fm/std": 0.3994755148887634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934263229370117, + "sampling/importance_sampling_ratio/min": 0.006295106839388609, + "sampling/sampling_logp_difference/max": 5.0679826736450195, + "sampling/sampling_logp_difference/mean": 0.11658207327127457, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1710.0, + "completions/max_terminated_length": 1710.0, + "completions/mean_length": 447.66796875, + "completions/mean_terminated_length": 447.66796875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.12615801114588976, + "epoch": 0.4778761061946903, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6000587456169815, + "learning_rate": 1e-06, + "loss": 0.0323, + "num_tokens": 133839734.0, + "reward": 0.7735468745231628, + "reward_std": 0.14530882239341736, + "rewards/qatch_small_update_with_fm/mean": 0.7735468149185181, + "rewards/qatch_small_update_with_fm/std": 0.3402929902076721, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9957665205001831, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.10050618648529053, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2592.0, + "completions/max_terminated_length": 2592.0, + "completions/mean_length": 470.07421875, + "completions/mean_terminated_length": 470.07421875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.11290043592453003, + "epoch": 0.479646017699115, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5939769909746901, + "learning_rate": 1e-06, + "loss": 0.0399, + "num_tokens": 134480953.0, + "reward": 0.6712266206741333, + "reward_std": 0.08824647217988968, + "rewards/qatch_small_update_with_fm/mean": 0.6712265610694885, + "rewards/qatch_small_update_with_fm/std": 0.38758161664009094, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9950553774833679, + "sampling/importance_sampling_ratio/min": 0.011183853261172771, + "sampling/sampling_logp_difference/max": 4.493284225463867, + "sampling/sampling_logp_difference/mean": 0.09266015887260437, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3094.0, + "completions/max_terminated_length": 3094.0, + "completions/mean_length": 468.84765625, + "completions/mean_terminated_length": 468.84765625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.13117850199341774, + "epoch": 0.4814159292035398, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6585905752020277, + "learning_rate": 1e-06, + "loss": -0.0277, + "num_tokens": 135080066.0, + "reward": 0.7433632612228394, + "reward_std": 0.05850284546613693, + "rewards/qatch_small_update_with_fm/mean": 0.7433632612228394, + "rewards/qatch_small_update_with_fm/std": 0.3301638960838318, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9974746704101562, + "sampling/importance_sampling_ratio/min": 0.009958724491298199, + "sampling/sampling_logp_difference/max": 4.609306335449219, + "sampling/sampling_logp_difference/mean": 0.10478514432907104, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3146.0, + "completions/max_terminated_length": 3146.0, + "completions/mean_length": 576.86328125, + "completions/mean_terminated_length": 576.86328125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.14908631797879934, + "epoch": 0.4831858407079646, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4628001801800674, + "learning_rate": 1e-06, + "loss": 0.0384, + "num_tokens": 135766959.0, + "reward": 0.6367383003234863, + "reward_std": 0.07259754836559296, + "rewards/qatch_small_update_with_fm/mean": 0.6367383003234863, + "rewards/qatch_small_update_with_fm/std": 0.3717433512210846, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9974573254585266, + "sampling/importance_sampling_ratio/min": 0.003483326407149434, + "sampling/sampling_logp_difference/max": 5.6597676277160645, + "sampling/sampling_logp_difference/mean": 0.11377645283937454, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3232.0, + "completions/max_terminated_length": 3232.0, + "completions/mean_length": 455.0546875, + "completions/mean_terminated_length": 455.0546875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.1027538888156414, + "epoch": 0.4849557522123894, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.38737609564787473, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 136341533.0, + "reward": 0.7330039143562317, + "reward_std": 0.05588027834892273, + "rewards/qatch_small_update_with_fm/mean": 0.7330039143562317, + "rewards/qatch_small_update_with_fm/std": 0.38519182801246643, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9948368668556213, + "sampling/importance_sampling_ratio/min": 0.0019187896978110075, + "sampling/sampling_logp_difference/max": 6.256060600280762, + "sampling/sampling_logp_difference/mean": 0.08965164422988892, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2587.0, + "completions/max_terminated_length": 2587.0, + "completions/mean_length": 472.0390625, + "completions/mean_terminated_length": 472.0390625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.13030268996953964, + "epoch": 0.48672566371681414, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5194915442984417, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 136887495.0, + "reward": 0.8584296703338623, + "reward_std": 0.06279782950878143, + "rewards/qatch_small_update_with_fm/mean": 0.8584296703338623, + "rewards/qatch_small_update_with_fm/std": 0.2424517422914505, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9960412979125977, + "sampling/importance_sampling_ratio/min": 0.0067655895836651325, + "sampling/sampling_logp_difference/max": 4.995905876159668, + "sampling/sampling_logp_difference/mean": 0.10526451468467712, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1889.0, + "completions/max_terminated_length": 1889.0, + "completions/mean_length": 359.8671875, + "completions/mean_terminated_length": 359.8671875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.09691363852471113, + "epoch": 0.48849557522123893, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.7103623049986925, + "learning_rate": 1e-06, + "loss": 0.0297, + "num_tokens": 137366693.0, + "reward": 0.7432616949081421, + "reward_std": 0.12966401875019073, + "rewards/qatch_small_update_with_fm/mean": 0.7432616949081421, + "rewards/qatch_small_update_with_fm/std": 0.35516712069511414, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898815751075745, + "sampling/importance_sampling_ratio/min": 0.008864426985383034, + "sampling/sampling_logp_difference/max": 4.725708961486816, + "sampling/sampling_logp_difference/mean": 0.09437304735183716, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 423.5078125, + "completions/mean_terminated_length": 409.10589599609375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.1079849824309349, + "epoch": 0.4902654867256637, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4566326790804489, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 137883335.0, + "reward": 0.8503007888793945, + "reward_std": 0.0626184344291687, + "rewards/qatch_small_update_with_fm/mean": 0.8503007888793945, + "rewards/qatch_small_update_with_fm/std": 0.26566144824028015, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9902591109275818, + "sampling/importance_sampling_ratio/min": 0.008726668544113636, + "sampling/sampling_logp_difference/max": 4.7413716316223145, + "sampling/sampling_logp_difference/mean": 0.09983956068754196, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2062.0, + "completions/max_terminated_length": 2062.0, + "completions/mean_length": 469.0859375, + "completions/mean_terminated_length": 469.0859375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.1033999565988779, + "epoch": 0.4920353982300885, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4812629100597929, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 138490221.0, + "reward": 0.7183789014816284, + "reward_std": 0.12236850708723068, + "rewards/qatch_small_update_with_fm/mean": 0.7183789014816284, + "rewards/qatch_small_update_with_fm/std": 0.37549513578414917, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927672147750854, + "sampling/importance_sampling_ratio/min": 0.003263342659920454, + "sampling/sampling_logp_difference/max": 5.725003242492676, + "sampling/sampling_logp_difference/mean": 0.09193219989538193, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2113.0, + "completions/max_terminated_length": 2113.0, + "completions/mean_length": 452.9453125, + "completions/mean_terminated_length": 452.9453125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.11535468604415655, + "epoch": 0.49380530973451325, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5075709751174705, + "learning_rate": 1e-06, + "loss": -0.0522, + "num_tokens": 139050143.0, + "reward": 0.7936406135559082, + "reward_std": 0.06707090139389038, + "rewards/qatch_small_update_with_fm/mean": 0.7936406135559082, + "rewards/qatch_small_update_with_fm/std": 0.3406023383140564, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915643930435181, + "sampling/importance_sampling_ratio/min": 0.001045701210387051, + "sampling/sampling_logp_difference/max": 6.863067626953125, + "sampling/sampling_logp_difference/mean": 0.10395675897598267, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 460.359375, + "completions/mean_terminated_length": 460.359375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.11615493427962065, + "epoch": 0.49557522123893805, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7188042550526477, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 139657435.0, + "reward": 0.7461133003234863, + "reward_std": 0.13950800895690918, + "rewards/qatch_small_update_with_fm/mean": 0.7461133003234863, + "rewards/qatch_small_update_with_fm/std": 0.38162434101104736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991545557975769, + "sampling/importance_sampling_ratio/min": 0.00027837336529046297, + "sampling/sampling_logp_difference/max": 8.18654727935791, + "sampling/sampling_logp_difference/mean": 0.10376756638288498, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2307.0, + "completions/mean_length": 416.4921875, + "completions/mean_terminated_length": 402.0627746582031, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.10716897528618574, + "epoch": 0.49734513274336284, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5756784364509373, + "learning_rate": 1e-06, + "loss": 0.0212, + "num_tokens": 140091705.0, + "reward": 0.7761679291725159, + "reward_std": 0.10864467173814774, + "rewards/qatch_small_update_with_fm/mean": 0.7761679291725159, + "rewards/qatch_small_update_with_fm/std": 0.3267165720462799, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911130666732788, + "sampling/importance_sampling_ratio/min": 3.5366774682188407e-05, + "sampling/sampling_logp_difference/max": 10.249737739562988, + "sampling/sampling_logp_difference/mean": 0.10210423916578293, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3154.0, + "completions/max_terminated_length": 3154.0, + "completions/mean_length": 418.12109375, + "completions/mean_terminated_length": 418.12109375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.09531951136887074, + "epoch": 0.49911504424778763, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4381018145886672, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 140536440.0, + "reward": 0.8234609365463257, + "reward_std": 0.005906249396502972, + "rewards/qatch_small_update_with_fm/mean": 0.8234609365463257, + "rewards/qatch_small_update_with_fm/std": 0.2951415479183197, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9938584566116333, + "sampling/importance_sampling_ratio/min": 0.008785802870988846, + "sampling/sampling_logp_difference/max": 4.734618186950684, + "sampling/sampling_logp_difference/mean": 0.08802133053541183, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2907.0, + "completions/max_terminated_length": 2907.0, + "completions/mean_length": 429.8984375, + "completions/mean_terminated_length": 429.8984375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.10019801463931799, + "epoch": 0.5008849557522124, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.499022198107272, + "learning_rate": 1e-06, + "loss": 0.0493, + "num_tokens": 141033998.0, + "reward": 0.6048046350479126, + "reward_std": 0.050070300698280334, + "rewards/qatch_small_update_with_fm/mean": 0.6048046350479126, + "rewards/qatch_small_update_with_fm/std": 0.3651096522808075, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922409653663635, + "sampling/importance_sampling_ratio/min": 0.01044392678886652, + "sampling/sampling_logp_difference/max": 4.561734676361084, + "sampling/sampling_logp_difference/mean": 0.0922735333442688, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1987.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 542.42578125, + "completions/mean_terminated_length": 542.42578125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.12973017618060112, + "epoch": 0.5026548672566372, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.48294227298363385, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 141533211.0, + "reward": 0.621808648109436, + "reward_std": 0.14083924889564514, + "rewards/qatch_small_update_with_fm/mean": 0.621808648109436, + "rewards/qatch_small_update_with_fm/std": 0.42469820380210876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9957568049430847, + "sampling/importance_sampling_ratio/min": 0.005370942875742912, + "sampling/sampling_logp_difference/max": 5.226751804351807, + "sampling/sampling_logp_difference/mean": 0.10498703271150589, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1284.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 314.375, + "completions/mean_terminated_length": 314.375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.0781171815469861, + "epoch": 0.504424778761062, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4365562931344378, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 142053163.0, + "reward": 0.7447031736373901, + "reward_std": 0.03437906131148338, + "rewards/qatch_small_update_with_fm/mean": 0.7447031736373901, + "rewards/qatch_small_update_with_fm/std": 0.382269024848938, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909601211547852, + "sampling/importance_sampling_ratio/min": 0.005499903112649918, + "sampling/sampling_logp_difference/max": 5.203024864196777, + "sampling/sampling_logp_difference/mean": 0.07915640622377396, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 367.796875, + "completions/mean_terminated_length": 367.796875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.09002258535474539, + "epoch": 0.5061946902654867, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6698101164123688, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 142605319.0, + "reward": 0.753304660320282, + "reward_std": 0.06268875300884247, + "rewards/qatch_small_update_with_fm/mean": 0.753304660320282, + "rewards/qatch_small_update_with_fm/std": 0.35868948698043823, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894113540649414, + "sampling/importance_sampling_ratio/min": 0.0004474441520869732, + "sampling/sampling_logp_difference/max": 7.711958885192871, + "sampling/sampling_logp_difference/mean": 0.09195245802402496, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1933.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 506.75, + "completions/mean_terminated_length": 506.75, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.11118471156805754, + "epoch": 0.5079646017699115, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4948140819153, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 143225463.0, + "reward": 0.7221835851669312, + "reward_std": 0.06406918913125992, + "rewards/qatch_small_update_with_fm/mean": 0.7221835851669312, + "rewards/qatch_small_update_with_fm/std": 0.3668988347053528, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9937881231307983, + "sampling/importance_sampling_ratio/min": 0.003267764812335372, + "sampling/sampling_logp_difference/max": 5.723649024963379, + "sampling/sampling_logp_difference/mean": 0.09986858069896698, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 419.1796875, + "completions/mean_terminated_length": 419.1796875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.10201077908277512, + "epoch": 0.5097345132743363, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4042234506059746, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 143610229.0, + "reward": 0.8555507659912109, + "reward_std": 0.031004978343844414, + "rewards/qatch_small_update_with_fm/mean": 0.8555507659912109, + "rewards/qatch_small_update_with_fm/std": 0.25457900762557983, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900668859481812, + "sampling/importance_sampling_ratio/min": 0.014422724954783916, + "sampling/sampling_logp_difference/max": 4.238950252532959, + "sampling/sampling_logp_difference/mean": 0.09820079803466797, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1751.0, + "completions/max_terminated_length": 1751.0, + "completions/mean_length": 458.41015625, + "completions/mean_terminated_length": 458.41015625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.10596211347728968, + "epoch": 0.511504424778761, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5988557634553278, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 144313518.0, + "reward": 0.7774180173873901, + "reward_std": 0.08853039145469666, + "rewards/qatch_small_update_with_fm/mean": 0.7774179577827454, + "rewards/qatch_small_update_with_fm/std": 0.35354262590408325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990810751914978, + "sampling/importance_sampling_ratio/min": 0.006848180666565895, + "sampling/sampling_logp_difference/max": 4.983772277832031, + "sampling/sampling_logp_difference/mean": 0.09928888082504272, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2692.0, + "completions/max_terminated_length": 2692.0, + "completions/mean_length": 521.95703125, + "completions/mean_terminated_length": 521.95703125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.11423090100288391, + "epoch": 0.5132743362831859, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6741792027668885, + "learning_rate": 1e-06, + "loss": 0.0429, + "num_tokens": 145116995.0, + "reward": 0.6444180011749268, + "reward_std": 0.12592115998268127, + "rewards/qatch_small_update_with_fm/mean": 0.6444180011749268, + "rewards/qatch_small_update_with_fm/std": 0.3701937198638916, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9935187101364136, + "sampling/importance_sampling_ratio/min": 2.6497457383811707e-07, + "sampling/sampling_logp_difference/max": 15.143631935119629, + "sampling/sampling_logp_difference/mean": 0.10046621412038803, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3026.0, + "completions/max_terminated_length": 3026.0, + "completions/mean_length": 535.671875, + "completions/mean_terminated_length": 535.671875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.10938840173184872, + "epoch": 0.5150442477876106, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5835952264770478, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 145952959.0, + "reward": 0.5978906154632568, + "reward_std": 0.13176509737968445, + "rewards/qatch_small_update_with_fm/mean": 0.5978906154632568, + "rewards/qatch_small_update_with_fm/std": 0.4031720459461212, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9926491975784302, + "sampling/importance_sampling_ratio/min": 0.004141896963119507, + "sampling/sampling_logp_difference/max": 5.48660135269165, + "sampling/sampling_logp_difference/mean": 0.1000325083732605, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4052.0, + "completions/max_terminated_length": 4052.0, + "completions/mean_length": 551.72265625, + "completions/mean_terminated_length": 551.72265625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.11647769901901484, + "epoch": 0.5168141592920354, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6162799567231806, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 146521208.0, + "reward": 0.7887499928474426, + "reward_std": 0.051286663860082626, + "rewards/qatch_small_update_with_fm/mean": 0.7887499928474426, + "rewards/qatch_small_update_with_fm/std": 0.32047656178474426, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9951774477958679, + "sampling/importance_sampling_ratio/min": 0.006804880686104298, + "sampling/sampling_logp_difference/max": 4.990115165710449, + "sampling/sampling_logp_difference/mean": 0.09928067028522491, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3249.0, + "completions/max_terminated_length": 3249.0, + "completions/mean_length": 419.51171875, + "completions/mean_terminated_length": 419.51171875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.10076918825507164, + "epoch": 0.5185840707964602, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6215460006820153, + "learning_rate": 1e-06, + "loss": 0.0632, + "num_tokens": 147232107.0, + "reward": 0.7407070398330688, + "reward_std": 0.12173819541931152, + "rewards/qatch_small_update_with_fm/mean": 0.7407070398330688, + "rewards/qatch_small_update_with_fm/std": 0.3596765995025635, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914222359657288, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.09699016809463501, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2584.0, + "completions/mean_length": 557.6328125, + "completions/mean_terminated_length": 515.6759033203125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.11698231659829617, + "epoch": 0.5203539823008849, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5086059922780932, + "learning_rate": 1e-06, + "loss": -0.0372, + "num_tokens": 147777197.0, + "reward": 0.5674062371253967, + "reward_std": 0.12426469475030899, + "rewards/qatch_small_update_with_fm/mean": 0.5674062371253967, + "rewards/qatch_small_update_with_fm/std": 0.408805787563324, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9938693642616272, + "sampling/importance_sampling_ratio/min": 0.008697181940078735, + "sampling/sampling_logp_difference/max": 4.74475622177124, + "sampling/sampling_logp_difference/mean": 0.10387073457241058, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1241.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 368.49609375, + "completions/mean_terminated_length": 368.49609375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.08287282753735781, + "epoch": 0.5221238938053098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3754465043261356, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 148209756.0, + "reward": 0.7729140520095825, + "reward_std": 0.07160256057977676, + "rewards/qatch_small_update_with_fm/mean": 0.7729140520095825, + "rewards/qatch_small_update_with_fm/std": 0.3413429260253906, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904618859291077, + "sampling/importance_sampling_ratio/min": 0.0031984634697437286, + "sampling/sampling_logp_difference/max": 5.745084762573242, + "sampling/sampling_logp_difference/mean": 0.08396244049072266, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2880.0, + "completions/max_terminated_length": 2880.0, + "completions/mean_length": 466.6328125, + "completions/mean_terminated_length": 466.6328125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.11806033086031675, + "epoch": 0.5238938053097345, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6080798073663735, + "learning_rate": 1e-06, + "loss": 0.0163, + "num_tokens": 148634110.0, + "reward": 0.7674218416213989, + "reward_std": 0.11307354271411896, + "rewards/qatch_small_update_with_fm/mean": 0.7674218416213989, + "rewards/qatch_small_update_with_fm/std": 0.3313625454902649, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990534782409668, + "sampling/importance_sampling_ratio/min": 0.011155885644257069, + "sampling/sampling_logp_difference/max": 4.495788097381592, + "sampling/sampling_logp_difference/mean": 0.10630348324775696, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1169.0, + "completions/max_terminated_length": 1169.0, + "completions/mean_length": 376.91796875, + "completions/mean_terminated_length": 376.91796875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.08900775015354156, + "epoch": 0.5256637168141592, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4087395121892628, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 149203097.0, + "reward": 0.7409297227859497, + "reward_std": 0.052058301866054535, + "rewards/qatch_small_update_with_fm/mean": 0.7409297227859497, + "rewards/qatch_small_update_with_fm/std": 0.3449692130088806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9903227686882019, + "sampling/importance_sampling_ratio/min": 0.00527548510581255, + "sampling/sampling_logp_difference/max": 5.24468469619751, + "sampling/sampling_logp_difference/mean": 0.09000498056411743, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1460.0, + "completions/max_terminated_length": 1460.0, + "completions/mean_length": 364.77734375, + "completions/mean_terminated_length": 364.77734375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.10976507421582937, + "epoch": 0.5274336283185841, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6729616231094641, + "learning_rate": 1e-06, + "loss": -0.0497, + "num_tokens": 149537952.0, + "reward": 0.5636405944824219, + "reward_std": 0.10702510178089142, + "rewards/qatch_small_update_with_fm/mean": 0.5636405944824219, + "rewards/qatch_small_update_with_fm/std": 0.3598759174346924, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9937114119529724, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.09804245829582214, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2403.0, + "completions/mean_length": 500.36328125, + "completions/mean_terminated_length": 472.0511779785156, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.12263609375804663, + "epoch": 0.5292035398230088, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4428215756207369, + "learning_rate": 1e-06, + "loss": -0.0397, + "num_tokens": 150078141.0, + "reward": 0.6905312538146973, + "reward_std": 0.05177067592740059, + "rewards/qatch_small_update_with_fm/mean": 0.6905312538146973, + "rewards/qatch_small_update_with_fm/std": 0.3768806457519531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9940615892410278, + "sampling/importance_sampling_ratio/min": 0.001947162440046668, + "sampling/sampling_logp_difference/max": 6.241382122039795, + "sampling/sampling_logp_difference/mean": 0.10560339689254761, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 312.921875, + "completions/mean_terminated_length": 312.921875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.07294352725148201, + "epoch": 0.5309734513274337, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.49703986559817703, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 150464585.0, + "reward": 0.8377617001533508, + "reward_std": 0.028467310592532158, + "rewards/qatch_small_update_with_fm/mean": 0.8377617001533508, + "rewards/qatch_small_update_with_fm/std": 0.26436394453048706, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98967045545578, + "sampling/importance_sampling_ratio/min": 0.0009882025187835097, + "sampling/sampling_logp_difference/max": 6.919622898101807, + "sampling/sampling_logp_difference/mean": 0.07985977083444595, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2166.0, + "completions/max_terminated_length": 2166.0, + "completions/mean_length": 455.06640625, + "completions/mean_terminated_length": 455.06640625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.11215604841709137, + "epoch": 0.5327433628318584, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.543172696038466, + "learning_rate": 1e-06, + "loss": -0.0341, + "num_tokens": 151168426.0, + "reward": 0.7523554563522339, + "reward_std": 0.11810556054115295, + "rewards/qatch_small_update_with_fm/mean": 0.7523554563522339, + "rewards/qatch_small_update_with_fm/std": 0.32776567339897156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9945645332336426, + "sampling/importance_sampling_ratio/min": 0.007378770504146814, + "sampling/sampling_logp_difference/max": 4.909148216247559, + "sampling/sampling_logp_difference/mean": 0.09771260619163513, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 350.12109375, + "completions/mean_terminated_length": 350.12109375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.08310944028198719, + "epoch": 0.5345132743362832, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5927217882498504, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 151669849.0, + "reward": 0.7303671836853027, + "reward_std": 0.05779767408967018, + "rewards/qatch_small_update_with_fm/mean": 0.7303671836853027, + "rewards/qatch_small_update_with_fm/std": 0.3890523314476013, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9917393922805786, + "sampling/importance_sampling_ratio/min": 0.0041347043588757515, + "sampling/sampling_logp_difference/max": 5.488339424133301, + "sampling/sampling_logp_difference/mean": 0.08336856961250305, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2110.0, + "completions/max_terminated_length": 2110.0, + "completions/mean_length": 455.5, + "completions/mean_terminated_length": 455.5, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.1370784854516387, + "epoch": 0.536283185840708, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5876568831658732, + "learning_rate": 1e-06, + "loss": 0.0535, + "num_tokens": 152257385.0, + "reward": 0.7178086042404175, + "reward_std": 0.07202421128749847, + "rewards/qatch_small_update_with_fm/mean": 0.7178086042404175, + "rewards/qatch_small_update_with_fm/std": 0.33369919657707214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918363094329834, + "sampling/importance_sampling_ratio/min": 0.0042452337220311165, + "sampling/sampling_logp_difference/max": 5.461958408355713, + "sampling/sampling_logp_difference/mean": 0.11546655744314194, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2119.0, + "completions/max_terminated_length": 2119.0, + "completions/mean_length": 434.015625, + "completions/mean_terminated_length": 434.015625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.09734268207103014, + "epoch": 0.5380530973451327, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6656086614188875, + "learning_rate": 1e-06, + "loss": -0.0074, + "num_tokens": 152871565.0, + "reward": 0.8256992101669312, + "reward_std": 0.07486239075660706, + "rewards/qatch_small_update_with_fm/mean": 0.8256992101669312, + "rewards/qatch_small_update_with_fm/std": 0.2639926075935364, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946987628936768, + "sampling/importance_sampling_ratio/min": 0.0032207805197685957, + "sampling/sampling_logp_difference/max": 5.738131523132324, + "sampling/sampling_logp_difference/mean": 0.08409319072961807, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 337.96875, + "completions/mean_terminated_length": 337.96875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.08514279685914516, + "epoch": 0.5398230088495575, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3134650372865385, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 153296357.0, + "reward": 0.7113710641860962, + "reward_std": 0.05351608246564865, + "rewards/qatch_small_update_with_fm/mean": 0.7113710641860962, + "rewards/qatch_small_update_with_fm/std": 0.3926786184310913, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9917937517166138, + "sampling/importance_sampling_ratio/min": 0.008679499849677086, + "sampling/sampling_logp_difference/max": 4.746791362762451, + "sampling/sampling_logp_difference/mean": 0.08514834940433502, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2561.0, + "completions/max_terminated_length": 2561.0, + "completions/mean_length": 434.58984375, + "completions/mean_terminated_length": 434.58984375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.1213516229763627, + "epoch": 0.5415929203539823, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6627001328299444, + "learning_rate": 1e-06, + "loss": -0.0688, + "num_tokens": 153921036.0, + "reward": 0.7097538709640503, + "reward_std": 0.14472410082817078, + "rewards/qatch_small_update_with_fm/mean": 0.7097538709640503, + "rewards/qatch_small_update_with_fm/std": 0.37847304344177246, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988747239112854, + "sampling/importance_sampling_ratio/min": 0.0025193877518177032, + "sampling/sampling_logp_difference/max": 5.983739376068115, + "sampling/sampling_logp_difference/mean": 0.11412816494703293, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1770.0, + "completions/max_terminated_length": 1770.0, + "completions/mean_length": 465.35546875, + "completions/mean_terminated_length": 465.35546875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.12086922209709883, + "epoch": 0.5433628318584071, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.64253124673734, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 154509207.0, + "reward": 0.8136289119720459, + "reward_std": 0.10231994092464447, + "rewards/qatch_small_update_with_fm/mean": 0.8136289119720459, + "rewards/qatch_small_update_with_fm/std": 0.34205469489097595, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925813674926758, + "sampling/importance_sampling_ratio/min": 0.011445293202996254, + "sampling/sampling_logp_difference/max": 4.470176696777344, + "sampling/sampling_logp_difference/mean": 0.10290302336215973, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2870.0, + "completions/max_terminated_length": 2870.0, + "completions/mean_length": 498.5859375, + "completions/mean_terminated_length": 498.5859375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.12368575483560562, + "epoch": 0.5451327433628319, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5271043034237322, + "learning_rate": 1e-06, + "loss": 0.035, + "num_tokens": 155121037.0, + "reward": 0.7560000419616699, + "reward_std": 0.12761875987052917, + "rewards/qatch_small_update_with_fm/mean": 0.7560000419616699, + "rewards/qatch_small_update_with_fm/std": 0.35406094789505005, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9949927926063538, + "sampling/importance_sampling_ratio/min": 0.008679255843162537, + "sampling/sampling_logp_difference/max": 4.746819496154785, + "sampling/sampling_logp_difference/mean": 0.1031837910413742, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3090.0, + "completions/max_terminated_length": 3090.0, + "completions/mean_length": 502.54296875, + "completions/mean_terminated_length": 502.54296875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.1271666856482625, + "epoch": 0.5469026548672566, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6756247756423795, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 155601320.0, + "reward": 0.6903945207595825, + "reward_std": 0.057952310889959335, + "rewards/qatch_small_update_with_fm/mean": 0.6903945207595825, + "rewards/qatch_small_update_with_fm/std": 0.3617255687713623, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.997051477432251, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.10001347959041595, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2047.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 417.25390625, + "completions/mean_terminated_length": 417.25390625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.11392358969897032, + "epoch": 0.5486725663716814, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.7202387252863824, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 156132649.0, + "reward": 0.7270429730415344, + "reward_std": 0.1510547250509262, + "rewards/qatch_small_update_with_fm/mean": 0.7270429730415344, + "rewards/qatch_small_update_with_fm/std": 0.38717004656791687, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9941946268081665, + "sampling/importance_sampling_ratio/min": 0.007314593065530062, + "sampling/sampling_logp_difference/max": 4.91788387298584, + "sampling/sampling_logp_difference/mean": 0.09563559293746948, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2262.0, + "completions/mean_length": 441.7734375, + "completions/mean_terminated_length": 427.44317626953125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.11410268303006887, + "epoch": 0.5504424778761062, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5607862617554898, + "learning_rate": 1e-06, + "loss": 0.023, + "num_tokens": 156621343.0, + "reward": 0.8368203043937683, + "reward_std": 0.10923705250024796, + "rewards/qatch_small_update_with_fm/mean": 0.8368203043937683, + "rewards/qatch_small_update_with_fm/std": 0.28346678614616394, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9933780431747437, + "sampling/importance_sampling_ratio/min": 0.011173656210303307, + "sampling/sampling_logp_difference/max": 4.49419641494751, + "sampling/sampling_logp_difference/mean": 0.09727410227060318, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2165.0, + "completions/max_terminated_length": 2165.0, + "completions/mean_length": 433.5078125, + "completions/mean_terminated_length": 433.5078125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.11591598391532898, + "epoch": 0.552212389380531, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5845116761011263, + "learning_rate": 1e-06, + "loss": 0.0148, + "num_tokens": 157069281.0, + "reward": 0.8090898394584656, + "reward_std": 0.0635581985116005, + "rewards/qatch_small_update_with_fm/mean": 0.8090898394584656, + "rewards/qatch_small_update_with_fm/std": 0.3227374851703644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9951672554016113, + "sampling/importance_sampling_ratio/min": 0.006770285312086344, + "sampling/sampling_logp_difference/max": 4.995212078094482, + "sampling/sampling_logp_difference/mean": 0.09658536314964294, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1814.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 446.53125, + "completions/mean_terminated_length": 446.53125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.1346312901005149, + "epoch": 0.5539823008849557, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5379089439997566, + "learning_rate": 1e-06, + "loss": 0.0546, + "num_tokens": 157421465.0, + "reward": 0.7145351767539978, + "reward_std": 0.0909833312034607, + "rewards/qatch_small_update_with_fm/mean": 0.714535117149353, + "rewards/qatch_small_update_with_fm/std": 0.37304043769836426, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946333169937134, + "sampling/importance_sampling_ratio/min": 0.012340717017650604, + "sampling/sampling_logp_difference/max": 4.394851207733154, + "sampling/sampling_logp_difference/mean": 0.10755407065153122, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 378.3203125, + "completions/mean_terminated_length": 378.3203125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.1214052801951766, + "epoch": 0.5557522123893806, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6558845846189386, + "learning_rate": 1e-06, + "loss": -0.0369, + "num_tokens": 157899307.0, + "reward": 0.7183281183242798, + "reward_std": 0.053716663271188736, + "rewards/qatch_small_update_with_fm/mean": 0.7183281183242798, + "rewards/qatch_small_update_with_fm/std": 0.340137779712677, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9941397309303284, + "sampling/importance_sampling_ratio/min": 0.0022659660317003727, + "sampling/sampling_logp_difference/max": 6.089754104614258, + "sampling/sampling_logp_difference/mean": 0.10012153536081314, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1227.0, + "completions/max_terminated_length": 1227.0, + "completions/mean_length": 384.48828125, + "completions/mean_terminated_length": 384.48828125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.11504952795803547, + "epoch": 0.5575221238938053, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6495225945273754, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 158448664.0, + "reward": 0.8267968893051147, + "reward_std": 0.11277540028095245, + "rewards/qatch_small_update_with_fm/mean": 0.8267968893051147, + "rewards/qatch_small_update_with_fm/std": 0.2775137722492218, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9939453601837158, + "sampling/importance_sampling_ratio/min": 0.006800443399697542, + "sampling/sampling_logp_difference/max": 4.990767478942871, + "sampling/sampling_logp_difference/mean": 0.0972718894481659, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1794.0, + "completions/max_terminated_length": 1794.0, + "completions/mean_length": 391.35546875, + "completions/mean_terminated_length": 391.35546875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.12797002494335175, + "epoch": 0.5592920353982301, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.66959273180804, + "learning_rate": 1e-06, + "loss": 0.023, + "num_tokens": 159075987.0, + "reward": 0.7723633050918579, + "reward_std": 0.09683510661125183, + "rewards/qatch_small_update_with_fm/mean": 0.7723633050918579, + "rewards/qatch_small_update_with_fm/std": 0.31531158089637756, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9960798025131226, + "sampling/importance_sampling_ratio/min": 0.0006279381923377514, + "sampling/sampling_logp_difference/max": 7.373068809509277, + "sampling/sampling_logp_difference/mean": 0.10363449156284332, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 418.4375, + "completions/mean_terminated_length": 418.4375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.12645841762423515, + "epoch": 0.5610619469026549, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.43227915668177896, + "learning_rate": 1e-06, + "loss": -0.0158, + "num_tokens": 159643251.0, + "reward": 0.8227812647819519, + "reward_std": 0.049158066511154175, + "rewards/qatch_small_update_with_fm/mean": 0.8227812647819519, + "rewards/qatch_small_update_with_fm/std": 0.31986841559410095, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9950623512268066, + "sampling/importance_sampling_ratio/min": 0.008687077090144157, + "sampling/sampling_logp_difference/max": 4.7459187507629395, + "sampling/sampling_logp_difference/mean": 0.10484028607606888, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2744.0, + "completions/mean_length": 489.4453125, + "completions/mean_terminated_length": 475.302001953125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.13590082712471485, + "epoch": 0.5628318584070796, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5538138640244875, + "learning_rate": 1e-06, + "loss": -0.0412, + "num_tokens": 160179285.0, + "reward": 0.7630391120910645, + "reward_std": 0.09547914564609528, + "rewards/qatch_small_update_with_fm/mean": 0.7630391120910645, + "rewards/qatch_small_update_with_fm/std": 0.35770267248153687, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000463724136353, + "sampling/importance_sampling_ratio/min": 0.005264458246529102, + "sampling/sampling_logp_difference/max": 5.246777057647705, + "sampling/sampling_logp_difference/mean": 0.10617281496524811, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2257.0, + "completions/max_terminated_length": 2257.0, + "completions/mean_length": 539.88671875, + "completions/mean_terminated_length": 539.88671875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.13833912461996078, + "epoch": 0.5646017699115045, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6595395533621345, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 160704968.0, + "reward": 0.6066679954528809, + "reward_std": 0.08918474614620209, + "rewards/qatch_small_update_with_fm/mean": 0.6066679954528809, + "rewards/qatch_small_update_with_fm/std": 0.37395772337913513, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9956793785095215, + "sampling/importance_sampling_ratio/min": 0.0030901748687028885, + "sampling/sampling_logp_difference/max": 5.77952766418457, + "sampling/sampling_logp_difference/mean": 0.10660675168037415, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1290.0, + "completions/max_terminated_length": 1290.0, + "completions/mean_length": 320.67578125, + "completions/mean_terminated_length": 320.67578125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.09042697865515947, + "epoch": 0.5663716814159292, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6116439453305005, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 161201829.0, + "reward": 0.884960949420929, + "reward_std": 0.04503811523318291, + "rewards/qatch_small_update_with_fm/mean": 0.884960949420929, + "rewards/qatch_small_update_with_fm/std": 0.288003534078598, + "sampling/importance_sampling_ratio/max": 1.8726849555969238, + "sampling/importance_sampling_ratio/mean": 0.9918411374092102, + "sampling/importance_sampling_ratio/min": 0.00045137188863009214, + "sampling/sampling_logp_difference/max": 7.703218936920166, + "sampling/sampling_logp_difference/mean": 0.08345195651054382, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2957.0, + "completions/max_terminated_length": 2957.0, + "completions/mean_length": 560.984375, + "completions/mean_terminated_length": 560.984375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.1479995846748352, + "epoch": 0.5681415929203539, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4996635931610896, + "learning_rate": 1e-06, + "loss": 0.0588, + "num_tokens": 161929473.0, + "reward": 0.6543202996253967, + "reward_std": 0.1047203540802002, + "rewards/qatch_small_update_with_fm/mean": 0.6543203592300415, + "rewards/qatch_small_update_with_fm/std": 0.39747729897499084, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9977031946182251, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.11383165419101715, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1232.0, + "completions/mean_length": 440.65234375, + "completions/mean_terminated_length": 426.3176574707031, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.11984299309551716, + "epoch": 0.5699115044247788, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5934492923234808, + "learning_rate": 1e-06, + "loss": -0.0495, + "num_tokens": 162480056.0, + "reward": 0.7722031474113464, + "reward_std": 0.11908012628555298, + "rewards/qatch_small_update_with_fm/mean": 0.7722031474113464, + "rewards/qatch_small_update_with_fm/std": 0.3386456072330475, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946287870407104, + "sampling/importance_sampling_ratio/min": 0.01118414755910635, + "sampling/sampling_logp_difference/max": 4.493257999420166, + "sampling/sampling_logp_difference/mean": 0.0980626791715622, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2773.0, + "completions/max_terminated_length": 2773.0, + "completions/mean_length": 461.82421875, + "completions/mean_terminated_length": 461.82421875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.136813223361969, + "epoch": 0.5716814159292035, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5608859874498612, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 163002811.0, + "reward": 0.8026015758514404, + "reward_std": 0.05060504376888275, + "rewards/qatch_small_update_with_fm/mean": 0.8026015758514404, + "rewards/qatch_small_update_with_fm/std": 0.33220797777175903, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9989163875579834, + "sampling/importance_sampling_ratio/min": 2.4591246983618475e-05, + "sampling/sampling_logp_difference/max": 10.613120079040527, + "sampling/sampling_logp_difference/mean": 0.10465744137763977, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1303.0, + "completions/max_terminated_length": 1303.0, + "completions/mean_length": 368.53515625, + "completions/mean_terminated_length": 368.53515625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.10137500800192356, + "epoch": 0.5734513274336284, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5441647449845531, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 163449396.0, + "reward": 0.821164071559906, + "reward_std": 0.044571854174137115, + "rewards/qatch_small_update_with_fm/mean": 0.821164071559906, + "rewards/qatch_small_update_with_fm/std": 0.31781241297721863, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9920123815536499, + "sampling/importance_sampling_ratio/min": 4.6803124860161915e-05, + "sampling/sampling_logp_difference/max": 9.969560623168945, + "sampling/sampling_logp_difference/mean": 0.09228494018316269, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2963.0, + "completions/mean_length": 453.5234375, + "completions/mean_terminated_length": 439.2392272949219, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.14200774021446705, + "epoch": 0.5752212389380531, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5266961980954568, + "learning_rate": 1e-06, + "loss": -0.0445, + "num_tokens": 163966026.0, + "reward": 0.7711563110351562, + "reward_std": 0.09890992194414139, + "rewards/qatch_small_update_with_fm/mean": 0.7711562514305115, + "rewards/qatch_small_update_with_fm/std": 0.3402455151081085, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9953124523162842, + "sampling/importance_sampling_ratio/min": 0.0011746156960725784, + "sampling/sampling_logp_difference/max": 6.746814250946045, + "sampling/sampling_logp_difference/mean": 0.11260481923818588, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 398.9296875, + "completions/mean_terminated_length": 398.9296875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.12197424191981554, + "epoch": 0.5769911504424778, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.35093369253212775, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 164455704.0, + "reward": 0.6438866853713989, + "reward_std": 0.04379892349243164, + "rewards/qatch_small_update_with_fm/mean": 0.6438866853713989, + "rewards/qatch_small_update_with_fm/std": 0.3767320513725281, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9938642382621765, + "sampling/importance_sampling_ratio/min": 0.012557579204440117, + "sampling/sampling_logp_difference/max": 4.3774309158325195, + "sampling/sampling_logp_difference/mean": 0.10210156440734863, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2057.0, + "completions/max_terminated_length": 2057.0, + "completions/mean_length": 487.40625, + "completions/mean_terminated_length": 487.40625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.12392527237534523, + "epoch": 0.5787610619469027, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4301567888730242, + "learning_rate": 1e-06, + "loss": -0.017, + "num_tokens": 165043888.0, + "reward": 0.6832578182220459, + "reward_std": 0.07006269693374634, + "rewards/qatch_small_update_with_fm/mean": 0.6832578182220459, + "rewards/qatch_small_update_with_fm/std": 0.3451367914676666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9974828958511353, + "sampling/importance_sampling_ratio/min": 0.003241075435653329, + "sampling/sampling_logp_difference/max": 5.7318501472473145, + "sampling/sampling_logp_difference/mean": 0.09849071502685547, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3916.0, + "completions/max_terminated_length": 3916.0, + "completions/mean_length": 562.7265625, + "completions/mean_terminated_length": 562.7265625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.12135390192270279, + "epoch": 0.5805309734513274, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.46934700215857733, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 165710922.0, + "reward": 0.6851679682731628, + "reward_std": 0.09362904727458954, + "rewards/qatch_small_update_with_fm/mean": 0.6851679682731628, + "rewards/qatch_small_update_with_fm/std": 0.3390476405620575, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9955009818077087, + "sampling/importance_sampling_ratio/min": 0.00046856250264681876, + "sampling/sampling_logp_difference/max": 7.665841102600098, + "sampling/sampling_logp_difference/mean": 0.0948438048362732, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 388.74609375, + "completions/mean_terminated_length": 388.74609375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.11189944110810757, + "epoch": 0.5823008849557522, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6534054957023212, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 166287353.0, + "reward": 0.8831367492675781, + "reward_std": 0.04500850662589073, + "rewards/qatch_small_update_with_fm/mean": 0.8831367492675781, + "rewards/qatch_small_update_with_fm/std": 0.2723407745361328, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9937008619308472, + "sampling/importance_sampling_ratio/min": 0.01839052513241768, + "sampling/sampling_logp_difference/max": 3.995919704437256, + "sampling/sampling_logp_difference/mean": 0.09178143739700317, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2535.0, + "completions/max_terminated_length": 2535.0, + "completions/mean_length": 481.04296875, + "completions/mean_terminated_length": 481.04296875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.13939405418932438, + "epoch": 0.584070796460177, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6137833889894624, + "learning_rate": 1e-06, + "loss": 0.0451, + "num_tokens": 166844948.0, + "reward": 0.6526992321014404, + "reward_std": 0.10534843057394028, + "rewards/qatch_small_update_with_fm/mean": 0.6526992321014404, + "rewards/qatch_small_update_with_fm/std": 0.3459371030330658, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9961899518966675, + "sampling/importance_sampling_ratio/min": 0.0015783263370394707, + "sampling/sampling_logp_difference/max": 6.451390266418457, + "sampling/sampling_logp_difference/mean": 0.11320600658655167, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1084.0, + "completions/max_terminated_length": 1084.0, + "completions/mean_length": 351.69140625, + "completions/mean_terminated_length": 351.69140625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.10876735299825668, + "epoch": 0.5858407079646017, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5578310759986786, + "learning_rate": 1e-06, + "loss": 0.0591, + "num_tokens": 167356421.0, + "reward": 0.7452616691589355, + "reward_std": 0.10783276706933975, + "rewards/qatch_small_update_with_fm/mean": 0.7452616691589355, + "rewards/qatch_small_update_with_fm/std": 0.32912948727607727, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922351241111755, + "sampling/importance_sampling_ratio/min": 0.009870722889900208, + "sampling/sampling_logp_difference/max": 4.618182182312012, + "sampling/sampling_logp_difference/mean": 0.09559471905231476, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1561.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 375.47265625, + "completions/mean_terminated_length": 375.47265625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.12615725863724947, + "epoch": 0.5876106194690266, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5005707404307401, + "learning_rate": 1e-06, + "loss": -0.0222, + "num_tokens": 167926062.0, + "reward": 0.6700546741485596, + "reward_std": 0.07864837348461151, + "rewards/qatch_small_update_with_fm/mean": 0.6700546741485596, + "rewards/qatch_small_update_with_fm/std": 0.4005128741264343, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9930409789085388, + "sampling/importance_sampling_ratio/min": 0.0007163186674006283, + "sampling/sampling_logp_difference/max": 7.241385459899902, + "sampling/sampling_logp_difference/mean": 0.10649150609970093, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2464.0, + "completions/max_terminated_length": 2464.0, + "completions/mean_length": 492.69921875, + "completions/mean_terminated_length": 492.69921875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.12994778994470835, + "epoch": 0.5893805309734513, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.462068641561627, + "learning_rate": 1e-06, + "loss": -0.0203, + "num_tokens": 168363249.0, + "reward": 0.6787070035934448, + "reward_std": 0.12104913592338562, + "rewards/qatch_small_update_with_fm/mean": 0.6787070035934448, + "rewards/qatch_small_update_with_fm/std": 0.3600393533706665, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9959803223609924, + "sampling/importance_sampling_ratio/min": 0.014395722188055515, + "sampling/sampling_logp_difference/max": 4.240824222564697, + "sampling/sampling_logp_difference/mean": 0.10011019557714462, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3668.0, + "completions/max_terminated_length": 3668.0, + "completions/mean_length": 610.2109375, + "completions/mean_terminated_length": 610.2109375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.14918566308915615, + "epoch": 0.5911504424778761, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4833571863040555, + "learning_rate": 1e-06, + "loss": -0.0226, + "num_tokens": 168936967.0, + "reward": 0.6901054978370667, + "reward_std": 0.12324898689985275, + "rewards/qatch_small_update_with_fm/mean": 0.6901054978370667, + "rewards/qatch_small_update_with_fm/std": 0.4031791090965271, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998165369033813, + "sampling/importance_sampling_ratio/min": 0.018390534445643425, + "sampling/sampling_logp_difference/max": 3.9959192276000977, + "sampling/sampling_logp_difference/mean": 0.10862861573696136, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 476.22265625, + "completions/mean_terminated_length": 476.22265625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.14073671028017998, + "epoch": 0.5929203539823009, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4452770871121166, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 169395120.0, + "reward": 0.8413281440734863, + "reward_std": 0.054899152368307114, + "rewards/qatch_small_update_with_fm/mean": 0.8413281440734863, + "rewards/qatch_small_update_with_fm/std": 0.2942917048931122, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9945122599601746, + "sampling/importance_sampling_ratio/min": 0.001602180185727775, + "sampling/sampling_logp_difference/max": 6.436389923095703, + "sampling/sampling_logp_difference/mean": 0.11215078830718994, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2302.0, + "completions/max_terminated_length": 2302.0, + "completions/mean_length": 470.0546875, + "completions/mean_terminated_length": 470.0546875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.13123032171279192, + "epoch": 0.5946902654867257, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4413668770939599, + "learning_rate": 1e-06, + "loss": 0.0154, + "num_tokens": 169960782.0, + "reward": 0.7925976514816284, + "reward_std": 0.06153729930520058, + "rewards/qatch_small_update_with_fm/mean": 0.7925976514816284, + "rewards/qatch_small_update_with_fm/std": 0.32960665225982666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9963173866271973, + "sampling/importance_sampling_ratio/min": 5.2583640353986993e-05, + "sampling/sampling_logp_difference/max": 9.853105545043945, + "sampling/sampling_logp_difference/mean": 0.1045021116733551, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1202.0, + "completions/max_terminated_length": 1202.0, + "completions/mean_length": 375.79296875, + "completions/mean_terminated_length": 375.79296875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.12677610013633966, + "epoch": 0.5964601769911504, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5472979749360979, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 170411641.0, + "reward": 0.70751953125, + "reward_std": 0.0590687096118927, + "rewards/qatch_small_update_with_fm/mean": 0.70751953125, + "rewards/qatch_small_update_with_fm/std": 0.3998953700065613, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946392774581909, + "sampling/importance_sampling_ratio/min": 0.005264220293611288, + "sampling/sampling_logp_difference/max": 5.246822357177734, + "sampling/sampling_logp_difference/mean": 0.10403802245855331, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 440.74609375, + "completions/mean_terminated_length": 426.41180419921875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.11007914505898952, + "epoch": 0.5982300884955752, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4332959024599203, + "learning_rate": 1e-06, + "loss": -0.0356, + "num_tokens": 170886616.0, + "reward": 0.7666601538658142, + "reward_std": 0.030690979212522507, + "rewards/qatch_small_update_with_fm/mean": 0.7666601538658142, + "rewards/qatch_small_update_with_fm/std": 0.3420563340187073, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9921419024467468, + "sampling/importance_sampling_ratio/min": 0.004177570343017578, + "sampling/sampling_logp_difference/max": 5.478025436401367, + "sampling/sampling_logp_difference/mean": 0.09689275920391083, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1560.0, + "completions/mean_length": 480.60546875, + "completions/mean_terminated_length": 466.427490234375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.13167188875377178, + "epoch": 0.6, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.41572721343676217, + "learning_rate": 1e-06, + "loss": -0.0408, + "num_tokens": 171562403.0, + "reward": 0.7634413838386536, + "reward_std": 0.06147729605436325, + "rewards/qatch_small_update_with_fm/mean": 0.7634413838386536, + "rewards/qatch_small_update_with_fm/std": 0.30900147557258606, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9971693754196167, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.10109194368124008, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1451.0, + "completions/max_terminated_length": 1451.0, + "completions/mean_length": 443.70703125, + "completions/mean_terminated_length": 443.70703125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.12869115360081196, + "epoch": 0.6017699115044248, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4337358298816994, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 172183448.0, + "reward": 0.6481367349624634, + "reward_std": 0.09045610576868057, + "rewards/qatch_small_update_with_fm/mean": 0.6481367349624634, + "rewards/qatch_small_update_with_fm/std": 0.3557817041873932, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.993698239326477, + "sampling/importance_sampling_ratio/min": 0.011156954802572727, + "sampling/sampling_logp_difference/max": 4.495692253112793, + "sampling/sampling_logp_difference/mean": 0.10702698677778244, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2873.0, + "completions/max_terminated_length": 2873.0, + "completions/mean_length": 449.98046875, + "completions/mean_terminated_length": 449.98046875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.13718800246715546, + "epoch": 0.6035398230088496, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3991429375228678, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 172828595.0, + "reward": 0.8131992220878601, + "reward_std": 0.03861326724290848, + "rewards/qatch_small_update_with_fm/mean": 0.8131991624832153, + "rewards/qatch_small_update_with_fm/std": 0.3113223612308502, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9982845783233643, + "sampling/importance_sampling_ratio/min": 0.011312464252114296, + "sampling/sampling_logp_difference/max": 4.4818501472473145, + "sampling/sampling_logp_difference/mean": 0.10661160200834274, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3680.0, + "completions/max_terminated_length": 3680.0, + "completions/mean_length": 553.42578125, + "completions/mean_terminated_length": 553.42578125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.13830072805285454, + "epoch": 0.6053097345132743, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5307300561999599, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 173347328.0, + "reward": 0.7857421636581421, + "reward_std": 0.12376996129751205, + "rewards/qatch_small_update_with_fm/mean": 0.7857421636581421, + "rewards/qatch_small_update_with_fm/std": 0.3143363893032074, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.998528003692627, + "sampling/importance_sampling_ratio/min": 0.008697294630110264, + "sampling/sampling_logp_difference/max": 4.744743347167969, + "sampling/sampling_logp_difference/mean": 0.10505607724189758, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1794.0, + "completions/mean_length": 583.9375, + "completions/mean_terminated_length": 570.1647338867188, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.13550461642444134, + "epoch": 0.6070796460176991, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6604555736712372, + "learning_rate": 1e-06, + "loss": -0.0578, + "num_tokens": 173950400.0, + "reward": 0.710085928440094, + "reward_std": 0.14106522500514984, + "rewards/qatch_small_update_with_fm/mean": 0.710085928440094, + "rewards/qatch_small_update_with_fm/std": 0.34384334087371826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9940401315689087, + "sampling/importance_sampling_ratio/min": 0.0029973003547638655, + "sampling/sampling_logp_difference/max": 5.8100433349609375, + "sampling/sampling_logp_difference/mean": 0.10811758041381836, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3297.0, + "completions/max_terminated_length": 3297.0, + "completions/mean_length": 553.1171875, + "completions/mean_terminated_length": 553.1171875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.12964780256152153, + "epoch": 0.6088495575221239, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4294082866965215, + "learning_rate": 1e-06, + "loss": 0.0217, + "num_tokens": 174738366.0, + "reward": 0.669113278388977, + "reward_std": 0.07132252305746078, + "rewards/qatch_small_update_with_fm/mean": 0.669113278388977, + "rewards/qatch_small_update_with_fm/std": 0.3960440456867218, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9974240064620972, + "sampling/importance_sampling_ratio/min": 0.011154402047395706, + "sampling/sampling_logp_difference/max": 4.4959211349487305, + "sampling/sampling_logp_difference/mean": 0.0989176407456398, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2325.0, + "completions/max_terminated_length": 2325.0, + "completions/mean_length": 496.5703125, + "completions/mean_terminated_length": 496.5703125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.1358927357941866, + "epoch": 0.6106194690265486, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5797395111181823, + "learning_rate": 1e-06, + "loss": -0.0251, + "num_tokens": 175206480.0, + "reward": 0.7895351648330688, + "reward_std": 0.13384278118610382, + "rewards/qatch_small_update_with_fm/mean": 0.7895351648330688, + "rewards/qatch_small_update_with_fm/std": 0.3367822766304016, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9962368011474609, + "sampling/importance_sampling_ratio/min": 0.003333539701998234, + "sampling/sampling_logp_difference/max": 5.703720569610596, + "sampling/sampling_logp_difference/mean": 0.10676106810569763, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3889.0, + "completions/mean_length": 659.578125, + "completions/mean_terminated_length": 605.0317993164062, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.16065522283315659, + "epoch": 0.6123893805309735, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.36081780370740074, + "learning_rate": 1e-06, + "loss": -0.0681, + "num_tokens": 175694948.0, + "reward": 0.8195507526397705, + "reward_std": 0.07761920243501663, + "rewards/qatch_small_update_with_fm/mean": 0.8195507526397705, + "rewards/qatch_small_update_with_fm/std": 0.339427649974823, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0015735626220703, + "sampling/importance_sampling_ratio/min": 0.005800637882202864, + "sampling/sampling_logp_difference/max": 5.149787425994873, + "sampling/sampling_logp_difference/mean": 0.1185375228524208, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1485.0, + "completions/max_terminated_length": 1485.0, + "completions/mean_length": 381.00390625, + "completions/mean_terminated_length": 381.00390625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.11013178713619709, + "epoch": 0.6141592920353982, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5357087422630683, + "learning_rate": 1e-06, + "loss": 0.0389, + "num_tokens": 176272869.0, + "reward": 0.8548632860183716, + "reward_std": 0.0601215660572052, + "rewards/qatch_small_update_with_fm/mean": 0.8548632860183716, + "rewards/qatch_small_update_with_fm/std": 0.3249365985393524, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992617666721344, + "sampling/importance_sampling_ratio/min": 0.011489858850836754, + "sampling/sampling_logp_difference/max": 4.466290473937988, + "sampling/sampling_logp_difference/mean": 0.09646125137805939, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4025.0, + "completions/mean_length": 812.734375, + "completions/mean_terminated_length": 799.85888671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.20528638921678066, + "epoch": 0.6159292035398231, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.4169959713473217, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 176961297.0, + "reward": 0.5849062204360962, + "reward_std": 0.1079559475183487, + "rewards/qatch_small_update_with_fm/mean": 0.5849062204360962, + "rewards/qatch_small_update_with_fm/std": 0.3221876919269562, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.005646824836731, + "sampling/importance_sampling_ratio/min": 4.014266960439272e-05, + "sampling/sampling_logp_difference/max": 10.12307071685791, + "sampling/sampling_logp_difference/mean": 0.13561221957206726, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2089.0, + "completions/mean_length": 563.3828125, + "completions/mean_terminated_length": 549.5294189453125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.137452632188797, + "epoch": 0.6176991150442478, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5080168937055006, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 177646515.0, + "reward": 0.7648085951805115, + "reward_std": 0.14244134724140167, + "rewards/qatch_small_update_with_fm/mean": 0.7648086547851562, + "rewards/qatch_small_update_with_fm/std": 0.31815099716186523, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9965197443962097, + "sampling/importance_sampling_ratio/min": 0.010515945963561535, + "sampling/sampling_logp_difference/max": 4.5548624992370605, + "sampling/sampling_logp_difference/mean": 0.10763923823833466, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2304.0, + "completions/max_terminated_length": 2304.0, + "completions/mean_length": 504.24609375, + "completions/mean_terminated_length": 504.24609375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.14669188857078552, + "epoch": 0.6194690265486725, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.46991361682403926, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 178274386.0, + "reward": 0.7633749842643738, + "reward_std": 0.0714079737663269, + "rewards/qatch_small_update_with_fm/mean": 0.7633749842643738, + "rewards/qatch_small_update_with_fm/std": 0.3283040523529053, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9949438571929932, + "sampling/importance_sampling_ratio/min": 0.005583168473094702, + "sampling/sampling_logp_difference/max": 5.1879987716674805, + "sampling/sampling_logp_difference/mean": 0.11546455323696136, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2115.0, + "completions/max_terminated_length": 2115.0, + "completions/mean_length": 597.47265625, + "completions/mean_terminated_length": 597.47265625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.15416298992931843, + "epoch": 0.6212389380530974, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4550888098901701, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 178902139.0, + "reward": 0.7233320474624634, + "reward_std": 0.09315317869186401, + "rewards/qatch_small_update_with_fm/mean": 0.7233320474624634, + "rewards/qatch_small_update_with_fm/std": 0.38610121607780457, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9959432482719421, + "sampling/importance_sampling_ratio/min": 0.0018164917128160596, + "sampling/sampling_logp_difference/max": 6.310848236083984, + "sampling/sampling_logp_difference/mean": 0.11840873956680298, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3230.0, + "completions/max_terminated_length": 3230.0, + "completions/mean_length": 491.67578125, + "completions/mean_terminated_length": 491.67578125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.17086774297058582, + "epoch": 0.6230088495575221, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.41309780113542105, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 179407928.0, + "reward": 0.8434453010559082, + "reward_std": 0.09375521540641785, + "rewards/qatch_small_update_with_fm/mean": 0.8434453010559082, + "rewards/qatch_small_update_with_fm/std": 0.25720974802970886, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9986539483070374, + "sampling/importance_sampling_ratio/min": 0.0020053053740411997, + "sampling/sampling_logp_difference/max": 6.211958885192871, + "sampling/sampling_logp_difference/mean": 0.12576082348823547, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 548.05078125, + "completions/mean_terminated_length": 548.05078125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.15240298677235842, + "epoch": 0.6247787610619469, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4873828861345403, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 179888837.0, + "reward": 0.6651484370231628, + "reward_std": 0.07561331987380981, + "rewards/qatch_small_update_with_fm/mean": 0.6651483774185181, + "rewards/qatch_small_update_with_fm/std": 0.39603519439697266, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9961607456207275, + "sampling/importance_sampling_ratio/min": 0.002665693871676922, + "sampling/sampling_logp_difference/max": 5.927290916442871, + "sampling/sampling_logp_difference/mean": 0.11823346465826035, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1732.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 412.3203125, + "completions/mean_terminated_length": 412.3203125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.12977661192417145, + "epoch": 0.6265486725663717, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.320145515397183, + "learning_rate": 1e-06, + "loss": -0.0142, + "num_tokens": 180399383.0, + "reward": 0.9061718583106995, + "reward_std": 0.0416874997317791, + "rewards/qatch_small_update_with_fm/mean": 0.9061718583106995, + "rewards/qatch_small_update_with_fm/std": 0.20859557390213013, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925155639648438, + "sampling/importance_sampling_ratio/min": 0.01432258915156126, + "sampling/sampling_logp_difference/max": 4.245917320251465, + "sampling/sampling_logp_difference/mean": 0.11004288494586945, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1239.0, + "completions/max_terminated_length": 1239.0, + "completions/mean_length": 414.60546875, + "completions/mean_terminated_length": 414.60546875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.1346904393285513, + "epoch": 0.6283185840707964, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.47310823738569224, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 180899346.0, + "reward": 0.8244647979736328, + "reward_std": 0.08044957369565964, + "rewards/qatch_small_update_with_fm/mean": 0.8244647979736328, + "rewards/qatch_small_update_with_fm/std": 0.3035234212875366, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934406876564026, + "sampling/importance_sampling_ratio/min": 0.012023287825286388, + "sampling/sampling_logp_difference/max": 4.420909881591797, + "sampling/sampling_logp_difference/mean": 0.10992369055747986, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1923.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 498.35546875, + "completions/mean_terminated_length": 498.35546875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.13894802145659924, + "epoch": 0.6300884955752213, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5291274570939486, + "learning_rate": 1e-06, + "loss": 0.0486, + "num_tokens": 181416381.0, + "reward": 0.8441094160079956, + "reward_std": 0.10105187445878983, + "rewards/qatch_small_update_with_fm/mean": 0.8441094160079956, + "rewards/qatch_small_update_with_fm/std": 0.28807127475738525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9959750175476074, + "sampling/importance_sampling_ratio/min": 0.00495093734934926, + "sampling/sampling_logp_difference/max": 5.308178424835205, + "sampling/sampling_logp_difference/mean": 0.10719551891088486, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 503.2890625, + "completions/mean_terminated_length": 489.2000427246094, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.13633199036121368, + "epoch": 0.631858407079646, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.42358638576867164, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 181996903.0, + "reward": 0.8239570260047913, + "reward_std": 0.07967139035463333, + "rewards/qatch_small_update_with_fm/mean": 0.8239570260047913, + "rewards/qatch_small_update_with_fm/std": 0.296528160572052, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9955064058303833, + "sampling/importance_sampling_ratio/min": 0.004409489221870899, + "sampling/sampling_logp_difference/max": 5.423996448516846, + "sampling/sampling_logp_difference/mean": 0.1080026775598526, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2529.0, + "completions/max_terminated_length": 2529.0, + "completions/mean_length": 482.41015625, + "completions/mean_terminated_length": 482.41015625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.1289900867268443, + "epoch": 0.6336283185840708, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5076216355009244, + "learning_rate": 1e-06, + "loss": 0.038, + "num_tokens": 182562896.0, + "reward": 0.7308008074760437, + "reward_std": 0.08840906620025635, + "rewards/qatch_small_update_with_fm/mean": 0.7308008074760437, + "rewards/qatch_small_update_with_fm/std": 0.3640395700931549, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9929482936859131, + "sampling/importance_sampling_ratio/min": 0.006888894364237785, + "sampling/sampling_logp_difference/max": 4.977844715118408, + "sampling/sampling_logp_difference/mean": 0.1100352555513382, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1446.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 491.66796875, + "completions/mean_terminated_length": 491.66796875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.1540286112576723, + "epoch": 0.6353982300884956, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5294897566023161, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 183366587.0, + "reward": 0.8129062652587891, + "reward_std": 0.08059161901473999, + "rewards/qatch_small_update_with_fm/mean": 0.8129062652587891, + "rewards/qatch_small_update_with_fm/std": 0.2822462320327759, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9948251247406006, + "sampling/importance_sampling_ratio/min": 0.0018911922816187143, + "sampling/sampling_logp_difference/max": 6.270547866821289, + "sampling/sampling_logp_difference/mean": 0.12115992605686188, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 364.6171875, + "completions/mean_terminated_length": 364.6171875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.10692374687641859, + "epoch": 0.6371681415929203, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.47798261049620744, + "learning_rate": 1e-06, + "loss": -0.0321, + "num_tokens": 183851897.0, + "reward": 0.8864530920982361, + "reward_std": 0.05340360850095749, + "rewards/qatch_small_update_with_fm/mean": 0.8864530920982361, + "rewards/qatch_small_update_with_fm/std": 0.2327709197998047, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992672324180603, + "sampling/importance_sampling_ratio/min": 0.011662163771688938, + "sampling/sampling_logp_difference/max": 4.4514055252075195, + "sampling/sampling_logp_difference/mean": 0.09532105177640915, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2235.0, + "completions/max_terminated_length": 2235.0, + "completions/mean_length": 574.52734375, + "completions/mean_terminated_length": 574.52734375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.1734640970826149, + "epoch": 0.6389380530973451, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5221913862946522, + "learning_rate": 1e-06, + "loss": -0.0231, + "num_tokens": 184408144.0, + "reward": 0.7384335994720459, + "reward_std": 0.11242429167032242, + "rewards/qatch_small_update_with_fm/mean": 0.7384335398674011, + "rewards/qatch_small_update_with_fm/std": 0.3646511435508728, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003238916397095, + "sampling/importance_sampling_ratio/min": 0.00335796014405787, + "sampling/sampling_logp_difference/max": 5.6964216232299805, + "sampling/sampling_logp_difference/mean": 0.12429113686084747, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2408.0, + "completions/max_terminated_length": 2408.0, + "completions/mean_length": 401.9921875, + "completions/mean_terminated_length": 401.9921875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.1182346660643816, + "epoch": 0.6407079646017699, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4427870385280626, + "learning_rate": 1e-06, + "loss": 0.0237, + "num_tokens": 184909566.0, + "reward": 0.8384257555007935, + "reward_std": 0.05273023992776871, + "rewards/qatch_small_update_with_fm/mean": 0.8384257555007935, + "rewards/qatch_small_update_with_fm/std": 0.2808656394481659, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9951916933059692, + "sampling/importance_sampling_ratio/min": 0.004122166894376278, + "sampling/sampling_logp_difference/max": 5.4913763999938965, + "sampling/sampling_logp_difference/mean": 0.0995698869228363, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3443.0, + "completions/mean_length": 689.7421875, + "completions/mean_terminated_length": 662.9212646484375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.18941612169146538, + "epoch": 0.6424778761061947, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.39168740182060785, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 185508332.0, + "reward": 0.7280429601669312, + "reward_std": 0.10673657059669495, + "rewards/qatch_small_update_with_fm/mean": 0.7280429601669312, + "rewards/qatch_small_update_with_fm/std": 0.37747400999069214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0072745084762573, + "sampling/importance_sampling_ratio/min": 0.00017696816939860582, + "sampling/sampling_logp_difference/max": 8.639540672302246, + "sampling/sampling_logp_difference/mean": 0.12284526973962784, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1321.0, + "completions/max_terminated_length": 1321.0, + "completions/mean_length": 395.875, + "completions/mean_terminated_length": 395.875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.13422376941889524, + "epoch": 0.6442477876106195, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5339106021835213, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 186070620.0, + "reward": 0.8596680164337158, + "reward_std": 0.05402027443051338, + "rewards/qatch_small_update_with_fm/mean": 0.8596680164337158, + "rewards/qatch_small_update_with_fm/std": 0.27827054262161255, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9923632144927979, + "sampling/importance_sampling_ratio/min": 0.005285664927214384, + "sampling/sampling_logp_difference/max": 5.2427568435668945, + "sampling/sampling_logp_difference/mean": 0.11187142133712769, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2535.0, + "completions/max_terminated_length": 2535.0, + "completions/mean_length": 563.1640625, + "completions/mean_terminated_length": 563.1640625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.1712171956896782, + "epoch": 0.6460176991150443, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.4925582892442974, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 186725974.0, + "reward": 0.7571640014648438, + "reward_std": 0.11262641847133636, + "rewards/qatch_small_update_with_fm/mean": 0.7571640014648438, + "rewards/qatch_small_update_with_fm/std": 0.3311219811439514, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9971981048583984, + "sampling/importance_sampling_ratio/min": 0.0032281808089464903, + "sampling/sampling_logp_difference/max": 5.735836505889893, + "sampling/sampling_logp_difference/mean": 0.1241505891084671, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3207.0, + "completions/max_terminated_length": 3207.0, + "completions/mean_length": 530.6640625, + "completions/mean_terminated_length": 530.6640625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.13621144648641348, + "epoch": 0.647787610619469, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5985888632848992, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 187224176.0, + "reward": 0.7640078067779541, + "reward_std": 0.10024788975715637, + "rewards/qatch_small_update_with_fm/mean": 0.7640078067779541, + "rewards/qatch_small_update_with_fm/std": 0.3289325535297394, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9985411167144775, + "sampling/importance_sampling_ratio/min": 0.011255462653934956, + "sampling/sampling_logp_difference/max": 4.486901760101318, + "sampling/sampling_logp_difference/mean": 0.10325411707162857, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3778.0, + "completions/mean_length": 547.9375, + "completions/mean_terminated_length": 491.61907958984375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.12220047693699598, + "epoch": 0.6495575221238938, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.32769796643092136, + "learning_rate": 1e-06, + "loss": -0.0868, + "num_tokens": 187659680.0, + "reward": 0.7751250267028809, + "reward_std": 0.06956201791763306, + "rewards/qatch_small_update_with_fm/mean": 0.7751250267028809, + "rewards/qatch_small_update_with_fm/std": 0.36153918504714966, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9963778257369995, + "sampling/importance_sampling_ratio/min": 0.018411830067634583, + "sampling/sampling_logp_difference/max": 3.9947619438171387, + "sampling/sampling_logp_difference/mean": 0.10011971741914749, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3122.0, + "completions/mean_length": 734.4375, + "completions/mean_terminated_length": 667.47412109375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.16884157061576843, + "epoch": 0.6513274336283186, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.4418356024268668, + "learning_rate": 1e-06, + "loss": -0.0543, + "num_tokens": 188319328.0, + "reward": 0.7149375081062317, + "reward_std": 0.09334467351436615, + "rewards/qatch_small_update_with_fm/mean": 0.7149375081062317, + "rewards/qatch_small_update_with_fm/std": 0.3463698923587799, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000972747802734, + "sampling/importance_sampling_ratio/min": 0.01852019503712654, + "sampling/sampling_logp_difference/max": 3.988893508911133, + "sampling/sampling_logp_difference/mean": 0.11699914187192917, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2181.0, + "completions/max_terminated_length": 2181.0, + "completions/mean_length": 502.859375, + "completions/mean_terminated_length": 502.859375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.12519878055900335, + "epoch": 0.6530973451327433, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.6438590645763488, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 188984524.0, + "reward": 0.8615351319313049, + "reward_std": 0.11218338459730148, + "rewards/qatch_small_update_with_fm/mean": 0.8615351319313049, + "rewards/qatch_small_update_with_fm/std": 0.24655072391033173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9942902326583862, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.09988274425268173, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3375.0, + "completions/max_terminated_length": 3375.0, + "completions/mean_length": 547.31640625, + "completions/mean_terminated_length": 547.31640625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.14965487271547318, + "epoch": 0.6548672566371682, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4595165561544715, + "learning_rate": 1e-06, + "loss": 0.0728, + "num_tokens": 189394541.0, + "reward": 0.715039074420929, + "reward_std": 0.044145166873931885, + "rewards/qatch_small_update_with_fm/mean": 0.715039074420929, + "rewards/qatch_small_update_with_fm/std": 0.35467761754989624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9968997240066528, + "sampling/importance_sampling_ratio/min": 0.001756817102432251, + "sampling/sampling_logp_difference/max": 6.34425163269043, + "sampling/sampling_logp_difference/mean": 0.11401140689849854, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2255.0, + "completions/max_terminated_length": 2255.0, + "completions/mean_length": 495.32421875, + "completions/mean_terminated_length": 495.32421875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.14836543146520853, + "epoch": 0.6566371681415929, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4925656414759849, + "learning_rate": 1e-06, + "loss": 0.0385, + "num_tokens": 189856128.0, + "reward": 0.6013593673706055, + "reward_std": 0.09321607649326324, + "rewards/qatch_small_update_with_fm/mean": 0.6013593673706055, + "rewards/qatch_small_update_with_fm/std": 0.39479532837867737, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9953794479370117, + "sampling/importance_sampling_ratio/min": 0.001509586232714355, + "sampling/sampling_logp_difference/max": 6.495919704437256, + "sampling/sampling_logp_difference/mean": 0.11514382064342499, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2480.0, + "completions/max_terminated_length": 2480.0, + "completions/mean_length": 617.25, + "completions/mean_terminated_length": 617.25, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.16911724209785461, + "epoch": 0.6584070796460177, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.420032439150815, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 190641648.0, + "reward": 0.7167304754257202, + "reward_std": 0.07554842531681061, + "rewards/qatch_small_update_with_fm/mean": 0.7167304754257202, + "rewards/qatch_small_update_with_fm/std": 0.32102170586586, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9969842433929443, + "sampling/importance_sampling_ratio/min": 0.0024064192548394203, + "sampling/sampling_logp_difference/max": 6.02961540222168, + "sampling/sampling_logp_difference/mean": 0.12394626438617706, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2548.0, + "completions/max_terminated_length": 2548.0, + "completions/mean_length": 746.92578125, + "completions/mean_terminated_length": 746.92578125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.18326103128492832, + "epoch": 0.6601769911504425, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.34790702562535814, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 191207677.0, + "reward": 0.6764101386070251, + "reward_std": 0.1258085072040558, + "rewards/qatch_small_update_with_fm/mean": 0.6764101386070251, + "rewards/qatch_small_update_with_fm/std": 0.3671666085720062, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0023778676986694, + "sampling/importance_sampling_ratio/min": 0.008668403141200542, + "sampling/sampling_logp_difference/max": 4.74807071685791, + "sampling/sampling_logp_difference/mean": 0.12471143901348114, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2508.0, + "completions/max_terminated_length": 2508.0, + "completions/mean_length": 511.9453125, + "completions/mean_terminated_length": 511.9453125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.13981572352349758, + "epoch": 0.6619469026548672, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4099991695151121, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 191821791.0, + "reward": 0.8004882335662842, + "reward_std": 0.050791338086128235, + "rewards/qatch_small_update_with_fm/mean": 0.8004882335662842, + "rewards/qatch_small_update_with_fm/std": 0.3645688593387604, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9944133758544922, + "sampling/importance_sampling_ratio/min": 0.008987173438072205, + "sampling/sampling_logp_difference/max": 4.711956977844238, + "sampling/sampling_logp_difference/mean": 0.11034908890724182, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2796.0, + "completions/max_terminated_length": 2796.0, + "completions/mean_length": 566.328125, + "completions/mean_terminated_length": 566.328125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.15874271467328072, + "epoch": 0.6637168141592921, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4275311076393409, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 192456643.0, + "reward": 0.7895312309265137, + "reward_std": 0.08289177715778351, + "rewards/qatch_small_update_with_fm/mean": 0.7895312309265137, + "rewards/qatch_small_update_with_fm/std": 0.3061412274837494, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9964075684547424, + "sampling/importance_sampling_ratio/min": 0.018361065536737442, + "sampling/sampling_logp_difference/max": 3.9975228309631348, + "sampling/sampling_logp_difference/mean": 0.11977240443229675, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4086.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 692.05859375, + "completions/mean_terminated_length": 692.05859375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.17270477302372456, + "epoch": 0.6654867256637168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.38419500050306277, + "learning_rate": 1e-06, + "loss": 0.0907, + "num_tokens": 193035442.0, + "reward": 0.6908007860183716, + "reward_std": 0.07150629162788391, + "rewards/qatch_small_update_with_fm/mean": 0.6908007860183716, + "rewards/qatch_small_update_with_fm/std": 0.34826797246932983, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000948905944824, + "sampling/importance_sampling_ratio/min": 4.4799104870207884e-08, + "sampling/sampling_logp_difference/max": 16.921077728271484, + "sampling/sampling_logp_difference/mean": 0.12002436816692352, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1786.0, + "completions/max_terminated_length": 1786.0, + "completions/mean_length": 507.24609375, + "completions/mean_terminated_length": 507.24609375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.12251328956335783, + "epoch": 0.6672566371681415, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.34979100036660665, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 193707233.0, + "reward": 0.8660585880279541, + "reward_std": 0.09433520585298538, + "rewards/qatch_small_update_with_fm/mean": 0.8660585880279541, + "rewards/qatch_small_update_with_fm/std": 0.25607553124427795, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9955108761787415, + "sampling/importance_sampling_ratio/min": 0.008815412409603596, + "sampling/sampling_logp_difference/max": 4.731253623962402, + "sampling/sampling_logp_difference/mean": 0.09934522956609726, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2020.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 620.8125, + "completions/mean_terminated_length": 620.8125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.1559175457805395, + "epoch": 0.6690265486725664, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.33073547774063494, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 194216241.0, + "reward": 0.7509804964065552, + "reward_std": 0.0672738254070282, + "rewards/qatch_small_update_with_fm/mean": 0.7509804368019104, + "rewards/qatch_small_update_with_fm/std": 0.3491855263710022, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9985901117324829, + "sampling/importance_sampling_ratio/min": 0.00598114961758256, + "sampling/sampling_logp_difference/max": 5.119142532348633, + "sampling/sampling_logp_difference/mean": 0.11180153489112854, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2756.0, + "completions/max_terminated_length": 2756.0, + "completions/mean_length": 540.3046875, + "completions/mean_terminated_length": 540.3046875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.1661166436970234, + "epoch": 0.6707964601769911, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.33195648875070566, + "learning_rate": 1e-06, + "loss": 0.0239, + "num_tokens": 194831391.0, + "reward": 0.7810038924217224, + "reward_std": 0.04923711344599724, + "rewards/qatch_small_update_with_fm/mean": 0.7810038924217224, + "rewards/qatch_small_update_with_fm/std": 0.33204519748687744, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9990555047988892, + "sampling/importance_sampling_ratio/min": 0.009403667412698269, + "sampling/sampling_logp_difference/max": 4.666655540466309, + "sampling/sampling_logp_difference/mean": 0.11909811943769455, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2042.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 474.10546875, + "completions/mean_terminated_length": 474.10546875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.14898390136659145, + "epoch": 0.672566371681416, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.40024858962846094, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 195477002.0, + "reward": 0.8072617053985596, + "reward_std": 0.06418918073177338, + "rewards/qatch_small_update_with_fm/mean": 0.8072617053985596, + "rewards/qatch_small_update_with_fm/std": 0.3275093734264374, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9951503276824951, + "sampling/importance_sampling_ratio/min": 0.018497345969080925, + "sampling/sampling_logp_difference/max": 3.9901280403137207, + "sampling/sampling_logp_difference/mean": 0.11574701964855194, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1358.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 454.40234375, + "completions/mean_terminated_length": 454.40234375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.1413791086524725, + "epoch": 0.6743362831858407, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.38007691311421254, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 195954881.0, + "reward": 0.744070291519165, + "reward_std": 0.05227509140968323, + "rewards/qatch_small_update_with_fm/mean": 0.744070291519165, + "rewards/qatch_small_update_with_fm/std": 0.3730511963367462, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9953719973564148, + "sampling/importance_sampling_ratio/min": 0.0025141651276499033, + "sampling/sampling_logp_difference/max": 5.985814571380615, + "sampling/sampling_logp_difference/mean": 0.11235985904932022, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 510.7578125, + "completions/mean_terminated_length": 510.7578125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.14344297163188457, + "epoch": 0.6761061946902654, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.4625180428801349, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 196464291.0, + "reward": 0.816574215888977, + "reward_std": 0.15176716446876526, + "rewards/qatch_small_update_with_fm/mean": 0.816574215888977, + "rewards/qatch_small_update_with_fm/std": 0.3127204179763794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9949458837509155, + "sampling/importance_sampling_ratio/min": 0.006565525196492672, + "sampling/sampling_logp_difference/max": 5.025922775268555, + "sampling/sampling_logp_difference/mean": 0.11291942000389099, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2107.0, + "completions/max_terminated_length": 2107.0, + "completions/mean_length": 583.234375, + "completions/mean_terminated_length": 583.234375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.15638727508485317, + "epoch": 0.6778761061946903, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.45746226742929946, + "learning_rate": 1e-06, + "loss": -0.0288, + "num_tokens": 197120607.0, + "reward": 0.6001679301261902, + "reward_std": 0.12643268704414368, + "rewards/qatch_small_update_with_fm/mean": 0.600167989730835, + "rewards/qatch_small_update_with_fm/std": 0.3939620852470398, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9973058700561523, + "sampling/importance_sampling_ratio/min": 0.011168883182108402, + "sampling/sampling_logp_difference/max": 4.49462366104126, + "sampling/sampling_logp_difference/mean": 0.1140265166759491, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2134.0, + "completions/max_terminated_length": 2134.0, + "completions/mean_length": 478.71875, + "completions/mean_terminated_length": 478.71875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.14543859846889973, + "epoch": 0.679646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3538203767911081, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 197602663.0, + "reward": 0.7854375243186951, + "reward_std": 0.028378454968333244, + "rewards/qatch_small_update_with_fm/mean": 0.7854375243186951, + "rewards/qatch_small_update_with_fm/std": 0.32659393548965454, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9973599314689636, + "sampling/importance_sampling_ratio/min": 1.096164487535134e-05, + "sampling/sampling_logp_difference/max": 11.42110824584961, + "sampling/sampling_logp_difference/mean": 0.11048808693885803, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2174.0, + "completions/max_terminated_length": 2174.0, + "completions/mean_length": 486.6171875, + "completions/mean_terminated_length": 486.6171875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.14200040139257908, + "epoch": 0.6814159292035398, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.44083436912740653, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 198074565.0, + "reward": 0.808886706829071, + "reward_std": 0.026976868510246277, + "rewards/qatch_small_update_with_fm/mean": 0.808886706829071, + "rewards/qatch_small_update_with_fm/std": 0.3425396680831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9965760707855225, + "sampling/importance_sampling_ratio/min": 0.008726546540856361, + "sampling/sampling_logp_difference/max": 4.741385459899902, + "sampling/sampling_logp_difference/mean": 0.10850890725851059, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1812.0, + "completions/max_terminated_length": 1812.0, + "completions/mean_length": 455.9921875, + "completions/mean_terminated_length": 455.9921875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.1441742181777954, + "epoch": 0.6831858407079646, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5614946533696555, + "learning_rate": 1e-06, + "loss": 0.0259, + "num_tokens": 198597955.0, + "reward": 0.8295273780822754, + "reward_std": 0.09427762776613235, + "rewards/qatch_small_update_with_fm/mean": 0.8295273780822754, + "rewards/qatch_small_update_with_fm/std": 0.2936692535877228, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9962972402572632, + "sampling/importance_sampling_ratio/min": 0.013299347832798958, + "sampling/sampling_logp_difference/max": 4.320040225982666, + "sampling/sampling_logp_difference/mean": 0.1088903546333313, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3817.0, + "completions/max_terminated_length": 3817.0, + "completions/mean_length": 589.7578125, + "completions/mean_terminated_length": 589.7578125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.15512034017592669, + "epoch": 0.6849557522123894, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.439384354195262, + "learning_rate": 1e-06, + "loss": -0.0228, + "num_tokens": 198996725.0, + "reward": 0.7426366806030273, + "reward_std": 0.030658302828669548, + "rewards/qatch_small_update_with_fm/mean": 0.7426366806030273, + "rewards/qatch_small_update_with_fm/std": 0.35217705368995667, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001318097114563, + "sampling/importance_sampling_ratio/min": 0.004440457094460726, + "sampling/sampling_logp_difference/max": 5.416997909545898, + "sampling/sampling_logp_difference/mean": 0.11081845313310623, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1209.0, + "completions/max_terminated_length": 1209.0, + "completions/mean_length": 464.1015625, + "completions/mean_terminated_length": 464.1015625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.1471645962446928, + "epoch": 0.6867256637168142, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.530064088610342, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 199847519.0, + "reward": 0.7159297466278076, + "reward_std": 0.1021965891122818, + "rewards/qatch_small_update_with_fm/mean": 0.7159296870231628, + "rewards/qatch_small_update_with_fm/std": 0.36042875051498413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9948940277099609, + "sampling/importance_sampling_ratio/min": 0.006773307919502258, + "sampling/sampling_logp_difference/max": 4.994765758514404, + "sampling/sampling_logp_difference/mean": 0.11302639544010162, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1255.0, + "completions/max_terminated_length": 1255.0, + "completions/mean_length": 392.51953125, + "completions/mean_terminated_length": 392.51953125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.12604860868304968, + "epoch": 0.6884955752212389, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5616242030778377, + "learning_rate": 1e-06, + "loss": -0.0111, + "num_tokens": 200272052.0, + "reward": 0.8972305059432983, + "reward_std": 0.11663605272769928, + "rewards/qatch_small_update_with_fm/mean": 0.8972305059432983, + "rewards/qatch_small_update_with_fm/std": 0.24549192190170288, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9938252568244934, + "sampling/importance_sampling_ratio/min": 0.023648742586374283, + "sampling/sampling_logp_difference/max": 3.744445323944092, + "sampling/sampling_logp_difference/mean": 0.10341814160346985, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2475.0, + "completions/max_terminated_length": 2475.0, + "completions/mean_length": 519.9296875, + "completions/mean_terminated_length": 519.9296875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.17424634657800198, + "epoch": 0.6902654867256637, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5942977436585406, + "learning_rate": 1e-06, + "loss": -0.0269, + "num_tokens": 200742274.0, + "reward": 0.8007890582084656, + "reward_std": 0.07194599509239197, + "rewards/qatch_small_update_with_fm/mean": 0.8007890582084656, + "rewards/qatch_small_update_with_fm/std": 0.29960644245147705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002765655517578, + "sampling/importance_sampling_ratio/min": 0.008807619102299213, + "sampling/sampling_logp_difference/max": 4.732138156890869, + "sampling/sampling_logp_difference/mean": 0.12190736830234528, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2373.0, + "completions/max_terminated_length": 2373.0, + "completions/mean_length": 522.78515625, + "completions/mean_terminated_length": 522.78515625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.16362570971250534, + "epoch": 0.6920353982300885, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5138882733479488, + "learning_rate": 1e-06, + "loss": 0.0653, + "num_tokens": 201239675.0, + "reward": 0.76220703125, + "reward_std": 0.05660145357251167, + "rewards/qatch_small_update_with_fm/mean": 0.76220703125, + "rewards/qatch_small_update_with_fm/std": 0.3293953239917755, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004456043243408, + "sampling/importance_sampling_ratio/min": 0.014311004430055618, + "sampling/sampling_logp_difference/max": 4.2467265129089355, + "sampling/sampling_logp_difference/mean": 0.11852040886878967, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2247.0, + "completions/max_terminated_length": 2247.0, + "completions/mean_length": 642.67578125, + "completions/mean_terminated_length": 642.67578125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "entropy": 0.17715238966047764, + "epoch": 0.6938053097345133, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.3121151290906744, + "learning_rate": 1e-06, + "loss": -0.0272, + "num_tokens": 201780376.0, + "reward": 0.684402346611023, + "reward_std": 0.05445664003491402, + "rewards/qatch_small_update_with_fm/mean": 0.684402346611023, + "rewards/qatch_small_update_with_fm/std": 0.3712097406387329, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992333650588989, + "sampling/importance_sampling_ratio/min": 0.0003269147127866745, + "sampling/sampling_logp_difference/max": 8.025811195373535, + "sampling/sampling_logp_difference/mean": 0.1236954778432846, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1274.0, + "completions/max_terminated_length": 1274.0, + "completions/mean_length": 452.3359375, + "completions/mean_terminated_length": 452.3359375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.1450560800731182, + "epoch": 0.695575221238938, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.34599328815634517, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 202230654.0, + "reward": 0.8244608640670776, + "reward_std": 0.06429094076156616, + "rewards/qatch_small_update_with_fm/mean": 0.8244609236717224, + "rewards/qatch_small_update_with_fm/std": 0.2881889343261719, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9949817061424255, + "sampling/importance_sampling_ratio/min": 0.0028936576563864946, + "sampling/sampling_logp_difference/max": 5.845233917236328, + "sampling/sampling_logp_difference/mean": 0.11141933500766754, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3501.0, + "completions/mean_length": 703.48828125, + "completions/mean_terminated_length": 608.116455078125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.1723710596561432, + "epoch": 0.6973451327433628, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5065433606281433, + "learning_rate": 1e-06, + "loss": -0.051, + "num_tokens": 202864043.0, + "reward": 0.6551952958106995, + "reward_std": 0.12935110926628113, + "rewards/qatch_small_update_with_fm/mean": 0.6551952958106995, + "rewards/qatch_small_update_with_fm/std": 0.41460785269737244, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006225109100342, + "sampling/importance_sampling_ratio/min": 0.008855162188410759, + "sampling/sampling_logp_difference/max": 4.726754665374756, + "sampling/sampling_logp_difference/mean": 0.12057724595069885, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2785.0, + "completions/max_terminated_length": 2785.0, + "completions/mean_length": 631.3828125, + "completions/mean_terminated_length": 631.3828125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.2066683303564787, + "epoch": 0.6991150442477876, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.43477121030532406, + "learning_rate": 1e-06, + "loss": 0.0352, + "num_tokens": 203379229.0, + "reward": 0.6926288604736328, + "reward_std": 0.0851493626832962, + "rewards/qatch_small_update_with_fm/mean": 0.6926288604736328, + "rewards/qatch_small_update_with_fm/std": 0.39662235975265503, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0038108825683594, + "sampling/importance_sampling_ratio/min": 0.005343807861208916, + "sampling/sampling_logp_difference/max": 5.23181676864624, + "sampling/sampling_logp_difference/mean": 0.13774800300598145, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1410.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 419.71484375, + "completions/mean_terminated_length": 419.71484375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.13268938660621643, + "epoch": 0.7008849557522124, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4484007539037055, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 203883556.0, + "reward": 0.8229648470878601, + "reward_std": 0.05420804023742676, + "rewards/qatch_small_update_with_fm/mean": 0.8229647874832153, + "rewards/qatch_small_update_with_fm/std": 0.3085273504257202, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9944854378700256, + "sampling/importance_sampling_ratio/min": 0.004351816605776548, + "sampling/sampling_logp_difference/max": 5.437161922454834, + "sampling/sampling_logp_difference/mean": 0.10334019362926483, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2178.0, + "completions/max_terminated_length": 2178.0, + "completions/mean_length": 561.34375, + "completions/mean_terminated_length": 561.34375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.1825061459094286, + "epoch": 0.7026548672566372, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3502005182299795, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 204624508.0, + "reward": 0.6012461185455322, + "reward_std": 0.07029339671134949, + "rewards/qatch_small_update_with_fm/mean": 0.6012461185455322, + "rewards/qatch_small_update_with_fm/std": 0.3837004601955414, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9966017603874207, + "sampling/importance_sampling_ratio/min": 0.00334866507910192, + "sampling/sampling_logp_difference/max": 5.699193477630615, + "sampling/sampling_logp_difference/mean": 0.12955331802368164, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1597.0, + "completions/max_terminated_length": 1597.0, + "completions/mean_length": 466.14453125, + "completions/mean_terminated_length": 466.14453125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.15828338265419006, + "epoch": 0.7044247787610619, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4978733228091175, + "learning_rate": 1e-06, + "loss": 0.0347, + "num_tokens": 205080401.0, + "reward": 0.8767499923706055, + "reward_std": 0.0694652646780014, + "rewards/qatch_small_update_with_fm/mean": 0.8767499923706055, + "rewards/qatch_small_update_with_fm/std": 0.2577313780784607, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.998377799987793, + "sampling/importance_sampling_ratio/min": 0.01460467278957367, + "sampling/sampling_logp_difference/max": 4.226413726806641, + "sampling/sampling_logp_difference/mean": 0.11512663960456848, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1724.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 522.0546875, + "completions/mean_terminated_length": 522.0546875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.16699490696191788, + "epoch": 0.7061946902654868, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.34895092944716055, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 205534671.0, + "reward": 0.7786523103713989, + "reward_std": 0.07300121337175369, + "rewards/qatch_small_update_with_fm/mean": 0.7786523103713989, + "rewards/qatch_small_update_with_fm/std": 0.33300358057022095, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9985975027084351, + "sampling/importance_sampling_ratio/min": 0.004132211208343506, + "sampling/sampling_logp_difference/max": 5.488942623138428, + "sampling/sampling_logp_difference/mean": 0.1197887659072876, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1527.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 557.84765625, + "completions/mean_terminated_length": 557.84765625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.1648202035576105, + "epoch": 0.7079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2764157663335144, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 205939896.0, + "reward": 0.6907851696014404, + "reward_std": 0.03865916654467583, + "rewards/qatch_small_update_with_fm/mean": 0.6907851696014404, + "rewards/qatch_small_update_with_fm/std": 0.4175146222114563, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005991458892822, + "sampling/importance_sampling_ratio/min": 4.522010840446455e-06, + "sampling/sampling_logp_difference/max": 12.306553840637207, + "sampling/sampling_logp_difference/mean": 0.11411407589912415, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 544.4453125, + "completions/mean_terminated_length": 544.4453125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.20310404524207115, + "epoch": 0.7097345132743362, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5061570416127074, + "learning_rate": 1e-06, + "loss": -0.0461, + "num_tokens": 206369930.0, + "reward": 0.675488293170929, + "reward_std": 0.10496611893177032, + "rewards/qatch_small_update_with_fm/mean": 0.675488293170929, + "rewards/qatch_small_update_with_fm/std": 0.40421876311302185, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0022008419036865, + "sampling/importance_sampling_ratio/min": 0.01850472204387188, + "sampling/sampling_logp_difference/max": 3.989729404449463, + "sampling/sampling_logp_difference/mean": 0.13742220401763916, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1771.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 614.6796875, + "completions/mean_terminated_length": 614.6796875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.19880390912294388, + "epoch": 0.7115044247787611, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4435035882354106, + "learning_rate": 1e-06, + "loss": 0.0202, + "num_tokens": 206997288.0, + "reward": 0.6815507411956787, + "reward_std": 0.11047512292861938, + "rewards/qatch_small_update_with_fm/mean": 0.6815507411956787, + "rewards/qatch_small_update_with_fm/std": 0.3761551082134247, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006957054138184, + "sampling/importance_sampling_ratio/min": 0.010709195397794247, + "sampling/sampling_logp_difference/max": 4.536652565002441, + "sampling/sampling_logp_difference/mean": 0.1332876980304718, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 486.46875, + "completions/mean_terminated_length": 486.46875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.18742970563471317, + "epoch": 0.7132743362831858, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.42034996504613936, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 207693248.0, + "reward": 0.7741523385047913, + "reward_std": 0.07138805091381073, + "rewards/qatch_small_update_with_fm/mean": 0.7741523385047913, + "rewards/qatch_small_update_with_fm/std": 0.31449058651924133, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0010027885437012, + "sampling/importance_sampling_ratio/min": 0.008775109425187111, + "sampling/sampling_logp_difference/max": 4.735836029052734, + "sampling/sampling_logp_difference/mean": 0.12943994998931885, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2340.0, + "completions/max_terminated_length": 2340.0, + "completions/mean_length": 451.75390625, + "completions/mean_terminated_length": 451.75390625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.1542245950549841, + "epoch": 0.7150442477876107, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.2923557605448827, + "learning_rate": 1e-06, + "loss": -0.0452, + "num_tokens": 208325889.0, + "reward": 0.8891288638114929, + "reward_std": 0.01680476777255535, + "rewards/qatch_small_update_with_fm/mean": 0.8891288638114929, + "rewards/qatch_small_update_with_fm/std": 0.26092061400413513, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9982110857963562, + "sampling/importance_sampling_ratio/min": 0.003301323391497135, + "sampling/sampling_logp_difference/max": 5.7134318351745605, + "sampling/sampling_logp_difference/mean": 0.113829106092453, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1252.0, + "completions/max_terminated_length": 1252.0, + "completions/mean_length": 400.30078125, + "completions/mean_terminated_length": 400.30078125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.1461954489350319, + "epoch": 0.7168141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3594464319887048, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 208893774.0, + "reward": 0.7378515601158142, + "reward_std": 0.06936902552843094, + "rewards/qatch_small_update_with_fm/mean": 0.7378515601158142, + "rewards/qatch_small_update_with_fm/std": 0.3432586193084717, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9973443746566772, + "sampling/importance_sampling_ratio/min": 0.0028301975689828396, + "sampling/sampling_logp_difference/max": 5.867408752441406, + "sampling/sampling_logp_difference/mean": 0.11105867475271225, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3721.0, + "completions/max_terminated_length": 3721.0, + "completions/mean_length": 761.70703125, + "completions/mean_terminated_length": 761.70703125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.21504069492220879, + "epoch": 0.7185840707964601, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.4598716404374558, + "learning_rate": 1e-06, + "loss": 0.0273, + "num_tokens": 209545459.0, + "reward": 0.6908632516860962, + "reward_std": 0.11959480494260788, + "rewards/qatch_small_update_with_fm/mean": 0.690863311290741, + "rewards/qatch_small_update_with_fm/std": 0.36190682649612427, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0028066635131836, + "sampling/importance_sampling_ratio/min": 0.00274001806974411, + "sampling/sampling_logp_difference/max": 5.8997907638549805, + "sampling/sampling_logp_difference/mean": 0.13885968923568726, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2578.0, + "completions/max_terminated_length": 2578.0, + "completions/mean_length": 592.12109375, + "completions/mean_terminated_length": 592.12109375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.19392023608088493, + "epoch": 0.720353982300885, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2783516577547617, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 210185858.0, + "reward": 0.6728867292404175, + "reward_std": 0.036109741777181625, + "rewards/qatch_small_update_with_fm/mean": 0.6728867292404175, + "rewards/qatch_small_update_with_fm/std": 0.354897141456604, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001744031906128, + "sampling/importance_sampling_ratio/min": 0.0009166671079583466, + "sampling/sampling_logp_difference/max": 6.9947662353515625, + "sampling/sampling_logp_difference/mean": 0.13026288151741028, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3979.0, + "completions/max_terminated_length": 3979.0, + "completions/mean_length": 727.93359375, + "completions/mean_terminated_length": 727.93359375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.2095312885940075, + "epoch": 0.7221238938053097, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5232028757397078, + "learning_rate": 1e-06, + "loss": 0.0376, + "num_tokens": 210972561.0, + "reward": 0.6885937452316284, + "reward_std": 0.09573820233345032, + "rewards/qatch_small_update_with_fm/mean": 0.6885937452316284, + "rewards/qatch_small_update_with_fm/std": 0.3826073408126831, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0056464672088623, + "sampling/importance_sampling_ratio/min": 0.006897236220538616, + "sampling/sampling_logp_difference/max": 4.976634502410889, + "sampling/sampling_logp_difference/mean": 0.13450539112091064, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2189.0, + "completions/max_terminated_length": 2189.0, + "completions/mean_length": 508.99609375, + "completions/mean_terminated_length": 508.99609375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.16254041530191898, + "epoch": 0.7238938053097345, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.545358972866689, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 211426640.0, + "reward": 0.8075742125511169, + "reward_std": 0.07480335235595703, + "rewards/qatch_small_update_with_fm/mean": 0.8075742125511169, + "rewards/qatch_small_update_with_fm/std": 0.3394266963005066, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000327229499817, + "sampling/importance_sampling_ratio/min": 0.006418498232960701, + "sampling/sampling_logp_difference/max": 5.0485711097717285, + "sampling/sampling_logp_difference/mean": 0.11342006921768188, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2449.0, + "completions/max_terminated_length": 2449.0, + "completions/mean_length": 615.09375, + "completions/mean_terminated_length": 615.09375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.19291920214891434, + "epoch": 0.7256637168141593, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.47680289996018316, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 212253944.0, + "reward": 0.6174297332763672, + "reward_std": 0.11888127028942108, + "rewards/qatch_small_update_with_fm/mean": 0.6174297332763672, + "rewards/qatch_small_update_with_fm/std": 0.37394073605537415, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998745918273926, + "sampling/importance_sampling_ratio/min": 0.005661542061716318, + "sampling/sampling_logp_difference/max": 5.17405891418457, + "sampling/sampling_logp_difference/mean": 0.13196709752082825, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3931.0, + "completions/max_terminated_length": 3931.0, + "completions/mean_length": 716.828125, + "completions/mean_terminated_length": 716.828125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.17363397032022476, + "epoch": 0.727433628318584, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.32502024170474697, + "learning_rate": 1e-06, + "loss": 0.0393, + "num_tokens": 212929100.0, + "reward": 0.7465429306030273, + "reward_std": 0.0992249995470047, + "rewards/qatch_small_update_with_fm/mean": 0.7465429306030273, + "rewards/qatch_small_update_with_fm/std": 0.381736159324646, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001173496246338, + "sampling/importance_sampling_ratio/min": 0.0036354686599224806, + "sampling/sampling_logp_difference/max": 5.6170172691345215, + "sampling/sampling_logp_difference/mean": 0.11454272270202637, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4093.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 686.29296875, + "completions/mean_terminated_length": 686.29296875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.1912785768508911, + "epoch": 0.7292035398230089, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.36200051116897225, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 213543863.0, + "reward": 0.7139023542404175, + "reward_std": 0.07025236636400223, + "rewards/qatch_small_update_with_fm/mean": 0.7139023542404175, + "rewards/qatch_small_update_with_fm/std": 0.33821868896484375, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0010260343551636, + "sampling/importance_sampling_ratio/min": 0.01126747578382492, + "sampling/sampling_logp_difference/max": 4.485835075378418, + "sampling/sampling_logp_difference/mean": 0.12891894578933716, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 482.08984375, + "completions/mean_terminated_length": 482.08984375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.15765495225787163, + "epoch": 0.7309734513274336, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4196500993343117, + "learning_rate": 1e-06, + "loss": -0.0244, + "num_tokens": 214201166.0, + "reward": 0.8524882793426514, + "reward_std": 0.09491947293281555, + "rewards/qatch_small_update_with_fm/mean": 0.8524882793426514, + "rewards/qatch_small_update_with_fm/std": 0.2842613160610199, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9969896674156189, + "sampling/importance_sampling_ratio/min": 2.4591103283455595e-05, + "sampling/sampling_logp_difference/max": 10.613125801086426, + "sampling/sampling_logp_difference/mean": 0.11501606553792953, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 425.8828125, + "completions/mean_terminated_length": 411.490234375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.16947471164166927, + "epoch": 0.7327433628318584, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.35258047954675187, + "learning_rate": 1e-06, + "loss": -0.022, + "num_tokens": 214648096.0, + "reward": 0.7841289043426514, + "reward_std": 0.04797707498073578, + "rewards/qatch_small_update_with_fm/mean": 0.7841289043426514, + "rewards/qatch_small_update_with_fm/std": 0.3436765670776367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.996971845626831, + "sampling/importance_sampling_ratio/min": 0.008898786269128323, + "sampling/sampling_logp_difference/max": 4.7218403816223145, + "sampling/sampling_logp_difference/mean": 0.12509727478027344, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1760.0, + "completions/max_terminated_length": 1760.0, + "completions/mean_length": 473.265625, + "completions/mean_terminated_length": 473.265625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.16774693876504898, + "epoch": 0.7345132743362832, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.39832537332698686, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 215187780.0, + "reward": 0.8978593349456787, + "reward_std": 0.03775494545698166, + "rewards/qatch_small_update_with_fm/mean": 0.8978593349456787, + "rewards/qatch_small_update_with_fm/std": 0.21952234208583832, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9979634881019592, + "sampling/importance_sampling_ratio/min": 0.0004939264617860317, + "sampling/sampling_logp_difference/max": 7.613123893737793, + "sampling/sampling_logp_difference/mean": 0.12119480222463608, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1364.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 493.36328125, + "completions/mean_terminated_length": 493.36328125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.15931341610848904, + "epoch": 0.736283185840708, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5004576326100599, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 215705697.0, + "reward": 0.6353867053985596, + "reward_std": 0.11076454818248749, + "rewards/qatch_small_update_with_fm/mean": 0.6353867053985596, + "rewards/qatch_small_update_with_fm/std": 0.3819371461868286, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9973691701889038, + "sampling/importance_sampling_ratio/min": 0.023676568642258644, + "sampling/sampling_logp_difference/max": 3.743269443511963, + "sampling/sampling_logp_difference/mean": 0.11513706296682358, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1592.0, + "completions/max_terminated_length": 1592.0, + "completions/mean_length": 451.29296875, + "completions/mean_terminated_length": 451.29296875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.1622842587530613, + "epoch": 0.7380530973451327, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.3899241092283991, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 216250860.0, + "reward": 0.785335898399353, + "reward_std": 0.08137451857328415, + "rewards/qatch_small_update_with_fm/mean": 0.785335898399353, + "rewards/qatch_small_update_with_fm/std": 0.35125818848609924, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9958220720291138, + "sampling/importance_sampling_ratio/min": 0.006848800927400589, + "sampling/sampling_logp_difference/max": 4.983681678771973, + "sampling/sampling_logp_difference/mean": 0.12142086774110794, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2938.0, + "completions/max_terminated_length": 2938.0, + "completions/mean_length": 623.05078125, + "completions/mean_terminated_length": 623.05078125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.18198287673294544, + "epoch": 0.7398230088495575, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.409413930464498, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 216796345.0, + "reward": 0.7311093807220459, + "reward_std": 0.1406518816947937, + "rewards/qatch_small_update_with_fm/mean": 0.7311093807220459, + "rewards/qatch_small_update_with_fm/std": 0.3903703987598419, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007123947143555, + "sampling/importance_sampling_ratio/min": 0.00420353701338172, + "sampling/sampling_logp_difference/max": 5.471828937530518, + "sampling/sampling_logp_difference/mean": 0.12437430024147034, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2476.0, + "completions/max_terminated_length": 2476.0, + "completions/mean_length": 474.4375, + "completions/mean_terminated_length": 474.4375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.13196693547070026, + "epoch": 0.7415929203539823, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.41445779960836726, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 217478441.0, + "reward": 0.8417499661445618, + "reward_std": 0.04443129152059555, + "rewards/qatch_small_update_with_fm/mean": 0.8417499661445618, + "rewards/qatch_small_update_with_fm/std": 0.28878355026245117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9945211410522461, + "sampling/importance_sampling_ratio/min": 0.00021754797489847988, + "sampling/sampling_logp_difference/max": 8.433091163635254, + "sampling/sampling_logp_difference/mean": 0.10465686023235321, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2587.0, + "completions/max_terminated_length": 2587.0, + "completions/mean_length": 734.046875, + "completions/mean_terminated_length": 734.046875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.19972126185894012, + "epoch": 0.7433628318584071, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.3870550794541803, + "learning_rate": 1e-06, + "loss": 0.0529, + "num_tokens": 218280533.0, + "reward": 0.6786133050918579, + "reward_std": 0.09401129186153412, + "rewards/qatch_small_update_with_fm/mean": 0.6786133050918579, + "rewards/qatch_small_update_with_fm/std": 0.3757462203502655, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000883936882019, + "sampling/importance_sampling_ratio/min": 0.007178118452429771, + "sampling/sampling_logp_difference/max": 4.936717987060547, + "sampling/sampling_logp_difference/mean": 0.1316712498664856, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2604.0, + "completions/max_terminated_length": 2604.0, + "completions/mean_length": 564.26171875, + "completions/mean_terminated_length": 564.26171875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.16319572925567627, + "epoch": 0.7451327433628319, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.46633898980222643, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 218783080.0, + "reward": 0.6869062781333923, + "reward_std": 0.0881059467792511, + "rewards/qatch_small_update_with_fm/mean": 0.6869062781333923, + "rewards/qatch_small_update_with_fm/std": 0.40433627367019653, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000537395477295, + "sampling/importance_sampling_ratio/min": 0.014322554692626, + "sampling/sampling_logp_difference/max": 4.245919704437256, + "sampling/sampling_logp_difference/mean": 0.11453516036272049, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1783.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 604.390625, + "completions/mean_terminated_length": 604.390625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.1689841579645872, + "epoch": 0.7469026548672566, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.3544339756613165, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 219359580.0, + "reward": 0.7998437881469727, + "reward_std": 0.08399610966444016, + "rewards/qatch_small_update_with_fm/mean": 0.7998437881469727, + "rewards/qatch_small_update_with_fm/std": 0.30130448937416077, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9989594221115112, + "sampling/importance_sampling_ratio/min": 0.0011994243832305074, + "sampling/sampling_logp_difference/max": 6.7259135246276855, + "sampling/sampling_logp_difference/mean": 0.12029863148927689, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2248.0, + "completions/mean_length": 568.7265625, + "completions/mean_terminated_length": 554.8941650390625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.16228130646049976, + "epoch": 0.7486725663716814, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.22356471074436193, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 219916454.0, + "reward": 0.7088984251022339, + "reward_std": 0.026240184903144836, + "rewards/qatch_small_update_with_fm/mean": 0.7088984251022339, + "rewards/qatch_small_update_with_fm/std": 0.3714582026004791, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 0.023729214444756508, + "sampling/sampling_logp_difference/max": 3.7410483360290527, + "sampling/sampling_logp_difference/mean": 0.11432121694087982, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2102.0, + "completions/max_terminated_length": 2102.0, + "completions/mean_length": 664.03515625, + "completions/mean_terminated_length": 664.03515625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.1923084706068039, + "epoch": 0.7504424778761062, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.5461092747329064, + "learning_rate": 1e-06, + "loss": 0.0583, + "num_tokens": 220427167.0, + "reward": 0.8140429258346558, + "reward_std": 0.18169225752353668, + "rewards/qatch_small_update_with_fm/mean": 0.8140429258346558, + "rewards/qatch_small_update_with_fm/std": 0.3105939030647278, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0016264915466309, + "sampling/importance_sampling_ratio/min": 0.00532237533479929, + "sampling/sampling_logp_difference/max": 5.235835552215576, + "sampling/sampling_logp_difference/mean": 0.12810061872005463, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3639.0, + "completions/mean_length": 772.30859375, + "completions/mean_terminated_length": 719.5516357421875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.18962455727159977, + "epoch": 0.7522123893805309, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.3160575103154446, + "learning_rate": 1e-06, + "loss": -0.0781, + "num_tokens": 221069566.0, + "reward": 0.7325546741485596, + "reward_std": 0.053682636469602585, + "rewards/qatch_small_update_with_fm/mean": 0.7325546741485596, + "rewards/qatch_small_update_with_fm/std": 0.37810027599334717, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.003647804260254, + "sampling/importance_sampling_ratio/min": 0.004103474784642458, + "sampling/sampling_logp_difference/max": 5.4959211349487305, + "sampling/sampling_logp_difference/mean": 0.12572765350341797, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2415.0, + "completions/mean_length": 660.265625, + "completions/mean_terminated_length": 646.7921752929688, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.18477561697363853, + "epoch": 0.7539823008849558, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4090449823566527, + "learning_rate": 1e-06, + "loss": -0.0218, + "num_tokens": 221587810.0, + "reward": 0.7701483964920044, + "reward_std": 0.10543603450059891, + "rewards/qatch_small_update_with_fm/mean": 0.7701483964920044, + "rewards/qatch_small_update_with_fm/std": 0.3279249668121338, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0023423433303833, + "sampling/importance_sampling_ratio/min": 0.001957992557436228, + "sampling/sampling_logp_difference/max": 6.235835552215576, + "sampling/sampling_logp_difference/mean": 0.12211276590824127, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3233.0, + "completions/mean_length": 622.453125, + "completions/mean_terminated_length": 608.8314208984375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.15044170431792736, + "epoch": 0.7557522123893805, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.42226060366299967, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 222141878.0, + "reward": 0.7257421612739563, + "reward_std": 0.06865538656711578, + "rewards/qatch_small_update_with_fm/mean": 0.7257421612739563, + "rewards/qatch_small_update_with_fm/std": 0.35420534014701843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9960782527923584, + "sampling/importance_sampling_ratio/min": 0.01118391752243042, + "sampling/sampling_logp_difference/max": 4.493278503417969, + "sampling/sampling_logp_difference/mean": 0.11394952237606049, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3069.0, + "completions/max_terminated_length": 3069.0, + "completions/mean_length": 801.06640625, + "completions/mean_terminated_length": 801.06640625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.17895109206438065, + "epoch": 0.7575221238938054, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.3815108439775098, + "learning_rate": 1e-06, + "loss": -0.0198, + "num_tokens": 222826775.0, + "reward": 0.6168085336685181, + "reward_std": 0.08496352285146713, + "rewards/qatch_small_update_with_fm/mean": 0.6168085336685181, + "rewards/qatch_small_update_with_fm/std": 0.40082257986068726, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0023446083068848, + "sampling/importance_sampling_ratio/min": 0.011175249703228474, + "sampling/sampling_logp_difference/max": 4.494053840637207, + "sampling/sampling_logp_difference/mean": 0.12193140387535095, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2660.0, + "completions/max_terminated_length": 2660.0, + "completions/mean_length": 631.9453125, + "completions/mean_terminated_length": 631.9453125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.1826272513717413, + "epoch": 0.7592920353982301, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.41904123544273625, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 223359449.0, + "reward": 0.775890588760376, + "reward_std": 0.11549194902181625, + "rewards/qatch_small_update_with_fm/mean": 0.7758906483650208, + "rewards/qatch_small_update_with_fm/std": 0.3709191679954529, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9977415204048157, + "sampling/importance_sampling_ratio/min": 0.014440981671214104, + "sampling/sampling_logp_difference/max": 4.237685203552246, + "sampling/sampling_logp_difference/mean": 0.13103239238262177, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2500.0, + "completions/max_terminated_length": 2500.0, + "completions/mean_length": 635.03125, + "completions/mean_terminated_length": 635.03125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.1812898088246584, + "epoch": 0.7610619469026548, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5466435269344176, + "learning_rate": 1e-06, + "loss": 0.1014, + "num_tokens": 224151553.0, + "reward": 0.7688593864440918, + "reward_std": 0.10703786462545395, + "rewards/qatch_small_update_with_fm/mean": 0.7688593864440918, + "rewards/qatch_small_update_with_fm/std": 0.31848737597465515, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9988529086112976, + "sampling/importance_sampling_ratio/min": 0.006796242203563452, + "sampling/sampling_logp_difference/max": 4.991385459899902, + "sampling/sampling_logp_difference/mean": 0.13133785128593445, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3099.0, + "completions/max_terminated_length": 3099.0, + "completions/mean_length": 635.1484375, + "completions/mean_terminated_length": 635.1484375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.17052887007594109, + "epoch": 0.7628318584070797, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.42027461892419476, + "learning_rate": 1e-06, + "loss": -0.0535, + "num_tokens": 224847815.0, + "reward": 0.8387616872787476, + "reward_std": 0.03548329323530197, + "rewards/qatch_small_update_with_fm/mean": 0.8387616872787476, + "rewards/qatch_small_update_with_fm/std": 0.2991366684436798, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9974981546401978, + "sampling/importance_sampling_ratio/min": 0.008679375983774662, + "sampling/sampling_logp_difference/max": 4.746805667877197, + "sampling/sampling_logp_difference/mean": 0.12578247487545013, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2238.0, + "completions/max_terminated_length": 2238.0, + "completions/mean_length": 506.56640625, + "completions/mean_terminated_length": 506.56640625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.12694681715220213, + "epoch": 0.7646017699115044, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5130403131834611, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 225484312.0, + "reward": 0.8006914258003235, + "reward_std": 0.08573571592569351, + "rewards/qatch_small_update_with_fm/mean": 0.8006914258003235, + "rewards/qatch_small_update_with_fm/std": 0.29757291078567505, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.994242787361145, + "sampling/importance_sampling_ratio/min": 0.008697095327079296, + "sampling/sampling_logp_difference/max": 4.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.10106033831834793, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2420.0, + "completions/max_terminated_length": 2420.0, + "completions/mean_length": 692.78125, + "completions/mean_terminated_length": 692.78125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.13791646622121334, + "epoch": 0.7663716814159292, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.4941877942190622, + "learning_rate": 1e-06, + "loss": 0.0259, + "num_tokens": 225999760.0, + "reward": 0.8496835827827454, + "reward_std": 0.046133317053318024, + "rewards/qatch_small_update_with_fm/mean": 0.8496835827827454, + "rewards/qatch_small_update_with_fm/std": 0.3013724684715271, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9952928423881531, + "sampling/importance_sampling_ratio/min": 0.006896561942994595, + "sampling/sampling_logp_difference/max": 4.97673225402832, + "sampling/sampling_logp_difference/mean": 0.10778751224279404, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1713.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 602.625, + "completions/mean_terminated_length": 602.625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.15351567789912224, + "epoch": 0.768141592920354, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.42487288101888443, + "learning_rate": 1e-06, + "loss": 0.0165, + "num_tokens": 226594368.0, + "reward": 0.7664843797683716, + "reward_std": 0.07009606808423996, + "rewards/qatch_small_update_with_fm/mean": 0.7664843797683716, + "rewards/qatch_small_update_with_fm/std": 0.3558588922023773, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9956225752830505, + "sampling/importance_sampling_ratio/min": 0.0011039423989132047, + "sampling/sampling_logp_difference/max": 6.808867454528809, + "sampling/sampling_logp_difference/mean": 0.11667559295892715, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2625.0, + "completions/max_terminated_length": 2625.0, + "completions/mean_length": 748.1171875, + "completions/mean_terminated_length": 748.1171875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.168431606143713, + "epoch": 0.7699115044247787, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.3854349475784843, + "learning_rate": 1e-06, + "loss": 0.0481, + "num_tokens": 227160990.0, + "reward": 0.7121405601501465, + "reward_std": 0.10320340842008591, + "rewards/qatch_small_update_with_fm/mean": 0.7121405601501465, + "rewards/qatch_small_update_with_fm/std": 0.3460690677165985, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9979861974716187, + "sampling/importance_sampling_ratio/min": 0.0012341516558080912, + "sampling/sampling_logp_difference/max": 6.697371482849121, + "sampling/sampling_logp_difference/mean": 0.12207724899053574, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2455.0, + "completions/max_terminated_length": 2455.0, + "completions/mean_length": 581.72265625, + "completions/mean_terminated_length": 581.72265625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.15434116125106812, + "epoch": 0.7716814159292036, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.3569690705345467, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 227766583.0, + "reward": 0.7684921622276306, + "reward_std": 0.07930755615234375, + "rewards/qatch_small_update_with_fm/mean": 0.7684921622276306, + "rewards/qatch_small_update_with_fm/std": 0.3264910876750946, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9944840669631958, + "sampling/importance_sampling_ratio/min": 0.00047306326450780034, + "sampling/sampling_logp_difference/max": 7.656281471252441, + "sampling/sampling_logp_difference/mean": 0.11814532428979874, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2112.0, + "completions/max_terminated_length": 2112.0, + "completions/mean_length": 723.5625, + "completions/mean_terminated_length": 723.5625, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.15825031697750092, + "epoch": 0.7734513274336283, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.34000322678139233, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 228371063.0, + "reward": 0.7408750057220459, + "reward_std": 0.09346934407949448, + "rewards/qatch_small_update_with_fm/mean": 0.7408750057220459, + "rewards/qatch_small_update_with_fm/std": 0.3187641203403473, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9984309077262878, + "sampling/importance_sampling_ratio/min": 0.008697095327079296, + "sampling/sampling_logp_difference/max": 4.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.11309655755758286, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2843.0, + "completions/max_terminated_length": 2843.0, + "completions/mean_length": 711.328125, + "completions/mean_terminated_length": 711.328125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.18015681765973568, + "epoch": 0.7752212389380531, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.45693622408807605, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 229134907.0, + "reward": 0.6116093397140503, + "reward_std": 0.062218718230724335, + "rewards/qatch_small_update_with_fm/mean": 0.6116093397140503, + "rewards/qatch_small_update_with_fm/std": 0.357837438583374, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9985137581825256, + "sampling/importance_sampling_ratio/min": 0.0051135290414094925, + "sampling/sampling_logp_difference/max": 5.27586555480957, + "sampling/sampling_logp_difference/mean": 0.1284247636795044, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2524.0, + "completions/max_terminated_length": 2524.0, + "completions/mean_length": 600.83203125, + "completions/mean_terminated_length": 600.83203125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.1643862072378397, + "epoch": 0.7769911504424779, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4528087649562646, + "learning_rate": 1e-06, + "loss": -0.0204, + "num_tokens": 229853200.0, + "reward": 0.7506523132324219, + "reward_std": 0.02189062535762787, + "rewards/qatch_small_update_with_fm/mean": 0.7506523132324219, + "rewards/qatch_small_update_with_fm/std": 0.3145546019077301, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.997706949710846, + "sampling/importance_sampling_ratio/min": 0.001599552109837532, + "sampling/sampling_logp_difference/max": 6.4380316734313965, + "sampling/sampling_logp_difference/mean": 0.12278648465871811, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2930.0, + "completions/max_terminated_length": 2930.0, + "completions/mean_length": 586.2578125, + "completions/mean_terminated_length": 586.2578125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.1549396775662899, + "epoch": 0.7787610619469026, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.31433711549478927, + "learning_rate": 1e-06, + "loss": -0.0224, + "num_tokens": 230362770.0, + "reward": 0.7373007535934448, + "reward_std": 0.05242425575852394, + "rewards/qatch_small_update_with_fm/mean": 0.7373007535934448, + "rewards/qatch_small_update_with_fm/std": 0.3278573751449585, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9971117377281189, + "sampling/importance_sampling_ratio/min": 0.011141431517899036, + "sampling/sampling_logp_difference/max": 4.497084617614746, + "sampling/sampling_logp_difference/mean": 0.12081331014633179, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3951.0, + "completions/max_terminated_length": 3951.0, + "completions/mean_length": 858.46484375, + "completions/mean_terminated_length": 858.46484375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.16271107271313667, + "epoch": 0.7805309734513274, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.39440167191771214, + "learning_rate": 1e-06, + "loss": -0.0644, + "num_tokens": 231013257.0, + "reward": 0.7417617440223694, + "reward_std": 0.07942794263362885, + "rewards/qatch_small_update_with_fm/mean": 0.7417617440223694, + "rewards/qatch_small_update_with_fm/std": 0.37872886657714844, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0037524700164795, + "sampling/importance_sampling_ratio/min": 0.0020053647458553314, + "sampling/sampling_logp_difference/max": 6.2119293212890625, + "sampling/sampling_logp_difference/mean": 0.11337009072303772, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3282.0, + "completions/max_terminated_length": 3282.0, + "completions/mean_length": 593.32421875, + "completions/mean_terminated_length": 593.32421875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.1659252755343914, + "epoch": 0.7823008849557522, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.36212202748645383, + "learning_rate": 1e-06, + "loss": 0.023, + "num_tokens": 231473228.0, + "reward": 0.7991992235183716, + "reward_std": 0.033742353320121765, + "rewards/qatch_small_update_with_fm/mean": 0.7991992235183716, + "rewards/qatch_small_update_with_fm/std": 0.3366776406764984, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9989630579948425, + "sampling/importance_sampling_ratio/min": 0.0038688918575644493, + "sampling/sampling_logp_difference/max": 5.5547871589660645, + "sampling/sampling_logp_difference/mean": 0.12536245584487915, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3265.0, + "completions/max_terminated_length": 3265.0, + "completions/mean_length": 694.28515625, + "completions/mean_terminated_length": 694.28515625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.1803117860108614, + "epoch": 0.784070796460177, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.419844471085865, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 232015333.0, + "reward": 0.8482460975646973, + "reward_std": 0.09589515626430511, + "rewards/qatch_small_update_with_fm/mean": 0.8482460975646973, + "rewards/qatch_small_update_with_fm/std": 0.30093497037887573, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9970134496688843, + "sampling/importance_sampling_ratio/min": 0.011528928764164448, + "sampling/sampling_logp_difference/max": 4.46289587020874, + "sampling/sampling_logp_difference/mean": 0.1318548023700714, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2764.0, + "completions/max_terminated_length": 2764.0, + "completions/mean_length": 829.99609375, + "completions/mean_terminated_length": 829.99609375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.1749946866184473, + "epoch": 0.7858407079646018, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.39484374074074674, + "learning_rate": 1e-06, + "loss": -0.0219, + "num_tokens": 232805252.0, + "reward": 0.686453104019165, + "reward_std": 0.11683434247970581, + "rewards/qatch_small_update_with_fm/mean": 0.686453104019165, + "rewards/qatch_small_update_with_fm/std": 0.3472793996334076, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994871616363525, + "sampling/importance_sampling_ratio/min": 0.0013426319928839803, + "sampling/sampling_logp_difference/max": 6.613123416900635, + "sampling/sampling_logp_difference/mean": 0.12524878978729248, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 704.53515625, + "completions/mean_terminated_length": 664.3201904296875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.1625970732420683, + "epoch": 0.7876106194690266, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4301157411953366, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 233457437.0, + "reward": 0.6297616958618164, + "reward_std": 0.09737332165241241, + "rewards/qatch_small_update_with_fm/mean": 0.6297616958618164, + "rewards/qatch_small_update_with_fm/std": 0.41093724966049194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9979434609413147, + "sampling/importance_sampling_ratio/min": 0.0008534918888472021, + "sampling/sampling_logp_difference/max": 7.066174507141113, + "sampling/sampling_logp_difference/mean": 0.12096916139125824, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2229.0, + "completions/max_terminated_length": 2229.0, + "completions/mean_length": 596.2890625, + "completions/mean_terminated_length": 596.2890625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.14597140066325665, + "epoch": 0.7893805309734513, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.2835604822547674, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 234065047.0, + "reward": 0.8153945207595825, + "reward_std": 0.03509760648012161, + "rewards/qatch_small_update_with_fm/mean": 0.8153945207595825, + "rewards/qatch_small_update_with_fm/std": 0.3443441689014435, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9938744306564331, + "sampling/importance_sampling_ratio/min": 0.0015488486969843507, + "sampling/sampling_logp_difference/max": 6.470243453979492, + "sampling/sampling_logp_difference/mean": 0.11343678086996078, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3454.0, + "completions/mean_length": 780.5234375, + "completions/mean_terminated_length": 767.5216064453125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.15503480285406113, + "epoch": 0.7911504424778761, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.3740383952215832, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 234562269.0, + "reward": 0.7261210680007935, + "reward_std": 0.0933132916688919, + "rewards/qatch_small_update_with_fm/mean": 0.7261210680007935, + "rewards/qatch_small_update_with_fm/std": 0.3724234700202942, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.996291995048523, + "sampling/importance_sampling_ratio/min": 0.005370934959501028, + "sampling/sampling_logp_difference/max": 5.226753234863281, + "sampling/sampling_logp_difference/mean": 0.11553680151700974, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2725.0, + "completions/max_terminated_length": 2725.0, + "completions/mean_length": 659.67578125, + "completions/mean_terminated_length": 659.67578125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.15476826392114162, + "epoch": 0.7929203539823009, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.42675635170846243, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 235043658.0, + "reward": 0.6756054759025574, + "reward_std": 0.09046542644500732, + "rewards/qatch_small_update_with_fm/mean": 0.6756054759025574, + "rewards/qatch_small_update_with_fm/std": 0.3835858702659607, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9972091913223267, + "sampling/importance_sampling_ratio/min": 0.008706767112016678, + "sampling/sampling_logp_difference/max": 4.743654727935791, + "sampling/sampling_logp_difference/mean": 0.12006603181362152, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 550.0390625, + "completions/mean_terminated_length": 550.0390625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.12986405473202467, + "epoch": 0.7946902654867256, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5427483756820398, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 235464196.0, + "reward": 0.7087421417236328, + "reward_std": 0.09299471974372864, + "rewards/qatch_small_update_with_fm/mean": 0.7087421417236328, + "rewards/qatch_small_update_with_fm/std": 0.3667365312576294, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9917939901351929, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.11037851870059967, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2407.0, + "completions/max_terminated_length": 2407.0, + "completions/mean_length": 698.49609375, + "completions/mean_terminated_length": 698.49609375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.1488830018788576, + "epoch": 0.7964601769911505, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4951429798091371, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 236066467.0, + "reward": 0.7882499694824219, + "reward_std": 0.13462623953819275, + "rewards/qatch_small_update_with_fm/mean": 0.7882499694824219, + "rewards/qatch_small_update_with_fm/std": 0.3175378739833832, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9960341453552246, + "sampling/importance_sampling_ratio/min": 1.961273119377438e-05, + "sampling/sampling_logp_difference/max": 10.83933162689209, + "sampling/sampling_logp_difference/mean": 0.11680779606103897, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2239.0, + "completions/max_terminated_length": 2239.0, + "completions/mean_length": 509.01171875, + "completions/mean_terminated_length": 509.01171875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.11211834196001291, + "epoch": 0.7982300884955752, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3849702455218366, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 236867798.0, + "reward": 0.9428945183753967, + "reward_std": 0.028621939942240715, + "rewards/qatch_small_update_with_fm/mean": 0.9428945183753967, + "rewards/qatch_small_update_with_fm/std": 0.14791765809059143, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909011721611023, + "sampling/importance_sampling_ratio/min": 0.005266137886792421, + "sampling/sampling_logp_difference/max": 5.246458053588867, + "sampling/sampling_logp_difference/mean": 0.10161980986595154, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3160.0, + "completions/max_terminated_length": 3160.0, + "completions/mean_length": 702.078125, + "completions/mean_terminated_length": 702.078125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.15019656997174025, + "epoch": 0.8, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.325804060936362, + "learning_rate": 1e-06, + "loss": 0.0638, + "num_tokens": 237550042.0, + "reward": 0.816601574420929, + "reward_std": 0.027011848986148834, + "rewards/qatch_small_update_with_fm/mean": 0.816601574420929, + "rewards/qatch_small_update_with_fm/std": 0.2908731698989868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9971194267272949, + "sampling/importance_sampling_ratio/min": 0.008687051944434643, + "sampling/sampling_logp_difference/max": 4.745921611785889, + "sampling/sampling_logp_difference/mean": 0.11530216038227081, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1323.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 515.734375, + "completions/mean_terminated_length": 515.734375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.12139484006911516, + "epoch": 0.8017699115044248, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.2689081659869354, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 238204646.0, + "reward": 0.7734726667404175, + "reward_std": 0.032330580055713654, + "rewards/qatch_small_update_with_fm/mean": 0.7734726667404175, + "rewards/qatch_small_update_with_fm/std": 0.3623669147491455, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991873562335968, + "sampling/importance_sampling_ratio/min": 0.0018496569246053696, + "sampling/sampling_logp_difference/max": 6.292755126953125, + "sampling/sampling_logp_difference/mean": 0.1044083833694458, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3267.0, + "completions/max_terminated_length": 3267.0, + "completions/mean_length": 661.6796875, + "completions/mean_terminated_length": 661.6796875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.13479878939688206, + "epoch": 0.8035398230088495, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.49470886264254876, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 238715492.0, + "reward": 0.6919882297515869, + "reward_std": 0.10859642922878265, + "rewards/qatch_small_update_with_fm/mean": 0.6919882297515869, + "rewards/qatch_small_update_with_fm/std": 0.4031483829021454, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9940096139907837, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.11178075522184372, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3160.0, + "completions/max_terminated_length": 3160.0, + "completions/mean_length": 636.25390625, + "completions/mean_terminated_length": 636.25390625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.12827913463115692, + "epoch": 0.8053097345132744, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8735075316746723, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 239250309.0, + "reward": 0.7329335808753967, + "reward_std": 0.06027635186910629, + "rewards/qatch_small_update_with_fm/mean": 0.7329335808753967, + "rewards/qatch_small_update_with_fm/std": 0.3722755014896393, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9950513243675232, + "sampling/importance_sampling_ratio/min": 0.004170369356870651, + "sampling/sampling_logp_difference/max": 5.479750633239746, + "sampling/sampling_logp_difference/mean": 0.10352346301078796, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2681.0, + "completions/max_terminated_length": 2681.0, + "completions/mean_length": 547.953125, + "completions/mean_terminated_length": 547.953125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.1058797063305974, + "epoch": 0.8070796460176991, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4273494684096733, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 239704009.0, + "reward": 0.8358047008514404, + "reward_std": 0.04802708327770233, + "rewards/qatch_small_update_with_fm/mean": 0.8358047008514404, + "rewards/qatch_small_update_with_fm/std": 0.2909993529319763, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992448091506958, + "sampling/importance_sampling_ratio/min": 0.0026839813217520714, + "sampling/sampling_logp_difference/max": 5.920454025268555, + "sampling/sampling_logp_difference/mean": 0.09336571395397186, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2089.0, + "completions/max_terminated_length": 2089.0, + "completions/mean_length": 617.75390625, + "completions/mean_terminated_length": 617.75390625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.13106741197407246, + "epoch": 0.8088495575221238, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.34818347973080715, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 240374890.0, + "reward": 0.693066418170929, + "reward_std": 0.05205351859331131, + "rewards/qatch_small_update_with_fm/mean": 0.693066418170929, + "rewards/qatch_small_update_with_fm/std": 0.37017884850502014, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9947883486747742, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.10878457874059677, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 3826.0, + "completions/mean_length": 671.5, + "completions/mean_terminated_length": 671.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.14847872778773308, + "epoch": 0.8106194690265487, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.2268928508210983, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 240994906.0, + "reward": 0.8564844131469727, + "reward_std": 0.034672707319259644, + "rewards/qatch_small_update_with_fm/mean": 0.8564844131469727, + "rewards/qatch_small_update_with_fm/std": 0.28987762331962585, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9986143112182617, + "sampling/importance_sampling_ratio/min": 0.008972340263426304, + "sampling/sampling_logp_difference/max": 4.713608741760254, + "sampling/sampling_logp_difference/mean": 0.11376264691352844, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3260.0, + "completions/max_terminated_length": 3260.0, + "completions/mean_length": 511.375, + "completions/mean_terminated_length": 511.375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.14136963710188866, + "epoch": 0.8123893805309734, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4322927842698917, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 241443450.0, + "reward": 0.8052030801773071, + "reward_std": 0.04568236321210861, + "rewards/qatch_small_update_with_fm/mean": 0.8052030801773071, + "rewards/qatch_small_update_with_fm/std": 0.29775679111480713, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992789626121521, + "sampling/importance_sampling_ratio/min": 0.011154704727232456, + "sampling/sampling_logp_difference/max": 4.495893955230713, + "sampling/sampling_logp_difference/mean": 0.11692950874567032, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2345.0, + "completions/max_terminated_length": 2345.0, + "completions/mean_length": 471.6953125, + "completions/mean_terminated_length": 471.6953125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.10994228906929493, + "epoch": 0.8141592920353983, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.272719066624188, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 242050972.0, + "reward": 0.8606210947036743, + "reward_std": 0.03434634208679199, + "rewards/qatch_small_update_with_fm/mean": 0.8606210947036743, + "rewards/qatch_small_update_with_fm/std": 0.30872756242752075, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9928982257843018, + "sampling/importance_sampling_ratio/min": 0.002553700003772974, + "sampling/sampling_logp_difference/max": 5.970211982727051, + "sampling/sampling_logp_difference/mean": 0.09547945111989975, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3529.0, + "completions/max_terminated_length": 3529.0, + "completions/mean_length": 808.0859375, + "completions/mean_terminated_length": 808.0859375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.16546104848384857, + "epoch": 0.815929203539823, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.32918008336972193, + "learning_rate": 1e-06, + "loss": 0.0306, + "num_tokens": 242787538.0, + "reward": 0.7358086109161377, + "reward_std": 0.05101485177874565, + "rewards/qatch_small_update_with_fm/mean": 0.7358086109161377, + "rewards/qatch_small_update_with_fm/std": 0.34538453817367554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9978460669517517, + "sampling/importance_sampling_ratio/min": 0.0003619363415054977, + "sampling/sampling_logp_difference/max": 7.924042224884033, + "sampling/sampling_logp_difference/mean": 0.12252495437860489, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2679.0, + "completions/max_terminated_length": 2679.0, + "completions/mean_length": 503.17578125, + "completions/mean_terminated_length": 503.17578125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.1253802226856351, + "epoch": 0.8176991150442477, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.35180005875154, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 243200367.0, + "reward": 0.8208436965942383, + "reward_std": 0.053170062601566315, + "rewards/qatch_small_update_with_fm/mean": 0.8208437561988831, + "rewards/qatch_small_update_with_fm/std": 0.33666518330574036, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9941401481628418, + "sampling/importance_sampling_ratio/min": 0.005322411190718412, + "sampling/sampling_logp_difference/max": 5.235828876495361, + "sampling/sampling_logp_difference/mean": 0.10240937769412994, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3923.0, + "completions/mean_length": 744.49609375, + "completions/mean_terminated_length": 718.1063232421875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.16998208314180374, + "epoch": 0.8194690265486726, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2983180012105374, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 243699182.0, + "reward": 0.728640615940094, + "reward_std": 0.05621249973773956, + "rewards/qatch_small_update_with_fm/mean": 0.728640615940094, + "rewards/qatch_small_update_with_fm/std": 0.3335835039615631, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001490116119385, + "sampling/importance_sampling_ratio/min": 0.011265814304351807, + "sampling/sampling_logp_difference/max": 4.485982418060303, + "sampling/sampling_logp_difference/mean": 0.12198029458522797, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1772.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 556.328125, + "completions/mean_terminated_length": 556.328125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.1414416767656803, + "epoch": 0.8212389380530973, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4023449772591073, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 244336994.0, + "reward": 0.6802421808242798, + "reward_std": 0.042659495025873184, + "rewards/qatch_small_update_with_fm/mean": 0.6802421808242798, + "rewards/qatch_small_update_with_fm/std": 0.3406652808189392, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9933852553367615, + "sampling/importance_sampling_ratio/min": 0.00175619893707335, + "sampling/sampling_logp_difference/max": 6.344603538513184, + "sampling/sampling_logp_difference/mean": 0.11338303238153458, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1973.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 496.59765625, + "completions/mean_terminated_length": 496.59765625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.15152422711253166, + "epoch": 0.8230088495575221, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.3084270165573425, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 244833451.0, + "reward": 0.717523455619812, + "reward_std": 0.01150607131421566, + "rewards/qatch_small_update_with_fm/mean": 0.717523455619812, + "rewards/qatch_small_update_with_fm/std": 0.3694573640823364, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9948652982711792, + "sampling/importance_sampling_ratio/min": 0.011154529638588428, + "sampling/sampling_logp_difference/max": 4.495909690856934, + "sampling/sampling_logp_difference/mean": 0.1199587881565094, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4049.0, + "completions/mean_length": 964.40625, + "completions/mean_terminated_length": 876.3694458007812, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.1926432903856039, + "epoch": 0.8247787610619469, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.32375511279508756, + "learning_rate": 1e-06, + "loss": -0.1064, + "num_tokens": 245471555.0, + "reward": 0.6633554697036743, + "reward_std": 0.0848885253071785, + "rewards/qatch_small_update_with_fm/mean": 0.6633554697036743, + "rewards/qatch_small_update_with_fm/std": 0.3811967074871063, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0064408779144287, + "sampling/importance_sampling_ratio/min": 0.0007988452562130988, + "sampling/sampling_logp_difference/max": 7.132343292236328, + "sampling/sampling_logp_difference/mean": 0.13031846284866333, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3352.0, + "completions/max_terminated_length": 3352.0, + "completions/mean_length": 754.55078125, + "completions/mean_terminated_length": 754.55078125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.17175315879285336, + "epoch": 0.8265486725663717, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3004368041198786, + "learning_rate": 1e-06, + "loss": -0.0468, + "num_tokens": 246164080.0, + "reward": 0.8258593678474426, + "reward_std": 0.04333872348070145, + "rewards/qatch_small_update_with_fm/mean": 0.8258593678474426, + "rewards/qatch_small_update_with_fm/std": 0.28225964307785034, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9985412955284119, + "sampling/importance_sampling_ratio/min": 0.0001251505163963884, + "sampling/sampling_logp_difference/max": 8.985993385314941, + "sampling/sampling_logp_difference/mean": 0.12510454654693604, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3902.0, + "completions/max_terminated_length": 3902.0, + "completions/mean_length": 689.5234375, + "completions/mean_terminated_length": 689.5234375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.15941564738750458, + "epoch": 0.8283185840707965, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.46909349336641437, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 246863030.0, + "reward": 0.8296366930007935, + "reward_std": 0.0837571993470192, + "rewards/qatch_small_update_with_fm/mean": 0.8296366930007935, + "rewards/qatch_small_update_with_fm/std": 0.2871522605419159, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992177486419678, + "sampling/importance_sampling_ratio/min": 0.0001529164583189413, + "sampling/sampling_logp_difference/max": 8.785618782043457, + "sampling/sampling_logp_difference/mean": 0.11667613685131073, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 747.42578125, + "completions/mean_terminated_length": 734.294189453125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.16841256991028786, + "epoch": 0.8300884955752212, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.34838615373771503, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 247468851.0, + "reward": 0.6867148876190186, + "reward_std": 0.056690070778131485, + "rewards/qatch_small_update_with_fm/mean": 0.6867148876190186, + "rewards/qatch_small_update_with_fm/std": 0.3688320219516754, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9969513416290283, + "sampling/importance_sampling_ratio/min": 0.006885254755616188, + "sampling/sampling_logp_difference/max": 4.978373050689697, + "sampling/sampling_logp_difference/mean": 0.1274685263633728, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2827.0, + "completions/max_terminated_length": 2827.0, + "completions/mean_length": 638.0859375, + "completions/mean_terminated_length": 638.0859375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.1450775759294629, + "epoch": 0.831858407079646, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4692889831796996, + "learning_rate": 1e-06, + "loss": 0.0244, + "num_tokens": 248138985.0, + "reward": 0.7625468969345093, + "reward_std": 0.08190645277500153, + "rewards/qatch_small_update_with_fm/mean": 0.7625468969345093, + "rewards/qatch_small_update_with_fm/std": 0.3101102411746979, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9959828853607178, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.11293405294418335, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3411.0, + "completions/max_terminated_length": 3411.0, + "completions/mean_length": 471.22265625, + "completions/mean_terminated_length": 471.22265625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.1280396282672882, + "epoch": 0.8336283185840708, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.46987885082001113, + "learning_rate": 1e-06, + "loss": -0.0515, + "num_tokens": 248581938.0, + "reward": 0.8769687414169312, + "reward_std": 0.09168442338705063, + "rewards/qatch_small_update_with_fm/mean": 0.8769687414169312, + "rewards/qatch_small_update_with_fm/std": 0.25800442695617676, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934540390968323, + "sampling/importance_sampling_ratio/min": 0.009367694146931171, + "sampling/sampling_logp_difference/max": 4.670488357543945, + "sampling/sampling_logp_difference/mean": 0.10664229094982147, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 584.125, + "completions/mean_terminated_length": 528.3809814453125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.10968455206602812, + "epoch": 0.8353982300884956, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.28465633487894076, + "learning_rate": 1e-06, + "loss": -0.144, + "num_tokens": 249136658.0, + "reward": 0.7801679372787476, + "reward_std": 0.06691081821918488, + "rewards/qatch_small_update_with_fm/mean": 0.7801679372787476, + "rewards/qatch_small_update_with_fm/std": 0.3578769862651825, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9958081245422363, + "sampling/importance_sampling_ratio/min": 0.0017069082241505384, + "sampling/sampling_logp_difference/max": 6.373071670532227, + "sampling/sampling_logp_difference/mean": 0.09798172861337662, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3985.0, + "completions/mean_length": 773.7734375, + "completions/mean_terminated_length": 747.6141967773438, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.1615205705165863, + "epoch": 0.8371681415929203, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.3232061583745438, + "learning_rate": 1e-06, + "loss": -0.0545, + "num_tokens": 249666472.0, + "reward": 0.6544336080551147, + "reward_std": 0.07753826677799225, + "rewards/qatch_small_update_with_fm/mean": 0.65443354845047, + "rewards/qatch_small_update_with_fm/std": 0.33462199568748474, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9986249208450317, + "sampling/importance_sampling_ratio/min": 0.01115453988313675, + "sampling/sampling_logp_difference/max": 4.495908737182617, + "sampling/sampling_logp_difference/mean": 0.11811242997646332, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3852.0, + "completions/mean_length": 936.25390625, + "completions/mean_terminated_length": 873.310791015625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.17724050767719746, + "epoch": 0.8389380530973451, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.38820766688732716, + "learning_rate": 1e-06, + "loss": -0.0682, + "num_tokens": 250401577.0, + "reward": 0.6836913824081421, + "reward_std": 0.1162552535533905, + "rewards/qatch_small_update_with_fm/mean": 0.6836913824081421, + "rewards/qatch_small_update_with_fm/std": 0.3393937647342682, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9975631237030029, + "sampling/importance_sampling_ratio/min": 0.008690698072314262, + "sampling/sampling_logp_difference/max": 4.74550199508667, + "sampling/sampling_logp_difference/mean": 0.1296575367450714, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2105.0, + "completions/max_terminated_length": 2105.0, + "completions/mean_length": 527.55859375, + "completions/mean_terminated_length": 527.55859375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.12861198373138905, + "epoch": 0.8407079646017699, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5101103556172623, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 251068408.0, + "reward": 0.929367184638977, + "reward_std": 0.0932401642203331, + "rewards/qatch_small_update_with_fm/mean": 0.929367184638977, + "rewards/qatch_small_update_with_fm/std": 0.1822948455810547, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9920750856399536, + "sampling/importance_sampling_ratio/min": 0.008755389600992203, + "sampling/sampling_logp_difference/max": 4.738085746765137, + "sampling/sampling_logp_difference/mean": 0.11195670813322067, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3323.0, + "completions/mean_length": 659.09765625, + "completions/mean_terminated_length": 645.61962890625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1493313405662775, + "epoch": 0.8424778761061947, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.3765353734239925, + "learning_rate": 1e-06, + "loss": -0.0242, + "num_tokens": 251657729.0, + "reward": 0.8344101905822754, + "reward_std": 0.09548807889223099, + "rewards/qatch_small_update_with_fm/mean": 0.8344101905822754, + "rewards/qatch_small_update_with_fm/std": 0.2909124791622162, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9970696568489075, + "sampling/importance_sampling_ratio/min": 0.011136996559798717, + "sampling/sampling_logp_difference/max": 4.497482776641846, + "sampling/sampling_logp_difference/mean": 0.1136137917637825, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3756.0, + "completions/max_terminated_length": 3756.0, + "completions/mean_length": 669.21875, + "completions/mean_terminated_length": 669.21875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.13360167015343904, + "epoch": 0.8442477876106195, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.3526060990407319, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 252104105.0, + "reward": 0.7368632555007935, + "reward_std": 0.05987680330872536, + "rewards/qatch_small_update_with_fm/mean": 0.7368632555007935, + "rewards/qatch_small_update_with_fm/std": 0.3386927843093872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9987338781356812, + "sampling/importance_sampling_ratio/min": 0.008173546753823757, + "sampling/sampling_logp_difference/max": 4.806852340698242, + "sampling/sampling_logp_difference/mean": 0.10058818012475967, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3016.0, + "completions/mean_length": 849.1875, + "completions/mean_terminated_length": 823.6220703125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.15353283658623695, + "epoch": 0.8460176991150442, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.4516144346284262, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 252697289.0, + "reward": 0.6796093583106995, + "reward_std": 0.1720023900270462, + "rewards/qatch_small_update_with_fm/mean": 0.6796093583106995, + "rewards/qatch_small_update_with_fm/std": 0.3603024184703827, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9981167912483215, + "sampling/importance_sampling_ratio/min": 0.004114686511456966, + "sampling/sampling_logp_difference/max": 5.493192672729492, + "sampling/sampling_logp_difference/mean": 0.11910088360309601, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3387.0, + "completions/max_terminated_length": 3387.0, + "completions/mean_length": 628.56640625, + "completions/mean_terminated_length": 628.56640625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.13713755458593369, + "epoch": 0.8477876106194691, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5442060091729809, + "learning_rate": 1e-06, + "loss": 0.0159, + "num_tokens": 253342426.0, + "reward": 0.7485117316246033, + "reward_std": 0.09448330849409103, + "rewards/qatch_small_update_with_fm/mean": 0.7485117316246033, + "rewards/qatch_small_update_with_fm/std": 0.3188386857509613, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9971814751625061, + "sampling/importance_sampling_ratio/min": 0.004977615084499121, + "sampling/sampling_logp_difference/max": 5.302804470062256, + "sampling/sampling_logp_difference/mean": 0.11223170161247253, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3405.0, + "completions/mean_length": 737.57421875, + "completions/mean_terminated_length": 724.4039916992188, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.16004723869264126, + "epoch": 0.8495575221238938, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.3865325051659031, + "learning_rate": 1e-06, + "loss": -0.037, + "num_tokens": 253793261.0, + "reward": 0.6448359489440918, + "reward_std": 0.15111446380615234, + "rewards/qatch_small_update_with_fm/mean": 0.6448359489440918, + "rewards/qatch_small_update_with_fm/std": 0.3562937080860138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002942085266113, + "sampling/importance_sampling_ratio/min": 0.008727042004466057, + "sampling/sampling_logp_difference/max": 4.741328716278076, + "sampling/sampling_logp_difference/mean": 0.12233758717775345, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3297.0, + "completions/mean_length": 547.390625, + "completions/mean_terminated_length": 533.4745483398438, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.111550472676754, + "epoch": 0.8513274336283185, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.2075705475708058, + "learning_rate": 1e-06, + "loss": -0.0327, + "num_tokens": 254333697.0, + "reward": 0.6903281211853027, + "reward_std": 0.03593749552965164, + "rewards/qatch_small_update_with_fm/mean": 0.6903281211853027, + "rewards/qatch_small_update_with_fm/std": 0.31454235315322876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991868793964386, + "sampling/importance_sampling_ratio/min": 1.0412995834485628e-05, + "sampling/sampling_logp_difference/max": 11.472455978393555, + "sampling/sampling_logp_difference/mean": 0.10045738518238068, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3397.0, + "completions/mean_length": 843.91015625, + "completions/mean_terminated_length": 779.1275024414062, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.17988201044499874, + "epoch": 0.8530973451327434, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.3950648815701397, + "learning_rate": 1e-06, + "loss": -0.0401, + "num_tokens": 254895770.0, + "reward": 0.7115039229393005, + "reward_std": 0.05014953017234802, + "rewards/qatch_small_update_with_fm/mean": 0.7115039229393005, + "rewards/qatch_small_update_with_fm/std": 0.3483102321624756, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0065269470214844, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.12747487425804138, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3370.0, + "completions/max_terminated_length": 3370.0, + "completions/mean_length": 718.48828125, + "completions/mean_terminated_length": 718.48828125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.13500451855361462, + "epoch": 0.8548672566371681, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.43031337259879043, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 255571047.0, + "reward": 0.6986562609672546, + "reward_std": 0.07628892362117767, + "rewards/qatch_small_update_with_fm/mean": 0.6986562609672546, + "rewards/qatch_small_update_with_fm/std": 0.35163816809654236, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9929754137992859, + "sampling/importance_sampling_ratio/min": 0.002005315152928233, + "sampling/sampling_logp_difference/max": 6.211954116821289, + "sampling/sampling_logp_difference/mean": 0.11147291958332062, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3851.0, + "completions/mean_length": 643.24609375, + "completions/mean_terminated_length": 588.4404907226562, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.11379838734865189, + "epoch": 0.856637168141593, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.33428081990047404, + "learning_rate": 1e-06, + "loss": -0.1222, + "num_tokens": 256008854.0, + "reward": 0.7900702953338623, + "reward_std": 0.04124121367931366, + "rewards/qatch_small_update_with_fm/mean": 0.7900702953338623, + "rewards/qatch_small_update_with_fm/std": 0.31325802206993103, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9924777746200562, + "sampling/importance_sampling_ratio/min": 5.922396303503774e-05, + "sampling/sampling_logp_difference/max": 9.734184265136719, + "sampling/sampling_logp_difference/mean": 0.10147938132286072, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3578.0, + "completions/max_terminated_length": 3578.0, + "completions/mean_length": 783.12109375, + "completions/mean_terminated_length": 783.12109375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.11473686061799526, + "epoch": 0.8584070796460177, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.3716733908405323, + "learning_rate": 1e-06, + "loss": -0.0131, + "num_tokens": 256643029.0, + "reward": 0.7510117292404175, + "reward_std": 0.05420318618416786, + "rewards/qatch_small_update_with_fm/mean": 0.7510117292404175, + "rewards/qatch_small_update_with_fm/std": 0.3284763693809509, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.993644118309021, + "sampling/importance_sampling_ratio/min": 0.0015391077613458037, + "sampling/sampling_logp_difference/max": 6.476552486419678, + "sampling/sampling_logp_difference/mean": 0.09778095036745071, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2555.0, + "completions/max_terminated_length": 2555.0, + "completions/mean_length": 506.46484375, + "completions/mean_terminated_length": 506.46484375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.1107774619013071, + "epoch": 0.8601769911504424, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4364102557318908, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 257068300.0, + "reward": 0.7705351710319519, + "reward_std": 0.07204098999500275, + "rewards/qatch_small_update_with_fm/mean": 0.7705351710319519, + "rewards/qatch_small_update_with_fm/std": 0.3419608771800995, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9939799904823303, + "sampling/importance_sampling_ratio/min": 0.0054782782681286335, + "sampling/sampling_logp_difference/max": 5.206964492797852, + "sampling/sampling_logp_difference/mean": 0.09605875611305237, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3576.0, + "completions/max_terminated_length": 3576.0, + "completions/mean_length": 663.84375, + "completions/mean_terminated_length": 663.84375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.14957253821194172, + "epoch": 0.8619469026548673, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.48289123247994914, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 257611332.0, + "reward": 0.8433437347412109, + "reward_std": 0.11846810579299927, + "rewards/qatch_small_update_with_fm/mean": 0.8433437347412109, + "rewards/qatch_small_update_with_fm/std": 0.31830912828445435, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99498450756073, + "sampling/importance_sampling_ratio/min": 0.005322446580976248, + "sampling/sampling_logp_difference/max": 5.2358222007751465, + "sampling/sampling_logp_difference/mean": 0.11823485791683197, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3639.0, + "completions/max_terminated_length": 3639.0, + "completions/mean_length": 532.5, + "completions/mean_terminated_length": 532.5, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.1004238873720169, + "epoch": 0.863716814159292, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.34170294454283023, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 258201076.0, + "reward": 0.8699023127555847, + "reward_std": 0.008277655579149723, + "rewards/qatch_small_update_with_fm/mean": 0.8699023127555847, + "rewards/qatch_small_update_with_fm/std": 0.24368809163570404, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896460771560669, + "sampling/importance_sampling_ratio/min": 0.005287018604576588, + "sampling/sampling_logp_difference/max": 5.2425007820129395, + "sampling/sampling_logp_difference/mean": 0.09384314715862274, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3694.0, + "completions/max_terminated_length": 3694.0, + "completions/mean_length": 638.52734375, + "completions/mean_terminated_length": 638.52734375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.10675976052880287, + "epoch": 0.8654867256637168, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4477106407992075, + "learning_rate": 1e-06, + "loss": -0.031, + "num_tokens": 258868619.0, + "reward": 0.827468752861023, + "reward_std": 0.06248141825199127, + "rewards/qatch_small_update_with_fm/mean": 0.827468752861023, + "rewards/qatch_small_update_with_fm/std": 0.31693241000175476, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.993117094039917, + "sampling/importance_sampling_ratio/min": 0.002081583719700575, + "sampling/sampling_logp_difference/max": 6.174626350402832, + "sampling/sampling_logp_difference/mean": 0.09679339826107025, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3630.0, + "completions/max_terminated_length": 3630.0, + "completions/mean_length": 555.64453125, + "completions/mean_terminated_length": 555.64453125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.11295890063047409, + "epoch": 0.8672566371681416, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.47637855017594344, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 259404736.0, + "reward": 0.8470624685287476, + "reward_std": 0.08143269270658493, + "rewards/qatch_small_update_with_fm/mean": 0.8470624685287476, + "rewards/qatch_small_update_with_fm/std": 0.28385287523269653, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990898847579956, + "sampling/importance_sampling_ratio/min": 0.005345887504518032, + "sampling/sampling_logp_difference/max": 5.2314276695251465, + "sampling/sampling_logp_difference/mean": 0.10513126850128174, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2616.0, + "completions/max_terminated_length": 2616.0, + "completions/mean_length": 525.6875, + "completions/mean_terminated_length": 525.6875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.11057206615805626, + "epoch": 0.8690265486725663, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.49316013842711826, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 259886576.0, + "reward": 0.6682304739952087, + "reward_std": 0.06963515281677246, + "rewards/qatch_small_update_with_fm/mean": 0.6682304739952087, + "rewards/qatch_small_update_with_fm/std": 0.41227987408638, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.994171142578125, + "sampling/importance_sampling_ratio/min": 5.919911814089573e-07, + "sampling/sampling_logp_difference/max": 14.339774131774902, + "sampling/sampling_logp_difference/mean": 0.09741106629371643, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3209.0, + "completions/max_terminated_length": 3209.0, + "completions/mean_length": 544.3125, + "completions/mean_terminated_length": 544.3125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.10203938465565443, + "epoch": 0.8707964601769912, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.40395896479505305, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 260393056.0, + "reward": 0.825976550579071, + "reward_std": 0.07064475864171982, + "rewards/qatch_small_update_with_fm/mean": 0.825976550579071, + "rewards/qatch_small_update_with_fm/std": 0.3124232292175293, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9923082590103149, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.09712864458560944, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3687.0, + "completions/mean_length": 740.96875, + "completions/mean_terminated_length": 727.8118286132812, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.13502811640501022, + "epoch": 0.8725663716814159, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7541222795830347, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 261195416.0, + "reward": 0.6253867149353027, + "reward_std": 0.036423392593860626, + "rewards/qatch_small_update_with_fm/mean": 0.6253867149353027, + "rewards/qatch_small_update_with_fm/std": 0.39507678151130676, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9943881034851074, + "sampling/importance_sampling_ratio/min": 0.0019406526116654277, + "sampling/sampling_logp_difference/max": 6.2447309494018555, + "sampling/sampling_logp_difference/mean": 0.11299841105937958, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3588.0, + "completions/max_terminated_length": 3588.0, + "completions/mean_length": 626.94140625, + "completions/mean_terminated_length": 626.94140625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.1206303657963872, + "epoch": 0.8743362831858407, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4655702140113037, + "learning_rate": 1e-06, + "loss": 0.0335, + "num_tokens": 261846201.0, + "reward": 0.6357265710830688, + "reward_std": 0.05497089400887489, + "rewards/qatch_small_update_with_fm/mean": 0.6357265710830688, + "rewards/qatch_small_update_with_fm/std": 0.380975604057312, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99620121717453, + "sampling/importance_sampling_ratio/min": 0.006773304659873247, + "sampling/sampling_logp_difference/max": 4.9947662353515625, + "sampling/sampling_logp_difference/mean": 0.10108514130115509, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3938.0, + "completions/max_terminated_length": 3938.0, + "completions/mean_length": 853.140625, + "completions/mean_terminated_length": 853.140625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.15600799396634102, + "epoch": 0.8761061946902655, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.2990253421074993, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 262474925.0, + "reward": 0.7554570436477661, + "reward_std": 0.0371590256690979, + "rewards/qatch_small_update_with_fm/mean": 0.7554570436477661, + "rewards/qatch_small_update_with_fm/std": 0.3164856433868408, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999498128890991, + "sampling/importance_sampling_ratio/min": 0.006773614324629307, + "sampling/sampling_logp_difference/max": 4.994720458984375, + "sampling/sampling_logp_difference/mean": 0.11973873525857925, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3271.0, + "completions/max_terminated_length": 3271.0, + "completions/mean_length": 620.87890625, + "completions/mean_terminated_length": 620.87890625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.1256363745778799, + "epoch": 0.8778761061946903, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.695171336012073, + "learning_rate": 1e-06, + "loss": 0.0355, + "num_tokens": 263142830.0, + "reward": 0.7039062976837158, + "reward_std": 0.12556229531764984, + "rewards/qatch_small_update_with_fm/mean": 0.7039062976837158, + "rewards/qatch_small_update_with_fm/std": 0.37397584319114685, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914224147796631, + "sampling/importance_sampling_ratio/min": 0.0022153332829475403, + "sampling/sampling_logp_difference/max": 6.11235237121582, + "sampling/sampling_logp_difference/mean": 0.11125008016824722, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3627.0, + "completions/max_terminated_length": 3627.0, + "completions/mean_length": 599.80078125, + "completions/mean_terminated_length": 599.80078125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.11637549288570881, + "epoch": 0.879646017699115, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.1743804157983891, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 263689803.0, + "reward": 0.7390859127044678, + "reward_std": 0.009406249970197678, + "rewards/qatch_small_update_with_fm/mean": 0.7390859127044678, + "rewards/qatch_small_update_with_fm/std": 0.3409428298473358, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9943405389785767, + "sampling/importance_sampling_ratio/min": 0.008762093260884285, + "sampling/sampling_logp_difference/max": 4.737320423126221, + "sampling/sampling_logp_difference/mean": 0.09983810782432556, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2520.0, + "completions/max_terminated_length": 2520.0, + "completions/mean_length": 592.25, + "completions/mean_terminated_length": 592.25, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.12224038224667311, + "epoch": 0.8814159292035398, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4241928294714878, + "learning_rate": 1e-06, + "loss": -0.0232, + "num_tokens": 264256907.0, + "reward": 0.7852305173873901, + "reward_std": 0.04839783534407616, + "rewards/qatch_small_update_with_fm/mean": 0.7852305173873901, + "rewards/qatch_small_update_with_fm/std": 0.32617098093032837, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915899634361267, + "sampling/importance_sampling_ratio/min": 0.00750912819057703, + "sampling/sampling_logp_difference/max": 4.891635894775391, + "sampling/sampling_logp_difference/mean": 0.10896101593971252, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3446.0, + "completions/mean_length": 680.08203125, + "completions/mean_terminated_length": 653.18505859375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.1548197865486145, + "epoch": 0.8831858407079646, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.42449434564075406, + "learning_rate": 1e-06, + "loss": -0.0168, + "num_tokens": 264778464.0, + "reward": 0.7460390329360962, + "reward_std": 0.04268177971243858, + "rewards/qatch_small_update_with_fm/mean": 0.7460390329360962, + "rewards/qatch_small_update_with_fm/std": 0.3574458062648773, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997272491455078, + "sampling/importance_sampling_ratio/min": 0.00681761559098959, + "sampling/sampling_logp_difference/max": 4.988245487213135, + "sampling/sampling_logp_difference/mean": 0.12331503629684448, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3910.0, + "completions/mean_length": 621.04296875, + "completions/mean_terminated_length": 607.4157104492188, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.12380017153918743, + "epoch": 0.8849557522123894, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5938474878995735, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 265333899.0, + "reward": 0.6871640682220459, + "reward_std": 0.11829064786434174, + "rewards/qatch_small_update_with_fm/mean": 0.6871640682220459, + "rewards/qatch_small_update_with_fm/std": 0.4201640784740448, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9962127208709717, + "sampling/importance_sampling_ratio/min": 0.00790239404886961, + "sampling/sampling_logp_difference/max": 4.84058952331543, + "sampling/sampling_logp_difference/mean": 0.10077165812253952, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3528.0, + "completions/mean_length": 649.7109375, + "completions/mean_terminated_length": 636.1961059570312, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.13394583947956562, + "epoch": 0.8867256637168142, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5901939553261971, + "learning_rate": 1e-06, + "loss": -0.0195, + "num_tokens": 265955729.0, + "reward": 0.7199413776397705, + "reward_std": 0.13469067215919495, + "rewards/qatch_small_update_with_fm/mean": 0.7199413776397705, + "rewards/qatch_small_update_with_fm/std": 0.358967661857605, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9926297068595886, + "sampling/importance_sampling_ratio/min": 0.0005967216566205025, + "sampling/sampling_logp_difference/max": 7.424059867858887, + "sampling/sampling_logp_difference/mean": 0.1124434769153595, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3276.0, + "completions/max_terminated_length": 3276.0, + "completions/mean_length": 795.90625, + "completions/mean_terminated_length": 795.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.13532480970025063, + "epoch": 0.8884955752212389, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.47390615928516006, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 266872601.0, + "reward": 0.6929843425750732, + "reward_std": 0.128580704331398, + "rewards/qatch_small_update_with_fm/mean": 0.6929843425750732, + "rewards/qatch_small_update_with_fm/std": 0.3396168649196625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927940964698792, + "sampling/importance_sampling_ratio/min": 0.0025748631451278925, + "sampling/sampling_logp_difference/max": 5.961958885192871, + "sampling/sampling_logp_difference/mean": 0.11729906499385834, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3900.0, + "completions/mean_length": 747.88671875, + "completions/mean_terminated_length": 625.8906860351562, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.13497674651443958, + "epoch": 0.8902654867256637, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.397347167414158, + "learning_rate": 1e-06, + "loss": -0.1661, + "num_tokens": 267346108.0, + "reward": 0.7580468654632568, + "reward_std": 0.09373210370540619, + "rewards/qatch_small_update_with_fm/mean": 0.7580468654632568, + "rewards/qatch_small_update_with_fm/std": 0.3393411636352539, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9984617829322815, + "sampling/importance_sampling_ratio/min": 0.0005745299276895821, + "sampling/sampling_logp_difference/max": 7.461958408355713, + "sampling/sampling_logp_difference/mean": 0.10664564371109009, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3773.0, + "completions/max_terminated_length": 3773.0, + "completions/mean_length": 794.94140625, + "completions/mean_terminated_length": 794.94140625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.14703031815588474, + "epoch": 0.8920353982300885, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.47300725939173244, + "learning_rate": 1e-06, + "loss": -0.038, + "num_tokens": 268114605.0, + "reward": 0.746777355670929, + "reward_std": 0.1325729787349701, + "rewards/qatch_small_update_with_fm/mean": 0.746777355670929, + "rewards/qatch_small_update_with_fm/std": 0.3324522376060486, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946229457855225, + "sampling/importance_sampling_ratio/min": 0.004103472921997309, + "sampling/sampling_logp_difference/max": 5.495921611785889, + "sampling/sampling_logp_difference/mean": 0.11896717548370361, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2692.0, + "completions/max_terminated_length": 2692.0, + "completions/mean_length": 607.3125, + "completions/mean_terminated_length": 607.3125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.1080400925129652, + "epoch": 0.8938053097345132, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5307241600276866, + "learning_rate": 1e-06, + "loss": 0.04, + "num_tokens": 268747581.0, + "reward": 0.7987890243530273, + "reward_std": 0.045952875167131424, + "rewards/qatch_small_update_with_fm/mean": 0.7987890243530273, + "rewards/qatch_small_update_with_fm/std": 0.3247639536857605, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9940429925918579, + "sampling/importance_sampling_ratio/min": 0.003386986441910267, + "sampling/sampling_logp_difference/max": 5.687814712524414, + "sampling/sampling_logp_difference/mean": 0.09250658750534058, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1967.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 516.34375, + "completions/mean_terminated_length": 516.34375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.10784084629267454, + "epoch": 0.8955752212389381, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.44363434212922453, + "learning_rate": 1e-06, + "loss": -0.0223, + "num_tokens": 269317493.0, + "reward": 0.7939062714576721, + "reward_std": 0.05867087468504906, + "rewards/qatch_small_update_with_fm/mean": 0.7939062714576721, + "rewards/qatch_small_update_with_fm/std": 0.31158387660980225, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9887971878051758, + "sampling/importance_sampling_ratio/min": 0.006596569903194904, + "sampling/sampling_logp_difference/max": 5.021205425262451, + "sampling/sampling_logp_difference/mean": 0.10122236609458923, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 547.87890625, + "completions/mean_terminated_length": 533.9647216796875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.10982772801071405, + "epoch": 0.8973451327433628, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5602113323442856, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 270047254.0, + "reward": 0.8025702834129333, + "reward_std": 0.07300128787755966, + "rewards/qatch_small_update_with_fm/mean": 0.8025702834129333, + "rewards/qatch_small_update_with_fm/std": 0.36065641045570374, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890502691268921, + "sampling/importance_sampling_ratio/min": 0.008726593106985092, + "sampling/sampling_logp_difference/max": 4.741380214691162, + "sampling/sampling_logp_difference/mean": 0.10416848212480545, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3059.0, + "completions/max_terminated_length": 3059.0, + "completions/mean_length": 555.31640625, + "completions/mean_terminated_length": 555.31640625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.10554197616875172, + "epoch": 0.8991150442477877, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6791503270002681, + "learning_rate": 1e-06, + "loss": -0.0123, + "num_tokens": 270632903.0, + "reward": 0.7361679673194885, + "reward_std": 0.06678478419780731, + "rewards/qatch_small_update_with_fm/mean": 0.7361679673194885, + "rewards/qatch_small_update_with_fm/std": 0.3236234784126282, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916872382164001, + "sampling/importance_sampling_ratio/min": 0.006751068867743015, + "sampling/sampling_logp_difference/max": 4.998054504394531, + "sampling/sampling_logp_difference/mean": 0.0991954579949379, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3049.0, + "completions/mean_length": 520.3515625, + "completions/mean_terminated_length": 492.19683837890625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.10747648775577545, + "epoch": 0.9008849557522124, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.31376394668547153, + "learning_rate": 1e-06, + "loss": -0.0331, + "num_tokens": 271390833.0, + "reward": 0.8341054916381836, + "reward_std": 0.05029167979955673, + "rewards/qatch_small_update_with_fm/mean": 0.8341054916381836, + "rewards/qatch_small_update_with_fm/std": 0.33065176010131836, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912950396537781, + "sampling/importance_sampling_ratio/min": 0.000440972886281088, + "sampling/sampling_logp_difference/max": 7.726527214050293, + "sampling/sampling_logp_difference/mean": 0.10213141143321991, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1767.0, + "completions/max_terminated_length": 1767.0, + "completions/mean_length": 475.76171875, + "completions/mean_terminated_length": 475.76171875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.101709870621562, + "epoch": 0.9026548672566371, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5136908947485258, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 271914180.0, + "reward": 0.7939648628234863, + "reward_std": 0.10082687437534332, + "rewards/qatch_small_update_with_fm/mean": 0.7939648032188416, + "rewards/qatch_small_update_with_fm/std": 0.3312971591949463, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915147423744202, + "sampling/importance_sampling_ratio/min": 0.00526475952938199, + "sampling/sampling_logp_difference/max": 5.246719837188721, + "sampling/sampling_logp_difference/mean": 0.09459386765956879, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3733.0, + "completions/mean_length": 709.73828125, + "completions/mean_terminated_length": 683.0748291015625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.11102820746600628, + "epoch": 0.904424778761062, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.0245789709175026, + "learning_rate": 1e-06, + "loss": -0.0864, + "num_tokens": 272625921.0, + "reward": 0.8580273389816284, + "reward_std": 0.0242561474442482, + "rewards/qatch_small_update_with_fm/mean": 0.8580273389816284, + "rewards/qatch_small_update_with_fm/std": 0.2759164571762085, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9949482679367065, + "sampling/importance_sampling_ratio/min": 0.005282989237457514, + "sampling/sampling_logp_difference/max": 5.243263244628906, + "sampling/sampling_logp_difference/mean": 0.09105080366134644, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1474.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 379.58984375, + "completions/mean_terminated_length": 379.58984375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.09121392387896776, + "epoch": 0.9061946902654867, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.2937055756246945, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 273184376.0, + "reward": 0.960460901260376, + "reward_std": 0.02830149233341217, + "rewards/qatch_small_update_with_fm/mean": 0.960460901260376, + "rewards/qatch_small_update_with_fm/std": 0.14264100790023804, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898667931556702, + "sampling/importance_sampling_ratio/min": 0.0067546553909778595, + "sampling/sampling_logp_difference/max": 4.997523307800293, + "sampling/sampling_logp_difference/mean": 0.0861605778336525, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3332.0, + "completions/max_terminated_length": 3332.0, + "completions/mean_length": 550.4609375, + "completions/mean_terminated_length": 550.4609375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.12689280044287443, + "epoch": 0.9079646017699115, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5932380148617874, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 273674846.0, + "reward": 0.7879921793937683, + "reward_std": 0.14545322954654694, + "rewards/qatch_small_update_with_fm/mean": 0.7879921793937683, + "rewards/qatch_small_update_with_fm/std": 0.3656655251979828, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9960757493972778, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.10277114808559418, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3784.0, + "completions/mean_length": 491.1015625, + "completions/mean_terminated_length": 476.9647216796875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.10514062270522118, + "epoch": 0.9097345132743363, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4467300183498017, + "learning_rate": 1e-06, + "loss": -0.0923, + "num_tokens": 274116648.0, + "reward": 0.872878909111023, + "reward_std": 0.04924742877483368, + "rewards/qatch_small_update_with_fm/mean": 0.872878909111023, + "rewards/qatch_small_update_with_fm/std": 0.22971169650554657, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.994683027267456, + "sampling/importance_sampling_ratio/min": 0.0010823322227224708, + "sampling/sampling_logp_difference/max": 6.82863712310791, + "sampling/sampling_logp_difference/mean": 0.09253266453742981, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2293.0, + "completions/max_terminated_length": 2293.0, + "completions/mean_length": 489.421875, + "completions/mean_terminated_length": 489.421875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.10272311139851809, + "epoch": 0.911504424778761, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6825855255982989, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 274671252.0, + "reward": 0.7494022846221924, + "reward_std": 0.08151744306087494, + "rewards/qatch_small_update_with_fm/mean": 0.7494022846221924, + "rewards/qatch_small_update_with_fm/std": 0.3152708411216736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9923495054244995, + "sampling/importance_sampling_ratio/min": 0.000986733939498663, + "sampling/sampling_logp_difference/max": 6.921110153198242, + "sampling/sampling_logp_difference/mean": 0.09436982870101929, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3437.0, + "completions/mean_length": 709.078125, + "completions/mean_terminated_length": 695.796142578125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.1157941734418273, + "epoch": 0.9132743362831859, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.44786462791495824, + "learning_rate": 1e-06, + "loss": -0.0207, + "num_tokens": 275267848.0, + "reward": 0.7720664143562317, + "reward_std": 0.09372272342443466, + "rewards/qatch_small_update_with_fm/mean": 0.7720664143562317, + "rewards/qatch_small_update_with_fm/std": 0.31302616000175476, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927377700805664, + "sampling/importance_sampling_ratio/min": 0.004789196886122227, + "sampling/sampling_logp_difference/max": 5.341392517089844, + "sampling/sampling_logp_difference/mean": 0.09881985187530518, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3991.0, + "completions/mean_length": 756.34375, + "completions/mean_terminated_length": 716.7431030273438, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.11825973261147738, + "epoch": 0.9150442477876106, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6220394658984534, + "learning_rate": 1e-06, + "loss": -0.078, + "num_tokens": 276139968.0, + "reward": 0.6929961442947388, + "reward_std": 0.07409294694662094, + "rewards/qatch_small_update_with_fm/mean": 0.6929961442947388, + "rewards/qatch_small_update_with_fm/std": 0.38647279143333435, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.995445966720581, + "sampling/importance_sampling_ratio/min": 0.00030024454463273287, + "sampling/sampling_logp_difference/max": 8.110913276672363, + "sampling/sampling_logp_difference/mean": 0.09951721131801605, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1978.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 383.0859375, + "completions/mean_terminated_length": 383.0859375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.0746139083057642, + "epoch": 0.9168141592920354, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4217912624034053, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 276603734.0, + "reward": 0.7472773194313049, + "reward_std": 0.04199932515621185, + "rewards/qatch_small_update_with_fm/mean": 0.7472773194313049, + "rewards/qatch_small_update_with_fm/std": 0.3478599488735199, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900175333023071, + "sampling/importance_sampling_ratio/min": 7.775412814226002e-05, + "sampling/sampling_logp_difference/max": 9.461958885192871, + "sampling/sampling_logp_difference/mean": 0.07847994565963745, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3543.0, + "completions/max_terminated_length": 3543.0, + "completions/mean_length": 668.703125, + "completions/mean_terminated_length": 668.703125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.11498175282031298, + "epoch": 0.9185840707964602, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.45490108367204685, + "learning_rate": 1e-06, + "loss": 0.0702, + "num_tokens": 277088282.0, + "reward": 0.7890429496765137, + "reward_std": 0.05803219974040985, + "rewards/qatch_small_update_with_fm/mean": 0.7890429496765137, + "rewards/qatch_small_update_with_fm/std": 0.2938583195209503, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992883563041687, + "sampling/importance_sampling_ratio/min": 0.0026743346825242043, + "sampling/sampling_logp_difference/max": 5.9240546226501465, + "sampling/sampling_logp_difference/mean": 0.10016723722219467, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2929.0, + "completions/max_terminated_length": 2929.0, + "completions/mean_length": 570.6796875, + "completions/mean_terminated_length": 570.6796875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.12316263653337955, + "epoch": 0.9203539823008849, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5428826382650482, + "learning_rate": 1e-06, + "loss": 0.0333, + "num_tokens": 277591640.0, + "reward": 0.8450508117675781, + "reward_std": 0.0795350894331932, + "rewards/qatch_small_update_with_fm/mean": 0.8450508117675781, + "rewards/qatch_small_update_with_fm/std": 0.2786359190940857, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909742474555969, + "sampling/importance_sampling_ratio/min": 0.0041517033241689205, + "sampling/sampling_logp_difference/max": 5.484236717224121, + "sampling/sampling_logp_difference/mean": 0.11202063411474228, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4030.0, + "completions/mean_length": 851.640625, + "completions/mean_terminated_length": 838.917724609375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.1328129693865776, + "epoch": 0.9221238938053097, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6348699462205718, + "learning_rate": 1e-06, + "loss": -0.0749, + "num_tokens": 278204700.0, + "reward": 0.6865078210830688, + "reward_std": 0.0656687542796135, + "rewards/qatch_small_update_with_fm/mean": 0.6865078210830688, + "rewards/qatch_small_update_with_fm/std": 0.35934215784072876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9983229041099548, + "sampling/importance_sampling_ratio/min": 0.0016617655055597425, + "sampling/sampling_logp_difference/max": 6.399874687194824, + "sampling/sampling_logp_difference/mean": 0.10541870445013046, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1831.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 466.08984375, + "completions/mean_terminated_length": 466.08984375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.09466620348393917, + "epoch": 0.9238938053097345, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5389918379739942, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 278721795.0, + "reward": 0.7020820379257202, + "reward_std": 0.04175513610243797, + "rewards/qatch_small_update_with_fm/mean": 0.7020820379257202, + "rewards/qatch_small_update_with_fm/std": 0.2982487976551056, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9895206689834595, + "sampling/importance_sampling_ratio/min": 0.0014664140762761235, + "sampling/sampling_logp_difference/max": 6.524935245513916, + "sampling/sampling_logp_difference/mean": 0.0922519713640213, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2794.0, + "completions/max_terminated_length": 2794.0, + "completions/mean_length": 697.234375, + "completions/mean_terminated_length": 697.234375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.12420686054974794, + "epoch": 0.9256637168141593, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4637277192811646, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 279378175.0, + "reward": 0.907964825630188, + "reward_std": 0.060234054923057556, + "rewards/qatch_small_update_with_fm/mean": 0.907964825630188, + "rewards/qatch_small_update_with_fm/std": 0.2710384428501129, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9939944744110107, + "sampling/importance_sampling_ratio/min": 3.092254246439552e-06, + "sampling/sampling_logp_difference/max": 12.686610221862793, + "sampling/sampling_logp_difference/mean": 0.10830149054527283, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3774.0, + "completions/max_terminated_length": 3774.0, + "completions/mean_length": 592.01953125, + "completions/mean_terminated_length": 592.01953125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.11691632680594921, + "epoch": 0.9274336283185841, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4683143629081742, + "learning_rate": 1e-06, + "loss": 0.0532, + "num_tokens": 279822276.0, + "reward": 0.7248905897140503, + "reward_std": 0.0623093843460083, + "rewards/qatch_small_update_with_fm/mean": 0.7248905897140503, + "rewards/qatch_small_update_with_fm/std": 0.3484153151512146, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925761222839355, + "sampling/importance_sampling_ratio/min": 0.008074809797108173, + "sampling/sampling_logp_difference/max": 4.819005966186523, + "sampling/sampling_logp_difference/mean": 0.1028011366724968, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2042.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 483.20703125, + "completions/mean_terminated_length": 483.20703125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.08595089986920357, + "epoch": 0.9292035398230089, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5232236333613473, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 280248649.0, + "reward": 0.800000011920929, + "reward_std": 0.05487021803855896, + "rewards/qatch_small_update_with_fm/mean": 0.800000011920929, + "rewards/qatch_small_update_with_fm/std": 0.3535082936286926, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9883502721786499, + "sampling/importance_sampling_ratio/min": 0.004996729549020529, + "sampling/sampling_logp_difference/max": 5.298971652984619, + "sampling/sampling_logp_difference/mean": 0.09205498546361923, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2210.0, + "completions/max_terminated_length": 2210.0, + "completions/mean_length": 597.28515625, + "completions/mean_terminated_length": 597.28515625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.11841507069766521, + "epoch": 0.9309734513274336, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5666674244929343, + "learning_rate": 1e-06, + "loss": 0.0301, + "num_tokens": 280657026.0, + "reward": 0.8406132459640503, + "reward_std": 0.0987255647778511, + "rewards/qatch_small_update_with_fm/mean": 0.8406132459640503, + "rewards/qatch_small_update_with_fm/std": 0.2870708405971527, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896042943000793, + "sampling/importance_sampling_ratio/min": 0.002490278333425522, + "sampling/sampling_logp_difference/max": 5.995360851287842, + "sampling/sampling_logp_difference/mean": 0.1075456291437149, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1692.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 487.5078125, + "completions/mean_terminated_length": 487.5078125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.08723330218344927, + "epoch": 0.9327433628318584, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5266721054678201, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 281301316.0, + "reward": 0.7623125314712524, + "reward_std": 0.029279010370373726, + "rewards/qatch_small_update_with_fm/mean": 0.7623125314712524, + "rewards/qatch_small_update_with_fm/std": 0.37368521094322205, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890673160552979, + "sampling/importance_sampling_ratio/min": 2.2507217067868623e-07, + "sampling/sampling_logp_difference/max": 15.306844711303711, + "sampling/sampling_logp_difference/mean": 0.08900699019432068, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3234.0, + "completions/max_terminated_length": 3234.0, + "completions/mean_length": 670.80078125, + "completions/mean_terminated_length": 670.80078125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.10302252415567636, + "epoch": 0.9345132743362832, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.48289001580005797, + "learning_rate": 1e-06, + "loss": -0.033, + "num_tokens": 281832577.0, + "reward": 0.714773416519165, + "reward_std": 0.07669016718864441, + "rewards/qatch_small_update_with_fm/mean": 0.714773416519165, + "rewards/qatch_small_update_with_fm/std": 0.334821879863739, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901472926139832, + "sampling/importance_sampling_ratio/min": 0.004114319570362568, + "sampling/sampling_logp_difference/max": 5.493281841278076, + "sampling/sampling_logp_difference/mean": 0.10036074370145798, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3983.0, + "completions/mean_length": 1048.1640625, + "completions/mean_terminated_length": 962.4818725585938, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.12826403323560953, + "epoch": 0.9362831858407079, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4273381754299511, + "learning_rate": 1e-06, + "loss": -0.0511, + "num_tokens": 282545243.0, + "reward": 0.6557812690734863, + "reward_std": 0.11316819489002228, + "rewards/qatch_small_update_with_fm/mean": 0.6557812690734863, + "rewards/qatch_small_update_with_fm/std": 0.37949860095977783, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934544563293457, + "sampling/importance_sampling_ratio/min": 0.00034160760696977377, + "sampling/sampling_logp_difference/max": 7.981847763061523, + "sampling/sampling_logp_difference/mean": 0.10794238746166229, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3022.0, + "completions/mean_length": 870.19921875, + "completions/mean_terminated_length": 857.549072265625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.14097079075872898, + "epoch": 0.9380530973451328, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4239509379502484, + "learning_rate": 1e-06, + "loss": -0.0112, + "num_tokens": 283138702.0, + "reward": 0.6905586123466492, + "reward_std": 0.06804914027452469, + "rewards/qatch_small_update_with_fm/mean": 0.6905585527420044, + "rewards/qatch_small_update_with_fm/std": 0.3696425259113312, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9920858144760132, + "sampling/importance_sampling_ratio/min": 0.003613534849137068, + "sampling/sampling_logp_difference/max": 5.623068809509277, + "sampling/sampling_logp_difference/mean": 0.11941371113061905, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2325.0, + "completions/max_terminated_length": 2325.0, + "completions/mean_length": 528.30078125, + "completions/mean_terminated_length": 528.30078125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.09164797514677048, + "epoch": 0.9398230088495575, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5307769550601885, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 283613819.0, + "reward": 0.8188437819480896, + "reward_std": 0.06766097992658615, + "rewards/qatch_small_update_with_fm/mean": 0.8188437819480896, + "rewards/qatch_small_update_with_fm/std": 0.3401625156402588, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9893657565116882, + "sampling/importance_sampling_ratio/min": 0.0008690520189702511, + "sampling/sampling_logp_difference/max": 7.048107624053955, + "sampling/sampling_logp_difference/mean": 0.09450694918632507, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3533.0, + "completions/max_terminated_length": 3533.0, + "completions/mean_length": 565.26171875, + "completions/mean_terminated_length": 565.26171875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.08598773088306189, + "epoch": 0.9415929203539823, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.38821236574145923, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 284038894.0, + "reward": 0.7724023461341858, + "reward_std": 0.042911797761917114, + "rewards/qatch_small_update_with_fm/mean": 0.7724023461341858, + "rewards/qatch_small_update_with_fm/std": 0.337915301322937, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913812875747681, + "sampling/importance_sampling_ratio/min": 0.004855873994529247, + "sampling/sampling_logp_difference/max": 5.327566146850586, + "sampling/sampling_logp_difference/mean": 0.0843484178185463, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3197.0, + "completions/max_terminated_length": 3197.0, + "completions/mean_length": 405.58984375, + "completions/mean_terminated_length": 405.58984375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.0662353434599936, + "epoch": 0.9433628318584071, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6172104659359773, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 284502357.0, + "reward": 0.7266562581062317, + "reward_std": 0.07175854593515396, + "rewards/qatch_small_update_with_fm/mean": 0.7266561985015869, + "rewards/qatch_small_update_with_fm/std": 0.40244054794311523, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914896488189697, + "sampling/importance_sampling_ratio/min": 0.0015248889103531837, + "sampling/sampling_logp_difference/max": 6.485833644866943, + "sampling/sampling_logp_difference/mean": 0.07165195047855377, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2190.0, + "completions/max_terminated_length": 2190.0, + "completions/mean_length": 402.88671875, + "completions/mean_terminated_length": 402.88671875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.06576312193647027, + "epoch": 0.9451327433628318, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.41788947304605045, + "learning_rate": 1e-06, + "loss": -0.0167, + "num_tokens": 284956248.0, + "reward": 0.8745741844177246, + "reward_std": 0.0049965716898441315, + "rewards/qatch_small_update_with_fm/mean": 0.8745741844177246, + "rewards/qatch_small_update_with_fm/std": 0.2458096295595169, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9895414113998413, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.07585783302783966, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3813.0, + "completions/max_terminated_length": 3813.0, + "completions/mean_length": 575.8125, + "completions/mean_terminated_length": 575.8125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.07056690799072385, + "epoch": 0.9469026548672567, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7378731304713649, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 285410296.0, + "reward": 0.7944101691246033, + "reward_std": 0.06454658508300781, + "rewards/qatch_small_update_with_fm/mean": 0.7944101691246033, + "rewards/qatch_small_update_with_fm/std": 0.34657639265060425, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912918210029602, + "sampling/importance_sampling_ratio/min": 0.004212359432131052, + "sampling/sampling_logp_difference/max": 5.469732284545898, + "sampling/sampling_logp_difference/mean": 0.07724401354789734, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2376.0, + "completions/max_terminated_length": 2376.0, + "completions/mean_length": 515.69921875, + "completions/mean_terminated_length": 515.69921875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.08022209769114852, + "epoch": 0.9486725663716814, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5696256068461455, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 286059707.0, + "reward": 0.7645195126533508, + "reward_std": 0.06664008647203445, + "rewards/qatch_small_update_with_fm/mean": 0.7645195126533508, + "rewards/qatch_small_update_with_fm/std": 0.33420178294181824, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892451763153076, + "sampling/importance_sampling_ratio/min": 0.0020269453525543213, + "sampling/sampling_logp_difference/max": 6.201225280761719, + "sampling/sampling_logp_difference/mean": 0.08961813151836395, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2371.0, + "completions/mean_length": 601.76171875, + "completions/mean_terminated_length": 588.058837890625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.0777341304346919, + "epoch": 0.9504424778761061, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5403558957446779, + "learning_rate": 1e-06, + "loss": 0.039, + "num_tokens": 286713822.0, + "reward": 0.8262421488761902, + "reward_std": 0.10278799384832382, + "rewards/qatch_small_update_with_fm/mean": 0.8262421488761902, + "rewards/qatch_small_update_with_fm/std": 0.32409027218818665, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906113147735596, + "sampling/importance_sampling_ratio/min": 0.000726877769920975, + "sampling/sampling_logp_difference/max": 7.226752281188965, + "sampling/sampling_logp_difference/mean": 0.08289214968681335, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2920.0, + "completions/max_terminated_length": 2920.0, + "completions/mean_length": 534.79296875, + "completions/mean_terminated_length": 534.79296875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.07284298213198781, + "epoch": 0.952212389380531, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.407306536990088, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 287406777.0, + "reward": 0.7659531235694885, + "reward_std": 0.03347664326429367, + "rewards/qatch_small_update_with_fm/mean": 0.7659531235694885, + "rewards/qatch_small_update_with_fm/std": 0.31723061203956604, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882594347000122, + "sampling/importance_sampling_ratio/min": 4.673008646705057e-08, + "sampling/sampling_logp_difference/max": 16.878877639770508, + "sampling/sampling_logp_difference/mean": 0.08553198724985123, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3659.0, + "completions/max_terminated_length": 3659.0, + "completions/mean_length": 560.9609375, + "completions/mean_terminated_length": 560.9609375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.063908651471138, + "epoch": 0.9539823008849557, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.49161049554807584, + "learning_rate": 1e-06, + "loss": 0.0278, + "num_tokens": 287984415.0, + "reward": 0.7458593845367432, + "reward_std": 0.035589832812547684, + "rewards/qatch_small_update_with_fm/mean": 0.7458593845367432, + "rewards/qatch_small_update_with_fm/std": 0.35967662930488586, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9910004138946533, + "sampling/importance_sampling_ratio/min": 2.653115188877564e-06, + "sampling/sampling_logp_difference/max": 12.839776039123535, + "sampling/sampling_logp_difference/mean": 0.07102929055690765, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3777.0, + "completions/mean_length": 657.8671875, + "completions/mean_terminated_length": 644.3843383789062, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.07635938376188278, + "epoch": 0.9557522123893806, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5452335209582936, + "learning_rate": 1e-06, + "loss": -0.0607, + "num_tokens": 288796429.0, + "reward": 0.5647109746932983, + "reward_std": 0.06827317178249359, + "rewards/qatch_small_update_with_fm/mean": 0.5647109746932983, + "rewards/qatch_small_update_with_fm/std": 0.38985252380371094, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927794933319092, + "sampling/importance_sampling_ratio/min": 0.00033576894202269614, + "sampling/sampling_logp_difference/max": 7.999087333679199, + "sampling/sampling_logp_difference/mean": 0.07802629470825195, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2996.0, + "completions/max_terminated_length": 2996.0, + "completions/mean_length": 544.43359375, + "completions/mean_terminated_length": 544.43359375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.06596293672919273, + "epoch": 0.9575221238938053, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.27206970564743294, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 289291676.0, + "reward": 0.7545312643051147, + "reward_std": 0.015625, + "rewards/qatch_small_update_with_fm/mean": 0.7545312643051147, + "rewards/qatch_small_update_with_fm/std": 0.37506264448165894, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9873992204666138, + "sampling/importance_sampling_ratio/min": 0.0026083062402904034, + "sampling/sampling_logp_difference/max": 5.94905424118042, + "sampling/sampling_logp_difference/mean": 0.08176256716251373, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3901.0, + "completions/mean_length": 934.8359375, + "completions/mean_terminated_length": 922.4392700195312, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.1001374926418066, + "epoch": 0.95929203539823, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.559960616971841, + "learning_rate": 1e-06, + "loss": -0.0294, + "num_tokens": 289976722.0, + "reward": 0.6559218764305115, + "reward_std": 0.08523955941200256, + "rewards/qatch_small_update_with_fm/mean": 0.6559218168258667, + "rewards/qatch_small_update_with_fm/std": 0.3872259855270386, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9958049654960632, + "sampling/importance_sampling_ratio/min": 0.0012593659339472651, + "sampling/sampling_logp_difference/max": 6.677146911621094, + "sampling/sampling_logp_difference/mean": 0.09016713500022888, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2828.0, + "completions/max_terminated_length": 2828.0, + "completions/mean_length": 498.71484375, + "completions/mean_terminated_length": 498.71484375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.0649999501183629, + "epoch": 0.9610619469026549, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.47685171312494107, + "learning_rate": 1e-06, + "loss": -0.0217, + "num_tokens": 290494377.0, + "reward": 0.745410144329071, + "reward_std": 0.04396291822195053, + "rewards/qatch_small_update_with_fm/mean": 0.745410144329071, + "rewards/qatch_small_update_with_fm/std": 0.3141641318798065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892974495887756, + "sampling/importance_sampling_ratio/min": 0.0004503573873080313, + "sampling/sampling_logp_difference/max": 7.705469131469727, + "sampling/sampling_logp_difference/mean": 0.07702207565307617, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4061.0, + "completions/mean_length": 1027.33984375, + "completions/mean_terminated_length": 990.95263671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.09254028089344501, + "epoch": 0.9628318584070796, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5142745856968639, + "learning_rate": 1e-06, + "loss": -0.0922, + "num_tokens": 291223952.0, + "reward": 0.657976508140564, + "reward_std": 0.1279950886964798, + "rewards/qatch_small_update_with_fm/mean": 0.657976508140564, + "rewards/qatch_small_update_with_fm/std": 0.3692423105239868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9931938648223877, + "sampling/importance_sampling_ratio/min": 0.0009468301432207227, + "sampling/sampling_logp_difference/max": 6.962390899658203, + "sampling/sampling_logp_difference/mean": 0.0876588374376297, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2998.0, + "completions/max_terminated_length": 2998.0, + "completions/mean_length": 573.875, + "completions/mean_terminated_length": 573.875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.06328198779374361, + "epoch": 0.9646017699115044, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.41556654472228777, + "learning_rate": 1e-06, + "loss": -0.0308, + "num_tokens": 291649248.0, + "reward": 0.7210820317268372, + "reward_std": 0.022427227348089218, + "rewards/qatch_small_update_with_fm/mean": 0.7210820317268372, + "rewards/qatch_small_update_with_fm/std": 0.3797529935836792, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914613962173462, + "sampling/importance_sampling_ratio/min": 0.0003364813746884465, + "sampling/sampling_logp_difference/max": 7.996967792510986, + "sampling/sampling_logp_difference/mean": 0.0714886337518692, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3946.0, + "completions/mean_length": 674.890625, + "completions/mean_terminated_length": 647.9527587890625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.07149964012205601, + "epoch": 0.9663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4074940853826009, + "learning_rate": 1e-06, + "loss": -0.0965, + "num_tokens": 292226948.0, + "reward": 0.7326640486717224, + "reward_std": 0.05338604003190994, + "rewards/qatch_small_update_with_fm/mean": 0.7326640486717224, + "rewards/qatch_small_update_with_fm/std": 0.35138407349586487, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9919158220291138, + "sampling/importance_sampling_ratio/min": 0.0015048105269670486, + "sampling/sampling_logp_difference/max": 6.499088287353516, + "sampling/sampling_logp_difference/mean": 0.07774293422698975, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4069.0, + "completions/mean_length": 783.1640625, + "completions/mean_terminated_length": 770.172607421875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.07332196040078998, + "epoch": 0.968141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2833815598679686, + "learning_rate": 1e-06, + "loss": -0.028, + "num_tokens": 292919086.0, + "reward": 0.6988593339920044, + "reward_std": 0.03941141813993454, + "rewards/qatch_small_update_with_fm/mean": 0.6988593339920044, + "rewards/qatch_small_update_with_fm/std": 0.346405953168869, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990686297416687, + "sampling/importance_sampling_ratio/min": 9.723559196572751e-05, + "sampling/sampling_logp_difference/max": 9.238373756408691, + "sampling/sampling_logp_difference/mean": 0.08024582266807556, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3540.0, + "completions/mean_length": 1048.33203125, + "completions/mean_terminated_length": 1024.3345947265625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.10439629666507244, + "epoch": 0.9699115044247788, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.38819391093077565, + "learning_rate": 1e-06, + "loss": -0.0542, + "num_tokens": 293700723.0, + "reward": 0.7764492034912109, + "reward_std": 0.057538390159606934, + "rewards/qatch_small_update_with_fm/mean": 0.7764492630958557, + "rewards/qatch_small_update_with_fm/std": 0.35874632000923157, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915471076965332, + "sampling/importance_sampling_ratio/min": 0.0003894827968906611, + "sampling/sampling_logp_difference/max": 7.850690841674805, + "sampling/sampling_logp_difference/mean": 0.10121684521436691, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3337.0, + "completions/max_terminated_length": 3337.0, + "completions/mean_length": 514.9296875, + "completions/mean_terminated_length": 514.9296875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.06488791201263666, + "epoch": 0.9716814159292035, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.49346829936262865, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 294261521.0, + "reward": 0.8128906488418579, + "reward_std": 0.05527229979634285, + "rewards/qatch_small_update_with_fm/mean": 0.8128906488418579, + "rewards/qatch_small_update_with_fm/std": 0.3675207793712616, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900767207145691, + "sampling/importance_sampling_ratio/min": 0.002415587892755866, + "sampling/sampling_logp_difference/max": 6.02581262588501, + "sampling/sampling_logp_difference/mean": 0.07517706602811813, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3724.0, + "completions/max_terminated_length": 3724.0, + "completions/mean_length": 569.7109375, + "completions/mean_terminated_length": 569.7109375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.0651154387742281, + "epoch": 0.9734513274336283, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5618206950409255, + "learning_rate": 1e-06, + "loss": -0.0255, + "num_tokens": 294822679.0, + "reward": 0.7361406087875366, + "reward_std": 0.06598907709121704, + "rewards/qatch_small_update_with_fm/mean": 0.7361406087875366, + "rewards/qatch_small_update_with_fm/std": 0.3569088876247406, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9903668165206909, + "sampling/importance_sampling_ratio/min": 0.001626877929084003, + "sampling/sampling_logp_difference/max": 6.421092510223389, + "sampling/sampling_logp_difference/mean": 0.07287141680717468, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2719.0, + "completions/max_terminated_length": 2719.0, + "completions/mean_length": 693.64453125, + "completions/mean_terminated_length": 693.64453125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.07973913569003344, + "epoch": 0.9752212389380531, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4333880222785954, + "learning_rate": 1e-06, + "loss": 0.0287, + "num_tokens": 295314780.0, + "reward": 0.6914101839065552, + "reward_std": 0.0544731467962265, + "rewards/qatch_small_update_with_fm/mean": 0.6914101839065552, + "rewards/qatch_small_update_with_fm/std": 0.35723307728767395, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877251982688904, + "sampling/importance_sampling_ratio/min": 0.005544170271605253, + "sampling/sampling_logp_difference/max": 5.195008277893066, + "sampling/sampling_logp_difference/mean": 0.08873097598552704, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2087.0, + "completions/max_terminated_length": 2087.0, + "completions/mean_length": 467.06640625, + "completions/mean_terminated_length": 467.06640625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.05367149272933602, + "epoch": 0.9769911504424779, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9635481879647013, + "learning_rate": 1e-06, + "loss": 0.0675, + "num_tokens": 295878157.0, + "reward": 0.7614219188690186, + "reward_std": 0.07371485978364944, + "rewards/qatch_small_update_with_fm/mean": 0.7614219188690186, + "rewards/qatch_small_update_with_fm/std": 0.3245559334754944, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889311790466309, + "sampling/importance_sampling_ratio/min": 0.0019758539274334908, + "sampling/sampling_logp_difference/max": 6.226754665374756, + "sampling/sampling_logp_difference/mean": 0.06883668154478073, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3886.0, + "completions/mean_length": 799.53515625, + "completions/mean_terminated_length": 773.5787353515625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.09158620424568653, + "epoch": 0.9787610619469026, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5187698393124252, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 296637702.0, + "reward": 0.7982265949249268, + "reward_std": 0.05513294041156769, + "rewards/qatch_small_update_with_fm/mean": 0.7982265949249268, + "rewards/qatch_small_update_with_fm/std": 0.33354005217552185, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991692304611206, + "sampling/importance_sampling_ratio/min": 0.0009394215303473175, + "sampling/sampling_logp_difference/max": 6.970246315002441, + "sampling/sampling_logp_difference/mean": 0.09343910217285156, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1973.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 656.17578125, + "completions/mean_terminated_length": 656.17578125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.07529762573540211, + "epoch": 0.9805309734513274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.46503478854107794, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 297062307.0, + "reward": 0.8423788547515869, + "reward_std": 0.04574102163314819, + "rewards/qatch_small_update_with_fm/mean": 0.8423788547515869, + "rewards/qatch_small_update_with_fm/std": 0.272524356842041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9878612160682678, + "sampling/importance_sampling_ratio/min": 0.005275082774460316, + "sampling/sampling_logp_difference/max": 5.244760990142822, + "sampling/sampling_logp_difference/mean": 0.08667662739753723, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3507.0, + "completions/max_terminated_length": 3507.0, + "completions/mean_length": 720.953125, + "completions/mean_terminated_length": 720.953125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.0724447788670659, + "epoch": 0.9823008849557522, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.47734244313632035, + "learning_rate": 1e-06, + "loss": 0.0308, + "num_tokens": 297617959.0, + "reward": 0.611578106880188, + "reward_std": 0.08688230812549591, + "rewards/qatch_small_update_with_fm/mean": 0.611578106880188, + "rewards/qatch_small_update_with_fm/std": 0.3563700318336487, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988653302192688, + "sampling/importance_sampling_ratio/min": 0.00012823805445805192, + "sampling/sampling_logp_difference/max": 8.96162223815918, + "sampling/sampling_logp_difference/mean": 0.08443759381771088, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3472.0, + "completions/mean_length": 762.35546875, + "completions/mean_terminated_length": 722.8261108398438, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.07419738546013832, + "epoch": 0.984070796460177, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5607787784340242, + "learning_rate": 1e-06, + "loss": -0.0576, + "num_tokens": 298193794.0, + "reward": 0.729687511920929, + "reward_std": 0.14253780245780945, + "rewards/qatch_small_update_with_fm/mean": 0.729687511920929, + "rewards/qatch_small_update_with_fm/std": 0.3643515110015869, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896014928817749, + "sampling/importance_sampling_ratio/min": 0.0019406072096899152, + "sampling/sampling_logp_difference/max": 6.244754314422607, + "sampling/sampling_logp_difference/mean": 0.08009640872478485, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 517.08984375, + "completions/mean_terminated_length": 517.08984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.06529176607728004, + "epoch": 0.9858407079646018, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5883079496784032, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 298870505.0, + "reward": 0.7845039367675781, + "reward_std": 0.037567272782325745, + "rewards/qatch_small_update_with_fm/mean": 0.7845039367675781, + "rewards/qatch_small_update_with_fm/std": 0.36427584290504456, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9908521771430969, + "sampling/importance_sampling_ratio/min": 0.0007565406849607825, + "sampling/sampling_logp_difference/max": 7.18675422668457, + "sampling/sampling_logp_difference/mean": 0.07576609402894974, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3902.0, + "completions/mean_length": 881.76171875, + "completions/mean_terminated_length": 804.6200561523438, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.0811238419264555, + "epoch": 0.9876106194690265, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5896053283682656, + "learning_rate": 1e-06, + "loss": -0.1348, + "num_tokens": 299473212.0, + "reward": 0.741335928440094, + "reward_std": 0.16152629256248474, + "rewards/qatch_small_update_with_fm/mean": 0.741335928440094, + "rewards/qatch_small_update_with_fm/std": 0.37875017523765564, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9893544912338257, + "sampling/importance_sampling_ratio/min": 0.0012162798084318638, + "sampling/sampling_logp_difference/max": 6.711958408355713, + "sampling/sampling_logp_difference/mean": 0.08723506331443787, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2388.0, + "completions/max_terminated_length": 2388.0, + "completions/mean_length": 556.34375, + "completions/mean_terminated_length": 556.34375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.07279634848237038, + "epoch": 0.9893805309734514, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6346360887575442, + "learning_rate": 1e-06, + "loss": 0.0392, + "num_tokens": 299892036.0, + "reward": 0.8744804859161377, + "reward_std": 0.07507223635911942, + "rewards/qatch_small_update_with_fm/mean": 0.8744804859161377, + "rewards/qatch_small_update_with_fm/std": 0.2712794840335846, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9875009059906006, + "sampling/importance_sampling_ratio/min": 0.0007305411854758859, + "sampling/sampling_logp_difference/max": 7.221724987030029, + "sampling/sampling_logp_difference/mean": 0.08305776119232178, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3367.0, + "completions/max_terminated_length": 3367.0, + "completions/mean_length": 1021.33984375, + "completions/mean_terminated_length": 1021.33984375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.09527640696614981, + "epoch": 0.9911504424778761, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.768685757865519, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 300635739.0, + "reward": 0.6343789100646973, + "reward_std": 0.13254612684249878, + "rewards/qatch_small_update_with_fm/mean": 0.6343789100646973, + "rewards/qatch_small_update_with_fm/std": 0.3772452473640442, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9884824156761169, + "sampling/importance_sampling_ratio/min": 1.0130537702934816e-05, + "sampling/sampling_logp_difference/max": 11.499956130981445, + "sampling/sampling_logp_difference/mean": 0.09710503369569778, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3505.0, + "completions/max_terminated_length": 3505.0, + "completions/mean_length": 566.98046875, + "completions/mean_terminated_length": 566.98046875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.09003073815256357, + "epoch": 0.9929203539823008, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.665274763420313, + "learning_rate": 1e-06, + "loss": -0.047, + "num_tokens": 300981334.0, + "reward": 0.8392460942268372, + "reward_std": 0.05475308746099472, + "rewards/qatch_small_update_with_fm/mean": 0.8392460942268372, + "rewards/qatch_small_update_with_fm/std": 0.3062317967414856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896265268325806, + "sampling/importance_sampling_ratio/min": 4.6086139349199584e-08, + "sampling/sampling_logp_difference/max": 16.89275360107422, + "sampling/sampling_logp_difference/mean": 0.09289674460887909, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3772.0, + "completions/max_terminated_length": 3772.0, + "completions/mean_length": 804.1796875, + "completions/mean_terminated_length": 804.1796875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.10023855697363615, + "epoch": 0.9946902654867257, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5995308303865646, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 301536276.0, + "reward": 0.7380585670471191, + "reward_std": 0.0959700495004654, + "rewards/qatch_small_update_with_fm/mean": 0.7380585670471191, + "rewards/qatch_small_update_with_fm/std": 0.34974250197410583, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9933926463127136, + "sampling/importance_sampling_ratio/min": 0.00023100488760974258, + "sampling/sampling_logp_difference/max": 8.373071670532227, + "sampling/sampling_logp_difference/mean": 0.0933670848608017, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1935.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 606.3984375, + "completions/mean_terminated_length": 606.3984375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.07056841254234314, + "epoch": 0.9964601769911504, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6327470025212618, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 301984378.0, + "reward": 0.7754648327827454, + "reward_std": 0.12288480997085571, + "rewards/qatch_small_update_with_fm/mean": 0.7754648327827454, + "rewards/qatch_small_update_with_fm/std": 0.3759419918060303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867947101593018, + "sampling/importance_sampling_ratio/min": 8.739297072679619e-07, + "sampling/sampling_logp_difference/max": 13.950265884399414, + "sampling/sampling_logp_difference/mean": 0.08524052798748016, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3921.0, + "completions/mean_length": 824.3203125, + "completions/mean_terminated_length": 772.388916015625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.08141087833791971, + "epoch": 0.9982300884955753, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4522442359272215, + "learning_rate": 1e-06, + "loss": -0.0234, + "num_tokens": 302708316.0, + "reward": 0.6576171517372131, + "reward_std": 0.08948611468076706, + "rewards/qatch_small_update_with_fm/mean": 0.6576172113418579, + "rewards/qatch_small_update_with_fm/std": 0.413068950176239, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990230917930603, + "sampling/importance_sampling_ratio/min": 3.224205258334223e-08, + "sampling/sampling_logp_difference/max": 17.2499942779541, + "sampling/sampling_logp_difference/mean": 0.085847407579422, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3438.0, + "completions/max_terminated_length": 3438.0, + "completions/mean_length": 632.109375, + "completions/mean_terminated_length": 632.109375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.07970152795314789, + "epoch": 1.0, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4195401770571529, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 303171576.0, + "reward": 0.8823828101158142, + "reward_std": 0.05790891870856285, + "rewards/qatch_small_update_with_fm/mean": 0.8823828101158142, + "rewards/qatch_small_update_with_fm/std": 0.2736479640007019, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889679551124573, + "sampling/importance_sampling_ratio/min": 0.00027138934819959104, + "sampling/sampling_logp_difference/max": 8.211956024169922, + "sampling/sampling_logp_difference/mean": 0.08676078915596008, + "step": 565 + }, + { + "epoch": 1.0, + "step": 565, + "total_flos": 0.0, + "train_loss": -0.0013111333199982754, + "train_runtime": 50378.8497, + "train_samples_per_second": 0.18, + "train_steps_per_second": 0.011 + } + ], + "logging_steps": 1, + "max_steps": 565, + "num_input_tokens_seen": 303171576, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}