| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.0009000360014400576, |
| "eval_steps": 500, |
| "global_step": 15, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2393.0, |
| "completions/max_terminated_length": 2393.0, |
| "completions/mean_length": 2078.5, |
| "completions/mean_terminated_length": 2086.78271484375, |
| "completions/min_length": 694.0, |
| "completions/min_terminated_length": 694.0, |
| "entropy": 0.33678532888491947, |
| "epoch": 6.000240009600384e-05, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 2.966360330581665, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.083, |
| "num_tokens": 70871.0, |
| "reward": 0.01833334192633629, |
| "reward_std": 0.30440181493759155, |
| "rewards/rollout_reward_func/mean": 0.018333343788981438, |
| "rewards/rollout_reward_func/std": 0.5022702217102051, |
| "sampling/importance_sampling_ratio/max": 1.498870849609375, |
| "sampling/importance_sampling_ratio/mean": 0.9928643107414246, |
| "sampling/importance_sampling_ratio/min": 1.5878707237959588e-09, |
| "sampling/sampling_logp_difference/max": 21.133813858032227, |
| "sampling/sampling_logp_difference/mean": 0.04281112551689148, |
| "step": 1, |
| "step_time": 15.020983219003028 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2388.0, |
| "completions/max_terminated_length": 2388.0, |
| "completions/mean_length": 2032.4583740234375, |
| "completions/mean_terminated_length": 2088.0, |
| "completions/min_length": 755.0, |
| "completions/min_terminated_length": 1434.0, |
| "entropy": 0.33894892781972885, |
| "epoch": 0.00012000480019200768, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.8514909744262695, |
| "kl": 0.0, |
| "learning_rate": 8.571428571428571e-07, |
| "loss": -0.0637, |
| "num_tokens": 140234.0, |
| "reward": 0.01833333633840084, |
| "reward_std": 0.3617991805076599, |
| "rewards/rollout_reward_func/mean": 0.01833333633840084, |
| "rewards/rollout_reward_func/std": 0.41583511233329773, |
| "sampling/importance_sampling_ratio/max": 1.5429106950759888, |
| "sampling/importance_sampling_ratio/mean": 0.9241917133331299, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 7.0373640060424805, |
| "sampling/sampling_logp_difference/mean": 0.03344937413930893, |
| "step": 2, |
| "step_time": 13.70249634799984 |
| }, |
| { |
| "clip_ratio/high_max": 0.002923976629972458, |
| "clip_ratio/high_mean": 0.002923976629972458, |
| "clip_ratio/low_mean": 0.001366120142241319, |
| "clip_ratio/low_min": 0.001366120142241319, |
| "clip_ratio/region_mean": 0.004290096772213777, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 2396.0, |
| "completions/max_terminated_length": 2396.0, |
| "completions/mean_length": 2183.791748046875, |
| "completions/mean_terminated_length": 2183.791748046875, |
| "completions/min_length": 676.0, |
| "completions/min_terminated_length": 676.0, |
| "entropy": 0.3009852096438408, |
| "epoch": 0.00018000720028801153, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 2.9021389484405518, |
| "kl": 0.0008198164481048783, |
| "learning_rate": 1.7142857142857143e-06, |
| "loss": 0.0103, |
| "num_tokens": 213894.0, |
| "reward": 0.2187500298023224, |
| "reward_std": 0.16249999403953552, |
| "rewards/rollout_reward_func/mean": 0.2187500149011612, |
| "rewards/rollout_reward_func/std": 0.27593812346458435, |
| "sampling/importance_sampling_ratio/max": 1.6837977170944214, |
| "sampling/importance_sampling_ratio/mean": 1.0721747875213623, |
| "sampling/importance_sampling_ratio/min": 0.5105805397033691, |
| "sampling/sampling_logp_difference/max": 0.7568278312683105, |
| "sampling/sampling_logp_difference/mean": 0.017255382612347603, |
| "step": 3, |
| "step_time": 13.48628649100101 |
| }, |
| { |
| "clip_ratio/high_max": 0.002923976629972458, |
| "clip_ratio/high_mean": 0.002923976629972458, |
| "clip_ratio/low_mean": 0.0010162601247429848, |
| "clip_ratio/low_min": 0.0010162601247429848, |
| "clip_ratio/region_mean": 0.003940236754715443, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2407.0, |
| "completions/max_terminated_length": 2407.0, |
| "completions/mean_length": 2167.5, |
| "completions/mean_terminated_length": 2163.95654296875, |
| "completions/min_length": 751.0, |
| "completions/min_terminated_length": 751.0, |
| "entropy": 0.26422637701034546, |
| "epoch": 0.00024000960038401536, |
| "frac_reward_zero_std": 0.1666666716337204, |
| "grad_norm": 3.117863178253174, |
| "kl": 0.0008683214109623805, |
| "learning_rate": 2.5714285714285716e-06, |
| "loss": 0.0319, |
| "num_tokens": 286603.0, |
| "reward": 0.25833338499069214, |
| "reward_std": 0.17131535708904266, |
| "rewards/rollout_reward_func/mean": 0.25833335518836975, |
| "rewards/rollout_reward_func/std": 0.2583167254924774, |
| "sampling/importance_sampling_ratio/max": 1.4256991147994995, |
| "sampling/importance_sampling_ratio/mean": 0.969955325126648, |
| "sampling/importance_sampling_ratio/min": 0.5823519825935364, |
| "sampling/sampling_logp_difference/max": 0.329906702041626, |
| "sampling/sampling_logp_difference/mean": 0.015135754831135273, |
| "step": 4, |
| "step_time": 12.97427468299793 |
| }, |
| { |
| "clip_ratio/high_max": 0.0011415525029102962, |
| "clip_ratio/high_mean": 0.0011415525029102962, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0011415525029102962, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2393.0, |
| "completions/max_terminated_length": 2393.0, |
| "completions/mean_length": 1980.166748046875, |
| "completions/mean_terminated_length": 1983.9130859375, |
| "completions/min_length": 667.0, |
| "completions/min_terminated_length": 667.0, |
| "entropy": 0.2581128428379695, |
| "epoch": 0.0003000120004800192, |
| "frac_reward_zero_std": 0.3333333432674408, |
| "grad_norm": 2.8420257568359375, |
| "kl": 0.0006481069843478812, |
| "learning_rate": 3.4285714285714285e-06, |
| "loss": 0.0613, |
| "num_tokens": 355097.0, |
| "reward": 0.04541667923331261, |
| "reward_std": 0.38386815786361694, |
| "rewards/rollout_reward_func/mean": 0.04541667178273201, |
| "rewards/rollout_reward_func/std": 0.4871387481689453, |
| "sampling/importance_sampling_ratio/max": 1.2240864038467407, |
| "sampling/importance_sampling_ratio/mean": 0.9118739366531372, |
| "sampling/importance_sampling_ratio/min": 2.346623517723856e-08, |
| "sampling/sampling_logp_difference/max": 9.750813484191895, |
| "sampling/sampling_logp_difference/mean": 0.05021395534276962, |
| "step": 5, |
| "step_time": 13.251750441997501 |
| }, |
| { |
| "clip_ratio/high_max": 0.0021307161853959164, |
| "clip_ratio/high_mean": 0.0021307161853959164, |
| "clip_ratio/low_mean": 0.0034020394862939916, |
| "clip_ratio/low_min": 0.0034020394862939916, |
| "clip_ratio/region_mean": 0.005532755671689908, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2471.0, |
| "completions/max_terminated_length": 2396.0, |
| "completions/mean_length": 1981.5833740234375, |
| "completions/mean_terminated_length": 1960.304443359375, |
| "completions/min_length": 669.0, |
| "completions/min_terminated_length": 669.0, |
| "entropy": 0.388533021012942, |
| "epoch": 0.00036001440057602306, |
| "frac_reward_zero_std": 0.1666666716337204, |
| "grad_norm": 3.518169403076172, |
| "kl": 0.0009793323115445673, |
| "learning_rate": 4.2857142857142855e-06, |
| "loss": -0.1467, |
| "num_tokens": 423828.0, |
| "reward": 0.06583334505558014, |
| "reward_std": 0.41378408670425415, |
| "rewards/rollout_reward_func/mean": 0.06583333760499954, |
| "rewards/rollout_reward_func/std": 0.450168251991272, |
| "sampling/importance_sampling_ratio/max": 1.502521276473999, |
| "sampling/importance_sampling_ratio/mean": 1.0047988891601562, |
| "sampling/importance_sampling_ratio/min": 0.00031536107417196035, |
| "sampling/sampling_logp_difference/max": 6.730105876922607, |
| "sampling/sampling_logp_difference/mean": 0.027479952201247215, |
| "step": 6, |
| "step_time": 13.971570022000378 |
| }, |
| { |
| "clip_ratio/high_max": 0.0014492752961814404, |
| "clip_ratio/high_mean": 0.0014492752961814404, |
| "clip_ratio/low_mean": 0.0009009009227156639, |
| "clip_ratio/low_min": 0.0009009009227156639, |
| "clip_ratio/region_mean": 0.0023501762188971043, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2321.0, |
| "completions/max_terminated_length": 2321.0, |
| "completions/mean_length": 2112.416748046875, |
| "completions/mean_terminated_length": 2113.652099609375, |
| "completions/min_length": 1386.0, |
| "completions/min_terminated_length": 1386.0, |
| "entropy": 0.2739458481470744, |
| "epoch": 0.0004200168006720269, |
| "frac_reward_zero_std": 0.3333333432674408, |
| "grad_norm": 2.717536449432373, |
| "kl": 0.0010390299139544368, |
| "learning_rate": 5.142857142857143e-06, |
| "loss": -0.1854, |
| "num_tokens": 495731.0, |
| "reward": 0.15916667878627777, |
| "reward_std": 0.1548699289560318, |
| "rewards/rollout_reward_func/mean": 0.15916667878627777, |
| "rewards/rollout_reward_func/std": 0.32958361506462097, |
| "sampling/importance_sampling_ratio/max": 1.5678151845932007, |
| "sampling/importance_sampling_ratio/mean": 0.9848524928092957, |
| "sampling/importance_sampling_ratio/min": 2.720875045270077e-06, |
| "sampling/sampling_logp_difference/max": 9.624576568603516, |
| "sampling/sampling_logp_difference/mean": 0.03278766945004463, |
| "step": 7, |
| "step_time": 12.972795774996484 |
| }, |
| { |
| "clip_ratio/high_max": 0.0037208329886198044, |
| "clip_ratio/high_mean": 0.0037208329886198044, |
| "clip_ratio/low_mean": 0.0018710837854693334, |
| "clip_ratio/low_min": 0.0018710837854693334, |
| "clip_ratio/region_mean": 0.005591916696478923, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2392.0, |
| "completions/max_terminated_length": 2392.0, |
| "completions/mean_length": 2101.416748046875, |
| "completions/mean_terminated_length": 2181.381103515625, |
| "completions/min_length": 811.0, |
| "completions/min_terminated_length": 1519.0, |
| "entropy": 0.38193629682064056, |
| "epoch": 0.0004800192007680307, |
| "frac_reward_zero_std": 0.1666666716337204, |
| "grad_norm": 4.98358678817749, |
| "kl": 0.001539110904559493, |
| "learning_rate": 6e-06, |
| "loss": 0.0265, |
| "num_tokens": 567094.0, |
| "reward": 0.01291667204350233, |
| "reward_std": 0.4608933925628662, |
| "rewards/rollout_reward_func/mean": 0.01291667204350233, |
| "rewards/rollout_reward_func/std": 0.49684640765190125, |
| "sampling/importance_sampling_ratio/max": 1.5221177339553833, |
| "sampling/importance_sampling_ratio/mean": 0.9741430282592773, |
| "sampling/importance_sampling_ratio/min": 0.37359893321990967, |
| "sampling/sampling_logp_difference/max": 0.40885448455810547, |
| "sampling/sampling_logp_difference/mean": 0.01908516138792038, |
| "step": 8, |
| "step_time": 13.575062246001835 |
| }, |
| { |
| "clip_ratio/high_max": 0.0010482179932296276, |
| "clip_ratio/high_mean": 0.0010482179932296276, |
| "clip_ratio/low_mean": 0.0026881719628969827, |
| "clip_ratio/low_min": 0.0026881719628969827, |
| "clip_ratio/region_mean": 0.0037363899561266103, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 2403.0, |
| "completions/max_terminated_length": 2403.0, |
| "completions/mean_length": 2160.666748046875, |
| "completions/mean_terminated_length": 2160.666748046875, |
| "completions/min_length": 854.0, |
| "completions/min_terminated_length": 854.0, |
| "entropy": 0.2373982494076093, |
| "epoch": 0.0005400216008640345, |
| "frac_reward_zero_std": 0.3333333432674408, |
| "grad_norm": 4.026190280914307, |
| "kl": 0.001170102283746625, |
| "learning_rate": 6.857142857142857e-06, |
| "loss": 0.2258, |
| "num_tokens": 639900.0, |
| "reward": 0.2250000238418579, |
| "reward_std": 0.157368004322052, |
| "rewards/rollout_reward_func/mean": 0.2250000238418579, |
| "rewards/rollout_reward_func/std": 0.26091811060905457, |
| "sampling/importance_sampling_ratio/max": 1.3226174116134644, |
| "sampling/importance_sampling_ratio/mean": 0.9841742515563965, |
| "sampling/importance_sampling_ratio/min": 3.137126449999188e-14, |
| "sampling/sampling_logp_difference/max": 19.257625579833984, |
| "sampling/sampling_logp_difference/mean": 0.0487070232629776, |
| "step": 9, |
| "step_time": 14.116852209996068 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 2421.0, |
| "completions/max_terminated_length": 2421.0, |
| "completions/mean_length": 1925.8333740234375, |
| "completions/mean_terminated_length": 1925.8333740234375, |
| "completions/min_length": 400.0, |
| "completions/min_terminated_length": 400.0, |
| "entropy": 0.2908450166384379, |
| "epoch": 0.0006000240009600384, |
| "frac_reward_zero_std": 0.1666666716337204, |
| "grad_norm": 2.877199172973633, |
| "kl": 0.0020954393645903715, |
| "learning_rate": 7.714285714285714e-06, |
| "loss": 0.1587, |
| "num_tokens": 706860.0, |
| "reward": 0.16597223281860352, |
| "reward_std": 0.26020175218582153, |
| "rewards/rollout_reward_func/mean": 0.16597223281860352, |
| "rewards/rollout_reward_func/std": 0.42211174964904785, |
| "sampling/importance_sampling_ratio/max": 1.2571473121643066, |
| "sampling/importance_sampling_ratio/mean": 0.8514933586120605, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 12.186629295349121, |
| "sampling/sampling_logp_difference/mean": 0.057263195514678955, |
| "step": 10, |
| "step_time": 1903.418363845998 |
| }, |
| { |
| "clip_ratio/high_max": 0.0009633911152680715, |
| "clip_ratio/high_mean": 0.0009633911152680715, |
| "clip_ratio/low_mean": 0.002207505516707897, |
| "clip_ratio/low_min": 0.002207505516707897, |
| "clip_ratio/region_mean": 0.003170896631975969, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 2416.0, |
| "completions/max_terminated_length": 2416.0, |
| "completions/mean_length": 1997.4583740234375, |
| "completions/mean_terminated_length": 1997.4583740234375, |
| "completions/min_length": 650.0, |
| "completions/min_terminated_length": 650.0, |
| "entropy": 0.2779182270169258, |
| "epoch": 0.0006600264010560422, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.155987739562988, |
| "kl": 0.003926942406299834, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": -0.151, |
| "num_tokens": 775448.0, |
| "reward": 0.20642617344856262, |
| "reward_std": 0.413322776556015, |
| "rewards/rollout_reward_func/mean": 0.20642615854740143, |
| "rewards/rollout_reward_func/std": 0.4893973767757416, |
| "sampling/importance_sampling_ratio/max": 1.5803664922714233, |
| "sampling/importance_sampling_ratio/mean": 0.9162782430648804, |
| "sampling/importance_sampling_ratio/min": 1.3544125465614343e-07, |
| "sampling/sampling_logp_difference/max": 10.99795150756836, |
| "sampling/sampling_logp_difference/mean": 0.03617515414953232, |
| "step": 11, |
| "step_time": 1941.150138971001 |
| }, |
| { |
| "clip_ratio/high_max": 0.001883239174882571, |
| "clip_ratio/high_mean": 0.001883239174882571, |
| "clip_ratio/low_mean": 0.0011037527583539486, |
| "clip_ratio/low_min": 0.0011037527583539486, |
| "clip_ratio/region_mean": 0.0029869919332365194, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2480.0, |
| "completions/max_terminated_length": 2480.0, |
| "completions/mean_length": 1994.3333740234375, |
| "completions/mean_terminated_length": 1990.95654296875, |
| "completions/min_length": 741.0, |
| "completions/min_terminated_length": 741.0, |
| "entropy": 0.2965584595998128, |
| "epoch": 0.0007200288011520461, |
| "frac_reward_zero_std": 0.1666666716337204, |
| "grad_norm": 2.91935133934021, |
| "kl": 0.006807463922693084, |
| "learning_rate": 9.428571428571428e-06, |
| "loss": 0.2746, |
| "num_tokens": 843904.0, |
| "reward": 0.19167006015777588, |
| "reward_std": 0.2366676777601242, |
| "rewards/rollout_reward_func/mean": 0.19167006015777588, |
| "rewards/rollout_reward_func/std": 0.3872174620628357, |
| "sampling/importance_sampling_ratio/max": 1.4269827604293823, |
| "sampling/importance_sampling_ratio/mean": 0.9134854078292847, |
| "sampling/importance_sampling_ratio/min": 4.090348326712956e-08, |
| "sampling/sampling_logp_difference/max": 7.874401569366455, |
| "sampling/sampling_logp_difference/mean": 0.035650916397571564, |
| "step": 12, |
| "step_time": 1983.2984636460005 |
| }, |
| { |
| "clip_ratio/high_max": 0.002906976888577143, |
| "clip_ratio/high_mean": 0.002906976888577143, |
| "clip_ratio/low_mean": 0.0014880953046182792, |
| "clip_ratio/low_min": 0.0014880953046182792, |
| "clip_ratio/region_mean": 0.004395072193195422, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2377.0, |
| "completions/max_terminated_length": 2377.0, |
| "completions/mean_length": 1990.75, |
| "completions/mean_terminated_length": 1974.2174072265625, |
| "completions/min_length": 818.0, |
| "completions/min_terminated_length": 818.0, |
| "entropy": 0.24792559693257013, |
| "epoch": 0.00078003120124805, |
| "frac_reward_zero_std": 0.1666666716337204, |
| "grad_norm": 2.9418699741363525, |
| "kl": 0.007263694618207713, |
| "learning_rate": 1.0285714285714286e-05, |
| "loss": 0.3781, |
| "num_tokens": 912501.0, |
| "reward": 0.2585095167160034, |
| "reward_std": 0.2756158113479614, |
| "rewards/rollout_reward_func/mean": 0.25850948691368103, |
| "rewards/rollout_reward_func/std": 0.37159210443496704, |
| "sampling/importance_sampling_ratio/max": 1.6105681657791138, |
| "sampling/importance_sampling_ratio/mean": 0.9647274017333984, |
| "sampling/importance_sampling_ratio/min": 6.621257142569448e-08, |
| "sampling/sampling_logp_difference/max": 10.260540008544922, |
| "sampling/sampling_logp_difference/mean": 0.05481240525841713, |
| "step": 13, |
| "step_time": 2070.256960532004 |
| }, |
| { |
| "clip_ratio/high_max": 0.0006485084304586053, |
| "clip_ratio/high_mean": 0.0006485084304586053, |
| "clip_ratio/low_mean": 0.0014430014416575432, |
| "clip_ratio/low_min": 0.0014430014416575432, |
| "clip_ratio/region_mean": 0.0020915098721161485, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2370.0, |
| "completions/max_terminated_length": 2370.0, |
| "completions/mean_length": 2130.916748046875, |
| "completions/mean_terminated_length": 2126.652099609375, |
| "completions/min_length": 1223.0, |
| "completions/min_terminated_length": 1223.0, |
| "entropy": 0.16453668102622032, |
| "epoch": 0.0008400336013440538, |
| "frac_reward_zero_std": 0.3333333432674408, |
| "grad_norm": 1.9463223218917847, |
| "kl": 0.014017233702664575, |
| "learning_rate": 1.1142857142857143e-05, |
| "loss": -0.1296, |
| "num_tokens": 984804.0, |
| "reward": 0.2805420160293579, |
| "reward_std": 0.17220479249954224, |
| "rewards/rollout_reward_func/mean": 0.2805420160293579, |
| "rewards/rollout_reward_func/std": 0.2652944028377533, |
| "sampling/importance_sampling_ratio/max": 1.4470672607421875, |
| "sampling/importance_sampling_ratio/mean": 0.9239229559898376, |
| "sampling/importance_sampling_ratio/min": 4.1741555338842856e-17, |
| "sampling/sampling_logp_difference/max": 17.244997024536133, |
| "sampling/sampling_logp_difference/mean": 0.0690554603934288, |
| "step": 14, |
| "step_time": 2234.725167364999 |
| }, |
| { |
| "clip_ratio/high_max": 0.001851851896693309, |
| "clip_ratio/high_mean": 0.001851851896693309, |
| "clip_ratio/low_mean": 0.0009523809421807528, |
| "clip_ratio/low_min": 0.0009523809421807528, |
| "clip_ratio/region_mean": 0.002804232838874062, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2404.0, |
| "completions/max_terminated_length": 2404.0, |
| "completions/mean_length": 1999.125, |
| "completions/mean_terminated_length": 1988.5653076171875, |
| "completions/min_length": 751.0, |
| "completions/min_terminated_length": 751.0, |
| "entropy": 0.18070783466100693, |
| "epoch": 0.0009000360014400576, |
| "frac_reward_zero_std": 0.1666666716337204, |
| "grad_norm": 1.9805845022201538, |
| "kl": 0.017273214490463335, |
| "learning_rate": 1.2e-05, |
| "loss": 0.1801, |
| "num_tokens": 1053620.0, |
| "reward": 0.3113042116165161, |
| "reward_std": 0.2151390165090561, |
| "rewards/rollout_reward_func/mean": 0.3113042116165161, |
| "rewards/rollout_reward_func/std": 0.31088533997535706, |
| "sampling/importance_sampling_ratio/max": 1.6857080459594727, |
| "sampling/importance_sampling_ratio/mean": 0.922259509563446, |
| "sampling/importance_sampling_ratio/min": 4.6535773281716764e-14, |
| "sampling/sampling_logp_difference/max": 20.623287200927734, |
| "sampling/sampling_logp_difference/mean": 0.05669760704040527, |
| "step": 15, |
| "step_time": 2093.846225461999 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 33332, |
| "num_input_tokens_seen": 1053620, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|