| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.18450184501845018, |
| "eval_steps": 500, |
| "global_step": 50, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "cispo_clip_ratio": 0.364774439483881, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 5409.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2607.8125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 674.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.4863766431808472, |
| "epoch": 0.0036900369003690036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6501518985023722e-05, |
| "learning_rate": 0.0002, |
| "loss": -5.250487447483465e-05, |
| "num_tokens": 45661.0, |
| "reward": 0.2993389666080475, |
| "reward_std": 0.17980147898197174, |
| "rewards/humor_reward/mean": 0.2993389666080475, |
| "rewards/humor_reward/std": 0.17980147898197174, |
| "sampling/importance_sampling_ratio/max": 0.0032514820341020823, |
| "sampling/importance_sampling_ratio/mean": 0.00025803607422858477, |
| "sampling/importance_sampling_ratio/min": 6.591494638996664e-08, |
| "sampling/sampling_logp_difference/max": 3.1304593086242676, |
| "sampling/sampling_logp_difference/mean": 0.04722540080547333, |
| "step": 1, |
| "step_time": 79.16197230700345 |
| }, |
| { |
| "cispo_clip_ratio": 0.4739677682518959, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 4785.0, |
| "completions/max_terminated_length": 439.0, |
| "completions/mean_length": 2616.0625, |
| "completions/mean_terminated_length": 439.0, |
| "completions/min_length": 439.0, |
| "completions/min_terminated_length": 439.0, |
| "entropy": 1.2316511571407318, |
| "epoch": 0.007380073800738007, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.003729997668415308, |
| "learning_rate": 0.0002, |
| "loss": -0.003799113677814603, |
| "num_tokens": 91230.0, |
| "reward": 0.5310899019241333, |
| "reward_std": 0.27845898270606995, |
| "rewards/humor_reward/mean": 0.5310899019241333, |
| "rewards/humor_reward/std": 0.27845898270606995, |
| "sampling/importance_sampling_ratio/max": 1.048789381980896, |
| "sampling/importance_sampling_ratio/mean": 0.12509621679782867, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 2.2385149002075195, |
| "sampling/sampling_logp_difference/mean": 0.04188045486807823, |
| "step": 2, |
| "step_time": 66.1825476239901 |
| }, |
| { |
| "cispo_clip_ratio": 0.5831195935606956, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 2244.0, |
| "completions/mean_length": 3197.3125, |
| "completions/mean_terminated_length": 2244.0, |
| "completions/min_length": 1313.0, |
| "completions/min_terminated_length": 2244.0, |
| "entropy": 1.3326809257268906, |
| "epoch": 0.01107011070110701, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.462636181619018e-05, |
| "learning_rate": 0.0002, |
| "loss": 0.00016624285490252078, |
| "num_tokens": 146995.0, |
| "reward": 0.4853614568710327, |
| "reward_std": 0.2525769770145416, |
| "rewards/humor_reward/mean": 0.4853614568710327, |
| "rewards/humor_reward/std": 0.252577006816864, |
| "sampling/importance_sampling_ratio/max": 0.022306665778160095, |
| "sampling/importance_sampling_ratio/mean": 0.001404650043696165, |
| "sampling/importance_sampling_ratio/min": 1.7064398505350908e-10, |
| "sampling/sampling_logp_difference/max": 3.054729700088501, |
| "sampling/sampling_logp_difference/mean": 0.0515315905213356, |
| "step": 3, |
| "step_time": 86.2583410259831 |
| }, |
| { |
| "cispo_clip_ratio": 0.5900484155863523, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3705.3125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1009.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.533411294221878, |
| "epoch": 0.014760147601476014, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0004810348036698997, |
| "learning_rate": 0.0002, |
| "loss": -0.0014380415668711066, |
| "num_tokens": 210440.0, |
| "reward": 0.48688405752182007, |
| "reward_std": 0.2839711010456085, |
| "rewards/humor_reward/mean": 0.48688405752182007, |
| "rewards/humor_reward/std": 0.2839711010456085, |
| "sampling/importance_sampling_ratio/max": 0.21202650666236877, |
| "sampling/importance_sampling_ratio/mean": 0.013267980888485909, |
| "sampling/importance_sampling_ratio/min": 1.5972070888103929e-10, |
| "sampling/sampling_logp_difference/max": 3.0988059043884277, |
| "sampling/sampling_logp_difference/mean": 0.04585312306880951, |
| "step": 4, |
| "step_time": 83.59175038301328 |
| }, |
| { |
| "cispo_clip_ratio": 0.6057924032211304, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4057.8125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 482.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.3052150011062622, |
| "epoch": 0.01845018450184502, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.325129793869564e-06, |
| "learning_rate": 0.0002, |
| "loss": -5.259538738755509e-05, |
| "num_tokens": 279525.0, |
| "reward": 0.409991055727005, |
| "reward_std": 0.27515894174575806, |
| "rewards/humor_reward/mean": 0.409991055727005, |
| "rewards/humor_reward/std": 0.27515894174575806, |
| "sampling/importance_sampling_ratio/max": 0.0020468428265303373, |
| "sampling/importance_sampling_ratio/mean": 0.0002199342561652884, |
| "sampling/importance_sampling_ratio/min": 4.834557643107473e-09, |
| "sampling/sampling_logp_difference/max": 1.898941993713379, |
| "sampling/sampling_logp_difference/mean": 0.04034310206770897, |
| "step": 5, |
| "step_time": 63.17914728100004 |
| }, |
| { |
| "cispo_clip_ratio": 0.51509914919734, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4054.625, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 106.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.3227798640727997, |
| "epoch": 0.02214022140221402, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0003930912062060088, |
| "learning_rate": 0.0002, |
| "loss": -8.775031892582774e-05, |
| "num_tokens": 348111.0, |
| "reward": 0.47100865840911865, |
| "reward_std": 0.2912721335887909, |
| "rewards/humor_reward/mean": 0.47100865840911865, |
| "rewards/humor_reward/std": 0.2912721633911133, |
| "sampling/importance_sampling_ratio/max": 0.26060751080513, |
| "sampling/importance_sampling_ratio/mean": 0.0164032019674778, |
| "sampling/importance_sampling_ratio/min": 9.639339326739105e-10, |
| "sampling/sampling_logp_difference/max": 2.0555481910705566, |
| "sampling/sampling_logp_difference/mean": 0.039414241909980774, |
| "step": 6, |
| "step_time": 82.86594534001779 |
| }, |
| { |
| "cispo_clip_ratio": 0.6096626706421375, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 3121.0, |
| "completions/mean_length": 3594.3125, |
| "completions/mean_terminated_length": 1685.5, |
| "completions/min_length": 184.0, |
| "completions/min_terminated_length": 250.0, |
| "entropy": 1.4805490970611572, |
| "epoch": 0.025830258302583026, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0001002174976747483, |
| "learning_rate": 0.0002, |
| "loss": -0.00036368955625221133, |
| "num_tokens": 409556.0, |
| "reward": 0.25648510456085205, |
| "reward_std": 0.21258896589279175, |
| "rewards/humor_reward/mean": 0.25648510456085205, |
| "rewards/humor_reward/std": 0.21258896589279175, |
| "sampling/importance_sampling_ratio/max": 0.042766377329826355, |
| "sampling/importance_sampling_ratio/mean": 0.003916537389159203, |
| "sampling/importance_sampling_ratio/min": 3.151466310136186e-10, |
| "sampling/sampling_logp_difference/max": 2.511254072189331, |
| "sampling/sampling_logp_difference/mean": 0.04306597262620926, |
| "step": 7, |
| "step_time": 77.88916695199441 |
| }, |
| { |
| "cispo_clip_ratio": 0.4443662669509649, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4063.8125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 426.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.275453269481659, |
| "epoch": 0.02952029520295203, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6524854800081812e-05, |
| "learning_rate": 0.0002, |
| "loss": 6.745757855242118e-05, |
| "num_tokens": 478513.0, |
| "reward": 0.3238402009010315, |
| "reward_std": 0.2930999994277954, |
| "rewards/humor_reward/mean": 0.3238402009010315, |
| "rewards/humor_reward/std": 0.2931000292301178, |
| "sampling/importance_sampling_ratio/max": 0.010375253856182098, |
| "sampling/importance_sampling_ratio/mean": 0.0010308868950232863, |
| "sampling/importance_sampling_ratio/min": 5.8848907968922504e-08, |
| "sampling/sampling_logp_difference/max": 3.019871711730957, |
| "sampling/sampling_logp_difference/mean": 0.038398630917072296, |
| "step": 8, |
| "step_time": 62.586684064008296 |
| }, |
| { |
| "cispo_clip_ratio": 0.4614889621734619, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3351.5, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 925.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.392012134194374, |
| "epoch": 0.033210332103321034, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.4697693485650234e-08, |
| "learning_rate": 0.0002, |
| "loss": 3.4436376949997793e-07, |
| "num_tokens": 536969.0, |
| "reward": 0.320010781288147, |
| "reward_std": 0.26967623829841614, |
| "rewards/humor_reward/mean": 0.320010781288147, |
| "rewards/humor_reward/std": 0.26967623829841614, |
| "sampling/importance_sampling_ratio/max": 2.8325872335699387e-05, |
| "sampling/importance_sampling_ratio/mean": 4.4159696699352935e-06, |
| "sampling/importance_sampling_ratio/min": 4.985558949184565e-13, |
| "sampling/sampling_logp_difference/max": 2.7537732124328613, |
| "sampling/sampling_logp_difference/mean": 0.05181171000003815, |
| "step": 9, |
| "step_time": 85.44417907499883 |
| }, |
| { |
| "cispo_clip_ratio": 0.46727752313017845, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4545.25, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1297.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.277551643550396, |
| "epoch": 0.03690036900369004, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0018553344998508692, |
| "learning_rate": 0.0002, |
| "loss": -0.00018077026470564306, |
| "num_tokens": 613629.0, |
| "reward": 0.3976645767688751, |
| "reward_std": 0.3508976995944977, |
| "rewards/humor_reward/mean": 0.3976645767688751, |
| "rewards/humor_reward/std": 0.3508976995944977, |
| "sampling/importance_sampling_ratio/max": 0.32763394713401794, |
| "sampling/importance_sampling_ratio/mean": 0.02203518897294998, |
| "sampling/importance_sampling_ratio/min": 2.647046748460724e-10, |
| "sampling/sampling_logp_difference/max": 3.727196216583252, |
| "sampling/sampling_logp_difference/mean": 0.037366170436143875, |
| "step": 10, |
| "step_time": 83.16811528199469 |
| }, |
| { |
| "cispo_clip_ratio": 0.564013222232461, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4322.8125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 705.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.0119460746645927, |
| "epoch": 0.04059040590405904, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0012440788559615612, |
| "learning_rate": 0.0002, |
| "loss": -0.0035373736172914505, |
| "num_tokens": 687402.0, |
| "reward": 0.378947377204895, |
| "reward_std": 0.2340850830078125, |
| "rewards/humor_reward/mean": 0.378947377204895, |
| "rewards/humor_reward/std": 0.2340850979089737, |
| "sampling/importance_sampling_ratio/max": 0.30296310782432556, |
| "sampling/importance_sampling_ratio/mean": 0.018969321623444557, |
| "sampling/importance_sampling_ratio/min": 8.10443156811988e-15, |
| "sampling/sampling_logp_difference/max": 3.91379976272583, |
| "sampling/sampling_logp_difference/mean": 0.04132331907749176, |
| "step": 11, |
| "step_time": 94.6180467310187 |
| }, |
| { |
| "cispo_clip_ratio": 0.3971639759838581, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 5086.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1271.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.0960774347186089, |
| "epoch": 0.04428044280442804, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.153976962901652e-05, |
| "learning_rate": 0.0002, |
| "loss": -0.00044693646486848593, |
| "num_tokens": 772714.0, |
| "reward": 0.32133913040161133, |
| "reward_std": 0.380583792924881, |
| "rewards/humor_reward/mean": 0.32133913040161133, |
| "rewards/humor_reward/std": 0.38058382272720337, |
| "sampling/importance_sampling_ratio/max": 0.017419712617993355, |
| "sampling/importance_sampling_ratio/mean": 0.001832809066399932, |
| "sampling/importance_sampling_ratio/min": 6.273771866599498e-12, |
| "sampling/sampling_logp_difference/max": 5.7523579597473145, |
| "sampling/sampling_logp_difference/mean": 0.034892238676548004, |
| "step": 12, |
| "step_time": 79.48709479898389 |
| }, |
| { |
| "cispo_clip_ratio": 0.5508307814598083, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4785.875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1482.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.299708716571331, |
| "epoch": 0.04797047970479705, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.673526968370425e-06, |
| "learning_rate": 0.0002, |
| "loss": -2.0674373445217498e-05, |
| "num_tokens": 853224.0, |
| "reward": 0.36821186542510986, |
| "reward_std": 0.31694290041923523, |
| "rewards/humor_reward/mean": 0.36821186542510986, |
| "rewards/humor_reward/std": 0.3169429302215576, |
| "sampling/importance_sampling_ratio/max": 0.0009282280807383358, |
| "sampling/importance_sampling_ratio/mean": 0.0001542143290862441, |
| "sampling/importance_sampling_ratio/min": 4.797664487909969e-10, |
| "sampling/sampling_logp_difference/max": 2.7313647270202637, |
| "sampling/sampling_logp_difference/mean": 0.03830663487315178, |
| "step": 13, |
| "step_time": 82.41276069695596 |
| }, |
| { |
| "cispo_clip_ratio": 0.3803188279271126, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 1625.0, |
| "completions/mean_length": 4955.6875, |
| "completions/mean_terminated_length": 1625.0, |
| "completions/min_length": 968.0, |
| "completions/min_terminated_length": 1625.0, |
| "entropy": 1.364896483719349, |
| "epoch": 0.05166051660516605, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3385028978518676e-05, |
| "learning_rate": 0.0002, |
| "loss": 2.8572056180564687e-05, |
| "num_tokens": 937123.0, |
| "reward": 0.2386319488286972, |
| "reward_std": 0.28499212861061096, |
| "rewards/humor_reward/mean": 0.2386319488286972, |
| "rewards/humor_reward/std": 0.28499215841293335, |
| "sampling/importance_sampling_ratio/max": 0.009222447872161865, |
| "sampling/importance_sampling_ratio/mean": 0.0005764853558503091, |
| "sampling/importance_sampling_ratio/min": 1.6275064690298314e-15, |
| "sampling/sampling_logp_difference/max": 4.84999942779541, |
| "sampling/sampling_logp_difference/mean": 0.04662536829710007, |
| "step": 14, |
| "step_time": 85.49535570498847 |
| }, |
| { |
| "cispo_clip_ratio": 0.3275146186351776, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 2590.0, |
| "completions/mean_length": 4530.75, |
| "completions/mean_terminated_length": 2166.5, |
| "completions/min_length": 505.0, |
| "completions/min_terminated_length": 1743.0, |
| "entropy": 0.9317370727658272, |
| "epoch": 0.055350553505535055, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.116483614780009e-05, |
| "learning_rate": 0.0002, |
| "loss": 0.0002636128047015518, |
| "num_tokens": 1013327.0, |
| "reward": 0.2526451349258423, |
| "reward_std": 0.36080095171928406, |
| "rewards/humor_reward/mean": 0.2526451349258423, |
| "rewards/humor_reward/std": 0.36080095171928406, |
| "sampling/importance_sampling_ratio/max": 0.011370004154741764, |
| "sampling/importance_sampling_ratio/mean": 0.001944657531566918, |
| "sampling/importance_sampling_ratio/min": 1.1834137185820492e-12, |
| "sampling/sampling_logp_difference/max": 3.4003725051879883, |
| "sampling/sampling_logp_difference/mean": 0.03138995170593262, |
| "step": 15, |
| "step_time": 73.45879589200194 |
| }, |
| { |
| "cispo_clip_ratio": 0.1871738387271762, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 5333.125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 854.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.1076993495225906, |
| "epoch": 0.05904059040590406, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0002949201443698257, |
| "learning_rate": 0.0002, |
| "loss": 0.0002496525994502008, |
| "num_tokens": 1102593.0, |
| "reward": 0.12923595309257507, |
| "reward_std": 0.24049529433250427, |
| "rewards/humor_reward/mean": 0.12923595309257507, |
| "rewards/humor_reward/std": 0.24049529433250427, |
| "sampling/importance_sampling_ratio/max": 0.13862203061580658, |
| "sampling/importance_sampling_ratio/mean": 0.013717259280383587, |
| "sampling/importance_sampling_ratio/min": 2.0222361563071445e-09, |
| "sampling/sampling_logp_difference/max": 1.9613001346588135, |
| "sampling/sampling_logp_difference/mean": 0.02957136556506157, |
| "step": 16, |
| "step_time": 76.10329114498745 |
| }, |
| { |
| "cispo_clip_ratio": 0.2707713171839714, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 5502.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 2296.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.3415134847164154, |
| "epoch": 0.06273062730627306, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.443507812837197e-06, |
| "learning_rate": 0.0002, |
| "loss": 1.1968148101004772e-05, |
| "num_tokens": 1194337.0, |
| "reward": 0.14995136857032776, |
| "reward_std": 0.2515668570995331, |
| "rewards/humor_reward/mean": 0.14995136857032776, |
| "rewards/humor_reward/std": 0.25156688690185547, |
| "sampling/importance_sampling_ratio/max": 0.00044680986320599914, |
| "sampling/importance_sampling_ratio/mean": 3.927269790438004e-05, |
| "sampling/importance_sampling_ratio/min": 9.991393268293791e-10, |
| "sampling/sampling_logp_difference/max": 3.0433225631713867, |
| "sampling/sampling_logp_difference/mean": 0.033172741532325745, |
| "step": 17, |
| "step_time": 76.20640446299512 |
| }, |
| { |
| "cispo_clip_ratio": 0.4475329667329788, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4663.4375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1583.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.0032905638217926, |
| "epoch": 0.06642066420664207, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.456151003751074e-08, |
| "learning_rate": 0.0002, |
| "loss": 1.6434560734523984e-07, |
| "num_tokens": 1273560.0, |
| "reward": 0.47607260942459106, |
| "reward_std": 0.3206636905670166, |
| "rewards/humor_reward/mean": 0.47607260942459106, |
| "rewards/humor_reward/std": 0.3206636905670166, |
| "sampling/importance_sampling_ratio/max": 1.8386488591204397e-05, |
| "sampling/importance_sampling_ratio/mean": 2.1001696950406767e-06, |
| "sampling/importance_sampling_ratio/min": 2.987033566998441e-14, |
| "sampling/sampling_logp_difference/max": 2.9727420806884766, |
| "sampling/sampling_logp_difference/mean": 0.04333854466676712, |
| "step": 18, |
| "step_time": 91.44482385799347 |
| }, |
| { |
| "cispo_clip_ratio": 0.5934446323662996, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4332.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 692.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.2455691695213318, |
| "epoch": 0.07011070110701106, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.6948493541567586e-05, |
| "learning_rate": 0.0002, |
| "loss": 0.00015589207760058343, |
| "num_tokens": 1346808.0, |
| "reward": 0.4407455325126648, |
| "reward_std": 0.32844167947769165, |
| "rewards/humor_reward/mean": 0.4407455325126648, |
| "rewards/humor_reward/std": 0.32844167947769165, |
| "sampling/importance_sampling_ratio/max": 0.0622418075799942, |
| "sampling/importance_sampling_ratio/mean": 0.00455310195684433, |
| "sampling/importance_sampling_ratio/min": 6.87965170942384e-11, |
| "sampling/sampling_logp_difference/max": 1.9999836683273315, |
| "sampling/sampling_logp_difference/mean": 0.040162429213523865, |
| "step": 19, |
| "step_time": 63.66296142600186 |
| }, |
| { |
| "cispo_clip_ratio": 0.3904235363006592, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 3539.0, |
| "completions/mean_length": 4865.125, |
| "completions/mean_terminated_length": 3539.0, |
| "completions/min_length": 1705.0, |
| "completions/min_terminated_length": 3539.0, |
| "entropy": 1.1548645570874214, |
| "epoch": 0.07380073800738007, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0003063328331336379, |
| "learning_rate": 0.0002, |
| "loss": 0.0016583555843681097, |
| "num_tokens": 1428810.0, |
| "reward": 0.32657718658447266, |
| "reward_std": 0.3702032268047333, |
| "rewards/humor_reward/mean": 0.32657718658447266, |
| "rewards/humor_reward/std": 0.37020325660705566, |
| "sampling/importance_sampling_ratio/max": 0.07926318794488907, |
| "sampling/importance_sampling_ratio/mean": 0.00835469737648964, |
| "sampling/importance_sampling_ratio/min": 1.397608049256982e-12, |
| "sampling/sampling_logp_difference/max": 3.58780574798584, |
| "sampling/sampling_logp_difference/mean": 0.036592792719602585, |
| "step": 20, |
| "step_time": 80.32529347503441 |
| }, |
| { |
| "cispo_clip_ratio": 0.46490118466317654, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4817.375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1635.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.468906119465828, |
| "epoch": 0.07749077490774908, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 9.814787915729539e-08, |
| "learning_rate": 0.0002, |
| "loss": -6.973982635827269e-07, |
| "num_tokens": 1510048.0, |
| "reward": 0.4167077839374542, |
| "reward_std": 0.3755662441253662, |
| "rewards/humor_reward/mean": 0.4167077839374542, |
| "rewards/humor_reward/std": 0.3755662441253662, |
| "sampling/importance_sampling_ratio/max": 2.670207504706923e-05, |
| "sampling/importance_sampling_ratio/mean": 3.46238084603101e-06, |
| "sampling/importance_sampling_ratio/min": 6.556125781154165e-11, |
| "sampling/sampling_logp_difference/max": 2.0220751762390137, |
| "sampling/sampling_logp_difference/mean": 0.04112354293465614, |
| "step": 21, |
| "step_time": 66.39175808998698 |
| }, |
| { |
| "cispo_clip_ratio": 0.4479520544409752, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4476.625, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 576.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.3538827523589134, |
| "epoch": 0.08118081180811808, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 9.053791472979356e-06, |
| "learning_rate": 0.0002, |
| "loss": -4.3639585783239454e-05, |
| "num_tokens": 1585386.0, |
| "reward": 0.398086816072464, |
| "reward_std": 0.39999622106552124, |
| "rewards/humor_reward/mean": 0.398086816072464, |
| "rewards/humor_reward/std": 0.39999625086784363, |
| "sampling/importance_sampling_ratio/max": 0.011358072981238365, |
| "sampling/importance_sampling_ratio/mean": 0.0010091899894177914, |
| "sampling/importance_sampling_ratio/min": 4.4679931976432385e-10, |
| "sampling/sampling_logp_difference/max": 3.5639562606811523, |
| "sampling/sampling_logp_difference/mean": 0.035584691911935806, |
| "step": 22, |
| "step_time": 77.29983079498925 |
| }, |
| { |
| "cispo_clip_ratio": 0.5936666019260883, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4713.3125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 701.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.1694510877132416, |
| "epoch": 0.08487084870848709, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00025732198264449835, |
| "learning_rate": 0.0002, |
| "loss": 0.00032361538615077734, |
| "num_tokens": 1664511.0, |
| "reward": 0.3887137174606323, |
| "reward_std": 0.35025978088378906, |
| "rewards/humor_reward/mean": 0.3887137174606323, |
| "rewards/humor_reward/std": 0.35025978088378906, |
| "sampling/importance_sampling_ratio/max": 0.2608579397201538, |
| "sampling/importance_sampling_ratio/mean": 0.016369398683309555, |
| "sampling/importance_sampling_ratio/min": 5.653824977636113e-11, |
| "sampling/sampling_logp_difference/max": 1.6544189453125, |
| "sampling/sampling_logp_difference/mean": 0.03655308857560158, |
| "step": 23, |
| "step_time": 79.8481750869978 |
| }, |
| { |
| "cispo_clip_ratio": 0.46904783695936203, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4739.25, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 2132.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.2212552726268768, |
| "epoch": 0.08856088560885608, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.1028017727076076e-06, |
| "learning_rate": 0.0002, |
| "loss": 5.860923010914121e-06, |
| "num_tokens": 1744499.0, |
| "reward": 0.33275866508483887, |
| "reward_std": 0.3678642511367798, |
| "rewards/humor_reward/mean": 0.33275866508483887, |
| "rewards/humor_reward/std": 0.3678642809391022, |
| "sampling/importance_sampling_ratio/max": 0.0013681778218597174, |
| "sampling/importance_sampling_ratio/mean": 9.336201765108854e-05, |
| "sampling/importance_sampling_ratio/min": 2.4985560842516463e-10, |
| "sampling/sampling_logp_difference/max": 1.9406442642211914, |
| "sampling/sampling_logp_difference/mean": 0.03511609137058258, |
| "step": 24, |
| "step_time": 79.88214839099965 |
| }, |
| { |
| "cispo_clip_ratio": 0.4411753863096237, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4651.4375, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1004.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 0.8522009998559952, |
| "epoch": 0.09225092250922509, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00017764404765330255, |
| "learning_rate": 0.0002, |
| "loss": 0.0008231036481447518, |
| "num_tokens": 1823082.0, |
| "reward": 0.274623304605484, |
| "reward_std": 0.3160068690776825, |
| "rewards/humor_reward/mean": 0.274623304605484, |
| "rewards/humor_reward/std": 0.3160068988800049, |
| "sampling/importance_sampling_ratio/max": 0.03566610440611839, |
| "sampling/importance_sampling_ratio/mean": 0.0037315732333809137, |
| "sampling/importance_sampling_ratio/min": 4.288159161092153e-09, |
| "sampling/sampling_logp_difference/max": 2.191567897796631, |
| "sampling/sampling_logp_difference/mean": 0.027100346982479095, |
| "step": 25, |
| "step_time": 65.47632334999798 |
| }, |
| { |
| "cispo_clip_ratio": 0.3125321865081787, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 5114.5625, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 354.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 0.8008184731006622, |
| "epoch": 0.0959409594095941, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00044094581971876323, |
| "learning_rate": 0.0002, |
| "loss": -0.0023368855472654104, |
| "num_tokens": 1909075.0, |
| "reward": 0.2744894027709961, |
| "reward_std": 0.3853631615638733, |
| "rewards/humor_reward/mean": 0.2744894027709961, |
| "rewards/humor_reward/std": 0.3853631913661957, |
| "sampling/importance_sampling_ratio/max": 0.27399709820747375, |
| "sampling/importance_sampling_ratio/mean": 0.02907378226518631, |
| "sampling/importance_sampling_ratio/min": 1.0831848751280404e-09, |
| "sampling/sampling_logp_difference/max": 2.079195022583008, |
| "sampling/sampling_logp_difference/mean": 0.027823781594634056, |
| "step": 26, |
| "step_time": 79.29085839301115 |
| }, |
| { |
| "cispo_clip_ratio": 0.45367857813835144, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 4420.875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1406.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.1720404550433159, |
| "epoch": 0.0996309963099631, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0236677300999872e-05, |
| "learning_rate": 0.0002, |
| "loss": -0.0002367804991081357, |
| "num_tokens": 1983073.0, |
| "reward": 0.37771666049957275, |
| "reward_std": 0.3381035327911377, |
| "rewards/humor_reward/mean": 0.37771666049957275, |
| "rewards/humor_reward/std": 0.3381035625934601, |
| "sampling/importance_sampling_ratio/max": 0.01442171260714531, |
| "sampling/importance_sampling_ratio/mean": 0.0017098486423492432, |
| "sampling/importance_sampling_ratio/min": 2.375326602077621e-08, |
| "sampling/sampling_logp_difference/max": 2.334380626678467, |
| "sampling/sampling_logp_difference/mean": 0.03178582713007927, |
| "step": 27, |
| "step_time": 75.20771357801277 |
| }, |
| { |
| "cispo_clip_ratio": 0.6514521557837725, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 5111.0, |
| "completions/mean_length": 3562.625, |
| "completions/mean_terminated_length": 5111.0, |
| "completions/min_length": 171.0, |
| "completions/min_terminated_length": 5111.0, |
| "entropy": 1.2078422084450722, |
| "epoch": 0.1033210332103321, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.000864825677126646, |
| "learning_rate": 0.0002, |
| "loss": 0.0034890188835561275, |
| "num_tokens": 2044011.0, |
| "reward": 0.5999776124954224, |
| "reward_std": 0.378849059343338, |
| "rewards/humor_reward/mean": 0.5999776124954224, |
| "rewards/humor_reward/std": 0.3788490891456604, |
| "sampling/importance_sampling_ratio/max": 0.4461648166179657, |
| "sampling/importance_sampling_ratio/mean": 0.056391313672065735, |
| "sampling/importance_sampling_ratio/min": 1.9029215536647826e-08, |
| "sampling/sampling_logp_difference/max": 2.0776968002319336, |
| "sampling/sampling_logp_difference/mean": 0.04053337126970291, |
| "step": 28, |
| "step_time": 59.718489810984465 |
| }, |
| { |
| "cispo_clip_ratio": 0.5842036623507738, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3761.1875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1373.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 0.9947392120957375, |
| "epoch": 0.1070110701107011, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.404234002526209e-07, |
| "learning_rate": 0.0002, |
| "loss": 3.1678846426075324e-06, |
| "num_tokens": 2108798.0, |
| "reward": 0.5179843902587891, |
| "reward_std": 0.2485807240009308, |
| "rewards/humor_reward/mean": 0.5179843902587891, |
| "rewards/humor_reward/std": 0.2485807240009308, |
| "sampling/importance_sampling_ratio/max": 0.00023896525090094656, |
| "sampling/importance_sampling_ratio/mean": 1.7564820154802874e-05, |
| "sampling/importance_sampling_ratio/min": 8.362109155070005e-15, |
| "sampling/sampling_logp_difference/max": 3.4245944023132324, |
| "sampling/sampling_logp_difference/mean": 0.04190240800380707, |
| "step": 29, |
| "step_time": 92.04002382898761 |
| }, |
| { |
| "cispo_clip_ratio": 0.481358059681952, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 3548.0, |
| "completions/mean_length": 3230.25, |
| "completions/mean_terminated_length": 3548.0, |
| "completions/min_length": 142.0, |
| "completions/min_terminated_length": 3548.0, |
| "entropy": 0.9884351938962936, |
| "epoch": 0.11070110701107011, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0007160792592912912, |
| "learning_rate": 0.0002, |
| "loss": 0.0030249895062297583, |
| "num_tokens": 2165090.0, |
| "reward": 0.32606449723243713, |
| "reward_std": 0.2554178833961487, |
| "rewards/humor_reward/mean": 0.32606449723243713, |
| "rewards/humor_reward/std": 0.2554178833961487, |
| "sampling/importance_sampling_ratio/max": 0.19502675533294678, |
| "sampling/importance_sampling_ratio/mean": 0.01895805075764656, |
| "sampling/importance_sampling_ratio/min": 1.5065569625161146e-10, |
| "sampling/sampling_logp_difference/max": 2.431838035583496, |
| "sampling/sampling_logp_difference/mean": 0.04055629298090935, |
| "step": 30, |
| "step_time": 83.23436555001535 |
| }, |
| { |
| "cispo_clip_ratio": 0.6561583578586578, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3374.625, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 247.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.0753171369433403, |
| "epoch": 0.11439114391143912, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0035273912362754345, |
| "learning_rate": 0.0002, |
| "loss": 0.01652080938220024, |
| "num_tokens": 2223020.0, |
| "reward": 0.5712062120437622, |
| "reward_std": 0.3373561203479767, |
| "rewards/humor_reward/mean": 0.5712062120437622, |
| "rewards/humor_reward/std": 0.3373561203479767, |
| "sampling/importance_sampling_ratio/max": 1.9402817487716675, |
| "sampling/importance_sampling_ratio/mean": 0.13337960839271545, |
| "sampling/importance_sampling_ratio/min": 1.6416909431882232e-07, |
| "sampling/sampling_logp_difference/max": 6.734940528869629, |
| "sampling/sampling_logp_difference/mean": 0.03378872573375702, |
| "step": 31, |
| "step_time": 59.33454087101563 |
| }, |
| { |
| "cispo_clip_ratio": 0.6109768375754356, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3135.3125, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 350.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.203492984175682, |
| "epoch": 0.11808118081180811, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.204303594538942e-05, |
| "learning_rate": 0.0002, |
| "loss": 0.0006842454895377159, |
| "num_tokens": 2277121.0, |
| "reward": 0.6346642971038818, |
| "reward_std": 0.2692297101020813, |
| "rewards/humor_reward/mean": 0.6346642971038818, |
| "rewards/humor_reward/std": 0.2692297399044037, |
| "sampling/importance_sampling_ratio/max": 1.2163432836532593, |
| "sampling/importance_sampling_ratio/mean": 0.07933147996664047, |
| "sampling/importance_sampling_ratio/min": 3.8786645006894105e-09, |
| "sampling/sampling_logp_difference/max": 1.4427013397216797, |
| "sampling/sampling_logp_difference/mean": 0.03805601969361305, |
| "step": 32, |
| "step_time": 56.60236855900439 |
| }, |
| { |
| "cispo_clip_ratio": 0.6497912313789129, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 4153.0, |
| "completions/mean_length": 3000.5625, |
| "completions/mean_terminated_length": 2144.5, |
| "completions/min_length": 136.0, |
| "completions/min_terminated_length": 136.0, |
| "entropy": 1.1894647032022476, |
| "epoch": 0.12177121771217712, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00014119563275016844, |
| "learning_rate": 0.0002, |
| "loss": 0.0002433458430459723, |
| "num_tokens": 2329962.0, |
| "reward": 0.5449860095977783, |
| "reward_std": 0.3282092809677124, |
| "rewards/humor_reward/mean": 0.5449860095977783, |
| "rewards/humor_reward/std": 0.3282092809677124, |
| "sampling/importance_sampling_ratio/max": 0.08825313299894333, |
| "sampling/importance_sampling_ratio/mean": 0.006083119660615921, |
| "sampling/importance_sampling_ratio/min": 1.4223322519768544e-09, |
| "sampling/sampling_logp_difference/max": 2.7407069206237793, |
| "sampling/sampling_logp_difference/mean": 0.04315001145005226, |
| "step": 33, |
| "step_time": 87.83407789799094 |
| }, |
| { |
| "cispo_clip_ratio": 0.603145282715559, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 5923.0, |
| "completions/max_terminated_length": 1476.0, |
| "completions/mean_length": 3187.3125, |
| "completions/mean_terminated_length": 1476.0, |
| "completions/min_length": 1006.0, |
| "completions/min_terminated_length": 1476.0, |
| "entropy": 1.1579975709319115, |
| "epoch": 0.12546125461254612, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.3547359129879624e-05, |
| "learning_rate": 0.0002, |
| "loss": 0.0002505858719814569, |
| "num_tokens": 2385343.0, |
| "reward": 0.5027609467506409, |
| "reward_std": 0.23998978734016418, |
| "rewards/humor_reward/mean": 0.5027609467506409, |
| "rewards/humor_reward/std": 0.23998978734016418, |
| "sampling/importance_sampling_ratio/max": 0.03983650356531143, |
| "sampling/importance_sampling_ratio/mean": 0.003191877156496048, |
| "sampling/importance_sampling_ratio/min": 9.33344068698716e-09, |
| "sampling/sampling_logp_difference/max": 2.9324636459350586, |
| "sampling/sampling_logp_difference/mean": 0.04020746424794197, |
| "step": 34, |
| "step_time": 55.506949331000214 |
| }, |
| { |
| "cispo_clip_ratio": 0.6451264955103397, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 5022.0, |
| "completions/max_terminated_length": 529.0, |
| "completions/mean_length": 2561.125, |
| "completions/mean_terminated_length": 529.0, |
| "completions/min_length": 529.0, |
| "completions/min_terminated_length": 529.0, |
| "entropy": 1.1418700069189072, |
| "epoch": 0.12915129151291513, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0015835068188607693, |
| "learning_rate": 0.0002, |
| "loss": -0.004869138356298208, |
| "num_tokens": 2430257.0, |
| "reward": 0.5386431217193604, |
| "reward_std": 0.27275699377059937, |
| "rewards/humor_reward/mean": 0.5386431217193604, |
| "rewards/humor_reward/std": 0.27275702357292175, |
| "sampling/importance_sampling_ratio/max": 0.4542810022830963, |
| "sampling/importance_sampling_ratio/mean": 0.035323094576597214, |
| "sampling/importance_sampling_ratio/min": 4.0660263350700276e-11, |
| "sampling/sampling_logp_difference/max": 17.207799911499023, |
| "sampling/sampling_logp_difference/mean": 0.038973793387413025, |
| "step": 35, |
| "step_time": 69.47620079400076 |
| }, |
| { |
| "cispo_clip_ratio": 0.6264413706958294, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 4404.0, |
| "completions/mean_length": 3681.5, |
| "completions/mean_terminated_length": 4404.0, |
| "completions/min_length": 524.0, |
| "completions/min_terminated_length": 4404.0, |
| "entropy": 1.4374851435422897, |
| "epoch": 0.13284132841328414, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0009762643603608012, |
| "learning_rate": 0.0002, |
| "loss": 0.006543246563524008, |
| "num_tokens": 2493321.0, |
| "reward": 0.5082834959030151, |
| "reward_std": 0.34550100564956665, |
| "rewards/humor_reward/mean": 0.5082834959030151, |
| "rewards/humor_reward/std": 0.34550103545188904, |
| "sampling/importance_sampling_ratio/max": 0.3936307430267334, |
| "sampling/importance_sampling_ratio/mean": 0.03291841596364975, |
| "sampling/importance_sampling_ratio/min": 5.0612318780451915e-09, |
| "sampling/sampling_logp_difference/max": 2.5245909690856934, |
| "sampling/sampling_logp_difference/mean": 0.03988190367817879, |
| "step": 36, |
| "step_time": 59.509835421005846 |
| }, |
| { |
| "cispo_clip_ratio": 0.6464070416986942, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3666.25, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1304.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.4860893040895462, |
| "epoch": 0.13653136531365315, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0014742233324795961, |
| "learning_rate": 0.0002, |
| "loss": 0.009775211103260517, |
| "num_tokens": 2555917.0, |
| "reward": 0.5413703918457031, |
| "reward_std": 0.2888968586921692, |
| "rewards/humor_reward/mean": 0.5413703918457031, |
| "rewards/humor_reward/std": 0.2888968884944916, |
| "sampling/importance_sampling_ratio/max": 0.5927172899246216, |
| "sampling/importance_sampling_ratio/mean": 0.03740416467189789, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 2.5622243881225586, |
| "sampling/sampling_logp_difference/mean": 0.03937282785773277, |
| "step": 37, |
| "step_time": 84.5253242639883 |
| }, |
| { |
| "cispo_clip_ratio": 0.5759155452251434, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 5372.0, |
| "completions/max_terminated_length": 3480.0, |
| "completions/mean_length": 2918.75, |
| "completions/mean_terminated_length": 2408.0, |
| "completions/min_length": 964.0, |
| "completions/min_terminated_length": 1336.0, |
| "entropy": 1.4434982985258102, |
| "epoch": 0.14022140221402213, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.001353645813651383, |
| "learning_rate": 0.0002, |
| "loss": -0.005609482992440462, |
| "num_tokens": 2606553.0, |
| "reward": 0.27517157793045044, |
| "reward_std": 0.17297060787677765, |
| "rewards/humor_reward/mean": 0.27517157793045044, |
| "rewards/humor_reward/std": 0.17297060787677765, |
| "sampling/importance_sampling_ratio/max": 0.8227623105049133, |
| "sampling/importance_sampling_ratio/mean": 0.08773483335971832, |
| "sampling/importance_sampling_ratio/min": 3.3746889016583737e-07, |
| "sampling/sampling_logp_difference/max": 1.4419441223144531, |
| "sampling/sampling_logp_difference/mean": 0.04054056107997894, |
| "step": 38, |
| "step_time": 70.05384999897797 |
| }, |
| { |
| "cispo_clip_ratio": 0.47585512325167656, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 3937.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2637.875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 631.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.9437123090028763, |
| "epoch": 0.14391143911439114, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0001363922783639282, |
| "learning_rate": 0.0002, |
| "loss": 0.001440045889467001, |
| "num_tokens": 2652695.0, |
| "reward": 0.5775443911552429, |
| "reward_std": 0.1908194124698639, |
| "rewards/humor_reward/mean": 0.5775443911552429, |
| "rewards/humor_reward/std": 0.1908194124698639, |
| "sampling/importance_sampling_ratio/max": 0.04935026168823242, |
| "sampling/importance_sampling_ratio/mean": 0.003360162954777479, |
| "sampling/importance_sampling_ratio/min": 9.852194615689314e-09, |
| "sampling/sampling_logp_difference/max": 2.8174242973327637, |
| "sampling/sampling_logp_difference/mean": 0.04766388610005379, |
| "step": 39, |
| "step_time": 65.56160380599613 |
| }, |
| { |
| "cispo_clip_ratio": 0.5537771657109261, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 3334.0, |
| "completions/mean_length": 3318.3125, |
| "completions/mean_terminated_length": 2593.5, |
| "completions/min_length": 1005.0, |
| "completions/min_terminated_length": 1853.0, |
| "entropy": 1.6230905801057816, |
| "epoch": 0.14760147601476015, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.798116323305294e-05, |
| "learning_rate": 0.0002, |
| "loss": 0.0004228673642501235, |
| "num_tokens": 2710396.0, |
| "reward": 0.3043411374092102, |
| "reward_std": 0.29077214002609253, |
| "rewards/humor_reward/mean": 0.3043411374092102, |
| "rewards/humor_reward/std": 0.29077211022377014, |
| "sampling/importance_sampling_ratio/max": 0.019216708838939667, |
| "sampling/importance_sampling_ratio/mean": 0.0028733701910823584, |
| "sampling/importance_sampling_ratio/min": 4.04905886597362e-09, |
| "sampling/sampling_logp_difference/max": 4.360086441040039, |
| "sampling/sampling_logp_difference/mean": 0.04630262777209282, |
| "step": 40, |
| "step_time": 82.58221219903498 |
| }, |
| { |
| "cispo_clip_ratio": 0.7186006270349026, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 5217.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3383.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 706.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.9114204794168472, |
| "epoch": 0.15129151291512916, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.005707095842808485, |
| "learning_rate": 0.0002, |
| "loss": 0.07725385576486588, |
| "num_tokens": 2768684.0, |
| "reward": 0.6538974642753601, |
| "reward_std": 0.22179268300533295, |
| "rewards/humor_reward/mean": 0.6538974642753601, |
| "rewards/humor_reward/std": 0.22179268300533295, |
| "sampling/importance_sampling_ratio/max": 2.3075239658355713, |
| "sampling/importance_sampling_ratio/mean": 0.24276912212371826, |
| "sampling/importance_sampling_ratio/min": 2.1112403203460417e-07, |
| "sampling/sampling_logp_difference/max": 1.9947752952575684, |
| "sampling/sampling_logp_difference/mean": 0.04460817202925682, |
| "step": 41, |
| "step_time": 50.73555830302939 |
| }, |
| { |
| "cispo_clip_ratio": 0.4581793490797281, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3443.75, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 823.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 1.684003859758377, |
| "epoch": 0.15498154981549817, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.124524952610955e-05, |
| "learning_rate": 0.0002, |
| "loss": -0.0007328057545237243, |
| "num_tokens": 2828168.0, |
| "reward": 0.38168156147003174, |
| "reward_std": 0.1816880702972412, |
| "rewards/humor_reward/mean": 0.38168156147003174, |
| "rewards/humor_reward/std": 0.1816880702972412, |
| "sampling/importance_sampling_ratio/max": 0.7620577812194824, |
| "sampling/importance_sampling_ratio/mean": 0.052507393062114716, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.3952629566192627, |
| "sampling/sampling_logp_difference/mean": 0.044263970106840134, |
| "step": 42, |
| "step_time": 86.9609940169612 |
| }, |
| { |
| "cispo_clip_ratio": 0.47743209451436996, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 957.0, |
| "completions/mean_length": 3697.8125, |
| "completions/mean_terminated_length": 957.0, |
| "completions/min_length": 608.0, |
| "completions/min_terminated_length": 957.0, |
| "entropy": 1.7134400680661201, |
| "epoch": 0.15867158671586715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0007043051300570369, |
| "learning_rate": 0.0002, |
| "loss": -0.0017245950875803828, |
| "num_tokens": 2891493.0, |
| "reward": 0.22081857919692993, |
| "reward_std": 0.221486434340477, |
| "rewards/humor_reward/mean": 0.22081857919692993, |
| "rewards/humor_reward/std": 0.22148646414279938, |
| "sampling/importance_sampling_ratio/max": 0.40315449237823486, |
| "sampling/importance_sampling_ratio/mean": 0.02584739960730076, |
| "sampling/importance_sampling_ratio/min": 2.196902926243638e-07, |
| "sampling/sampling_logp_difference/max": 1.3882695436477661, |
| "sampling/sampling_logp_difference/mean": 0.04035360738635063, |
| "step": 43, |
| "step_time": 79.38622622596449 |
| }, |
| { |
| "cispo_clip_ratio": 0.4560951702296734, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 3416.6875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 1422.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 2.0115034580230713, |
| "epoch": 0.16236162361623616, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00011439115041866899, |
| "learning_rate": 0.0002, |
| "loss": 0.0011536129750311375, |
| "num_tokens": 2950096.0, |
| "reward": 0.44468164443969727, |
| "reward_std": 0.2576386332511902, |
| "rewards/humor_reward/mean": 0.44468164443969727, |
| "rewards/humor_reward/std": 0.2576386630535126, |
| "sampling/importance_sampling_ratio/max": 0.04263285920023918, |
| "sampling/importance_sampling_ratio/mean": 0.004383362829685211, |
| "sampling/importance_sampling_ratio/min": 1.0335932643101842e-07, |
| "sampling/sampling_logp_difference/max": 2.972001552581787, |
| "sampling/sampling_logp_difference/mean": 0.04580405354499817, |
| "step": 44, |
| "step_time": 83.37110035201476 |
| }, |
| { |
| "cispo_clip_ratio": 0.5093751782551408, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2493.1875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 544.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 2.2517920583486557, |
| "epoch": 0.16605166051660517, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0009305228013545275, |
| "learning_rate": 0.0002, |
| "loss": -0.008580348454415798, |
| "num_tokens": 2994147.0, |
| "reward": 0.36086541414260864, |
| "reward_std": 0.23784013092517853, |
| "rewards/humor_reward/mean": 0.36086541414260864, |
| "rewards/humor_reward/std": 0.23784014582633972, |
| "sampling/importance_sampling_ratio/max": 0.34510719776153564, |
| "sampling/importance_sampling_ratio/mean": 0.02897491306066513, |
| "sampling/importance_sampling_ratio/min": 1.3008460953388423e-13, |
| "sampling/sampling_logp_difference/max": 4.940759658813477, |
| "sampling/sampling_logp_difference/mean": 0.05280106142163277, |
| "step": 45, |
| "step_time": 81.29779967200011 |
| }, |
| { |
| "cispo_clip_ratio": 0.4961427040398121, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 1664.0, |
| "completions/mean_length": 2631.6875, |
| "completions/mean_terminated_length": 1664.0, |
| "completions/min_length": 919.0, |
| "completions/min_terminated_length": 1664.0, |
| "entropy": 2.523237004876137, |
| "epoch": 0.16974169741697417, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0004199541872367263, |
| "learning_rate": 0.0002, |
| "loss": 0.0025538511108607054, |
| "num_tokens": 3040190.0, |
| "reward": 0.3032156825065613, |
| "reward_std": 0.2560163736343384, |
| "rewards/humor_reward/mean": 0.3032156825065613, |
| "rewards/humor_reward/std": 0.2560163736343384, |
| "sampling/importance_sampling_ratio/max": 0.6374390125274658, |
| "sampling/importance_sampling_ratio/mean": 0.047728560864925385, |
| "sampling/importance_sampling_ratio/min": 3.620046413743694e-07, |
| "sampling/sampling_logp_difference/max": 2.632134437561035, |
| "sampling/sampling_logp_difference/mean": 0.047268252819776535, |
| "step": 46, |
| "step_time": 73.58007367796381 |
| }, |
| { |
| "cispo_clip_ratio": 0.3577369898557663, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 2277.4375, |
| "completions/mean_terminated_length": 1734.0, |
| "completions/min_length": 899.0, |
| "completions/min_terminated_length": 1734.0, |
| "entropy": 2.4675562232732773, |
| "epoch": 0.17343173431734318, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.011461591348052025, |
| "learning_rate": 0.0002, |
| "loss": 0.08860153704881668, |
| "num_tokens": 3081013.0, |
| "reward": 0.2814764380455017, |
| "reward_std": 0.293722927570343, |
| "rewards/humor_reward/mean": 0.2814764380455017, |
| "rewards/humor_reward/std": 0.293722927570343, |
| "sampling/importance_sampling_ratio/max": 1.583497166633606, |
| "sampling/importance_sampling_ratio/mean": 0.19163595139980316, |
| "sampling/importance_sampling_ratio/min": 1.2635299029284397e-08, |
| "sampling/sampling_logp_difference/max": 3.013592481613159, |
| "sampling/sampling_logp_difference/mean": 0.05257761478424072, |
| "step": 47, |
| "step_time": 80.03998614201555 |
| }, |
| { |
| "cispo_clip_ratio": 0.343157634139061, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 1813.0, |
| "completions/mean_length": 1572.6875, |
| "completions/mean_terminated_length": 1556.0, |
| "completions/min_length": 505.0, |
| "completions/min_terminated_length": 1299.0, |
| "entropy": 2.2395776584744453, |
| "epoch": 0.17712177121771217, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0015310291200876236, |
| "learning_rate": 0.0002, |
| "loss": 0.007011768873780966, |
| "num_tokens": 3110336.0, |
| "reward": 0.25455841422080994, |
| "reward_std": 0.21981197595596313, |
| "rewards/humor_reward/mean": 0.25455841422080994, |
| "rewards/humor_reward/std": 0.21981199085712433, |
| "sampling/importance_sampling_ratio/max": 1.104674220085144, |
| "sampling/importance_sampling_ratio/mean": 0.09266799688339233, |
| "sampling/importance_sampling_ratio/min": 6.321847649815027e-07, |
| "sampling/sampling_logp_difference/max": 2.861496925354004, |
| "sampling/sampling_logp_difference/mean": 0.048282306641340256, |
| "step": 48, |
| "step_time": 74.09863769897493 |
| }, |
| { |
| "cispo_clip_ratio": 0.2400471270084381, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 6144.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 1609.875, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 105.0, |
| "completions/min_terminated_length": 0.0, |
| "entropy": 2.189691785722971, |
| "epoch": 0.18081180811808117, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.005181934684514999, |
| "learning_rate": 0.0002, |
| "loss": 0.004273830913007259, |
| "num_tokens": 3140478.0, |
| "reward": 0.2623010277748108, |
| "reward_std": 0.2546696662902832, |
| "rewards/humor_reward/mean": 0.2623010277748108, |
| "rewards/humor_reward/std": 0.2546696662902832, |
| "sampling/importance_sampling_ratio/max": 1.6369633674621582, |
| "sampling/importance_sampling_ratio/mean": 0.1263563483953476, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.9059357643127441, |
| "sampling/sampling_logp_difference/mean": 0.043733954429626465, |
| "step": 49, |
| "step_time": 75.97332381599699 |
| }, |
| { |
| "cispo_clip_ratio": 0.43684835731983185, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2413.0, |
| "completions/max_terminated_length": 791.0, |
| "completions/mean_length": 951.875, |
| "completions/mean_terminated_length": 791.0, |
| "completions/min_length": 434.0, |
| "completions/min_terminated_length": 791.0, |
| "entropy": 2.404914140701294, |
| "epoch": 0.18450184501845018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00264533469453454, |
| "learning_rate": 0.0002, |
| "loss": 0.007856368087232113, |
| "num_tokens": 3159868.0, |
| "reward": 0.24542748928070068, |
| "reward_std": 0.2106892615556717, |
| "rewards/humor_reward/mean": 0.24542748928070068, |
| "rewards/humor_reward/std": 0.21068927645683289, |
| "sampling/importance_sampling_ratio/max": 0.8078871965408325, |
| "sampling/importance_sampling_ratio/mean": 0.06913819909095764, |
| "sampling/importance_sampling_ratio/min": 8.870298984220426e-08, |
| "sampling/sampling_logp_difference/max": 3.869755744934082, |
| "sampling/sampling_logp_difference/mean": 0.0640038326382637, |
| "step": 50, |
| "step_time": 23.586737716992502 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 200, |
| "num_input_tokens_seen": 3159868, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|