{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.18450184501845018, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "cispo_clip_ratio": 0.364774439483881, "completions/clipped_ratio": 1.0, "completions/max_length": 5409.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2607.8125, "completions/mean_terminated_length": 0.0, "completions/min_length": 674.0, "completions/min_terminated_length": 0.0, "entropy": 1.4863766431808472, "epoch": 0.0036900369003690036, "frac_reward_zero_std": 0.0, "grad_norm": 1.6501518985023722e-05, "learning_rate": 0.0002, "loss": -5.250487447483465e-05, "num_tokens": 45661.0, "reward": 0.2993389666080475, "reward_std": 0.17980147898197174, "rewards/humor_reward/mean": 0.2993389666080475, "rewards/humor_reward/std": 0.17980147898197174, "sampling/importance_sampling_ratio/max": 0.0032514820341020823, "sampling/importance_sampling_ratio/mean": 0.00025803607422858477, "sampling/importance_sampling_ratio/min": 6.591494638996664e-08, "sampling/sampling_logp_difference/max": 3.1304593086242676, "sampling/sampling_logp_difference/mean": 0.04722540080547333, "step": 1, "step_time": 79.16197230700345 }, { "cispo_clip_ratio": 0.4739677682518959, "completions/clipped_ratio": 0.9375, "completions/max_length": 4785.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 2616.0625, "completions/mean_terminated_length": 439.0, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "entropy": 1.2316511571407318, "epoch": 0.007380073800738007, "frac_reward_zero_std": 0.0, "grad_norm": 0.003729997668415308, "learning_rate": 0.0002, "loss": -0.003799113677814603, "num_tokens": 91230.0, "reward": 0.5310899019241333, "reward_std": 0.27845898270606995, "rewards/humor_reward/mean": 0.5310899019241333, "rewards/humor_reward/std": 0.27845898270606995, "sampling/importance_sampling_ratio/max": 1.048789381980896, "sampling/importance_sampling_ratio/mean": 0.12509621679782867, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.2385149002075195, "sampling/sampling_logp_difference/mean": 0.04188045486807823, "step": 2, "step_time": 66.1825476239901 }, { "cispo_clip_ratio": 0.5831195935606956, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 3197.3125, "completions/mean_terminated_length": 2244.0, "completions/min_length": 1313.0, "completions/min_terminated_length": 2244.0, "entropy": 1.3326809257268906, "epoch": 0.01107011070110701, "frac_reward_zero_std": 0.0, "grad_norm": 5.462636181619018e-05, "learning_rate": 0.0002, "loss": 0.00016624285490252078, "num_tokens": 146995.0, "reward": 0.4853614568710327, "reward_std": 0.2525769770145416, "rewards/humor_reward/mean": 0.4853614568710327, "rewards/humor_reward/std": 0.252577006816864, "sampling/importance_sampling_ratio/max": 0.022306665778160095, "sampling/importance_sampling_ratio/mean": 0.001404650043696165, "sampling/importance_sampling_ratio/min": 1.7064398505350908e-10, "sampling/sampling_logp_difference/max": 3.054729700088501, "sampling/sampling_logp_difference/mean": 0.0515315905213356, "step": 3, "step_time": 86.2583410259831 }, { "cispo_clip_ratio": 0.5900484155863523, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3705.3125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1009.0, "completions/min_terminated_length": 0.0, "entropy": 1.533411294221878, "epoch": 0.014760147601476014, "frac_reward_zero_std": 0.0, "grad_norm": 0.0004810348036698997, "learning_rate": 0.0002, "loss": -0.0014380415668711066, "num_tokens": 210440.0, "reward": 0.48688405752182007, "reward_std": 0.2839711010456085, "rewards/humor_reward/mean": 0.48688405752182007, "rewards/humor_reward/std": 0.2839711010456085, "sampling/importance_sampling_ratio/max": 0.21202650666236877, "sampling/importance_sampling_ratio/mean": 0.013267980888485909, "sampling/importance_sampling_ratio/min": 1.5972070888103929e-10, "sampling/sampling_logp_difference/max": 3.0988059043884277, "sampling/sampling_logp_difference/mean": 0.04585312306880951, "step": 4, "step_time": 83.59175038301328 }, { "cispo_clip_ratio": 0.6057924032211304, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4057.8125, "completions/mean_terminated_length": 0.0, "completions/min_length": 482.0, "completions/min_terminated_length": 0.0, "entropy": 1.3052150011062622, "epoch": 0.01845018450184502, "frac_reward_zero_std": 0.0, "grad_norm": 4.325129793869564e-06, "learning_rate": 0.0002, "loss": -5.259538738755509e-05, "num_tokens": 279525.0, "reward": 0.409991055727005, "reward_std": 0.27515894174575806, "rewards/humor_reward/mean": 0.409991055727005, "rewards/humor_reward/std": 0.27515894174575806, "sampling/importance_sampling_ratio/max": 0.0020468428265303373, "sampling/importance_sampling_ratio/mean": 0.0002199342561652884, "sampling/importance_sampling_ratio/min": 4.834557643107473e-09, "sampling/sampling_logp_difference/max": 1.898941993713379, "sampling/sampling_logp_difference/mean": 0.04034310206770897, "step": 5, "step_time": 63.17914728100004 }, { "cispo_clip_ratio": 0.51509914919734, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4054.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 106.0, "completions/min_terminated_length": 0.0, "entropy": 1.3227798640727997, "epoch": 0.02214022140221402, "frac_reward_zero_std": 0.0, "grad_norm": 0.0003930912062060088, "learning_rate": 0.0002, "loss": -8.775031892582774e-05, "num_tokens": 348111.0, "reward": 0.47100865840911865, "reward_std": 0.2912721335887909, "rewards/humor_reward/mean": 0.47100865840911865, "rewards/humor_reward/std": 0.2912721633911133, "sampling/importance_sampling_ratio/max": 0.26060751080513, "sampling/importance_sampling_ratio/mean": 0.0164032019674778, "sampling/importance_sampling_ratio/min": 9.639339326739105e-10, "sampling/sampling_logp_difference/max": 2.0555481910705566, "sampling/sampling_logp_difference/mean": 0.039414241909980774, "step": 6, "step_time": 82.86594534001779 }, { "cispo_clip_ratio": 0.6096626706421375, "completions/clipped_ratio": 0.875, "completions/max_length": 6144.0, "completions/max_terminated_length": 3121.0, "completions/mean_length": 3594.3125, "completions/mean_terminated_length": 1685.5, "completions/min_length": 184.0, "completions/min_terminated_length": 250.0, "entropy": 1.4805490970611572, "epoch": 0.025830258302583026, "frac_reward_zero_std": 0.0, "grad_norm": 0.0001002174976747483, "learning_rate": 0.0002, "loss": -0.00036368955625221133, "num_tokens": 409556.0, "reward": 0.25648510456085205, "reward_std": 0.21258896589279175, "rewards/humor_reward/mean": 0.25648510456085205, "rewards/humor_reward/std": 0.21258896589279175, "sampling/importance_sampling_ratio/max": 0.042766377329826355, "sampling/importance_sampling_ratio/mean": 0.003916537389159203, "sampling/importance_sampling_ratio/min": 3.151466310136186e-10, "sampling/sampling_logp_difference/max": 2.511254072189331, "sampling/sampling_logp_difference/mean": 0.04306597262620926, "step": 7, "step_time": 77.88916695199441 }, { "cispo_clip_ratio": 0.4443662669509649, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4063.8125, "completions/mean_terminated_length": 0.0, "completions/min_length": 426.0, "completions/min_terminated_length": 0.0, "entropy": 1.275453269481659, "epoch": 0.02952029520295203, "frac_reward_zero_std": 0.0, "grad_norm": 1.6524854800081812e-05, "learning_rate": 0.0002, "loss": 6.745757855242118e-05, "num_tokens": 478513.0, "reward": 0.3238402009010315, "reward_std": 0.2930999994277954, "rewards/humor_reward/mean": 0.3238402009010315, "rewards/humor_reward/std": 0.2931000292301178, "sampling/importance_sampling_ratio/max": 0.010375253856182098, "sampling/importance_sampling_ratio/mean": 0.0010308868950232863, "sampling/importance_sampling_ratio/min": 5.8848907968922504e-08, "sampling/sampling_logp_difference/max": 3.019871711730957, "sampling/sampling_logp_difference/mean": 0.038398630917072296, "step": 8, "step_time": 62.586684064008296 }, { "cispo_clip_ratio": 0.4614889621734619, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3351.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 925.0, "completions/min_terminated_length": 0.0, "entropy": 1.392012134194374, "epoch": 0.033210332103321034, "frac_reward_zero_std": 0.0, "grad_norm": 5.4697693485650234e-08, "learning_rate": 0.0002, "loss": 3.4436376949997793e-07, "num_tokens": 536969.0, "reward": 0.320010781288147, "reward_std": 0.26967623829841614, "rewards/humor_reward/mean": 0.320010781288147, "rewards/humor_reward/std": 0.26967623829841614, "sampling/importance_sampling_ratio/max": 2.8325872335699387e-05, "sampling/importance_sampling_ratio/mean": 4.4159696699352935e-06, "sampling/importance_sampling_ratio/min": 4.985558949184565e-13, "sampling/sampling_logp_difference/max": 2.7537732124328613, "sampling/sampling_logp_difference/mean": 0.05181171000003815, "step": 9, "step_time": 85.44417907499883 }, { "cispo_clip_ratio": 0.46727752313017845, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4545.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1297.0, "completions/min_terminated_length": 0.0, "entropy": 1.277551643550396, "epoch": 0.03690036900369004, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018553344998508692, "learning_rate": 0.0002, "loss": -0.00018077026470564306, "num_tokens": 613629.0, "reward": 0.3976645767688751, "reward_std": 0.3508976995944977, "rewards/humor_reward/mean": 0.3976645767688751, "rewards/humor_reward/std": 0.3508976995944977, "sampling/importance_sampling_ratio/max": 0.32763394713401794, "sampling/importance_sampling_ratio/mean": 0.02203518897294998, "sampling/importance_sampling_ratio/min": 2.647046748460724e-10, "sampling/sampling_logp_difference/max": 3.727196216583252, "sampling/sampling_logp_difference/mean": 0.037366170436143875, "step": 10, "step_time": 83.16811528199469 }, { "cispo_clip_ratio": 0.564013222232461, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4322.8125, "completions/mean_terminated_length": 0.0, "completions/min_length": 705.0, "completions/min_terminated_length": 0.0, "entropy": 1.0119460746645927, "epoch": 0.04059040590405904, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012440788559615612, "learning_rate": 0.0002, "loss": -0.0035373736172914505, "num_tokens": 687402.0, "reward": 0.378947377204895, "reward_std": 0.2340850830078125, "rewards/humor_reward/mean": 0.378947377204895, "rewards/humor_reward/std": 0.2340850979089737, "sampling/importance_sampling_ratio/max": 0.30296310782432556, "sampling/importance_sampling_ratio/mean": 0.018969321623444557, "sampling/importance_sampling_ratio/min": 8.10443156811988e-15, "sampling/sampling_logp_difference/max": 3.91379976272583, "sampling/sampling_logp_difference/mean": 0.04132331907749176, "step": 11, "step_time": 94.6180467310187 }, { "cispo_clip_ratio": 0.3971639759838581, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 5086.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1271.0, "completions/min_terminated_length": 0.0, "entropy": 1.0960774347186089, "epoch": 0.04428044280442804, "frac_reward_zero_std": 0.0, "grad_norm": 5.153976962901652e-05, "learning_rate": 0.0002, "loss": -0.00044693646486848593, "num_tokens": 772714.0, "reward": 0.32133913040161133, "reward_std": 0.380583792924881, "rewards/humor_reward/mean": 0.32133913040161133, "rewards/humor_reward/std": 0.38058382272720337, "sampling/importance_sampling_ratio/max": 0.017419712617993355, "sampling/importance_sampling_ratio/mean": 0.001832809066399932, "sampling/importance_sampling_ratio/min": 6.273771866599498e-12, "sampling/sampling_logp_difference/max": 5.7523579597473145, "sampling/sampling_logp_difference/mean": 0.034892238676548004, "step": 12, "step_time": 79.48709479898389 }, { "cispo_clip_ratio": 0.5508307814598083, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4785.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1482.0, "completions/min_terminated_length": 0.0, "entropy": 1.299708716571331, "epoch": 0.04797047970479705, "frac_reward_zero_std": 0.0, "grad_norm": 6.673526968370425e-06, "learning_rate": 0.0002, "loss": -2.0674373445217498e-05, "num_tokens": 853224.0, "reward": 0.36821186542510986, "reward_std": 0.31694290041923523, "rewards/humor_reward/mean": 0.36821186542510986, "rewards/humor_reward/std": 0.3169429302215576, "sampling/importance_sampling_ratio/max": 0.0009282280807383358, "sampling/importance_sampling_ratio/mean": 0.0001542143290862441, "sampling/importance_sampling_ratio/min": 4.797664487909969e-10, "sampling/sampling_logp_difference/max": 2.7313647270202637, "sampling/sampling_logp_difference/mean": 0.03830663487315178, "step": 13, "step_time": 82.41276069695596 }, { "cispo_clip_ratio": 0.3803188279271126, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 4955.6875, "completions/mean_terminated_length": 1625.0, "completions/min_length": 968.0, "completions/min_terminated_length": 1625.0, "entropy": 1.364896483719349, "epoch": 0.05166051660516605, "frac_reward_zero_std": 0.0, "grad_norm": 1.3385028978518676e-05, "learning_rate": 0.0002, "loss": 2.8572056180564687e-05, "num_tokens": 937123.0, "reward": 0.2386319488286972, "reward_std": 0.28499212861061096, "rewards/humor_reward/mean": 0.2386319488286972, "rewards/humor_reward/std": 0.28499215841293335, "sampling/importance_sampling_ratio/max": 0.009222447872161865, "sampling/importance_sampling_ratio/mean": 0.0005764853558503091, "sampling/importance_sampling_ratio/min": 1.6275064690298314e-15, "sampling/sampling_logp_difference/max": 4.84999942779541, "sampling/sampling_logp_difference/mean": 0.04662536829710007, "step": 14, "step_time": 85.49535570498847 }, { "cispo_clip_ratio": 0.3275146186351776, "completions/clipped_ratio": 0.875, "completions/max_length": 6144.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 4530.75, "completions/mean_terminated_length": 2166.5, "completions/min_length": 505.0, "completions/min_terminated_length": 1743.0, "entropy": 0.9317370727658272, "epoch": 0.055350553505535055, "frac_reward_zero_std": 0.0, "grad_norm": 4.116483614780009e-05, "learning_rate": 0.0002, "loss": 0.0002636128047015518, "num_tokens": 1013327.0, "reward": 0.2526451349258423, "reward_std": 0.36080095171928406, "rewards/humor_reward/mean": 0.2526451349258423, "rewards/humor_reward/std": 0.36080095171928406, "sampling/importance_sampling_ratio/max": 0.011370004154741764, "sampling/importance_sampling_ratio/mean": 0.001944657531566918, "sampling/importance_sampling_ratio/min": 1.1834137185820492e-12, "sampling/sampling_logp_difference/max": 3.4003725051879883, "sampling/sampling_logp_difference/mean": 0.03138995170593262, "step": 15, "step_time": 73.45879589200194 }, { "cispo_clip_ratio": 0.1871738387271762, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 5333.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 854.0, "completions/min_terminated_length": 0.0, "entropy": 1.1076993495225906, "epoch": 0.05904059040590406, "frac_reward_zero_std": 0.0, "grad_norm": 0.0002949201443698257, "learning_rate": 0.0002, "loss": 0.0002496525994502008, "num_tokens": 1102593.0, "reward": 0.12923595309257507, "reward_std": 0.24049529433250427, "rewards/humor_reward/mean": 0.12923595309257507, "rewards/humor_reward/std": 0.24049529433250427, "sampling/importance_sampling_ratio/max": 0.13862203061580658, "sampling/importance_sampling_ratio/mean": 0.013717259280383587, "sampling/importance_sampling_ratio/min": 2.0222361563071445e-09, "sampling/sampling_logp_difference/max": 1.9613001346588135, "sampling/sampling_logp_difference/mean": 0.02957136556506157, "step": 16, "step_time": 76.10329114498745 }, { "cispo_clip_ratio": 0.2707713171839714, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 5502.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2296.0, "completions/min_terminated_length": 0.0, "entropy": 1.3415134847164154, "epoch": 0.06273062730627306, "frac_reward_zero_std": 0.0, "grad_norm": 1.443507812837197e-06, "learning_rate": 0.0002, "loss": 1.1968148101004772e-05, "num_tokens": 1194337.0, "reward": 0.14995136857032776, "reward_std": 0.2515668570995331, "rewards/humor_reward/mean": 0.14995136857032776, "rewards/humor_reward/std": 0.25156688690185547, "sampling/importance_sampling_ratio/max": 0.00044680986320599914, "sampling/importance_sampling_ratio/mean": 3.927269790438004e-05, "sampling/importance_sampling_ratio/min": 9.991393268293791e-10, "sampling/sampling_logp_difference/max": 3.0433225631713867, "sampling/sampling_logp_difference/mean": 0.033172741532325745, "step": 17, "step_time": 76.20640446299512 }, { "cispo_clip_ratio": 0.4475329667329788, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4663.4375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1583.0, "completions/min_terminated_length": 0.0, "entropy": 1.0032905638217926, "epoch": 0.06642066420664207, "frac_reward_zero_std": 0.0, "grad_norm": 4.456151003751074e-08, "learning_rate": 0.0002, "loss": 1.6434560734523984e-07, "num_tokens": 1273560.0, "reward": 0.47607260942459106, "reward_std": 0.3206636905670166, "rewards/humor_reward/mean": 0.47607260942459106, "rewards/humor_reward/std": 0.3206636905670166, "sampling/importance_sampling_ratio/max": 1.8386488591204397e-05, "sampling/importance_sampling_ratio/mean": 2.1001696950406767e-06, "sampling/importance_sampling_ratio/min": 2.987033566998441e-14, "sampling/sampling_logp_difference/max": 2.9727420806884766, "sampling/sampling_logp_difference/mean": 0.04333854466676712, "step": 18, "step_time": 91.44482385799347 }, { "cispo_clip_ratio": 0.5934446323662996, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4332.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 692.0, "completions/min_terminated_length": 0.0, "entropy": 1.2455691695213318, "epoch": 0.07011070110701106, "frac_reward_zero_std": 0.0, "grad_norm": 2.6948493541567586e-05, "learning_rate": 0.0002, "loss": 0.00015589207760058343, "num_tokens": 1346808.0, "reward": 0.4407455325126648, "reward_std": 0.32844167947769165, "rewards/humor_reward/mean": 0.4407455325126648, "rewards/humor_reward/std": 0.32844167947769165, "sampling/importance_sampling_ratio/max": 0.0622418075799942, "sampling/importance_sampling_ratio/mean": 0.00455310195684433, "sampling/importance_sampling_ratio/min": 6.87965170942384e-11, "sampling/sampling_logp_difference/max": 1.9999836683273315, "sampling/sampling_logp_difference/mean": 0.040162429213523865, "step": 19, "step_time": 63.66296142600186 }, { "cispo_clip_ratio": 0.3904235363006592, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 4865.125, "completions/mean_terminated_length": 3539.0, "completions/min_length": 1705.0, "completions/min_terminated_length": 3539.0, "entropy": 1.1548645570874214, "epoch": 0.07380073800738007, "frac_reward_zero_std": 0.0, "grad_norm": 0.0003063328331336379, "learning_rate": 0.0002, "loss": 0.0016583555843681097, "num_tokens": 1428810.0, "reward": 0.32657718658447266, "reward_std": 0.3702032268047333, "rewards/humor_reward/mean": 0.32657718658447266, "rewards/humor_reward/std": 0.37020325660705566, "sampling/importance_sampling_ratio/max": 0.07926318794488907, "sampling/importance_sampling_ratio/mean": 0.00835469737648964, "sampling/importance_sampling_ratio/min": 1.397608049256982e-12, "sampling/sampling_logp_difference/max": 3.58780574798584, "sampling/sampling_logp_difference/mean": 0.036592792719602585, "step": 20, "step_time": 80.32529347503441 }, { "cispo_clip_ratio": 0.46490118466317654, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4817.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1635.0, "completions/min_terminated_length": 0.0, "entropy": 1.468906119465828, "epoch": 0.07749077490774908, "frac_reward_zero_std": 0.0, "grad_norm": 9.814787915729539e-08, "learning_rate": 0.0002, "loss": -6.973982635827269e-07, "num_tokens": 1510048.0, "reward": 0.4167077839374542, "reward_std": 0.3755662441253662, "rewards/humor_reward/mean": 0.4167077839374542, "rewards/humor_reward/std": 0.3755662441253662, "sampling/importance_sampling_ratio/max": 2.670207504706923e-05, "sampling/importance_sampling_ratio/mean": 3.46238084603101e-06, "sampling/importance_sampling_ratio/min": 6.556125781154165e-11, "sampling/sampling_logp_difference/max": 2.0220751762390137, "sampling/sampling_logp_difference/mean": 0.04112354293465614, "step": 21, "step_time": 66.39175808998698 }, { "cispo_clip_ratio": 0.4479520544409752, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4476.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 1.3538827523589134, "epoch": 0.08118081180811808, "frac_reward_zero_std": 0.0, "grad_norm": 9.053791472979356e-06, "learning_rate": 0.0002, "loss": -4.3639585783239454e-05, "num_tokens": 1585386.0, "reward": 0.398086816072464, "reward_std": 0.39999622106552124, "rewards/humor_reward/mean": 0.398086816072464, "rewards/humor_reward/std": 0.39999625086784363, "sampling/importance_sampling_ratio/max": 0.011358072981238365, "sampling/importance_sampling_ratio/mean": 0.0010091899894177914, "sampling/importance_sampling_ratio/min": 4.4679931976432385e-10, "sampling/sampling_logp_difference/max": 3.5639562606811523, "sampling/sampling_logp_difference/mean": 0.035584691911935806, "step": 22, "step_time": 77.29983079498925 }, { "cispo_clip_ratio": 0.5936666019260883, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4713.3125, "completions/mean_terminated_length": 0.0, "completions/min_length": 701.0, "completions/min_terminated_length": 0.0, "entropy": 1.1694510877132416, "epoch": 0.08487084870848709, "frac_reward_zero_std": 0.0, "grad_norm": 0.00025732198264449835, "learning_rate": 0.0002, "loss": 0.00032361538615077734, "num_tokens": 1664511.0, "reward": 0.3887137174606323, "reward_std": 0.35025978088378906, "rewards/humor_reward/mean": 0.3887137174606323, "rewards/humor_reward/std": 0.35025978088378906, "sampling/importance_sampling_ratio/max": 0.2608579397201538, "sampling/importance_sampling_ratio/mean": 0.016369398683309555, "sampling/importance_sampling_ratio/min": 5.653824977636113e-11, "sampling/sampling_logp_difference/max": 1.6544189453125, "sampling/sampling_logp_difference/mean": 0.03655308857560158, "step": 23, "step_time": 79.8481750869978 }, { "cispo_clip_ratio": 0.46904783695936203, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4739.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 2132.0, "completions/min_terminated_length": 0.0, "entropy": 1.2212552726268768, "epoch": 0.08856088560885608, "frac_reward_zero_std": 0.0, "grad_norm": 2.1028017727076076e-06, "learning_rate": 0.0002, "loss": 5.860923010914121e-06, "num_tokens": 1744499.0, "reward": 0.33275866508483887, "reward_std": 0.3678642511367798, "rewards/humor_reward/mean": 0.33275866508483887, "rewards/humor_reward/std": 0.3678642809391022, "sampling/importance_sampling_ratio/max": 0.0013681778218597174, "sampling/importance_sampling_ratio/mean": 9.336201765108854e-05, "sampling/importance_sampling_ratio/min": 2.4985560842516463e-10, "sampling/sampling_logp_difference/max": 1.9406442642211914, "sampling/sampling_logp_difference/mean": 0.03511609137058258, "step": 24, "step_time": 79.88214839099965 }, { "cispo_clip_ratio": 0.4411753863096237, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4651.4375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1004.0, "completions/min_terminated_length": 0.0, "entropy": 0.8522009998559952, "epoch": 0.09225092250922509, "frac_reward_zero_std": 0.0, "grad_norm": 0.00017764404765330255, "learning_rate": 0.0002, "loss": 0.0008231036481447518, "num_tokens": 1823082.0, "reward": 0.274623304605484, "reward_std": 0.3160068690776825, "rewards/humor_reward/mean": 0.274623304605484, "rewards/humor_reward/std": 0.3160068988800049, "sampling/importance_sampling_ratio/max": 0.03566610440611839, "sampling/importance_sampling_ratio/mean": 0.0037315732333809137, "sampling/importance_sampling_ratio/min": 4.288159161092153e-09, "sampling/sampling_logp_difference/max": 2.191567897796631, "sampling/sampling_logp_difference/mean": 0.027100346982479095, "step": 25, "step_time": 65.47632334999798 }, { "cispo_clip_ratio": 0.3125321865081787, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 5114.5625, "completions/mean_terminated_length": 0.0, "completions/min_length": 354.0, "completions/min_terminated_length": 0.0, "entropy": 0.8008184731006622, "epoch": 0.0959409594095941, "frac_reward_zero_std": 0.0, "grad_norm": 0.00044094581971876323, "learning_rate": 0.0002, "loss": -0.0023368855472654104, "num_tokens": 1909075.0, "reward": 0.2744894027709961, "reward_std": 0.3853631615638733, "rewards/humor_reward/mean": 0.2744894027709961, "rewards/humor_reward/std": 0.3853631913661957, "sampling/importance_sampling_ratio/max": 0.27399709820747375, "sampling/importance_sampling_ratio/mean": 0.02907378226518631, "sampling/importance_sampling_ratio/min": 1.0831848751280404e-09, "sampling/sampling_logp_difference/max": 2.079195022583008, "sampling/sampling_logp_difference/mean": 0.027823781594634056, "step": 26, "step_time": 79.29085839301115 }, { "cispo_clip_ratio": 0.45367857813835144, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 4420.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1406.0, "completions/min_terminated_length": 0.0, "entropy": 1.1720404550433159, "epoch": 0.0996309963099631, "frac_reward_zero_std": 0.0, "grad_norm": 2.0236677300999872e-05, "learning_rate": 0.0002, "loss": -0.0002367804991081357, "num_tokens": 1983073.0, "reward": 0.37771666049957275, "reward_std": 0.3381035327911377, "rewards/humor_reward/mean": 0.37771666049957275, "rewards/humor_reward/std": 0.3381035625934601, "sampling/importance_sampling_ratio/max": 0.01442171260714531, "sampling/importance_sampling_ratio/mean": 0.0017098486423492432, "sampling/importance_sampling_ratio/min": 2.375326602077621e-08, "sampling/sampling_logp_difference/max": 2.334380626678467, "sampling/sampling_logp_difference/mean": 0.03178582713007927, "step": 27, "step_time": 75.20771357801277 }, { "cispo_clip_ratio": 0.6514521557837725, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 5111.0, "completions/mean_length": 3562.625, "completions/mean_terminated_length": 5111.0, "completions/min_length": 171.0, "completions/min_terminated_length": 5111.0, "entropy": 1.2078422084450722, "epoch": 0.1033210332103321, "frac_reward_zero_std": 0.0, "grad_norm": 0.000864825677126646, "learning_rate": 0.0002, "loss": 0.0034890188835561275, "num_tokens": 2044011.0, "reward": 0.5999776124954224, "reward_std": 0.378849059343338, "rewards/humor_reward/mean": 0.5999776124954224, "rewards/humor_reward/std": 0.3788490891456604, "sampling/importance_sampling_ratio/max": 0.4461648166179657, "sampling/importance_sampling_ratio/mean": 0.056391313672065735, "sampling/importance_sampling_ratio/min": 1.9029215536647826e-08, "sampling/sampling_logp_difference/max": 2.0776968002319336, "sampling/sampling_logp_difference/mean": 0.04053337126970291, "step": 28, "step_time": 59.718489810984465 }, { "cispo_clip_ratio": 0.5842036623507738, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3761.1875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1373.0, "completions/min_terminated_length": 0.0, "entropy": 0.9947392120957375, "epoch": 0.1070110701107011, "frac_reward_zero_std": 0.0, "grad_norm": 5.404234002526209e-07, "learning_rate": 0.0002, "loss": 3.1678846426075324e-06, "num_tokens": 2108798.0, "reward": 0.5179843902587891, "reward_std": 0.2485807240009308, "rewards/humor_reward/mean": 0.5179843902587891, "rewards/humor_reward/std": 0.2485807240009308, "sampling/importance_sampling_ratio/max": 0.00023896525090094656, "sampling/importance_sampling_ratio/mean": 1.7564820154802874e-05, "sampling/importance_sampling_ratio/min": 8.362109155070005e-15, "sampling/sampling_logp_difference/max": 3.4245944023132324, "sampling/sampling_logp_difference/mean": 0.04190240800380707, "step": 29, "step_time": 92.04002382898761 }, { "cispo_clip_ratio": 0.481358059681952, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 3230.25, "completions/mean_terminated_length": 3548.0, "completions/min_length": 142.0, "completions/min_terminated_length": 3548.0, "entropy": 0.9884351938962936, "epoch": 0.11070110701107011, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007160792592912912, "learning_rate": 0.0002, "loss": 0.0030249895062297583, "num_tokens": 2165090.0, "reward": 0.32606449723243713, "reward_std": 0.2554178833961487, "rewards/humor_reward/mean": 0.32606449723243713, "rewards/humor_reward/std": 0.2554178833961487, "sampling/importance_sampling_ratio/max": 0.19502675533294678, "sampling/importance_sampling_ratio/mean": 0.01895805075764656, "sampling/importance_sampling_ratio/min": 1.5065569625161146e-10, "sampling/sampling_logp_difference/max": 2.431838035583496, "sampling/sampling_logp_difference/mean": 0.04055629298090935, "step": 30, "step_time": 83.23436555001535 }, { "cispo_clip_ratio": 0.6561583578586578, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3374.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 247.0, "completions/min_terminated_length": 0.0, "entropy": 1.0753171369433403, "epoch": 0.11439114391143912, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035273912362754345, "learning_rate": 0.0002, "loss": 0.01652080938220024, "num_tokens": 2223020.0, "reward": 0.5712062120437622, "reward_std": 0.3373561203479767, "rewards/humor_reward/mean": 0.5712062120437622, "rewards/humor_reward/std": 0.3373561203479767, "sampling/importance_sampling_ratio/max": 1.9402817487716675, "sampling/importance_sampling_ratio/mean": 0.13337960839271545, "sampling/importance_sampling_ratio/min": 1.6416909431882232e-07, "sampling/sampling_logp_difference/max": 6.734940528869629, "sampling/sampling_logp_difference/mean": 0.03378872573375702, "step": 31, "step_time": 59.33454087101563 }, { "cispo_clip_ratio": 0.6109768375754356, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3135.3125, "completions/mean_terminated_length": 0.0, "completions/min_length": 350.0, "completions/min_terminated_length": 0.0, "entropy": 1.203492984175682, "epoch": 0.11808118081180811, "frac_reward_zero_std": 0.0, "grad_norm": 8.204303594538942e-05, "learning_rate": 0.0002, "loss": 0.0006842454895377159, "num_tokens": 2277121.0, "reward": 0.6346642971038818, "reward_std": 0.2692297101020813, "rewards/humor_reward/mean": 0.6346642971038818, "rewards/humor_reward/std": 0.2692297399044037, "sampling/importance_sampling_ratio/max": 1.2163432836532593, "sampling/importance_sampling_ratio/mean": 0.07933147996664047, "sampling/importance_sampling_ratio/min": 3.8786645006894105e-09, "sampling/sampling_logp_difference/max": 1.4427013397216797, "sampling/sampling_logp_difference/mean": 0.03805601969361305, "step": 32, "step_time": 56.60236855900439 }, { "cispo_clip_ratio": 0.6497912313789129, "completions/clipped_ratio": 0.875, "completions/max_length": 6144.0, "completions/max_terminated_length": 4153.0, "completions/mean_length": 3000.5625, "completions/mean_terminated_length": 2144.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 1.1894647032022476, "epoch": 0.12177121771217712, "frac_reward_zero_std": 0.0, "grad_norm": 0.00014119563275016844, "learning_rate": 0.0002, "loss": 0.0002433458430459723, "num_tokens": 2329962.0, "reward": 0.5449860095977783, "reward_std": 0.3282092809677124, "rewards/humor_reward/mean": 0.5449860095977783, "rewards/humor_reward/std": 0.3282092809677124, "sampling/importance_sampling_ratio/max": 0.08825313299894333, "sampling/importance_sampling_ratio/mean": 0.006083119660615921, "sampling/importance_sampling_ratio/min": 1.4223322519768544e-09, "sampling/sampling_logp_difference/max": 2.7407069206237793, "sampling/sampling_logp_difference/mean": 0.04315001145005226, "step": 33, "step_time": 87.83407789799094 }, { "cispo_clip_ratio": 0.603145282715559, "completions/clipped_ratio": 0.9375, "completions/max_length": 5923.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 3187.3125, "completions/mean_terminated_length": 1476.0, "completions/min_length": 1006.0, "completions/min_terminated_length": 1476.0, "entropy": 1.1579975709319115, "epoch": 0.12546125461254612, "frac_reward_zero_std": 0.0, "grad_norm": 5.3547359129879624e-05, "learning_rate": 0.0002, "loss": 0.0002505858719814569, "num_tokens": 2385343.0, "reward": 0.5027609467506409, "reward_std": 0.23998978734016418, "rewards/humor_reward/mean": 0.5027609467506409, "rewards/humor_reward/std": 0.23998978734016418, "sampling/importance_sampling_ratio/max": 0.03983650356531143, "sampling/importance_sampling_ratio/mean": 0.003191877156496048, "sampling/importance_sampling_ratio/min": 9.33344068698716e-09, "sampling/sampling_logp_difference/max": 2.9324636459350586, "sampling/sampling_logp_difference/mean": 0.04020746424794197, "step": 34, "step_time": 55.506949331000214 }, { "cispo_clip_ratio": 0.6451264955103397, "completions/clipped_ratio": 0.9375, "completions/max_length": 5022.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 2561.125, "completions/mean_terminated_length": 529.0, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "entropy": 1.1418700069189072, "epoch": 0.12915129151291513, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015835068188607693, "learning_rate": 0.0002, "loss": -0.004869138356298208, "num_tokens": 2430257.0, "reward": 0.5386431217193604, "reward_std": 0.27275699377059937, "rewards/humor_reward/mean": 0.5386431217193604, "rewards/humor_reward/std": 0.27275702357292175, "sampling/importance_sampling_ratio/max": 0.4542810022830963, "sampling/importance_sampling_ratio/mean": 0.035323094576597214, "sampling/importance_sampling_ratio/min": 4.0660263350700276e-11, "sampling/sampling_logp_difference/max": 17.207799911499023, "sampling/sampling_logp_difference/mean": 0.038973793387413025, "step": 35, "step_time": 69.47620079400076 }, { "cispo_clip_ratio": 0.6264413706958294, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 4404.0, "completions/mean_length": 3681.5, "completions/mean_terminated_length": 4404.0, "completions/min_length": 524.0, "completions/min_terminated_length": 4404.0, "entropy": 1.4374851435422897, "epoch": 0.13284132841328414, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009762643603608012, "learning_rate": 0.0002, "loss": 0.006543246563524008, "num_tokens": 2493321.0, "reward": 0.5082834959030151, "reward_std": 0.34550100564956665, "rewards/humor_reward/mean": 0.5082834959030151, "rewards/humor_reward/std": 0.34550103545188904, "sampling/importance_sampling_ratio/max": 0.3936307430267334, "sampling/importance_sampling_ratio/mean": 0.03291841596364975, "sampling/importance_sampling_ratio/min": 5.0612318780451915e-09, "sampling/sampling_logp_difference/max": 2.5245909690856934, "sampling/sampling_logp_difference/mean": 0.03988190367817879, "step": 36, "step_time": 59.509835421005846 }, { "cispo_clip_ratio": 0.6464070416986942, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3666.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1304.0, "completions/min_terminated_length": 0.0, "entropy": 1.4860893040895462, "epoch": 0.13653136531365315, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014742233324795961, "learning_rate": 0.0002, "loss": 0.009775211103260517, "num_tokens": 2555917.0, "reward": 0.5413703918457031, "reward_std": 0.2888968586921692, "rewards/humor_reward/mean": 0.5413703918457031, "rewards/humor_reward/std": 0.2888968884944916, "sampling/importance_sampling_ratio/max": 0.5927172899246216, "sampling/importance_sampling_ratio/mean": 0.03740416467189789, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.5622243881225586, "sampling/sampling_logp_difference/mean": 0.03937282785773277, "step": 37, "step_time": 84.5253242639883 }, { "cispo_clip_ratio": 0.5759155452251434, "completions/clipped_ratio": 0.875, "completions/max_length": 5372.0, "completions/max_terminated_length": 3480.0, "completions/mean_length": 2918.75, "completions/mean_terminated_length": 2408.0, "completions/min_length": 964.0, "completions/min_terminated_length": 1336.0, "entropy": 1.4434982985258102, "epoch": 0.14022140221402213, "frac_reward_zero_std": 0.0, "grad_norm": 0.001353645813651383, "learning_rate": 0.0002, "loss": -0.005609482992440462, "num_tokens": 2606553.0, "reward": 0.27517157793045044, "reward_std": 0.17297060787677765, "rewards/humor_reward/mean": 0.27517157793045044, "rewards/humor_reward/std": 0.17297060787677765, "sampling/importance_sampling_ratio/max": 0.8227623105049133, "sampling/importance_sampling_ratio/mean": 0.08773483335971832, "sampling/importance_sampling_ratio/min": 3.3746889016583737e-07, "sampling/sampling_logp_difference/max": 1.4419441223144531, "sampling/sampling_logp_difference/mean": 0.04054056107997894, "step": 38, "step_time": 70.05384999897797 }, { "cispo_clip_ratio": 0.47585512325167656, "completions/clipped_ratio": 1.0, "completions/max_length": 3937.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2637.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 631.0, "completions/min_terminated_length": 0.0, "entropy": 1.9437123090028763, "epoch": 0.14391143911439114, "frac_reward_zero_std": 0.0, "grad_norm": 0.0001363922783639282, "learning_rate": 0.0002, "loss": 0.001440045889467001, "num_tokens": 2652695.0, "reward": 0.5775443911552429, "reward_std": 0.1908194124698639, "rewards/humor_reward/mean": 0.5775443911552429, "rewards/humor_reward/std": 0.1908194124698639, "sampling/importance_sampling_ratio/max": 0.04935026168823242, "sampling/importance_sampling_ratio/mean": 0.003360162954777479, "sampling/importance_sampling_ratio/min": 9.852194615689314e-09, "sampling/sampling_logp_difference/max": 2.8174242973327637, "sampling/sampling_logp_difference/mean": 0.04766388610005379, "step": 39, "step_time": 65.56160380599613 }, { "cispo_clip_ratio": 0.5537771657109261, "completions/clipped_ratio": 0.875, "completions/max_length": 6144.0, "completions/max_terminated_length": 3334.0, "completions/mean_length": 3318.3125, "completions/mean_terminated_length": 2593.5, "completions/min_length": 1005.0, "completions/min_terminated_length": 1853.0, "entropy": 1.6230905801057816, "epoch": 0.14760147601476015, "frac_reward_zero_std": 0.0, "grad_norm": 8.798116323305294e-05, "learning_rate": 0.0002, "loss": 0.0004228673642501235, "num_tokens": 2710396.0, "reward": 0.3043411374092102, "reward_std": 0.29077214002609253, "rewards/humor_reward/mean": 0.3043411374092102, "rewards/humor_reward/std": 0.29077211022377014, "sampling/importance_sampling_ratio/max": 0.019216708838939667, "sampling/importance_sampling_ratio/mean": 0.0028733701910823584, "sampling/importance_sampling_ratio/min": 4.04905886597362e-09, "sampling/sampling_logp_difference/max": 4.360086441040039, "sampling/sampling_logp_difference/mean": 0.04630262777209282, "step": 40, "step_time": 82.58221219903498 }, { "cispo_clip_ratio": 0.7186006270349026, "completions/clipped_ratio": 1.0, "completions/max_length": 5217.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3383.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 706.0, "completions/min_terminated_length": 0.0, "entropy": 1.9114204794168472, "epoch": 0.15129151291512916, "frac_reward_zero_std": 0.0, "grad_norm": 0.005707095842808485, "learning_rate": 0.0002, "loss": 0.07725385576486588, "num_tokens": 2768684.0, "reward": 0.6538974642753601, "reward_std": 0.22179268300533295, "rewards/humor_reward/mean": 0.6538974642753601, "rewards/humor_reward/std": 0.22179268300533295, "sampling/importance_sampling_ratio/max": 2.3075239658355713, "sampling/importance_sampling_ratio/mean": 0.24276912212371826, "sampling/importance_sampling_ratio/min": 2.1112403203460417e-07, "sampling/sampling_logp_difference/max": 1.9947752952575684, "sampling/sampling_logp_difference/mean": 0.04460817202925682, "step": 41, "step_time": 50.73555830302939 }, { "cispo_clip_ratio": 0.4581793490797281, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3443.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 823.0, "completions/min_terminated_length": 0.0, "entropy": 1.684003859758377, "epoch": 0.15498154981549817, "frac_reward_zero_std": 0.0, "grad_norm": 6.124524952610955e-05, "learning_rate": 0.0002, "loss": -0.0007328057545237243, "num_tokens": 2828168.0, "reward": 0.38168156147003174, "reward_std": 0.1816880702972412, "rewards/humor_reward/mean": 0.38168156147003174, "rewards/humor_reward/std": 0.1816880702972412, "sampling/importance_sampling_ratio/max": 0.7620577812194824, "sampling/importance_sampling_ratio/mean": 0.052507393062114716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3952629566192627, "sampling/sampling_logp_difference/mean": 0.044263970106840134, "step": 42, "step_time": 86.9609940169612 }, { "cispo_clip_ratio": 0.47743209451436996, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 3697.8125, "completions/mean_terminated_length": 957.0, "completions/min_length": 608.0, "completions/min_terminated_length": 957.0, "entropy": 1.7134400680661201, "epoch": 0.15867158671586715, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007043051300570369, "learning_rate": 0.0002, "loss": -0.0017245950875803828, "num_tokens": 2891493.0, "reward": 0.22081857919692993, "reward_std": 0.221486434340477, "rewards/humor_reward/mean": 0.22081857919692993, "rewards/humor_reward/std": 0.22148646414279938, "sampling/importance_sampling_ratio/max": 0.40315449237823486, "sampling/importance_sampling_ratio/mean": 0.02584739960730076, "sampling/importance_sampling_ratio/min": 2.196902926243638e-07, "sampling/sampling_logp_difference/max": 1.3882695436477661, "sampling/sampling_logp_difference/mean": 0.04035360738635063, "step": 43, "step_time": 79.38622622596449 }, { "cispo_clip_ratio": 0.4560951702296734, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3416.6875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1422.0, "completions/min_terminated_length": 0.0, "entropy": 2.0115034580230713, "epoch": 0.16236162361623616, "frac_reward_zero_std": 0.0, "grad_norm": 0.00011439115041866899, "learning_rate": 0.0002, "loss": 0.0011536129750311375, "num_tokens": 2950096.0, "reward": 0.44468164443969727, "reward_std": 0.2576386332511902, "rewards/humor_reward/mean": 0.44468164443969727, "rewards/humor_reward/std": 0.2576386630535126, "sampling/importance_sampling_ratio/max": 0.04263285920023918, "sampling/importance_sampling_ratio/mean": 0.004383362829685211, "sampling/importance_sampling_ratio/min": 1.0335932643101842e-07, "sampling/sampling_logp_difference/max": 2.972001552581787, "sampling/sampling_logp_difference/mean": 0.04580405354499817, "step": 44, "step_time": 83.37110035201476 }, { "cispo_clip_ratio": 0.5093751782551408, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2493.1875, "completions/mean_terminated_length": 0.0, "completions/min_length": 544.0, "completions/min_terminated_length": 0.0, "entropy": 2.2517920583486557, "epoch": 0.16605166051660517, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009305228013545275, "learning_rate": 0.0002, "loss": -0.008580348454415798, "num_tokens": 2994147.0, "reward": 0.36086541414260864, "reward_std": 0.23784013092517853, "rewards/humor_reward/mean": 0.36086541414260864, "rewards/humor_reward/std": 0.23784014582633972, "sampling/importance_sampling_ratio/max": 0.34510719776153564, "sampling/importance_sampling_ratio/mean": 0.02897491306066513, "sampling/importance_sampling_ratio/min": 1.3008460953388423e-13, "sampling/sampling_logp_difference/max": 4.940759658813477, "sampling/sampling_logp_difference/mean": 0.05280106142163277, "step": 45, "step_time": 81.29779967200011 }, { "cispo_clip_ratio": 0.4961427040398121, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 2631.6875, "completions/mean_terminated_length": 1664.0, "completions/min_length": 919.0, "completions/min_terminated_length": 1664.0, "entropy": 2.523237004876137, "epoch": 0.16974169741697417, "frac_reward_zero_std": 0.0, "grad_norm": 0.0004199541872367263, "learning_rate": 0.0002, "loss": 0.0025538511108607054, "num_tokens": 3040190.0, "reward": 0.3032156825065613, "reward_std": 0.2560163736343384, "rewards/humor_reward/mean": 0.3032156825065613, "rewards/humor_reward/std": 0.2560163736343384, "sampling/importance_sampling_ratio/max": 0.6374390125274658, "sampling/importance_sampling_ratio/mean": 0.047728560864925385, "sampling/importance_sampling_ratio/min": 3.620046413743694e-07, "sampling/sampling_logp_difference/max": 2.632134437561035, "sampling/sampling_logp_difference/mean": 0.047268252819776535, "step": 46, "step_time": 73.58007367796381 }, { "cispo_clip_ratio": 0.3577369898557663, "completions/clipped_ratio": 0.9375, "completions/max_length": 6144.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 2277.4375, "completions/mean_terminated_length": 1734.0, "completions/min_length": 899.0, "completions/min_terminated_length": 1734.0, "entropy": 2.4675562232732773, "epoch": 0.17343173431734318, "frac_reward_zero_std": 0.0, "grad_norm": 0.011461591348052025, "learning_rate": 0.0002, "loss": 0.08860153704881668, "num_tokens": 3081013.0, "reward": 0.2814764380455017, "reward_std": 0.293722927570343, "rewards/humor_reward/mean": 0.2814764380455017, "rewards/humor_reward/std": 0.293722927570343, "sampling/importance_sampling_ratio/max": 1.583497166633606, "sampling/importance_sampling_ratio/mean": 0.19163595139980316, "sampling/importance_sampling_ratio/min": 1.2635299029284397e-08, "sampling/sampling_logp_difference/max": 3.013592481613159, "sampling/sampling_logp_difference/mean": 0.05257761478424072, "step": 47, "step_time": 80.03998614201555 }, { "cispo_clip_ratio": 0.343157634139061, "completions/clipped_ratio": 0.875, "completions/max_length": 6144.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 1572.6875, "completions/mean_terminated_length": 1556.0, "completions/min_length": 505.0, "completions/min_terminated_length": 1299.0, "entropy": 2.2395776584744453, "epoch": 0.17712177121771217, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015310291200876236, "learning_rate": 0.0002, "loss": 0.007011768873780966, "num_tokens": 3110336.0, "reward": 0.25455841422080994, "reward_std": 0.21981197595596313, "rewards/humor_reward/mean": 0.25455841422080994, "rewards/humor_reward/std": 0.21981199085712433, "sampling/importance_sampling_ratio/max": 1.104674220085144, "sampling/importance_sampling_ratio/mean": 0.09266799688339233, "sampling/importance_sampling_ratio/min": 6.321847649815027e-07, "sampling/sampling_logp_difference/max": 2.861496925354004, "sampling/sampling_logp_difference/mean": 0.048282306641340256, "step": 48, "step_time": 74.09863769897493 }, { "cispo_clip_ratio": 0.2400471270084381, "completions/clipped_ratio": 1.0, "completions/max_length": 6144.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1609.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 105.0, "completions/min_terminated_length": 0.0, "entropy": 2.189691785722971, "epoch": 0.18081180811808117, "frac_reward_zero_std": 0.0, "grad_norm": 0.005181934684514999, "learning_rate": 0.0002, "loss": 0.004273830913007259, "num_tokens": 3140478.0, "reward": 0.2623010277748108, "reward_std": 0.2546696662902832, "rewards/humor_reward/mean": 0.2623010277748108, "rewards/humor_reward/std": 0.2546696662902832, "sampling/importance_sampling_ratio/max": 1.6369633674621582, "sampling/importance_sampling_ratio/mean": 0.1263563483953476, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9059357643127441, "sampling/sampling_logp_difference/mean": 0.043733954429626465, "step": 49, "step_time": 75.97332381599699 }, { "cispo_clip_ratio": 0.43684835731983185, "completions/clipped_ratio": 0.9375, "completions/max_length": 2413.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 951.875, "completions/mean_terminated_length": 791.0, "completions/min_length": 434.0, "completions/min_terminated_length": 791.0, "entropy": 2.404914140701294, "epoch": 0.18450184501845018, "frac_reward_zero_std": 0.0, "grad_norm": 0.00264533469453454, "learning_rate": 0.0002, "loss": 0.007856368087232113, "num_tokens": 3159868.0, "reward": 0.24542748928070068, "reward_std": 0.2106892615556717, "rewards/humor_reward/mean": 0.24542748928070068, "rewards/humor_reward/std": 0.21068927645683289, "sampling/importance_sampling_ratio/max": 0.8078871965408325, "sampling/importance_sampling_ratio/mean": 0.06913819909095764, "sampling/importance_sampling_ratio/min": 8.870298984220426e-08, "sampling/sampling_logp_difference/max": 3.869755744934082, "sampling/sampling_logp_difference/mean": 0.0640038326382637, "step": 50, "step_time": 23.586737716992502 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 3159868, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }