{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8603832616347282, "eval_steps": 200.0, "global_step": 4400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 562.4453125, "completions/min_length": 302.0, "epoch": 0.00019554165037152912, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.0, "learning_rate": 1.948051948051948e-08, "loss": -2.3283064365386963e-09, "reward": 0.2987072467803955, "reward_std": 0.2615412771701813, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2987072169780731, "rewards/QAReward/std": 0.4167041480541229, "step": 1 }, { "clip_ratio/high_max": 0.00013299641432240605, "clip_ratio/high_mean": 0.00010025746450992301, "clip_ratio/low_mean": 4.475891910260543e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014501638361252844, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 551.142578125, "completions/min_length": 239.5, "epoch": 0.0009777082518576457, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.0004989657609257847, "learning_rate": 9.740259740259739e-08, "loss": -1.5654737580916844e-05, "reward": 0.27835673838853836, "reward_std": 0.2954357862472534, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.27835674583911896, "rewards/QAReward/std": 0.4436791092157364, "step": 5 }, { "clip_ratio/high_max": 0.0005795849370770157, "clip_ratio/high_mean": 0.00024537567514926195, "clip_ratio/low_mean": 6.564069190062582e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031101637287065385, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 540.189453125, "completions/min_length": 223.5, "epoch": 0.0019554165037152915, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.0006606287555769086, "learning_rate": 1.9480519480519478e-07, "loss": 0.00010113009484484792, "reward": 0.36509257555007935, "reward_std": 0.289382666349411, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36509254574775696, "rewards/QAReward/std": 0.4571075290441513, "step": 10 }, { "clip_ratio/high_max": 0.0004360680468380451, "clip_ratio/high_mean": 0.00017039936501532792, "clip_ratio/low_mean": 6.552295235451311e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002359223086386919, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 541.2356770833334, "completions/min_length": 268.0, "epoch": 0.002933124755572937, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7890625, "kl": 0.0006797303445637226, "learning_rate": 2.9220779220779225e-07, "loss": -3.894256660714745e-05, "reward": 0.28139416376749676, "reward_std": 0.2969278891881307, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2813941538333893, "rewards/QAReward/std": 0.45961932341257733, "step": 15 }, { "clip_ratio/high_max": 0.0005894795642234385, "clip_ratio/high_mean": 0.00021264092647470533, "clip_ratio/low_mean": 6.248029094422237e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002751212101429701, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 540.634765625, "completions/min_length": 244.0, "epoch": 0.003910833007430583, "frac_reward_zero_std": 0.046875, "grad_norm": 0.765625, "kl": 0.0006689954898320138, "learning_rate": 3.8961038961038956e-07, "loss": -5.698250606656074e-06, "reward": 0.2892438769340515, "reward_std": 0.2756893038749695, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2892438620328903, "rewards/QAReward/std": 0.4445720314979553, "step": 20 }, { "clip_ratio/high_max": 0.0003064883640035987, "clip_ratio/high_mean": 0.0001317629124969244, "clip_ratio/low_mean": 5.908096500206739e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019084387458860873, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 550.1640625, "completions/min_length": 253.66666666666666, "epoch": 0.004888541259288229, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.80859375, "kl": 0.0006636074394918978, "learning_rate": 4.87012987012987e-07, "loss": 3.0344194965437054e-05, "reward": 0.32549087206522626, "reward_std": 0.2992330690224965, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32549087206522626, "rewards/QAReward/std": 0.4611874520778656, "step": 25 }, { "clip_ratio/high_max": 0.00045579004799947145, "clip_ratio/high_mean": 0.00022142889210954308, "clip_ratio/low_mean": 7.320458971662446e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002946334658190608, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 559.998046875, "completions/min_length": 276.0, "epoch": 0.005866249511145874, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7734375, "kl": 0.0006453444366343319, "learning_rate": 5.844155844155845e-07, "loss": -4.128175205551088e-05, "reward": 0.3105515390634537, "reward_std": 0.2835467457771301, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3105515390634537, "rewards/QAReward/std": 0.42640554904937744, "step": 30 }, { "clip_ratio/high_max": 0.00030994919361546633, "clip_ratio/high_mean": 0.000143716688035056, "clip_ratio/low_mean": 3.9226736407727e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001829434302635491, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 554.75, "completions/min_length": 255.33333333333334, "epoch": 0.00684395776300352, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.79296875, "kl": 0.0006644245819188655, "learning_rate": 6.818181818181818e-07, "loss": 5.571667570620775e-05, "reward": 0.33385108908017475, "reward_std": 0.2860560218493144, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33385106921195984, "rewards/QAReward/std": 0.44075724482536316, "step": 35 }, { "clip_ratio/high_max": 0.0005055900779552758, "clip_ratio/high_mean": 0.0002731950080487877, "clip_ratio/low_mean": 8.723702194401994e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036043204599991443, "completions/clipped_ratio": 0.009765625, "completions/max_length": 941.0, "completions/mean_length": 525.240234375, "completions/min_length": 251.0, "epoch": 0.007821666014861166, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.0006731366622261703, "learning_rate": 7.792207792207791e-07, "loss": 9.254221804440021e-05, "reward": 0.292619988322258, "reward_std": 0.278785839676857, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2926199734210968, "rewards/QAReward/std": 0.46675050258636475, "step": 40 }, { "clip_ratio/high_max": 0.0002751371939666569, "clip_ratio/high_mean": 0.00014495440991595386, "clip_ratio/low_mean": 5.045548896305263e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001954099105205387, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 555.5768229166666, "completions/min_length": 270.0, "epoch": 0.00879937426671881, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.72265625, "kl": 0.0006610127747990191, "learning_rate": 8.766233766233766e-07, "loss": 7.178044761531055e-05, "reward": 0.30065879225730896, "reward_std": 0.3012184302012126, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30065879225730896, "rewards/QAReward/std": 0.4749120871225993, "step": 45 }, { "clip_ratio/high_max": 0.00046007443452253936, "clip_ratio/high_mean": 0.00024293921887874603, "clip_ratio/low_mean": 7.948396378196776e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003224231535568833, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 559.0, "completions/min_length": 252.0, "epoch": 0.009777082518576457, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.0006676746881566942, "learning_rate": 9.74025974025974e-07, "loss": -2.3658486315980554e-05, "reward": 0.39391741156578064, "reward_std": 0.28632408380508423, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39391741156578064, "rewards/QAReward/std": 0.4299541562795639, "step": 50 }, { "clip_ratio/high_max": 0.00034361593425273894, "clip_ratio/high_mean": 0.00016527508269064127, "clip_ratio/low_mean": 5.0530160660855475e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002158052404411137, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 553.6770833333334, "completions/min_length": 269.0, "epoch": 0.010754790770434102, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.0006495720474049449, "learning_rate": 1.0714285714285716e-06, "loss": -9.185168892145157e-07, "reward": 0.2917834420998891, "reward_std": 0.3064896762371063, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2917834420998891, "rewards/QAReward/std": 0.46412121256192523, "step": 55 }, { "clip_ratio/high_max": 0.00047091301530599594, "clip_ratio/high_mean": 0.00023275414714589714, "clip_ratio/low_mean": 4.900535859633237e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002817595028318465, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 547.966796875, "completions/min_length": 257.5, "epoch": 0.011732499022291749, "frac_reward_zero_std": 0.015625, "grad_norm": 0.734375, "kl": 0.0006722186226397753, "learning_rate": 1.168831168831169e-06, "loss": 2.5298635591752827e-05, "reward": 0.3142020255327225, "reward_std": 0.29678478837013245, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3142020106315613, "rewards/QAReward/std": 0.4490370601415634, "step": 60 }, { "clip_ratio/high_max": 0.000319178169593215, "clip_ratio/high_mean": 0.0001839135366026312, "clip_ratio/low_mean": 3.91429421142675e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002230564830824733, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 545.3763020833334, "completions/min_length": 258.0, "epoch": 0.012710207274149394, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.83203125, "kl": 0.0006653358228504658, "learning_rate": 1.2662337662337662e-06, "loss": 3.069699159823358e-05, "reward": 0.282090683778127, "reward_std": 0.30544304847717285, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2820906688769658, "rewards/QAReward/std": 0.47107067704200745, "step": 65 }, { "clip_ratio/high_max": 0.00045887030428275465, "clip_ratio/high_mean": 0.00023441991652362049, "clip_ratio/low_mean": 9.384149452671409e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032826141105033455, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 532.90625, "completions/min_length": 245.5, "epoch": 0.01368791552600704, "frac_reward_zero_std": 0.078125, "grad_norm": 0.8515625, "kl": 0.0006509797065518796, "learning_rate": 1.3636363636363636e-06, "loss": 6.15305732935667e-05, "reward": 0.35376518964767456, "reward_std": 0.3068739175796509, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35376517474651337, "rewards/QAReward/std": 0.5194836854934692, "step": 70 }, { "clip_ratio/high_max": 0.00033037859247997403, "clip_ratio/high_mean": 0.00018577446462586522, "clip_ratio/low_mean": 6.385861488524824e-05, "clip_ratio/low_min": 1.860984484665096e-05, "clip_ratio/region_mean": 0.0002496330882422626, "completions/clipped_ratio": 0.044270833333333336, "completions/max_length": 1024.0, "completions/mean_length": 563.25, "completions/min_length": 255.66666666666666, "epoch": 0.014665623777864685, "frac_reward_zero_std": 0.03125, "grad_norm": 0.77734375, "kl": 0.0006523304735310375, "learning_rate": 1.461038961038961e-06, "loss": 0.00015398797113448383, "reward": 0.3307499388853709, "reward_std": 0.2906431754430135, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3307499388853709, "rewards/QAReward/std": 0.4296267330646515, "step": 75 }, { "clip_ratio/high_max": 0.00047288697678595783, "clip_ratio/high_mean": 0.0002623249252792448, "clip_ratio/low_mean": 7.936097681522369e-05, "clip_ratio/low_min": 2.1507688506972046e-05, "clip_ratio/region_mean": 0.0003416858962737024, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 558.990234375, "completions/min_length": 256.0, "epoch": 0.01564333202972233, "frac_reward_zero_std": 0.0625, "grad_norm": 0.73046875, "kl": 0.0006510267499834299, "learning_rate": 1.5584415584415582e-06, "loss": 0.0001063991105183959, "reward": 0.3050587773323059, "reward_std": 0.28442850708961487, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3050587624311447, "rewards/QAReward/std": 0.430167093873024, "step": 80 }, { "clip_ratio/high_max": 0.0003167412476614118, "clip_ratio/high_mean": 0.0001979916007257998, "clip_ratio/low_mean": 3.69674657122232e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023495907662436367, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 556.3216145833334, "completions/min_length": 255.33333333333334, "epoch": 0.016621040281579978, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.80078125, "kl": 0.0006697330740280449, "learning_rate": 1.6558441558441559e-06, "loss": -3.562969504855573e-06, "reward": 0.290574590365092, "reward_std": 0.28912221391995746, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2905746102333069, "rewards/QAReward/std": 0.4445178508758545, "step": 85 }, { "clip_ratio/high_max": 0.0005794412572868168, "clip_ratio/high_mean": 0.0002689883578568697, "clip_ratio/low_mean": 5.878048614249565e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003277688520029187, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 538.30078125, "completions/min_length": 251.0, "epoch": 0.01759874853343762, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.0006838439381681383, "learning_rate": 1.7532467532467533e-06, "loss": 5.6549470173195004e-05, "reward": 0.3175765722990036, "reward_std": 0.2832054942846298, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3175765722990036, "rewards/QAReward/std": 0.4400376230478287, "step": 90 }, { "clip_ratio/high_max": 0.00023860433720983564, "clip_ratio/high_mean": 0.00015799218672327698, "clip_ratio/low_mean": 5.934430955676362e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021733649773523212, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 544.28515625, "completions/min_length": 260.0, "epoch": 0.018576456785295268, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.000683420174755156, "learning_rate": 1.850649350649351e-06, "loss": 7.690881029702723e-05, "reward": 0.359575480222702, "reward_std": 0.29577478766441345, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3595754901568095, "rewards/QAReward/std": 0.44259650508562726, "step": 95 }, { "clip_ratio/high_max": 0.00044861147180199625, "clip_ratio/high_mean": 0.00024347268627025186, "clip_ratio/low_mean": 7.37666996428743e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031723937718197703, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 548.947265625, "completions/min_length": 232.0, "epoch": 0.019554165037152915, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8515625, "kl": 0.000686402537394315, "learning_rate": 1.948051948051948e-06, "loss": 6.883929017931223e-05, "reward": 0.2982747554779053, "reward_std": 0.2770645469427109, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2982747554779053, "rewards/QAReward/std": 0.47102899849414825, "step": 100 }, { "clip_ratio/high_max": 0.000274475640617311, "clip_ratio/high_mean": 0.00018215365125797688, "clip_ratio/low_mean": 3.29315967974253e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021508525824174285, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 534.81640625, "completions/min_length": 273.0, "epoch": 0.020531873289010558, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78125, "kl": 0.0006897091283462941, "learning_rate": 2.0454545454545453e-06, "loss": -3.748370363609865e-06, "reward": 0.30275246500968933, "reward_std": 0.28464510043462116, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30275246500968933, "rewards/QAReward/std": 0.42870111266771954, "step": 105 }, { "clip_ratio/high_max": 0.00040068229427561166, "clip_ratio/high_mean": 0.00022229008609429002, "clip_ratio/low_mean": 6.385733431670814e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000286147411679849, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 548.216796875, "completions/min_length": 269.0, "epoch": 0.021509581540868204, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78125, "kl": 0.0006867600372061133, "learning_rate": 2.142857142857143e-06, "loss": 2.9267676291055976e-05, "reward": 0.2968005836009979, "reward_std": 0.28436748683452606, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2968005985021591, "rewards/QAReward/std": 0.44361642003059387, "step": 110 }, { "clip_ratio/high_max": 0.0002929657930508256, "clip_ratio/high_mean": 0.00016370548401027917, "clip_ratio/low_mean": 3.029078507097438e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019399626180529594, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 540.1484375, "completions/min_length": 240.0, "epoch": 0.02248728979272585, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.80078125, "kl": 0.0006798295769840478, "learning_rate": 2.24025974025974e-06, "loss": 7.920886273495852e-05, "reward": 0.3281994064648946, "reward_std": 0.28745022416114807, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3281994163990021, "rewards/QAReward/std": 0.4605083366235097, "step": 115 }, { "clip_ratio/high_max": 0.00038903726963326337, "clip_ratio/high_mean": 0.00026088427985087036, "clip_ratio/low_mean": 7.006176892900839e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033094603568315504, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 545.701171875, "completions/min_length": 270.0, "epoch": 0.023464998044583497, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.0007009691209532321, "learning_rate": 2.337662337662338e-06, "loss": 4.886114038527012e-05, "reward": 0.3642702251672745, "reward_std": 0.30107346177101135, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3642702102661133, "rewards/QAReward/std": 0.45058342814445496, "step": 120 }, { "clip_ratio/high_max": 0.00032967779552564027, "clip_ratio/high_mean": 0.00014789351262152196, "clip_ratio/low_mean": 3.983711067121476e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018773061456158757, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 557.1432291666666, "completions/min_length": 269.3333333333333, "epoch": 0.02444270629644114, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.85546875, "kl": 0.0007103055249899626, "learning_rate": 2.435064935064935e-06, "loss": 2.1297603962011635e-05, "reward": 0.34973235925038654, "reward_std": 0.26230395833651227, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34973235925038654, "rewards/QAReward/std": 0.43835218747456867, "step": 125 }, { "clip_ratio/high_max": 0.0004620332969352603, "clip_ratio/high_mean": 0.0002529845340177417, "clip_ratio/low_mean": 9.015513933263719e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034313967917114495, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 546.03125, "completions/min_length": 309.5, "epoch": 0.025420414548298787, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78125, "kl": 0.0007165866903960705, "learning_rate": 2.5324675324675324e-06, "loss": 5.311581771820784e-05, "reward": 0.3860165774822235, "reward_std": 0.29506707191467285, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3860165625810623, "rewards/QAReward/std": 0.42962099611759186, "step": 130 }, { "clip_ratio/high_max": 0.00028117754263803365, "clip_ratio/high_mean": 0.00015255670878104864, "clip_ratio/low_mean": 7.387506921077147e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002264317823573947, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 542.3776041666666, "completions/min_length": 260.6666666666667, "epoch": 0.026398122800156434, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8125, "kl": 0.0007479621097445488, "learning_rate": 2.62987012987013e-06, "loss": -5.417463253252208e-05, "reward": 0.34394583106040955, "reward_std": 0.2822273174921672, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34394583106040955, "rewards/QAReward/std": 0.4261429210503896, "step": 135 }, { "clip_ratio/high_max": 0.0005654786364175379, "clip_ratio/high_mean": 0.0002959935285616666, "clip_ratio/low_mean": 7.874260481912642e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037473614793270826, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 527.87109375, "completions/min_length": 277.0, "epoch": 0.02737583105201408, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83984375, "kl": 0.0007733012200333178, "learning_rate": 2.7272727272727272e-06, "loss": 1.852451532613486e-05, "reward": 0.2887330949306488, "reward_std": 0.3043355196714401, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.28873310983181, "rewards/QAReward/std": 0.4583306908607483, "step": 140 }, { "clip_ratio/high_max": 0.00028303938452154396, "clip_ratio/high_mean": 0.00014192849048413336, "clip_ratio/low_mean": 5.528327310457826e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019721175776794554, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 527.359375, "completions/min_length": 251.66666666666666, "epoch": 0.028353539303871723, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8359375, "kl": 0.0007946271449327469, "learning_rate": 2.824675324675325e-06, "loss": 7.711388170719146e-05, "reward": 0.34228723247845966, "reward_std": 0.29790321985880536, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34228722254435223, "rewards/QAReward/std": 0.45375685890515643, "step": 145 }, { "clip_ratio/high_max": 0.00045053022331558167, "clip_ratio/high_mean": 0.00022743117297068237, "clip_ratio/low_mean": 6.970932881813496e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002971404988784343, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 544.982421875, "completions/min_length": 257.5, "epoch": 0.02933124755572937, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7734375, "kl": 0.0008167603868059814, "learning_rate": 2.922077922077922e-06, "loss": 4.8733130097389224e-05, "reward": 0.29362212121486664, "reward_std": 0.3005276769399643, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29362212121486664, "rewards/QAReward/std": 0.4608456790447235, "step": 150 }, { "clip_ratio/high_max": 0.00040344748413190246, "clip_ratio/high_mean": 0.00018179616890847682, "clip_ratio/low_mean": 2.857549916370772e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021037166588939727, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 545.3697916666666, "completions/min_length": 257.6666666666667, "epoch": 0.030308955807587017, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8203125, "kl": 0.0008302195579744876, "learning_rate": 2.9999996991170065e-06, "loss": 0.00012241069925948976, "reward": 0.3005637228488922, "reward_std": 0.2997136414051056, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30056371291478473, "rewards/QAReward/std": 0.43654900789260864, "step": 155 }, { "clip_ratio/high_max": 0.00045237722806632517, "clip_ratio/high_mean": 0.00022525826352648437, "clip_ratio/low_mean": 7.218647078843787e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002974447328597307, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 559.33203125, "completions/min_length": 261.5, "epoch": 0.03128666405944466, "frac_reward_zero_std": 0.015625, "grad_norm": 0.78125, "kl": 0.0008292320300824941, "learning_rate": 2.9999891682249077e-06, "loss": 4.8669875832274555e-05, "reward": 0.29745782911777496, "reward_std": 0.3213290125131607, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29745785892009735, "rewards/QAReward/std": 0.42719265818595886, "step": 160 }, { "clip_ratio/high_max": 0.00028774267993867395, "clip_ratio/high_mean": 0.00014365809038281442, "clip_ratio/low_mean": 5.938055255683139e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020303864730522036, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 508.6692708333333, "completions/min_length": 242.66666666666666, "epoch": 0.03226437231130231, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.0009233945631422103, "learning_rate": 2.9999635933038405e-06, "loss": 2.6580214034765957e-05, "reward": 0.30278892318407696, "reward_std": 0.2977292239665985, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3027889132499695, "rewards/QAReward/std": 0.4572309652964274, "step": 165 }, { "clip_ratio/high_max": 0.0004450582782737911, "clip_ratio/high_mean": 0.00026920008240267635, "clip_ratio/low_mean": 5.547836335608736e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032467843848280606, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 524.9609375, "completions/min_length": 252.0, "epoch": 0.033242080563159956, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8046875, "kl": 0.0009449144476093352, "learning_rate": 2.9999229746103054e-06, "loss": 9.412619983777404e-05, "reward": 0.2791503816843033, "reward_std": 0.27151282131671906, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2791503816843033, "rewards/QAReward/std": 0.45281238853931427, "step": 170 }, { "clip_ratio/high_max": 0.00038069887086749076, "clip_ratio/high_mean": 0.00018324204138480126, "clip_ratio/low_mean": 7.116267224773764e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000254404719453305, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 533.90625, "completions/min_length": 254.66666666666666, "epoch": 0.034219788815017596, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8203125, "kl": 0.0009628175059333444, "learning_rate": 2.999867312551686e-06, "loss": 5.631544627249241e-05, "reward": 0.2955248753229777, "reward_std": 0.3168955445289612, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2955248604218165, "rewards/QAReward/std": 0.4839466412862142, "step": 175 }, { "clip_ratio/high_max": 0.00043453254038468004, "clip_ratio/high_mean": 0.00023027803399600089, "clip_ratio/low_mean": 6.272599712247029e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029300404712557794, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 541.375, "completions/min_length": 263.0, "epoch": 0.03519749706687524, "frac_reward_zero_std": 0.015625, "grad_norm": 0.78125, "kl": 0.0010715228971093893, "learning_rate": 2.9997966076862404e-06, "loss": 4.482946824282408e-05, "reward": 0.28617002069950104, "reward_std": 0.33105072379112244, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.28617002069950104, "rewards/QAReward/std": 0.4620743542909622, "step": 180 }, { "clip_ratio/high_max": 0.00041445326060056685, "clip_ratio/high_mean": 0.00022061330382712186, "clip_ratio/low_mean": 4.0029383671935645e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026064268313348294, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 522.8033854166666, "completions/min_length": 247.33333333333334, "epoch": 0.03617520531873289, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.765625, "kl": 0.0011305611813440918, "learning_rate": 2.9997108607230975e-06, "loss": -1.7737223242875188e-05, "reward": 0.38065580526987713, "reward_std": 0.2813877960046132, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38065579533576965, "rewards/QAReward/std": 0.42743276556332904, "step": 185 }, { "clip_ratio/high_max": 0.0004870320553891361, "clip_ratio/high_mean": 0.00022322615841403605, "clip_ratio/low_mean": 6.520387978525832e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028843004256486895, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 531.8671875, "completions/min_length": 237.0, "epoch": 0.037152913570590536, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7890625, "kl": 0.0011825134512037038, "learning_rate": 2.9996100725222498e-06, "loss": 5.4160784929990766e-05, "reward": 0.32592399418354034, "reward_std": 0.2959883511066437, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32592400908470154, "rewards/QAReward/std": 0.4781135022640228, "step": 190 }, { "clip_ratio/high_max": 0.00021211928105913103, "clip_ratio/high_mean": 0.00013984165852889418, "clip_ratio/low_mean": 4.302164525142871e-05, "clip_ratio/low_min": 2.2145941329654306e-05, "clip_ratio/region_mean": 0.00018286331323906778, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 534.9752604166666, "completions/min_length": 226.66666666666666, "epoch": 0.03813062182244818, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.87890625, "kl": 0.0011944581521674991, "learning_rate": 2.999494244094546e-06, "loss": 4.0679160156287254e-05, "reward": 0.30901899933815, "reward_std": 0.2905704081058502, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30901899933815, "rewards/QAReward/std": 0.45013880729675293, "step": 195 }, { "clip_ratio/high_max": 0.0005459138890728354, "clip_ratio/high_mean": 0.00026915946509689095, "clip_ratio/low_mean": 6.406812608474865e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003332275897264481, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 534.46484375, "completions/min_length": 241.0, "epoch": 0.03910833007430583, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7890625, "kl": 0.0012558508664369584, "learning_rate": 2.9993633766016773e-06, "loss": 0.00014316493179649114, "reward": 0.38112832605838776, "reward_std": 0.28995949029922485, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38112834095954895, "rewards/QAReward/std": 0.43942035734653473, "step": 200 }, { "clip_ratio/high_max": 0.0003338824608363211, "clip_ratio/high_mean": 0.00017864665132947267, "clip_ratio/low_mean": 3.955043066525832e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021819707471877338, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 524.5325520833334, "completions/min_length": 258.6666666666667, "epoch": 0.040086038326163476, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.83203125, "kl": 0.0013040038291364908, "learning_rate": 2.99921747135617e-06, "loss": 6.066013011150062e-05, "reward": 0.3577127655347188, "reward_std": 0.29994820555051166, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3577127754688263, "rewards/QAReward/std": 0.4509642521540324, "step": 205 }, { "clip_ratio/high_max": 0.00047162186820060015, "clip_ratio/high_mean": 0.00026601648423820736, "clip_ratio/low_mean": 6.977630546316505e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033579278388060627, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 518.9921875, "completions/min_length": 260.5, "epoch": 0.041063746578021115, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0013384334044530988, "learning_rate": 2.99905652982137e-06, "loss": 2.629185328260064e-05, "reward": 0.3787365108728409, "reward_std": 0.2779175639152527, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3787365108728409, "rewards/QAReward/std": 0.407595694065094, "step": 210 }, { "clip_ratio/high_max": 0.0003578921663574874, "clip_ratio/high_mean": 0.0002098641765769571, "clip_ratio/low_mean": 4.606424772646278e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000255928433034569, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 525.6783854166666, "completions/min_length": 240.33333333333334, "epoch": 0.04204145482987876, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.80078125, "kl": 0.0013369214488193394, "learning_rate": 2.998880553611429e-06, "loss": 2.3784340010024606e-05, "reward": 0.3337249954541524, "reward_std": 0.28627315163612366, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3337249954541524, "rewards/QAReward/std": 0.4194161593914032, "step": 215 }, { "clip_ratio/high_max": 0.00046254150802269576, "clip_ratio/high_mean": 0.0002776319393888116, "clip_ratio/low_mean": 8.776165777817369e-05, "clip_ratio/low_min": 1.947988639585674e-05, "clip_ratio/region_mean": 0.00036539359134621916, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 546.119140625, "completions/min_length": 251.0, "epoch": 0.04301916308173641, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76171875, "kl": 0.0013577427016571164, "learning_rate": 2.998689544491286e-06, "loss": 8.829921716824174e-05, "reward": 0.2525530159473419, "reward_std": 0.2814728319644928, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2525530159473419, "rewards/QAReward/std": 0.45173561573028564, "step": 220 }, { "clip_ratio/high_max": 0.000208583619678393, "clip_ratio/high_mean": 0.00011387045960873366, "clip_ratio/low_mean": 3.861123259412125e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015248169074766338, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 523.4921875, "completions/min_length": 248.33333333333334, "epoch": 0.043996871333594055, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8515625, "kl": 0.0014033964602276684, "learning_rate": 2.998483504376653e-06, "loss": 6.173560977913439e-05, "reward": 0.3172117869059245, "reward_std": 0.2980707784493764, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3172117968400319, "rewards/QAReward/std": 0.4475576976935069, "step": 225 }, { "clip_ratio/high_max": 0.0005417009699158371, "clip_ratio/high_mean": 0.0002489374252036214, "clip_ratio/low_mean": 6.316469953162595e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031210212036967275, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 531.9609375, "completions/min_length": 242.0, "epoch": 0.0449745795854517, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8125, "kl": 0.0014231576351448894, "learning_rate": 2.998262435333994e-06, "loss": 3.2927346182987094e-05, "reward": 0.4087069183588028, "reward_std": 0.2857818901538849, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4087069034576416, "rewards/QAReward/std": 0.39398711919784546, "step": 230 }, { "clip_ratio/high_max": 0.0004042569431476295, "clip_ratio/high_mean": 0.00022136408369988203, "clip_ratio/low_mean": 4.521355876931921e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002665776410140097, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 520.3177083333334, "completions/min_length": 246.33333333333334, "epoch": 0.04595228783730935, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.83984375, "kl": 0.0014652617741376162, "learning_rate": 2.998026339580504e-06, "loss": 2.8893430135212837e-05, "reward": 0.2981062928835551, "reward_std": 0.29133155941963196, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2981062928835551, "rewards/QAReward/std": 0.4350563685099284, "step": 235 }, { "clip_ratio/high_max": 0.00039810159942135217, "clip_ratio/high_mean": 0.00027676037861965595, "clip_ratio/low_mean": 9.547655645292252e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037223692052066324, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 498.275390625, "completions/min_length": 239.0, "epoch": 0.046929996089166995, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84375, "kl": 0.0015633862931281328, "learning_rate": 2.997775219484089e-06, "loss": 0.00010462243808433414, "reward": 0.32125262916088104, "reward_std": 0.30038125813007355, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32125264406204224, "rewards/QAReward/std": 0.43265801668167114, "step": 240 }, { "clip_ratio/high_max": 0.0003045087622012943, "clip_ratio/high_mean": 0.00014610658399760723, "clip_ratio/low_mean": 5.5006860930006954e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020111343474127352, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 513.7083333333334, "completions/min_length": 247.0, "epoch": 0.04790770434102464, "frac_reward_zero_std": 0.03125, "grad_norm": 0.88671875, "kl": 0.0015771694947034121, "learning_rate": 2.997509077563338e-06, "loss": 6.160531775094569e-05, "reward": 0.33918341994285583, "reward_std": 0.2947988410790761, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33918341994285583, "rewards/QAReward/std": 0.4674031635125478, "step": 245 }, { "clip_ratio/high_max": 0.0005032939021475613, "clip_ratio/high_mean": 0.00029714072588831185, "clip_ratio/low_mean": 6.743109552189707e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036457182141020895, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 508.177734375, "completions/min_length": 235.5, "epoch": 0.04888541259288228, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8203125, "kl": 0.0016014215536415577, "learning_rate": 2.9972279164875013e-06, "loss": 3.9598150760866704e-05, "reward": 0.2992923855781555, "reward_std": 0.2855536937713623, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2992923855781555, "rewards/QAReward/std": 0.3973202705383301, "step": 250 }, { "clip_ratio/high_max": 0.0003854024806059897, "clip_ratio/high_mean": 0.00022040652111172677, "clip_ratio/low_mean": 5.1602337043732406e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000272008846513927, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 517.984375, "completions/min_length": 226.33333333333334, "epoch": 0.04986312084473993, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.0016391061479225754, "learning_rate": 2.996931739076464e-06, "loss": 6.0653011314570906e-05, "reward": 0.2691814402739207, "reward_std": 0.3068946997324626, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.26918145020802814, "rewards/QAReward/std": 0.44760560989379883, "step": 255 }, { "clip_ratio/high_max": 0.0005109531106427312, "clip_ratio/high_mean": 0.00028737043612636626, "clip_ratio/low_mean": 4.515758701018058e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033252802677452564, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 520.234375, "completions/min_length": 228.0, "epoch": 0.050840829096597574, "frac_reward_zero_std": 0.046875, "grad_norm": 0.7734375, "kl": 0.0016139087732881308, "learning_rate": 2.996620548300714e-06, "loss": 0.00012853837106376886, "reward": 0.3425951898097992, "reward_std": 0.29456040263175964, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3425951898097992, "rewards/QAReward/std": 0.49161286652088165, "step": 260 }, { "clip_ratio/high_max": 0.0002980126650072634, "clip_ratio/high_mean": 0.00016314120148308575, "clip_ratio/low_mean": 5.3032999858260155e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000216174207162112, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 528.734375, "completions/min_length": 246.33333333333334, "epoch": 0.05181853734845522, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0015783462673425674, "learning_rate": 2.9962943472813165e-06, "loss": 2.5498546892777086e-05, "reward": 0.30211227138837177, "reward_std": 0.27567411959171295, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30211226145426434, "rewards/QAReward/std": 0.46301643053690594, "step": 265 }, { "clip_ratio/high_max": 0.00048427743604406714, "clip_ratio/high_mean": 0.0002586977381724864, "clip_ratio/low_mean": 5.135441606398672e-05, "clip_ratio/low_min": 2.0058169320691376e-05, "clip_ratio/region_mean": 0.0003100521513260901, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 537.201171875, "completions/min_length": 256.5, "epoch": 0.05279624560031287, "frac_reward_zero_std": 0.0625, "grad_norm": 0.80078125, "kl": 0.0015226518968120216, "learning_rate": 2.9959531392898802e-06, "loss": 6.156372837722302e-05, "reward": 0.4270531088113785, "reward_std": 0.28061749041080475, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4270531237125397, "rewards/QAReward/std": 0.42423784732818604, "step": 270 }, { "clip_ratio/high_max": 0.00026588336331769826, "clip_ratio/high_mean": 0.0001457453181501478, "clip_ratio/low_mean": 4.5464897993952036e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019121022196486593, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 507.51171875, "completions/min_length": 240.66666666666666, "epoch": 0.053773953852170514, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.87109375, "kl": 0.0015405261889100074, "learning_rate": 2.995596927748525e-06, "loss": 0.00011309806723147631, "reward": 0.26225922008355457, "reward_std": 0.29167203108469647, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.26225922008355457, "rewards/QAReward/std": 0.4539941648642222, "step": 275 }, { "clip_ratio/high_max": 0.0005810294649563729, "clip_ratio/high_mean": 0.0002989702043123543, "clip_ratio/low_mean": 9.183903312077746e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003908092388883233, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 512.947265625, "completions/min_length": 258.5, "epoch": 0.05475166210402816, "frac_reward_zero_std": 0.015625, "grad_norm": 0.87109375, "kl": 0.0015530727803707122, "learning_rate": 2.9952257162298477e-06, "loss": 2.921558334492147e-05, "reward": 0.21311229467391968, "reward_std": 0.2939104586839676, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.21311229467391968, "rewards/QAReward/std": 0.4654459208250046, "step": 280 }, { "clip_ratio/high_max": 0.00038171361666172743, "clip_ratio/high_mean": 0.00017659750301390886, "clip_ratio/low_mean": 4.0344925946556034e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021694242605008185, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 521.3489583333334, "completions/min_length": 251.33333333333334, "epoch": 0.0557293703558858, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.80078125, "kl": 0.0015532762976363302, "learning_rate": 2.9948395084568865e-06, "loss": 8.97413759957999e-05, "reward": 0.3415111502011617, "reward_std": 0.3071461816628774, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34151114026705426, "rewards/QAReward/std": 0.4366106688976288, "step": 285 }, { "clip_ratio/high_max": 0.0004507273202762008, "clip_ratio/high_mean": 0.0003039865056052804, "clip_ratio/low_mean": 8.23892158223316e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003863757359795272, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 518.6796875, "completions/min_length": 246.5, "epoch": 0.05670707860774345, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8125, "kl": 0.0015857026912271977, "learning_rate": 2.994438308303083e-06, "loss": 9.163999347947538e-05, "reward": 0.3899576812982559, "reward_std": 0.2568272203207016, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3899576961994171, "rewards/QAReward/std": 0.4005284458398819, "step": 290 }, { "clip_ratio/high_max": 0.0003407334093935788, "clip_ratio/high_mean": 0.00019293794175609947, "clip_ratio/low_mean": 2.751668835117016e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022045462392270566, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 514.8932291666666, "completions/min_length": 252.0, "epoch": 0.057684786859601093, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0016192864393815398, "learning_rate": 2.994022119792245e-06, "loss": 3.805523447226733e-05, "reward": 0.4081837236881256, "reward_std": 0.2799052298069, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4081837236881256, "rewards/QAReward/std": 0.41300448775291443, "step": 295 }, { "clip_ratio/high_max": 0.0004913972457870841, "clip_ratio/high_mean": 0.00028027832158841195, "clip_ratio/low_mean": 6.39098565443419e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034418818540871143, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 510.806640625, "completions/min_length": 220.5, "epoch": 0.05866249511145874, "frac_reward_zero_std": 0.03125, "grad_norm": 0.86328125, "kl": 0.0017103672958910464, "learning_rate": 2.9935909470985035e-06, "loss": 7.9916330287233e-05, "reward": 0.3466680645942688, "reward_std": 0.2883760929107666, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3466680645942688, "rewards/QAReward/std": 0.45993703603744507, "step": 300 }, { "clip_ratio/high_max": 0.0003524662111885846, "clip_ratio/high_mean": 0.00019121920340694487, "clip_ratio/low_mean": 2.3086759756552056e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021430596243590116, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 511.41015625, "completions/min_length": 214.33333333333334, "epoch": 0.05964020336331639, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.79296875, "kl": 0.0017492811661213637, "learning_rate": 2.9931447945462726e-06, "loss": 6.234260508790612e-05, "reward": 0.3526010811328888, "reward_std": 0.2752201557159424, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3526010711987813, "rewards/QAReward/std": 0.41810839374860126, "step": 305 }, { "clip_ratio/high_max": 0.0004657356534153223, "clip_ratio/high_mean": 0.0002684437786228955, "clip_ratio/low_mean": 9.092339314520359e-05, "clip_ratio/low_min": 2.1570319950114937e-05, "clip_ratio/region_mean": 0.00035936717176809907, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 517.513671875, "completions/min_length": 263.5, "epoch": 0.06061791161517403, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.001705275010317564, "learning_rate": 2.9926836666102055e-06, "loss": 0.00011276727309450507, "reward": 0.36900344491004944, "reward_std": 0.29044967889785767, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36900344491004944, "rewards/QAReward/std": 0.42730917036533356, "step": 310 }, { "clip_ratio/high_max": 0.00030065301107242706, "clip_ratio/high_mean": 0.00017464872798882424, "clip_ratio/low_mean": 4.780857561854645e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022245731088332832, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 506.0182291666667, "completions/min_length": 238.0, "epoch": 0.06159561986703168, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.0017308081034570933, "learning_rate": 2.992207567915151e-06, "loss": 8.708415552973747e-05, "reward": 0.256049523750941, "reward_std": 0.30431340138117474, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2560495336850484, "rewards/QAReward/std": 0.4811026950677236, "step": 315 }, { "clip_ratio/high_max": 0.0006970042595639825, "clip_ratio/high_mean": 0.00035121252294629813, "clip_ratio/low_mean": 9.717851935420185e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00044839103939011695, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 522.849609375, "completions/min_length": 274.0, "epoch": 0.06257332811888933, "frac_reward_zero_std": 0.015625, "grad_norm": 0.85546875, "kl": 0.0016907631885260343, "learning_rate": 2.991716503236105e-06, "loss": 0.00013893813593313098, "reward": 0.2381804883480072, "reward_std": 0.2868662178516388, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2381804883480072, "rewards/QAReward/std": 0.47461022436618805, "step": 320 }, { "clip_ratio/high_max": 0.000431795057374984, "clip_ratio/high_mean": 0.00020189945353195072, "clip_ratio/low_mean": 3.95984374335967e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002414979040622711, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 514.7122395833334, "completions/min_length": 253.0, "epoch": 0.06355103637074697, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.84765625, "kl": 0.0016861926298588515, "learning_rate": 2.991210477498164e-06, "loss": 3.0295064789243042e-05, "reward": 0.32870158553123474, "reward_std": 0.26766403516133624, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32870160539944965, "rewards/QAReward/std": 0.40676307678222656, "step": 325 }, { "clip_ratio/high_max": 0.00057776621542871, "clip_ratio/high_mean": 0.0002547657117247581, "clip_ratio/low_mean": 7.688511250307784e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003316508315037936, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 525.8828125, "completions/min_length": 266.5, "epoch": 0.06452874462260462, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8515625, "kl": 0.0016540790209546685, "learning_rate": 2.990689495776475e-06, "loss": 5.245559150353074e-05, "reward": 0.3319765031337738, "reward_std": 0.29766978323459625, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3319765031337738, "rewards/QAReward/std": 0.45330552756786346, "step": 330 }, { "clip_ratio/high_max": 0.00026017670752480625, "clip_ratio/high_mean": 0.00014610940124839544, "clip_ratio/low_mean": 4.632669151760638e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019243609276600182, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 529.8841145833334, "completions/min_length": 265.3333333333333, "epoch": 0.06550645287446226, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8125, "kl": 0.0016726297792047263, "learning_rate": 2.9901535632961854e-06, "loss": 0.00015253580641001464, "reward": 0.3477564851442973, "reward_std": 0.2758673628171285, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3477564851442973, "rewards/QAReward/std": 0.4221237103144328, "step": 335 }, { "clip_ratio/high_max": 0.0006037259241566062, "clip_ratio/high_mean": 0.00035229517379775643, "clip_ratio/low_mean": 6.479219009634108e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004170873668044806, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1024.0, "completions/mean_length": 515.296875, "completions/min_length": 230.0, "epoch": 0.06648416112631991, "frac_reward_zero_std": 0.03125, "grad_norm": 0.86328125, "kl": 0.0017515066778287292, "learning_rate": 2.9896026854323896e-06, "loss": 3.284300619270653e-05, "reward": 0.4117133170366287, "reward_std": 0.27548281848430634, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4117133319377899, "rewards/QAReward/std": 0.39852260053157806, "step": 340 }, { "clip_ratio/high_max": 0.00039218314923346044, "clip_ratio/high_mean": 0.00017603070591576397, "clip_ratio/low_mean": 2.1510593069251626e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019754129461944104, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 522.9010416666666, "completions/min_length": 226.33333333333334, "epoch": 0.06746186937817755, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.734375, "kl": 0.0018277551513165235, "learning_rate": 2.9890368677100763e-06, "loss": 9.934260742738843e-05, "reward": 0.3828570445378621, "reward_std": 0.2495946784814199, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3828570346037547, "rewards/QAReward/std": 0.42582037051518756, "step": 345 }, { "clip_ratio/high_max": 0.00048620671732351183, "clip_ratio/high_mean": 0.000261587934801355, "clip_ratio/low_mean": 9.145392396021634e-05, "clip_ratio/low_min": 2.133560919901356e-05, "clip_ratio/region_mean": 0.0003530418733134866, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 498.259765625, "completions/min_length": 235.0, "epoch": 0.06843957763003519, "frac_reward_zero_std": 0.015625, "grad_norm": 0.82421875, "kl": 0.0018198314122855664, "learning_rate": 2.988456115804071e-06, "loss": 1.3254198711365461e-05, "reward": 0.30966952443122864, "reward_std": 0.31776344776153564, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30966952443122864, "rewards/QAReward/std": 0.48341183364391327, "step": 350 }, { "clip_ratio/high_max": 0.00041342613985762, "clip_ratio/high_mean": 0.00015201132628135384, "clip_ratio/low_mean": 2.9337331943679602e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018134865094907582, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 526.4427083333334, "completions/min_length": 238.0, "epoch": 0.06941728588189285, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.84375, "kl": 0.0018169629387557507, "learning_rate": 2.9878604355389827e-06, "loss": 0.00015597811434417964, "reward": 0.36865390340487164, "reward_std": 0.2851994037628174, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36865390340487164, "rewards/QAReward/std": 0.4293197989463806, "step": 355 }, { "clip_ratio/high_max": 0.0005751843680627644, "clip_ratio/high_mean": 0.00030066526378504934, "clip_ratio/low_mean": 8.661319880047813e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038727845530956985, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 523.978515625, "completions/min_length": 241.0, "epoch": 0.07039499413375049, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83984375, "kl": 0.0017869805917143822, "learning_rate": 2.987249832889141e-06, "loss": 0.00017945946892723442, "reward": 0.364153191447258, "reward_std": 0.2643546685576439, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.364153191447258, "rewards/QAReward/std": 0.38081030547618866, "step": 360 }, { "clip_ratio/high_max": 0.00033935869578272104, "clip_ratio/high_mean": 0.00015456558903679252, "clip_ratio/low_mean": 5.9562138631008565e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021412774221971632, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 525.0247395833334, "completions/min_length": 242.66666666666666, "epoch": 0.07137270238560814, "frac_reward_zero_std": 0.03125, "grad_norm": 0.89453125, "kl": 0.0018288392340764404, "learning_rate": 2.98662431397854e-06, "loss": 8.895811624825001e-05, "reward": 0.375039279460907, "reward_std": 0.2861775557200114, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.375039279460907, "rewards/QAReward/std": 0.42430828015009564, "step": 365 }, { "clip_ratio/high_max": 0.0005156975472345948, "clip_ratio/high_mean": 0.00022789735812693833, "clip_ratio/low_mean": 5.867889849469066e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002865762508008629, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 547.05859375, "completions/min_length": 260.0, "epoch": 0.07235041063746578, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.0017969131702557206, "learning_rate": 2.985983885080774e-06, "loss": 6.066660862416029e-05, "reward": 0.28242962062358856, "reward_std": 0.2945100665092468, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.28242963552474976, "rewards/QAReward/std": 0.42537805438041687, "step": 370 }, { "clip_ratio/high_max": 0.00033926969626918435, "clip_ratio/high_mean": 0.00013034055009484292, "clip_ratio/low_mean": 2.3270256497198717e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015361080877482892, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 525.1809895833334, "completions/min_length": 232.33333333333334, "epoch": 0.07332811888932343, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0018698264146223664, "learning_rate": 2.9853285526189776e-06, "loss": 2.5256891967728733e-05, "reward": 0.3273422022660573, "reward_std": 0.2921946942806244, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3273422022660573, "rewards/QAReward/std": 0.4412424564361572, "step": 375 }, { "clip_ratio/high_max": 0.000521813309751451, "clip_ratio/high_mean": 0.0002407918742392212, "clip_ratio/low_mean": 6.900132138980553e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030979320872575044, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 526.4765625, "completions/min_length": 228.5, "epoch": 0.07430582714118107, "frac_reward_zero_std": 0.03125, "grad_norm": 0.796875, "kl": 0.0019409350818023086, "learning_rate": 2.9846583231657585e-06, "loss": -1.5186675591394306e-05, "reward": 0.3206651508808136, "reward_std": 0.31188416481018066, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3206651657819748, "rewards/QAReward/std": 0.47431306540966034, "step": 380 }, { "clip_ratio/high_max": 0.00035786955850198866, "clip_ratio/high_mean": 0.00017589289927855134, "clip_ratio/low_mean": 7.511696021538228e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025100987404584884, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 518.5286458333334, "completions/min_length": 230.66666666666666, "epoch": 0.07528353539303871, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.796875, "kl": 0.0019658677745610477, "learning_rate": 2.9839732034431317e-06, "loss": 4.325800109654665e-05, "reward": 0.37099093198776245, "reward_std": 0.2845005691051483, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37099092205365497, "rewards/QAReward/std": 0.46651757756869, "step": 385 }, { "clip_ratio/high_max": 0.0006125018815509975, "clip_ratio/high_mean": 0.00029409107519313694, "clip_ratio/low_mean": 0.00010132876923307776, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003954198444262147, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 526.8359375, "completions/min_length": 208.0, "epoch": 0.07626124364489636, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.002001641062088311, "learning_rate": 2.9832732003224554e-06, "loss": 5.968711338937283e-05, "reward": 0.28051313757896423, "reward_std": 0.3028603792190552, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2805131524801254, "rewards/QAReward/std": 0.45404887199401855, "step": 390 }, { "clip_ratio/high_max": 0.000405120337381959, "clip_ratio/high_mean": 0.00022557746851816773, "clip_ratio/low_mean": 5.5239243374671786e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028081671334803107, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 512.4283854166666, "completions/min_length": 232.33333333333334, "epoch": 0.077238951896754, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8203125, "kl": 0.002042924938723445, "learning_rate": 2.982558320824358e-06, "loss": 0.00011199988657608628, "reward": 0.2932239919900894, "reward_std": 0.298792431751887, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2932240118583043, "rewards/QAReward/std": 0.4944570263226827, "step": 395 }, { "clip_ratio/high_max": 0.0005610274383798242, "clip_ratio/high_mean": 0.0003159915329888463, "clip_ratio/low_mean": 6.806209858041256e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038405362283810973, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 515.078125, "completions/min_length": 270.5, "epoch": 0.07821666014861166, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.0020684354472905397, "learning_rate": 2.9818285721186696e-06, "loss": 4.748038481920958e-05, "reward": 0.3546324670314789, "reward_std": 0.31251098215579987, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3546324819326401, "rewards/QAReward/std": 0.4957164227962494, "step": 400 }, { "clip_ratio/high_max": 0.00034943055361509324, "clip_ratio/high_mean": 0.00017243340262211858, "clip_ratio/low_mean": 4.846860974794254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022090200800448657, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 527.83203125, "completions/min_length": 230.33333333333334, "epoch": 0.0791943684004693, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8515625, "kl": 0.001963405543938279, "learning_rate": 2.98108396152435e-06, "loss": 9.16824908927083e-05, "reward": 0.3374364674091339, "reward_std": 0.2969689468542735, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.337436447540919, "rewards/QAReward/std": 0.46700002749760944, "step": 405 }, { "clip_ratio/high_max": 0.0005422988091595471, "clip_ratio/high_mean": 0.00025214682100340726, "clip_ratio/low_mean": 8.20818546344526e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003342286683619022, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 524.48046875, "completions/min_length": 271.0, "epoch": 0.08017207665232695, "frac_reward_zero_std": 0.046875, "grad_norm": 0.79296875, "kl": 0.0020670751109719276, "learning_rate": 2.9803244965094166e-06, "loss": 5.433866172097623e-05, "reward": 0.3954845666885376, "reward_std": 0.269412562251091, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3954845517873764, "rewards/QAReward/std": 0.4278332591056824, "step": 410 }, { "clip_ratio/high_max": 0.00040932291885837914, "clip_ratio/high_mean": 0.00019085683161392807, "clip_ratio/low_mean": 3.6176010326016694e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022703284630551935, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 520.41015625, "completions/min_length": 231.66666666666666, "epoch": 0.08114978490418459, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8203125, "kl": 0.0020641277078539133, "learning_rate": 2.9795501846908654e-06, "loss": 8.51099845021963e-05, "reward": 0.3084728419780731, "reward_std": 0.2781275113423665, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3084728519121806, "rewards/QAReward/std": 0.4601224561532338, "step": 415 }, { "clip_ratio/high_max": 0.0005607949686236679, "clip_ratio/high_mean": 0.00028254612698219717, "clip_ratio/low_mean": 7.781500098644756e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036036111414432525, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 518.6875, "completions/min_length": 251.5, "epoch": 0.08212749315604223, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.002155265957117081, "learning_rate": 2.9787610338345995e-06, "loss": 4.60147304693237e-05, "reward": 0.3250139355659485, "reward_std": 0.28139355778694153, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3250139504671097, "rewards/QAReward/std": 0.42099951207637787, "step": 420 }, { "clip_ratio/high_max": 0.00048141712322831155, "clip_ratio/high_mean": 0.00019367055501788854, "clip_ratio/low_mean": 6.170918932184577e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002553797559812665, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 542.4765625, "completions/min_length": 237.0, "epoch": 0.08310520140789988, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.84765625, "kl": 0.0020353679778054357, "learning_rate": 2.9779570518553477e-06, "loss": 8.650352829135955e-05, "reward": 0.36794447898864746, "reward_std": 0.2843100229899089, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36794446905454, "rewards/QAReward/std": 0.4336024920145671, "step": 425 }, { "clip_ratio/high_max": 0.0005391911370679736, "clip_ratio/high_mean": 0.00029169689514674246, "clip_ratio/low_mean": 6.039327854523435e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003520901664160192, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/mean_length": 536.083984375, "completions/min_length": 248.5, "epoch": 0.08408290965975752, "frac_reward_zero_std": 0.015625, "grad_norm": 0.87890625, "kl": 0.002017058525234461, "learning_rate": 2.977138246816588e-06, "loss": 0.00010031766723841428, "reward": 0.3163899779319763, "reward_std": 0.2755403071641922, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3163899779319763, "rewards/QAReward/std": 0.498916357755661, "step": 430 }, { "clip_ratio/high_max": 0.0002602844964712858, "clip_ratio/high_mean": 0.00016815796261653303, "clip_ratio/low_mean": 4.818374873138964e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021634171134792267, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 522.8385416666666, "completions/min_length": 239.0, "epoch": 0.08506061791161518, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.82421875, "kl": 0.0021131239831447602, "learning_rate": 2.9763046269304634e-06, "loss": 0.00011570673668757082, "reward": 0.3123718698819478, "reward_std": 0.28766363859176636, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3123718698819478, "rewards/QAReward/std": 0.44475991527239483, "step": 435 }, { "clip_ratio/high_max": 0.00043757708044722676, "clip_ratio/high_mean": 0.00022844619234092535, "clip_ratio/low_mean": 8.054038917180151e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000308986590243876, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 519.255859375, "completions/min_length": 231.0, "epoch": 0.08603832616347282, "frac_reward_zero_std": 0.078125, "grad_norm": 0.80859375, "kl": 0.0021863482892513276, "learning_rate": 2.9754562005577022e-06, "loss": 0.00013220836408436298, "reward": 0.4593610465526581, "reward_std": 0.28819575905799866, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4593610614538193, "rewards/QAReward/std": 0.45233795046806335, "step": 440 }, { "clip_ratio/high_max": 0.00041755178244784474, "clip_ratio/high_mean": 0.00017790257115848362, "clip_ratio/low_mean": 5.3979117365088317e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023188168415799737, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 527.91015625, "completions/min_length": 257.6666666666667, "epoch": 0.08701603441533047, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.80078125, "kl": 0.0021770766470581293, "learning_rate": 2.974592976207533e-06, "loss": 6.588537944480777e-05, "reward": 0.3582046826680501, "reward_std": 0.3087433874607086, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3582046727339427, "rewards/QAReward/std": 0.42953739563624066, "step": 445 }, { "clip_ratio/high_max": 0.0004682058119215071, "clip_ratio/high_mean": 0.00025825575576163826, "clip_ratio/low_mean": 8.413409523200244e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034238984808325766, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 513.36328125, "completions/min_length": 222.5, "epoch": 0.08799374266718811, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84765625, "kl": 0.0021895467769354584, "learning_rate": 2.9737149625376007e-06, "loss": 0.00016124406829476357, "reward": 0.31552810966968536, "reward_std": 0.28738151490688324, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31552810966968536, "rewards/QAReward/std": 0.42980828881263733, "step": 450 }, { "clip_ratio/high_max": 0.0003990645753219724, "clip_ratio/high_mean": 0.00016882603522390128, "clip_ratio/low_mean": 6.196554459165781e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023079157108440995, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 537.33203125, "completions/min_length": 232.33333333333334, "epoch": 0.08897145091904576, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.002024461282417178, "learning_rate": 2.9728221683538765e-06, "loss": 3.025783225893974e-05, "reward": 0.31748202443122864, "reward_std": 0.3017444809277852, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31748201449712116, "rewards/QAReward/std": 0.4659950037797292, "step": 455 }, { "clip_ratio/high_max": 0.00041153295896947386, "clip_ratio/high_mean": 0.00022500330815091729, "clip_ratio/low_mean": 8.553526131436229e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003105385636445135, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 533.53515625, "completions/min_length": 249.0, "epoch": 0.0899491591709034, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.002082060556858778, "learning_rate": 2.9719146026105732e-06, "loss": 7.337721181102096e-05, "reward": 0.33864009380340576, "reward_std": 0.3065963238477707, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33864009380340576, "rewards/QAReward/std": 0.4540373682975769, "step": 460 }, { "clip_ratio/high_max": 0.00036295580212026836, "clip_ratio/high_mean": 0.00017517414526082575, "clip_ratio/low_mean": 4.088271234650165e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021605686051771045, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 523.87890625, "completions/min_length": 250.0, "epoch": 0.09092686742276104, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.002087195171043277, "learning_rate": 2.9709922744100535e-06, "loss": 0.00011708575766533613, "reward": 0.3711319863796234, "reward_std": 0.2940832773844401, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3711319963137309, "rewards/QAReward/std": 0.4542688727378845, "step": 465 }, { "clip_ratio/high_max": 0.00048177570570260284, "clip_ratio/high_mean": 0.00025879747117869557, "clip_ratio/low_mean": 4.749475629068911e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030629223911091683, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 537.08984375, "completions/min_length": 247.5, "epoch": 0.0919045756746187, "frac_reward_zero_std": 0.015625, "grad_norm": 0.81640625, "kl": 0.0020891576539725067, "learning_rate": 2.970055193002739e-06, "loss": 7.503859233111143e-05, "reward": 0.32493259012699127, "reward_std": 0.28843073546886444, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3249325752258301, "rewards/QAReward/std": 0.45634840428829193, "step": 470 }, { "clip_ratio/high_max": 0.00032001992221921685, "clip_ratio/high_mean": 0.00015759814996272326, "clip_ratio/low_mean": 4.401552869239822e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020161367137916386, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 532.4088541666666, "completions/min_length": 254.0, "epoch": 0.09288228392647634, "frac_reward_zero_std": 0.03125, "grad_norm": 0.828125, "kl": 0.0021110848523676396, "learning_rate": 2.9691033677870165e-06, "loss": 0.00011368205305188895, "reward": 0.32531415422757465, "reward_std": 0.2699113190174103, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3253141591946284, "rewards/QAReward/std": 0.4366338849067688, "step": 475 }, { "clip_ratio/high_max": 0.0004893531673587859, "clip_ratio/high_mean": 0.0002492470142897218, "clip_ratio/low_mean": 7.754575490253046e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003267927502747625, "completions/clipped_ratio": 0.052734375, "completions/max_length": 1024.0, "completions/mean_length": 537.80078125, "completions/min_length": 220.5, "epoch": 0.09385999217833399, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.002111182687804103, "learning_rate": 2.9681368083091472e-06, "loss": 0.00010364674963057042, "reward": 0.3133556544780731, "reward_std": 0.257010743021965, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3133556693792343, "rewards/QAReward/std": 0.473889097571373, "step": 480 }, { "clip_ratio/high_max": 0.00037525681545957923, "clip_ratio/high_mean": 0.00019137247581966222, "clip_ratio/low_mean": 3.8772248080931605e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023014473263174295, "completions/clipped_ratio": 0.041666666666666664, "completions/max_length": 1024.0, "completions/mean_length": 539.8971354166666, "completions/min_length": 240.33333333333334, "epoch": 0.09483770043019163, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.796875, "kl": 0.0021941117942333222, "learning_rate": 2.967155524263166e-06, "loss": 0.00011870814487338066, "reward": 0.3866102298100789, "reward_std": 0.28109365701675415, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3866102397441864, "rewards/QAReward/std": 0.4314187268416087, "step": 485 }, { "clip_ratio/high_max": 0.00047559946542605756, "clip_ratio/high_mean": 0.00022939799819141627, "clip_ratio/low_mean": 7.846434018574656e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003078623441979289, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1024.0, "completions/mean_length": 538.982421875, "completions/min_length": 248.5, "epoch": 0.09581540868204928, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8203125, "kl": 0.0021865266375243665, "learning_rate": 2.9661595254907884e-06, "loss": 3.236140473745763e-05, "reward": 0.3542618006467819, "reward_std": 0.28122323751449585, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3542617857456207, "rewards/QAReward/std": 0.42857733368873596, "step": 490 }, { "clip_ratio/high_max": 0.00032197299879044294, "clip_ratio/high_mean": 0.00016032237326726317, "clip_ratio/low_mean": 6.0880350065417585e-05, "clip_ratio/low_min": 2.1917808044236154e-05, "clip_ratio/region_mean": 0.0002212027320638299, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 539.2486979166666, "completions/min_length": 253.0, "epoch": 0.09679311693390692, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.7578125, "kl": 0.0022099025547504423, "learning_rate": 2.9651488219813086e-06, "loss": 0.0001030614017508924, "reward": 0.38783949613571167, "reward_std": 0.25425126155217487, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38783947626749676, "rewards/QAReward/std": 0.3912319839000702, "step": 495 }, { "clip_ratio/high_max": 0.0004730363027192652, "clip_ratio/high_mean": 0.00022171238670125603, "clip_ratio/low_mean": 8.620296866865829e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030791535973548887, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 511.458984375, "completions/min_length": 225.0, "epoch": 0.09777082518576456, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8359375, "kl": 0.0023590422235429285, "learning_rate": 2.9641234238715025e-06, "loss": 0.00011315098963677883, "reward": 0.3850446492433548, "reward_std": 0.29779286682605743, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3850446492433548, "rewards/QAReward/std": 0.45281919836997986, "step": 500 }, { "clip_ratio/high_max": 0.0003560552024282515, "clip_ratio/high_mean": 0.0001874975161626935, "clip_ratio/low_mean": 4.9740268150344494e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002372377784922719, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 986.3333333333334, "completions/mean_length": 506.45703125, "completions/min_length": 245.0, "epoch": 0.09874853343762222, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.82421875, "kl": 0.0023576607927680016, "learning_rate": 2.963083341445523e-06, "loss": 9.332572808489204e-05, "reward": 0.38681452473004657, "reward_std": 0.27443426847457886, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38681453466415405, "rewards/QAReward/std": 0.4509049554665883, "step": 505 }, { "clip_ratio/high_max": 0.0005503524793311953, "clip_ratio/high_mean": 0.0002552241901867092, "clip_ratio/low_mean": 9.380720148328691e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000349031388759613, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 551.091796875, "completions/min_length": 245.0, "epoch": 0.09972624168947986, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.002279928745701909, "learning_rate": 2.962028585134799e-06, "loss": 0.00017228389624506234, "reward": 0.30327148735523224, "reward_std": 0.28660623729228973, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30327148735523224, "rewards/QAReward/std": 0.453860267996788, "step": 510 }, { "clip_ratio/high_max": 0.00040781700517982243, "clip_ratio/high_mean": 0.00017290760297328234, "clip_ratio/low_mean": 6.612828583456576e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023903588298708202, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 544.8671875, "completions/min_length": 246.66666666666666, "epoch": 0.10070394994133751, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8046875, "kl": 0.002296267403289676, "learning_rate": 2.9609591655179303e-06, "loss": 0.00014649503864347935, "reward": 0.2728515565395355, "reward_std": 0.3066355586051941, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2728515565395355, "rewards/QAReward/std": 0.46818673610687256, "step": 515 }, { "clip_ratio/high_max": 0.0005006002727895975, "clip_ratio/high_mean": 0.00023747007944621145, "clip_ratio/low_mean": 9.32647890294902e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033073486993089317, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 539.8125, "completions/min_length": 242.0, "epoch": 0.10168165819319515, "frac_reward_zero_std": 0.046875, "grad_norm": 0.73828125, "kl": 0.002295937854796648, "learning_rate": 2.9598750933205804e-06, "loss": 5.909251049160957e-05, "reward": 0.39192475378513336, "reward_std": 0.28069189190864563, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39192475378513336, "rewards/QAReward/std": 0.4672185033559799, "step": 520 }, { "clip_ratio/high_max": 0.00031258598901331427, "clip_ratio/high_mean": 0.00015590335824526845, "clip_ratio/low_mean": 3.914340486517176e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019504677038639784, "completions/clipped_ratio": 0.041666666666666664, "completions/max_length": 1024.0, "completions/mean_length": 551.7981770833334, "completions/min_length": 234.0, "epoch": 0.1026593664450528, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.00228310776874423, "learning_rate": 2.9587763794153705e-06, "loss": 0.00012830172199755906, "reward": 0.33885271350542706, "reward_std": 0.298476775487264, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33885271350542706, "rewards/QAReward/std": 0.4209779401620229, "step": 525 }, { "clip_ratio/high_max": 0.0005077763576991856, "clip_ratio/high_mean": 0.00025401373859494923, "clip_ratio/low_mean": 5.3779622976435346e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030779336229898037, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 531.962890625, "completions/min_length": 241.5, "epoch": 0.10363707469691044, "frac_reward_zero_std": 0.015625, "grad_norm": 0.78125, "kl": 0.0022962348069995643, "learning_rate": 2.9576630348217685e-06, "loss": 9.075198322534561e-05, "reward": 0.3980585187673569, "reward_std": 0.2733176648616791, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3980585038661957, "rewards/QAReward/std": 0.47966407239437103, "step": 530 }, { "clip_ratio/high_max": 0.000342744926456362, "clip_ratio/high_mean": 0.00015752646140754223, "clip_ratio/low_mean": 6.016865954734385e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002176951093133539, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 515.5247395833334, "completions/min_length": 226.33333333333334, "epoch": 0.10461478294876808, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.7734375, "kl": 0.0022941336967051027, "learning_rate": 2.956535070705982e-06, "loss": 0.0001596100628376007, "reward": 0.3453187048435211, "reward_std": 0.2718038558959961, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3453187048435211, "rewards/QAReward/std": 0.4729204972585042, "step": 535 }, { "clip_ratio/high_max": 0.0004362393927294761, "clip_ratio/high_mean": 0.00027271175058558584, "clip_ratio/low_mean": 5.5994893773458895e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003287066356278956, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 506.82421875, "completions/min_length": 247.5, "epoch": 0.10559249120062573, "frac_reward_zero_std": 0.0625, "grad_norm": 0.796875, "kl": 0.002372738206759095, "learning_rate": 2.9553924983808415e-06, "loss": 0.000173167884349823, "reward": 0.3572637587785721, "reward_std": 0.289211243391037, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3572637587785721, "rewards/QAReward/std": 0.45352108776569366, "step": 540 }, { "clip_ratio/high_max": 0.0003233369323424995, "clip_ratio/high_mean": 0.00017554530058987438, "clip_ratio/low_mean": 2.8864146588603035e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002044094493612647, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 516.78515625, "completions/min_length": 219.66666666666666, "epoch": 0.10657019945248337, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78125, "kl": 0.0024338227696716784, "learning_rate": 2.9542353293056906e-06, "loss": 9.84331825748086e-05, "reward": 0.39755653341611225, "reward_std": 0.28545599182446796, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3975565234820048, "rewards/QAReward/std": 0.441084881623586, "step": 545 }, { "clip_ratio/high_max": 0.0005499540362507105, "clip_ratio/high_mean": 0.0003071936662308872, "clip_ratio/low_mean": 7.609973836224526e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038329340750351546, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 528.12109375, "completions/min_length": 252.5, "epoch": 0.10754790770434103, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.0022667938843369483, "learning_rate": 2.9530635750862706e-06, "loss": 2.6867128326557577e-05, "reward": 0.372577965259552, "reward_std": 0.29600565135478973, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.372577965259552, "rewards/QAReward/std": 0.4304935783147812, "step": 550 }, { "clip_ratio/high_max": 0.0003672841703519225, "clip_ratio/high_mean": 0.00015828605392016472, "clip_ratio/low_mean": 6.056911952327937e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021885517053306104, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 539.4127604166666, "completions/min_length": 283.3333333333333, "epoch": 0.10852561595619867, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.76953125, "kl": 0.00221448983065784, "learning_rate": 2.951877247474602e-06, "loss": 8.155513787642122e-05, "reward": 0.3860444625218709, "reward_std": 0.2814856866995494, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3860444823900859, "rewards/QAReward/std": 0.4102632999420166, "step": 555 }, { "clip_ratio/high_max": 0.0004912075353786349, "clip_ratio/high_mean": 0.00026915142661891877, "clip_ratio/low_mean": 9.100033494178206e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003601517644710839, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 541.447265625, "completions/min_length": 280.5, "epoch": 0.10950332420805632, "frac_reward_zero_std": 0.015625, "grad_norm": 0.79296875, "kl": 0.0022263114340603353, "learning_rate": 2.950676358368869e-06, "loss": 0.00015949623193591833, "reward": 0.31662172079086304, "reward_std": 0.30223822593688965, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31662172079086304, "rewards/QAReward/std": 0.4408811330795288, "step": 560 }, { "clip_ratio/high_max": 0.00032405111705884336, "clip_ratio/high_mean": 0.00016519209602847696, "clip_ratio/low_mean": 4.323323228163645e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020842532394453882, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 550.3606770833334, "completions/min_length": 240.33333333333334, "epoch": 0.11048103245991396, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.82421875, "kl": 0.0022715011611580847, "learning_rate": 2.9494609198132983e-06, "loss": 0.00011570313945412636, "reward": 0.31465205550193787, "reward_std": 0.2804287274678548, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31465205550193787, "rewards/QAReward/std": 0.4403270383675893, "step": 565 }, { "clip_ratio/high_max": 0.0005414953804574907, "clip_ratio/high_mean": 0.00029949386953376234, "clip_ratio/low_mean": 7.171909819589928e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037121297209523617, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 533.97265625, "completions/min_length": 261.5, "epoch": 0.1114587407117716, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0022161758970469236, "learning_rate": 2.9482309439980404e-06, "loss": 3.294477646704763e-05, "reward": 0.329043447971344, "reward_std": 0.2987218499183655, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3290434330701828, "rewards/QAReward/std": 0.46389444172382355, "step": 570 }, { "clip_ratio/high_max": 0.0003307290258817375, "clip_ratio/high_mean": 0.0001660423178691417, "clip_ratio/low_mean": 8.587207121308893e-05, "clip_ratio/low_min": 1.9629011512733995e-05, "clip_ratio/region_mean": 0.0002519143861718476, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.3190104166666, "completions/min_length": 254.33333333333334, "epoch": 0.11243644896362925, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76953125, "kl": 0.0022551337257027625, "learning_rate": 2.9469864432590455e-06, "loss": 0.00011798120103776455, "reward": 0.2864888310432434, "reward_std": 0.26506828268369037, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2864888211091359, "rewards/QAReward/std": 0.4353450934092204, "step": 575 }, { "clip_ratio/high_max": 0.0004712329595349729, "clip_ratio/high_mean": 0.0002667003544047475, "clip_ratio/low_mean": 6.265685951802879e-05, "clip_ratio/low_min": 2.179123985115439e-05, "clip_ratio/region_mean": 0.0003293572226539254, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 527.953125, "completions/min_length": 217.5, "epoch": 0.1134141572154869, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7578125, "kl": 0.0022976955864578486, "learning_rate": 2.9457274300779404e-06, "loss": 0.00015609164256602526, "reward": 0.4012772887945175, "reward_std": 0.25653063505887985, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4012772887945175, "rewards/QAReward/std": 0.4311900734901428, "step": 580 }, { "clip_ratio/high_max": 0.0003352689556777477, "clip_ratio/high_mean": 0.0001754030818119645, "clip_ratio/low_mean": 6.613015284528956e-05, "clip_ratio/low_min": 1.9552253070287408e-05, "clip_ratio/region_mean": 0.0002415332361124456, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 525.3645833333334, "completions/min_length": 238.66666666666666, "epoch": 0.11439186546734455, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.80078125, "kl": 0.002300409320741892, "learning_rate": 2.9444539170819034e-06, "loss": 0.00012840385315939784, "reward": 0.318158894777298, "reward_std": 0.3025158445040385, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31815890471140545, "rewards/QAReward/std": 0.46006999413172406, "step": 585 }, { "clip_ratio/high_max": 0.0005577659467235208, "clip_ratio/high_mean": 0.00028751627542078495, "clip_ratio/low_mean": 8.502237033098936e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037253862828947606, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 518.939453125, "completions/min_length": 233.0, "epoch": 0.11536957371920219, "frac_reward_zero_std": 0.046875, "grad_norm": 0.796875, "kl": 0.0023243166506290436, "learning_rate": 2.943165917043538e-06, "loss": 0.00012734195915982128, "reward": 0.3870667368173599, "reward_std": 0.27198562026023865, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3870667517185211, "rewards/QAReward/std": 0.4706833213567734, "step": 590 }, { "clip_ratio/high_max": 0.0004405945306643844, "clip_ratio/high_mean": 0.00019274525111541153, "clip_ratio/low_mean": 4.168710729572922e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002344323555007577, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 551.5325520833334, "completions/min_length": 238.33333333333334, "epoch": 0.11634728197105984, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.81640625, "kl": 0.002295883931219578, "learning_rate": 2.9418634428807435e-06, "loss": 5.483963177539408e-05, "reward": 0.3179057190815608, "reward_std": 0.2962558964888255, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3179057190815608, "rewards/QAReward/std": 0.4660305678844452, "step": 595 }, { "clip_ratio/high_max": 0.0006452300935052335, "clip_ratio/high_mean": 0.0003167148446664214, "clip_ratio/low_mean": 7.107395940693095e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038778880843892694, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 524.474609375, "completions/min_length": 225.5, "epoch": 0.11732499022291748, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7578125, "kl": 0.002408105693757534, "learning_rate": 2.9405465076565876e-06, "loss": 0.000172391370870173, "reward": 0.3435221463441849, "reward_std": 0.2634137123823166, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3435221314430237, "rewards/QAReward/std": 0.48144303262233734, "step": 600 }, { "clip_ratio/high_max": 0.00032660284778103233, "clip_ratio/high_mean": 0.00016179757658392192, "clip_ratio/low_mean": 4.989908193238079e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002116966643370688, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 526.2174479166666, "completions/min_length": 248.0, "epoch": 0.11830269847477513, "frac_reward_zero_std": 0.03125, "grad_norm": 0.765625, "kl": 0.002351461490616202, "learning_rate": 2.9392151245791727e-06, "loss": 8.530248887836934e-05, "reward": 0.3481869002183278, "reward_std": 0.28292057911554974, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3481869101524353, "rewards/QAReward/std": 0.4324797789255778, "step": 605 }, { "clip_ratio/high_max": 0.00039284147787839175, "clip_ratio/high_mean": 0.00025288198376074435, "clip_ratio/low_mean": 6.853333761682733e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000321415311191231, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 517.2578125, "completions/min_length": 225.5, "epoch": 0.11928040672663277, "frac_reward_zero_std": 0.078125, "grad_norm": 0.77734375, "kl": 0.0024663155898451807, "learning_rate": 2.9378693070015065e-06, "loss": 8.783287485130132e-05, "reward": 0.39404402673244476, "reward_std": 0.26017989218235016, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39404404163360596, "rewards/QAReward/std": 0.46086740493774414, "step": 610 }, { "clip_ratio/high_max": 0.00024940032744780183, "clip_ratio/high_mean": 0.0001499630161561072, "clip_ratio/low_mean": 2.849449228961021e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017845749971456826, "completions/clipped_ratio": 0.044270833333333336, "completions/max_length": 1024.0, "completions/mean_length": 543.81640625, "completions/min_length": 246.33333333333334, "epoch": 0.12025811497849041, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8828125, "kl": 0.0024074978195130826, "learning_rate": 2.936509068421365e-06, "loss": 0.00014891652390360832, "reward": 0.3626725971698761, "reward_std": 0.28846657276153564, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3626725872357686, "rewards/QAReward/std": 0.44391360878944397, "step": 615 }, { "clip_ratio/high_max": 0.0004946506349369883, "clip_ratio/high_mean": 0.00026826068642549217, "clip_ratio/low_mean": 6.838412373326719e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033664481597952547, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 530.314453125, "completions/min_length": 243.0, "epoch": 0.12123582323034807, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.002447653142735362, "learning_rate": 2.935134422481159e-06, "loss": 3.5097624640911816e-05, "reward": 0.3172541558742523, "reward_std": 0.3025584816932678, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3172541707754135, "rewards/QAReward/std": 0.41422924399375916, "step": 620 }, { "clip_ratio/high_max": 0.00036205766955390575, "clip_ratio/high_mean": 0.00019455134170129896, "clip_ratio/low_mean": 3.9251369889825584e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023380271159112453, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 519.0442708333334, "completions/min_length": 256.6666666666667, "epoch": 0.1222135314822057, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.7890625, "kl": 0.0024739947635680436, "learning_rate": 2.933745382967797e-06, "loss": 0.00011275178985670209, "reward": 0.35127943754196167, "reward_std": 0.279291366537412, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35127945244312286, "rewards/QAReward/std": 0.4474065999190013, "step": 625 }, { "clip_ratio/high_max": 0.00045289339032024146, "clip_ratio/high_mean": 0.000256777013419196, "clip_ratio/low_mean": 0.00010895548621192575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003657325054518878, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 531.1796875, "completions/min_length": 242.0, "epoch": 0.12319123973406336, "frac_reward_zero_std": 0.015625, "grad_norm": 0.87890625, "kl": 0.002469181036576629, "learning_rate": 2.9323419638125474e-06, "loss": 0.00018513004761189222, "reward": 0.38334573805332184, "reward_std": 0.2962743937969208, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38334572315216064, "rewards/QAReward/std": 0.42632828652858734, "step": 630 }, { "clip_ratio/high_max": 0.0002600791398435831, "clip_ratio/high_mean": 0.0001284496975131333, "clip_ratio/low_mean": 5.477250670082867e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018322220421396196, "completions/clipped_ratio": 0.02734375, "completions/max_length": 970.0, "completions/mean_length": 520.7630208333334, "completions/min_length": 228.66666666666666, "epoch": 0.124168947985921, "frac_reward_zero_std": 0.0625, "grad_norm": 0.86328125, "kl": 0.0025809881277382373, "learning_rate": 2.9309241790908967e-06, "loss": 0.00011895562056452036, "reward": 0.3256029983361562, "reward_std": 0.2849144438902537, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32560298840204877, "rewards/QAReward/std": 0.4764625032742818, "step": 635 }, { "clip_ratio/high_max": 0.00037814242532476785, "clip_ratio/high_mean": 0.00024328248109668494, "clip_ratio/low_mean": 8.347079565282912e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032675327965989707, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 534.578125, "completions/min_length": 243.0, "epoch": 0.12514665623777865, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7890625, "kl": 0.0025107665453106163, "learning_rate": 2.929492043022411e-06, "loss": 9.384042932651937e-05, "reward": 0.33213432133197784, "reward_std": 0.28481119871139526, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33213433623313904, "rewards/QAReward/std": 0.4432569742202759, "step": 640 }, { "clip_ratio/high_max": 0.0003220280748791993, "clip_ratio/high_mean": 0.00015518334694206715, "clip_ratio/low_mean": 4.303200112190098e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019821535097435116, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1024.0, "completions/mean_length": 550.53125, "completions/min_length": 247.0, "epoch": 0.1261243644896363, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.78515625, "kl": 0.002422858541831374, "learning_rate": 2.928045569970591e-06, "loss": 2.7721107471734285e-05, "reward": 0.28428974747657776, "reward_std": 0.2778117557366689, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2842897375424703, "rewards/QAReward/std": 0.43526532252629596, "step": 645 }, { "clip_ratio/high_max": 0.0004567356314510107, "clip_ratio/high_mean": 0.0002728649764321744, "clip_ratio/low_mean": 8.03827919298783e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003532477712724358, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 540.248046875, "completions/min_length": 213.5, "epoch": 0.12710207274149393, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0025233123917132614, "learning_rate": 2.9265847744427307e-06, "loss": 0.00011664838530123234, "reward": 0.379561185836792, "reward_std": 0.2793487459421158, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3795611560344696, "rewards/QAReward/std": 0.4387076050043106, "step": 650 }, { "clip_ratio/high_max": 0.00028264372958801686, "clip_ratio/high_mean": 0.00013859325554221869, "clip_ratio/low_mean": 6.286978314165025e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000201463041594252, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 534.51953125, "completions/min_length": 255.66666666666666, "epoch": 0.12807978099335157, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0025481430813670157, "learning_rate": 2.9251096710897683e-06, "loss": 0.0001451093005016446, "reward": 0.3811270296573639, "reward_std": 0.2874213755130768, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3811270296573639, "rewards/QAReward/std": 0.43858132759730023, "step": 655 }, { "clip_ratio/high_max": 0.00043636378832161425, "clip_ratio/high_mean": 0.00023861246299929916, "clip_ratio/low_mean": 8.499561809003354e-05, "clip_ratio/low_min": 1.644466392463073e-05, "clip_ratio/region_mean": 0.00032360808691009877, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 541.875, "completions/min_length": 249.5, "epoch": 0.12905748924520924, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8125, "kl": 0.0025107369758188723, "learning_rate": 2.9236202747061433e-06, "loss": 5.014004418626428e-05, "reward": 0.4041759669780731, "reward_std": 0.29487523436546326, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4041759669780731, "rewards/QAReward/std": 0.4153124988079071, "step": 660 }, { "clip_ratio/high_max": 0.0003453765995800495, "clip_ratio/high_mean": 0.0002005379181355238, "clip_ratio/low_mean": 4.754517285618931e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024808308808133006, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 519.625, "completions/min_length": 247.66666666666666, "epoch": 0.13003519749706688, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.82421875, "kl": 0.0025300010573118927, "learning_rate": 2.922116600229645e-06, "loss": 0.00012053452664986252, "reward": 0.3694418668746948, "reward_std": 0.28334476550420123, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3694418668746948, "rewards/QAReward/std": 0.45911434292793274, "step": 665 }, { "clip_ratio/high_max": 0.00044662655564025043, "clip_ratio/high_mean": 0.0002457114227581769, "clip_ratio/low_mean": 0.00010377285652793944, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003494842676445842, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 531.4609375, "completions/min_length": 257.0, "epoch": 0.13101290574892452, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.002566836215555668, "learning_rate": 2.9205986627412636e-06, "loss": 0.00010560688097029924, "reward": 0.4036698639392853, "reward_std": 0.28061237931251526, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40366987884044647, "rewards/QAReward/std": 0.423944890499115, "step": 670 }, { "clip_ratio/high_max": 0.0003656473709270358, "clip_ratio/high_mean": 0.00021690765861421825, "clip_ratio/low_mean": 4.6214452595449984e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026312211994081733, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 518.81640625, "completions/min_length": 247.33333333333334, "epoch": 0.13199061400078216, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.78125, "kl": 0.002727434691041708, "learning_rate": 2.9190664774650397e-06, "loss": 7.368785445578396e-05, "reward": 0.3989521265029907, "reward_std": 0.26887038350105286, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3989521364370982, "rewards/QAReward/std": 0.40024229884147644, "step": 675 }, { "clip_ratio/high_max": 0.0005500661674886942, "clip_ratio/high_mean": 0.00025326615432277324, "clip_ratio/low_mean": 0.00010987290006596595, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036313905147835614, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 522.05859375, "completions/min_length": 245.5, "epoch": 0.13296832225263983, "frac_reward_zero_std": 0.015625, "grad_norm": 0.80859375, "kl": 0.0026818458456546066, "learning_rate": 2.9175200597679104e-06, "loss": 6.119172903709113e-05, "reward": 0.40567725896835327, "reward_std": 0.2585034817457199, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40567725896835327, "rewards/QAReward/std": 0.4176330715417862, "step": 680 }, { "clip_ratio/high_max": 0.00029115250799804926, "clip_ratio/high_mean": 0.00015562548651359975, "clip_ratio/low_mean": 3.6132603418082e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019175808993168175, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 571.3346354166666, "completions/min_length": 260.6666666666667, "epoch": 0.13394603050449747, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.76953125, "kl": 0.0024846422020345926, "learning_rate": 2.915959425159556e-06, "loss": 0.0001521088182926178, "reward": 0.3798590401808421, "reward_std": 0.293532798687617, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3798590501149495, "rewards/QAReward/std": 0.4481613834698995, "step": 685 }, { "clip_ratio/high_max": 0.00048541646683588626, "clip_ratio/high_mean": 0.0002695739676710218, "clip_ratio/low_mean": 7.114887484931387e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003407228272408247, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1024.0, "completions/mean_length": 552.96875, "completions/min_length": 245.0, "epoch": 0.1349237387563551, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8046875, "kl": 0.0025441261008381844, "learning_rate": 2.914384589292245e-06, "loss": 0.00016928629484027624, "reward": 0.3937973082065582, "reward_std": 0.2885735183954239, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39379726350307465, "rewards/QAReward/std": 0.4170465022325516, "step": 690 }, { "clip_ratio/high_max": 0.00025589353172108533, "clip_ratio/high_mean": 0.0001400622248183936, "clip_ratio/low_mean": 6.131280388217419e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020137501996941864, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 527.84765625, "completions/min_length": 243.66666666666666, "epoch": 0.13590144700821274, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8046875, "kl": 0.002765376074239612, "learning_rate": 2.9127955679606747e-06, "loss": 0.00013820442836731672, "reward": 0.34690239032109577, "reward_std": 0.29208047191301983, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34690239032109577, "rewards/QAReward/std": 0.45621851086616516, "step": 695 }, { "clip_ratio/high_max": 0.0004217173554934561, "clip_ratio/high_mean": 0.00023047392023727297, "clip_ratio/low_mean": 5.5375111696776005e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002858490217477083, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 539.326171875, "completions/min_length": 238.0, "epoch": 0.13687915526007038, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.002625506883487105, "learning_rate": 2.9111923771018167e-06, "loss": 0.00011327113024890423, "reward": 0.3719106465578079, "reward_std": 0.2674054652452469, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3719106614589691, "rewards/QAReward/std": 0.4362659454345703, "step": 700 }, { "clip_ratio/high_max": 0.0003190278774127364, "clip_ratio/high_mean": 0.0001678028318565339, "clip_ratio/low_mean": 6.001232250127941e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022781515726819635, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 541.4049479166666, "completions/min_length": 268.3333333333333, "epoch": 0.13785686351192805, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.002604367909952998, "learning_rate": 2.909575032794753e-06, "loss": 0.00011723516508936882, "reward": 0.34353246291478473, "reward_std": 0.3155005971590678, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3435324827829997, "rewards/QAReward/std": 0.43806209166844684, "step": 705 }, { "clip_ratio/high_max": 0.0004237129120156169, "clip_ratio/high_mean": 0.0002470939187332988, "clip_ratio/low_mean": 8.678873418830336e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033388265874236824, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 540.453125, "completions/min_length": 238.0, "epoch": 0.1388345717637857, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0026281938422471284, "learning_rate": 2.907943551260517e-06, "loss": 0.00010637397645041347, "reward": 0.3943080008029938, "reward_std": 0.2659175992012024, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3943079560995102, "rewards/QAReward/std": 0.42777077853679657, "step": 710 }, { "clip_ratio/high_max": 0.00025893591227941213, "clip_ratio/high_mean": 0.00013497075415216386, "clip_ratio/low_mean": 3.40255573973991e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016899631591513753, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 946.6666666666666, "completions/mean_length": 543.46484375, "completions/min_length": 256.0, "epoch": 0.13981228001564333, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.7578125, "kl": 0.002635808847844601, "learning_rate": 2.906297948861931e-06, "loss": 5.773576558567584e-05, "reward": 0.2904227375984192, "reward_std": 0.27614377935727435, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29042274753252667, "rewards/QAReward/std": 0.43450503547986347, "step": 715 }, { "clip_ratio/high_max": 0.00046869091456755994, "clip_ratio/high_mean": 0.0002379044541157782, "clip_ratio/low_mean": 6.560853071277962e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030351296882145105, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1024.0, "completions/mean_length": 540.93359375, "completions/min_length": 251.0, "epoch": 0.14078998826750097, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.0026079695206135513, "learning_rate": 2.9046382421034402e-06, "loss": 8.268830133602023e-05, "reward": 0.29507532715797424, "reward_std": 0.28438900411129, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29507534205913544, "rewards/QAReward/std": 0.45565474033355713, "step": 720 }, { "clip_ratio/high_max": 0.00030440917471423744, "clip_ratio/high_mean": 0.00017412551678717137, "clip_ratio/low_mean": 3.251432644901797e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020663983887061476, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 528.69921875, "completions/min_length": 225.0, "epoch": 0.1417676965193586, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.79296875, "kl": 0.0026768934447318317, "learning_rate": 2.9029644476309496e-06, "loss": 7.290365174412727e-05, "reward": 0.4202060600121816, "reward_std": 0.2832299768924713, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4202060600121816, "rewards/QAReward/std": 0.41629313429196674, "step": 725 }, { "clip_ratio/high_max": 0.0003859093994833529, "clip_ratio/high_mean": 0.0002625824767164886, "clip_ratio/low_mean": 6.403774168575182e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003266202169470489, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 527.34765625, "completions/min_length": 234.0, "epoch": 0.14274540477121628, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.0026234570425003765, "learning_rate": 2.901276582231656e-06, "loss": 0.0001854259753599763, "reward": 0.3627117872238159, "reward_std": 0.2985486686229706, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3627117872238159, "rewards/QAReward/std": 0.49209366738796234, "step": 730 }, { "clip_ratio/high_max": 0.0002796151675283909, "clip_ratio/high_mean": 0.00016406587092205883, "clip_ratio/low_mean": 8.75673460541293e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025163320824503896, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 549.3359375, "completions/min_length": 266.3333333333333, "epoch": 0.14372311302307392, "frac_reward_zero_std": 0.03125, "grad_norm": 0.77734375, "kl": 0.0026639390271157026, "learning_rate": 2.899574662833877e-06, "loss": 0.00012752769980579616, "reward": 0.3591318726539612, "reward_std": 0.29017338156700134, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3591318726539612, "rewards/QAReward/std": 0.4552359879016876, "step": 735 }, { "clip_ratio/high_max": 0.00038576263468712566, "clip_ratio/high_mean": 0.00024717270280234517, "clip_ratio/low_mean": 5.833071045344695e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003055034205317497, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 570.169921875, "completions/min_length": 277.5, "epoch": 0.14470082127493156, "frac_reward_zero_std": 0.015625, "grad_norm": 0.765625, "kl": 0.0025558616034686565, "learning_rate": 2.8978587065068884e-06, "loss": 0.00014973374782130123, "reward": 0.3009757995605469, "reward_std": 0.28782975673675537, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3009757846593857, "rewards/QAReward/std": 0.4156697541475296, "step": 740 }, { "clip_ratio/high_max": 0.0003579303272999823, "clip_ratio/high_mean": 0.0001577846531290561, "clip_ratio/low_mean": 4.796531575266272e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020574997179210187, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 530.44921875, "completions/min_length": 219.66666666666666, "epoch": 0.1456785295267892, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0027174376882612705, "learning_rate": 2.896128730460744e-06, "loss": 9.621670469641685e-05, "reward": 0.33975230157375336, "reward_std": 0.29788826902707416, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33975230157375336, "rewards/QAReward/std": 0.45232181747754413, "step": 745 }, { "clip_ratio/high_max": 0.0004457814968191087, "clip_ratio/high_mean": 0.00024784314446151255, "clip_ratio/low_mean": 0.0001011388550978154, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034898199373856187, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 510.3671875, "completions/min_length": 261.5, "epoch": 0.14665623777864686, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84375, "kl": 0.0027571146376430987, "learning_rate": 2.894384752046109e-06, "loss": 0.0001737992512062192, "reward": 0.3109491318464279, "reward_std": 0.28364813327789307, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3109491318464279, "rewards/QAReward/std": 0.4162992835044861, "step": 750 }, { "clip_ratio/high_max": 0.00027732643065974115, "clip_ratio/high_mean": 0.00015368172898888588, "clip_ratio/low_mean": 5.2652356680482625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020633407984860241, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 544.359375, "completions/min_length": 257.3333333333333, "epoch": 0.1476339460305045, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.002709922706708312, "learning_rate": 2.8926267887540845e-06, "loss": 6.024042377248406e-05, "reward": 0.3231993516286214, "reward_std": 0.29497015476226807, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3231993516286214, "rewards/QAReward/std": 0.44812918702761334, "step": 755 }, { "clip_ratio/high_max": 0.0005010574939660728, "clip_ratio/high_mean": 0.0002922236453741789, "clip_ratio/low_mean": 8.305573137477041e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003752793942112476, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 533.640625, "completions/min_length": 258.5, "epoch": 0.14861165428236214, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.002823435282334685, "learning_rate": 2.890854858216031e-06, "loss": 0.00011248041409999132, "reward": 0.29793061316013336, "reward_std": 0.3302502781152725, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29793062806129456, "rewards/QAReward/std": 0.4685276746749878, "step": 760 }, { "clip_ratio/high_max": 0.00028396096313372255, "clip_ratio/high_mean": 0.00014396918704733251, "clip_ratio/low_mean": 4.180366959189996e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000185772858094424, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 525.3411458333334, "completions/min_length": 255.0, "epoch": 0.14958936253421978, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8046875, "kl": 0.002954332483932376, "learning_rate": 2.889068978203394e-06, "loss": 0.0001372610917314887, "reward": 0.3507900337378184, "reward_std": 0.2988939384619395, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35079004367192584, "rewards/QAReward/std": 0.4307340780893962, "step": 765 }, { "clip_ratio/high_max": 0.00039920835988596084, "clip_ratio/high_mean": 0.00019305965397506953, "clip_ratio/low_mean": 7.035502785583958e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002634146832861006, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 535.736328125, "completions/min_length": 282.0, "epoch": 0.15056707078607742, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76953125, "kl": 0.0028407552279531957, "learning_rate": 2.887269166627523e-06, "loss": 8.61491251271218e-05, "reward": 0.3718060255050659, "reward_std": 0.29961277544498444, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3718060255050659, "rewards/QAReward/std": 0.42225250601768494, "step": 770 }, { "clip_ratio/high_max": 0.0003251201589591801, "clip_ratio/high_mean": 0.00013807982322759925, "clip_ratio/low_mean": 4.4580630492419006e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018266044789925218, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 525.4921875, "completions/min_length": 259.6666666666667, "epoch": 0.1515447790379351, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84765625, "kl": 0.002794521441683173, "learning_rate": 2.8854554415394933e-06, "loss": 0.0001436874154023826, "reward": 0.35719503959019977, "reward_std": 0.28298242886861164, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35719504952430725, "rewards/QAReward/std": 0.44573845465977985, "step": 775 }, { "clip_ratio/high_max": 0.0005085867829620838, "clip_ratio/high_mean": 0.00027707531116902826, "clip_ratio/low_mean": 7.971736995386891e-05, "clip_ratio/low_min": 2.0508613670244812e-05, "clip_ratio/region_mean": 0.00035679268185049293, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 529.56640625, "completions/min_length": 239.0, "epoch": 0.15252248728979273, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0029533873312175274, "learning_rate": 2.883627821129925e-06, "loss": 0.00016194406198337673, "reward": 0.37168899178504944, "reward_std": 0.27583397924900055, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37168899178504944, "rewards/QAReward/std": 0.42886118590831757, "step": 780 }, { "clip_ratio/high_max": 0.0002885629888623953, "clip_ratio/high_mean": 0.00016432945267297328, "clip_ratio/low_mean": 4.232698993291706e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002066564396955073, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 518.9921875, "completions/min_length": 252.66666666666666, "epoch": 0.15350019554165037, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.003013950819149613, "learning_rate": 2.8817863237287987e-06, "loss": 0.00017201821319758892, "reward": 0.3512778083483378, "reward_std": 0.2858836352825165, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3512778083483378, "rewards/QAReward/std": 0.4254446029663086, "step": 785 }, { "clip_ratio/high_max": 0.0004613963421434164, "clip_ratio/high_mean": 0.00026519681559875605, "clip_ratio/low_mean": 5.9113596216775476e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032431040890514853, "completions/clipped_ratio": 0.0078125, "completions/max_length": 932.0, "completions/mean_length": 511.853515625, "completions/min_length": 220.5, "epoch": 0.154477903793508, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80078125, "kl": 0.003007697220891714, "learning_rate": 2.8799309678052763e-06, "loss": 0.000169729208573699, "reward": 0.39172016084194183, "reward_std": 0.2677987217903137, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39172014594078064, "rewards/QAReward/std": 0.40146879851818085, "step": 790 }, { "clip_ratio/high_max": 0.00048263208009302616, "clip_ratio/high_mean": 0.00020186130423098801, "clip_ratio/low_mean": 5.989712080918252e-05, "clip_ratio/low_min": 2.2983222152106464e-05, "clip_ratio/region_mean": 0.0002617584308609366, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 520.6419270833334, "completions/min_length": 232.0, "epoch": 0.15545561204536568, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8203125, "kl": 0.0030296710319817066, "learning_rate": 2.8780617719675098e-06, "loss": 0.00010611761827021838, "reward": 0.40285321076711017, "reward_std": 0.27441274126370746, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40285322070121765, "rewards/QAReward/std": 0.4205743173758189, "step": 795 }, { "clip_ratio/high_max": 0.00044413937721401455, "clip_ratio/high_mean": 0.00023630562354810537, "clip_ratio/low_mean": 7.215343357529491e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030845906585454943, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 524.611328125, "completions/min_length": 247.5, "epoch": 0.15643332029722332, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84375, "kl": 0.003050504298880696, "learning_rate": 2.8761787549624597e-06, "loss": 8.133540395647288e-05, "reward": 0.2827722132205963, "reward_std": 0.29597413539886475, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2827722132205963, "rewards/QAReward/std": 0.4243307262659073, "step": 800 }, { "clip_ratio/high_max": 0.0003574608126655221, "clip_ratio/high_mean": 0.0001788202440366149, "clip_ratio/low_mean": 3.185979003319517e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021068003261461853, "completions/clipped_ratio": 0.044270833333333336, "completions/max_length": 1024.0, "completions/mean_length": 552.0950520833334, "completions/min_length": 247.0, "epoch": 0.15741102854908096, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.00281602474860847, "learning_rate": 2.8742819356757044e-06, "loss": 0.00012042006710544229, "reward": 0.2772399038076401, "reward_std": 0.3017568190892537, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2772398889064789, "rewards/QAReward/std": 0.4434732000033061, "step": 805 }, { "clip_ratio/high_max": 0.0006288111326284707, "clip_ratio/high_mean": 0.00035340230679139497, "clip_ratio/low_mean": 4.7799607273191214e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000401201902423054, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 506.748046875, "completions/min_length": 228.5, "epoch": 0.1583887368009386, "frac_reward_zero_std": 0.015625, "grad_norm": 0.85546875, "kl": 0.0030094243120402098, "learning_rate": 2.8723713331312485e-06, "loss": 7.579482044093311e-05, "reward": 0.3989839106798172, "reward_std": 0.2951173186302185, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.398983895778656, "rewards/QAReward/std": 0.4484301507472992, "step": 810 }, { "clip_ratio/high_max": 0.00026064437115564943, "clip_ratio/high_mean": 0.0001634145330172032, "clip_ratio/low_mean": 4.29639098001644e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020637843990698457, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 525.5143229166666, "completions/min_length": 243.0, "epoch": 0.15936644505279624, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.77734375, "kl": 0.0029698253143578767, "learning_rate": 2.8704469664913397e-06, "loss": 0.00010718863923102618, "reward": 0.33390067517757416, "reward_std": 0.2753683229287465, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33390068014462787, "rewards/QAReward/std": 0.459412415822347, "step": 815 }, { "clip_ratio/high_max": 0.0005916809779591859, "clip_ratio/high_mean": 0.00027094674878753723, "clip_ratio/low_mean": 8.672079129610211e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003576675371732563, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 544.134765625, "completions/min_length": 232.0, "epoch": 0.1603441533046539, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.0028850138653069733, "learning_rate": 2.8685088550562673e-06, "loss": 0.00016706761671230196, "reward": 0.3040837347507477, "reward_std": 0.2935687005519867, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3040837347507477, "rewards/QAReward/std": 0.4472915381193161, "step": 820 }, { "clip_ratio/high_max": 0.00025026112562045457, "clip_ratio/high_mean": 0.0001507633365690708, "clip_ratio/low_mean": 5.413971084635705e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020490305032581091, "completions/clipped_ratio": 0.041666666666666664, "completions/max_length": 1024.0, "completions/mean_length": 546.5169270833334, "completions/min_length": 237.33333333333334, "epoch": 0.16132186155651154, "frac_reward_zero_std": 0.0625, "grad_norm": 0.80859375, "kl": 0.002790681133046746, "learning_rate": 2.8665570182641755e-06, "loss": 0.00016828476218506695, "reward": 0.3120690683523814, "reward_std": 0.26376019914944965, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3120690683523814, "rewards/QAReward/std": 0.43172980348269147, "step": 825 }, { "clip_ratio/high_max": 0.0006569300079718232, "clip_ratio/high_mean": 0.0003208718553651124, "clip_ratio/low_mean": 8.761810313444585e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004084899672307074, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 535.75, "completions/min_length": 219.0, "epoch": 0.16229956980836918, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.0028023441322147847, "learning_rate": 2.8645914756908652e-06, "loss": 0.00011830118019133806, "reward": 0.34965044260025024, "reward_std": 0.28973840177059174, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34965045750141144, "rewards/QAReward/std": 0.41327548027038574, "step": 830 }, { "clip_ratio/high_max": 0.00027449553599581125, "clip_ratio/high_mean": 0.00014654266997240483, "clip_ratio/low_mean": 4.887542309006676e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001954181003384292, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 540.5026041666666, "completions/min_length": 236.66666666666666, "epoch": 0.16327727806022682, "frac_reward_zero_std": 0.0625, "grad_norm": 0.875, "kl": 0.002776476368308067, "learning_rate": 2.8626122470495976e-06, "loss": 0.00012139513855800033, "reward": 0.3794017831484477, "reward_std": 0.26091734568277997, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3794017732143402, "rewards/QAReward/std": 0.438503623008728, "step": 835 }, { "clip_ratio/high_max": 0.00040763687575235964, "clip_ratio/high_mean": 0.00022430375684052707, "clip_ratio/low_mean": 7.765636692056433e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030196012812666595, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 538.12109375, "completions/min_length": 264.5, "epoch": 0.16425498631208446, "frac_reward_zero_std": 0.046875, "grad_norm": 0.81640625, "kl": 0.0028752932325005532, "learning_rate": 2.860619352190899e-06, "loss": 0.00019772015511989593, "reward": 0.40206240117549896, "reward_std": 0.27183882892131805, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40206240117549896, "rewards/QAReward/std": 0.4379418343305588, "step": 840 }, { "clip_ratio/high_max": 0.0003536206320859492, "clip_ratio/high_mean": 0.00019235279178246855, "clip_ratio/low_mean": 4.378075827844441e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023613355588167906, "completions/clipped_ratio": 0.013020833333333334, "completions/max_length": 989.6666666666666, "completions/mean_length": 515.58984375, "completions/min_length": 240.0, "epoch": 0.16523269456394213, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.8125, "kl": 0.0029595441184937952, "learning_rate": 2.8586128111023585e-06, "loss": 0.00016811067471280693, "reward": 0.3826311429341634, "reward_std": 0.2581139753262202, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3826311429341634, "rewards/QAReward/std": 0.4233449399471283, "step": 845 }, { "clip_ratio/high_max": 0.00046585147501900794, "clip_ratio/high_mean": 0.00024300510995090008, "clip_ratio/low_mean": 8.011092577362434e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003231160342693329, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 554.0078125, "completions/min_length": 249.0, "epoch": 0.16621040281579977, "frac_reward_zero_std": 0.015625, "grad_norm": 0.81640625, "kl": 0.0028064261190593244, "learning_rate": 2.8565926439084297e-06, "loss": 7.197422673925758e-05, "reward": 0.35638487339019775, "reward_std": 0.29438282549381256, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35638485848903656, "rewards/QAReward/std": 0.42317599058151245, "step": 850 }, { "clip_ratio/high_max": 0.00034364496823400257, "clip_ratio/high_mean": 0.00018646831158548595, "clip_ratio/low_mean": 4.6664240653626623e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023313254350796342, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 530.2408854166666, "completions/min_length": 239.0, "epoch": 0.1671881110676574, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0028712233528494836, "learning_rate": 2.8545588708702284e-06, "loss": 0.00012025479227304459, "reward": 0.31018053491910297, "reward_std": 0.28823234637578327, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31018053491910297, "rewards/QAReward/std": 0.4304101864496867, "step": 855 }, { "clip_ratio/high_max": 0.0005273956339806318, "clip_ratio/high_mean": 0.0002636721474118531, "clip_ratio/low_mean": 7.41194366128184e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000337791588390246, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 526.015625, "completions/min_length": 255.5, "epoch": 0.16816581931951505, "frac_reward_zero_std": 0.046875, "grad_norm": 0.83984375, "kl": 0.002786661870777607, "learning_rate": 2.852511512385328e-06, "loss": 0.00013507618568837644, "reward": 0.40372566878795624, "reward_std": 0.2809899002313614, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40372568368911743, "rewards/QAReward/std": 0.4370897263288498, "step": 860 }, { "clip_ratio/high_max": 0.0003648433834314346, "clip_ratio/high_mean": 0.00017930310568772256, "clip_ratio/low_mean": 5.851685637026094e-05, "clip_ratio/low_min": 2.2200022067409007e-05, "clip_ratio/region_mean": 0.0002378199598751962, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 541.1223958333334, "completions/min_length": 252.33333333333334, "epoch": 0.16914352757137271, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8046875, "kl": 0.002725826622918248, "learning_rate": 2.850450588987558e-06, "loss": 0.00012090791715309024, "reward": 0.2973086138566335, "reward_std": 0.3066663146018982, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2973086138566335, "rewards/QAReward/std": 0.4334605534871419, "step": 865 }, { "clip_ratio/high_max": 0.0004730112967081368, "clip_ratio/high_mean": 0.00027071283548139034, "clip_ratio/low_mean": 8.10417957836762e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035175462253391745, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 534.55078125, "completions/min_length": 224.0, "epoch": 0.17012123582323035, "frac_reward_zero_std": 0.046875, "grad_norm": 0.74609375, "kl": 0.002845734171569347, "learning_rate": 2.8483761213467946e-06, "loss": 0.00011732756393030286, "reward": 0.2952194958925247, "reward_std": 0.29322484135627747, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2952194958925247, "rewards/QAReward/std": 0.4826876074075699, "step": 870 }, { "clip_ratio/high_max": 0.00038339369930326936, "clip_ratio/high_mean": 0.00017721367185004057, "clip_ratio/low_mean": 3.452249511610716e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021173616987653078, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 537.4231770833334, "completions/min_length": 239.0, "epoch": 0.171098944075088, "frac_reward_zero_std": 0.03125, "grad_norm": 0.77734375, "kl": 0.0027357677929103373, "learning_rate": 2.8462881302687565e-06, "loss": 6.976190488785505e-05, "reward": 0.41817477345466614, "reward_std": 0.26396407683690387, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41817477345466614, "rewards/QAReward/std": 0.41471532980600995, "step": 875 }, { "clip_ratio/high_max": 0.0006954479846172035, "clip_ratio/high_mean": 0.00033646294614300134, "clip_ratio/low_mean": 7.174641505116596e-05, "clip_ratio/low_min": 2.554604725446552e-05, "clip_ratio/region_mean": 0.0004082093713805079, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1024.0, "completions/mean_length": 542.189453125, "completions/min_length": 220.0, "epoch": 0.17207665232694563, "frac_reward_zero_std": 0.03125, "grad_norm": 0.890625, "kl": 0.002752136066555977, "learning_rate": 2.8441866366947924e-06, "loss": 9.197358740493655e-05, "reward": 0.3194359242916107, "reward_std": 0.28243036568164825, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3194359093904495, "rewards/QAReward/std": 0.47209930419921875, "step": 880 }, { "clip_ratio/high_max": 0.00038703265599906447, "clip_ratio/high_mean": 0.000157076062168926, "clip_ratio/low_mean": 3.403751034056768e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019111357978545128, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 541.9440104166666, "completions/min_length": 252.33333333333334, "epoch": 0.17305436057880327, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.85546875, "kl": 0.0027681363746523855, "learning_rate": 2.8420716617016756e-06, "loss": 0.00015435803215950726, "reward": 0.34430545568466187, "reward_std": 0.29346468051274616, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34430546561876935, "rewards/QAReward/std": 0.42908283074696857, "step": 885 }, { "clip_ratio/high_max": 0.0005640657502226532, "clip_ratio/high_mean": 0.00028021485777571795, "clip_ratio/low_mean": 4.2347973794676365e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003225628286600113, "completions/clipped_ratio": 0.001953125, "completions/max_length": 917.0, "completions/mean_length": 519.091796875, "completions/min_length": 211.5, "epoch": 0.17403206883066094, "frac_reward_zero_std": 0.015625, "grad_norm": 0.89453125, "kl": 0.00287937568500638, "learning_rate": 2.839943226501389e-06, "loss": 0.0001383100519888103, "reward": 0.3693080544471741, "reward_std": 0.30944839119911194, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3693080395460129, "rewards/QAReward/std": 0.43337175250053406, "step": 890 }, { "clip_ratio/high_max": 0.00043995589949190616, "clip_ratio/high_mean": 0.00018518818542361258, "clip_ratio/low_mean": 4.280187567928806e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022799007128924132, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 523.2018229166666, "completions/min_length": 262.0, "epoch": 0.17500977708251858, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.7734375, "kl": 0.002777970349416137, "learning_rate": 2.8378013524409137e-06, "loss": 0.0001360934227705002, "reward": 0.344482680161794, "reward_std": 0.3074180682500203, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3444826900959015, "rewards/QAReward/std": 0.4676949779192607, "step": 895 }, { "clip_ratio/high_max": 0.0007345490739680827, "clip_ratio/high_mean": 0.00033233710564672947, "clip_ratio/low_mean": 6.945007771719247e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040178720955736936, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/mean_length": 570.080078125, "completions/min_length": 249.5, "epoch": 0.17598748533437622, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.0026500858832150696, "learning_rate": 2.8356460610020152e-06, "loss": 3.137302992399782e-05, "reward": 0.32012078166007996, "reward_std": 0.2901321053504944, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32012078166007996, "rewards/QAReward/std": 0.4695633798837662, "step": 900 }, { "clip_ratio/high_max": 0.00036474757362157104, "clip_ratio/high_mean": 0.00016461247578263282, "clip_ratio/low_mean": 5.936159286648035e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000223974057007581, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 532.6510416666666, "completions/min_length": 245.33333333333334, "epoch": 0.17696519358623386, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.84375, "kl": 0.0028046810533851386, "learning_rate": 2.8334773738010285e-06, "loss": 0.0001724052242934704, "reward": 0.3508236308892568, "reward_std": 0.2909484803676605, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3508236308892568, "rewards/QAReward/std": 0.4537759820620219, "step": 905 }, { "clip_ratio/high_max": 0.0005334847955964506, "clip_ratio/high_mean": 0.00032268494833260776, "clip_ratio/low_mean": 8.177236159099266e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004044572939164937, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 527.72265625, "completions/min_length": 227.0, "epoch": 0.17794290183809153, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8515625, "kl": 0.0027028018608689307, "learning_rate": 2.8312953125886378e-06, "loss": 4.941091174259782e-05, "reward": 0.3654870539903641, "reward_std": 0.283578097820282, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3654870390892029, "rewards/QAReward/std": 0.46525320410728455, "step": 910 }, { "clip_ratio/high_max": 0.0002979319076985121, "clip_ratio/high_mean": 0.00019391909008845687, "clip_ratio/low_mean": 6.14674820099026e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025538656627759336, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 536.1328125, "completions/min_length": 229.33333333333334, "epoch": 0.17892061008994917, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.002741469629108906, "learning_rate": 2.8290998992496637e-06, "loss": 7.394690765067935e-05, "reward": 0.3736989498138428, "reward_std": 0.2943205237388611, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3736989498138428, "rewards/QAReward/std": 0.42324527104695636, "step": 915 }, { "clip_ratio/high_max": 0.000492507079616189, "clip_ratio/high_mean": 0.00027361964457668365, "clip_ratio/low_mean": 8.253323176177219e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003561528807040304, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1024.0, "completions/mean_length": 540.654296875, "completions/min_length": 267.5, "epoch": 0.1798983183418068, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.002695365948602557, "learning_rate": 2.826891155802839e-06, "loss": 7.347740465775132e-05, "reward": 0.31466394662857056, "reward_std": 0.29384225606918335, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31466394662857056, "rewards/QAReward/std": 0.42932693660259247, "step": 920 }, { "clip_ratio/high_max": 0.0004338463069871068, "clip_ratio/high_mean": 0.00018463609158061444, "clip_ratio/low_mean": 5.0779228331521153e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002354153199121356, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 508.7096354166667, "completions/min_length": 226.66666666666666, "epoch": 0.18087602659366445, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.78515625, "kl": 0.002877474715933204, "learning_rate": 2.824669104400591e-06, "loss": 0.00017154552042484284, "reward": 0.44975249965985614, "reward_std": 0.27123919129371643, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.44975249965985614, "rewards/QAReward/std": 0.3968106011549632, "step": 925 }, { "clip_ratio/high_max": 0.0005254827556200325, "clip_ratio/high_mean": 0.0002796916523948312, "clip_ratio/low_mean": 8.347995026269928e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003631716012023389, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 535.73046875, "completions/min_length": 252.0, "epoch": 0.18185373484552209, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83984375, "kl": 0.002734023239463568, "learning_rate": 2.8224337673288183e-06, "loss": 0.0001637633889913559, "reward": 0.3868272602558136, "reward_std": 0.2985483407974243, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3868272453546524, "rewards/QAReward/std": 0.45921580493450165, "step": 930 }, { "clip_ratio/high_max": 0.00041458511259406805, "clip_ratio/high_mean": 0.00020412018056958914, "clip_ratio/low_mean": 5.448626325232908e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025860643945634364, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 520.4713541666666, "completions/min_length": 243.33333333333334, "epoch": 0.18283144309737975, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.81640625, "kl": 0.002784621575847268, "learning_rate": 2.820185167006665e-06, "loss": 0.00015583691420033573, "reward": 0.36169291536013287, "reward_std": 0.2667351762453715, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3616929352283478, "rewards/QAReward/std": 0.45165525873502094, "step": 935 }, { "clip_ratio/high_max": 0.0005340928211808205, "clip_ratio/high_mean": 0.00026066279388032855, "clip_ratio/low_mean": 5.483595887199044e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003154987469315529, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 527.904296875, "completions/min_length": 259.5, "epoch": 0.1838091513492374, "frac_reward_zero_std": 0.03125, "grad_norm": 0.875, "kl": 0.002877052128314972, "learning_rate": 2.8179233259863012e-06, "loss": 0.00014075626386329533, "reward": 0.3596121817827225, "reward_std": 0.2729044556617737, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3596121817827225, "rewards/QAReward/std": 0.4337216317653656, "step": 940 }, { "clip_ratio/high_max": 0.0002287522191181779, "clip_ratio/high_mean": 0.00015793034108355642, "clip_ratio/low_mean": 4.393630661070347e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002018666476942599, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 541.7916666666666, "completions/min_length": 245.33333333333334, "epoch": 0.18478685960109503, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.87109375, "kl": 0.002759096631780267, "learning_rate": 2.815648266952691e-06, "loss": 0.00015722837997600436, "reward": 0.36338459451993305, "reward_std": 0.3005189796288808, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3633845845858256, "rewards/QAReward/std": 0.4415080746014913, "step": 945 }, { "clip_ratio/high_max": 0.0004411262460052967, "clip_ratio/high_mean": 0.0002634445088915527, "clip_ratio/low_mean": 9.218378691002726e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035562830162234605, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 539.55859375, "completions/min_length": 237.5, "epoch": 0.18576456785295267, "frac_reward_zero_std": 0.03125, "grad_norm": 0.87109375, "kl": 0.002828489849343896, "learning_rate": 2.813360012723369e-06, "loss": 0.00014208301436156035, "reward": 0.33318452537059784, "reward_std": 0.27880585938692093, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33318452537059784, "rewards/QAReward/std": 0.4404265433549881, "step": 950 }, { "clip_ratio/high_max": 0.00032120547257363796, "clip_ratio/high_mean": 0.0001639120455365628, "clip_ratio/low_mean": 2.990480643347837e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019381685415282847, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 529.2942708333334, "completions/min_length": 247.66666666666666, "epoch": 0.1867422761048103, "frac_reward_zero_std": 0.0625, "grad_norm": 0.79296875, "kl": 0.002857751213014126, "learning_rate": 2.811058586248209e-06, "loss": 0.00013113347813487053, "reward": 0.3967029352982839, "reward_std": 0.2908005714416504, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39670296510060626, "rewards/QAReward/std": 0.4577789902687073, "step": 955 }, { "clip_ratio/high_max": 0.0005128760356456041, "clip_ratio/high_mean": 0.00026526389410719275, "clip_ratio/low_mean": 6.39853926259093e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032924928236752746, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 545.40625, "completions/min_length": 260.0, "epoch": 0.18771998435666798, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.002795628970488906, "learning_rate": 2.808744010609196e-06, "loss": 0.00021263360977172852, "reward": 0.3537365049123764, "reward_std": 0.2836224287748337, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3537365198135376, "rewards/QAReward/std": 0.43803176283836365, "step": 960 }, { "clip_ratio/high_max": 0.00025744616868905723, "clip_ratio/high_mean": 0.000153185281669721, "clip_ratio/low_mean": 5.285989609546959e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002060451777651906, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 538.1953125, "completions/min_length": 256.3333333333333, "epoch": 0.18869769260852562, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.0028056415729224684, "learning_rate": 2.806416309020192e-06, "loss": 0.00011809039860963822, "reward": 0.3246114452679952, "reward_std": 0.2796885371208191, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3246114452679952, "rewards/QAReward/std": 0.4472622275352478, "step": 965 }, { "clip_ratio/high_max": 0.0006131923641078174, "clip_ratio/high_mean": 0.000335478933993727, "clip_ratio/low_mean": 5.766024551121518e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039313917513936756, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 536.8984375, "completions/min_length": 266.0, "epoch": 0.18967540086038326, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.002832575049251318, "learning_rate": 2.8040755048267067e-06, "loss": 0.00015413293149322272, "reward": 0.3476699888706207, "reward_std": 0.30399952828884125, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3476699888706207, "rewards/QAReward/std": 0.4998569041490555, "step": 970 }, { "clip_ratio/high_max": 0.00039631546242162584, "clip_ratio/high_mean": 0.00022233657073229552, "clip_ratio/low_mean": 3.9506182656623426e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002618427621200681, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1024.0, "completions/mean_length": 545.96875, "completions/min_length": 240.33333333333334, "epoch": 0.1906531091122409, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8046875, "kl": 0.002719588577747345, "learning_rate": 2.80172162150566e-06, "loss": 4.692150105256587e-05, "reward": 0.3073660731315613, "reward_std": 0.26843686898549396, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3073660631974538, "rewards/QAReward/std": 0.4474184612433116, "step": 975 }, { "clip_ratio/high_max": 0.0004993081791326403, "clip_ratio/high_mean": 0.00024871117202565076, "clip_ratio/low_mean": 8.409332076553255e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003328045131638646, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 547.345703125, "completions/min_length": 254.5, "epoch": 0.19163081736409857, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8359375, "kl": 0.002771843783557415, "learning_rate": 2.7993546826651483e-06, "loss": 0.00010466946987435222, "reward": 0.37156111001968384, "reward_std": 0.2733674496412277, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37156111001968384, "rewards/QAReward/std": 0.4127136915922165, "step": 980 }, { "clip_ratio/high_max": 0.00032783744391053917, "clip_ratio/high_mean": 0.00017885538982227446, "clip_ratio/low_mean": 4.107480635866523e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021993019618093967, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 518.58203125, "completions/min_length": 219.0, "epoch": 0.1926085256159562, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8515625, "kl": 0.0028940419666469096, "learning_rate": 2.796974712044207e-06, "loss": 0.0001164246816188097, "reward": 0.4101480046908061, "reward_std": 0.27055343985557556, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4101479848225911, "rewards/QAReward/std": 0.4269964098930359, "step": 985 }, { "clip_ratio/high_max": 0.00039286137325689197, "clip_ratio/high_mean": 0.00024262124206870795, "clip_ratio/low_mean": 0.00010392514668637887, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034654638729989526, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 539.11328125, "completions/min_length": 271.5, "epoch": 0.19358623386781385, "frac_reward_zero_std": 0.015625, "grad_norm": 0.81640625, "kl": 0.0028598940931260585, "learning_rate": 2.7945817335125735e-06, "loss": 0.00018420444102957846, "reward": 0.32137663662433624, "reward_std": 0.265964075922966, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32137663662433624, "rewards/QAReward/std": 0.40062354505062103, "step": 990 }, { "clip_ratio/high_max": 0.0004164006095379591, "clip_ratio/high_mean": 0.00018681287765502929, "clip_ratio/low_mean": 6.057375285308808e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002473866334185004, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 533.7955729166666, "completions/min_length": 265.6666666666667, "epoch": 0.19456394211967148, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8359375, "kl": 0.002928525349125266, "learning_rate": 2.7921757710704465e-06, "loss": 0.00015268903225660325, "reward": 0.324891929825147, "reward_std": 0.27589574456214905, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32489190995693207, "rewards/QAReward/std": 0.42970911661783856, "step": 995 }, { "clip_ratio/high_max": 0.0005451580160297454, "clip_ratio/high_mean": 0.00027591870166361333, "clip_ratio/low_mean": 9.549259557388723e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003714112797752023, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 517.212890625, "completions/min_length": 252.0, "epoch": 0.19554165037152912, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0029814229812473057, "learning_rate": 2.789756848848246e-06, "loss": 0.00014051648322492837, "reward": 0.29968689382076263, "reward_std": 0.27377113699913025, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29968689382076263, "rewards/QAReward/std": 0.43680353462696075, "step": 1000 }, { "clip_ratio/high_max": 0.000427725026383996, "clip_ratio/high_mean": 0.00022251962218433619, "clip_ratio/low_mean": 6.0249213129281996e-05, "clip_ratio/low_min": 2.013693156186491e-05, "clip_ratio/region_mean": 0.00028276885859668255, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 530.86328125, "completions/min_length": 243.33333333333334, "epoch": 0.1965193586233868, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.82421875, "kl": 0.0030310363974422215, "learning_rate": 2.7873249911063718e-06, "loss": 0.00013249205658212305, "reward": 0.27064216633637744, "reward_std": 0.2666209042072296, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.27064215143521625, "rewards/QAReward/std": 0.44904492298762005, "step": 1005 }, { "clip_ratio/high_max": 0.0004984898725524545, "clip_ratio/high_mean": 0.0002479042741470039, "clip_ratio/low_mean": 4.9520233369548804e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002974245115183294, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 537.560546875, "completions/min_length": 232.0, "epoch": 0.19749706687524443, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.002954976400360465, "learning_rate": 2.784880222234958e-06, "loss": 0.0001790443668141961, "reward": 0.30144771188497543, "reward_std": 0.3014725148677826, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3014477267861366, "rewards/QAReward/std": 0.413374587893486, "step": 1010 }, { "clip_ratio/high_max": 0.000352926051709801, "clip_ratio/high_mean": 0.00018327455036342143, "clip_ratio/low_mean": 4.178616945864633e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022506071254611016, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 543.1223958333334, "completions/min_length": 254.33333333333334, "epoch": 0.19847477512710207, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.80078125, "kl": 0.0028684129007160663, "learning_rate": 2.7824225667536316e-06, "loss": 0.00010299201821908354, "reward": 0.35119173924128216, "reward_std": 0.2901313801606496, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3511917491753896, "rewards/QAReward/std": 0.44731202721595764, "step": 1015 }, { "clip_ratio/high_max": 0.0006597735569812358, "clip_ratio/high_mean": 0.0003049842198379338, "clip_ratio/low_mean": 9.005317260744051e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039503740845248103, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 514.0078125, "completions/min_length": 248.0, "epoch": 0.1994524833789597, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8359375, "kl": 0.0030144160147756336, "learning_rate": 2.7799520493112635e-06, "loss": 0.00015988070517778397, "reward": 0.3261253833770752, "reward_std": 0.2885162830352783, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3261253982782364, "rewards/QAReward/std": 0.46889422833919525, "step": 1020 }, { "clip_ratio/high_max": 0.0003904234152287245, "clip_ratio/high_mean": 0.00021279911743476987, "clip_ratio/low_mean": 2.5091526185860858e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023789064725860954, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 516.7942708333334, "completions/min_length": 229.0, "epoch": 0.20043019163081735, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.796875, "kl": 0.0030348805245012044, "learning_rate": 2.777468694685723e-06, "loss": 0.00016952611040323972, "reward": 0.3061104913552602, "reward_std": 0.28178735574086505, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30611049632231396, "rewards/QAReward/std": 0.4366755982240041, "step": 1025 }, { "clip_ratio/high_max": 0.0005545131280086935, "clip_ratio/high_mean": 0.00030634297290816904, "clip_ratio/low_mean": 6.80106197251007e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037435358390212057, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 518.08984375, "completions/min_length": 223.5, "epoch": 0.20140789988267502, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83984375, "kl": 0.0031253695487976074, "learning_rate": 2.77497252778363e-06, "loss": 0.00015804939903318882, "reward": 0.363130122423172, "reward_std": 0.29058273136615753, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.363130122423172, "rewards/QAReward/std": 0.41739891469478607, "step": 1030 }, { "clip_ratio/high_max": 0.0003738321829587221, "clip_ratio/high_mean": 0.0001745794725138694, "clip_ratio/low_mean": 5.146069888724014e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022604018449783324, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.7578125, "completions/min_length": 262.6666666666667, "epoch": 0.20238560813453266, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.7890625, "kl": 0.0030194784514605997, "learning_rate": 2.7724635736401033e-06, "loss": 9.33474861085415e-05, "reward": 0.34407293796539307, "reward_std": 0.27444925904273987, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34407293796539307, "rewards/QAReward/std": 0.4087069829305013, "step": 1035 }, { "clip_ratio/high_max": 0.0006090503884479403, "clip_ratio/high_mean": 0.00025403815088793636, "clip_ratio/low_mean": 5.071565974503756e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003047538106329739, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 521.236328125, "completions/min_length": 258.5, "epoch": 0.2033633163863903, "frac_reward_zero_std": 0.03125, "grad_norm": 0.85546875, "kl": 0.0029567772056907416, "learning_rate": 2.76994185741851e-06, "loss": 9.993049316108227e-05, "reward": 0.2936128228902817, "reward_std": 0.28614334762096405, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2936128228902817, "rewards/QAReward/std": 0.46494558453559875, "step": 1040 }, { "clip_ratio/high_max": 0.0003817921620793641, "clip_ratio/high_mean": 0.0001992881065234542, "clip_ratio/low_mean": 3.7901762698311356e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002371898852288723, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 534.4934895833334, "completions/min_length": 238.33333333333334, "epoch": 0.20434102463824794, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.796875, "kl": 0.0030083763413131237, "learning_rate": 2.767407404410215e-06, "loss": 0.00012071022065356374, "reward": 0.3718130985895793, "reward_std": 0.26189911862214404, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3718130886554718, "rewards/QAReward/std": 0.4466805656750997, "step": 1045 }, { "clip_ratio/high_max": 0.0004568594507873058, "clip_ratio/high_mean": 0.00025829754886217413, "clip_ratio/low_mean": 5.844447296112776e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003167420218233019, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 544.310546875, "completions/min_length": 258.0, "epoch": 0.2053187328901056, "frac_reward_zero_std": 0.046875, "grad_norm": 0.7734375, "kl": 0.003005328821018338, "learning_rate": 2.7648602400343236e-06, "loss": 0.00015766543801873923, "reward": 0.3082519620656967, "reward_std": 0.29226054251194, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3082519620656967, "rewards/QAReward/std": 0.43989790976047516, "step": 1050 }, { "clip_ratio/high_max": 0.0003672651248052716, "clip_ratio/high_mean": 0.00023273617262020707, "clip_ratio/low_mean": 5.8479847211856395e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029121602419763805, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 534.26171875, "completions/min_length": 248.33333333333334, "epoch": 0.20629644114196324, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8125, "kl": 0.0031166425440460445, "learning_rate": 2.762300389837431e-06, "loss": 0.0001506907632574439, "reward": 0.32301020125548047, "reward_std": 0.2959306836128235, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.323010191321373, "rewards/QAReward/std": 0.4628531038761139, "step": 1055 }, { "clip_ratio/high_max": 0.0005322155193425715, "clip_ratio/high_mean": 0.0002553156635258347, "clip_ratio/low_mean": 7.943687232909724e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033475253731012343, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 528.01953125, "completions/min_length": 240.5, "epoch": 0.20727414939382088, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7890625, "kl": 0.0030067528132349253, "learning_rate": 2.759727879493362e-06, "loss": 0.0002368553075939417, "reward": 0.3346858024597168, "reward_std": 0.27416351437568665, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3346858024597168, "rewards/QAReward/std": 0.4396251291036606, "step": 1060 }, { "clip_ratio/high_max": 0.0003631451749242842, "clip_ratio/high_mean": 0.00017868271097540856, "clip_ratio/low_mean": 5.633190885419026e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023501461837440729, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 534.1653645833334, "completions/min_length": 259.6666666666667, "epoch": 0.20825185764567852, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.86328125, "kl": 0.0029841818381100894, "learning_rate": 2.7571427348029166e-06, "loss": 9.41030215471983e-05, "reward": 0.33990731835365295, "reward_std": 0.276514599720637, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33990729848543805, "rewards/QAReward/std": 0.42405690749486286, "step": 1065 }, { "clip_ratio/high_max": 0.0005832069786265492, "clip_ratio/high_mean": 0.00030320389196276666, "clip_ratio/low_mean": 6.002497466397472e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036322887754067776, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 519.482421875, "completions/min_length": 229.0, "epoch": 0.20922956589753616, "frac_reward_zero_std": 0.03125, "grad_norm": 0.79296875, "kl": 0.0030740735586732627, "learning_rate": 2.7545449816936104e-06, "loss": 0.000146790430881083, "reward": 0.38598477840423584, "reward_std": 0.2945021837949753, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38598479330539703, "rewards/QAReward/std": 0.44297634065151215, "step": 1070 }, { "clip_ratio/high_max": 0.0004280435619875789, "clip_ratio/high_mean": 0.00019462336786091328, "clip_ratio/low_mean": 4.738590359920636e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024200925836339592, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 524.7513020833334, "completions/min_length": 274.0, "epoch": 0.21020727414939383, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.80078125, "kl": 0.003030710993334651, "learning_rate": 2.7519346462194137e-06, "loss": 0.00018354710191488267, "reward": 0.39148564140001935, "reward_std": 0.28370718161265057, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39148563146591187, "rewards/QAReward/std": 0.4323514699935913, "step": 1075 }, { "clip_ratio/high_max": 0.0005958698457106948, "clip_ratio/high_mean": 0.00034177908673882484, "clip_ratio/low_mean": 6.92174566211179e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041099655209109187, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 512.94921875, "completions/min_length": 247.0, "epoch": 0.21118498240125147, "frac_reward_zero_std": 0.03125, "grad_norm": 0.875, "kl": 0.003096489142626524, "learning_rate": 2.7493117545604895e-06, "loss": 0.00011325301602482796, "reward": 0.4007812589406967, "reward_std": 0.30570948123931885, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4007812589406967, "rewards/QAReward/std": 0.42550063133239746, "step": 1080 }, { "clip_ratio/high_max": 0.00044490119908005, "clip_ratio/high_mean": 0.00023445268161594867, "clip_ratio/low_mean": 7.206164300441742e-05, "clip_ratio/low_min": 2.0807323744520544e-05, "clip_ratio/region_mean": 0.0003065143362618983, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.8958333333334, "completions/min_length": 231.33333333333334, "epoch": 0.2121626906531091, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8125, "kl": 0.0030324824154376985, "learning_rate": 2.7466763330229333e-06, "loss": 9.103157790377737e-05, "reward": 0.33889804283777875, "reward_std": 0.28805552919705707, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33889805277188617, "rewards/QAReward/std": 0.44834232330322266, "step": 1085 }, { "clip_ratio/high_max": 0.0005901610013097525, "clip_ratio/high_mean": 0.0002596235193777829, "clip_ratio/low_mean": 6.76115625537932e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032723507611081003, "completions/clipped_ratio": 0.048828125, "completions/max_length": 1024.0, "completions/mean_length": 530.072265625, "completions/min_length": 245.0, "epoch": 0.21314039890496675, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.003046577936038375, "learning_rate": 2.7440284080385087e-06, "loss": 0.00012971404939889907, "reward": 0.35976140201091766, "reward_std": 0.2899076044559479, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35976140201091766, "rewards/QAReward/std": 0.47276052832603455, "step": 1090 }, { "clip_ratio/high_max": 0.00032652679365128277, "clip_ratio/high_mean": 0.00017178545240312815, "clip_ratio/low_mean": 5.054922658018768e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002223346848040819, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 532.7278645833334, "completions/min_length": 228.33333333333334, "epoch": 0.21411810715682442, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.73828125, "kl": 0.0030678695067763327, "learning_rate": 2.741368006164381e-06, "loss": 0.0001698205480352044, "reward": 0.32603392998377484, "reward_std": 0.2568170875310898, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3260339101155599, "rewards/QAReward/std": 0.4281834065914154, "step": 1095 }, { "clip_ratio/high_max": 0.0005330123472958803, "clip_ratio/high_mean": 0.00029923936235718427, "clip_ratio/low_mean": 8.268659876193851e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038192596985027195, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 518.283203125, "completions/min_length": 212.5, "epoch": 0.21509581540868206, "frac_reward_zero_std": 0.03125, "grad_norm": 0.85546875, "kl": 0.003260294208303094, "learning_rate": 2.7386951540828505e-06, "loss": 0.00016609281301498412, "reward": 0.32124876976013184, "reward_std": 0.28163543343544006, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32124876976013184, "rewards/QAReward/std": 0.44507329165935516, "step": 1100 }, { "clip_ratio/high_max": 0.00032381160417571664, "clip_ratio/high_mean": 0.00020920239039696754, "clip_ratio/low_mean": 4.156077047809958e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025076315505430103, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 509.0716145833333, "completions/min_length": 237.0, "epoch": 0.2160735236605397, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.7890625, "kl": 0.0032279551960527897, "learning_rate": 2.736009878601088e-06, "loss": 0.00011144108138978481, "reward": 0.3558524747689565, "reward_std": 0.28051485617955524, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3558524747689565, "rewards/QAReward/std": 0.42829521497090656, "step": 1105 }, { "clip_ratio/high_max": 0.0005187324131838977, "clip_ratio/high_mean": 0.0002340932667721063, "clip_ratio/low_mean": 8.559508132748306e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031968834809958935, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 530.81640625, "completions/min_length": 239.5, "epoch": 0.21705123191239734, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.003089990746229887, "learning_rate": 2.7333122066508616e-06, "loss": 0.00014335550367832183, "reward": 0.28334417939186096, "reward_std": 0.29813580214977264, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.28334417939186096, "rewards/QAReward/std": 0.4590138643980026, "step": 1110 }, { "clip_ratio/high_max": 0.0004963308107107878, "clip_ratio/high_mean": 0.00019385506166145206, "clip_ratio/low_mean": 4.908507253276184e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002429401269182563, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 512.7291666666666, "completions/min_length": 221.33333333333334, "epoch": 0.21802894016425498, "frac_reward_zero_std": 0.03125, "grad_norm": 0.796875, "kl": 0.003147078724578023, "learning_rate": 2.7306021652882706e-06, "loss": 0.00014874045737087727, "reward": 0.41654524207115173, "reward_std": 0.27565282583236694, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41654524207115173, "rewards/QAReward/std": 0.4344892005125682, "step": 1115 }, { "clip_ratio/high_max": 0.0006136501207947731, "clip_ratio/high_mean": 0.00033408444141969085, "clip_ratio/low_mean": 7.506050606025383e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040914493147283795, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 528.35546875, "completions/min_length": 257.0, "epoch": 0.21900664841611264, "frac_reward_zero_std": 0.078125, "grad_norm": 0.7578125, "kl": 0.0032270252238959076, "learning_rate": 2.727879781693472e-06, "loss": 0.00019387187203392386, "reward": 0.33997398614883423, "reward_std": 0.275044709444046, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33997397124767303, "rewards/QAReward/std": 0.45066823065280914, "step": 1120 }, { "clip_ratio/high_max": 0.00030269998824223877, "clip_ratio/high_mean": 0.00019187795696780087, "clip_ratio/low_mean": 4.744295729324222e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023932091426104307, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 525.82421875, "completions/min_length": 253.66666666666666, "epoch": 0.21998435666797028, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.796875, "kl": 0.0032282390631735323, "learning_rate": 2.725145083170407e-06, "loss": 0.00016559022478759288, "reward": 0.36587923765182495, "reward_std": 0.26261622707049054, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36587920784950256, "rewards/QAReward/std": 0.4330042799313863, "step": 1125 }, { "clip_ratio/high_max": 0.0004863147623836994, "clip_ratio/high_mean": 0.0002280752407386899, "clip_ratio/low_mean": 8.911963377613574e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003171948774252087, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 512.330078125, "completions/min_length": 210.5, "epoch": 0.22096206491982792, "frac_reward_zero_std": 0.0625, "grad_norm": 0.81640625, "kl": 0.0030960886273533107, "learning_rate": 2.7223980971465314e-06, "loss": 0.0001603909768164158, "reward": 0.4354608505964279, "reward_std": 0.27254098653793335, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4354608654975891, "rewards/QAReward/std": 0.44826389849185944, "step": 1130 }, { "clip_ratio/high_max": 0.00031921467743813994, "clip_ratio/high_mean": 0.00019331412622705101, "clip_ratio/low_mean": 4.734019603347406e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024065431207418442, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 971.0, "completions/mean_length": 496.65234375, "completions/min_length": 249.66666666666666, "epoch": 0.22193977317168556, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.78125, "kl": 0.0033238044939935206, "learning_rate": 2.7196388511725336e-06, "loss": 0.00014054372441023587, "reward": 0.4385054608186086, "reward_std": 0.27055463194847107, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.43850547075271606, "rewards/QAReward/std": 0.4385973910490672, "step": 1135 }, { "clip_ratio/high_max": 0.00044973938493058084, "clip_ratio/high_mean": 0.0002196151704993099, "clip_ratio/low_mean": 7.355367997661233e-05, "clip_ratio/low_min": 2.0783540094271302e-05, "clip_ratio/region_mean": 0.00029316884465515616, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1024.0, "completions/mean_length": 539.810546875, "completions/min_length": 244.0, "epoch": 0.2229174814235432, "frac_reward_zero_std": 0.03125, "grad_norm": 0.796875, "kl": 0.0030959904193878173, "learning_rate": 2.716867372922066e-06, "loss": 0.00015001294668763875, "reward": 0.3758130222558975, "reward_std": 0.27255477011203766, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3758130222558975, "rewards/QAReward/std": 0.4530903398990631, "step": 1140 }, { "clip_ratio/high_max": 0.0002668014611117542, "clip_ratio/high_mean": 0.00011667108628898859, "clip_ratio/low_mean": 6.370954652084038e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001803806284442544, "completions/clipped_ratio": 0.005208333333333333, "completions/max_length": 1024.0, "completions/mean_length": 507.1627604166667, "completions/min_length": 234.0, "epoch": 0.22389518967540087, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.796875, "kl": 0.00311912284232676, "learning_rate": 2.7140836901914607e-06, "loss": 0.00011368910782039165, "reward": 0.38578147689501446, "reward_std": 0.2828473597764969, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.385781466960907, "rewards/QAReward/std": 0.47928332289059955, "step": 1145 }, { "clip_ratio/high_max": 0.0005241815699264408, "clip_ratio/high_mean": 0.00029833061853423715, "clip_ratio/low_mean": 5.9016859449911865e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003573474765289575, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 531.15625, "completions/min_length": 246.5, "epoch": 0.2248728979272585, "frac_reward_zero_std": 0.046875, "grad_norm": 0.78515625, "kl": 0.003062966093420982, "learning_rate": 2.7112878308994565e-06, "loss": 0.0001647886005230248, "reward": 0.3514741361141205, "reward_std": 0.26056213676929474, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3514741510152817, "rewards/QAReward/std": 0.4852279871702194, "step": 1150 }, { "clip_ratio/high_max": 0.0003208875306881964, "clip_ratio/high_mean": 0.00018454293021932244, "clip_ratio/low_mean": 5.707677482860163e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024161971523426474, "completions/clipped_ratio": 0.01953125, "completions/max_length": 943.3333333333334, "completions/mean_length": 512.3893229166666, "completions/min_length": 249.66666666666666, "epoch": 0.22585060617911615, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8359375, "kl": 0.003074875473976135, "learning_rate": 2.7084798230869134e-06, "loss": 5.405168049037457e-05, "reward": 0.3149403731028239, "reward_std": 0.29144475360711414, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3149403929710388, "rewards/QAReward/std": 0.4335060715675354, "step": 1155 }, { "clip_ratio/high_max": 0.00046267557190731167, "clip_ratio/high_mean": 0.00024017448886297642, "clip_ratio/low_mean": 6.831096397945658e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030848545720800756, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 521.0, "completions/min_length": 238.5, "epoch": 0.2268283144309738, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.003173379972577095, "learning_rate": 2.7056596949165366e-06, "loss": 0.00015466423938050866, "reward": 0.3440747559070587, "reward_std": 0.2614349126815796, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3440747410058975, "rewards/QAReward/std": 0.4184312969446182, "step": 1160 }, { "clip_ratio/high_max": 0.00042361277155578136, "clip_ratio/high_mean": 0.00019288965268060565, "clip_ratio/low_mean": 6.735073984600603e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026024040998890997, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 519.9765625, "completions/min_length": 233.0, "epoch": 0.22780602268283145, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.82421875, "kl": 0.003146537486463785, "learning_rate": 2.7028274746725887e-06, "loss": 7.964825490489603e-05, "reward": 0.344008872906367, "reward_std": 0.2792116602261861, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3440088629722595, "rewards/QAReward/std": 0.453321248292923, "step": 1165 }, { "clip_ratio/high_max": 0.0005472594173625111, "clip_ratio/high_mean": 0.00029521890101023016, "clip_ratio/low_mean": 8.893857011571527e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038415745366364716, "completions/clipped_ratio": 0.029296875, "completions/max_length": 917.0, "completions/mean_length": 509.44921875, "completions/min_length": 231.5, "epoch": 0.2287837309346891, "frac_reward_zero_std": 0.0625, "grad_norm": 0.80078125, "kl": 0.0032944954466074703, "learning_rate": 2.6999831907606116e-06, "loss": 0.00022460550535470248, "reward": 0.3789558559656143, "reward_std": 0.26344840228557587, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3789558708667755, "rewards/QAReward/std": 0.4732534885406494, "step": 1170 }, { "clip_ratio/high_max": 0.00042973628733307124, "clip_ratio/high_mean": 0.0002277903025969863, "clip_ratio/low_mean": 7.847714005038142e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030626743100583555, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 507.7513020833333, "completions/min_length": 220.66666666666666, "epoch": 0.22976143918654673, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.85546875, "kl": 0.003273191023617983, "learning_rate": 2.6971268717071376e-06, "loss": 0.00015966091305017472, "reward": 0.3148101568222046, "reward_std": 0.273919016122818, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3148101766904195, "rewards/QAReward/std": 0.46132628122965497, "step": 1175 }, { "clip_ratio/high_max": 0.0004574957536533475, "clip_ratio/high_mean": 0.0002740025636740029, "clip_ratio/low_mean": 8.735180017538369e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003613543754909188, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 533.869140625, "completions/min_length": 252.0, "epoch": 0.23073914743840437, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0031697927974164487, "learning_rate": 2.6942585461594044e-06, "loss": 0.00014790600398555397, "reward": 0.33206458389759064, "reward_std": 0.25192075967788696, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33206456899642944, "rewards/QAReward/std": 0.40642479062080383, "step": 1180 }, { "clip_ratio/high_max": 0.0004923330852761865, "clip_ratio/high_mean": 0.00019420827156864108, "clip_ratio/low_mean": 4.372885450720787e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023793710861355066, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 513.9635416666666, "completions/min_length": 251.33333333333334, "epoch": 0.231716855690262, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.84375, "kl": 0.0032305250875651836, "learning_rate": 2.691378242885068e-06, "loss": 0.000170522544067353, "reward": 0.36580326159795123, "reward_std": 0.2759823848803838, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3658032715320587, "rewards/QAReward/std": 0.43649742007255554, "step": 1185 }, { "clip_ratio/high_max": 0.0004885003552772104, "clip_ratio/high_mean": 0.00024122025934047997, "clip_ratio/low_mean": 9.070304950000718e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003319233190268278, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 527.828125, "completions/min_length": 234.5, "epoch": 0.23269456394211968, "frac_reward_zero_std": 0.046875, "grad_norm": 0.77734375, "kl": 0.0032074843998998403, "learning_rate": 2.6884859907719142e-06, "loss": 0.00015143175842240452, "reward": 0.2786543518304825, "reward_std": 0.26970866322517395, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2786543518304825, "rewards/QAReward/std": 0.3979388475418091, "step": 1190 }, { "clip_ratio/high_max": 0.00027875289088115097, "clip_ratio/high_mean": 0.00016642643604427576, "clip_ratio/low_mean": 3.331440238980576e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019974084571003914, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 507.35546875, "completions/min_length": 243.0, "epoch": 0.23367227219397732, "frac_reward_zero_std": 0.03125, "grad_norm": 0.85546875, "kl": 0.0033236455637961626, "learning_rate": 2.685581818827569e-06, "loss": 9.897121926769615e-05, "reward": 0.29922961195309955, "reward_std": 0.2921069363753001, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29922959208488464, "rewards/QAReward/std": 0.4735659758249919, "step": 1195 }, { "clip_ratio/high_max": 0.0007650746731087566, "clip_ratio/high_mean": 0.0003106331918388605, "clip_ratio/low_mean": 0.00010298948036506772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004136226838454604, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 501.0078125, "completions/min_length": 254.5, "epoch": 0.23464998044583496, "frac_reward_zero_std": 0.046875, "grad_norm": 0.80859375, "kl": 0.0033715073484927418, "learning_rate": 2.6826657561792072e-06, "loss": 0.00015534963458776473, "reward": 0.30838683247566223, "reward_std": 0.28515003621578217, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30838683247566223, "rewards/QAReward/std": 0.4649941474199295, "step": 1200 }, { "clip_ratio/high_max": 0.00030586987268179657, "clip_ratio/high_mean": 0.00013931321445852517, "clip_ratio/low_mean": 4.505247925408184e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018436568789184093, "completions/clipped_ratio": 0.009114583333333334, "completions/max_length": 1024.0, "completions/mean_length": 508.6184895833333, "completions/min_length": 230.66666666666666, "epoch": 0.2356276886976926, "frac_reward_zero_std": 0.03125, "grad_norm": 0.875, "kl": 0.0034152254462242126, "learning_rate": 2.6797378320732615e-06, "loss": 0.00013229972682893276, "reward": 0.29872117439905804, "reward_std": 0.29290241996447247, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29872117439905804, "rewards/QAReward/std": 0.45030659437179565, "step": 1205 }, { "clip_ratio/high_max": 0.0005416675470769405, "clip_ratio/high_mean": 0.0003018399002030492, "clip_ratio/low_mean": 8.924772555474192e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003910876461304724, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 497.90625, "completions/min_length": 247.0, "epoch": 0.23660539694955027, "frac_reward_zero_std": 0.046875, "grad_norm": 0.87109375, "kl": 0.0033682373352348803, "learning_rate": 2.6767980758751263e-06, "loss": 9.943682234734297e-05, "reward": 0.4006897807121277, "reward_std": 0.25729192793369293, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4006897807121277, "rewards/QAReward/std": 0.430644229054451, "step": 1210 }, { "clip_ratio/high_max": 0.00033362298272550107, "clip_ratio/high_mean": 0.00014485553838312627, "clip_ratio/low_mean": 3.561864141374826e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001804741856176406, "completions/clipped_ratio": 0.01953125, "completions/max_length": 967.0, "completions/mean_length": 520.1458333333334, "completions/min_length": 232.0, "epoch": 0.2375831052014079, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.828125, "kl": 0.0033504455350339413, "learning_rate": 2.673846517068866e-06, "loss": 0.00014653329271823167, "reward": 0.36650081475575763, "reward_std": 0.2651844123999278, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36650081475575763, "rewards/QAReward/std": 0.43695183595021564, "step": 1215 }, { "clip_ratio/high_max": 0.0006452607456594705, "clip_ratio/high_mean": 0.00027387560112401843, "clip_ratio/low_mean": 8.93802527571097e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003632558509707451, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 516.28125, "completions/min_length": 257.5, "epoch": 0.23856081345326555, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.0031859111040830612, "learning_rate": 2.6708831852569195e-06, "loss": 0.00014770267298445106, "reward": 0.3277522027492523, "reward_std": 0.2757853716611862, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3277522027492523, "rewards/QAReward/std": 0.4732517898082733, "step": 1220 }, { "clip_ratio/high_max": 0.0004251386388204992, "clip_ratio/high_mean": 0.00023960485123097897, "clip_ratio/low_mean": 3.289823944214731e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027250308776274326, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 514.9752604166666, "completions/min_length": 250.33333333333334, "epoch": 0.2395385217051232, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8125, "kl": 0.003240442229434848, "learning_rate": 2.6679081101598003e-06, "loss": 0.00012995790457352995, "reward": 0.3715820411841075, "reward_std": 0.27278950810432434, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3715820411841075, "rewards/QAReward/std": 0.44827864567438763, "step": 1225 }, { "clip_ratio/high_max": 0.000634684378746897, "clip_ratio/high_mean": 0.000275516853434965, "clip_ratio/low_mean": 0.00010097066988237202, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037648750585503874, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 522.26171875, "completions/min_length": 262.5, "epoch": 0.24051622995698083, "frac_reward_zero_std": 0.046875, "grad_norm": 0.84375, "kl": 0.0032108381390571596, "learning_rate": 2.6649213216158017e-06, "loss": 0.0001009650295600295, "reward": 0.30656933784484863, "reward_std": 0.2761073261499405, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30656932294368744, "rewards/QAReward/std": 0.4674234390258789, "step": 1230 }, { "clip_ratio/high_max": 0.00028153740568086504, "clip_ratio/high_mean": 0.00017323988140560687, "clip_ratio/low_mean": 3.727941366378218e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002105192863382399, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 517.1536458333334, "completions/min_length": 240.66666666666666, "epoch": 0.2414939382088385, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83984375, "kl": 0.003122801100835204, "learning_rate": 2.661922849580695e-06, "loss": 0.0001245168154127896, "reward": 0.43429646889368695, "reward_std": 0.27498048543930054, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.43429645895957947, "rewards/QAReward/std": 0.4136141240596771, "step": 1235 }, { "clip_ratio/high_max": 0.0004962988197803498, "clip_ratio/high_mean": 0.0002637240686453879, "clip_ratio/low_mean": 5.382381787057966e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003175479010678828, "completions/clipped_ratio": 0.001953125, "completions/max_length": 975.5, "completions/mean_length": 499.935546875, "completions/min_length": 258.0, "epoch": 0.24247164646069613, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.0031691731885075567, "learning_rate": 2.6589127241274315e-06, "loss": 0.00016097663901746274, "reward": 0.42164093255996704, "reward_std": 0.29319414496421814, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42164094746112823, "rewards/QAReward/std": 0.41628216207027435, "step": 1240 }, { "clip_ratio/high_max": 0.0003602221957407892, "clip_ratio/high_mean": 0.00018335261265747249, "clip_ratio/low_mean": 2.412954272585921e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020748216775245965, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 530.203125, "completions/min_length": 224.0, "epoch": 0.24344935471255377, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.80859375, "kl": 0.0031028748489916325, "learning_rate": 2.655890975445838e-06, "loss": 9.657815680839122e-05, "reward": 0.3607654372851054, "reward_std": 0.2785087029139201, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36076544721921283, "rewards/QAReward/std": 0.4404661953449249, "step": 1245 }, { "clip_ratio/high_max": 0.0004420161014422774, "clip_ratio/high_mean": 0.00022553378948941827, "clip_ratio/low_mean": 7.649043836863712e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030202423222362994, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 533.12109375, "completions/min_length": 275.0, "epoch": 0.2444270629644114, "frac_reward_zero_std": 0.015625, "grad_norm": 0.80078125, "kl": 0.002974556107074022, "learning_rate": 2.6528576338423175e-06, "loss": 0.00011992724612355232, "reward": 0.3405327796936035, "reward_std": 0.3100473880767822, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3405327796936035, "rewards/QAReward/std": 0.5325755476951599, "step": 1250 }, { "clip_ratio/high_max": 0.0002436392824165523, "clip_ratio/high_mean": 0.00013114686589688063, "clip_ratio/low_mean": 5.4670704412274065e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018581757321953774, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 525.70703125, "completions/min_length": 251.33333333333334, "epoch": 0.24540477121626905, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8125, "kl": 0.0030151598155498506, "learning_rate": 2.649812729739542e-06, "loss": 0.00012477824930101633, "reward": 0.39084097743034363, "reward_std": 0.2793751160303752, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39084097743034363, "rewards/QAReward/std": 0.437321017185847, "step": 1255 }, { "clip_ratio/high_max": 0.0003857045667245984, "clip_ratio/high_mean": 0.0002143125020666048, "clip_ratio/low_mean": 6.487061327788978e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027918312698602674, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/mean_length": 522.501953125, "completions/min_length": 209.5, "epoch": 0.24638247946812672, "frac_reward_zero_std": 0.046875, "grad_norm": 0.796875, "kl": 0.003140829550102353, "learning_rate": 2.6467562936761487e-06, "loss": 0.00015409071929752826, "reward": 0.3562113344669342, "reward_std": 0.27492280304431915, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.356211319565773, "rewards/QAReward/std": 0.4216196835041046, "step": 1260 }, { "clip_ratio/high_max": 0.00045363393146544693, "clip_ratio/high_mean": 0.00020571185159496964, "clip_ratio/low_mean": 5.907275481149554e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026478460058569907, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 541.8190104166666, "completions/min_length": 258.3333333333333, "epoch": 0.24736018771998436, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.796875, "kl": 0.0029436558485031127, "learning_rate": 2.643688356306434e-06, "loss": 9.240475483238698e-05, "reward": 0.32451844215393066, "reward_std": 0.27998353044192, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32451844215393066, "rewards/QAReward/std": 0.49342966079711914, "step": 1265 }, { "clip_ratio/high_max": 0.0005260190693661571, "clip_ratio/high_mean": 0.00028544061933644116, "clip_ratio/low_mean": 9.811250201892107e-05, "clip_ratio/low_min": 2.2922636708244683e-05, "clip_ratio/region_mean": 0.00038355314172804355, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 549.080078125, "completions/min_length": 259.0, "epoch": 0.248337895971842, "frac_reward_zero_std": 0.03125, "grad_norm": 0.75390625, "kl": 0.002941098343580961, "learning_rate": 2.6406089484000465e-06, "loss": 0.00010208950843662023, "reward": 0.3642663359642029, "reward_std": 0.2767999470233917, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3642663359642029, "rewards/QAReward/std": 0.4082060754299164, "step": 1270 }, { "clip_ratio/high_max": 0.0003433891455642879, "clip_ratio/high_mean": 0.00018126696813851594, "clip_ratio/low_mean": 2.832285172189586e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020958981476724148, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 509.1549479166667, "completions/min_length": 251.0, "epoch": 0.24931560422369964, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.8125, "kl": 0.003309054672718048, "learning_rate": 2.6375181008416765e-06, "loss": 0.00019198821391910314, "reward": 0.37159186601638794, "reward_std": 0.27724933127562207, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37159185608228046, "rewards/QAReward/std": 0.44489939014116925, "step": 1275 }, { "clip_ratio/high_max": 0.0005691378493793309, "clip_ratio/high_mean": 0.0003544486011378467, "clip_ratio/low_mean": 6.352522468660027e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041797381127253177, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 520.609375, "completions/min_length": 263.5, "epoch": 0.2502933124755573, "frac_reward_zero_std": 0.015625, "grad_norm": 0.85546875, "kl": 0.003074201988056302, "learning_rate": 2.6344158446307486e-06, "loss": 0.00010991955641657115, "reward": 0.4058668464422226, "reward_std": 0.28709159791469574, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4058668464422226, "rewards/QAReward/std": 0.41494426131248474, "step": 1280 }, { "clip_ratio/high_max": 0.0002509695594199002, "clip_ratio/high_mean": 0.00016653157072141767, "clip_ratio/low_mean": 5.673249979736284e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022326406324282288, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 523.4908854166666, "completions/min_length": 257.6666666666667, "epoch": 0.2512710207274149, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.78125, "kl": 0.0030524313915520906, "learning_rate": 2.631302210881108e-06, "loss": 8.742116042412818e-05, "reward": 0.33851170539855957, "reward_std": 0.27991431951522827, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33851170539855957, "rewards/QAReward/std": 0.4209769666194916, "step": 1285 }, { "clip_ratio/high_max": 0.00059872210258618, "clip_ratio/high_mean": 0.00034901987528428434, "clip_ratio/low_mean": 7.744718168396502e-05, "clip_ratio/low_min": 2.0214270625729115e-05, "clip_ratio/region_mean": 0.00042646708898246287, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 539.25390625, "completions/min_length": 204.0, "epoch": 0.2522487289792726, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.003111954778432846, "learning_rate": 2.6281772308207117e-06, "loss": 0.00016200051177293063, "reward": 0.3941158354282379, "reward_std": 0.2864273935556412, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3941158205270767, "rewards/QAReward/std": 0.43056620657444, "step": 1290 }, { "clip_ratio/high_max": 0.00027180296601727607, "clip_ratio/high_mean": 0.00012478922144509852, "clip_ratio/low_mean": 6.132196867838501e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018611119012348353, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 538.9453125, "completions/min_length": 262.3333333333333, "epoch": 0.25322643723113025, "frac_reward_zero_std": 0.09375, "grad_norm": 0.75, "kl": 0.0029322762042284013, "learning_rate": 2.6250409357913132e-06, "loss": 0.000122888945043087, "reward": 0.42431435982386273, "reward_std": 0.2531987875699997, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42431435982386273, "rewards/QAReward/std": 0.44662946462631226, "step": 1295 }, { "clip_ratio/high_max": 0.0005636439542286098, "clip_ratio/high_mean": 0.00030241676140576603, "clip_ratio/low_mean": 6.201988144312054e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003644366399385035, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 505.7578125, "completions/min_length": 265.5, "epoch": 0.25420414548298786, "frac_reward_zero_std": 0.0625, "grad_norm": 0.91015625, "kl": 0.0031360150314867496, "learning_rate": 2.6218933572481487e-06, "loss": 0.00012986541260033845, "reward": 0.38123372197151184, "reward_std": 0.2602849155664444, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38123373687267303, "rewards/QAReward/std": 0.450019434094429, "step": 1300 }, { "clip_ratio/high_max": 0.00034461140166968106, "clip_ratio/high_mean": 0.00017372703878208994, "clip_ratio/low_mean": 2.7537441928870977e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020126448944211007, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 530.2291666666666, "completions/min_length": 213.33333333333334, "epoch": 0.25518185373484553, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8359375, "kl": 0.002977911988273263, "learning_rate": 2.618734526759621e-06, "loss": 0.00014185897307470441, "reward": 0.3301713665326436, "reward_std": 0.2570934146642685, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3301713565985362, "rewards/QAReward/std": 0.4165300726890564, "step": 1305 }, { "clip_ratio/high_max": 0.0005167638417333364, "clip_ratio/high_mean": 0.0002479526796378195, "clip_ratio/low_mean": 7.221578562166542e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000320168468169868, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 537.6640625, "completions/min_length": 288.0, "epoch": 0.25615956198670314, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7734375, "kl": 0.002925369469448924, "learning_rate": 2.6155644760069837e-06, "loss": 0.00018291972810402513, "reward": 0.2998085916042328, "reward_std": 0.2918834090232849, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2998085916042328, "rewards/QAReward/std": 0.45616382360458374, "step": 1310 }, { "clip_ratio/high_max": 0.00029362642671912906, "clip_ratio/high_mean": 0.00016934828017838298, "clip_ratio/low_mean": 6.468995125032961e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023403821978718042, "completions/clipped_ratio": 0.02734375, "completions/max_length": 987.3333333333334, "completions/mean_length": 500.859375, "completions/min_length": 233.33333333333334, "epoch": 0.2571372702385608, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.828125, "kl": 0.003054302232339978, "learning_rate": 2.612383236784023e-06, "loss": 6.409583147615194e-05, "reward": 0.34642496705055237, "reward_std": 0.27671009798844654, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3464249571164449, "rewards/QAReward/std": 0.45505766073862713, "step": 1315 }, { "clip_ratio/high_max": 0.0004966994049027563, "clip_ratio/high_mean": 0.0002464923192746937, "clip_ratio/low_mean": 6.852254591649398e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003150148724671453, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 529.900390625, "completions/min_length": 241.5, "epoch": 0.2581149784904185, "frac_reward_zero_std": 0.046875, "grad_norm": 0.79296875, "kl": 0.002898013964295387, "learning_rate": 2.60919084099674e-06, "loss": 8.53169010952115e-05, "reward": 0.35086962580680847, "reward_std": 0.25840523838996887, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3508696109056473, "rewards/QAReward/std": 0.45527221262454987, "step": 1320 }, { "clip_ratio/high_max": 0.00040386393666267393, "clip_ratio/high_mean": 0.00014256389695219696, "clip_ratio/low_mean": 5.560645658988506e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019817035645246505, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 524.6393229166666, "completions/min_length": 237.66666666666666, "epoch": 0.2590926867422761, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.828125, "kl": 0.0029184194281697273, "learning_rate": 2.605987320663029e-06, "loss": 0.00019145393744111062, "reward": 0.2886621554692586, "reward_std": 0.2731269697348277, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2886621455351512, "rewards/QAReward/std": 0.4396228889624278, "step": 1325 }, { "clip_ratio/high_max": 0.0006985468789935112, "clip_ratio/high_mean": 0.0002866434282623231, "clip_ratio/low_mean": 8.57790553709492e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037242246326059104, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 520.615234375, "completions/min_length": 253.5, "epoch": 0.26007039499413376, "frac_reward_zero_std": 0.078125, "grad_norm": 0.8359375, "kl": 0.0030548883602023124, "learning_rate": 2.6027727079123562e-06, "loss": 0.00017926233122125267, "reward": 0.391303151845932, "reward_std": 0.28654278814792633, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3913031816482544, "rewards/QAReward/std": 0.4604186862707138, "step": 1330 }, { "clip_ratio/high_max": 0.00021800021640956403, "clip_ratio/high_mean": 0.00012079512816853821, "clip_ratio/low_mean": 4.316903068684042e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016396415303461255, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 509.5416666666667, "completions/min_length": 236.66666666666666, "epoch": 0.26104810324599137, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9140625, "kl": 0.0030335218645632265, "learning_rate": 2.5995470349854407e-06, "loss": 3.3251653076149526e-05, "reward": 0.3861297070980072, "reward_std": 0.3002127707004547, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3861297369003296, "rewards/QAReward/std": 0.4564511775970459, "step": 1335 }, { "clip_ratio/high_max": 0.0004962823470123113, "clip_ratio/high_mean": 0.00026552107883617284, "clip_ratio/low_mean": 6.92428118782118e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033476391108706595, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 533.283203125, "completions/min_length": 239.5, "epoch": 0.26202581149784904, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.0030011676717549562, "learning_rate": 2.5963103342339264e-06, "loss": 0.00016267600003629922, "reward": 0.3712441027164459, "reward_std": 0.26647088676691055, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3712441325187683, "rewards/QAReward/std": 0.44732317328453064, "step": 1340 }, { "clip_ratio/high_max": 0.00029384176013991237, "clip_ratio/high_mean": 0.00015823394060134887, "clip_ratio/low_mean": 5.565198080148548e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021388591267168522, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 515.77734375, "completions/min_length": 251.0, "epoch": 0.2630035197497067, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.82421875, "kl": 0.003228190029039979, "learning_rate": 2.593062638120061e-06, "loss": 0.00015515347477048635, "reward": 0.3800920844078064, "reward_std": 0.28476013739903766, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3800920893748601, "rewards/QAReward/std": 0.4696885347366333, "step": 1345 }, { "clip_ratio/high_max": 0.0005468286690302193, "clip_ratio/high_mean": 0.00024798605008982124, "clip_ratio/low_mean": 6.61170473904349e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003141030902042985, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 525.576171875, "completions/min_length": 245.5, "epoch": 0.2639812280015643, "frac_reward_zero_std": 0.046875, "grad_norm": 0.87890625, "kl": 0.0030523513909429313, "learning_rate": 2.5898039792163702e-06, "loss": 0.0001071014441549778, "reward": 0.38870055973529816, "reward_std": 0.2681543529033661, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38870054483413696, "rewards/QAReward/std": 0.46043260395526886, "step": 1350 }, { "clip_ratio/high_max": 0.00034261770779266956, "clip_ratio/high_mean": 0.00018477399717085064, "clip_ratio/low_mean": 6.119250319898128e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024596650619059803, "completions/clipped_ratio": 0.009114583333333334, "completions/max_length": 971.0, "completions/mean_length": 510.2109375, "completions/min_length": 241.33333333333334, "epoch": 0.264958936253422, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.003145247232168913, "learning_rate": 2.5865343902053286e-06, "loss": 0.00018093908438459038, "reward": 0.3587859670321147, "reward_std": 0.2849981486797333, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3587859471638997, "rewards/QAReward/std": 0.4307633141676585, "step": 1355 }, { "clip_ratio/high_max": 0.00032091353205032647, "clip_ratio/high_mean": 0.00020560246775858104, "clip_ratio/low_mean": 0.00010069584532175213, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000306298304349184, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 536.6640625, "completions/min_length": 253.0, "epoch": 0.26593664450527965, "frac_reward_zero_std": 0.015625, "grad_norm": 0.796875, "kl": 0.0030579672195017336, "learning_rate": 2.583253903879034e-06, "loss": 0.00015343963168561458, "reward": 0.3630533814430237, "reward_std": 0.2998335659503937, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3630533814430237, "rewards/QAReward/std": 0.4325074702501297, "step": 1360 }, { "clip_ratio/high_max": 0.00036559778964146973, "clip_ratio/high_mean": 0.000200019555632025, "clip_ratio/low_mean": 4.691956855822355e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002469391212798655, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.2526041666666, "completions/min_length": 270.6666666666667, "epoch": 0.26691435275713726, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.7890625, "kl": 0.002901044487953186, "learning_rate": 2.5799625531388785e-06, "loss": 0.00017080268589779734, "reward": 0.38119783997535706, "reward_std": 0.2742587725321452, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3811978300412496, "rewards/QAReward/std": 0.43732481201489765, "step": 1365 }, { "clip_ratio/high_max": 0.0005821676226332783, "clip_ratio/high_mean": 0.00031655142083764076, "clip_ratio/low_mean": 9.257502970285713e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004091264680027962, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 508.3828125, "completions/min_length": 246.5, "epoch": 0.26789206100899493, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8359375, "kl": 0.0031105336267501114, "learning_rate": 2.5766603709952182e-06, "loss": 0.00013581688981503248, "reward": 0.3623891621828079, "reward_std": 0.30098286271095276, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3623891621828079, "rewards/QAReward/std": 0.46293117105960846, "step": 1370 }, { "clip_ratio/high_max": 0.00033257181057706476, "clip_ratio/high_mean": 0.00020465408451855182, "clip_ratio/low_mean": 7.212294149212539e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002767770201899111, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 522.7473958333334, "completions/min_length": 241.66666666666666, "epoch": 0.26886976926085254, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.003069460391998291, "learning_rate": 2.573347390567041e-06, "loss": 0.00021728833671659232, "reward": 0.32618118325869244, "reward_std": 0.27586973706881207, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32618117332458496, "rewards/QAReward/std": 0.43856749931971234, "step": 1375 }, { "clip_ratio/high_max": 0.0005346686695702374, "clip_ratio/high_mean": 0.0003323872573673725, "clip_ratio/low_mean": 8.892616024240852e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004213134292513132, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 507.26953125, "completions/min_length": 235.0, "epoch": 0.2698474775127102, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8671875, "kl": 0.0030990542843937876, "learning_rate": 2.5700236450816354e-06, "loss": 0.0001386275514960289, "reward": 0.4055563360452652, "reward_std": 0.28288671374320984, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4055563360452652, "rewards/QAReward/std": 0.4292929023504257, "step": 1380 }, { "clip_ratio/high_max": 0.00045872669434174893, "clip_ratio/high_mean": 0.00022316694958135486, "clip_ratio/low_mean": 5.58510102564469e-05, "clip_ratio/low_min": 1.8407731840852647e-05, "clip_ratio/region_mean": 0.0002790179569274187, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 527.6822916666666, "completions/min_length": 241.66666666666666, "epoch": 0.2708251857645679, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83984375, "kl": 0.0029750486370176076, "learning_rate": 2.5666891678742584e-06, "loss": 0.00011160506401211023, "reward": 0.3202902873357137, "reward_std": 0.2896516223748525, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32029027740160626, "rewards/QAReward/std": 0.4741848011811574, "step": 1385 }, { "clip_ratio/high_max": 0.0004309849115088582, "clip_ratio/high_mean": 0.00024480002466589215, "clip_ratio/low_mean": 9.14137388463132e-05, "clip_ratio/low_min": 1.9756988331209867e-05, "clip_ratio/region_mean": 0.0003362137707881629, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 519.94140625, "completions/min_length": 250.5, "epoch": 0.2718028940164255, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8359375, "kl": 0.0030105235520750282, "learning_rate": 2.5633439923877992e-06, "loss": 0.000126453279517591, "reward": 0.34074048697948456, "reward_std": 0.2711641937494278, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34074048697948456, "rewards/QAReward/std": 0.42885883152484894, "step": 1390 }, { "clip_ratio/high_max": 0.00026976176304742693, "clip_ratio/high_mean": 0.00015020148712210357, "clip_ratio/low_mean": 5.722895584767684e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020743043860420584, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 525.4466145833334, "completions/min_length": 268.3333333333333, "epoch": 0.27278060226828316, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.81640625, "kl": 0.003000221587717533, "learning_rate": 2.5599881521724443e-06, "loss": 0.00014870594022795559, "reward": 0.3572317361831665, "reward_std": 0.2704586386680603, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3572317361831665, "rewards/QAReward/std": 0.4546924630800883, "step": 1395 }, { "clip_ratio/high_max": 0.0005754355806857347, "clip_ratio/high_mean": 0.0002786210272461176, "clip_ratio/low_mean": 6.983177590882405e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003484527871478349, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 516.181640625, "completions/min_length": 226.0, "epoch": 0.27375831052014077, "frac_reward_zero_std": 0.046875, "grad_norm": 0.78515625, "kl": 0.0030617361888289453, "learning_rate": 2.556621680885342e-06, "loss": 0.0002232902217656374, "reward": 0.38196612894535065, "reward_std": 0.2663272023200989, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38196614384651184, "rewards/QAReward/std": 0.44425900280475616, "step": 1400 }, { "clip_ratio/high_max": 0.0003772052237764001, "clip_ratio/high_mean": 0.00017851598095148803, "clip_ratio/low_mean": 6.124173232819885e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023975770454853773, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 531.8958333333334, "completions/min_length": 239.66666666666666, "epoch": 0.27473601877199844, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8515625, "kl": 0.0030962209217250346, "learning_rate": 2.5532446122902642e-06, "loss": 9.998445166274905e-05, "reward": 0.28453725079695386, "reward_std": 0.28587618470191956, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2845372458299001, "rewards/QAReward/std": 0.4574250777562459, "step": 1405 }, { "clip_ratio/high_max": 0.0006600976339541376, "clip_ratio/high_mean": 0.00028144634561613204, "clip_ratio/low_mean": 5.856423522345722e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034001058083958923, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 522.220703125, "completions/min_length": 244.5, "epoch": 0.2757137270238561, "frac_reward_zero_std": 0.046875, "grad_norm": 0.86328125, "kl": 0.002984838094562292, "learning_rate": 2.5498569802572665e-06, "loss": 6.687769200652838e-05, "reward": 0.33384256064891815, "reward_std": 0.2694394886493683, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33384254574775696, "rewards/QAReward/std": 0.43138745427131653, "step": 1410 }, { "clip_ratio/high_max": 0.00039195161079987886, "clip_ratio/high_mean": 0.00017090283799916505, "clip_ratio/low_mean": 7.010565313976258e-05, "clip_ratio/low_min": 2.379252982791513e-05, "clip_ratio/region_mean": 0.00024100850569084287, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 531.2239583333334, "completions/min_length": 242.66666666666666, "epoch": 0.2766914352757137, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.828125, "kl": 0.002990884380415082, "learning_rate": 2.5464588187623513e-06, "loss": 8.336496539413929e-05, "reward": 0.37096301714579266, "reward_std": 0.28946061929066974, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37096303701400757, "rewards/QAReward/std": 0.43118906021118164, "step": 1415 }, { "clip_ratio/high_max": 0.0004227628349326551, "clip_ratio/high_mean": 0.0002551058190874755, "clip_ratio/low_mean": 9.34312934987247e-05, "clip_ratio/low_min": 2.093583170790225e-05, "clip_ratio/region_mean": 0.00034853711258620024, "completions/clipped_ratio": 0.048828125, "completions/max_length": 1024.0, "completions/mean_length": 542.568359375, "completions/min_length": 266.5, "epoch": 0.2776691435275714, "frac_reward_zero_std": 0.015625, "grad_norm": 0.82421875, "kl": 0.002948349853977561, "learning_rate": 2.543050161887124e-06, "loss": 0.00011325324885547161, "reward": 0.4042380005121231, "reward_std": 0.27978575229644775, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4042379856109619, "rewards/QAReward/std": 0.39731453359127045, "step": 1420 }, { "clip_ratio/high_max": 0.00037338570691645143, "clip_ratio/high_mean": 0.00020466604037210344, "clip_ratio/low_mean": 4.555788764264435e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025022393092513087, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 537.8580729166666, "completions/min_length": 252.66666666666666, "epoch": 0.278646851779429, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.80859375, "kl": 0.002991762291640043, "learning_rate": 2.539631043818454e-06, "loss": 0.00012043544556945562, "reward": 0.3839735190073649, "reward_std": 0.2871878246466319, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38397352894147235, "rewards/QAReward/std": 0.460706889629364, "step": 1425 }, { "clip_ratio/high_max": 0.0005976891028694808, "clip_ratio/high_mean": 0.0003243521088734269, "clip_ratio/low_mean": 7.933563028927892e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040368774207308886, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 518.12890625, "completions/min_length": 250.0, "epoch": 0.27962456003128666, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.00313473935239017, "learning_rate": 2.536201498848129e-06, "loss": 0.00010043089278042317, "reward": 0.43094927072525024, "reward_std": 0.2628187835216522, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.43094927072525024, "rewards/QAReward/std": 0.4058062583208084, "step": 1430 }, { "clip_ratio/high_max": 0.0004216104280203581, "clip_ratio/high_mean": 0.00022188016446307303, "clip_ratio/low_mean": 2.2899795294506475e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024477996630594133, "completions/clipped_ratio": 0.013020833333333334, "completions/max_length": 1024.0, "completions/mean_length": 510.2109375, "completions/min_length": 248.33333333333334, "epoch": 0.28060226828314433, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.86328125, "kl": 0.003177571902051568, "learning_rate": 2.5327615613725144e-06, "loss": 0.0001537930453196168, "reward": 0.29289383192857105, "reward_std": 0.29353047410647076, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2928938368956248, "rewards/QAReward/std": 0.4575629035631816, "step": 1435 }, { "clip_ratio/high_max": 0.0004978297045454382, "clip_ratio/high_mean": 0.0002861741289962083, "clip_ratio/low_mean": 6.970395625103266e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000355878088157624, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 502.462890625, "completions/min_length": 246.0, "epoch": 0.28157997653500194, "frac_reward_zero_std": 0.046875, "grad_norm": 0.84765625, "kl": 0.0032330852933228015, "learning_rate": 2.529311265892204e-06, "loss": 0.0002349859569221735, "reward": 0.4153056740760803, "reward_std": 0.2751936912536621, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4153056889772415, "rewards/QAReward/std": 0.4492518901824951, "step": 1440 }, { "clip_ratio/high_max": 0.00032146469457075, "clip_ratio/high_mean": 0.00013906254898756742, "clip_ratio/low_mean": 2.339923084946349e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001624617725610733, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 522.9895833333334, "completions/min_length": 237.0, "epoch": 0.2825576847868596, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8046875, "kl": 0.003250228660181165, "learning_rate": 2.525850647011679e-06, "loss": 0.000127575418446213, "reward": 0.3136124610900879, "reward_std": 0.29757757981618244, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3136124561230342, "rewards/QAReward/std": 0.4832280973593394, "step": 1445 }, { "clip_ratio/high_max": 0.0005050158360973, "clip_ratio/high_mean": 0.0002502390940207988, "clip_ratio/low_mean": 9.259751241188496e-05, "clip_ratio/low_min": 2.1417862444650382e-05, "clip_ratio/region_mean": 0.0003428366035223007, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 524.06640625, "completions/min_length": 245.0, "epoch": 0.2835353930387172, "frac_reward_zero_std": 0.015625, "grad_norm": 0.890625, "kl": 0.003108046716079116, "learning_rate": 2.5223797394389567e-06, "loss": 0.00016583342803642154, "reward": 0.4035022705793381, "reward_std": 0.254044845700264, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4035022258758545, "rewards/QAReward/std": 0.4063895046710968, "step": 1450 }, { "clip_ratio/high_max": 0.000317487888969481, "clip_ratio/high_mean": 0.00016672746860422195, "clip_ratio/low_mean": 5.2092391706537454e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021881985012441873, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 542.703125, "completions/min_length": 267.3333333333333, "epoch": 0.2845131012905749, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8046875, "kl": 0.0031294235959649086, "learning_rate": 2.5188985779852437e-06, "loss": 0.00018689215648919343, "reward": 0.33224567770957947, "reward_std": 0.28838029503822327, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33224568764368695, "rewards/QAReward/std": 0.4290009140968323, "step": 1455 }, { "clip_ratio/high_max": 0.0004749684128910303, "clip_ratio/high_mean": 0.00024343710392713547, "clip_ratio/low_mean": 0.00011448262084741145, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003579196985810995, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 514.73046875, "completions/min_length": 245.5, "epoch": 0.28549080954243256, "frac_reward_zero_std": 0.046875, "grad_norm": 0.89453125, "kl": 0.003155302396044135, "learning_rate": 2.5154071975645892e-06, "loss": 0.00015474590472877025, "reward": 0.3420921713113785, "reward_std": 0.2716662883758545, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3420921564102173, "rewards/QAReward/std": 0.42263932526111603, "step": 1460 }, { "clip_ratio/high_max": 0.00032419952331110837, "clip_ratio/high_mean": 0.0001738120394293219, "clip_ratio/low_mean": 4.65257529867813e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022033780114725233, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 515.4544270833334, "completions/min_length": 241.66666666666666, "epoch": 0.28646851779429017, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8203125, "kl": 0.003196585411205888, "learning_rate": 2.511905633193531e-06, "loss": 0.00013407240621745586, "reward": 0.3749131957689921, "reward_std": 0.26407720148563385, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3749131957689921, "rewards/QAReward/std": 0.42537763714790344, "step": 1465 }, { "clip_ratio/high_max": 0.0004183606128208339, "clip_ratio/high_mean": 0.00024102468159981072, "clip_ratio/low_mean": 8.222003962146118e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003232447197660804, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 530.7109375, "completions/min_length": 244.5, "epoch": 0.28744622604614783, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.0032114644534885885, "learning_rate": 2.508393919990747e-06, "loss": 0.00013611866161227225, "reward": 0.3760356903076172, "reward_std": 0.297013595700264, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3760356903076172, "rewards/QAReward/std": 0.4022676348686218, "step": 1470 }, { "clip_ratio/high_max": 0.00045989041682332755, "clip_ratio/high_mean": 0.00019004009664058685, "clip_ratio/low_mean": 2.922614148701541e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002192662446759641, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 549.19921875, "completions/min_length": 252.0, "epoch": 0.2884239342980055, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.81640625, "kl": 0.0031181414145976306, "learning_rate": 2.504872093176701e-06, "loss": 0.00018887026235461235, "reward": 0.2957330991824468, "reward_std": 0.2938343683878581, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29573309421539307, "rewards/QAReward/std": 0.46282894412676495, "step": 1475 }, { "clip_ratio/high_max": 0.0005100415553897619, "clip_ratio/high_mean": 0.0002644222287926823, "clip_ratio/low_mean": 6.902858440298587e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033345080446451903, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 527.302734375, "completions/min_length": 253.0, "epoch": 0.2894016425498631, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8046875, "kl": 0.0030475802719593047, "learning_rate": 2.5013401880732914e-06, "loss": 0.0001225472311489284, "reward": 0.3162481486797333, "reward_std": 0.2717610001564026, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31624817848205566, "rewards/QAReward/std": 0.43684859573841095, "step": 1480 }, { "clip_ratio/high_max": 0.00046398728154599666, "clip_ratio/high_mean": 0.00018260306678712367, "clip_ratio/low_mean": 4.04945167247206e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022309758933261036, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 515.4583333333334, "completions/min_length": 257.6666666666667, "epoch": 0.2903793508017208, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.828125, "kl": 0.0031142215710133316, "learning_rate": 2.497798240103496e-06, "loss": 0.00011046929284930229, "reward": 0.40056270360946655, "reward_std": 0.2675718863805135, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40056270360946655, "rewards/QAReward/std": 0.43258267641067505, "step": 1485 }, { "clip_ratio/high_max": 0.0005204747896641493, "clip_ratio/high_mean": 0.0002601319691166282, "clip_ratio/low_mean": 4.0960099795483984e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030109205981716515, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 520.65234375, "completions/min_length": 246.0, "epoch": 0.2913570590535784, "frac_reward_zero_std": 0.046875, "grad_norm": 0.77734375, "kl": 0.0030709617771208287, "learning_rate": 2.4942462847910165e-06, "loss": 0.00013745047617703677, "reward": 0.3993571996688843, "reward_std": 0.27175380289554596, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3993571996688843, "rewards/QAReward/std": 0.4315006881952286, "step": 1490 }, { "clip_ratio/high_max": 0.00038643388543277977, "clip_ratio/high_mean": 0.00021750395535491407, "clip_ratio/low_mean": 6.833021179772914e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002858341671526432, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 533.3580729166666, "completions/min_length": 230.66666666666666, "epoch": 0.29233476730543606, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8203125, "kl": 0.003058929229155183, "learning_rate": 2.4906843577599216e-06, "loss": 0.00018063858151435853, "reward": 0.415044238169988, "reward_std": 0.2855355242888133, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41504421830177307, "rewards/QAReward/std": 0.42750715216000873, "step": 1495 }, { "clip_ratio/high_max": 0.0005119177862070501, "clip_ratio/high_mean": 0.0002869551826734096, "clip_ratio/low_mean": 4.9033819959731775e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003359890193678439, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 540.0703125, "completions/min_length": 234.5, "epoch": 0.29331247555729373, "frac_reward_zero_std": 0.03125, "grad_norm": 1.0234375, "kl": 0.0030189513694494964, "learning_rate": 2.487112494734293e-06, "loss": 0.00017006639391183853, "reward": 0.3763067424297333, "reward_std": 0.30036965012550354, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3763067424297333, "rewards/QAReward/std": 0.4600937068462372, "step": 1500 }, { "clip_ratio/high_max": 0.0002857569372281432, "clip_ratio/high_mean": 0.00017534815124236046, "clip_ratio/low_mean": 7.556581986136734e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025091397110372783, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 522.265625, "completions/min_length": 247.33333333333334, "epoch": 0.29429018380915134, "frac_reward_zero_std": 0.03125, "grad_norm": 0.85546875, "kl": 0.0031586749944835903, "learning_rate": 2.4835307315378628e-06, "loss": 0.00014265014324337243, "reward": 0.3614609440167745, "reward_std": 0.27946730454762775, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36146092414855957, "rewards/QAReward/std": 0.42381081978480023, "step": 1505 }, { "clip_ratio/high_max": 0.0005057372036390007, "clip_ratio/high_mean": 0.00026330248219892384, "clip_ratio/low_mean": 5.2875948313158005e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031617841450497507, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 507.072265625, "completions/min_length": 233.5, "epoch": 0.295267892061009, "frac_reward_zero_std": 0.046875, "grad_norm": 0.82421875, "kl": 0.003102501155808568, "learning_rate": 2.4799391040936566e-06, "loss": 0.00018904911121353507, "reward": 0.3743102103471756, "reward_std": 0.2825523614883423, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3743102103471756, "rewards/QAReward/std": 0.4295196235179901, "step": 1510 }, { "clip_ratio/high_max": 0.0003835358307696879, "clip_ratio/high_mean": 0.00016850512474775314, "clip_ratio/low_mean": 6.921976746525616e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023772489512339235, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 521.8138020833334, "completions/min_length": 231.33333333333334, "epoch": 0.2962456003128666, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.81640625, "kl": 0.0030347732827067375, "learning_rate": 2.4763376484236318e-06, "loss": 0.0001055027823895216, "reward": 0.3055385152498881, "reward_std": 0.2760383188724518, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3055385152498881, "rewards/QAReward/std": 0.4309710164864858, "step": 1515 }, { "clip_ratio/high_max": 0.0005892940796911716, "clip_ratio/high_mean": 0.00025099554331973193, "clip_ratio/low_mean": 9.688942809589207e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034788497723639014, "completions/clipped_ratio": 0.048828125, "completions/max_length": 1024.0, "completions/mean_length": 537.4140625, "completions/min_length": 220.5, "epoch": 0.2972233085647243, "frac_reward_zero_std": 0.046875, "grad_norm": 0.79296875, "kl": 0.002922083996236324, "learning_rate": 2.4727264006483196e-06, "loss": 3.210185095667839e-05, "reward": 0.4181315153837204, "reward_std": 0.2742404341697693, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4181315153837204, "rewards/QAReward/std": 0.44581839442253113, "step": 1520 }, { "clip_ratio/high_max": 0.00022546894615516067, "clip_ratio/high_mean": 0.00015856915852054953, "clip_ratio/low_mean": 3.0364788835868238e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018893394153565168, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 516.4583333333334, "completions/min_length": 232.66666666666666, "epoch": 0.29820101681658195, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.84375, "kl": 0.0030734393279999495, "learning_rate": 2.4691053969864583e-06, "loss": 5.121263675391674e-05, "reward": 0.3614443838596344, "reward_std": 0.2619309326012929, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3614443739255269, "rewards/QAReward/std": 0.4814991553624471, "step": 1525 }, { "clip_ratio/high_max": 0.0004311722470447421, "clip_ratio/high_mean": 0.0002490897895768285, "clip_ratio/low_mean": 7.2704759077169e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003217945515643805, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 538.2421875, "completions/min_length": 246.5, "epoch": 0.29917872506843957, "frac_reward_zero_std": 0.046875, "grad_norm": 0.80859375, "kl": 0.0030612707603722812, "learning_rate": 2.465474673754633e-06, "loss": 0.00014077411033213137, "reward": 0.2720664441585541, "reward_std": 0.2795709818601608, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2720664292573929, "rewards/QAReward/std": 0.4821469187736511, "step": 1530 }, { "clip_ratio/high_max": 0.000305862445384264, "clip_ratio/high_mean": 0.00014921730617061256, "clip_ratio/low_mean": 3.535872674547136e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018457603291608394, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 527.6692708333334, "completions/min_length": 261.0, "epoch": 0.30015643332029723, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.0030914078000932933, "learning_rate": 2.46183426736691e-06, "loss": 0.0001382795860990882, "reward": 0.34587931632995605, "reward_std": 0.2737285792827606, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34587934613227844, "rewards/QAReward/std": 0.40709465742111206, "step": 1535 }, { "clip_ratio/high_max": 0.000539431010838598, "clip_ratio/high_mean": 0.0002928336092736572, "clip_ratio/low_mean": 6.855453975731506e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036138815339654685, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 521.697265625, "completions/min_length": 207.5, "epoch": 0.30113414157215485, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.0030792819336056708, "learning_rate": 2.4581842143344723e-06, "loss": 8.356379694305361e-05, "reward": 0.3831729143857956, "reward_std": 0.3012009561061859, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3831728994846344, "rewards/QAReward/std": 0.44711488485336304, "step": 1540 }, { "clip_ratio/high_max": 0.0003719421220012009, "clip_ratio/high_mean": 0.00018033304950222374, "clip_ratio/low_mean": 4.3670277227647604e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022400332381948828, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 520.6549479166666, "completions/min_length": 243.66666666666666, "epoch": 0.3021118498240125, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.83203125, "kl": 0.0030538678634911775, "learning_rate": 2.4545245512652536e-06, "loss": 0.00014978619292378427, "reward": 0.35235150655110675, "reward_std": 0.29283106327056885, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35235150655110675, "rewards/QAReward/std": 0.4446708858013153, "step": 1545 }, { "clip_ratio/high_max": 0.0005869533866643906, "clip_ratio/high_mean": 0.00027809306629933417, "clip_ratio/low_mean": 5.9686461463570593e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033777953358367087, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 539.345703125, "completions/min_length": 261.5, "epoch": 0.3030895580758702, "frac_reward_zero_std": 0.046875, "grad_norm": 0.78515625, "kl": 0.0031152948271483184, "learning_rate": 2.450855314863571e-06, "loss": 0.00014235074631869794, "reward": 0.35519903898239136, "reward_std": 0.27532845735549927, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35519902408123016, "rewards/QAReward/std": 0.4354543536901474, "step": 1550 }, { "clip_ratio/high_max": 0.0002984388265758753, "clip_ratio/high_mean": 0.00017397339106537402, "clip_ratio/low_mean": 4.0267416625283656e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021424081642180682, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.7200520833334, "completions/min_length": 269.6666666666667, "epoch": 0.3040672663277278, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.796875, "kl": 0.003088253736495972, "learning_rate": 2.4471765419297546e-06, "loss": 0.00014723283238708972, "reward": 0.33944589893023175, "reward_std": 0.2846665183703105, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3394458790620168, "rewards/QAReward/std": 0.4164922535419464, "step": 1555 }, { "clip_ratio/high_max": 0.0006108716945163906, "clip_ratio/high_mean": 0.0003332659718580544, "clip_ratio/low_mean": 6.386163440765812e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003971275989897549, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 535.798828125, "completions/min_length": 258.0, "epoch": 0.30504497457958546, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84375, "kl": 0.0030998675152659415, "learning_rate": 2.4434882693597837e-06, "loss": 0.00015270556323230267, "reward": 0.3322887718677521, "reward_std": 0.2955474704504013, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3322887718677521, "rewards/QAReward/std": 0.44305452704429626, "step": 1560 }, { "clip_ratio/high_max": 0.0003164665773510933, "clip_ratio/high_mean": 0.0001866869570221752, "clip_ratio/low_mean": 5.886949948035181e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000245556456502527, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 535.5690104166666, "completions/min_length": 244.0, "epoch": 0.30602268283144307, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.75390625, "kl": 0.003131114598363638, "learning_rate": 2.43979053414491e-06, "loss": 0.00011631818488240242, "reward": 0.3736937840779622, "reward_std": 0.2657015224297841, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3736937840779622, "rewards/QAReward/std": 0.46139928698539734, "step": 1565 }, { "clip_ratio/high_max": 0.0005425469717010856, "clip_ratio/high_mean": 0.00024962478782981635, "clip_ratio/low_mean": 6.35227159364149e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031314750085584817, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 525.4609375, "completions/min_length": 234.5, "epoch": 0.30700039108330074, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83984375, "kl": 0.003201489010825753, "learning_rate": 2.4360833733712922e-06, "loss": 0.0001394357532262802, "reward": 0.35574623942375183, "reward_std": 0.2519657164812088, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35574623942375183, "rewards/QAReward/std": 0.41011765599250793, "step": 1570 }, { "clip_ratio/high_max": 0.0002881757915019989, "clip_ratio/high_mean": 0.00018231848953291773, "clip_ratio/low_mean": 5.1840613014064727e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002341591170988977, "completions/clipped_ratio": 0.045572916666666664, "completions/max_length": 1024.0, "completions/mean_length": 539.6380208333334, "completions/min_length": 231.66666666666666, "epoch": 0.3079780993351584, "frac_reward_zero_std": 0.0625, "grad_norm": 0.77734375, "kl": 0.0031256461516022682, "learning_rate": 2.4323668242196222e-06, "loss": 0.00016456261510029436, "reward": 0.43227772911389667, "reward_std": 0.26638289789358777, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.43227771917978924, "rewards/QAReward/std": 0.41820430755615234, "step": 1575 }, { "clip_ratio/high_max": 0.0004919132916256785, "clip_ratio/high_mean": 0.0002683959435671568, "clip_ratio/low_mean": 8.975132368505001e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003581472672522068, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 517.490234375, "completions/min_length": 233.5, "epoch": 0.308955807587016, "frac_reward_zero_std": 0.046875, "grad_norm": 0.81640625, "kl": 0.00315375467762351, "learning_rate": 2.4286409239647513e-06, "loss": 0.00018848293693736197, "reward": 0.34483204782009125, "reward_std": 0.25740864872932434, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34483204782009125, "rewards/QAReward/std": 0.4394722431898117, "step": 1580 }, { "clip_ratio/high_max": 0.00037630461156368256, "clip_ratio/high_mean": 0.00022060112096369266, "clip_ratio/low_mean": 4.445026570465416e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026505140122026207, "completions/clipped_ratio": 0.013020833333333334, "completions/max_length": 1024.0, "completions/mean_length": 534.1822916666666, "completions/min_length": 238.66666666666666, "epoch": 0.3099335158388737, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.7890625, "kl": 0.003070358606055379, "learning_rate": 2.424905709975316e-06, "loss": 0.00011969679035246372, "reward": 0.31632189949353534, "reward_std": 0.27074610193570453, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3163218895594279, "rewards/QAReward/std": 0.43024393916130066, "step": 1585 }, { "clip_ratio/high_max": 0.0003996100975200534, "clip_ratio/high_mean": 0.00024161075707525016, "clip_ratio/low_mean": 7.31754829757847e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031478626187890767, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 539.68359375, "completions/min_length": 244.0, "epoch": 0.31091122409073135, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83203125, "kl": 0.003184867510572076, "learning_rate": 2.421161219713365e-06, "loss": 0.00018971459940075874, "reward": 0.3779227286577225, "reward_std": 0.2627197429537773, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3779227137565613, "rewards/QAReward/std": 0.41635000705718994, "step": 1590 }, { "clip_ratio/high_max": 0.0002708187676034868, "clip_ratio/high_mean": 0.00014769660774618387, "clip_ratio/low_mean": 5.466024449560791e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020235686097294092, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.89453125, "completions/min_length": 248.33333333333334, "epoch": 0.31188893234258896, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.84375, "kl": 0.0031039312947541476, "learning_rate": 2.417407490733984e-06, "loss": 0.00016306266188621522, "reward": 0.318999061981837, "reward_std": 0.2832694451014201, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3189990520477295, "rewards/QAReward/std": 0.44725250204404193, "step": 1595 }, { "clip_ratio/high_max": 0.0006065418070647866, "clip_ratio/high_mean": 0.00023521729744970798, "clip_ratio/low_mean": 9.68696826021187e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003320869873277843, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 549.3984375, "completions/min_length": 266.5, "epoch": 0.31286664059444663, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0030777424573898314, "learning_rate": 2.413644560684916e-06, "loss": 0.00019719665870070456, "reward": 0.3580078184604645, "reward_std": 0.297286257147789, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3580078184604645, "rewards/QAReward/std": 0.4379808008670807, "step": 1600 }, { "clip_ratio/high_max": 0.0002811245154589415, "clip_ratio/high_mean": 0.00016150219598785043, "clip_ratio/low_mean": 9.744904527906329e-05, "clip_ratio/low_min": 1.9849147065542638e-05, "clip_ratio/region_mean": 0.0002589512383565307, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 537.8203125, "completions/min_length": 244.0, "epoch": 0.31384434884630424, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.0031721503008157017, "learning_rate": 2.4098724673061856e-06, "loss": 0.0001818015705794096, "reward": 0.3566939930121104, "reward_std": 0.2956686019897461, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3566939930121104, "rewards/QAReward/std": 0.43920695781707764, "step": 1605 }, { "clip_ratio/high_max": 0.0005558820674195885, "clip_ratio/high_mean": 0.0003034033696167171, "clip_ratio/low_mean": 7.457865867763758e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037798202829435467, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 521.595703125, "completions/min_length": 274.0, "epoch": 0.3148220570981619, "frac_reward_zero_std": 0.046875, "grad_norm": 0.78515625, "kl": 0.0031257773749530315, "learning_rate": 2.406091248429721e-06, "loss": 9.734397171996534e-05, "reward": 0.40342727303504944, "reward_std": 0.2648385688662529, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40342727303504944, "rewards/QAReward/std": 0.4390745609998703, "step": 1610 }, { "clip_ratio/high_max": 0.0002886445377953351, "clip_ratio/high_mean": 0.00013522561348509044, "clip_ratio/low_mean": 3.299343807157129e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016821904573589564, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 536.86328125, "completions/min_length": 260.6666666666667, "epoch": 0.3157997653500196, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.859375, "kl": 0.0030704420991241933, "learning_rate": 2.4023009419789733e-06, "loss": 0.00017183147137984635, "reward": 0.38137712081273395, "reward_std": 0.2999996840953827, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38137710094451904, "rewards/QAReward/std": 0.4518910348415375, "step": 1615 }, { "clip_ratio/high_max": 0.0005460744490846991, "clip_ratio/high_mean": 0.0002866209484636784, "clip_ratio/low_mean": 5.825303232995793e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003448739880695939, "completions/clipped_ratio": 0.072265625, "completions/max_length": 1024.0, "completions/mean_length": 561.06640625, "completions/min_length": 257.5, "epoch": 0.3167774736018772, "frac_reward_zero_std": 0.03125, "grad_norm": 0.89453125, "kl": 0.0030041032936424018, "learning_rate": 2.398501585968538e-06, "loss": 0.00015244365204125643, "reward": 0.386660635471344, "reward_std": 0.2834677994251251, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3866606205701828, "rewards/QAReward/std": 0.4266427904367447, "step": 1620 }, { "clip_ratio/high_max": 0.00037314981454983354, "clip_ratio/high_mean": 0.00015971346874721348, "clip_ratio/low_mean": 4.92298073368147e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020894327899441124, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 544.3854166666666, "completions/min_length": 256.6666666666667, "epoch": 0.31775518185373486, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.828125, "kl": 0.0031224068254232405, "learning_rate": 2.3946932185037706e-06, "loss": 0.00016195813659578562, "reward": 0.3574962913990021, "reward_std": 0.2821424702803294, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3574962913990021, "rewards/QAReward/std": 0.4461304346720378, "step": 1625 }, { "clip_ratio/high_max": 0.0005290948087349534, "clip_ratio/high_mean": 0.0002720575255807489, "clip_ratio/low_mean": 9.154489962384104e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003636024077422917, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 527.083984375, "completions/min_length": 252.5, "epoch": 0.31873289010559247, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.0031095186714082956, "learning_rate": 2.390875877780407e-06, "loss": 0.0001646564807742834, "reward": 0.39578838646411896, "reward_std": 0.2852749079465866, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39578840136528015, "rewards/QAReward/std": 0.4110846221446991, "step": 1630 }, { "clip_ratio/high_max": 0.0003420832101255655, "clip_ratio/high_mean": 0.0001851267646998167, "clip_ratio/low_mean": 4.992384056095034e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023505061399191617, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.6002604166666, "completions/min_length": 244.0, "epoch": 0.31971059835745014, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.734375, "kl": 0.003138577798381448, "learning_rate": 2.387049602084181e-06, "loss": 0.000131350033916533, "reward": 0.408608744541804, "reward_std": 0.25486864646275836, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40860873460769653, "rewards/QAReward/std": 0.41447192430496216, "step": 1635 }, { "clip_ratio/high_max": 0.0004887121147476136, "clip_ratio/high_mean": 0.00032077319920063017, "clip_ratio/low_mean": 8.023856789804996e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000401011761277914, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 526.962890625, "completions/min_length": 256.5, "epoch": 0.3206883066093078, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83203125, "kl": 0.003314780071377754, "learning_rate": 2.3832144297904372e-06, "loss": 0.00019158810609951616, "reward": 0.39455530047416687, "reward_std": 0.29959672689437866, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3945552855730057, "rewards/QAReward/std": 0.4375428557395935, "step": 1640 }, { "clip_ratio/high_max": 0.0002400041208602488, "clip_ratio/high_mean": 0.00014397093909792602, "clip_ratio/low_mean": 4.502774245338514e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018899867427535355, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 548.0807291666666, "completions/min_length": 265.6666666666667, "epoch": 0.3216660148611654, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.76953125, "kl": 0.00307962903752923, "learning_rate": 2.379370399363749e-06, "loss": 0.0001302505610510707, "reward": 0.34909318884213764, "reward_std": 0.2813304364681244, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3490931987762451, "rewards/QAReward/std": 0.4433274765809377, "step": 1645 }, { "clip_ratio/high_max": 0.0005215242155827582, "clip_ratio/high_mean": 0.0002879420237150043, "clip_ratio/low_mean": 6.210255669429899e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003500445804093033, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 529.400390625, "completions/min_length": 279.0, "epoch": 0.3226437231130231, "frac_reward_zero_std": 0.015625, "grad_norm": 0.78125, "kl": 0.0032632868736982346, "learning_rate": 2.3755175493575313e-06, "loss": 0.00021629133261740207, "reward": 0.31843842566013336, "reward_std": 0.288022518157959, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31843844056129456, "rewards/QAReward/std": 0.4386078119277954, "step": 1650 }, { "clip_ratio/high_max": 0.00023045461857691408, "clip_ratio/high_mean": 9.607518732082098e-05, "clip_ratio/low_mean": 5.136847321409732e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001474436605349183, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 543.4440104166666, "completions/min_length": 260.6666666666667, "epoch": 0.3236214313648807, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.7578125, "kl": 0.0031787957530468704, "learning_rate": 2.371655918413655e-06, "loss": 0.00016939908964559437, "reward": 0.3975374201933543, "reward_std": 0.2755570709705353, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39753740032513935, "rewards/QAReward/std": 0.41469505429267883, "step": 1655 }, { "clip_ratio/high_max": 0.0004160961310844868, "clip_ratio/high_mean": 0.00023700236924923957, "clip_ratio/low_mean": 7.445460360031576e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031145697575993834, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 539.140625, "completions/min_length": 279.0, "epoch": 0.32459913961673836, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.003085389966145158, "learning_rate": 2.3677855452620576e-06, "loss": 0.0001847578794695437, "reward": 0.37041404843330383, "reward_std": 0.28827574849128723, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37041404843330383, "rewards/QAReward/std": 0.4685223251581192, "step": 1660 }, { "clip_ratio/high_max": 0.00032949968008324506, "clip_ratio/high_mean": 0.00014902856783010064, "clip_ratio/low_mean": 6.872722005937248e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021775579079985619, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.9973958333334, "completions/min_length": 268.3333333333333, "epoch": 0.32557684786859603, "frac_reward_zero_std": 0.03125, "grad_norm": 0.765625, "kl": 0.0031386065296828748, "learning_rate": 2.3639064687203576e-06, "loss": 0.0001426410279236734, "reward": 0.33378546436627704, "reward_std": 0.27939075231552124, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3337854544321696, "rewards/QAReward/std": 0.437458336353302, "step": 1665 }, { "clip_ratio/high_max": 0.0005966703873127698, "clip_ratio/high_mean": 0.00027793984627351166, "clip_ratio/low_mean": 7.785420311847701e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003557940537575632, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 534.44921875, "completions/min_length": 255.5, "epoch": 0.32655455612045364, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8671875, "kl": 0.0032496074214577674, "learning_rate": 2.3600187276934613e-06, "loss": 0.00021604474168270826, "reward": 0.31325412541627884, "reward_std": 0.2680049389600754, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31325412541627884, "rewards/QAReward/std": 0.4423658400774002, "step": 1670 }, { "clip_ratio/high_max": 0.0003895785077475011, "clip_ratio/high_mean": 0.0002082509221509099, "clip_ratio/low_mean": 6.548937817569822e-05, "clip_ratio/low_min": 2.2568268468603493e-05, "clip_ratio/region_mean": 0.000273740291595459, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.6770833333334, "completions/min_length": 267.3333333333333, "epoch": 0.3275322643723113, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.83984375, "kl": 0.0030889186542481183, "learning_rate": 2.3561223611731775e-06, "loss": 0.00011849179863929749, "reward": 0.33149802684783936, "reward_std": 0.2639643798271815, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3314980169137319, "rewards/QAReward/std": 0.45267383257548016, "step": 1675 }, { "clip_ratio/high_max": 0.00042299130000174047, "clip_ratio/high_mean": 0.00019925771339330823, "clip_ratio/low_mean": 8.840976806823164e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028766747564077375, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 546.830078125, "completions/min_length": 272.5, "epoch": 0.3285099726241689, "frac_reward_zero_std": 0.078125, "grad_norm": 0.75390625, "kl": 0.0031874570064246654, "learning_rate": 2.3522174082378207e-06, "loss": 0.00013423669151961802, "reward": 0.3286598026752472, "reward_std": 0.2768038660287857, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.328659787774086, "rewards/QAReward/std": 0.4055539518594742, "step": 1680 }, { "clip_ratio/high_max": 0.0004138817079365253, "clip_ratio/high_mean": 0.00022784894099459052, "clip_ratio/low_mean": 6.056276324670762e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000288411695510149, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 525.7643229166666, "completions/min_length": 255.0, "epoch": 0.3294876808760266, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8203125, "kl": 0.0031052240636199714, "learning_rate": 2.348303908051825e-06, "loss": 6.555613945238292e-05, "reward": 0.3459625045458476, "reward_std": 0.26614412665367126, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3459624946117401, "rewards/QAReward/std": 0.4688928226629893, "step": 1685 }, { "clip_ratio/high_max": 0.0006425950909033417, "clip_ratio/high_mean": 0.0002605858840979636, "clip_ratio/low_mean": 6.11277049756609e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000321713590528816, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 525.34375, "completions/min_length": 248.0, "epoch": 0.33046538912788426, "frac_reward_zero_std": 0.0625, "grad_norm": 0.81640625, "kl": 0.003283754363656044, "learning_rate": 2.3443818998653467e-06, "loss": 0.00016153805190697313, "reward": 0.4187430441379547, "reward_std": 0.29116562008857727, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4187430292367935, "rewards/QAReward/std": 0.4268696904182434, "step": 1690 }, { "clip_ratio/high_max": 0.0003568500163964927, "clip_ratio/high_mean": 0.0001968166558071971, "clip_ratio/low_mean": 5.41277026059106e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025094435550272466, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 519.4674479166666, "completions/min_length": 257.0, "epoch": 0.33144309737974187, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.81640625, "kl": 0.00316446153447032, "learning_rate": 2.3404514230138726e-06, "loss": 0.0001910169143229723, "reward": 0.34462892015775043, "reward_std": 0.2655702630678813, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.344628910223643, "rewards/QAReward/std": 0.42394179105758667, "step": 1695 }, { "clip_ratio/high_max": 0.00039227913366630674, "clip_ratio/high_mean": 0.0002422584337182343, "clip_ratio/low_mean": 6.277037173276767e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003050288069061935, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 535.306640625, "completions/min_length": 222.5, "epoch": 0.33242080563159954, "frac_reward_zero_std": 0.03125, "grad_norm": 0.85546875, "kl": 0.0030486774165183307, "learning_rate": 2.336512516917827e-06, "loss": 0.00020847034174948932, "reward": 0.2770027369260788, "reward_std": 0.30406464636325836, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2770027369260788, "rewards/QAReward/std": 0.47769761085510254, "step": 1700 }, { "clip_ratio/high_max": 0.00028760236455127597, "clip_ratio/high_mean": 0.00014047553413547574, "clip_ratio/low_mean": 4.6625928371213375e-05, "clip_ratio/low_min": 2.1413275680970402e-05, "clip_ratio/region_mean": 0.0001871014595963061, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 533.8958333333334, "completions/min_length": 235.33333333333334, "epoch": 0.3333985138834572, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.0029875892214477062, "learning_rate": 2.3325652210821726e-06, "loss": 0.00013959729112684728, "reward": 0.35146071513493854, "reward_std": 0.2641863723595937, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3514607201019923, "rewards/QAReward/std": 0.4321695963541667, "step": 1705 }, { "clip_ratio/high_max": 0.0005877672694623471, "clip_ratio/high_mean": 0.0003105956129729748, "clip_ratio/low_mean": 0.00011328255786793306, "clip_ratio/low_min": 4.511275474214926e-05, "clip_ratio/region_mean": 0.00042387815192341806, "completions/clipped_ratio": 0.052734375, "completions/max_length": 1024.0, "completions/mean_length": 557.62890625, "completions/min_length": 259.0, "epoch": 0.3343762221353148, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.002888892125338316, "learning_rate": 2.3286095750960173e-06, "loss": 0.00012396336533129215, "reward": 0.26845934987068176, "reward_std": 0.3008343279361725, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.26845934987068176, "rewards/QAReward/std": 0.458592027425766, "step": 1710 }, { "clip_ratio/high_max": 0.0003247870597988367, "clip_ratio/high_mean": 0.00017254981794394552, "clip_ratio/low_mean": 3.496415301924571e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020751396659761666, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 522.2981770833334, "completions/min_length": 255.66666666666666, "epoch": 0.3353539303871725, "frac_reward_zero_std": 0.0625, "grad_norm": 0.75390625, "kl": 0.003158809058368206, "learning_rate": 2.3246456186322163e-06, "loss": 0.000207035755738616, "reward": 0.32315383354822796, "reward_std": 0.28093554576237995, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3231538385152817, "rewards/QAReward/std": 0.4796378513177236, "step": 1715 }, { "clip_ratio/high_max": 0.0005786812514998018, "clip_ratio/high_mean": 0.0002625434543006122, "clip_ratio/low_mean": 7.007470703683794e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003326181438751519, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 530.037109375, "completions/min_length": 275.0, "epoch": 0.3363316386390301, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83203125, "kl": 0.0029488255269825457, "learning_rate": 2.320673391446973e-06, "loss": 0.00017040532547980548, "reward": 0.2969943806529045, "reward_std": 0.2858610451221466, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2969943508505821, "rewards/QAReward/std": 0.4056760221719742, "step": 1720 }, { "clip_ratio/high_max": 0.0003935872693546116, "clip_ratio/high_mean": 0.00018996980506926774, "clip_ratio/low_mean": 4.3684836418833585e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023365463130176067, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 538.5924479166666, "completions/min_length": 248.0, "epoch": 0.33730934689088776, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.83203125, "kl": 0.0030172243248671295, "learning_rate": 2.3166929333794444e-06, "loss": 0.00013428491074591876, "reward": 0.3209914565086365, "reward_std": 0.2750645379225413, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.320991446574529, "rewards/QAReward/std": 0.44776955246925354, "step": 1725 }, { "clip_ratio/high_max": 0.0005135482759214937, "clip_ratio/high_mean": 0.00024937966372817755, "clip_ratio/low_mean": 0.00010270585771650076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003520855214446783, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 543.00390625, "completions/min_length": 273.0, "epoch": 0.33828705514274543, "frac_reward_zero_std": 0.015625, "grad_norm": 0.74609375, "kl": 0.002977828262373805, "learning_rate": 2.312704284351334e-06, "loss": 7.42110307328403e-05, "reward": 0.3541046679019928, "reward_std": 0.2961806207895279, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3541046530008316, "rewards/QAReward/std": 0.4662812352180481, "step": 1730 }, { "clip_ratio/high_max": 0.0002594390301965177, "clip_ratio/high_mean": 0.0001507539302110672, "clip_ratio/low_mean": 5.8119648019783196e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020887356949970127, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 554.95703125, "completions/min_length": 249.0, "epoch": 0.33926476339460304, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.0029755877796560524, "learning_rate": 2.3087074843665006e-06, "loss": 0.00013395127607509493, "reward": 0.3362051447232564, "reward_std": 0.28360780080159503, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3362051546573639, "rewards/QAReward/std": 0.46568189064661664, "step": 1735 }, { "clip_ratio/high_max": 0.0005114907282404602, "clip_ratio/high_mean": 0.0002760308445431292, "clip_ratio/low_mean": 6.998249737080186e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003460133448243141, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 534.298828125, "completions/min_length": 210.0, "epoch": 0.3402424716464607, "frac_reward_zero_std": 0.0625, "grad_norm": 0.79296875, "kl": 0.0029921647161245345, "learning_rate": 2.3047025735105487e-06, "loss": 0.0001977902837097645, "reward": 0.38686059415340424, "reward_std": 0.26030535995960236, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38686057925224304, "rewards/QAReward/std": 0.45797009766101837, "step": 1740 }, { "clip_ratio/high_max": 0.000312555325217545, "clip_ratio/high_mean": 0.00016711163916625083, "clip_ratio/low_mean": 7.306293409783393e-05, "clip_ratio/low_min": 2.2670596081297845e-05, "clip_ratio/region_mean": 0.00024017458781599998, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 544.1796875, "completions/min_length": 276.6666666666667, "epoch": 0.3412201798983183, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0029941624961793423, "learning_rate": 2.3006895919504314e-06, "loss": 0.00013250168412923813, "reward": 0.4181428750356038, "reward_std": 0.25952141483624774, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4181428750356038, "rewards/QAReward/std": 0.3910019099712372, "step": 1745 }, { "clip_ratio/high_max": 0.00051568424096331, "clip_ratio/high_mean": 0.00022170223528519272, "clip_ratio/low_mean": 7.091130828484893e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002926135435700417, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 532.380859375, "completions/min_length": 277.5, "epoch": 0.342197888150176, "frac_reward_zero_std": 0.03125, "grad_norm": 0.77734375, "kl": 0.002986793126910925, "learning_rate": 2.296668579934048e-06, "loss": 0.00012788401218131185, "reward": 0.3544231951236725, "reward_std": 0.26824240386486053, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3544232100248337, "rewards/QAReward/std": 0.437225803732872, "step": 1750 }, { "clip_ratio/high_max": 0.0002087022818159312, "clip_ratio/high_mean": 0.00012914366088807584, "clip_ratio/low_mean": 5.966000026091933e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018880367279052734, "completions/clipped_ratio": 0.010416666666666666, "completions/max_length": 974.0, "completions/mean_length": 523.9114583333334, "completions/min_length": 262.0, "epoch": 0.34317559640203366, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.0031139086931943893, "learning_rate": 2.292639577789836e-06, "loss": 0.00016885455697774888, "reward": 0.3646256923675537, "reward_std": 0.30143149693806964, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3646257221698761, "rewards/QAReward/std": 0.4366949399312337, "step": 1755 }, { "clip_ratio/high_max": 0.000713574094697833, "clip_ratio/high_mean": 0.00029201506404206155, "clip_ratio/low_mean": 6.680096266791225e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003588160325307399, "completions/clipped_ratio": 0.052734375, "completions/max_length": 1024.0, "completions/mean_length": 546.994140625, "completions/min_length": 235.5, "epoch": 0.34415330465389127, "frac_reward_zero_std": 0.03125, "grad_norm": 0.79296875, "kl": 0.003102137567475438, "learning_rate": 2.2886026259263712e-06, "loss": 8.99887876585126e-05, "reward": 0.3774484097957611, "reward_std": 0.268902450799942, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3774483948945999, "rewards/QAReward/std": 0.42787204682826996, "step": 1760 }, { "clip_ratio/high_max": 0.0003656798624433577, "clip_ratio/high_mean": 0.00017110088374465703, "clip_ratio/low_mean": 6.507034122478217e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023617121623829008, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 512.2135416666666, "completions/min_length": 242.33333333333334, "epoch": 0.34513101290574894, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.7890625, "kl": 0.003131817700341344, "learning_rate": 2.28455776483196e-06, "loss": 6.383074214681983e-05, "reward": 0.41918764511744183, "reward_std": 0.25839775800704956, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41918766498565674, "rewards/QAReward/std": 0.45026977856953937, "step": 1765 }, { "clip_ratio/high_max": 0.000505481322761625, "clip_ratio/high_mean": 0.0002524467825423926, "clip_ratio/low_mean": 4.530155420070514e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000297748320735991, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1024.0, "completions/mean_length": 545.1796875, "completions/min_length": 264.5, "epoch": 0.34610872115760655, "frac_reward_zero_std": 0.0, "grad_norm": 0.7578125, "kl": 0.003038247674703598, "learning_rate": 2.2805050350742325e-06, "loss": 0.00018942421302199365, "reward": 0.3533179759979248, "reward_std": 0.27776676416397095, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3533180058002472, "rewards/QAReward/std": 0.4491198658943176, "step": 1770 }, { "clip_ratio/high_max": 0.0003232820075936615, "clip_ratio/high_mean": 0.00017671892419457436, "clip_ratio/low_mean": 3.9349182043224576e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021606810623779892, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/mean_length": 555.9713541666666, "completions/min_length": 225.66666666666666, "epoch": 0.3470864294094642, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.75390625, "kl": 0.0030584285967051985, "learning_rate": 2.2764444772997385e-06, "loss": 0.00013345563784241676, "reward": 0.34079696734746295, "reward_std": 0.30038129289944965, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34079696734746295, "rewards/QAReward/std": 0.4112061560153961, "step": 1775 }, { "clip_ratio/high_max": 0.0004747906816191971, "clip_ratio/high_mean": 0.00024119658046402038, "clip_ratio/low_mean": 7.485636451747269e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031605294207111, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 521.939453125, "completions/min_length": 229.0, "epoch": 0.3480641376613219, "frac_reward_zero_std": 0.03125, "grad_norm": 0.828125, "kl": 0.0032338201999664307, "learning_rate": 2.2723761322335387e-06, "loss": 0.00015620575286448, "reward": 0.3632463812828064, "reward_std": 0.2513594701886177, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3632463812828064, "rewards/QAReward/std": 0.41659967601299286, "step": 1780 }, { "clip_ratio/high_max": 0.00035223818849772215, "clip_ratio/high_mean": 0.00019134975736960768, "clip_ratio/low_mean": 4.3660379014909266e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023501012474298478, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 551.8307291666666, "completions/min_length": 245.33333333333334, "epoch": 0.3490418459131795, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.75390625, "kl": 0.0031080534216016533, "learning_rate": 2.268300040678795e-06, "loss": 0.00011783387744799257, "reward": 0.38816241423288983, "reward_std": 0.28589971860249835, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38816241423288983, "rewards/QAReward/std": 0.4529069860776265, "step": 1785 }, { "clip_ratio/high_max": 0.0005166275892406702, "clip_ratio/high_mean": 0.00025983290397562085, "clip_ratio/low_mean": 2.5895740145642775e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002857286424841732, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 552.99609375, "completions/min_length": 256.5, "epoch": 0.35001955416503716, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.0031338911037892105, "learning_rate": 2.264216243516362e-06, "loss": 0.00010141987586393952, "reward": 0.2888966426253319, "reward_std": 0.3106348365545273, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2888966426253319, "rewards/QAReward/std": 0.466492161154747, "step": 1790 }, { "clip_ratio/high_max": 0.0002889981959015131, "clip_ratio/high_mean": 0.00015555795980617403, "clip_ratio/low_mean": 6.432099908124656e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021987896179780363, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 540.12890625, "completions/min_length": 234.33333333333334, "epoch": 0.3509972624168948, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.88671875, "kl": 0.003289820533245802, "learning_rate": 2.2601247817043767e-06, "loss": 0.0002093037823215127, "reward": 0.3422505458196004, "reward_std": 0.26460187633832294, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3422505458196004, "rewards/QAReward/std": 0.4784085551897685, "step": 1795 }, { "clip_ratio/high_max": 0.00036032922798767686, "clip_ratio/high_mean": 0.00023660312872380017, "clip_ratio/low_mean": 6.461392767960205e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003012170549482107, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 547.78125, "completions/min_length": 244.5, "epoch": 0.35197497066875244, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7734375, "kl": 0.0030878982041031124, "learning_rate": 2.25602569627785e-06, "loss": 8.241726318374276e-05, "reward": 0.3498186469078064, "reward_std": 0.28721338510513306, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3498186469078064, "rewards/QAReward/std": 0.40255969762802124, "step": 1800 }, { "clip_ratio/high_max": 0.00038333910051733253, "clip_ratio/high_mean": 0.00016256100498139857, "clip_ratio/low_mean": 6.187689141370356e-05, "clip_ratio/low_min": 2.1168500825297088e-05, "clip_ratio/region_mean": 0.00022443787893280386, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 527.5065104166666, "completions/min_length": 254.0, "epoch": 0.3529526789206101, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.85546875, "kl": 0.003230353258550167, "learning_rate": 2.251919028348251e-06, "loss": 8.069546893239022e-05, "reward": 0.39031241337458294, "reward_std": 0.31100690364837646, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39031240344047546, "rewards/QAReward/std": 0.44782935579617816, "step": 1805 }, { "clip_ratio/high_max": 0.0005892205634154379, "clip_ratio/high_mean": 0.00029287807992659507, "clip_ratio/low_mean": 8.581182628404349e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037868990330025554, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 550.37109375, "completions/min_length": 255.0, "epoch": 0.3539303871724677, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80078125, "kl": 0.0031580982264131307, "learning_rate": 2.247804819103098e-06, "loss": 0.0002326891990378499, "reward": 0.3556043952703476, "reward_std": 0.2807668596506119, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3556043952703476, "rewards/QAReward/std": 0.45687828958034515, "step": 1810 }, { "clip_ratio/high_max": 0.00027246500831097366, "clip_ratio/high_mean": 0.00015048140194267033, "clip_ratio/low_mean": 4.9063928599935025e-05, "clip_ratio/low_min": 2.0451989257708192e-05, "clip_ratio/region_mean": 0.0001995453261770308, "completions/clipped_ratio": 0.013020833333333334, "completions/max_length": 966.0, "completions/mean_length": 524.5572916666666, "completions/min_length": 247.66666666666666, "epoch": 0.3549080954243254, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.003191929683089256, "learning_rate": 2.2436831098055455e-06, "loss": 0.00015544495545327663, "reward": 0.33811383446057636, "reward_std": 0.3022203743457794, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33811384439468384, "rewards/QAReward/std": 0.4335240920384725, "step": 1815 }, { "clip_ratio/high_max": 0.0003910935134626925, "clip_ratio/high_mean": 0.00019047503010369838, "clip_ratio/low_mean": 8.086279849521816e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027133782859891655, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 548.349609375, "completions/min_length": 304.5, "epoch": 0.35588580367618305, "frac_reward_zero_std": 0.015625, "grad_norm": 0.86328125, "kl": 0.003106874320656061, "learning_rate": 2.239553941793967e-06, "loss": 0.00020335095468908547, "reward": 0.35973891615867615, "reward_std": 0.2868459224700928, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35973891615867615, "rewards/QAReward/std": 0.407808780670166, "step": 1820 }, { "clip_ratio/high_max": 0.00030674153240397575, "clip_ratio/high_mean": 0.00016666933079250156, "clip_ratio/low_mean": 7.182966219261288e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023849899880588055, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 540.66015625, "completions/min_length": 259.6666666666667, "epoch": 0.35686351192804067, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8359375, "kl": 0.0032319521997123957, "learning_rate": 2.235417356481544e-06, "loss": 0.00020684832707047464, "reward": 0.3310118118921916, "reward_std": 0.27895936369895935, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3310118118921916, "rewards/QAReward/std": 0.4388355314731598, "step": 1825 }, { "clip_ratio/high_max": 0.000582100625615567, "clip_ratio/high_mean": 0.00024260974605567754, "clip_ratio/low_mean": 6.770464242435991e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003103144001215696, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 518.681640625, "completions/min_length": 232.5, "epoch": 0.35784122017989833, "frac_reward_zero_std": 0.0625, "grad_norm": 0.76953125, "kl": 0.003156721452251077, "learning_rate": 2.2312733953558486e-06, "loss": 0.00013533513993024826, "reward": 0.3739932030439377, "reward_std": 0.28671492636203766, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3739932179450989, "rewards/QAReward/std": 0.46045348048210144, "step": 1830 }, { "clip_ratio/high_max": 0.00028323088772594927, "clip_ratio/high_mean": 0.00013932859292253852, "clip_ratio/low_mean": 2.9188542976044117e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016851713880896568, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 550.6888020833334, "completions/min_length": 251.33333333333334, "epoch": 0.35881892843175595, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.7578125, "kl": 0.0031041425187140702, "learning_rate": 2.227122099978429e-06, "loss": 0.00014701394829899072, "reward": 0.38817791144053143, "reward_std": 0.27543925245602924, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38817791144053143, "rewards/QAReward/std": 0.4534567892551422, "step": 1835 }, { "clip_ratio/high_max": 0.0003030151943676174, "clip_ratio/high_mean": 0.00018827869207598268, "clip_ratio/low_mean": 8.747818937990815e-05, "clip_ratio/low_min": 2.0439448417164385e-05, "clip_ratio/region_mean": 0.00027575687854550777, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 541.27734375, "completions/min_length": 235.5, "epoch": 0.3597966366836136, "frac_reward_zero_std": 0.0625, "grad_norm": 0.78125, "kl": 0.003138299658894539, "learning_rate": 2.222963511984391e-06, "loss": 0.0001092634629458189, "reward": 0.34101252257823944, "reward_std": 0.278160884976387, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34101253747940063, "rewards/QAReward/std": 0.4701361507177353, "step": 1840 }, { "clip_ratio/high_max": 0.0004405775689519942, "clip_ratio/high_mean": 0.00019041270134039222, "clip_ratio/low_mean": 5.1138384151272476e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024155110586434602, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 545.6158854166666, "completions/min_length": 276.6666666666667, "epoch": 0.3607743449354713, "frac_reward_zero_std": 0.0625, "grad_norm": 0.80859375, "kl": 0.00302460053935647, "learning_rate": 2.2187976730819807e-06, "loss": 0.00020684914197772742, "reward": 0.4037667413552602, "reward_std": 0.2792362670103709, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4037667413552602, "rewards/QAReward/std": 0.4579753677050273, "step": 1845 }, { "clip_ratio/high_max": 0.0006252504186704755, "clip_ratio/high_mean": 0.00030568454531021416, "clip_ratio/low_mean": 9.285790292778984e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003985424526035786, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 546.697265625, "completions/min_length": 275.5, "epoch": 0.3617520531873289, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.0030047872103750707, "learning_rate": 2.2146246250521677e-06, "loss": 7.510515861213208e-05, "reward": 0.3535071015357971, "reward_std": 0.29337242245674133, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3535071015357971, "rewards/QAReward/std": 0.4626868963241577, "step": 1850 }, { "clip_ratio/high_max": 0.0003224587067961693, "clip_ratio/high_mean": 0.00016891484847292304, "clip_ratio/low_mean": 4.834371502511203e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021725856931880115, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 556.3841145833334, "completions/min_length": 250.33333333333334, "epoch": 0.36272976143918656, "frac_reward_zero_std": 0.0625, "grad_norm": 0.77734375, "kl": 0.003122945735231042, "learning_rate": 2.2104444097482245e-06, "loss": 0.0001652156002819538, "reward": 0.3543950617313385, "reward_std": 0.26603715618451435, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3543950418631236, "rewards/QAReward/std": 0.48996209104855853, "step": 1855 }, { "clip_ratio/high_max": 0.0003802422317676246, "clip_ratio/high_mean": 0.00023279344895854592, "clip_ratio/low_mean": 4.5856386714149265e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027864982839673755, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 539.806640625, "completions/min_length": 221.5, "epoch": 0.36370746969104417, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.003146209940314293, "learning_rate": 2.2062570690953073e-06, "loss": 0.00019362738821655513, "reward": 0.4459434002637863, "reward_std": 0.2867434173822403, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4459434002637863, "rewards/QAReward/std": 0.4174394905567169, "step": 1860 }, { "clip_ratio/high_max": 0.0004090334172360599, "clip_ratio/high_mean": 0.0002143309684470296, "clip_ratio/low_mean": 7.715181418461725e-05, "clip_ratio/low_min": 2.0214270625729115e-05, "clip_ratio/region_mean": 0.0002914827782660723, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 530.734375, "completions/min_length": 257.0, "epoch": 0.36468517794290184, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.76953125, "kl": 0.0030838143080472945, "learning_rate": 2.2020626450900353e-06, "loss": 7.497487240470946e-05, "reward": 0.3018627166748047, "reward_std": 0.26974406838417053, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3018627067406972, "rewards/QAReward/std": 0.43339113394419354, "step": 1865 }, { "clip_ratio/high_max": 0.0005682095419615507, "clip_ratio/high_mean": 0.00025506713427603245, "clip_ratio/low_mean": 7.13197427103296e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003263868682552129, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 526.548828125, "completions/min_length": 243.0, "epoch": 0.3656628861947595, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.0032446722965687514, "learning_rate": 2.19786117980007e-06, "loss": 8.70212446898222e-05, "reward": 0.45540450513362885, "reward_std": 0.27096666395664215, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.45540450513362885, "rewards/QAReward/std": 0.4139018803834915, "step": 1870 }, { "clip_ratio/high_max": 0.00033893648069351914, "clip_ratio/high_mean": 0.00016986330738291144, "clip_ratio/low_mean": 2.9165545129217207e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019902883796021342, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 518.1171875, "completions/min_length": 250.66666666666666, "epoch": 0.3666405944466171, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.80859375, "kl": 0.0032763608731329443, "learning_rate": 2.1936527153636926e-06, "loss": 0.00012792233610525727, "reward": 0.37571148077646893, "reward_std": 0.2817549208799998, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37571149071057636, "rewards/QAReward/std": 0.47060713171958923, "step": 1875 }, { "clip_ratio/high_max": 0.0004533871717285365, "clip_ratio/high_mean": 0.00024231308489106596, "clip_ratio/low_mean": 5.848198779858649e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030079508433118464, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 525.568359375, "completions/min_length": 252.5, "epoch": 0.3676183026984748, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83984375, "kl": 0.0031528796534985304, "learning_rate": 2.1894372939893813e-06, "loss": 0.00018596050795167686, "reward": 0.33387045562267303, "reward_std": 0.2673041820526123, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33387044072151184, "rewards/QAReward/std": 0.43882738053798676, "step": 1880 }, { "clip_ratio/high_max": 0.0002997523755766451, "clip_ratio/high_mean": 0.00017394317546859384, "clip_ratio/low_mean": 3.762152919080108e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021156470756977797, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 543.5716145833334, "completions/min_length": 231.66666666666666, "epoch": 0.3685960109503324, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.87109375, "kl": 0.0031018166337162256, "learning_rate": 2.1852149579553894e-06, "loss": 0.0001037237117998302, "reward": 0.2971685032049815, "reward_std": 0.2872901459534963, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.297168493270874, "rewards/QAReward/std": 0.4302942454814911, "step": 1885 }, { "clip_ratio/high_max": 0.00042628844967111944, "clip_ratio/high_mean": 0.0002217784523963928, "clip_ratio/low_mean": 6.574758008355275e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002875260426662862, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1024.0, "completions/mean_length": 523.544921875, "completions/min_length": 257.5, "epoch": 0.36957371920219007, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.0030873918905854223, "learning_rate": 2.18098574960932e-06, "loss": 0.00015095050912350416, "reward": 0.3802780956029892, "reward_std": 0.2839731276035309, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.380278080701828, "rewards/QAReward/std": 0.4139036685228348, "step": 1890 }, { "clip_ratio/high_max": 0.0003757503814995289, "clip_ratio/high_mean": 0.0001874267356470227, "clip_ratio/low_mean": 6.991433911025524e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025734106311574577, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 524.0, "completions/min_length": 247.33333333333334, "epoch": 0.37055142745404773, "frac_reward_zero_std": 0.03125, "grad_norm": 0.77734375, "kl": 0.0031001951079815625, "learning_rate": 2.176749711367701e-06, "loss": 0.00011733132414519787, "reward": 0.35789621869723004, "reward_std": 0.28258928656578064, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35789620876312256, "rewards/QAReward/std": 0.45577389001846313, "step": 1895 }, { "clip_ratio/high_max": 0.0005727345007471741, "clip_ratio/high_mean": 0.00032489299774169923, "clip_ratio/low_mean": 9.608294640202075e-05, "clip_ratio/low_min": 2.0493903139140458e-05, "clip_ratio/region_mean": 0.0004209759412333369, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 519.740234375, "completions/min_length": 253.0, "epoch": 0.37152913570590534, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8046875, "kl": 0.0031157202553004026, "learning_rate": 2.172506885715561e-06, "loss": 0.00012834907975047826, "reward": 0.3687841147184372, "reward_std": 0.2507079988718033, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3687841147184372, "rewards/QAReward/std": 0.4733692705631256, "step": 1900 }, { "clip_ratio/high_max": 0.0002604044391773641, "clip_ratio/high_mean": 0.00011641643359325827, "clip_ratio/low_mean": 4.168261511949822e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000158099050167948, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 544.0963541666666, "completions/min_length": 259.3333333333333, "epoch": 0.372506843957763, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78515625, "kl": 0.0030548347160220144, "learning_rate": 2.1682573152060013e-06, "loss": 0.00014896003995090723, "reward": 0.3107004761695862, "reward_std": 0.27768408258756, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3107004761695862, "rewards/QAReward/std": 0.4440153241157532, "step": 1905 }, { "clip_ratio/high_max": 0.0004553089034743607, "clip_ratio/high_mean": 0.0002463407057803124, "clip_ratio/low_mean": 6.913862307555974e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031547933467663826, "completions/clipped_ratio": 0.048828125, "completions/max_length": 1024.0, "completions/mean_length": 539.61328125, "completions/min_length": 225.5, "epoch": 0.3734845522096206, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8359375, "kl": 0.0029031367041170595, "learning_rate": 2.1640010424597713e-06, "loss": 0.00018307752907276154, "reward": 0.3920751363039017, "reward_std": 0.2691051959991455, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3920751065015793, "rewards/QAReward/std": 0.42891088128089905, "step": 1910 }, { "clip_ratio/high_max": 0.0004468217259272933, "clip_ratio/high_mean": 0.0002286730916239321, "clip_ratio/low_mean": 2.9339807224459945e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000258012895938009, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 519.0221354166666, "completions/min_length": 238.66666666666666, "epoch": 0.3744622604614783, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8203125, "kl": 0.003089649276807904, "learning_rate": 2.159738110164839e-06, "loss": 0.00016028021927922965, "reward": 0.4213486810525258, "reward_std": 0.27150891224543255, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4213486711184184, "rewards/QAReward/std": 0.42848341663678485, "step": 1915 }, { "clip_ratio/high_max": 0.00045130403013899923, "clip_ratio/high_mean": 0.00019671977788675575, "clip_ratio/low_mean": 7.035796297714115e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026707774959504603, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 541.49609375, "completions/min_length": 244.0, "epoch": 0.37543996871333596, "frac_reward_zero_std": 0.03125, "grad_norm": 0.796875, "kl": 0.003113234555348754, "learning_rate": 2.1554685610759654e-06, "loss": 0.00016413386911153793, "reward": 0.24991320073604584, "reward_std": 0.2849826514720917, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.24991320073604584, "rewards/QAReward/std": 0.4825275391340256, "step": 1920 }, { "clip_ratio/high_max": 0.00024135285057127475, "clip_ratio/high_mean": 0.00013273783843033017, "clip_ratio/low_mean": 5.203280743444338e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018477064440958202, "completions/clipped_ratio": 0.044270833333333336, "completions/max_length": 1024.0, "completions/mean_length": 563.2473958333334, "completions/min_length": 247.0, "epoch": 0.37641767696519357, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.84765625, "kl": 0.0029969144612550735, "learning_rate": 2.1511924380142722e-06, "loss": 9.898146381601692e-05, "reward": 0.330888311068217, "reward_std": 0.3004131813844045, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.330888311068217, "rewards/QAReward/std": 0.4454324046770732, "step": 1925 }, { "clip_ratio/high_max": 0.00048719607293605803, "clip_ratio/high_mean": 0.00023183924495242536, "clip_ratio/low_mean": 7.201438711490482e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003038536407984793, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 527.30859375, "completions/min_length": 272.0, "epoch": 0.37739538521705124, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8125, "kl": 0.0029849834740161897, "learning_rate": 2.146909783866816e-06, "loss": 0.00013880088226869702, "reward": 0.38347750902175903, "reward_std": 0.26063546538352966, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38347750902175903, "rewards/QAReward/std": 0.4186950773000717, "step": 1930 }, { "clip_ratio/high_max": 0.0004194301669485867, "clip_ratio/high_mean": 0.0001862906676251441, "clip_ratio/low_mean": 4.9117434537038206e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023540810216218233, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 544.0481770833334, "completions/min_length": 238.0, "epoch": 0.37837309346890885, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.78515625, "kl": 0.003006468527019024, "learning_rate": 2.1426206415861555e-06, "loss": 0.0001876160502433777, "reward": 0.3157610793908437, "reward_std": 0.29976088802019757, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3157610793908437, "rewards/QAReward/std": 0.442494402329127, "step": 1935 }, { "clip_ratio/high_max": 0.0006279531982727349, "clip_ratio/high_mean": 0.00030518299899995327, "clip_ratio/low_mean": 7.206381415016949e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037724680732935666, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 540.65625, "completions/min_length": 238.5, "epoch": 0.3793508017207665, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.0029870229307562115, "learning_rate": 2.1383250541899213e-06, "loss": 0.00016236483352258802, "reward": 0.31368817389011383, "reward_std": 0.3078208863735199, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31368814408779144, "rewards/QAReward/std": 0.4546515494585037, "step": 1940 }, { "clip_ratio/high_max": 0.0004818021086975932, "clip_ratio/high_mean": 0.00019916645251214505, "clip_ratio/low_mean": 3.9515407115686686e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002386818639934063, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 532.41796875, "completions/min_length": 247.66666666666666, "epoch": 0.3803285099726242, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.81640625, "kl": 0.0030294050928205253, "learning_rate": 2.134023064760386e-06, "loss": 0.0001075640320777893, "reward": 0.34631593028704327, "reward_std": 0.26117004950841266, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3463159402211507, "rewards/QAReward/std": 0.43319498499234516, "step": 1945 }, { "clip_ratio/high_max": 0.00043812720105051994, "clip_ratio/high_mean": 0.0002534425293561071, "clip_ratio/low_mean": 5.989798955852166e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031334052328020335, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1024.0, "completions/mean_length": 548.49609375, "completions/min_length": 228.5, "epoch": 0.3813062182244818, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.002966347569599748, "learning_rate": 2.12971471644403e-06, "loss": 0.00017778470646589994, "reward": 0.37191763520240784, "reward_std": 0.3023186922073364, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37191763520240784, "rewards/QAReward/std": 0.4757414311170578, "step": 1950 }, { "clip_ratio/high_max": 0.00038092483300715684, "clip_ratio/high_mean": 0.00018743519322015346, "clip_ratio/low_mean": 4.373984120320529e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023117504315450788, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 523.79296875, "completions/min_length": 236.33333333333334, "epoch": 0.38228392647633946, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.84765625, "kl": 0.003152520162984729, "learning_rate": 2.12540005245111e-06, "loss": 0.00011247396469116211, "reward": 0.3636217614014943, "reward_std": 0.26086003084977466, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3636217614014943, "rewards/QAReward/std": 0.4485040307044983, "step": 1955 }, { "clip_ratio/high_max": 0.0006426281994208694, "clip_ratio/high_mean": 0.00037928177625872194, "clip_ratio/low_mean": 9.48182656429708e-05, "clip_ratio/low_min": 1.9870839605573564e-05, "clip_ratio/region_mean": 0.00047410003608092664, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1017.0, "completions/mean_length": 540.728515625, "completions/min_length": 241.5, "epoch": 0.38326163472819713, "frac_reward_zero_std": 0.046875, "grad_norm": 0.82421875, "kl": 0.0030753666069358587, "learning_rate": 2.1210791160552244e-06, "loss": 0.0002642794279381633, "reward": 0.3897530734539032, "reward_std": 0.2573747783899307, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3897530734539032, "rewards/QAReward/std": 0.41303685307502747, "step": 1960 }, { "clip_ratio/high_max": 0.0004147202125750482, "clip_ratio/high_mean": 0.0001424240123014897, "clip_ratio/low_mean": 4.403107159305364e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001864550868049264, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 543.3580729166666, "completions/min_length": 238.0, "epoch": 0.38423934298005474, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.83984375, "kl": 0.003132748883217573, "learning_rate": 2.116751950592882e-06, "loss": 0.00014251209795475007, "reward": 0.3367120424906413, "reward_std": 0.25799159705638885, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3367120325565338, "rewards/QAReward/std": 0.46455087264378864, "step": 1965 }, { "clip_ratio/high_max": 0.0006144563434645533, "clip_ratio/high_mean": 0.00026677293353714047, "clip_ratio/low_mean": 8.366066322196275e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003504335996694863, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 557.169921875, "completions/min_length": 251.0, "epoch": 0.3852170512319124, "frac_reward_zero_std": 0.0625, "grad_norm": 0.80859375, "kl": 0.0029956573154777287, "learning_rate": 2.1124185994630646e-06, "loss": 0.00014025429263710975, "reward": 0.3815987855195999, "reward_std": 0.2512682229280472, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3815987706184387, "rewards/QAReward/std": 0.41153545677661896, "step": 1970 }, { "clip_ratio/high_max": 0.0005236145574599504, "clip_ratio/high_mean": 0.0001993307494558394, "clip_ratio/low_mean": 7.868554093874991e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027801627293229104, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.7356770833334, "completions/min_length": 236.0, "epoch": 0.38619475948377, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.003202219679951668, "learning_rate": 2.1080791061267924e-06, "loss": 0.00016769652720540761, "reward": 0.3999333580334981, "reward_std": 0.26747312148412067, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3999333580334981, "rewards/QAReward/std": 0.4241335093975067, "step": 1975 }, { "clip_ratio/high_max": 0.0005703012691810727, "clip_ratio/high_mean": 0.0003050957922823727, "clip_ratio/low_mean": 5.799691498395987e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000363092718180269, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 541.73828125, "completions/min_length": 252.0, "epoch": 0.3871724677356277, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.00313901724293828, "learning_rate": 2.103733514106688e-06, "loss": 0.00014767898246645928, "reward": 0.36530956625938416, "reward_std": 0.27055037021636963, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36530956625938416, "rewards/QAReward/std": 0.4017295837402344, "step": 1980 }, { "clip_ratio/high_max": 0.000278732250444591, "clip_ratio/high_mean": 0.00018415563972666859, "clip_ratio/low_mean": 5.9876238810829816e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024403186980634928, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 541.5455729166666, "completions/min_length": 264.6666666666667, "epoch": 0.38815017598748536, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.79296875, "kl": 0.003157097101211548, "learning_rate": 2.099381866986542e-06, "loss": 0.0001644360600039363, "reward": 0.35782284537951153, "reward_std": 0.2827276984850566, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35782284537951153, "rewards/QAReward/std": 0.4385157724221547, "step": 1985 }, { "clip_ratio/high_max": 0.00048458357341587546, "clip_ratio/high_mean": 0.0002487486694008112, "clip_ratio/low_mean": 8.117431134451181e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032992297783493993, "completions/clipped_ratio": 0.056640625, "completions/max_length": 1024.0, "completions/mean_length": 552.96484375, "completions/min_length": 212.0, "epoch": 0.38912788423934297, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.002922248560935259, "learning_rate": 2.0950242084108724e-06, "loss": 0.00012267640559002758, "reward": 0.35667240619659424, "reward_std": 0.2749222368001938, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35667240619659424, "rewards/QAReward/std": 0.4371713548898697, "step": 1990 }, { "clip_ratio/high_max": 0.0003081267816014588, "clip_ratio/high_mean": 0.00017984728328883647, "clip_ratio/low_mean": 4.380288737593219e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022365017794072627, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 536.87109375, "completions/min_length": 256.0, "epoch": 0.39010559249120064, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.00311002554371953, "learning_rate": 2.09066058208449e-06, "loss": 0.00012496862327679992, "reward": 0.36146340767542523, "reward_std": 0.2944878538449605, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36146340767542523, "rewards/QAReward/std": 0.42530420422554016, "step": 1995 }, { "clip_ratio/high_max": 0.0005148391472175717, "clip_ratio/high_mean": 0.0002479001821484417, "clip_ratio/low_mean": 0.00010077184997498989, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034867202630266546, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 542.4453125, "completions/min_length": 247.0, "epoch": 0.39108330074305825, "frac_reward_zero_std": 0.015625, "grad_norm": 0.796875, "kl": 0.003163551352918148, "learning_rate": 2.086291031772057e-06, "loss": 0.00015528646763414146, "reward": 0.4119745343923569, "reward_std": 0.26646170020103455, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4119745343923569, "rewards/QAReward/std": 0.4160090535879135, "step": 2000 }, { "clip_ratio/high_max": 0.00020311201806180179, "clip_ratio/high_mean": 0.00010989459115080535, "clip_ratio/low_mean": 4.089048015885055e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015078506548888982, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 555.2890625, "completions/min_length": 257.3333333333333, "epoch": 0.3920610089949159, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8046875, "kl": 0.0030926327221095563, "learning_rate": 2.0819156012976525e-06, "loss": 8.182351593859494e-05, "reward": 0.32596468925476074, "reward_std": 0.29429154594739276, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32596468925476074, "rewards/QAReward/std": 0.4105528195699056, "step": 2005 }, { "clip_ratio/high_max": 0.00047972503816708925, "clip_ratio/high_mean": 0.00029589899349957705, "clip_ratio/low_mean": 6.13854470429942e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003572844318114221, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1024.0, "completions/mean_length": 557.498046875, "completions/min_length": 217.0, "epoch": 0.3930387172467736, "frac_reward_zero_std": 0.03125, "grad_norm": 0.765625, "kl": 0.002929199393838644, "learning_rate": 2.077534334544327e-06, "loss": 4.850427503697574e-05, "reward": 0.3700980395078659, "reward_std": 0.2627003788948059, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3700980246067047, "rewards/QAReward/std": 0.432176411151886, "step": 2010 }, { "clip_ratio/high_max": 0.00047387980157509446, "clip_ratio/high_mean": 0.00016622738912701606, "clip_ratio/low_mean": 6.33873394690454e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022961474023759366, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.2864583333334, "completions/min_length": 262.3333333333333, "epoch": 0.3940164254986312, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.79296875, "kl": 0.0029854738153517247, "learning_rate": 2.073147275453668e-06, "loss": 0.00019004428759217262, "reward": 0.37287535270055133, "reward_std": 0.28331534067789715, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37287532289822894, "rewards/QAReward/std": 0.44183122118314105, "step": 2015 }, { "clip_ratio/high_max": 0.00029316735453903674, "clip_ratio/high_mean": 0.00016662509588059038, "clip_ratio/low_mean": 7.130462618079036e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023792972206138074, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 505.724609375, "completions/min_length": 245.0, "epoch": 0.39499413375048886, "frac_reward_zero_std": 0.046875, "grad_norm": 0.83203125, "kl": 0.0032076950650662185, "learning_rate": 2.0687544680253555e-06, "loss": 0.0001152177806943655, "reward": 0.3782707154750824, "reward_std": 0.28286178410053253, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3782707154750824, "rewards/QAReward/std": 0.4627995938062668, "step": 2020 }, { "clip_ratio/high_max": 0.0002884733024984598, "clip_ratio/high_mean": 0.00014705722569487988, "clip_ratio/low_mean": 3.924520715372637e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018630241975188256, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 533.6184895833334, "completions/min_length": 243.0, "epoch": 0.3959718420023465, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.83203125, "kl": 0.0030484907794743775, "learning_rate": 2.0643559563167225e-06, "loss": 7.630304899066686e-05, "reward": 0.3422774076461792, "reward_std": 0.26529252529144287, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3422774076461792, "rewards/QAReward/std": 0.4181616206963857, "step": 2025 }, { "clip_ratio/high_max": 0.00043981762137264014, "clip_ratio/high_mean": 0.000254947307985276, "clip_ratio/low_mean": 5.1399307994870466e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003063466167077422, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1024.0, "completions/mean_length": 523.095703125, "completions/min_length": 230.5, "epoch": 0.39694955025420414, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7578125, "kl": 0.0031428590416908266, "learning_rate": 2.059951784442313e-06, "loss": 0.00011778720654547215, "reward": 0.3441995233297348, "reward_std": 0.27416762709617615, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3441995233297348, "rewards/QAReward/std": 0.456001415848732, "step": 2030 }, { "clip_ratio/high_max": 0.00040856797713786366, "clip_ratio/high_mean": 0.00018669923301786184, "clip_ratio/low_mean": 4.412299313116819e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023082219995558263, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 533.0546875, "completions/min_length": 252.0, "epoch": 0.3979272585060618, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.80859375, "kl": 0.0031177987810224296, "learning_rate": 2.055541996573439e-06, "loss": 0.00013604718260467051, "reward": 0.3903351326783498, "reward_std": 0.27265916268030804, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3903351326783498, "rewards/QAReward/std": 0.43862306078275043, "step": 2035 }, { "clip_ratio/high_max": 0.00042258654721081255, "clip_ratio/high_mean": 0.00021824520663358272, "clip_ratio/low_mean": 9.20032529393211e-05, "clip_ratio/low_min": 1.759634033078328e-05, "clip_ratio/region_mean": 0.00031024846248328686, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1024.0, "completions/mean_length": 542.212890625, "completions/min_length": 225.0, "epoch": 0.3989049667579194, "frac_reward_zero_std": 0.046875, "grad_norm": 0.7890625, "kl": 0.003039784822613001, "learning_rate": 2.051126636937736e-06, "loss": 0.00012301632668823003, "reward": 0.3436755985021591, "reward_std": 0.297541543841362, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3436755985021591, "rewards/QAReward/std": 0.44674642384052277, "step": 2040 }, { "clip_ratio/high_max": 0.00027195531874895095, "clip_ratio/high_mean": 0.00016325347824022173, "clip_ratio/low_mean": 2.5882708359858953e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018913618405349552, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 548.93359375, "completions/min_length": 256.3333333333333, "epoch": 0.3998826750097771, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.80859375, "kl": 0.003015338908880949, "learning_rate": 2.046705749818725e-06, "loss": 0.00010177145013585686, "reward": 0.3488356073697408, "reward_std": 0.2900317708651225, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3488356073697408, "rewards/QAReward/std": 0.4517424801985423, "step": 2045 }, { "clip_ratio/high_max": 0.0005438508116640151, "clip_ratio/high_mean": 0.0002980380377266556, "clip_ratio/low_mean": 4.105230254936032e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033909033518284557, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1024.0, "completions/mean_length": 534.435546875, "completions/min_length": 255.0, "epoch": 0.4008603832616347, "frac_reward_zero_std": 0.046875, "grad_norm": 0.76171875, "kl": 0.0029363803565502166, "learning_rate": 2.04227937955536e-06, "loss": 8.281593327410519e-05, "reward": 0.35234685242176056, "reward_std": 0.2682139575481415, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35234683752059937, "rewards/QAReward/std": 0.46321889758110046, "step": 2050 }, { "clip_ratio/high_max": 0.00032326795626431705, "clip_ratio/high_mean": 0.00017149485065601767, "clip_ratio/low_mean": 5.475470388773829e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022624955745413898, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 540.375, "completions/min_length": 265.0, "epoch": 0.40183809151349237, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7890625, "kl": 0.0029690058436244726, "learning_rate": 2.0378475705415905e-06, "loss": 6.274193292483687e-05, "reward": 0.3573443690935771, "reward_std": 0.28437188267707825, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3573443690935771, "rewards/QAReward/std": 0.46309446295102435, "step": 2055 }, { "clip_ratio/high_max": 0.0004902711836621166, "clip_ratio/high_mean": 0.00025966225657612083, "clip_ratio/low_mean": 5.902121047256515e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031868346268311144, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 541.13671875, "completions/min_length": 249.5, "epoch": 0.40281579976535004, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83203125, "kl": 0.002951535303145647, "learning_rate": 2.0334103672259133e-06, "loss": 0.00013630837202072144, "reward": 0.28399600088596344, "reward_std": 0.28602561354637146, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.28399601578712463, "rewards/QAReward/std": 0.4660990387201309, "step": 2060 }, { "clip_ratio/high_max": 0.0003704521921463311, "clip_ratio/high_mean": 0.00017663543112576008, "clip_ratio/low_mean": 4.15488364524208e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002181842690333724, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 523.4674479166666, "completions/min_length": 254.0, "epoch": 0.40379350801720765, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.83984375, "kl": 0.003010098310187459, "learning_rate": 2.0289678141109265e-06, "loss": 0.00010476963361725212, "reward": 0.3261476953824361, "reward_std": 0.28977171579996747, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3261476755142212, "rewards/QAReward/std": 0.45609837770462036, "step": 2065 }, { "clip_ratio/high_max": 0.0005375015200115741, "clip_ratio/high_mean": 0.0002540568122640252, "clip_ratio/low_mean": 7.41555355489254e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003282123478129506, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1024.0, "completions/mean_length": 559.466796875, "completions/min_length": 260.5, "epoch": 0.4047712162690653, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.002872899267822504, "learning_rate": 2.0245199557528835e-06, "loss": 0.00017106984741985798, "reward": 0.31102506816387177, "reward_std": 0.26673130691051483, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31102508306503296, "rewards/QAReward/std": 0.40660299360752106, "step": 2070 }, { "clip_ratio/high_max": 0.0003418562933802605, "clip_ratio/high_mean": 0.00015097595169208944, "clip_ratio/low_mean": 5.891208711545914e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020988803589716554, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 526.4270833333334, "completions/min_length": 226.0, "epoch": 0.405748924520923, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8515625, "kl": 0.0029162134509533645, "learning_rate": 2.020066836761246e-06, "loss": 0.00019757915288209915, "reward": 0.37940849860509235, "reward_std": 0.28606236974398297, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37940849860509235, "rewards/QAReward/std": 0.4650686979293823, "step": 2075 }, { "clip_ratio/high_max": 0.0004960905644111335, "clip_ratio/high_mean": 0.00026868018321692946, "clip_ratio/low_mean": 4.766321217175573e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003163433982990682, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 521.716796875, "completions/min_length": 233.5, "epoch": 0.4067266327727806, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.003014309657737613, "learning_rate": 2.015608501798239e-06, "loss": 0.00018307159189134836, "reward": 0.37606723606586456, "reward_std": 0.28038595616817474, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37606723606586456, "rewards/QAReward/std": 0.4300529211759567, "step": 2080 }, { "clip_ratio/high_max": 0.0004253460559993982, "clip_ratio/high_mean": 0.0001998281804844737, "clip_ratio/low_mean": 5.107695033075288e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002509051468223333, "completions/clipped_ratio": 0.052083333333333336, "completions/max_length": 1024.0, "completions/mean_length": 554.3033854166666, "completions/min_length": 275.0, "epoch": 0.40770434102463826, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.0029279469046741722, "learning_rate": 2.0111449955783964e-06, "loss": 0.00014986051246523857, "reward": 0.363579918940862, "reward_std": 0.2826648751894633, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3635799090067546, "rewards/QAReward/std": 0.4312574664751689, "step": 2085 }, { "clip_ratio/high_max": 0.0006023830035701394, "clip_ratio/high_mean": 0.00032563174609094857, "clip_ratio/low_mean": 6.598554173251615e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039161727763712404, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 532.1875, "completions/min_length": 247.0, "epoch": 0.4086820492764959, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7890625, "kl": 0.002879186160862446, "learning_rate": 2.006676362868121e-06, "loss": 0.00011359219206497073, "reward": 0.38242730498313904, "reward_std": 0.2799226641654968, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38242730498313904, "rewards/QAReward/std": 0.4572891592979431, "step": 2090 }, { "clip_ratio/high_max": 0.0003168238908983767, "clip_ratio/high_mean": 0.0001727231196127832, "clip_ratio/low_mean": 7.82279996201396e-05, "clip_ratio/low_min": 2.0550760382320732e-05, "clip_ratio/region_mean": 0.0002509511075913906, "completions/clipped_ratio": 0.015625, "completions/max_length": 988.0, "completions/mean_length": 525.1848958333334, "completions/min_length": 195.33333333333334, "epoch": 0.40965975752835354, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.796875, "kl": 0.0030034986324608327, "learning_rate": 2.0022026484852295e-06, "loss": 0.00011804953683167697, "reward": 0.3264865477879842, "reward_std": 0.29459309577941895, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32648653785387677, "rewards/QAReward/std": 0.4332516590754191, "step": 2095 }, { "clip_ratio/high_max": 0.000588648032862693, "clip_ratio/high_mean": 0.0002261680259834975, "clip_ratio/low_mean": 9.804146102396772e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003242094768211246, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 513.35546875, "completions/min_length": 218.5, "epoch": 0.4106374657802112, "frac_reward_zero_std": 0.046875, "grad_norm": 0.76953125, "kl": 0.0029934097547084093, "learning_rate": 1.9977238972985047e-06, "loss": 4.854053258895874e-05, "reward": 0.4349423348903656, "reward_std": 0.24806972593069077, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4349423348903656, "rewards/QAReward/std": 0.41830703616142273, "step": 2100 }, { "clip_ratio/high_max": 0.00037367381155490876, "clip_ratio/high_mean": 0.00020643161842599512, "clip_ratio/low_mean": 5.051107582403347e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002569426782429218, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 522.7434895833334, "completions/min_length": 240.0, "epoch": 0.4116151740320688, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.003178066061809659, "learning_rate": 1.993240154227247e-06, "loss": 0.00015410272171720863, "reward": 0.3734886546929677, "reward_std": 0.28492889801661175, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3734886646270752, "rewards/QAReward/std": 0.4229225516319275, "step": 2105 }, { "clip_ratio/high_max": 0.0005355820409022272, "clip_ratio/high_mean": 0.00023571718484163284, "clip_ratio/low_mean": 6.501585885416716e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003007330466061831, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 520.19921875, "completions/min_length": 246.0, "epoch": 0.4125928822839265, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7734375, "kl": 0.003069043578580022, "learning_rate": 1.9887514642408204e-06, "loss": 0.00014466901775449513, "reward": 0.3547371029853821, "reward_std": 0.27294281125068665, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3547371029853821, "rewards/QAReward/std": 0.44952984154224396, "step": 2110 }, { "clip_ratio/high_max": 0.00032401575008407236, "clip_ratio/high_mean": 0.00017330931732431054, "clip_ratio/low_mean": 4.556438361760229e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021887370385229588, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 521.0677083333334, "completions/min_length": 228.0, "epoch": 0.4135705905357841, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.7890625, "kl": 0.0029252990148961543, "learning_rate": 1.984257872358206e-06, "loss": 0.00017527672462165357, "reward": 0.38476717472076416, "reward_std": 0.2691822399695714, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38476717472076416, "rewards/QAReward/std": 0.4366976022720337, "step": 2115 }, { "clip_ratio/high_max": 0.0004693020833656192, "clip_ratio/high_mean": 0.00020860365475527942, "clip_ratio/low_mean": 9.494983969489112e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003035535104572773, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 514.8203125, "completions/min_length": 262.5, "epoch": 0.41454829878764177, "frac_reward_zero_std": 0.078125, "grad_norm": 0.8046875, "kl": 0.003096156707033515, "learning_rate": 1.9797594236475456e-06, "loss": 0.00012233288725838065, "reward": 0.3846106231212616, "reward_std": 0.27341584861278534, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3846106231212616, "rewards/QAReward/std": 0.4553925693035126, "step": 2120 }, { "clip_ratio/high_max": 0.0004070268361829221, "clip_ratio/high_mean": 0.0001964917406439781, "clip_ratio/low_mean": 3.17994796205312e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022829121444374324, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 514.421875, "completions/min_length": 234.33333333333334, "epoch": 0.41552600703949943, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.81640625, "kl": 0.002986387861892581, "learning_rate": 1.975256163225694e-06, "loss": 0.00011827610433101654, "reward": 0.35878286759058636, "reward_std": 0.26624593138694763, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3587828477223714, "rewards/QAReward/std": 0.4645187358061473, "step": 2125 }, { "clip_ratio/high_max": 0.0006039576372131705, "clip_ratio/high_mean": 0.0003100281173828989, "clip_ratio/low_mean": 4.0634114702697845e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003506622277200222, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 534.18359375, "completions/min_length": 245.5, "epoch": 0.41650371529135705, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0030809080228209494, "learning_rate": 1.970748136257764e-06, "loss": 0.00014997739344835281, "reward": 0.3087146729230881, "reward_std": 0.2852516174316406, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3087146580219269, "rewards/QAReward/std": 0.4814152419567108, "step": 2130 }, { "clip_ratio/high_max": 0.00031412019161507486, "clip_ratio/high_mean": 0.00014227063511498272, "clip_ratio/low_mean": 4.5598915312439205e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018786955042742192, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 517.0950520833334, "completions/min_length": 230.66666666666666, "epoch": 0.4174814235432147, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8203125, "kl": 0.0030748378019779922, "learning_rate": 1.9662353879566746e-06, "loss": 0.0001470847288146615, "reward": 0.3910284141699473, "reward_std": 0.2600817531347275, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39102840423583984, "rewards/QAReward/std": 0.42310822010040283, "step": 2135 }, { "clip_ratio/high_max": 0.00046086767688393595, "clip_ratio/high_mean": 0.00023515591165050863, "clip_ratio/low_mean": 8.748780528549105e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032264371984638276, "completions/clipped_ratio": 0.03515625, "completions/max_length": 953.5, "completions/mean_length": 544.74609375, "completions/min_length": 304.5, "epoch": 0.4184591317950723, "frac_reward_zero_std": 0.046875, "grad_norm": 0.80078125, "kl": 0.002944806544110179, "learning_rate": 1.961717963582697e-06, "loss": 0.0001276333234272897, "reward": 0.34815390408039093, "reward_std": 0.2575089856982231, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3481539189815521, "rewards/QAReward/std": 0.3959256708621979, "step": 2140 }, { "clip_ratio/high_max": 0.0003383943345397711, "clip_ratio/high_mean": 0.000165775278583169, "clip_ratio/low_mean": 5.211659299675375e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021789186866953968, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 522.3541666666666, "completions/min_length": 269.3333333333333, "epoch": 0.41943684004693, "frac_reward_zero_std": 0.03125, "grad_norm": 0.859375, "kl": 0.0030571685638278723, "learning_rate": 1.957195908443e-06, "loss": 9.537248406559229e-05, "reward": 0.3813590208689372, "reward_std": 0.2663493752479553, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3813590308030446, "rewards/QAReward/std": 0.431399405002594, "step": 2145 }, { "clip_ratio/high_max": 0.0004614034784026444, "clip_ratio/high_mean": 0.0002441417775116861, "clip_ratio/low_mean": 8.028616721276194e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032442796509712934, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 526.669921875, "completions/min_length": 269.0, "epoch": 0.42041454829878766, "frac_reward_zero_std": 0.015625, "grad_norm": 0.78125, "kl": 0.0030832790303975343, "learning_rate": 1.952669267891197e-06, "loss": 0.00010516843758523464, "reward": 0.3862537145614624, "reward_std": 0.3023880273103714, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3862537145614624, "rewards/QAReward/std": 0.43552394211292267, "step": 2150 }, { "clip_ratio/high_max": 0.00027914476813748477, "clip_ratio/high_mean": 0.0001575316651724279, "clip_ratio/low_mean": 4.747965140268206e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020501130493357777, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 519.6184895833334, "completions/min_length": 232.0, "epoch": 0.42139225655064527, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.84375, "kl": 0.003144296584650874, "learning_rate": 1.9481380873268897e-06, "loss": 0.0001807604916393757, "reward": 0.36735697587331134, "reward_std": 0.2907773156960805, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36735697587331134, "rewards/QAReward/std": 0.43248451749483746, "step": 2155 }, { "clip_ratio/high_max": 0.0005050079082138837, "clip_ratio/high_mean": 0.0002672885428182781, "clip_ratio/low_mean": 4.431534980540164e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031160388607531786, "completions/clipped_ratio": 0.048828125, "completions/max_length": 1024.0, "completions/mean_length": 535.798828125, "completions/min_length": 252.0, "epoch": 0.42236996480250294, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7421875, "kl": 0.003064082795754075, "learning_rate": 1.943602412195215e-06, "loss": 0.0001718720537610352, "reward": 0.34630534052848816, "reward_std": 0.2889882028102875, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34630534052848816, "rewards/QAReward/std": 0.42980940639972687, "step": 2160 }, { "clip_ratio/high_max": 0.00040418480057269335, "clip_ratio/high_mean": 0.00016385539202019573, "clip_ratio/low_mean": 2.8963421937078237e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019281881977804006, "completions/clipped_ratio": 0.041666666666666664, "completions/max_length": 1024.0, "completions/mean_length": 538.02734375, "completions/min_length": 259.0, "epoch": 0.42334767305436055, "frac_reward_zero_std": 0.0625, "grad_norm": 0.83984375, "kl": 0.003032511007040739, "learning_rate": 1.939062287986386e-06, "loss": 9.180984925478697e-05, "reward": 0.4204768141110738, "reward_std": 0.2687724729379018, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42047680417696637, "rewards/QAReward/std": 0.4357854227224986, "step": 2165 }, { "clip_ratio/high_max": 0.0004912586417049169, "clip_ratio/high_mean": 0.0002061780367512256, "clip_ratio/low_mean": 8.46010705572553e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029077910585328937, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 528.12109375, "completions/min_length": 238.5, "epoch": 0.4243253813062182, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83203125, "kl": 0.003035994339734316, "learning_rate": 1.9345177602352387e-06, "loss": 0.00018605856457725166, "reward": 0.3399316221475601, "reward_std": 0.3005024492740631, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3399316072463989, "rewards/QAReward/std": 0.42050138115882874, "step": 2170 }, { "clip_ratio/high_max": 0.00035839329939335585, "clip_ratio/high_mean": 0.00016839513555169105, "clip_ratio/low_mean": 3.5937655775342135e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020433277823030948, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 520.96484375, "completions/min_length": 233.66666666666666, "epoch": 0.4253030895580759, "frac_reward_zero_std": 0.0625, "grad_norm": 0.91796875, "kl": 0.0030521638691425323, "learning_rate": 1.929968874520773e-06, "loss": 9.226709953509271e-05, "reward": 0.4262354373931885, "reward_std": 0.272815336783727, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42623541752497357, "rewards/QAReward/std": 0.43922919034957886, "step": 2175 }, { "clip_ratio/high_max": 0.0005000900127924979, "clip_ratio/high_mean": 0.00026663828175514935, "clip_ratio/low_mean": 6.822985887993127e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033486814936622977, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 534.638671875, "completions/min_length": 239.0, "epoch": 0.4262807978099335, "frac_reward_zero_std": 0.015625, "grad_norm": 0.81640625, "kl": 0.003055526874959469, "learning_rate": 1.9254156764656982e-06, "loss": 0.00011713984422385692, "reward": 0.3681028485298157, "reward_std": 0.26918764412403107, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3681028336286545, "rewards/QAReward/std": 0.4077514410018921, "step": 2180 }, { "clip_ratio/high_max": 0.0002682521298993379, "clip_ratio/high_mean": 0.00015826691524125635, "clip_ratio/low_mean": 6.347400194499642e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022174092009663582, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 519.7278645833334, "completions/min_length": 254.33333333333334, "epoch": 0.42725850606179117, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.81640625, "kl": 0.0030595038551837206, "learning_rate": 1.9208582117359723e-06, "loss": 0.0002115244511514902, "reward": 0.36119017004966736, "reward_std": 0.30560757716496784, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36119017004966736, "rewards/QAReward/std": 0.44872914751370746, "step": 2185 }, { "clip_ratio/high_max": 0.00045265291118994354, "clip_ratio/high_mean": 0.0002655804855749011, "clip_ratio/low_mean": 7.333020039368421e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003389106714166701, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 521.498046875, "completions/min_length": 235.0, "epoch": 0.42823621431364883, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8828125, "kl": 0.0031507829669862987, "learning_rate": 1.916296526040347e-06, "loss": 0.0001369995530694723, "reward": 0.36830978095531464, "reward_std": 0.30703502893447876, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36830976605415344, "rewards/QAReward/std": 0.47035351395606995, "step": 2190 }, { "clip_ratio/high_max": 0.00023888067808002232, "clip_ratio/high_mean": 0.00013590502203442156, "clip_ratio/low_mean": 2.517074753995985e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016107576666399837, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 519.5729166666666, "completions/min_length": 240.66666666666666, "epoch": 0.42921392256550644, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.86328125, "kl": 0.003176939208060503, "learning_rate": 1.9117306651299065e-06, "loss": 0.00017613498494029045, "reward": 0.34142278631528217, "reward_std": 0.25999099016189575, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34142278631528217, "rewards/QAReward/std": 0.46803287665049237, "step": 2195 }, { "clip_ratio/high_max": 0.00041468538111075757, "clip_ratio/high_mean": 0.00023179774289019405, "clip_ratio/low_mean": 4.5618195144925267e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027741594240069387, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 537.115234375, "completions/min_length": 260.5, "epoch": 0.4301916308173641, "frac_reward_zero_std": 0.03125, "grad_norm": 0.77734375, "kl": 0.0029547465965151787, "learning_rate": 1.9071606747976113e-06, "loss": 0.00010917907347902655, "reward": 0.3593974709510803, "reward_std": 0.276661217212677, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3593974709510803, "rewards/QAReward/std": 0.46794378757476807, "step": 2200 }, { "clip_ratio/high_max": 0.00024995008716359733, "clip_ratio/high_mean": 0.00012289267033338547, "clip_ratio/low_mean": 8.153723028954118e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002044299035333097, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 536.125, "completions/min_length": 257.0, "epoch": 0.4311693390692217, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.7109375, "kl": 0.0030259294901043175, "learning_rate": 1.9025866008778365e-06, "loss": 0.00017545218579471112, "reward": 0.32286450266838074, "reward_std": 0.2759895920753479, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32286447286605835, "rewards/QAReward/std": 0.4606989522775014, "step": 2205 }, { "clip_ratio/high_max": 0.0006067864364013076, "clip_ratio/high_mean": 0.00030701757641509174, "clip_ratio/low_mean": 6.423872109735384e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037125631934031844, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 529.73828125, "completions/min_length": 252.0, "epoch": 0.4321470473210794, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8203125, "kl": 0.0031023419462144373, "learning_rate": 1.8980084892459145e-06, "loss": 0.0001954509411007166, "reward": 0.3793410360813141, "reward_std": 0.25283078849315643, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3793410509824753, "rewards/QAReward/std": 0.4370255321264267, "step": 2210 }, { "clip_ratio/high_max": 0.00019993931637145578, "clip_ratio/high_mean": 0.00011332752765156329, "clip_ratio/low_mean": 4.756604030262679e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016089356504380702, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 974.6666666666666, "completions/mean_length": 526.39453125, "completions/min_length": 249.33333333333334, "epoch": 0.43312475557293706, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7578125, "kl": 0.003151051700115204, "learning_rate": 1.8934263858176719e-06, "loss": 9.65576444286853e-05, "reward": 0.3474082350730896, "reward_std": 0.2679486374060313, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3474082450071971, "rewards/QAReward/std": 0.4603317677974701, "step": 2215 }, { "clip_ratio/high_max": 0.0003869473410304636, "clip_ratio/high_mean": 0.0002320102765224874, "clip_ratio/low_mean": 7.438550164806657e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030639576725661756, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 521.197265625, "completions/min_length": 243.0, "epoch": 0.43410246382479467, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8203125, "kl": 0.0031533404253423214, "learning_rate": 1.8888403365489722e-06, "loss": 0.00028443862684071066, "reward": 0.4123465567827225, "reward_std": 0.27348221838474274, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4123465418815613, "rewards/QAReward/std": 0.4233645796775818, "step": 2220 }, { "clip_ratio/high_max": 0.00028322001453489064, "clip_ratio/high_mean": 0.00015231805155053736, "clip_ratio/low_mean": 6.188977567944676e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002142078126780689, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 539.7291666666666, "completions/min_length": 262.0, "epoch": 0.43508017207665234, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8359375, "kl": 0.0029368980787694452, "learning_rate": 1.8842503874352526e-06, "loss": 9.499126463197172e-05, "reward": 0.35548410813013714, "reward_std": 0.29109681646029156, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35548410813013714, "rewards/QAReward/std": 0.4406875669956207, "step": 2225 }, { "clip_ratio/high_max": 0.0004424496728461236, "clip_ratio/high_mean": 0.00027589687961153685, "clip_ratio/low_mean": 5.735764716519043e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003332545340526849, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 514.29296875, "completions/min_length": 244.0, "epoch": 0.43605788032850995, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.0031839975621551276, "learning_rate": 1.8796565845110644e-06, "loss": 0.0001505182357504964, "reward": 0.35466890037059784, "reward_std": 0.2553420662879944, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35466891527175903, "rewards/QAReward/std": 0.41455407440662384, "step": 2230 }, { "clip_ratio/high_max": 0.0002782124327495694, "clip_ratio/high_mean": 0.00018289199797436596, "clip_ratio/low_mean": 5.868180742254481e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024157380685210227, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 529.2083333333334, "completions/min_length": 262.3333333333333, "epoch": 0.4370355885803676, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.828125, "kl": 0.002985218074172735, "learning_rate": 1.8750589738496092e-06, "loss": 0.000101869972422719, "reward": 0.3472093145052592, "reward_std": 0.2979096869627635, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34720930457115173, "rewards/QAReward/std": 0.4518239100774129, "step": 2235 }, { "clip_ratio/high_max": 0.0005403876188211143, "clip_ratio/high_mean": 0.00028083831421099603, "clip_ratio/low_mean": 7.040189084364101e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003512402123305947, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 510.05078125, "completions/min_length": 223.0, "epoch": 0.4380132968322253, "frac_reward_zero_std": 0.046875, "grad_norm": 0.83984375, "kl": 0.003118863236159086, "learning_rate": 1.8704576015622789e-06, "loss": 0.00010954118333756923, "reward": 0.33160651475191116, "reward_std": 0.2604203000664711, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33160652220249176, "rewards/QAReward/std": 0.44569912552833557, "step": 2240 }, { "clip_ratio/high_max": 0.000274903210811317, "clip_ratio/high_mean": 0.0001585675112437457, "clip_ratio/low_mean": 4.744385078083724e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020601136493496596, "completions/clipped_ratio": 0.049479166666666664, "completions/max_length": 1024.0, "completions/mean_length": 547.5807291666666, "completions/min_length": 253.33333333333334, "epoch": 0.4389910050840829, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.82421875, "kl": 0.002914897492155433, "learning_rate": 1.8658525137981924e-06, "loss": 0.00010501539800316096, "reward": 0.37239118417104083, "reward_std": 0.2810593048731486, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37239118417104083, "rewards/QAReward/std": 0.42773351073265076, "step": 2245 }, { "clip_ratio/high_max": 0.00041268900968134404, "clip_ratio/high_mean": 0.00018970814417116345, "clip_ratio/low_mean": 9.561892948113382e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002853270678315312, "completions/clipped_ratio": 0.02734375, "completions/max_length": 953.0, "completions/mean_length": 535.97265625, "completions/min_length": 224.5, "epoch": 0.43996871333594056, "frac_reward_zero_std": 0.0625, "grad_norm": 0.83984375, "kl": 0.0028840600047260524, "learning_rate": 1.8612437567437315e-06, "loss": 0.00012627722462639214, "reward": 0.3014020770788193, "reward_std": 0.26917167752981186, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3014020770788193, "rewards/QAReward/std": 0.4890313148498535, "step": 2250 }, { "clip_ratio/high_max": 0.0002505282871425152, "clip_ratio/high_mean": 0.00014039897359907627, "clip_ratio/low_mean": 4.176889051450416e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018216785392723977, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 539.2109375, "completions/min_length": 246.0, "epoch": 0.4409464215877982, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.83984375, "kl": 0.002937528258189559, "learning_rate": 1.8566313766220805e-06, "loss": 0.00012600801419466735, "reward": 0.36831700801849365, "reward_std": 0.28877594073613483, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36831700801849365, "rewards/QAReward/std": 0.4195507764816284, "step": 2255 }, { "clip_ratio/high_max": 0.0005081459879875183, "clip_ratio/high_mean": 0.0002453687426168472, "clip_ratio/low_mean": 6.755855574738234e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031292727217078207, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 534.59375, "completions/min_length": 245.0, "epoch": 0.44192412983965584, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76171875, "kl": 0.0029321304988116025, "learning_rate": 1.8520154196927602e-06, "loss": 0.00019221666734665633, "reward": 0.34645646810531616, "reward_std": 0.2897801399230957, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34645648300647736, "rewards/QAReward/std": 0.4136037677526474, "step": 2260 }, { "clip_ratio/high_max": 0.00028870482929050923, "clip_ratio/high_mean": 0.00017941797850653529, "clip_ratio/low_mean": 6.967158406041562e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002490895567461848, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 533.9622395833334, "completions/min_length": 215.0, "epoch": 0.4429018380915135, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.82421875, "kl": 0.0030140043701976536, "learning_rate": 1.8473959322511647e-06, "loss": 0.00013486368115991353, "reward": 0.384943887591362, "reward_std": 0.27714499831199646, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3849438925584157, "rewards/QAReward/std": 0.4036886195341746, "step": 2265 }, { "clip_ratio/high_max": 0.00045632979599758984, "clip_ratio/high_mean": 0.00022851008689031004, "clip_ratio/low_mean": 8.67633760208264e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003152734832838178, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 521.07421875, "completions/min_length": 251.0, "epoch": 0.4438795463433711, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.003026540717110038, "learning_rate": 1.8427729606280967e-06, "loss": 9.667971753515303e-05, "reward": 0.24562639743089676, "reward_std": 0.3053726404905319, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.24562638998031616, "rewards/QAReward/std": 0.4392862617969513, "step": 2270 }, { "clip_ratio/high_max": 0.00027021393179893494, "clip_ratio/high_mean": 0.00014673484256491064, "clip_ratio/low_mean": 5.7531939819455144e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002042667882051319, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 536.3059895833334, "completions/min_length": 239.33333333333334, "epoch": 0.4448572545952288, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.89453125, "kl": 0.002943475544452667, "learning_rate": 1.838146551189304e-06, "loss": 9.524362394586205e-05, "reward": 0.29519470036029816, "reward_std": 0.2740302085876465, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29519470532735187, "rewards/QAReward/std": 0.4140651623408, "step": 2275 }, { "clip_ratio/high_max": 0.00044551758328452705, "clip_ratio/high_mean": 0.00027557763387449087, "clip_ratio/low_mean": 5.269059183774516e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003282682329881936, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 520.22265625, "completions/min_length": 237.5, "epoch": 0.4458349628470864, "frac_reward_zero_std": 0.046875, "grad_norm": 0.80078125, "kl": 0.003003347618505359, "learning_rate": 1.8335167503350137e-06, "loss": 3.9553179522044954e-05, "reward": 0.36720380187034607, "reward_std": 0.2948443591594696, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36720380187034607, "rewards/QAReward/std": 0.4174448996782303, "step": 2280 }, { "clip_ratio/high_max": 0.00030914673116058113, "clip_ratio/high_mean": 0.0001576203911099583, "clip_ratio/low_mean": 4.117568605579436e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019879608298651875, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 531.5026041666666, "completions/min_length": 250.33333333333334, "epoch": 0.44681267109894407, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.7734375, "kl": 0.0029829688370227813, "learning_rate": 1.8288836044994663e-06, "loss": 0.0001223767874762416, "reward": 0.42025256156921387, "reward_std": 0.27550508578618366, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42025257150332135, "rewards/QAReward/std": 0.42364325126012164, "step": 2285 }, { "clip_ratio/high_max": 0.00038850175915285947, "clip_ratio/high_mean": 0.0002274904982186854, "clip_ratio/low_mean": 6.27929126494564e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002902834035921842, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 567.349609375, "completions/min_length": 208.0, "epoch": 0.44779037935080174, "frac_reward_zero_std": 0.015625, "grad_norm": 0.82421875, "kl": 0.0028608930762857197, "learning_rate": 1.8242471601504504e-06, "loss": 9.567709639668465e-05, "reward": 0.3063817620277405, "reward_std": 0.2969301640987396, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3063817620277405, "rewards/QAReward/std": 0.45219528675079346, "step": 2290 }, { "clip_ratio/high_max": 0.0003029934712685645, "clip_ratio/high_mean": 0.00014367186231538652, "clip_ratio/low_mean": 4.020238993689418e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018387425807304681, "completions/clipped_ratio": 0.044270833333333336, "completions/max_length": 1024.0, "completions/mean_length": 528.9479166666666, "completions/min_length": 259.3333333333333, "epoch": 0.44876808760265935, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.77734375, "kl": 0.0030151328537613153, "learning_rate": 1.8196074637888375e-06, "loss": 0.00014085231814533472, "reward": 0.36936745047569275, "reward_std": 0.27376270294189453, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36936746040980023, "rewards/QAReward/std": 0.4492364823818207, "step": 2295 }, { "clip_ratio/high_max": 0.0005249876528978347, "clip_ratio/high_mean": 0.0002544682298321277, "clip_ratio/low_mean": 8.454405469819904e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003390122787095606, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1024.0, "completions/mean_length": 535.224609375, "completions/min_length": 276.5, "epoch": 0.449745795854517, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.0029244698584079743, "learning_rate": 1.8149645619481148e-06, "loss": 8.534114458598196e-05, "reward": 0.332643523812294, "reward_std": 0.26281698048114777, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3326435387134552, "rewards/QAReward/std": 0.40704579651355743, "step": 2300 }, { "clip_ratio/high_max": 0.0003662982489913702, "clip_ratio/high_mean": 0.00017016121419146656, "clip_ratio/low_mean": 6.174063892103732e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023190185893326998, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 542.3385416666666, "completions/min_length": 252.66666666666666, "epoch": 0.4507235041063747, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.7734375, "kl": 0.0030341320671141148, "learning_rate": 1.8103185011939184e-06, "loss": 0.0001876409165561199, "reward": 0.3319108784198761, "reward_std": 0.31264639894167584, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3319108684857686, "rewards/QAReward/std": 0.46851051847139996, "step": 2305 }, { "clip_ratio/high_max": 0.00041295521659776566, "clip_ratio/high_mean": 0.00020735340076498686, "clip_ratio/low_mean": 0.00012250113068148493, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003298545372672379, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 533.74609375, "completions/min_length": 234.0, "epoch": 0.4517012123582323, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7734375, "kl": 0.006897415174171329, "learning_rate": 1.8056693281235663e-06, "loss": 0.0002455746987834573, "reward": 0.40566639602184296, "reward_std": 0.26416008174419403, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40566638112068176, "rewards/QAReward/std": 0.4390583485364914, "step": 2310 }, { "clip_ratio/high_max": 0.00028557340847328304, "clip_ratio/high_mean": 0.00014459106023423374, "clip_ratio/low_mean": 5.3210233454592525e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001978013024199754, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 532.875, "completions/min_length": 261.6666666666667, "epoch": 0.45267892061008996, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.83984375, "kl": 0.0030911510344594716, "learning_rate": 1.8010170893655917e-06, "loss": 7.439529290422797e-05, "reward": 0.39682798584302265, "reward_std": 0.2715022663275401, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3968279957771301, "rewards/QAReward/std": 0.4110926886399587, "step": 2315 }, { "clip_ratio/high_max": 0.00042118800338357686, "clip_ratio/high_mean": 0.00016809175431262702, "clip_ratio/low_mean": 7.282180595211685e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002409135573543608, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 533.373046875, "completions/min_length": 252.5, "epoch": 0.4536566288619476, "frac_reward_zero_std": 0.0625, "grad_norm": 0.78515625, "kl": 0.0029352245386689902, "learning_rate": 1.7963618315792746e-06, "loss": 0.0001230601337738335, "reward": 0.3820374459028244, "reward_std": 0.2822103053331375, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3820374459028244, "rewards/QAReward/std": 0.43913887441158295, "step": 2320 }, { "clip_ratio/high_max": 0.00032370880944654346, "clip_ratio/high_mean": 0.00016039899201132358, "clip_ratio/low_mean": 6.005708419252187e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002204560791142285, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 529.4283854166666, "completions/min_length": 204.66666666666666, "epoch": 0.45463433711380524, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.0029428301379084585, "learning_rate": 1.7917036014541742e-06, "loss": 0.0001423127483576536, "reward": 0.3923233946164449, "reward_std": 0.27988478541374207, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3923233946164449, "rewards/QAReward/std": 0.43461064497629803, "step": 2325 }, { "clip_ratio/high_max": 0.00041955504566431044, "clip_ratio/high_mean": 0.0002259968430735171, "clip_ratio/low_mean": 0.00011113184446003288, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000337128690443933, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 515.40234375, "completions/min_length": 211.0, "epoch": 0.4556120453656629, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83984375, "kl": 0.0031023234128952025, "learning_rate": 1.7870424457096594e-06, "loss": 0.00021807376760989426, "reward": 0.32249920070171356, "reward_std": 0.28335198760032654, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32249920070171356, "rewards/QAReward/std": 0.4538858234882355, "step": 2330 }, { "clip_ratio/high_max": 0.0003940211609005928, "clip_ratio/high_mean": 0.00018427788745611906, "clip_ratio/low_mean": 3.4190481528639793e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021846838062629105, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 540.8033854166666, "completions/min_length": 235.66666666666666, "epoch": 0.4565897536175205, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.81640625, "kl": 0.0030682821292430163, "learning_rate": 1.7823784110944444e-06, "loss": 0.0001435542246326804, "reward": 0.28392547865708667, "reward_std": 0.29954500993092853, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.28392547865708667, "rewards/QAReward/std": 0.4537968536218007, "step": 2335 }, { "clip_ratio/high_max": 0.0004979401244781911, "clip_ratio/high_mean": 0.00024116599233821036, "clip_ratio/low_mean": 6.929315786692314e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003104591509327292, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 553.87109375, "completions/min_length": 225.0, "epoch": 0.4575674618693782, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7578125, "kl": 0.002855200506746769, "learning_rate": 1.7777115443861135e-06, "loss": 0.000124682136811316, "reward": 0.40465572476387024, "reward_std": 0.2656372934579849, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40465572476387024, "rewards/QAReward/std": 0.4343114495277405, "step": 2340 }, { "clip_ratio/high_max": 0.0003898528637364507, "clip_ratio/high_mean": 0.00017138475668616594, "clip_ratio/low_mean": 3.585989325074479e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020724463975057005, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 522.7513020833334, "completions/min_length": 249.33333333333334, "epoch": 0.4585451701212358, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.80078125, "kl": 0.003037937358021736, "learning_rate": 1.773041892390657e-06, "loss": 0.00014634830877184867, "reward": 0.3624422351519267, "reward_std": 0.27918432156244916, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3624422252178192, "rewards/QAReward/std": 0.48103173573811847, "step": 2345 }, { "clip_ratio/high_max": 0.0005302451318129897, "clip_ratio/high_mean": 0.0002459061681292951, "clip_ratio/low_mean": 6.583420181414113e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031174037721939387, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 545.548828125, "completions/min_length": 229.5, "epoch": 0.45952287837309347, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.002998462552204728, "learning_rate": 1.7683695019419998e-06, "loss": 0.00011412205640226603, "reward": 0.3915666937828064, "reward_std": 0.3059465289115906, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3915666937828064, "rewards/QAReward/std": 0.45117518305778503, "step": 2350 }, { "clip_ratio/high_max": 0.0003427958698011935, "clip_ratio/high_mean": 0.00015122267650440334, "clip_ratio/low_mean": 6.501219759229571e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000216234871186316, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 543.36328125, "completions/min_length": 246.66666666666666, "epoch": 0.46050058662495114, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.003082946175709367, "learning_rate": 1.763694419901532e-06, "loss": 0.00016284361481666565, "reward": 0.3484881520271301, "reward_std": 0.2598422070344289, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3484881321589152, "rewards/QAReward/std": 0.4274849494298299, "step": 2355 }, { "clip_ratio/high_max": 0.0005342728225514292, "clip_ratio/high_mean": 0.00024149538949131967, "clip_ratio/low_mean": 9.316910291090608e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003346644865814596, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 537.05859375, "completions/min_length": 230.0, "epoch": 0.46147829487680875, "frac_reward_zero_std": 0.015625, "grad_norm": 0.87890625, "kl": 0.0030922869686037303, "learning_rate": 1.759016693157638e-06, "loss": 0.00011634104885160923, "reward": 0.3992970287799835, "reward_std": 0.26661360263824463, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3992970436811447, "rewards/QAReward/std": 0.43822167813777924, "step": 2360 }, { "clip_ratio/high_max": 0.0004074235912412405, "clip_ratio/high_mean": 0.00022740184795111418, "clip_ratio/low_mean": 5.757365142926574e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002849754877388477, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 536.4622395833334, "completions/min_length": 261.6666666666667, "epoch": 0.4624560031286664, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8203125, "kl": 0.0029339209664613008, "learning_rate": 1.7543363686252283e-06, "loss": 9.219368803314865e-05, "reward": 0.3612528045972188, "reward_std": 0.2695212960243225, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3612528145313263, "rewards/QAReward/std": 0.43666407465934753, "step": 2365 }, { "clip_ratio/high_max": 0.0004397041164338589, "clip_ratio/high_mean": 0.00021261798101477326, "clip_ratio/low_mean": 7.193826750153675e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028455624706111846, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 523.94921875, "completions/min_length": 255.5, "epoch": 0.463433711380524, "frac_reward_zero_std": 0.046875, "grad_norm": 0.79296875, "kl": 0.003114062827080488, "learning_rate": 1.749653493245267e-06, "loss": 9.196983301080763e-05, "reward": 0.40909211337566376, "reward_std": 0.2381610944867134, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40909212827682495, "rewards/QAReward/std": 0.41435959935188293, "step": 2370 }, { "clip_ratio/high_max": 0.0003266497049480677, "clip_ratio/high_mean": 0.00015920374426059425, "clip_ratio/low_mean": 4.4332575635053216e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002035363228060305, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 528.109375, "completions/min_length": 266.6666666666667, "epoch": 0.4644114196323817, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8125, "kl": 0.003085633087903261, "learning_rate": 1.7449681139843015e-06, "loss": 8.697374723851681e-05, "reward": 0.3047097126642863, "reward_std": 0.29073046644528705, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3047097126642863, "rewards/QAReward/std": 0.42834938565889996, "step": 2375 }, { "clip_ratio/high_max": 0.0004578448832035065, "clip_ratio/high_mean": 0.00023562597343698143, "clip_ratio/low_mean": 7.439706532750279e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003100230358541012, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 530.7734375, "completions/min_length": 236.0, "epoch": 0.46538912788423936, "frac_reward_zero_std": 0.015625, "grad_norm": 0.79296875, "kl": 0.0030617194250226023, "learning_rate": 1.7402802778339923e-06, "loss": 0.00015336626674979925, "reward": 0.3832728862762451, "reward_std": 0.27915869653224945, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3832728862762451, "rewards/QAReward/std": 0.4813126027584076, "step": 2380 }, { "clip_ratio/high_max": 0.00027799638919532297, "clip_ratio/high_mean": 0.00016935303574427962, "clip_ratio/low_mean": 4.9350803601555525e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021870384807698429, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 515.4192708333334, "completions/min_length": 231.33333333333334, "epoch": 0.466366836136097, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.84375, "kl": 0.0030944081488996746, "learning_rate": 1.7355900318106407e-06, "loss": 0.00010992127936333418, "reward": 0.3462053636709849, "reward_std": 0.26624204715092975, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34620535373687744, "rewards/QAReward/std": 0.4447603225708008, "step": 2385 }, { "clip_ratio/high_max": 0.0004509381949901581, "clip_ratio/high_mean": 0.00027897918480448427, "clip_ratio/low_mean": 8.115168020594865e-05, "clip_ratio/low_min": 2.064622676698491e-05, "clip_ratio/region_mean": 0.000360130873741582, "completions/clipped_ratio": 0.0078125, "completions/max_length": 938.0, "completions/mean_length": 527.697265625, "completions/min_length": 246.0, "epoch": 0.46734454438795464, "frac_reward_zero_std": 0.03125, "grad_norm": 0.828125, "kl": 0.0030874341726303102, "learning_rate": 1.7308974229547177e-06, "loss": 0.00011963262222707272, "reward": 0.33145771920681, "reward_std": 0.2721021920442581, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33145771920681, "rewards/QAReward/std": 0.4448835551738739, "step": 2390 }, { "clip_ratio/high_max": 0.00031801358563825487, "clip_ratio/high_mean": 0.0001387063763104379, "clip_ratio/low_mean": 4.2489581392146644e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001811959664337337, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 536.1536458333334, "completions/min_length": 231.33333333333334, "epoch": 0.46832225263981225, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7890625, "kl": 0.0029303704854100943, "learning_rate": 1.7262024983303924e-06, "loss": 7.658222457394004e-05, "reward": 0.3632957140604655, "reward_std": 0.28851226965586346, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3632957339286804, "rewards/QAReward/std": 0.41669472058614093, "step": 2395 }, { "clip_ratio/high_max": 0.00039311890723183753, "clip_ratio/high_mean": 0.00021756060887128114, "clip_ratio/low_mean": 8.966508612502366e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003072257153689861, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 530.4921875, "completions/min_length": 256.0, "epoch": 0.4692999608916699, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.0030813675839453936, "learning_rate": 1.7215053050250589e-06, "loss": 0.00012665665708482264, "reward": 0.37315694987773895, "reward_std": 0.2933889180421829, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37315694987773895, "rewards/QAReward/std": 0.4399626702070236, "step": 2400 }, { "clip_ratio/high_max": 0.0002844486036337912, "clip_ratio/high_mean": 0.00015004035667516292, "clip_ratio/low_mean": 4.143075639149174e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019147112034261226, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 530.5572916666666, "completions/min_length": 238.0, "epoch": 0.4702776691435276, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.85546875, "kl": 0.0031251620035618543, "learning_rate": 1.7168058901488669e-06, "loss": 0.0001636108150705695, "reward": 0.3951125641663869, "reward_std": 0.2850572069485982, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.395112544298172, "rewards/QAReward/std": 0.42540624737739563, "step": 2405 }, { "clip_ratio/high_max": 0.000565172970527783, "clip_ratio/high_mean": 0.00024375877983402462, "clip_ratio/low_mean": 8.501549600623549e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032877427292987704, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 525.09375, "completions/min_length": 245.5, "epoch": 0.4712553773953852, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.003073735162615776, "learning_rate": 1.712104300834244e-06, "loss": 0.00019456639420241119, "reward": 0.34635649621486664, "reward_std": 0.2806183099746704, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34635651111602783, "rewards/QAReward/std": 0.4565685987472534, "step": 2410 }, { "clip_ratio/high_max": 0.0002908889320679009, "clip_ratio/high_mean": 0.0001597403024788946, "clip_ratio/low_mean": 4.9972644774243236e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020971295307390392, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 538.1927083333334, "completions/min_length": 265.0, "epoch": 0.47223308564724287, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78125, "kl": 0.0029958255123347043, "learning_rate": 1.7074005842354292e-06, "loss": 0.0001445260364562273, "reward": 0.3616279462973277, "reward_std": 0.2859138051668803, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3616279462973277, "rewards/QAReward/std": 0.44944961865743, "step": 2415 }, { "clip_ratio/high_max": 0.00040981973870657384, "clip_ratio/high_mean": 0.00022262403799686581, "clip_ratio/low_mean": 8.676944707985967e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003093934792559594, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 543.494140625, "completions/min_length": 256.0, "epoch": 0.47321079389910053, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8359375, "kl": 0.003009403683245182, "learning_rate": 1.7026947875279957e-06, "loss": 0.00022341464646160602, "reward": 0.3485587537288666, "reward_std": 0.28986871242523193, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34855876863002777, "rewards/QAReward/std": 0.45758984982967377, "step": 2420 }, { "clip_ratio/high_max": 0.0003884733421728015, "clip_ratio/high_mean": 0.0001381308538839221, "clip_ratio/low_mean": 4.4036346662323925e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018216719618067146, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 522.2604166666666, "completions/min_length": 234.66666666666666, "epoch": 0.47418850215095815, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8359375, "kl": 0.0030266177374869583, "learning_rate": 1.6979869579083781e-06, "loss": 9.242843370884656e-05, "reward": 0.3619311253229777, "reward_std": 0.28781601786613464, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3619311253229777, "rewards/QAReward/std": 0.4284774859746297, "step": 2425 }, { "clip_ratio/high_max": 0.000422279944177717, "clip_ratio/high_mean": 0.0002063574967905879, "clip_ratio/low_mean": 6.079721351852641e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026715470012277364, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 518.720703125, "completions/min_length": 244.5, "epoch": 0.4751662104028158, "frac_reward_zero_std": 0.015625, "grad_norm": 0.80859375, "kl": 0.0031364778988063335, "learning_rate": 1.6932771425934017e-06, "loss": 0.0001605012104846537, "reward": 0.4076575040817261, "reward_std": 0.2745314687490463, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4076574891805649, "rewards/QAReward/std": 0.4288863092660904, "step": 2430 }, { "clip_ratio/high_max": 0.0003616856760345399, "clip_ratio/high_mean": 0.00019785442855209113, "clip_ratio/low_mean": 2.2978077322477474e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022083251387812198, "completions/clipped_ratio": 0.010416666666666666, "completions/max_length": 1024.0, "completions/mean_length": 506.6510416666667, "completions/min_length": 259.0, "epoch": 0.4761439186546734, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8359375, "kl": 0.003149261185899377, "learning_rate": 1.6885653888198057e-06, "loss": 9.995452128350735e-05, "reward": 0.3943677445252736, "reward_std": 0.2733049988746643, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3943677345911662, "rewards/QAReward/std": 0.43076475461324054, "step": 2435 }, { "clip_ratio/high_max": 0.0004955955315381289, "clip_ratio/high_mean": 0.00025547738187015057, "clip_ratio/low_mean": 8.239152375608683e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033786890562623737, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 535.599609375, "completions/min_length": 249.0, "epoch": 0.4771216269065311, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.0030862133484333754, "learning_rate": 1.683851743843772e-06, "loss": 0.00017209690995514392, "reward": 0.3697374165058136, "reward_std": 0.2577958405017853, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3697374016046524, "rewards/QAReward/std": 0.4414421021938324, "step": 2440 }, { "clip_ratio/high_max": 0.0003166960086673498, "clip_ratio/high_mean": 0.0001244277984369546, "clip_ratio/low_mean": 3.474608965916559e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015917389537207783, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 539.0572916666666, "completions/min_length": 257.6666666666667, "epoch": 0.47809933515838876, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.8359375, "kl": 0.0029344069305807354, "learning_rate": 1.6791362549404488e-06, "loss": 0.0001458100974559784, "reward": 0.3971700568993886, "reward_std": 0.2658375898996989, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3971700370311737, "rewards/QAReward/std": 0.46457592646280926, "step": 2445 }, { "clip_ratio/high_max": 0.0004966688342392444, "clip_ratio/high_mean": 0.00024312916211783887, "clip_ratio/low_mean": 8.417355857091025e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003273027134127915, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 510.822265625, "completions/min_length": 251.5, "epoch": 0.4790770434102464, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8359375, "kl": 0.0031285787466913463, "learning_rate": 1.6744189694034798e-06, "loss": 8.596167317591608e-05, "reward": 0.36244651675224304, "reward_std": 0.2923379987478256, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36244653910398483, "rewards/QAReward/std": 0.4155856966972351, "step": 2450 }, { "clip_ratio/high_max": 0.0004482654156163335, "clip_ratio/high_mean": 0.0002079106925521046, "clip_ratio/low_mean": 2.889548777602613e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023680616868659854, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 533.2317708333334, "completions/min_length": 248.33333333333334, "epoch": 0.48005475166210404, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.7890625, "kl": 0.00305079217068851, "learning_rate": 1.669699934544526e-06, "loss": 8.819355862215162e-05, "reward": 0.3178116778532664, "reward_std": 0.29845017194747925, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3178116778532664, "rewards/QAReward/std": 0.47132988770802814, "step": 2455 }, { "clip_ratio/high_max": 0.0004698332282714546, "clip_ratio/high_mean": 0.0002217000408563763, "clip_ratio/low_mean": 0.00011191814846824854, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003336181864142418, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 507.255859375, "completions/min_length": 228.5, "epoch": 0.48103245991396165, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83984375, "kl": 0.0031015520915389063, "learning_rate": 1.6649791976927948e-06, "loss": 0.00018832426285371185, "reward": 0.3305129259824753, "reward_std": 0.2915779948234558, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3305129408836365, "rewards/QAReward/std": 0.4599938541650772, "step": 2460 }, { "clip_ratio/high_max": 0.0004221403272822499, "clip_ratio/high_mean": 0.00017933296621777117, "clip_ratio/low_mean": 5.845898413099348e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023779195034876466, "completions/clipped_ratio": 0.041666666666666664, "completions/max_length": 1024.0, "completions/mean_length": 546.8932291666666, "completions/min_length": 258.3333333333333, "epoch": 0.4820101681658193, "frac_reward_zero_std": 0.03125, "grad_norm": 0.875, "kl": 0.002900387905538082, "learning_rate": 1.6602568061945616e-06, "loss": 8.317084284499287e-05, "reward": 0.3583374619483948, "reward_std": 0.2918021281560262, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3583374818166097, "rewards/QAReward/std": 0.44290218750635785, "step": 2465 }, { "clip_ratio/high_max": 0.000502830499317497, "clip_ratio/high_mean": 0.0002697893534786999, "clip_ratio/low_mean": 7.118352659745142e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034097288735210893, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1013.0, "completions/mean_length": 518.791015625, "completions/min_length": 245.0, "epoch": 0.482987876417677, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.003045383468270302, "learning_rate": 1.655532807412699e-06, "loss": 0.00011206358904018998, "reward": 0.29010263085365295, "reward_std": 0.2964576482772827, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29010263085365295, "rewards/QAReward/std": 0.44041620194911957, "step": 2470 }, { "clip_ratio/high_max": 0.000355547817889601, "clip_ratio/high_mean": 0.00021716960472986102, "clip_ratio/low_mean": 7.313584501389414e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002903054468333721, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 525.3111979166666, "completions/min_length": 237.33333333333334, "epoch": 0.4839655846695346, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.77734375, "kl": 0.003035518666729331, "learning_rate": 1.6508072487261984e-06, "loss": 7.633913774043321e-05, "reward": 0.30433544516563416, "reward_std": 0.2810378074645996, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30433544516563416, "rewards/QAReward/std": 0.43370260794957477, "step": 2475 }, { "clip_ratio/high_max": 0.0005078349262475967, "clip_ratio/high_mean": 0.0002889911294914782, "clip_ratio/low_mean": 8.258828456746415e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037157939514145257, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 522.947265625, "completions/min_length": 249.5, "epoch": 0.48494329292139227, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0030447917990386484, "learning_rate": 1.6460801775296967e-06, "loss": 0.00012412352953106165, "reward": 0.36315490305423737, "reward_std": 0.27833518385887146, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36315491795539856, "rewards/QAReward/std": 0.44538192451000214, "step": 2480 }, { "clip_ratio/high_max": 0.00040114125004038216, "clip_ratio/high_mean": 0.00015687685809098184, "clip_ratio/low_mean": 4.2725562525447455e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019960241625085473, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 541.9088541666666, "completions/min_length": 259.3333333333333, "epoch": 0.4859210011732499, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.82421875, "kl": 0.0029191827401518823, "learning_rate": 1.6413516412330002e-06, "loss": 0.00016334288520738482, "reward": 0.32220207651456195, "reward_std": 0.27406538526217145, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3222020665804545, "rewards/QAReward/std": 0.42223790287971497, "step": 2485 }, { "clip_ratio/high_max": 0.0004675347474403679, "clip_ratio/high_mean": 0.00024310406297445297, "clip_ratio/low_mean": 4.4779658492188903e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002878837229218334, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 525.3828125, "completions/min_length": 240.0, "epoch": 0.48689870942510755, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8125, "kl": 0.0030995755922049286, "learning_rate": 1.6366216872606097e-06, "loss": 0.00011754890438169241, "reward": 0.3399832546710968, "reward_std": 0.31382741034030914, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3399832397699356, "rewards/QAReward/std": 0.49786433577537537, "step": 2490 }, { "clip_ratio/high_max": 0.00025803433964028956, "clip_ratio/high_mean": 0.00012462002923712133, "clip_ratio/low_mean": 6.443149177357555e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018905152101069688, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 546.4466145833334, "completions/min_length": 244.33333333333334, "epoch": 0.4878764176769652, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.80078125, "kl": 0.002955926675349474, "learning_rate": 1.6318903630512435e-06, "loss": 0.00012877091066911816, "reward": 0.3232106665770213, "reward_std": 0.28273991743723553, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3232106765111287, "rewards/QAReward/std": 0.42761261264483136, "step": 2495 }, { "clip_ratio/high_max": 0.00040734343929216266, "clip_ratio/high_mean": 0.00022768884664401413, "clip_ratio/low_mean": 6.775224464945495e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000295441085472703, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 532.3359375, "completions/min_length": 221.5, "epoch": 0.4888541259288228, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8046875, "kl": 0.0029609156772494316, "learning_rate": 1.6271577160573638e-06, "loss": 0.00013481636997312308, "reward": 0.3505231738090515, "reward_std": 0.29277727007865906, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3505231738090515, "rewards/QAReward/std": 0.4498744308948517, "step": 2500 }, { "clip_ratio/high_max": 0.000343914411496371, "clip_ratio/high_mean": 0.00014943652204237878, "clip_ratio/low_mean": 6.016251281835139e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020959902321919798, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 535.2213541666666, "completions/min_length": 235.33333333333334, "epoch": 0.4898318341806805, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.002962984796613455, "learning_rate": 1.6224237937446984e-06, "loss": 0.00010984536493197083, "reward": 0.3580067853132884, "reward_std": 0.2782738407452901, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3580067853132884, "rewards/QAReward/std": 0.42352813482284546, "step": 2505 }, { "clip_ratio/high_max": 0.0003880946082063019, "clip_ratio/high_mean": 0.0002013959805481136, "clip_ratio/low_mean": 0.00010197327501373365, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003033692541066557, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 537.15625, "completions/min_length": 252.5, "epoch": 0.4908095424325381, "frac_reward_zero_std": 0.078125, "grad_norm": 0.75390625, "kl": 0.0029784788377583025, "learning_rate": 1.6176886435917677e-06, "loss": 0.0001715857069939375, "reward": 0.38552679121494293, "reward_std": 0.2802630662918091, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38552679121494293, "rewards/QAReward/std": 0.45894259214401245, "step": 2510 }, { "clip_ratio/high_max": 0.0004308754811063409, "clip_ratio/high_mean": 0.00015821700217202305, "clip_ratio/low_mean": 3.585175145417452e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019406875944696366, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 541.1640625, "completions/min_length": 252.66666666666666, "epoch": 0.49178725068439577, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.002997906366363168, "learning_rate": 1.6129523130894037e-06, "loss": 0.00011746367672458292, "reward": 0.32663796345392865, "reward_std": 0.3076520363489787, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32663795351982117, "rewards/QAReward/std": 0.4451409876346588, "step": 2515 }, { "clip_ratio/high_max": 0.0006277687847614288, "clip_ratio/high_mean": 0.00032127355225384233, "clip_ratio/low_mean": 7.500186184188351e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003962754155509174, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 526.400390625, "completions/min_length": 213.0, "epoch": 0.49276495893625344, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84375, "kl": 0.0030603181105107067, "learning_rate": 1.6082148497402796e-06, "loss": 0.00011871096212416888, "reward": 0.41671784222126007, "reward_std": 0.2805017977952957, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4167178273200989, "rewards/QAReward/std": 0.41030243039131165, "step": 2520 }, { "clip_ratio/high_max": 0.0003755448036827147, "clip_ratio/high_mean": 0.00021877798717468978, "clip_ratio/low_mean": 2.0830521680181847e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023960850667208432, "completions/clipped_ratio": 0.013020833333333334, "completions/max_length": 1024.0, "completions/mean_length": 513.8958333333334, "completions/min_length": 237.66666666666666, "epoch": 0.49374266718811105, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.828125, "kl": 0.0030685793608427048, "learning_rate": 1.6034763010584282e-06, "loss": 0.00013217786327004432, "reward": 0.4208596845467885, "reward_std": 0.28538156549135846, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.420859694480896, "rewards/QAReward/std": 0.4704528550306956, "step": 2525 }, { "clip_ratio/high_max": 0.00038639429258182647, "clip_ratio/high_mean": 0.0002572878205683082, "clip_ratio/low_mean": 8.788774284766987e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003451755736023188, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 523.400390625, "completions/min_length": 234.5, "epoch": 0.4947203754399687, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8046875, "kl": 0.0030710535123944283, "learning_rate": 1.598736714568769e-06, "loss": 9.276667842641472e-05, "reward": 0.3513881117105484, "reward_std": 0.2890641987323761, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3513880968093872, "rewards/QAReward/std": 0.4588424265384674, "step": 2530 }, { "clip_ratio/high_max": 0.0003211510367691517, "clip_ratio/high_mean": 0.00016709000919945538, "clip_ratio/low_mean": 2.8962331998627634e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019605234265327454, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 519.7005208333334, "completions/min_length": 269.6666666666667, "epoch": 0.4956980836918264, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.83984375, "kl": 0.003134562820196152, "learning_rate": 1.5939961378066292e-06, "loss": 3.128994430880994e-05, "reward": 0.3003580669562022, "reward_std": 0.28526533643404645, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3003580868244171, "rewards/QAReward/std": 0.44042033950487774, "step": 2535 }, { "clip_ratio/high_max": 0.00045353422174230216, "clip_ratio/high_mean": 0.00026264997432008387, "clip_ratio/low_mean": 6.948231311980635e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033213227870874105, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 499.669921875, "completions/min_length": 264.0, "epoch": 0.496675791943684, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.003113846015185118, "learning_rate": 1.5892546183172686e-06, "loss": 0.00014073150232434272, "reward": 0.3367931693792343, "reward_std": 0.28483930230140686, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3367931693792343, "rewards/QAReward/std": 0.4381035417318344, "step": 2540 }, { "clip_ratio/high_max": 0.00032090852037072184, "clip_ratio/high_mean": 0.00016356416745111346, "clip_ratio/low_mean": 4.5916842645965515e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020948101882822812, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 536.953125, "completions/min_length": 274.6666666666667, "epoch": 0.49765350019554166, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.79296875, "kl": 0.0029082653112709523, "learning_rate": 1.5845122036554012e-06, "loss": 0.00012022685259580613, "reward": 0.297909955183665, "reward_std": 0.29505204161008197, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2979099651177724, "rewards/QAReward/std": 0.4601087172826131, "step": 2545 }, { "clip_ratio/high_max": 0.000515260617248714, "clip_ratio/high_mean": 0.00027787319850176574, "clip_ratio/low_mean": 7.069686253089458e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034857005812227726, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 528.677734375, "completions/min_length": 246.5, "epoch": 0.4986312084473993, "frac_reward_zero_std": 0.046875, "grad_norm": 0.79296875, "kl": 0.0030343794729560613, "learning_rate": 1.5797689413847196e-06, "loss": 6.0728611424565314e-05, "reward": 0.404006227850914, "reward_std": 0.2622915357351303, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.404006227850914, "rewards/QAReward/std": 0.41980141401290894, "step": 2550 }, { "clip_ratio/high_max": 0.00029606848256662487, "clip_ratio/high_mean": 0.00013179776724427938, "clip_ratio/low_mean": 4.2394670890644194e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001741924323141575, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 518.8463541666666, "completions/min_length": 251.33333333333334, "epoch": 0.49960891669925694, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8203125, "kl": 0.0031037820503115655, "learning_rate": 1.575024879077418e-06, "loss": 0.00016813542461022735, "reward": 0.44437459111213684, "reward_std": 0.2398336430390676, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.44437459111213684, "rewards/QAReward/std": 0.4323459366957347, "step": 2555 }, { "clip_ratio/high_max": 0.0006422913051210344, "clip_ratio/high_mean": 0.0002778956724796444, "clip_ratio/low_mean": 7.674931257497519e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003546449937857687, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 545.22265625, "completions/min_length": 266.5, "epoch": 0.5005866249511146, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.002972870832309127, "learning_rate": 1.5702800643137128e-06, "loss": 0.00013836704893037676, "reward": 0.3101988732814789, "reward_std": 0.279832661151886, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3101988881826401, "rewards/QAReward/std": 0.44206178188323975, "step": 2560 }, { "clip_ratio/high_max": 0.00041515091434121134, "clip_ratio/high_mean": 0.0001810057496186346, "clip_ratio/low_mean": 6.238902278710157e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002433947753161192, "completions/clipped_ratio": 0.052083333333333336, "completions/max_length": 1024.0, "completions/mean_length": 543.6901041666666, "completions/min_length": 258.0, "epoch": 0.5015643332029722, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.74609375, "kl": 0.0028800063766539095, "learning_rate": 1.565534544681369e-06, "loss": 8.403714164160193e-05, "reward": 0.3828626275062561, "reward_std": 0.2806281050046285, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3828626275062561, "rewards/QAReward/std": 0.4511597851912181, "step": 2565 }, { "clip_ratio/high_max": 0.00044379699975252154, "clip_ratio/high_mean": 0.0002634812903124839, "clip_ratio/low_mean": 8.599966968176886e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003494809614494443, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 530.259765625, "completions/min_length": 274.0, "epoch": 0.5025420414548298, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0029549215454608203, "learning_rate": 1.5607883677752196e-06, "loss": 0.0001144532347097993, "reward": 0.28543759882450104, "reward_std": 0.28437021374702454, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.28543759882450104, "rewards/QAReward/std": 0.4399776756763458, "step": 2570 }, { "clip_ratio/high_max": 0.00030024630250409247, "clip_ratio/high_mean": 0.00014583306619897485, "clip_ratio/low_mean": 5.6154077174142e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020198713755235076, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 527.5807291666666, "completions/min_length": 258.0, "epoch": 0.5035197497066876, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.86328125, "kl": 0.0030075214803218842, "learning_rate": 1.5560415811966903e-06, "loss": 0.00022119160275906323, "reward": 0.3443323274453481, "reward_std": 0.2664599120616913, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34433233737945557, "rewards/QAReward/std": 0.4378133813540141, "step": 2575 }, { "clip_ratio/high_max": 0.0004937993129715323, "clip_ratio/high_mean": 0.00026340632466599346, "clip_ratio/low_mean": 5.322036886354908e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031662669498473407, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 505.04296875, "completions/min_length": 226.0, "epoch": 0.5044974579585452, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8515625, "kl": 0.0030814945232123135, "learning_rate": 1.5512942325533219e-06, "loss": 0.00011853810865432024, "reward": 0.3386656790971756, "reward_std": 0.2811288982629776, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3386656939983368, "rewards/QAReward/std": 0.4768162965774536, "step": 2580 }, { "clip_ratio/high_max": 0.0003347896039485931, "clip_ratio/high_mean": 0.00017640601727180182, "clip_ratio/low_mean": 4.1517858335282654e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021792387706227602, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 516.59765625, "completions/min_length": 236.66666666666666, "epoch": 0.5054751662104028, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7890625, "kl": 0.00311517296358943, "learning_rate": 1.5465463694582917e-06, "loss": 0.00011757300235331059, "reward": 0.44653812050819397, "reward_std": 0.2569870501756668, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.44653812050819397, "rewards/QAReward/std": 0.4104577402273814, "step": 2585 }, { "clip_ratio/high_max": 0.0005632673855870962, "clip_ratio/high_mean": 0.00029051723540760575, "clip_ratio/low_mean": 5.960185808362439e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035011908039450645, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 532.201171875, "completions/min_length": 247.5, "epoch": 0.5064528744622605, "frac_reward_zero_std": 0.015625, "grad_norm": 0.87109375, "kl": 0.0029604009818285705, "learning_rate": 1.5417980395299362e-06, "loss": 0.0001230774214491248, "reward": 0.3380991071462631, "reward_std": 0.291157990694046, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3380991220474243, "rewards/QAReward/std": 0.4436649978160858, "step": 2590 }, { "clip_ratio/high_max": 0.00031992830336093905, "clip_ratio/high_mean": 0.00014929079334251582, "clip_ratio/low_mean": 8.650572272017598e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023579651024192573, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 524.6158854166666, "completions/min_length": 227.33333333333334, "epoch": 0.5074305827141181, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.77734375, "kl": 0.002999387215822935, "learning_rate": 1.5370492903912756e-06, "loss": 8.601432782597839e-05, "reward": 0.30649028221766156, "reward_std": 0.2896392146746318, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3064902623494466, "rewards/QAReward/std": 0.4481540024280548, "step": 2595 }, { "clip_ratio/high_max": 0.0004375432268716395, "clip_ratio/high_mean": 0.00024244203232228755, "clip_ratio/low_mean": 6.079075101297349e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030323277460411193, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 541.484375, "completions/min_length": 241.5, "epoch": 0.5084082909659757, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83203125, "kl": 0.0029769277665764094, "learning_rate": 1.532300169669533e-06, "loss": 0.00013023018836975098, "reward": 0.4088650196790695, "reward_std": 0.2660537362098694, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4088650196790695, "rewards/QAReward/std": 0.4336906969547272, "step": 2600 }, { "clip_ratio/high_max": 0.00033582490868866444, "clip_ratio/high_mean": 0.00016822831239551305, "clip_ratio/low_mean": 5.02886512549594e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002185169607400894, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 541.2903645833334, "completions/min_length": 271.3333333333333, "epoch": 0.5093859992178335, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7734375, "kl": 0.0028389643412083387, "learning_rate": 1.5275507249956586e-06, "loss": 0.00011630270164459944, "reward": 0.36873294909795123, "reward_std": 0.2778282215197881, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3687329391638438, "rewards/QAReward/std": 0.43512697021166485, "step": 2605 }, { "clip_ratio/high_max": 0.0004813494044356048, "clip_ratio/high_mean": 0.00022209645248949528, "clip_ratio/low_mean": 7.035113376332446e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002924475935287774, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 530.3046875, "completions/min_length": 236.5, "epoch": 0.5103637074696911, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84765625, "kl": 0.0030186167918145657, "learning_rate": 1.5228010040038522e-06, "loss": 0.00015474630054086448, "reward": 0.3562523275613785, "reward_std": 0.2733142077922821, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3562523424625397, "rewards/QAReward/std": 0.4023304432630539, "step": 2610 }, { "clip_ratio/high_max": 0.00040444103069603443, "clip_ratio/high_mean": 0.00022135752951726317, "clip_ratio/low_mean": 4.0014809928834436e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002613723394460976, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 537.1809895833334, "completions/min_length": 262.3333333333333, "epoch": 0.5113414157215487, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.002968330681324005, "learning_rate": 1.5180510543310855e-06, "loss": 0.00013662099372595548, "reward": 0.36160611112912494, "reward_std": 0.2676021953423818, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36160611112912494, "rewards/QAReward/std": 0.4508379101753235, "step": 2615 }, { "clip_ratio/high_max": 0.00047607263550162313, "clip_ratio/high_mean": 0.00023315151920542121, "clip_ratio/low_mean": 9.396984823979437e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003271213732659817, "completions/clipped_ratio": 0.013671875, "completions/max_length": 983.5, "completions/mean_length": 538.994140625, "completions/min_length": 238.0, "epoch": 0.5123191239734063, "frac_reward_zero_std": 0.015625, "grad_norm": 0.82421875, "kl": 0.0029932602774351835, "learning_rate": 1.5133009236166212e-06, "loss": 0.00017238666769117116, "reward": 0.30063556134700775, "reward_std": 0.30032511055469513, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30063554644584656, "rewards/QAReward/std": 0.4163617640733719, "step": 2620 }, { "clip_ratio/high_max": 0.00039321443764492867, "clip_ratio/high_mean": 0.00020971084013581275, "clip_ratio/low_mean": 6.483213510364295e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002745429752394557, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 532.2369791666666, "completions/min_length": 228.66666666666666, "epoch": 0.513296832225264, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78125, "kl": 0.003052525082603097, "learning_rate": 1.5085506595015404e-06, "loss": 0.0001435756217688322, "reward": 0.4438750644524892, "reward_std": 0.27295879522959393, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4438750346501668, "rewards/QAReward/std": 0.4073036511739095, "step": 2625 }, { "clip_ratio/high_max": 0.0004479521187022328, "clip_ratio/high_mean": 0.00022952015860937535, "clip_ratio/low_mean": 6.929375522304326e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029881392256356774, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 515.044921875, "completions/min_length": 233.0, "epoch": 0.5142745404771216, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.0030861135572195055, "learning_rate": 1.5038003096282607e-06, "loss": 0.00017189240315929055, "reward": 0.35616008937358856, "reward_std": 0.2755216807126999, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35616010427474976, "rewards/QAReward/std": 0.46964341402053833, "step": 2630 }, { "clip_ratio/high_max": 0.0002686988213099539, "clip_ratio/high_mean": 0.00017636829288676382, "clip_ratio/low_mean": 6.319631647784263e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002395645948126912, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 529.1002604166666, "completions/min_length": 259.0, "epoch": 0.5152522487289792, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84765625, "kl": 0.003098144568502903, "learning_rate": 1.4990499216400601e-06, "loss": 0.00012649216223508118, "reward": 0.382675568262736, "reward_std": 0.2780485252539317, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38267557819684345, "rewards/QAReward/std": 0.4358435869216919, "step": 2635 }, { "clip_ratio/high_max": 0.0004889116971753538, "clip_ratio/high_mean": 0.0002993141650222242, "clip_ratio/low_mean": 0.00010108074638992548, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040039492305368187, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 530.474609375, "completions/min_length": 213.0, "epoch": 0.516229956980837, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7578125, "kl": 0.002913780556991696, "learning_rate": 1.4942995431805997e-06, "loss": 0.00016999259823933244, "reward": 0.33747753500938416, "reward_std": 0.2711162716150284, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33747753500938416, "rewards/QAReward/std": 0.45414669811725616, "step": 2640 }, { "clip_ratio/high_max": 0.00035918777575716376, "clip_ratio/high_mean": 0.00016584052937105299, "clip_ratio/low_mean": 3.1079602194949985e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019692013156600297, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 526.1796875, "completions/min_length": 239.66666666666666, "epoch": 0.5172076652326946, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78515625, "kl": 0.0030595954973250627, "learning_rate": 1.4895492218934438e-06, "loss": 0.0001441399333998561, "reward": 0.4636253813902537, "reward_std": 0.2727625370025635, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.46362539132436115, "rewards/QAReward/std": 0.44794195890426636, "step": 2645 }, { "clip_ratio/high_max": 0.0005302327452227473, "clip_ratio/high_mean": 0.00025545571697875856, "clip_ratio/low_mean": 0.00010464472288731485, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003601004369556904, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 511.69921875, "completions/min_length": 241.5, "epoch": 0.5181853734845522, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84765625, "kl": 0.003041298175230622, "learning_rate": 1.484799005421584e-06, "loss": 0.0001763333799317479, "reward": 0.3771189749240875, "reward_std": 0.2883683145046234, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3771189749240875, "rewards/QAReward/std": 0.456787109375, "step": 2650 }, { "clip_ratio/high_max": 0.00030913319205865263, "clip_ratio/high_mean": 0.00018124051857739686, "clip_ratio/low_mean": 4.382827173685655e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022506878012791277, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 532.0885416666666, "completions/min_length": 240.0, "epoch": 0.5191630817364099, "frac_reward_zero_std": 0.03125, "grad_norm": 0.89453125, "kl": 0.0030566631350666283, "learning_rate": 1.4800489414069595e-06, "loss": 0.00016520125791430473, "reward": 0.35099613666534424, "reward_std": 0.26905791461467743, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35099615653355914, "rewards/QAReward/std": 0.4297223885854085, "step": 2655 }, { "clip_ratio/high_max": 0.000505087268538773, "clip_ratio/high_mean": 0.00023801358765922486, "clip_ratio/low_mean": 8.325370290549473e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032126727746799587, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 529.0546875, "completions/min_length": 245.0, "epoch": 0.5201407899882675, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7734375, "kl": 0.0030367213767021894, "learning_rate": 1.475299077489983e-06, "loss": 7.241713465191423e-05, "reward": 0.37009860575199127, "reward_std": 0.2849390059709549, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3700985759496689, "rewards/QAReward/std": 0.4330751746892929, "step": 2660 }, { "clip_ratio/high_max": 0.00030246570240706206, "clip_ratio/high_mean": 0.00018008879269473254, "clip_ratio/low_mean": 5.729038384743035e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002373791765421629, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 541.8606770833334, "completions/min_length": 240.33333333333334, "epoch": 0.5211184982401251, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.75, "kl": 0.00305301733314991, "learning_rate": 1.4705494613090579e-06, "loss": 0.00012909346260130404, "reward": 0.3373265862464905, "reward_std": 0.29356907804807025, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.337326576312383, "rewards/QAReward/std": 0.4537530839443207, "step": 2665 }, { "clip_ratio/high_max": 0.00046206520637497305, "clip_ratio/high_mean": 0.0002445057965815067, "clip_ratio/low_mean": 9.824123990256339e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003427470335736871, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 546.22265625, "completions/min_length": 258.5, "epoch": 0.5220962064919827, "frac_reward_zero_std": 0.078125, "grad_norm": 0.83984375, "kl": 0.0029192504473030565, "learning_rate": 1.4658001405001038e-06, "loss": 0.00010938118211925029, "reward": 0.37962083518505096, "reward_std": 0.27208495140075684, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37962085008621216, "rewards/QAReward/std": 0.4597077816724777, "step": 2670 }, { "clip_ratio/high_max": 0.000434951635543257, "clip_ratio/high_mean": 0.00018261740333400667, "clip_ratio/low_mean": 5.044509016443044e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023306248476728797, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 538.5794270833334, "completions/min_length": 239.0, "epoch": 0.5230739147438405, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.0030269626062363387, "learning_rate": 1.4610511626960774e-06, "loss": 0.0001453724573366344, "reward": 0.34052372972170514, "reward_std": 0.28960126141707104, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34052373965581256, "rewards/QAReward/std": 0.42930082480112713, "step": 2675 }, { "clip_ratio/high_max": 0.00041616587550379335, "clip_ratio/high_mean": 0.00021407896419987083, "clip_ratio/low_mean": 0.00013720860006287693, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003512875759042799, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 525.390625, "completions/min_length": 220.5, "epoch": 0.5240516229956981, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.003101932303979993, "learning_rate": 1.4563025755264969e-06, "loss": 0.00022305992897599935, "reward": 0.3599221855401993, "reward_std": 0.27772821485996246, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3599221855401993, "rewards/QAReward/std": 0.4182479828596115, "step": 2680 }, { "clip_ratio/high_max": 0.000359340594150126, "clip_ratio/high_mean": 0.00015378983807750045, "clip_ratio/low_mean": 0.00010553839383646846, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000259328237734735, "completions/clipped_ratio": 0.009114583333333334, "completions/max_length": 1024.0, "completions/mean_length": 528.41796875, "completions/min_length": 235.33333333333334, "epoch": 0.5250293312475557, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8046875, "kl": 0.003072805982083082, "learning_rate": 1.451554426616961e-06, "loss": 0.0001803308492526412, "reward": 0.3071743845939636, "reward_std": 0.30264513691266376, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3071743845939636, "rewards/QAReward/std": 0.488217572371165, "step": 2685 }, { "clip_ratio/high_max": 0.00040210902225226166, "clip_ratio/high_mean": 0.0002494731161277741, "clip_ratio/low_mean": 9.158023458439856e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003410533594433218, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 520.923828125, "completions/min_length": 245.0, "epoch": 0.5260070394994134, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.003045393619686365, "learning_rate": 1.446806763588673e-06, "loss": 0.0001594509929418564, "reward": 0.4133138060569763, "reward_std": 0.27837827801704407, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4133138060569763, "rewards/QAReward/std": 0.41433247923851013, "step": 2690 }, { "clip_ratio/high_max": 0.0003485525958240032, "clip_ratio/high_mean": 0.00019937469623982907, "clip_ratio/low_mean": 6.262052338570356e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002619952196255326, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 538.2799479166666, "completions/min_length": 258.0, "epoch": 0.526984747751271, "frac_reward_zero_std": 0.03125, "grad_norm": 0.796875, "kl": 0.0029955849051475523, "learning_rate": 1.4420596340579638e-06, "loss": 7.229559705592691e-05, "reward": 0.309038112560908, "reward_std": 0.28386743863423664, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30903810262680054, "rewards/QAReward/std": 0.45608553290367126, "step": 2695 }, { "clip_ratio/high_max": 0.0004853511694818735, "clip_ratio/high_mean": 0.0002912961004767567, "clip_ratio/low_mean": 5.914210778428242e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003504382097162306, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 525.41015625, "completions/min_length": 241.0, "epoch": 0.5279624560031286, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83984375, "kl": 0.0029948563780635594, "learning_rate": 1.4373130856358135e-06, "loss": 0.0001605783822014928, "reward": 0.3509626090526581, "reward_std": 0.29594549536705017, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3509626090526581, "rewards/QAReward/std": 0.4478076547384262, "step": 2700 }, { "clip_ratio/high_max": 0.0002868094714358449, "clip_ratio/high_mean": 0.00017481467220932246, "clip_ratio/low_mean": 6.0195115656824784e-05, "clip_ratio/low_min": 2.1925017063040286e-05, "clip_ratio/region_mean": 0.00023500978713855147, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 544.43359375, "completions/min_length": 264.6666666666667, "epoch": 0.5289401642549864, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.84375, "kl": 0.0028981488663703202, "learning_rate": 1.4325671659273737e-06, "loss": 7.44817778468132e-05, "reward": 0.3758360246817271, "reward_std": 0.28660833835601807, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37583601474761963, "rewards/QAReward/std": 0.4195994734764099, "step": 2705 }, { "clip_ratio/high_max": 0.0006088788388296962, "clip_ratio/high_mean": 0.0002897442376706749, "clip_ratio/low_mean": 7.764790207147599e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003673921339213848, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 511.544921875, "completions/min_length": 248.5, "epoch": 0.529917872506844, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7421875, "kl": 0.0030611025635153056, "learning_rate": 1.4278219225314903e-06, "loss": 0.0001679949229583144, "reward": 0.4213681221008301, "reward_std": 0.27791646122932434, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4213680922985077, "rewards/QAReward/std": 0.4165608882904053, "step": 2710 }, { "clip_ratio/high_max": 0.00031339013949036596, "clip_ratio/high_mean": 0.00019753025844693184, "clip_ratio/low_mean": 6.997707532718778e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002675073221325874, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 528.03515625, "completions/min_length": 252.66666666666666, "epoch": 0.5308955807587016, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8046875, "kl": 0.002984713623300195, "learning_rate": 1.4230774030402267e-06, "loss": 0.00013850722461938858, "reward": 0.33065633475780487, "reward_std": 0.27194810907046, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3306563198566437, "rewards/QAReward/std": 0.43583033482233685, "step": 2715 }, { "clip_ratio/high_max": 0.0005469058989547193, "clip_ratio/high_mean": 0.00026972981868311763, "clip_ratio/low_mean": 8.614677644800395e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003558765922207385, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 517.66015625, "completions/min_length": 237.0, "epoch": 0.5318732890105593, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.0031383831985294817, "learning_rate": 1.4183336550383855e-06, "loss": 0.00022119618952274323, "reward": 0.3601996600627899, "reward_std": 0.2805817425251007, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3601996600627899, "rewards/QAReward/std": 0.4489070177078247, "step": 2720 }, { "clip_ratio/high_max": 0.0003300854004919529, "clip_ratio/high_mean": 0.00016764217289164663, "clip_ratio/low_mean": 3.3156008430523795e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020079818787053228, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 524.3580729166666, "completions/min_length": 250.0, "epoch": 0.5328509972624169, "frac_reward_zero_std": 0.0625, "grad_norm": 0.83203125, "kl": 0.0030137034598737957, "learning_rate": 1.4135907261030322e-06, "loss": 0.00013809422962367534, "reward": 0.3734896977742513, "reward_std": 0.2550647705793381, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3734896977742513, "rewards/QAReward/std": 0.4369669258594513, "step": 2725 }, { "clip_ratio/high_max": 0.0004733834881335497, "clip_ratio/high_mean": 0.00027320757508277895, "clip_ratio/low_mean": 8.069595496635884e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035390352131798864, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 518.2109375, "completions/min_length": 242.0, "epoch": 0.5338287055142745, "frac_reward_zero_std": 0.015625, "grad_norm": 0.78125, "kl": 0.003086337028071284, "learning_rate": 1.408848663803016e-06, "loss": 0.00015085919294506313, "reward": 0.3543922156095505, "reward_std": 0.2830130159854889, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35439223051071167, "rewards/QAReward/std": 0.46236975491046906, "step": 2730 }, { "clip_ratio/high_max": 0.00031826363410800693, "clip_ratio/high_mean": 0.00018991505494341253, "clip_ratio/low_mean": 6.197579059517012e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025189083535224197, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 553.5208333333334, "completions/min_length": 261.0, "epoch": 0.5348064137661321, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.765625, "kl": 0.0028071938082575797, "learning_rate": 1.404107515698497e-06, "loss": 7.407082011923194e-05, "reward": 0.368500014146169, "reward_std": 0.2688414553801219, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.368500014146169, "rewards/QAReward/std": 0.4381818473339081, "step": 2735 }, { "clip_ratio/high_max": 0.00042719224002212285, "clip_ratio/high_mean": 0.0002590612624771893, "clip_ratio/low_mean": 7.887033425504341e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003379315952770412, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 536.23828125, "completions/min_length": 245.5, "epoch": 0.5357841220179899, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7578125, "kl": 0.003031967720016837, "learning_rate": 1.3993673293404641e-06, "loss": 0.00013835643185302615, "reward": 0.3154994323849678, "reward_std": 0.3091016411781311, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.315499447286129, "rewards/QAReward/std": 0.4246963560581207, "step": 2740 }, { "clip_ratio/high_max": 0.0003294669208116829, "clip_ratio/high_mean": 0.00017725307843647898, "clip_ratio/low_mean": 3.654682950582355e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021379990503191947, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 533.9440104166666, "completions/min_length": 255.66666666666666, "epoch": 0.5367618302698475, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8046875, "kl": 0.0029813847970217465, "learning_rate": 1.394628152270261e-06, "loss": 5.655199056491256e-05, "reward": 0.38474392890930176, "reward_std": 0.2874509592851003, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38474392890930176, "rewards/QAReward/std": 0.4352800250053406, "step": 2745 }, { "clip_ratio/high_max": 0.0005362049909308553, "clip_ratio/high_mean": 0.00030593749834224583, "clip_ratio/low_mean": 8.13507693237625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038728825747966764, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 526.443359375, "completions/min_length": 256.0, "epoch": 0.5377395385217051, "frac_reward_zero_std": 0.046875, "grad_norm": 0.82421875, "kl": 0.002973454352468252, "learning_rate": 1.3898900320191087e-06, "loss": 0.00012948643416166306, "reward": 0.39669984579086304, "reward_std": 0.23730254918336868, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39669984579086304, "rewards/QAReward/std": 0.3872537165880203, "step": 2750 }, { "clip_ratio/high_max": 0.00043568830005824566, "clip_ratio/high_mean": 0.0001798333425540477, "clip_ratio/low_mean": 4.733253008453176e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022716587409377098, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 522.0963541666666, "completions/min_length": 251.66666666666666, "epoch": 0.5387172467735628, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.81640625, "kl": 0.0030485440511256456, "learning_rate": 1.3851530161076303e-06, "loss": 0.00015153121203184127, "reward": 0.42740265528361004, "reward_std": 0.2775548994541168, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42740267515182495, "rewards/QAReward/std": 0.40703802307446796, "step": 2755 }, { "clip_ratio/high_max": 0.0004955677315592766, "clip_ratio/high_mean": 0.00032954761991277337, "clip_ratio/low_mean": 6.677884957753121e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003963264753110707, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1024.0, "completions/mean_length": 510.0234375, "completions/min_length": 237.5, "epoch": 0.5396949550254204, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.0031634376384317873, "learning_rate": 1.3804171520453714e-06, "loss": 0.00012524370104074478, "reward": 0.353432297706604, "reward_std": 0.28182949125766754, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3534322530031204, "rewards/QAReward/std": 0.4279065430164337, "step": 2760 }, { "clip_ratio/high_max": 0.0003138279484119266, "clip_ratio/high_mean": 0.00015744853881187736, "clip_ratio/low_mean": 4.725331818917766e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002047018613666296, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 538.80859375, "completions/min_length": 252.33333333333334, "epoch": 0.540672663277278, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.76171875, "kl": 0.0030175966676324606, "learning_rate": 1.3756824873303258e-06, "loss": 0.00017397988121956587, "reward": 0.33737805485725403, "reward_std": 0.28555625180403393, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33737805485725403, "rewards/QAReward/std": 0.4367244740327199, "step": 2765 }, { "clip_ratio/high_max": 0.00048641647445037963, "clip_ratio/high_mean": 0.00026094171917065976, "clip_ratio/low_mean": 8.653823169879615e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034747994504868984, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 538.0546875, "completions/min_length": 233.5, "epoch": 0.5416503715291358, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.0029103257693350315, "learning_rate": 1.3709490694484577e-06, "loss": 0.00010630169417709112, "reward": 0.3878495395183563, "reward_std": 0.25578881800174713, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3878495544195175, "rewards/QAReward/std": 0.4035942703485489, "step": 2770 }, { "clip_ratio/high_max": 0.00036167679354548454, "clip_ratio/high_mean": 0.0001690529636107385, "clip_ratio/low_mean": 2.6096792134921996e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019514975138008596, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 540.16796875, "completions/min_length": 280.6666666666667, "epoch": 0.5426280797809934, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8125, "kl": 0.002975231595337391, "learning_rate": 1.3662169458732284e-06, "loss": 9.874717798084021e-05, "reward": 0.34169209003448486, "reward_std": 0.2732601463794708, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34169208506743115, "rewards/QAReward/std": 0.43766798575719196, "step": 2775 }, { "clip_ratio/high_max": 0.0006002578418701887, "clip_ratio/high_mean": 0.00024473071680404247, "clip_ratio/low_mean": 6.915531121194362e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031388602219522, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 542.208984375, "completions/min_length": 256.5, "epoch": 0.543605788032851, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.0029013318475335837, "learning_rate": 1.3614861640651163e-06, "loss": 0.00018320566741749644, "reward": 0.43345747888088226, "reward_std": 0.27589839696884155, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.43345750868320465, "rewards/QAReward/std": 0.45872731506824493, "step": 2780 }, { "clip_ratio/high_max": 0.00027306316187605264, "clip_ratio/high_mean": 0.00016739994171075523, "clip_ratio/low_mean": 4.923864908050746e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021663858788087965, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 517.4388020833334, "completions/min_length": 253.33333333333334, "epoch": 0.5445834962847086, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.7734375, "kl": 0.0031127049122005703, "learning_rate": 1.3567567714711429e-06, "loss": 0.00011728403624147176, "reward": 0.39185837904612225, "reward_std": 0.26523634294668835, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3918583591779073, "rewards/QAReward/std": 0.43929222226142883, "step": 2785 }, { "clip_ratio/high_max": 0.0006349586416035891, "clip_ratio/high_mean": 0.0002725299913436174, "clip_ratio/low_mean": 8.593166130594909e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035846164682880045, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 542.408203125, "completions/min_length": 232.0, "epoch": 0.5455612045365663, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.0029511481523513796, "learning_rate": 1.352028815524396e-06, "loss": 8.510411716997624e-05, "reward": 0.3568909615278244, "reward_std": 0.2808682918548584, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3568909615278244, "rewards/QAReward/std": 0.4174426347017288, "step": 2790 }, { "clip_ratio/high_max": 0.00031162116210907697, "clip_ratio/high_mean": 0.00014998895348981023, "clip_ratio/low_mean": 3.436043334659189e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018434938974678516, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 517.0859375, "completions/min_length": 248.0, "epoch": 0.5465389127884239, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78125, "kl": 0.003113091690465808, "learning_rate": 1.3473023436435571e-06, "loss": 0.00013985696714371442, "reward": 0.4347687363624573, "reward_std": 0.26647921403249103, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4347687363624573, "rewards/QAReward/std": 0.41446366906166077, "step": 2795 }, { "clip_ratio/high_max": 0.0004323336412198842, "clip_ratio/high_mean": 0.00023936228244565428, "clip_ratio/low_mean": 5.2675281040137634e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002920375613030046, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 550.419921875, "completions/min_length": 262.5, "epoch": 0.5475166210402815, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8046875, "kl": 0.0029192673973739147, "learning_rate": 1.342577403232421e-06, "loss": 0.00013595412019640207, "reward": 0.19879092276096344, "reward_std": 0.2896803319454193, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.19879091531038284, "rewards/QAReward/std": 0.4799639731645584, "step": 2800 }, { "clip_ratio/high_max": 0.000431563938036561, "clip_ratio/high_mean": 0.00018525893101468683, "clip_ratio/low_mean": 2.8461437614168973e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021372036426328122, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 540.4348958333334, "completions/min_length": 245.33333333333334, "epoch": 0.5484943292921393, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8125, "kl": 0.002898220391944051, "learning_rate": 1.337854041679423e-06, "loss": 7.850559777580201e-05, "reward": 0.37674904863039654, "reward_std": 0.27042152484258014, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37674903869628906, "rewards/QAReward/std": 0.4369051357110341, "step": 2805 }, { "clip_ratio/high_max": 0.0005282772704958916, "clip_ratio/high_mean": 0.00026872111484408376, "clip_ratio/low_mean": 0.00010342086316086351, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037214197218418124, "completions/clipped_ratio": 0.01171875, "completions/max_length": 946.0, "completions/mean_length": 503.515625, "completions/min_length": 236.5, "epoch": 0.5494720375439969, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8046875, "kl": 0.0032252151984721424, "learning_rate": 1.3331323063571647e-06, "loss": 0.00011728859972208739, "reward": 0.3554261177778244, "reward_std": 0.27859312295913696, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3554261177778244, "rewards/QAReward/std": 0.4892377406358719, "step": 2810 }, { "clip_ratio/high_max": 0.0003466745256446302, "clip_ratio/high_mean": 0.0001667659671511501, "clip_ratio/low_mean": 5.3313965327106414e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022007993538863957, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 553.2760416666666, "completions/min_length": 250.0, "epoch": 0.5504497457958545, "frac_reward_zero_std": 0.03125, "grad_norm": 0.765625, "kl": 0.0028579440433532, "learning_rate": 1.328412244621936e-06, "loss": 0.00013606869615614414, "reward": 0.34501126408576965, "reward_std": 0.2648254583279292, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34501127401987713, "rewards/QAReward/std": 0.43399250507354736, "step": 2815 }, { "clip_ratio/high_max": 0.00043636814225465057, "clip_ratio/high_mean": 0.00022633025073446334, "clip_ratio/low_mean": 7.023981161182746e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029657006380148234, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 551.064453125, "completions/min_length": 288.0, "epoch": 0.5514274540477122, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.0029706419445574284, "learning_rate": 1.3236939038132437e-06, "loss": 0.00018237074837088585, "reward": 0.3267655670642853, "reward_std": 0.3071053475141525, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3267655670642853, "rewards/QAReward/std": 0.4565768986940384, "step": 2820 }, { "clip_ratio/high_max": 0.0002684499020688236, "clip_ratio/high_mean": 0.00012291526654735209, "clip_ratio/low_mean": 3.885312835336663e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016176839126273989, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 520.06640625, "completions/min_length": 236.33333333333334, "epoch": 0.5524051622995698, "frac_reward_zero_std": 0.03125, "grad_norm": 0.79296875, "kl": 0.003082375321537256, "learning_rate": 1.3189773312533321e-06, "loss": 0.00010299220448359848, "reward": 0.349165012439092, "reward_std": 0.2914331754048665, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.349165012439092, "rewards/QAReward/std": 0.4411131739616394, "step": 2825 }, { "clip_ratio/high_max": 0.0004808641271665692, "clip_ratio/high_mean": 0.00022808118374086916, "clip_ratio/low_mean": 8.122480066958815e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003093059756793082, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 540.65234375, "completions/min_length": 247.5, "epoch": 0.5533828705514274, "frac_reward_zero_std": 0.015625, "grad_norm": 0.81640625, "kl": 0.0030285123735666274, "learning_rate": 1.3142625742467124e-06, "loss": 0.00021882627625018357, "reward": 0.45136798918247223, "reward_std": 0.2584913522005081, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.45136795938014984, "rewards/QAReward/std": 0.38687020540237427, "step": 2830 }, { "clip_ratio/high_max": 0.00033881423296406866, "clip_ratio/high_mean": 0.00019060122431255878, "clip_ratio/low_mean": 1.6935603707679547e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002075368305668235, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 534.4583333333334, "completions/min_length": 263.0, "epoch": 0.5543605788032852, "frac_reward_zero_std": 0.03125, "grad_norm": 0.77734375, "kl": 0.003029465675354004, "learning_rate": 1.3095496800796873e-06, "loss": 0.00014225305058062075, "reward": 0.3542080521583557, "reward_std": 0.27685268719991046, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3542080521583557, "rewards/QAReward/std": 0.4160299301147461, "step": 2835 }, { "clip_ratio/high_max": 0.00042371207382529974, "clip_ratio/high_mean": 0.00023069126764312385, "clip_ratio/low_mean": 5.773019947810099e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002884214627556503, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 536.091796875, "completions/min_length": 253.0, "epoch": 0.5553382870551428, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7734375, "kl": 0.002919706702232361, "learning_rate": 1.3048386960198756e-06, "loss": 8.885115676093847e-06, "reward": 0.34531715512275696, "reward_std": 0.28559522330760956, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34531715512275696, "rewards/QAReward/std": 0.45781415700912476, "step": 2840 }, { "clip_ratio/high_max": 0.000273800955619663, "clip_ratio/high_mean": 0.0001179073704406619, "clip_ratio/low_mean": 5.904190766159445e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001769492868334055, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 525.01953125, "completions/min_length": 246.0, "epoch": 0.5563159953070004, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.83203125, "kl": 0.0030497961211949586, "learning_rate": 1.3001296693157387e-06, "loss": 0.0001353971427306533, "reward": 0.35283513863881427, "reward_std": 0.2690599461396535, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35283513863881427, "rewards/QAReward/std": 0.4257403214772542, "step": 2845 }, { "clip_ratio/high_max": 0.00045319979544728995, "clip_ratio/high_mean": 0.0002467024023644626, "clip_ratio/low_mean": 6.19430014921818e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003086453943978995, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 501.50390625, "completions/min_length": 236.0, "epoch": 0.557293703558858, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0031035199761390688, "learning_rate": 1.2954226471961066e-06, "loss": 0.00012776947114616631, "reward": 0.3434106111526489, "reward_std": 0.2800644040107727, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3434106111526489, "rewards/QAReward/std": 0.4644434303045273, "step": 2850 }, { "clip_ratio/high_max": 0.0003249653847888112, "clip_ratio/high_mean": 0.00016625643474981188, "clip_ratio/low_mean": 4.128000364289619e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020753643475472926, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 531.98828125, "completions/min_length": 241.66666666666666, "epoch": 0.5582714118107157, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78125, "kl": 0.002911712788045406, "learning_rate": 1.290717676869706e-06, "loss": 0.00015607543755322695, "reward": 0.3198118209838867, "reward_std": 0.2863052388032277, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3198118209838867, "rewards/QAReward/std": 0.46380288402239483, "step": 2855 }, { "clip_ratio/high_max": 0.00045668951934203507, "clip_ratio/high_mean": 0.00025152829475700856, "clip_ratio/low_mean": 6.802321295253932e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003195515018887818, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 518.333984375, "completions/min_length": 239.0, "epoch": 0.5592491200625733, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0030689235776662827, "learning_rate": 1.2860148055246838e-06, "loss": 0.00021480005234479903, "reward": 0.424521803855896, "reward_std": 0.2896828204393387, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4245217889547348, "rewards/QAReward/std": 0.46138736605644226, "step": 2860 }, { "clip_ratio/high_max": 0.0003186075598932803, "clip_ratio/high_mean": 0.0001647803233936429, "clip_ratio/low_mean": 6.789963517803699e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023267995566129686, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 525.6315104166666, "completions/min_length": 258.3333333333333, "epoch": 0.5602268283144309, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.84375, "kl": 0.0030101681128144263, "learning_rate": 1.281314080328135e-06, "loss": 0.0002099792705848813, "reward": 0.3549572229385376, "reward_std": 0.27950405577818555, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3549572229385376, "rewards/QAReward/std": 0.42536259690920514, "step": 2865 }, { "clip_ratio/high_max": 0.0006502335658296942, "clip_ratio/high_mean": 0.0002598415943793952, "clip_ratio/low_mean": 7.810991519363597e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003379515022970736, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 517.763671875, "completions/min_length": 234.0, "epoch": 0.5612045365662887, "frac_reward_zero_std": 0.015625, "grad_norm": 0.81640625, "kl": 0.0029926273971796038, "learning_rate": 1.2766155484256322e-06, "loss": 0.00017610942013561725, "reward": 0.35547804832458496, "reward_std": 0.2948673665523529, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35547804832458496, "rewards/QAReward/std": 0.44158923625946045, "step": 2870 }, { "clip_ratio/high_max": 0.00030550446826964616, "clip_ratio/high_mean": 0.00019082661019638182, "clip_ratio/low_mean": 5.808296264149248e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002489095786586404, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.9986979166666, "completions/min_length": 256.0, "epoch": 0.5621822448181463, "frac_reward_zero_std": 0.03125, "grad_norm": 0.85546875, "kl": 0.0029605931602418424, "learning_rate": 1.2719192569407483e-06, "loss": 0.0001725905924104154, "reward": 0.37147461374600727, "reward_std": 0.2689658502737681, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37147462368011475, "rewards/QAReward/std": 0.4194539189338684, "step": 2875 }, { "clip_ratio/high_max": 0.00033882581628859044, "clip_ratio/high_mean": 0.0002017885388340801, "clip_ratio/low_mean": 8.081010601017624e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002825986361131072, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/mean_length": 552.720703125, "completions/min_length": 229.5, "epoch": 0.5631599530700039, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.002930351532995701, "learning_rate": 1.2672252529745868e-06, "loss": 0.00016713705845177173, "reward": 0.36133742332458496, "reward_std": 0.2833786904811859, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36133743822574615, "rewards/QAReward/std": 0.404550239443779, "step": 2880 }, { "clip_ratio/high_max": 0.0003704380593262613, "clip_ratio/high_mean": 0.00018390845507383348, "clip_ratio/low_mean": 2.0020734518766402e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020392920123413205, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 532.9348958333334, "completions/min_length": 276.6666666666667, "epoch": 0.5641376613218616, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.86328125, "kl": 0.0029801741242408753, "learning_rate": 1.262533583605308e-06, "loss": 8.408010471612215e-05, "reward": 0.36118603746096295, "reward_std": 0.27863674362500507, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36118603746096295, "rewards/QAReward/std": 0.44650458296140033, "step": 2885 }, { "clip_ratio/high_max": 0.0005501682288013398, "clip_ratio/high_mean": 0.0002265829243697226, "clip_ratio/low_mean": 8.2003214629367e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003085861273575574, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 537.609375, "completions/min_length": 254.0, "epoch": 0.5651153695737192, "frac_reward_zero_std": 0.0625, "grad_norm": 0.73828125, "kl": 0.003010498080402613, "learning_rate": 1.25784429588766e-06, "loss": 0.00012064655311405659, "reward": 0.3653242737054825, "reward_std": 0.2714777886867523, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36532430350780487, "rewards/QAReward/std": 0.44826552271842957, "step": 2890 }, { "clip_ratio/high_max": 0.0003378224675543606, "clip_ratio/high_mean": 0.00016132445889525116, "clip_ratio/low_mean": 5.6706657051108775e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021803111885674298, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 522.5026041666666, "completions/min_length": 240.66666666666666, "epoch": 0.5660930778255768, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.796875, "kl": 0.003092497680336237, "learning_rate": 1.2531574368525015e-06, "loss": 0.00010561586823314429, "reward": 0.365262895822525, "reward_std": 0.28045427799224854, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.365262895822525, "rewards/QAReward/std": 0.4165775179862976, "step": 2895 }, { "clip_ratio/high_max": 0.0005228285444900393, "clip_ratio/high_mean": 0.0002676565607544035, "clip_ratio/low_mean": 5.5727383005432786e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003233839466702193, "completions/clipped_ratio": 0.005859375, "completions/max_length": 948.5, "completions/mean_length": 525.982421875, "completions/min_length": 230.0, "epoch": 0.5670707860774344, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0029901690315455197, "learning_rate": 1.248473053506334e-06, "loss": 0.00015408957842737435, "reward": 0.3776971995830536, "reward_std": 0.27270932495594025, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3776971846818924, "rewards/QAReward/std": 0.42544038593769073, "step": 2900 }, { "clip_ratio/high_max": 0.00032222854206338526, "clip_ratio/high_mean": 0.00015117579605430364, "clip_ratio/low_mean": 4.187238519079983e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019304815796203912, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.4518229166666, "completions/min_length": 224.33333333333334, "epoch": 0.5680484943292922, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76953125, "kl": 0.0029490042943507433, "learning_rate": 1.2437911928308297e-06, "loss": 0.00014619381399825216, "reward": 0.3351856966813405, "reward_std": 0.267026553551356, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.335185706615448, "rewards/QAReward/std": 0.436826358238856, "step": 2905 }, { "clip_ratio/high_max": 0.0004957802360877394, "clip_ratio/high_mean": 0.00027232393622398374, "clip_ratio/low_mean": 0.00010715909593272954, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037948302924633025, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 515.125, "completions/min_length": 230.5, "epoch": 0.5690262025811498, "frac_reward_zero_std": 0.0625, "grad_norm": 0.921875, "kl": 0.0031325860414654016, "learning_rate": 1.239111901782359e-06, "loss": 7.236694218590856e-05, "reward": 0.41746625304222107, "reward_std": 0.269707590341568, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41746625304222107, "rewards/QAReward/std": 0.4369102269411087, "step": 2910 }, { "clip_ratio/high_max": 0.0002887337002903223, "clip_ratio/high_mean": 0.0001736473524942994, "clip_ratio/low_mean": 6.239774083951488e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023604509187862276, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.8854166666666, "completions/min_length": 271.6666666666667, "epoch": 0.5700039108330074, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.76953125, "kl": 0.0029132286086678504, "learning_rate": 1.2344352272915212e-06, "loss": 0.00012868912890553474, "reward": 0.33169644077618915, "reward_std": 0.2825666069984436, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33169644077618915, "rewards/QAReward/std": 0.4418901304403941, "step": 2915 }, { "clip_ratio/high_max": 0.000517383124679327, "clip_ratio/high_mean": 0.00027686022222042086, "clip_ratio/low_mean": 7.06908671418205e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003475510864518583, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 540.330078125, "completions/min_length": 295.5, "epoch": 0.5709816190848651, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8203125, "kl": 0.0028833709191530945, "learning_rate": 1.2297612162626727e-06, "loss": 0.00020411494188010693, "reward": 0.40228796005249023, "reward_std": 0.2634739354252815, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40228797495365143, "rewards/QAReward/std": 0.38881903886795044, "step": 2920 }, { "clip_ratio/high_max": 0.00030987163772806526, "clip_ratio/high_mean": 0.0001876783964689821, "clip_ratio/low_mean": 5.210557428654283e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002397839678451419, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 534.1158854166666, "completions/min_length": 255.66666666666666, "epoch": 0.5719593273367227, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8359375, "kl": 0.002993867266923189, "learning_rate": 1.2250899155734564e-06, "loss": 0.00012412351788952946, "reward": 0.3417498668034871, "reward_std": 0.2669679621855418, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3417498568693797, "rewards/QAReward/std": 0.4817316134770711, "step": 2925 }, { "clip_ratio/high_max": 0.0005534481140784919, "clip_ratio/high_mean": 0.00026002678787335755, "clip_ratio/low_mean": 6.471771193901077e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032474452164024117, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 529.38671875, "completions/min_length": 245.5, "epoch": 0.5729370355885803, "frac_reward_zero_std": 0.015625, "grad_norm": 0.80078125, "kl": 0.0029716858640313148, "learning_rate": 1.2204213720743329e-06, "loss": 0.00012983796186745168, "reward": 0.3924975246191025, "reward_std": 0.2591482996940613, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3924975246191025, "rewards/QAReward/std": 0.44904182851314545, "step": 2930 }, { "clip_ratio/high_max": 0.0004342211992479861, "clip_ratio/high_mean": 0.0001627651625312865, "clip_ratio/low_mean": 3.1136893085204065e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001939020527061075, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 536.5325520833334, "completions/min_length": 243.33333333333334, "epoch": 0.5739147438404381, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.002921164548024535, "learning_rate": 1.2157556325881099e-06, "loss": 0.00017934553325176238, "reward": 0.34236007928848267, "reward_std": 0.2963632146517436, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34236007928848267, "rewards/QAReward/std": 0.4388943612575531, "step": 2935 }, { "clip_ratio/high_max": 0.00044609977630898355, "clip_ratio/high_mean": 0.0002459009760059416, "clip_ratio/low_mean": 6.560510519193485e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031150607974268497, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 493.099609375, "completions/min_length": 230.5, "epoch": 0.5748924520922957, "frac_reward_zero_std": 0.046875, "grad_norm": 0.7734375, "kl": 0.0031193232629448176, "learning_rate": 1.211092743909471e-06, "loss": 0.00011043951380997896, "reward": 0.4107290208339691, "reward_std": 0.2678473889827728, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4107290208339691, "rewards/QAReward/std": 0.4734305739402771, "step": 2940 }, { "clip_ratio/high_max": 0.0002801387454383075, "clip_ratio/high_mean": 0.0001331403444055468, "clip_ratio/low_mean": 3.4874886478064584e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016801522579044104, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 533.9479166666666, "completions/min_length": 266.3333333333333, "epoch": 0.5758701603441533, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.80859375, "kl": 0.0030094352550804615, "learning_rate": 1.2064327528045092e-06, "loss": 8.676808211021126e-05, "reward": 0.35375741124153137, "reward_std": 0.2804939051469167, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3537574013074239, "rewards/QAReward/std": 0.4004081388314565, "step": 2945 }, { "clip_ratio/high_max": 0.0005731657729484141, "clip_ratio/high_mean": 0.00027034098748117684, "clip_ratio/low_mean": 3.235299154766835e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003026939695701003, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 522.27734375, "completions/min_length": 220.0, "epoch": 0.576847868596011, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8203125, "kl": 0.0029957156162708997, "learning_rate": 1.2017757060102563e-06, "loss": 2.0296240109018983e-06, "reward": 0.3805695176124573, "reward_std": 0.2868618071079254, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3805695176124573, "rewards/QAReward/std": 0.44230884313583374, "step": 2950 }, { "clip_ratio/high_max": 0.0002049036498647183, "clip_ratio/high_mean": 0.00010954684403259307, "clip_ratio/low_mean": 4.295904582249932e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001525058934930712, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.9453125, "completions/min_length": 266.3333333333333, "epoch": 0.5778255768478686, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.7890625, "kl": 0.0029278050642460585, "learning_rate": 1.1971216502342145e-06, "loss": 0.0001402489491738379, "reward": 0.3226144115130107, "reward_std": 0.2868254482746124, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3226144115130107, "rewards/QAReward/std": 0.4347994228204091, "step": 2955 }, { "clip_ratio/high_max": 0.000469377008266747, "clip_ratio/high_mean": 0.0002753487497102469, "clip_ratio/low_mean": 7.946819096105173e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035481692757457497, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 543.19921875, "completions/min_length": 281.5, "epoch": 0.5788032850997262, "frac_reward_zero_std": 0.046875, "grad_norm": 0.79296875, "kl": 0.0029921372421085834, "learning_rate": 1.1924706321538868e-06, "loss": 9.031331865116954e-05, "reward": 0.30859221518039703, "reward_std": 0.27665382623672485, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30859218537807465, "rewards/QAReward/std": 0.4313024580478668, "step": 2960 }, { "clip_ratio/high_max": 0.0003026928170584142, "clip_ratio/high_mean": 0.00011636941344477237, "clip_ratio/low_mean": 2.8988380654482172e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014535780064761639, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 525.7330729166666, "completions/min_length": 244.66666666666666, "epoch": 0.5797809933515838, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.82421875, "kl": 0.0029700657352805136, "learning_rate": 1.1878226984163102e-06, "loss": 0.00014244976919144391, "reward": 0.3984344005584717, "reward_std": 0.25971171259880066, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3984344005584717, "rewards/QAReward/std": 0.41970818241437274, "step": 2965 }, { "clip_ratio/high_max": 0.0005797238787636161, "clip_ratio/high_mean": 0.0003004839818459004, "clip_ratio/low_mean": 8.72874865308404e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003877714625559747, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 535.3515625, "completions/min_length": 257.0, "epoch": 0.5807587016034416, "frac_reward_zero_std": 0.046875, "grad_norm": 0.78515625, "kl": 0.0030169361270964146, "learning_rate": 1.183177895637589e-06, "loss": 0.00013159632217139006, "reward": 0.3702032268047333, "reward_std": 0.2721528559923172, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3702032119035721, "rewards/QAReward/std": 0.42479652166366577, "step": 2970 }, { "clip_ratio/high_max": 0.0002990789245814085, "clip_ratio/high_mean": 0.00016051234561018646, "clip_ratio/low_mean": 5.5586174130439756e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021609851391986014, "completions/clipped_ratio": 0.016927083333333332, "completions/max_length": 1024.0, "completions/mean_length": 521.8984375, "completions/min_length": 249.66666666666666, "epoch": 0.5817364098552992, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.003127774549648166, "learning_rate": 1.1785362704024242e-06, "loss": 0.00015759326051920653, "reward": 0.3689422110716502, "reward_std": 0.2742192844549815, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3689422011375427, "rewards/QAReward/std": 0.42168975869814557, "step": 2975 }, { "clip_ratio/high_max": 0.0005058647715486586, "clip_ratio/high_mean": 0.0003137853287626058, "clip_ratio/low_mean": 6.395555101335049e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037774088559672236, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 515.9296875, "completions/min_length": 228.5, "epoch": 0.5827141181071568, "frac_reward_zero_std": 0.046875, "grad_norm": 0.86328125, "kl": 0.0030422912910580634, "learning_rate": 1.1738978692636482e-06, "loss": 0.00015904654283076524, "reward": 0.2775692790746689, "reward_std": 0.2878231108188629, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2775692939758301, "rewards/QAReward/std": 0.4440484046936035, "step": 2980 }, { "clip_ratio/high_max": 0.0003773331642150879, "clip_ratio/high_mean": 0.0001727728173136711, "clip_ratio/low_mean": 4.591906472342089e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021869187476113438, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 532.0833333333334, "completions/min_length": 246.0, "epoch": 0.5836918263590145, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8359375, "kl": 0.0029838602524250744, "learning_rate": 1.1692627387417568e-06, "loss": 0.0001049157464876771, "reward": 0.36593760053316754, "reward_std": 0.2817593514919281, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36593761046727497, "rewards/QAReward/std": 0.4186075031757355, "step": 2985 }, { "clip_ratio/high_max": 0.00041427368414588274, "clip_ratio/high_mean": 0.000225615271483548, "clip_ratio/low_mean": 6.680338556179777e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029241865850053727, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 510.18359375, "completions/min_length": 245.5, "epoch": 0.5846695346108721, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.003159819869324565, "learning_rate": 1.1646309253244457e-06, "loss": 0.00019875832367688417, "reward": 0.409555584192276, "reward_std": 0.27676111459732056, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4095555990934372, "rewards/QAReward/std": 0.4175288826227188, "step": 2990 }, { "clip_ratio/high_max": 0.000391699280589819, "clip_ratio/high_mean": 0.00022043632343411447, "clip_ratio/low_mean": 5.414248880697414e-05, "clip_ratio/low_min": 2.3446658451575787e-05, "clip_ratio/region_mean": 0.0002745788195170462, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 532.1458333333334, "completions/min_length": 240.33333333333334, "epoch": 0.5856472428627297, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.79296875, "kl": 0.0030382727738469837, "learning_rate": 1.1600024754661401e-06, "loss": 0.00012992096599191428, "reward": 0.395576536655426, "reward_std": 0.2786622146765391, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.395576536655426, "rewards/QAReward/std": 0.4307333032290141, "step": 2995 }, { "clip_ratio/high_max": 0.0004535638727247715, "clip_ratio/high_mean": 0.00022818303550593556, "clip_ratio/low_mean": 7.7218865044415e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030540189472958443, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 516.587890625, "completions/min_length": 247.0, "epoch": 0.5866249511145875, "frac_reward_zero_std": 0.0625, "grad_norm": 0.84375, "kl": 0.003074603946879506, "learning_rate": 1.1553774355875304e-06, "loss": 0.00015211488353088497, "reward": 0.30476656556129456, "reward_std": 0.2816520631313324, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30476656556129456, "rewards/QAReward/std": 0.4534723311662674, "step": 3000 }, { "clip_ratio/high_max": 0.00046533544082194567, "clip_ratio/high_mean": 0.00022059433395043014, "clip_ratio/low_mean": 6.662733503617346e-05, "clip_ratio/low_min": 1.846722007030621e-05, "clip_ratio/region_mean": 0.0002872216748073697, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 533.9557291666666, "completions/min_length": 265.3333333333333, "epoch": 0.5876026593664451, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8359375, "kl": 0.003043737169355154, "learning_rate": 1.1507558520751081e-06, "loss": 0.00010752936359494924, "reward": 0.40264494220415753, "reward_std": 0.2854978342851003, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40264494220415753, "rewards/QAReward/std": 0.424999604622523, "step": 3005 }, { "clip_ratio/high_max": 0.0005787238478660583, "clip_ratio/high_mean": 0.0002346461929846555, "clip_ratio/low_mean": 7.670361519558355e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003113497979938984, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 527.4140625, "completions/min_length": 225.0, "epoch": 0.5885803676183027, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8984375, "kl": 0.002987267216667533, "learning_rate": 1.1461377712806987e-06, "loss": 0.00010698058176785707, "reward": 0.32401493191719055, "reward_std": 0.2668403387069702, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32401491701602936, "rewards/QAReward/std": 0.4041232317686081, "step": 3010 }, { "clip_ratio/high_max": 0.0003631192841567099, "clip_ratio/high_mean": 0.00018177456222474576, "clip_ratio/low_mean": 5.856031202711165e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024033489171415567, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 541.2942708333334, "completions/min_length": 265.0, "epoch": 0.5895580758701603, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.002981298882514238, "learning_rate": 1.1415232395209972e-06, "loss": 0.00015510719968006014, "reward": 0.3854466378688812, "reward_std": 0.2650056481361389, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3854466478029887, "rewards/QAReward/std": 0.4277230401833852, "step": 3015 }, { "clip_ratio/high_max": 0.0006283653550781309, "clip_ratio/high_mean": 0.0002536395273637027, "clip_ratio/low_mean": 7.767036149743944e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033130988595075905, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 521.677734375, "completions/min_length": 251.0, "epoch": 0.590535784122018, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7734375, "kl": 0.0030469865538179874, "learning_rate": 1.136912303077104e-06, "loss": 0.00014047441072762012, "reward": 0.4036349952220917, "reward_std": 0.2649751901626587, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4036349952220917, "rewards/QAReward/std": 0.4430392235517502, "step": 3020 }, { "clip_ratio/high_max": 0.00034157970221713183, "clip_ratio/high_mean": 0.00017357696196995676, "clip_ratio/low_mean": 4.725883190985769e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002208357909694314, "completions/clipped_ratio": 0.013020833333333334, "completions/max_length": 1024.0, "completions/mean_length": 501.3984375, "completions/min_length": 248.66666666666666, "epoch": 0.5915134923738756, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.003193373465910554, "learning_rate": 1.1323050081940617e-06, "loss": 0.00010679212864488364, "reward": 0.4173022210597992, "reward_std": 0.3124127686023712, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4173022210597992, "rewards/QAReward/std": 0.4433397551377614, "step": 3025 }, { "clip_ratio/high_max": 0.0004618222825229168, "clip_ratio/high_mean": 0.000262146774912253, "clip_ratio/low_mean": 8.436382631771266e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003465106012299657, "completions/clipped_ratio": 0.017578125, "completions/max_length": 967.0, "completions/mean_length": 541.138671875, "completions/min_length": 243.0, "epoch": 0.5924912006257332, "frac_reward_zero_std": 0.03125, "grad_norm": 0.796875, "kl": 0.0030168248806148766, "learning_rate": 1.127701401080388e-06, "loss": 0.00013836305588483812, "reward": 0.3158327341079712, "reward_std": 0.324203222990036, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3158327341079712, "rewards/QAReward/std": 0.46750521659851074, "step": 3030 }, { "clip_ratio/high_max": 0.00032461738446727396, "clip_ratio/high_mean": 0.00013079275959171355, "clip_ratio/low_mean": 4.991979221813381e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018071254598908126, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 521.2708333333334, "completions/min_length": 255.33333333333334, "epoch": 0.593468908877591, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.79296875, "kl": 0.0030333083122968674, "learning_rate": 1.1231015279076162e-06, "loss": 0.00018736369675025343, "reward": 0.3704923093318939, "reward_std": 0.29195424914360046, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3704923292001088, "rewards/QAReward/std": 0.4426786204179128, "step": 3035 }, { "clip_ratio/high_max": 0.0004450536565855145, "clip_ratio/high_mean": 0.0002440422773361206, "clip_ratio/low_mean": 6.47283231955953e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030877062235958876, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 530.16796875, "completions/min_length": 278.5, "epoch": 0.5944466171294486, "frac_reward_zero_std": 0.046875, "grad_norm": 0.7890625, "kl": 0.0030733140185475348, "learning_rate": 1.1185054348098285e-06, "loss": 6.185034289956093e-05, "reward": 0.3532974272966385, "reward_std": 0.28303416818380356, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3532974272966385, "rewards/QAReward/std": 0.4583711326122284, "step": 3040 }, { "clip_ratio/high_max": 0.00030764900147914884, "clip_ratio/high_mean": 0.00015167349483817815, "clip_ratio/low_mean": 4.814527928829193e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019981877412647008, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 535.8294270833334, "completions/min_length": 260.6666666666667, "epoch": 0.5954243253813062, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.84375, "kl": 0.0030672785360366106, "learning_rate": 1.1139131678831977e-06, "loss": 9.350889595225454e-05, "reward": 0.37353309988975525, "reward_std": 0.2846968173980713, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37353308995564777, "rewards/QAReward/std": 0.4370540877183278, "step": 3045 }, { "clip_ratio/high_max": 0.0007076719077304006, "clip_ratio/high_mean": 0.00028588828863576057, "clip_ratio/low_mean": 7.763879548292608e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036352708702906965, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 528.482421875, "completions/min_length": 212.0, "epoch": 0.5964020336331639, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8515625, "kl": 0.002985588414594531, "learning_rate": 1.1093247731855204e-06, "loss": 5.597097333520651e-05, "reward": 0.43862536549568176, "reward_std": 0.2528783529996872, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.43862538039684296, "rewards/QAReward/std": 0.45942364633083344, "step": 3050 }, { "clip_ratio/high_max": 0.00030257406178861855, "clip_ratio/high_mean": 0.0001796994125470519, "clip_ratio/low_mean": 4.5839721860829743e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022553913295269011, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 540.0859375, "completions/min_length": 262.6666666666667, "epoch": 0.5973797418850215, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.765625, "kl": 0.0028781897854059935, "learning_rate": 1.104740296735757e-06, "loss": 0.00012951315147802234, "reward": 0.38856904705365497, "reward_std": 0.2835467457771301, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38856905698776245, "rewards/QAReward/std": 0.4751809736092885, "step": 3055 }, { "clip_ratio/high_max": 0.0006124091218225658, "clip_ratio/high_mean": 0.00027798047522082926, "clip_ratio/low_mean": 9.110444225370883e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036908491747453807, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 528.40625, "completions/min_length": 230.5, "epoch": 0.5983574501368791, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.002997596748173237, "learning_rate": 1.1001597845135701e-06, "loss": 0.00012796919327229261, "reward": 0.3807152211666107, "reward_std": 0.28291845321655273, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3807152211666107, "rewards/QAReward/std": 0.403931125998497, "step": 3060 }, { "clip_ratio/high_max": 0.00032529341988265517, "clip_ratio/high_mean": 0.00016823966288939118, "clip_ratio/low_mean": 4.831999831367284e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021655966993421315, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 543.8489583333334, "completions/min_length": 254.66666666666666, "epoch": 0.5993351583887369, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.002886589337140322, "learning_rate": 1.0955832824588651e-06, "loss": 9.362951968796551e-05, "reward": 0.3225575586160024, "reward_std": 0.3037695387999217, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32255756358305615, "rewards/QAReward/std": 0.42510636647542316, "step": 3065 }, { "clip_ratio/high_max": 0.00044581524562090633, "clip_ratio/high_mean": 0.00021772898617200553, "clip_ratio/low_mean": 6.253192841541023e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002802609174977988, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 516.4921875, "completions/min_length": 234.0, "epoch": 0.6003128666405945, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8359375, "kl": 0.002990907337516546, "learning_rate": 1.0910108364713257e-06, "loss": 0.00011000062804669141, "reward": 0.40065181255340576, "reward_std": 0.2740155756473541, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40065181255340576, "rewards/QAReward/std": 0.46645107865333557, "step": 3070 }, { "clip_ratio/high_max": 0.0003264271654188633, "clip_ratio/high_mean": 0.0001648468489293009, "clip_ratio/low_mean": 3.4739474358502775e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019958632765337825, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 969.3333333333334, "completions/mean_length": 521.2057291666666, "completions/min_length": 222.33333333333334, "epoch": 0.6012905748924521, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.7578125, "kl": 0.0030271909199655056, "learning_rate": 1.0864424924099566e-06, "loss": 0.0001553291571326554, "reward": 0.41012778878211975, "reward_std": 0.2945246199766795, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41012777884801227, "rewards/QAReward/std": 0.4444362223148346, "step": 3075 }, { "clip_ratio/high_max": 0.0006332469987682998, "clip_ratio/high_mean": 0.00030790099408477544, "clip_ratio/low_mean": 6.609346019104123e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037399446591734886, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 524.326171875, "completions/min_length": 244.0, "epoch": 0.6022682831443097, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8046875, "kl": 0.0030714817345142364, "learning_rate": 1.0818782960926214e-06, "loss": 8.000258821994066e-05, "reward": 0.49142949283123016, "reward_std": 0.27504266798496246, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.49142949283123016, "rewards/QAReward/std": 0.40179434418678284, "step": 3080 }, { "clip_ratio/high_max": 0.0003643097705207765, "clip_ratio/high_mean": 0.00015497254789806903, "clip_ratio/low_mean": 6.190942949615418e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002168819773942232, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 527.1979166666666, "completions/min_length": 235.0, "epoch": 0.6032459913961674, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.84765625, "kl": 0.003017355874180794, "learning_rate": 1.0773182932955864e-06, "loss": 0.00011398668866604566, "reward": 0.37323914964993793, "reward_std": 0.2588479022185008, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37323911984761554, "rewards/QAReward/std": 0.407367746035258, "step": 3085 }, { "clip_ratio/high_max": 0.0004398652934469283, "clip_ratio/high_mean": 0.00024987185024656356, "clip_ratio/low_mean": 6.684011605102569e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031671197502873837, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 535.373046875, "completions/min_length": 243.5, "epoch": 0.604223699648025, "frac_reward_zero_std": 0.015625, "grad_norm": 0.85546875, "kl": 0.0029128103982657195, "learning_rate": 1.072762529753057e-06, "loss": 9.508489165455103e-05, "reward": 0.3521670550107956, "reward_std": 0.3054758906364441, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3521670401096344, "rewards/QAReward/std": 0.4260057061910629, "step": 3090 }, { "clip_ratio/high_max": 0.00047896940959617494, "clip_ratio/high_mean": 0.000207350350683555, "clip_ratio/low_mean": 5.132113874424249e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025867148069664837, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 516.5390625, "completions/min_length": 216.66666666666666, "epoch": 0.6052014078998826, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.859375, "kl": 0.0031468093395233153, "learning_rate": 1.0682110511567229e-06, "loss": 0.00015961560420691968, "reward": 0.37535135944684345, "reward_std": 0.29486114780108136, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37535135944684345, "rewards/QAReward/std": 0.4856092135111491, "step": 3095 }, { "clip_ratio/high_max": 0.00035136902006343006, "clip_ratio/high_mean": 0.00021818710956722498, "clip_ratio/low_mean": 6.134058348834514e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002795276988763362, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 511.296875, "completions/min_length": 243.0, "epoch": 0.6061791161517404, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8203125, "kl": 0.0031783921644091607, "learning_rate": 1.0636639031552964e-06, "loss": 0.00013782883761450648, "reward": 0.3575548380613327, "reward_std": 0.27329520881175995, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3575548380613327, "rewards/QAReward/std": 0.4606867581605911, "step": 3100 }, { "clip_ratio/high_max": 0.00030616113217547537, "clip_ratio/high_mean": 0.00019199843518435955, "clip_ratio/low_mean": 4.604003624990583e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023803847143426536, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 524.2122395833334, "completions/min_length": 241.0, "epoch": 0.607156824403598, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8125, "kl": 0.003020018571987748, "learning_rate": 1.0591211313540596e-06, "loss": 0.00010435979347676039, "reward": 0.3922869861125946, "reward_std": 0.27063465118408203, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3922869960467021, "rewards/QAReward/std": 0.4378087917963664, "step": 3105 }, { "clip_ratio/high_max": 0.0005982210510410369, "clip_ratio/high_mean": 0.00024049585917964577, "clip_ratio/low_mean": 6.462192977778614e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003051178005989641, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 537.90234375, "completions/min_length": 250.0, "epoch": 0.6081345326554556, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76953125, "kl": 0.003052795073017478, "learning_rate": 1.0545827813144008e-06, "loss": 0.0002049662172794342, "reward": 0.4272972494363785, "reward_std": 0.2668502628803253, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4272972494363785, "rewards/QAReward/std": 0.4304170310497284, "step": 3110 }, { "clip_ratio/high_max": 0.00030596572905778886, "clip_ratio/high_mean": 0.00016575795016251504, "clip_ratio/low_mean": 5.5892602540552615e-05, "clip_ratio/low_min": 2.1028282935731114e-05, "clip_ratio/region_mean": 0.00022165057016536592, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 527.8046875, "completions/min_length": 258.6666666666667, "epoch": 0.6091122409073133, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.82421875, "kl": 0.0030367865692824125, "learning_rate": 1.0500488985533614e-06, "loss": 8.701086044311523e-05, "reward": 0.35285837451616925, "reward_std": 0.2837148904800415, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35285839438438416, "rewards/QAReward/std": 0.4568295180797577, "step": 3115 }, { "clip_ratio/high_max": 0.0005449802963994443, "clip_ratio/high_mean": 0.0002572114928625524, "clip_ratio/low_mean": 8.286431984743104e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034007581416517496, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 526.15234375, "completions/min_length": 251.0, "epoch": 0.6100899491591709, "frac_reward_zero_std": 0.078125, "grad_norm": 0.7578125, "kl": 0.0030109799932688474, "learning_rate": 1.045519528543179e-06, "loss": 0.00013213552301749587, "reward": 0.4235770255327225, "reward_std": 0.26234233379364014, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4235769808292389, "rewards/QAReward/std": 0.45530965924263, "step": 3120 }, { "clip_ratio/high_max": 0.0003292763605713844, "clip_ratio/high_mean": 0.00016774144023656845, "clip_ratio/low_mean": 4.926691472064704e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021700835786759854, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 531.3424479166666, "completions/min_length": 233.33333333333334, "epoch": 0.6110676574110285, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8359375, "kl": 0.003085982380434871, "learning_rate": 1.040994716710831e-06, "loss": 0.00013956516049802304, "reward": 0.38735222816467285, "reward_std": 0.2629343718290329, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38735222816467285, "rewards/QAReward/std": 0.45471323529879254, "step": 3125 }, { "clip_ratio/high_max": 0.0005796781275421381, "clip_ratio/high_mean": 0.0002972294751089066, "clip_ratio/low_mean": 5.360596987884492e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000350835477001965, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 522.396484375, "completions/min_length": 238.5, "epoch": 0.6120453656628861, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.0030137831810861824, "learning_rate": 1.036474508437579e-06, "loss": 0.00013998510548844932, "reward": 0.3718331456184387, "reward_std": 0.3011208772659302, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3718331605195999, "rewards/QAReward/std": 0.45297564566135406, "step": 3130 }, { "clip_ratio/high_max": 0.0002407927066087723, "clip_ratio/high_mean": 0.00015626332024112343, "clip_ratio/low_mean": 6.106016226112843e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021732347668148577, "completions/clipped_ratio": 0.055989583333333336, "completions/max_length": 1024.0, "completions/mean_length": 558.03515625, "completions/min_length": 244.33333333333334, "epoch": 0.6130230739147439, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.0029251900967210533, "learning_rate": 1.0319589490585125e-06, "loss": 8.452382171526551e-05, "reward": 0.34708014130592346, "reward_std": 0.26913881798585254, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34708014130592346, "rewards/QAReward/std": 0.43119777242342633, "step": 3135 }, { "clip_ratio/high_max": 0.0005825163563713432, "clip_ratio/high_mean": 0.0002693879243452102, "clip_ratio/low_mean": 8.240104070864618e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003517889650538564, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 546.96484375, "completions/min_length": 238.0, "epoch": 0.6140007821666015, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.002906939946115017, "learning_rate": 1.0274480838620957e-06, "loss": 6.010587094351649e-05, "reward": 0.3465905636548996, "reward_std": 0.25938040018081665, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3465905487537384, "rewards/QAReward/std": 0.4416363537311554, "step": 3140 }, { "clip_ratio/high_max": 0.0003002774086780846, "clip_ratio/high_mean": 0.00017635347321629525, "clip_ratio/low_mean": 6.879720604047179e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000245150679256767, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 532.67578125, "completions/min_length": 247.33333333333334, "epoch": 0.6149784904184591, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.828125, "kl": 0.0030169500503689052, "learning_rate": 1.0229419580897143e-06, "loss": 0.00011126375757157803, "reward": 0.35345880190531415, "reward_std": 0.2872520287831624, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35345880190531415, "rewards/QAReward/std": 0.44958558678627014, "step": 3145 }, { "clip_ratio/high_max": 0.0004396335920318961, "clip_ratio/high_mean": 0.00026466473937034605, "clip_ratio/low_mean": 7.35102774342522e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033817500807344916, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 539.611328125, "completions/min_length": 250.0, "epoch": 0.6159561986703168, "frac_reward_zero_std": 0.046875, "grad_norm": 0.7734375, "kl": 0.002968590892851353, "learning_rate": 1.0184406169352182e-06, "loss": 6.500877207145095e-05, "reward": 0.32829904556274414, "reward_std": 0.28325092792510986, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32829904556274414, "rewards/QAReward/std": 0.47404786944389343, "step": 3150 }, { "clip_ratio/high_max": 0.0002872691606171429, "clip_ratio/high_mean": 0.0001567366358358413, "clip_ratio/low_mean": 3.700874513015151e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001937453867867589, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 498.7161458333333, "completions/min_length": 195.66666666666666, "epoch": 0.6169339069221744, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.003209689911454916, "learning_rate": 1.0139441055444712e-06, "loss": 0.00019046461675316096, "reward": 0.3843300441900889, "reward_std": 0.28291767835617065, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38433005412419635, "rewards/QAReward/std": 0.4557638068993886, "step": 3155 }, { "clip_ratio/high_max": 0.0008171111578121782, "clip_ratio/high_mean": 0.0003804897831287235, "clip_ratio/low_mean": 6.728375374223105e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004477735375985503, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 517.673828125, "completions/min_length": 257.0, "epoch": 0.617911615174032, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.0031875976826995613, "learning_rate": 1.0094524690148973e-06, "loss": 0.00014200281584635376, "reward": 0.3858421593904495, "reward_std": 0.2948902100324631, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3858421742916107, "rewards/QAReward/std": 0.4469553381204605, "step": 3160 }, { "clip_ratio/high_max": 0.0002876669284887612, "clip_ratio/high_mean": 0.00016947119729593396, "clip_ratio/low_mean": 5.118016852065921e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022065136581659318, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 514.94140625, "completions/min_length": 244.33333333333334, "epoch": 0.6188893234258898, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78125, "kl": 0.0030561983585357666, "learning_rate": 1.004965752395029e-06, "loss": 0.00017671892419457436, "reward": 0.34552229444185895, "reward_std": 0.2764174739519755, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.345522274573644, "rewards/QAReward/std": 0.43489550550778705, "step": 3165 }, { "clip_ratio/high_max": 0.0004271104233339429, "clip_ratio/high_mean": 0.00021912247175350785, "clip_ratio/low_mean": 8.998478879220783e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030910727800801394, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 534.37890625, "completions/min_length": 241.5, "epoch": 0.6198670316777474, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7734375, "kl": 0.003043881058692932, "learning_rate": 1.0004840006840543e-06, "loss": 0.00014027704019099475, "reward": 0.3826218545436859, "reward_std": 0.28995490074157715, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3826218396425247, "rewards/QAReward/std": 0.4383290410041809, "step": 3170 }, { "clip_ratio/high_max": 0.0003261619131080806, "clip_ratio/high_mean": 0.00018218971672467887, "clip_ratio/low_mean": 2.913518255809322e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021132490364834667, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 539.890625, "completions/min_length": 249.66666666666666, "epoch": 0.620844739929605, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.76953125, "kl": 0.0029772511683404447, "learning_rate": 9.960072588313655e-07, "loss": 0.00013726362958550454, "reward": 0.4045918981234233, "reward_std": 0.284005085627238, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4045919179916382, "rewards/QAReward/std": 0.4337902267773946, "step": 3175 }, { "clip_ratio/high_max": 0.0005165319074876606, "clip_ratio/high_mean": 0.0002634184551425278, "clip_ratio/low_mean": 7.784998451825232e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003412684425711632, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 524.859375, "completions/min_length": 255.0, "epoch": 0.6218224481814627, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.003090406907722354, "learning_rate": 9.915355717361107e-07, "loss": 8.45757662318647e-05, "reward": 0.3810035437345505, "reward_std": 0.3250425010919571, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3810035437345505, "rewards/QAReward/std": 0.4434066265821457, "step": 3180 }, { "clip_ratio/high_max": 0.00024219988263212143, "clip_ratio/high_mean": 0.00014560341369360686, "clip_ratio/low_mean": 5.120998248457909e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019681339617818593, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 540.6640625, "completions/min_length": 247.66666666666666, "epoch": 0.6228001564333203, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.77734375, "kl": 0.002937463391572237, "learning_rate": 9.870689842467403e-07, "loss": 0.00010336472187191247, "reward": 0.3808144231637319, "reward_std": 0.2791414757569631, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3808144231637319, "rewards/QAReward/std": 0.41506147384643555, "step": 3185 }, { "clip_ratio/high_max": 0.0006353018572553993, "clip_ratio/high_mean": 0.00026488570729270576, "clip_ratio/low_mean": 4.295746184652671e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003078431764151901, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 507.701171875, "completions/min_length": 236.0, "epoch": 0.6237778646851779, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8125, "kl": 0.00315112704411149, "learning_rate": 9.826075411605583e-07, "loss": 0.00018026374746114016, "reward": 0.3066297620534897, "reward_std": 0.28126853704452515, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3066297620534897, "rewards/QAReward/std": 0.4988982230424881, "step": 3190 }, { "clip_ratio/high_max": 0.0003031575120985508, "clip_ratio/high_mean": 0.00017101652338169515, "clip_ratio/low_mean": 4.751966916956007e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021853618090972304, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 531.2239583333334, "completions/min_length": 237.0, "epoch": 0.6247555729370355, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.796875, "kl": 0.0030317900236696005, "learning_rate": 9.781512872232735e-07, "loss": 0.00014799484051764011, "reward": 0.3524254063765208, "reward_std": 0.27292314171791077, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3524254063765208, "rewards/QAReward/std": 0.4083522856235504, "step": 3195 }, { "clip_ratio/high_max": 0.0005234945914708078, "clip_ratio/high_mean": 0.00023681391612626613, "clip_ratio/low_mean": 7.665443699806928e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031346834730356933, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 532.11328125, "completions/min_length": 239.5, "epoch": 0.6257332811888933, "frac_reward_zero_std": 0.078125, "grad_norm": 0.75390625, "kl": 0.0029555813409388064, "learning_rate": 9.73700267128552e-07, "loss": 0.00013591608731076122, "reward": 0.3940243721008301, "reward_std": 0.2832992374897003, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3940243721008301, "rewards/QAReward/std": 0.49713142216205597, "step": 3200 }, { "clip_ratio/high_max": 0.00026762966881506146, "clip_ratio/high_mean": 0.00015010198985692113, "clip_ratio/low_mean": 4.15570757468231e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019165906123816968, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 529.7916666666666, "completions/min_length": 246.0, "epoch": 0.6267109894407509, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.003096139617264271, "learning_rate": 9.692545255175658e-07, "loss": 0.00014469274319708347, "reward": 0.31808148821194965, "reward_std": 0.274871364235878, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31808148821194965, "rewards/QAReward/std": 0.44878636797269184, "step": 3205 }, { "clip_ratio/high_max": 0.0005192716489546001, "clip_ratio/high_mean": 0.00024716259795241056, "clip_ratio/low_mean": 8.67192808073014e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003338818671181798, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 544.04296875, "completions/min_length": 260.0, "epoch": 0.6276886976926085, "frac_reward_zero_std": 0.046875, "grad_norm": 0.7734375, "kl": 0.0029931191354990007, "learning_rate": 9.648141069785469e-07, "loss": 0.00011753948638215661, "reward": 0.3346741795539856, "reward_std": 0.2553362399339676, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3346741795539856, "rewards/QAReward/std": 0.41436879336833954, "step": 3210 }, { "clip_ratio/high_max": 0.0004156983224675059, "clip_ratio/high_mean": 0.00019683671416714787, "clip_ratio/low_mean": 5.0619710236787796e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002474564127624035, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 514.9453125, "completions/min_length": 232.33333333333334, "epoch": 0.6286664059444662, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.85546875, "kl": 0.0030814926140010357, "learning_rate": 9.603790560463405e-07, "loss": 0.00015388941392302513, "reward": 0.36031849185625714, "reward_std": 0.2819107075532277, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36031850179036456, "rewards/QAReward/std": 0.4271426200866699, "step": 3215 }, { "clip_ratio/high_max": 0.0005501387175172567, "clip_ratio/high_mean": 0.00031537801260128617, "clip_ratio/low_mean": 7.448123069480062e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003898592432960868, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 528.40625, "completions/min_length": 206.0, "epoch": 0.6296441141963238, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84765625, "kl": 0.003104054555296898, "learning_rate": 9.559494172019583e-07, "loss": 7.54752429202199e-05, "reward": 0.34024523198604584, "reward_std": 0.28888945281505585, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34024523198604584, "rewards/QAReward/std": 0.4213835895061493, "step": 3220 }, { "clip_ratio/high_max": 0.00026307549560442565, "clip_ratio/high_mean": 0.00017282081535086036, "clip_ratio/low_mean": 3.824877057923004e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021106956992298365, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 535.7760416666666, "completions/min_length": 231.33333333333334, "epoch": 0.6306218224481814, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.82421875, "kl": 0.003049872536212206, "learning_rate": 9.515252348721304e-07, "loss": 0.00015340542886406184, "reward": 0.352636714776357, "reward_std": 0.27510538697242737, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3526367247104645, "rewards/QAReward/std": 0.40319058299064636, "step": 3225 }, { "clip_ratio/high_max": 0.0004866733914241195, "clip_ratio/high_mean": 0.00026151722413487733, "clip_ratio/low_mean": 9.138594032265246e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000352903176099062, "completions/clipped_ratio": 0.052734375, "completions/max_length": 1024.0, "completions/mean_length": 563.515625, "completions/min_length": 246.0, "epoch": 0.6315995307000392, "frac_reward_zero_std": 0.03125, "grad_norm": 0.828125, "kl": 0.0029072435572743416, "learning_rate": 9.471065534288623e-07, "loss": 0.00010276725515723228, "reward": 0.37361888587474823, "reward_std": 0.28962792456150055, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37361885607242584, "rewards/QAReward/std": 0.4134511202573776, "step": 3230 }, { "clip_ratio/high_max": 0.0002803919720463455, "clip_ratio/high_mean": 0.00017687332001514733, "clip_ratio/low_mean": 6.586964009329677e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024274296592921018, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 524.5403645833334, "completions/min_length": 238.66666666666666, "epoch": 0.6325772389518968, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.003099613217636943, "learning_rate": 9.426934171889881e-07, "loss": 0.0001413551392033696, "reward": 0.3585348476966222, "reward_std": 0.28201112151145935, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3585348476966222, "rewards/QAReward/std": 0.4429518481095632, "step": 3235 }, { "clip_ratio/high_max": 0.00039163988549262285, "clip_ratio/high_mean": 0.00019874011632055045, "clip_ratio/low_mean": 7.076654437696562e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002695066505111754, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 523.154296875, "completions/min_length": 263.5, "epoch": 0.6335549472037544, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.0030085483100265264, "learning_rate": 9.382858704137273e-07, "loss": 0.00013748225755989553, "reward": 0.3334193825721741, "reward_std": 0.3009723424911499, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3334193825721741, "rewards/QAReward/std": 0.4244697839021683, "step": 3240 }, { "clip_ratio/high_max": 0.0003176926402375102, "clip_ratio/high_mean": 0.00019286953611299395, "clip_ratio/low_mean": 5.9662573039531705e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025253210915252564, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 531.45703125, "completions/min_length": 258.3333333333333, "epoch": 0.634532655455612, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.002966162143275142, "learning_rate": 9.338839573082404e-07, "loss": 0.00013934529852122068, "reward": 0.352939506371816, "reward_std": 0.272745817899704, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.352939506371816, "rewards/QAReward/std": 0.4568938910961151, "step": 3245 }, { "clip_ratio/high_max": 0.00040636324556544425, "clip_ratio/high_mean": 0.0002621184336021543, "clip_ratio/low_mean": 7.793516706442461e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034005361376330254, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1024.0, "completions/mean_length": 549.79296875, "completions/min_length": 246.5, "epoch": 0.6355103637074697, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.0029497520066797734, "learning_rate": 9.294877220211846e-07, "loss": 0.0001183748827315867, "reward": 0.357724130153656, "reward_std": 0.26247501373291016, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3577241599559784, "rewards/QAReward/std": 0.44085611402988434, "step": 3250 }, { "clip_ratio/high_max": 0.0004101279657334089, "clip_ratio/high_mean": 0.00022040562471374868, "clip_ratio/low_mean": 5.458694213302806e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027499255957081915, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 528.4596354166666, "completions/min_length": 266.6666666666667, "epoch": 0.6364880719593273, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.003071265062317252, "learning_rate": 9.250972086442719e-07, "loss": 0.00022728727199137211, "reward": 0.3827706476052602, "reward_std": 0.28827807307243347, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38277063767115277, "rewards/QAReward/std": 0.4266890188058217, "step": 3255 }, { "clip_ratio/high_max": 0.0005846671643666923, "clip_ratio/high_mean": 0.00029443426756188275, "clip_ratio/low_mean": 6.837509426986799e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003628093749284744, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 515.220703125, "completions/min_length": 231.5, "epoch": 0.6374657802111849, "frac_reward_zero_std": 0.015625, "grad_norm": 0.87109375, "kl": 0.0031135986559093, "learning_rate": 9.207124612118279e-07, "loss": 0.00020486731082201005, "reward": 0.3745822459459305, "reward_std": 0.28873905539512634, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3745822608470917, "rewards/QAReward/std": 0.4249763786792755, "step": 3260 }, { "clip_ratio/high_max": 0.0002497873385436833, "clip_ratio/high_mean": 0.00014088329626247287, "clip_ratio/low_mean": 3.637632471509278e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017725962679833175, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 512.4830729166666, "completions/min_length": 231.0, "epoch": 0.6384434884630427, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.83984375, "kl": 0.0031232246197760104, "learning_rate": 9.163335237003487e-07, "loss": 0.00016309937927871943, "reward": 0.41568700472513836, "reward_std": 0.27457576990127563, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41568702459335327, "rewards/QAReward/std": 0.43167906006177265, "step": 3265 }, { "clip_ratio/high_max": 0.0005306800128892064, "clip_ratio/high_mean": 0.0002730324224103242, "clip_ratio/low_mean": 6.738057127222418e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003404129878617823, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 525.94140625, "completions/min_length": 265.0, "epoch": 0.6394211967149003, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8359375, "kl": 0.0030340859666466713, "learning_rate": 9.119604400280592e-07, "loss": 0.00015261541120707988, "reward": 0.3515229821205139, "reward_std": 0.28147025406360626, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3515229821205139, "rewards/QAReward/std": 0.5014215558767319, "step": 3270 }, { "clip_ratio/high_max": 0.0003784305532462895, "clip_ratio/high_mean": 0.00019427193328738212, "clip_ratio/low_mean": 6.218482740223407e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002564567606896162, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 534.6458333333334, "completions/min_length": 227.33333333333334, "epoch": 0.6403989049667579, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.72265625, "kl": 0.0030853991862386464, "learning_rate": 9.075932540544745e-07, "loss": 9.649337152950466e-05, "reward": 0.4182291527589162, "reward_std": 0.29610931873321533, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4182291626930237, "rewards/QAReward/std": 0.45739981532096863, "step": 3275 }, { "clip_ratio/high_max": 0.0005959352594800294, "clip_ratio/high_mean": 0.00030913964728824795, "clip_ratio/low_mean": 7.15907517587766e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003807304019574076, "completions/clipped_ratio": 0.052734375, "completions/max_length": 1024.0, "completions/mean_length": 541.580078125, "completions/min_length": 247.0, "epoch": 0.6413766132186156, "frac_reward_zero_std": 0.03125, "grad_norm": 0.859375, "kl": 0.0029462985694408417, "learning_rate": 9.032320095799594e-07, "loss": 0.00010468227555975318, "reward": 0.34517376124858856, "reward_std": 0.26782354712486267, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34517377614974976, "rewards/QAReward/std": 0.42615966498851776, "step": 3280 }, { "clip_ratio/high_max": 0.0004961822181940079, "clip_ratio/high_mean": 0.0002276115003041923, "clip_ratio/low_mean": 4.583327972795814e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027344477130100133, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 531.5703125, "completions/min_length": 254.0, "epoch": 0.6423543214704732, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8203125, "kl": 0.0030428606551140547, "learning_rate": 8.988767503452888e-07, "loss": 0.00010950576979666949, "reward": 0.2999917467435201, "reward_std": 0.2953946093718211, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2999917268753052, "rewards/QAReward/std": 0.45755505561828613, "step": 3285 }, { "clip_ratio/high_max": 0.0004833190934732556, "clip_ratio/high_mean": 0.000252678245306015, "clip_ratio/low_mean": 7.528306450694799e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032796130981296303, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 522.5078125, "completions/min_length": 217.5, "epoch": 0.6433320297223308, "frac_reward_zero_std": 0.046875, "grad_norm": 0.76953125, "kl": 0.003142729215323925, "learning_rate": 8.945275200312085e-07, "loss": 7.791444077156484e-05, "reward": 0.474741131067276, "reward_std": 0.2731355130672455, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4747411459684372, "rewards/QAReward/std": 0.4001406580209732, "step": 3290 }, { "clip_ratio/high_max": 0.0002439878066070378, "clip_ratio/high_mean": 0.00013681813143193722, "clip_ratio/low_mean": 6.388114707078785e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002006992930546403, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 527.2291666666666, "completions/min_length": 252.0, "epoch": 0.6443097379741886, "frac_reward_zero_std": 0.09375, "grad_norm": 0.71484375, "kl": 0.0031000790651887654, "learning_rate": 8.901843622579974e-07, "loss": 0.0001111298450268805, "reward": 0.4061993658542633, "reward_std": 0.27267930408318836, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4061993757883708, "rewards/QAReward/std": 0.4370726744333903, "step": 3295 }, { "clip_ratio/high_max": 0.0005000433418899774, "clip_ratio/high_mean": 0.0002447689883410931, "clip_ratio/low_mean": 8.205451013054699e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003268234780989587, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 538.474609375, "completions/min_length": 252.5, "epoch": 0.6452874462260462, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8515625, "kl": 0.0029481954872608183, "learning_rate": 8.858473205850322e-07, "loss": 0.00011975554516538978, "reward": 0.4294945299625397, "reward_std": 0.2664834260940552, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4294945001602173, "rewards/QAReward/std": 0.39895792305469513, "step": 3300 }, { "clip_ratio/high_max": 0.0004990535089746118, "clip_ratio/high_mean": 0.00018282217206433415, "clip_ratio/low_mean": 4.956728662364185e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002323894645087421, "completions/clipped_ratio": 0.010416666666666666, "completions/max_length": 943.3333333333334, "completions/mean_length": 506.0364583333333, "completions/min_length": 250.33333333333334, "epoch": 0.6462651544779038, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.86328125, "kl": 0.003155595064163208, "learning_rate": 8.815164385103468e-07, "loss": 0.00016573000466451048, "reward": 0.36257699131965637, "reward_std": 0.2932929793993632, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36257699131965637, "rewards/QAReward/std": 0.43282175064086914, "step": 3305 }, { "clip_ratio/high_max": 0.0005006179912015795, "clip_ratio/high_mean": 0.00026004097890108824, "clip_ratio/low_mean": 6.608868861803784e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032612966606393456, "completions/clipped_ratio": 0.015625, "completions/max_length": 910.5, "completions/mean_length": 504.783203125, "completions/min_length": 241.0, "epoch": 0.6472428627297614, "frac_reward_zero_std": 0.0625, "grad_norm": 0.88671875, "kl": 0.003144806157797575, "learning_rate": 8.771917594701983e-07, "loss": 7.54991895519197e-05, "reward": 0.4113839417695999, "reward_std": 0.23916299641132355, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4113839119672775, "rewards/QAReward/std": 0.41470690071582794, "step": 3310 }, { "clip_ratio/high_max": 0.00032039874931797383, "clip_ratio/high_mean": 0.00019057934987358748, "clip_ratio/low_mean": 5.10994897922501e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024167883675545453, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 514.9140625, "completions/min_length": 233.0, "epoch": 0.6482205709816191, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84765625, "kl": 0.003103073826059699, "learning_rate": 8.728733268386309e-07, "loss": 0.00015649236738681793, "reward": 0.3739330271879832, "reward_std": 0.275590717792511, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37393301725387573, "rewards/QAReward/std": 0.46601269642512005, "step": 3315 }, { "clip_ratio/high_max": 0.0005302331293933094, "clip_ratio/high_mean": 0.00026168826152570544, "clip_ratio/low_mean": 4.26600941864308e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030434835935011506, "completions/clipped_ratio": 0.037109375, "completions/max_length": 1024.0, "completions/mean_length": 548.888671875, "completions/min_length": 242.5, "epoch": 0.6491982792334767, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8046875, "kl": 0.0030343960039317607, "learning_rate": 8.685611839270421e-07, "loss": 0.00013359063304960728, "reward": 0.3096819370985031, "reward_std": 0.31104524433612823, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3096819221973419, "rewards/QAReward/std": 0.4379923641681671, "step": 3320 }, { "clip_ratio/high_max": 0.00040449678199365734, "clip_ratio/high_mean": 0.0001856176706496626, "clip_ratio/low_mean": 6.127438391558825e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002468920545652509, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 532.1380208333334, "completions/min_length": 237.66666666666666, "epoch": 0.6501759874853343, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.75, "kl": 0.003087708912789822, "learning_rate": 8.642553739837455e-07, "loss": 0.0001614987850189209, "reward": 0.3489971061547597, "reward_std": 0.26899483303229016, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3489970962206523, "rewards/QAReward/std": 0.47095242142677307, "step": 3325 }, { "clip_ratio/high_max": 0.00031504896469414234, "clip_ratio/high_mean": 0.0001624382159207016, "clip_ratio/low_mean": 6.759700772818178e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023003522073850036, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 527.060546875, "completions/min_length": 247.5, "epoch": 0.6511536957371921, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.003128330921754241, "learning_rate": 8.599559401935383e-07, "loss": 0.0001927789766341448, "reward": 0.36308127641677856, "reward_std": 0.3078865706920624, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36308127641677856, "rewards/QAReward/std": 0.45649254322052, "step": 3330 }, { "clip_ratio/high_max": 0.0003373193670995533, "clip_ratio/high_mean": 0.0001473819254897535, "clip_ratio/low_mean": 5.305635422701016e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020043827826157212, "completions/clipped_ratio": 0.044270833333333336, "completions/max_length": 1024.0, "completions/mean_length": 543.7513020833334, "completions/min_length": 244.66666666666666, "epoch": 0.6521314039890497, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.81640625, "kl": 0.0030526325572282076, "learning_rate": 8.556629256772716e-07, "loss": 0.0001350367092527449, "reward": 0.3094685773054759, "reward_std": 0.26297655204931897, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3094685723384221, "rewards/QAReward/std": 0.4402007460594177, "step": 3335 }, { "clip_ratio/high_max": 0.0005880636861547827, "clip_ratio/high_mean": 0.00026965659926645456, "clip_ratio/low_mean": 8.865518611855805e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035831177374348047, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 537.5078125, "completions/min_length": 288.5, "epoch": 0.6531091122409073, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8203125, "kl": 0.0029381427448242903, "learning_rate": 8.513763734914114e-07, "loss": 0.00013420101022347807, "reward": 0.2922859340906143, "reward_std": 0.2785126864910126, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2922859340906143, "rewards/QAReward/std": 0.465515673160553, "step": 3340 }, { "clip_ratio/high_max": 0.000406986300367862, "clip_ratio/high_mean": 0.0001740873500239104, "clip_ratio/low_mean": 6.690353620797396e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024099088041111826, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 518.4986979166666, "completions/min_length": 228.66666666666666, "epoch": 0.654086820492765, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8203125, "kl": 0.003112377366051078, "learning_rate": 8.47096326627612e-07, "loss": 0.00013570603914558887, "reward": 0.40604644020398456, "reward_std": 0.266481692592303, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40604642033576965, "rewards/QAReward/std": 0.4331008891264598, "step": 3345 }, { "clip_ratio/high_max": 0.000504429911961779, "clip_ratio/high_mean": 0.00027661336353048684, "clip_ratio/low_mean": 8.004769042599947e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035666105104610325, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 543.208984375, "completions/min_length": 263.0, "epoch": 0.6550645287446226, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76953125, "kl": 0.002999747358262539, "learning_rate": 8.428228280122823e-07, "loss": 0.00015426820609718562, "reward": 0.38498109579086304, "reward_std": 0.26367008686065674, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38498109579086304, "rewards/QAReward/std": 0.4168258458375931, "step": 3350 }, { "clip_ratio/high_max": 0.00035570093896239994, "clip_ratio/high_mean": 0.00015129793900996446, "clip_ratio/low_mean": 4.573411133605987e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019703204743564128, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 537.6979166666666, "completions/min_length": 251.33333333333334, "epoch": 0.6560422369964802, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.0030048264190554617, "learning_rate": 8.385559205061572e-07, "loss": 0.0001258039614185691, "reward": 0.358111172914505, "reward_std": 0.27616333961486816, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3581111431121826, "rewards/QAReward/std": 0.43948642412821454, "step": 3355 }, { "clip_ratio/high_max": 0.000503969332203269, "clip_ratio/high_mean": 0.00024849887704476715, "clip_ratio/low_mean": 6.326921939034947e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003117680957075208, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 518.818359375, "completions/min_length": 199.5, "epoch": 0.6570199452483378, "frac_reward_zero_std": 0.046875, "grad_norm": 0.80078125, "kl": 0.003229114972054958, "learning_rate": 8.342956469038659e-07, "loss": 0.0001745034009218216, "reward": 0.35097333788871765, "reward_std": 0.27794818580150604, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35097333788871765, "rewards/QAReward/std": 0.4281361550092697, "step": 3360 }, { "clip_ratio/high_max": 0.0003508910653181374, "clip_ratio/high_mean": 0.00013191360631026328, "clip_ratio/low_mean": 6.900101725477725e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020091462647542357, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 521.2018229166666, "completions/min_length": 242.66666666666666, "epoch": 0.6579976535001956, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.0031058108434081078, "learning_rate": 8.300420499335024e-07, "loss": 0.00018575857393443584, "reward": 0.3345382809638977, "reward_std": 0.29084498683611554, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3345382809638977, "rewards/QAReward/std": 0.46566282709439594, "step": 3365 }, { "clip_ratio/high_max": 0.0005798809113912284, "clip_ratio/high_mean": 0.00032505064154975116, "clip_ratio/low_mean": 7.477739418391139e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003998280270025134, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 522.310546875, "completions/min_length": 254.5, "epoch": 0.6589753617520532, "frac_reward_zero_std": 0.078125, "grad_norm": 0.82421875, "kl": 0.0030437923036515714, "learning_rate": 8.257951722561989e-07, "loss": 5.6967628188431264e-05, "reward": 0.3426292687654495, "reward_std": 0.2696191966533661, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3426292985677719, "rewards/QAReward/std": 0.47347067296504974, "step": 3370 }, { "clip_ratio/high_max": 0.0003477917402051389, "clip_ratio/high_mean": 0.00019174968474544585, "clip_ratio/low_mean": 5.928857135586441e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002510382561013103, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 523.0416666666666, "completions/min_length": 223.33333333333334, "epoch": 0.6599530700039108, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8046875, "kl": 0.0030826414469629524, "learning_rate": 8.215550564656976e-07, "loss": 0.00017693752888590098, "reward": 0.39683781067530316, "reward_std": 0.2699691951274872, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3968378007411957, "rewards/QAReward/std": 0.4543178776899974, "step": 3375 }, { "clip_ratio/high_max": 0.0004848493845202029, "clip_ratio/high_mean": 0.00028170732548460364, "clip_ratio/low_mean": 7.107943238224835e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035278676077723505, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1024.0, "completions/mean_length": 515.765625, "completions/min_length": 253.5, "epoch": 0.6609307782557685, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8984375, "kl": 0.003098878310993314, "learning_rate": 8.173217450879217e-07, "loss": 0.00015273644821718336, "reward": 0.27744682878255844, "reward_std": 0.263806015253067, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.27744682878255844, "rewards/QAReward/std": 0.43818481266498566, "step": 3380 }, { "clip_ratio/high_max": 0.00039950916543602945, "clip_ratio/high_mean": 0.0001863744924776256, "clip_ratio/low_mean": 5.763455992564559e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002440090524032712, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/mean_length": 520.9856770833334, "completions/min_length": 240.33333333333334, "epoch": 0.6619084865076261, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.003073196392506361, "learning_rate": 8.130952805805505e-07, "loss": 9.413603111170232e-05, "reward": 0.28437190254529315, "reward_std": 0.3021596471468608, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.28437189261118573, "rewards/QAReward/std": 0.43881450096766156, "step": 3385 }, { "clip_ratio/high_max": 0.00053595983190462, "clip_ratio/high_mean": 0.00023865097900852562, "clip_ratio/low_mean": 8.821052761049941e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032686151680536566, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 510.322265625, "completions/min_length": 239.5, "epoch": 0.6628861947594837, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.003085324354469776, "learning_rate": 8.088757053325925e-07, "loss": 0.00014894765336066486, "reward": 0.3959100544452667, "reward_std": 0.2590700089931488, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3959100544452667, "rewards/QAReward/std": 0.42781712114810944, "step": 3390 }, { "clip_ratio/high_max": 0.0002878171973861754, "clip_ratio/high_mean": 0.0001448601600714028, "clip_ratio/low_mean": 3.093253108090721e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001757926889695227, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 520.8815104166666, "completions/min_length": 248.0, "epoch": 0.6638639030113415, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.87890625, "kl": 0.0030123211443424227, "learning_rate": 8.046630616639626e-07, "loss": 0.0001108663622289896, "reward": 0.37694380680720013, "reward_std": 0.27120233575503033, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37694381674130756, "rewards/QAReward/std": 0.44019991159439087, "step": 3395 }, { "clip_ratio/high_max": 0.0004750288906507194, "clip_ratio/high_mean": 0.00023043914698064327, "clip_ratio/low_mean": 0.00010439998877700418, "clip_ratio/low_min": 2.8922632918693125e-05, "clip_ratio/region_mean": 0.00033483916195109487, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 522.12109375, "completions/min_length": 256.5, "epoch": 0.6648416112631991, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8671875, "kl": 0.003133668005466461, "learning_rate": 8.004573918250542e-07, "loss": 0.00014561181887984276, "reward": 0.31015393137931824, "reward_std": 0.28574293851852417, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31015394628047943, "rewards/QAReward/std": 0.42691704630851746, "step": 3400 }, { "clip_ratio/high_max": 0.0002831932040862739, "clip_ratio/high_mean": 0.0001807978725992143, "clip_ratio/low_mean": 5.147478950675577e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023227265337482094, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 529.85546875, "completions/min_length": 259.0, "epoch": 0.6658193195150567, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.85546875, "kl": 0.003091222932562232, "learning_rate": 7.962587379963178e-07, "loss": 0.00018638523761183023, "reward": 0.29058366517225903, "reward_std": 0.3066577613353729, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29058366517225903, "rewards/QAReward/std": 0.46060241262118023, "step": 3405 }, { "clip_ratio/high_max": 0.0004158995463512838, "clip_ratio/high_mean": 0.00023984333383850754, "clip_ratio/low_mean": 7.026864041108638e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031011197133921085, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 531.140625, "completions/min_length": 251.5, "epoch": 0.6667970277669144, "frac_reward_zero_std": 0.015625, "grad_norm": 0.82421875, "kl": 0.003034266410395503, "learning_rate": 7.92067142287837e-07, "loss": 0.0001820963341742754, "reward": 0.4156357944011688, "reward_std": 0.26962660253047943, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41563577950000763, "rewards/QAReward/std": 0.3905312269926071, "step": 3410 }, { "clip_ratio/high_max": 0.0003292669542133808, "clip_ratio/high_mean": 0.0001836495823226869, "clip_ratio/low_mean": 5.4070378246251495e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002377199474722147, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 499.4895833333333, "completions/min_length": 236.33333333333334, "epoch": 0.667774736018772, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8671875, "kl": 0.00318852080963552, "learning_rate": 7.878826467389076e-07, "loss": 0.00018169968388974666, "reward": 0.3652607301870982, "reward_std": 0.2873262862364451, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3652607301870982, "rewards/QAReward/std": 0.4654388725757599, "step": 3415 }, { "clip_ratio/high_max": 0.0007072931854054332, "clip_ratio/high_mean": 0.00026007568812929093, "clip_ratio/low_mean": 8.263928175438196e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034271497279405596, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 517.486328125, "completions/min_length": 241.0, "epoch": 0.6687524442706296, "frac_reward_zero_std": 0.0625, "grad_norm": 0.80078125, "kl": 0.003004516242071986, "learning_rate": 7.83705293317614e-07, "loss": 0.0002199929906055331, "reward": 0.3736591637134552, "reward_std": 0.2712641954421997, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3736591637134552, "rewards/QAReward/std": 0.46522845327854156, "step": 3420 }, { "clip_ratio/high_max": 0.00031303587602451446, "clip_ratio/high_mean": 0.00016863304190337658, "clip_ratio/low_mean": 4.0300591354025525e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020893362816423177, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 517.6588541666666, "completions/min_length": 213.33333333333334, "epoch": 0.6697301525224872, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.78515625, "kl": 0.0031076863873749972, "learning_rate": 7.795351239204092e-07, "loss": 0.00014197854325175284, "reward": 0.37555492917696637, "reward_std": 0.2849554618199666, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3755549391110738, "rewards/QAReward/std": 0.47789283593495685, "step": 3425 }, { "clip_ratio/high_max": 0.0007381007773801685, "clip_ratio/high_mean": 0.000288464012555778, "clip_ratio/low_mean": 9.728380246087909e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003857478266581893, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 529.591796875, "completions/min_length": 247.0, "epoch": 0.670707860774345, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.0028923694044351578, "learning_rate": 7.753721803716943e-07, "loss": 3.312767948955297e-05, "reward": 0.3724353760480881, "reward_std": 0.2667178213596344, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3724353611469269, "rewards/QAReward/std": 0.4477660357952118, "step": 3430 }, { "clip_ratio/high_max": 0.00024000854464247822, "clip_ratio/high_mean": 0.00012965121422894298, "clip_ratio/low_mean": 3.9134384132921694e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016878560418263078, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 521.9973958333334, "completions/min_length": 238.66666666666666, "epoch": 0.6716855690262026, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 0.75390625, "kl": 0.0030586035922169686, "learning_rate": 7.712165044234006e-07, "loss": 0.0001322934404015541, "reward": 0.4005601207415263, "reward_std": 0.2592760423819224, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4005601108074188, "rewards/QAReward/std": 0.4470057586828868, "step": 3435 }, { "clip_ratio/high_max": 0.0006313214777037502, "clip_ratio/high_mean": 0.0002941659768112004, "clip_ratio/low_mean": 5.7630935043562206e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003517969103995711, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 514.548828125, "completions/min_length": 246.0, "epoch": 0.6726632772780602, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.003068131860345602, "learning_rate": 7.67068137754568e-07, "loss": 0.00019353149691596626, "reward": 0.35357142984867096, "reward_std": 0.2815946191549301, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35357141494750977, "rewards/QAReward/std": 0.4065166413784027, "step": 3440 }, { "clip_ratio/high_max": 0.00030040795681998136, "clip_ratio/high_mean": 0.00015951464301906527, "clip_ratio/low_mean": 4.20173499151133e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020153200021013617, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 536.2669270833334, "completions/min_length": 248.33333333333334, "epoch": 0.6736409855299179, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.83203125, "kl": 0.0030165395233780145, "learning_rate": 7.629271219709294e-07, "loss": 0.00015510086668655277, "reward": 0.3797794779141744, "reward_std": 0.2896074950695038, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37977946798006695, "rewards/QAReward/std": 0.4335404932498932, "step": 3445 }, { "clip_ratio/high_max": 0.000539517356082797, "clip_ratio/high_mean": 0.0002675482479389757, "clip_ratio/low_mean": 5.658654990838841e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032413480803370474, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 541.044921875, "completions/min_length": 248.0, "epoch": 0.6746186937817755, "frac_reward_zero_std": 0.046875, "grad_norm": 0.76171875, "kl": 0.002902092505246401, "learning_rate": 7.587934986044916e-07, "loss": 0.00018695194739848375, "reward": 0.39727097749710083, "reward_std": 0.2907847911119461, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39727096259593964, "rewards/QAReward/std": 0.45851752161979675, "step": 3450 }, { "clip_ratio/high_max": 0.0003425587434321642, "clip_ratio/high_mean": 0.00016471715061925352, "clip_ratio/low_mean": 4.2878447857219726e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002075956086628139, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 540.1236979166666, "completions/min_length": 250.0, "epoch": 0.6755964020336331, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.79296875, "kl": 0.0029213594738394023, "learning_rate": 7.546673091131216e-07, "loss": 0.00016131773591041565, "reward": 0.34804532925287884, "reward_std": 0.28542489806811017, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34804532925287884, "rewards/QAReward/std": 0.4188329180081685, "step": 3455 }, { "clip_ratio/high_max": 0.00037268292508088054, "clip_ratio/high_mean": 0.00017692227847874165, "clip_ratio/low_mean": 6.955959397600964e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002464818535372615, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 523.849609375, "completions/min_length": 256.5, "epoch": 0.6765741102854909, "frac_reward_zero_std": 0.015625, "grad_norm": 0.78125, "kl": 0.003012212039902806, "learning_rate": 7.505485948801272e-07, "loss": 0.00018524554325267674, "reward": 0.38608554005622864, "reward_std": 0.3066231906414032, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38608554005622864, "rewards/QAReward/std": 0.46154849231243134, "step": 3460 }, { "clip_ratio/high_max": 0.00035168534377589823, "clip_ratio/high_mean": 0.0001680202898569405, "clip_ratio/low_mean": 3.49993395502679e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020301962504163385, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 531.2760416666666, "completions/min_length": 251.66666666666666, "epoch": 0.6775518185373485, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.828125, "kl": 0.0030681627802550794, "learning_rate": 7.464373972138436e-07, "loss": 0.00010837416630238295, "reward": 0.37953325112660724, "reward_std": 0.277037372191747, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3795332610607147, "rewards/QAReward/std": 0.46903881430625916, "step": 3465 }, { "clip_ratio/high_max": 0.0006138268974609673, "clip_ratio/high_mean": 0.00030930321663618087, "clip_ratio/low_mean": 5.508968388312496e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036439287941902876, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 551.296875, "completions/min_length": 264.5, "epoch": 0.6785295267892061, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.0029428108595311643, "learning_rate": 7.42333757347221e-07, "loss": 0.00015027004992589353, "reward": 0.26447638869285583, "reward_std": 0.30101004242897034, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.26447637379169464, "rewards/QAReward/std": 0.4652065336704254, "step": 3470 }, { "clip_ratio/high_max": 0.00039268441032618283, "clip_ratio/high_mean": 0.0001770475530065596, "clip_ratio/low_mean": 5.304677470121533e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023009433643892408, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.80078125, "completions/min_length": 252.0, "epoch": 0.6795072350410637, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.79296875, "kl": 0.0029606086667627097, "learning_rate": 7.382377164374071e-07, "loss": 8.294256404042243e-05, "reward": 0.38645599285761517, "reward_std": 0.2741359770298004, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3864559829235077, "rewards/QAReward/std": 0.4312160710493724, "step": 3475 }, { "clip_ratio/high_max": 0.0004701106692664325, "clip_ratio/high_mean": 0.00023922363761812447, "clip_ratio/low_mean": 0.00010883895447477698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034806259209290146, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 538.75, "completions/min_length": 244.5, "epoch": 0.6804849432929214, "frac_reward_zero_std": 0.0625, "grad_norm": 0.796875, "kl": 0.00290197734721005, "learning_rate": 7.341493155653368e-07, "loss": 4.798259469680488e-05, "reward": 0.3881091922521591, "reward_std": 0.27086248993873596, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3881091922521591, "rewards/QAReward/std": 0.44108887016773224, "step": 3480 }, { "clip_ratio/high_max": 0.0003671216079965234, "clip_ratio/high_mean": 0.00017547353054396807, "clip_ratio/low_mean": 3.1308112374972553e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020678164437413216, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 530.13671875, "completions/min_length": 226.33333333333334, "epoch": 0.681462651544779, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.0030067110899835827, "learning_rate": 7.300685957353216e-07, "loss": 0.00012049920624122024, "reward": 0.3108031650384267, "reward_std": 0.2919366757074992, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3108031650384267, "rewards/QAReward/std": 0.44839877883593243, "step": 3485 }, { "clip_ratio/high_max": 0.00042653443524613975, "clip_ratio/high_mean": 0.0002768858685158193, "clip_ratio/low_mean": 6.279646768234671e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033968231873586773, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 524.7578125, "completions/min_length": 260.0, "epoch": 0.6824403597966366, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.003043567994609475, "learning_rate": 7.259955978746346e-07, "loss": 0.0001169449184089899, "reward": 0.3763788193464279, "reward_std": 0.2682186961174011, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3763788491487503, "rewards/QAReward/std": 0.4680721163749695, "step": 3490 }, { "clip_ratio/high_max": 0.0003935369662940502, "clip_ratio/high_mean": 0.00021222997456789018, "clip_ratio/low_mean": 3.216348995920271e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024439346161670984, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 508.8216145833333, "completions/min_length": 250.66666666666666, "epoch": 0.6834180680484944, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8671875, "kl": 0.003122023027390242, "learning_rate": 7.219303628331021e-07, "loss": 0.00011126456083729864, "reward": 0.38869669040044147, "reward_std": 0.27222899595896405, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38869669040044147, "rewards/QAReward/std": 0.42661983768145245, "step": 3495 }, { "clip_ratio/high_max": 0.0005142051842994988, "clip_ratio/high_mean": 0.0002605669084005058, "clip_ratio/low_mean": 5.09366873302497e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031150358263403176, "completions/clipped_ratio": 0.005859375, "completions/max_length": 975.5, "completions/mean_length": 508.091796875, "completions/min_length": 205.5, "epoch": 0.684395776300352, "frac_reward_zero_std": 0.0625, "grad_norm": 0.87109375, "kl": 0.003110396396368742, "learning_rate": 7.178729313826954e-07, "loss": 0.00019000474130734802, "reward": 0.5515407919883728, "reward_std": 0.2240048199892044, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.5515407919883728, "rewards/QAReward/std": 0.3589402884244919, "step": 3500 }, { "clip_ratio/high_max": 0.00031235162168741226, "clip_ratio/high_mean": 0.00016605589771643282, "clip_ratio/low_mean": 7.203256827779115e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023808847181499003, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 529.6197916666666, "completions/min_length": 238.0, "epoch": 0.6853734845522096, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78125, "kl": 0.0029518859926611187, "learning_rate": 7.138233442171184e-07, "loss": 0.0001972290687263012, "reward": 0.3318514327208201, "reward_std": 0.29176661372184753, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33185144265492755, "rewards/QAReward/std": 0.449676513671875, "step": 3505 }, { "clip_ratio/high_max": 0.0007207520538941026, "clip_ratio/high_mean": 0.0003116531821433455, "clip_ratio/low_mean": 4.6833047235850245e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035848621046170593, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 551.34765625, "completions/min_length": 244.0, "epoch": 0.6863511928040673, "frac_reward_zero_std": 0.046875, "grad_norm": 0.828125, "kl": 0.0029919940512627362, "learning_rate": 7.097816419514028e-07, "loss": 0.00013076317263767124, "reward": 0.37433192133903503, "reward_std": 0.28594523668289185, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37433192133903503, "rewards/QAReward/std": 0.40959255397319794, "step": 3510 }, { "clip_ratio/high_max": 0.0002623169799335301, "clip_ratio/high_mean": 0.00011977727990597486, "clip_ratio/low_mean": 6.181187491165474e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018158917082473637, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 531.0533854166666, "completions/min_length": 246.0, "epoch": 0.6873289010559249, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.79296875, "kl": 0.0030881248414516447, "learning_rate": 7.057478651214984e-07, "loss": 0.00016584801487624646, "reward": 0.40515512228012085, "reward_std": 0.2653956711292267, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40515512228012085, "rewards/QAReward/std": 0.445218026638031, "step": 3515 }, { "clip_ratio/high_max": 0.0005121678696013987, "clip_ratio/high_mean": 0.000245915079722181, "clip_ratio/low_mean": 6.842594666522928e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031434104312211275, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 533.623046875, "completions/min_length": 252.5, "epoch": 0.6883066093077825, "frac_reward_zero_std": 0.078125, "grad_norm": 0.7734375, "kl": 0.002944614225998521, "learning_rate": 7.017220541838675e-07, "loss": 6.541631883010268e-05, "reward": 0.359561026096344, "reward_std": 0.26410695910453796, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.359561026096344, "rewards/QAReward/std": 0.45073454082012177, "step": 3520 }, { "clip_ratio/high_max": 0.00023862458765506745, "clip_ratio/high_mean": 0.00011690754909068345, "clip_ratio/low_mean": 4.402716876938939e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016093471203930675, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 536.7044270833334, "completions/min_length": 235.0, "epoch": 0.6892843175596403, "frac_reward_zero_std": 0.0, "grad_norm": 0.77734375, "kl": 0.0029293786734342574, "learning_rate": 6.977042495150788e-07, "loss": 7.434874423779547e-05, "reward": 0.3329964578151703, "reward_std": 0.285191277662913, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3329964578151703, "rewards/QAReward/std": 0.43052592873573303, "step": 3525 }, { "clip_ratio/high_max": 0.0004685175605118275, "clip_ratio/high_mean": 0.00024157254374586047, "clip_ratio/low_mean": 7.64042662922293e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031797681003808975, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 534.16015625, "completions/min_length": 260.5, "epoch": 0.6902620258114979, "frac_reward_zero_std": 0.0625, "grad_norm": 0.81640625, "kl": 0.0030145423021167516, "learning_rate": 6.936944914114042e-07, "loss": 0.00016570561565458773, "reward": 0.3669580966234207, "reward_std": 0.2736068144440651, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3669581115245819, "rewards/QAReward/std": 0.42853574454784393, "step": 3530 }, { "clip_ratio/high_max": 0.00031883615301921966, "clip_ratio/high_mean": 0.00015735525521449746, "clip_ratio/low_mean": 5.091291968710721e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020826817490160465, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 545.6184895833334, "completions/min_length": 247.0, "epoch": 0.6912397340633555, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.80078125, "kl": 0.00295546380802989, "learning_rate": 6.896928200884115e-07, "loss": 0.00010778597788885236, "reward": 0.3636474112669627, "reward_std": 0.26713743805885315, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36364739139874774, "rewards/QAReward/std": 0.4386133948961894, "step": 3535 }, { "clip_ratio/high_max": 0.00043444057228043677, "clip_ratio/high_mean": 0.0002158799092285335, "clip_ratio/low_mean": 8.326993265654892e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002991498447954655, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 512.67578125, "completions/min_length": 239.0, "epoch": 0.6922174423152131, "frac_reward_zero_std": 0.03125, "grad_norm": 0.77734375, "kl": 0.0030502131208777427, "learning_rate": 6.856992756805636e-07, "loss": 0.00014841302763670682, "reward": 0.4174262285232544, "reward_std": 0.2708669602870941, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4174262136220932, "rewards/QAReward/std": 0.44858936965465546, "step": 3540 }, { "clip_ratio/high_max": 0.00026790250558406115, "clip_ratio/high_mean": 0.00013549254508689047, "clip_ratio/low_mean": 4.1228061309084294e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017672060057520865, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 532.9765625, "completions/min_length": 237.33333333333334, "epoch": 0.6931951505670708, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.77734375, "kl": 0.003020873758941889, "learning_rate": 6.817138982408143e-07, "loss": 0.00015408538747578858, "reward": 0.29348750909169513, "reward_std": 0.287171612183253, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29348750909169513, "rewards/QAReward/std": 0.43768956263860065, "step": 3545 }, { "clip_ratio/high_max": 0.0005281488178297877, "clip_ratio/high_mean": 0.00028164503164589404, "clip_ratio/low_mean": 4.506099357968196e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032670601503923533, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 491.6484375, "completions/min_length": 215.5, "epoch": 0.6941728588189284, "frac_reward_zero_std": 0.078125, "grad_norm": 0.8125, "kl": 0.0032693323213607074, "learning_rate": 6.777367277402091e-07, "loss": 0.00013740500435233116, "reward": 0.48272567987442017, "reward_std": 0.26732898503541946, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.48272567987442017, "rewards/QAReward/std": 0.4530844986438751, "step": 3550 }, { "clip_ratio/high_max": 0.00038614249788224695, "clip_ratio/high_mean": 0.00020981788402423262, "clip_ratio/low_mean": 4.5326794497668744e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002551446901634336, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 976.6666666666666, "completions/mean_length": 544.3294270833334, "completions/min_length": 266.6666666666667, "epoch": 0.695150567070786, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.765625, "kl": 0.0029650218319147827, "learning_rate": 6.737678040674809e-07, "loss": 0.00017762387869879603, "reward": 0.37462125221888226, "reward_std": 0.28452421228090924, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3746212621529897, "rewards/QAReward/std": 0.4353932738304138, "step": 3555 }, { "clip_ratio/high_max": 0.0005014748079702258, "clip_ratio/high_mean": 0.00031934795551933346, "clip_ratio/low_mean": 8.287467353511602e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040222262032330036, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 527.7734375, "completions/min_length": 231.5, "epoch": 0.6961282753226438, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.002994460053741932, "learning_rate": 6.698071670286525e-07, "loss": 0.00011865072883665562, "reward": 0.2963394522666931, "reward_std": 0.30114543437957764, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2963394448161125, "rewards/QAReward/std": 0.48781250417232513, "step": 3560 }, { "clip_ratio/high_max": 0.0002955470699816942, "clip_ratio/high_mean": 0.00017452180618420242, "clip_ratio/low_mean": 5.8348389575257895e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023287018993869423, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 999.0, "completions/mean_length": 506.8932291666667, "completions/min_length": 238.66666666666666, "epoch": 0.6971059835745014, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.828125, "kl": 0.0031050232704728843, "learning_rate": 6.658548563466353e-07, "loss": 0.00012140139006078243, "reward": 0.3912618160247803, "reward_std": 0.28076305985450745, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3912618160247803, "rewards/QAReward/std": 0.4554217954476674, "step": 3565 }, { "clip_ratio/high_max": 0.0005141039844602346, "clip_ratio/high_mean": 0.00021972020622342824, "clip_ratio/low_mean": 4.366374196251854e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002633839379996061, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 531.53125, "completions/min_length": 248.5, "epoch": 0.698083691826359, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8359375, "kl": 0.0029372700955718756, "learning_rate": 6.619109116608346e-07, "loss": 0.00017191029619425535, "reward": 0.3717331886291504, "reward_std": 0.2855970114469528, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3717331737279892, "rewards/QAReward/std": 0.42406444251537323, "step": 3570 }, { "clip_ratio/high_max": 0.00022558713681064546, "clip_ratio/high_mean": 0.00010605340939946472, "clip_ratio/low_mean": 4.174385539954528e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014779726625420154, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 536.1145833333334, "completions/min_length": 232.66666666666666, "epoch": 0.6990614000782167, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0029428056441247463, "learning_rate": 6.579753725267476e-07, "loss": 0.00010297620901837945, "reward": 0.386958509683609, "reward_std": 0.25793514649073285, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3869584997495015, "rewards/QAReward/std": 0.3842958112557729, "step": 3575 }, { "clip_ratio/high_max": 0.0004853261634707451, "clip_ratio/high_mean": 0.0002102683443808928, "clip_ratio/low_mean": 4.643730353564024e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025670564500615003, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 536.892578125, "completions/min_length": 253.0, "epoch": 0.7000391083300743, "frac_reward_zero_std": 0.015625, "grad_norm": 0.75390625, "kl": 0.0029737253207713366, "learning_rate": 6.540482784155691e-07, "loss": 0.00015519657172262668, "reward": 0.3507990837097168, "reward_std": 0.2864631116390228, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3507990837097168, "rewards/QAReward/std": 0.44692689180374146, "step": 3580 }, { "clip_ratio/high_max": 0.00039072270737960935, "clip_ratio/high_mean": 0.0002021545427851379, "clip_ratio/low_mean": 3.686025593196973e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023901481181383134, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 500.9283854166667, "completions/min_length": 236.0, "epoch": 0.7010168165819319, "frac_reward_zero_std": 0.03125, "grad_norm": 0.82421875, "kl": 0.00316895698197186, "learning_rate": 6.501296687137944e-07, "loss": 8.723932551220059e-05, "reward": 0.39958512783050537, "reward_std": 0.2768508046865463, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3995851476987203, "rewards/QAReward/std": 0.4570709864298503, "step": 3585 }, { "clip_ratio/high_max": 0.00041617254028096794, "clip_ratio/high_mean": 0.00023270071251317858, "clip_ratio/low_mean": 7.919456984382123e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003118952736258507, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 512.07421875, "completions/min_length": 245.5, "epoch": 0.7019945248337895, "frac_reward_zero_std": 0.03125, "grad_norm": 0.86328125, "kl": 0.0031541716307401657, "learning_rate": 6.462195827228275e-07, "loss": 0.00013656760565936566, "reward": 0.4042317569255829, "reward_std": 0.27649159729480743, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4042317569255829, "rewards/QAReward/std": 0.41501250863075256, "step": 3590 }, { "clip_ratio/high_max": 0.00032760639442130923, "clip_ratio/high_mean": 0.00017079635872505606, "clip_ratio/low_mean": 3.0206929659470914e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020100330002605915, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 538.171875, "completions/min_length": 266.0, "epoch": 0.7029722330856473, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78125, "kl": 0.002845626650378108, "learning_rate": 6.423180596585818e-07, "loss": 0.00014931897167116403, "reward": 0.35636523365974426, "reward_std": 0.28391645352045697, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3563652237256368, "rewards/QAReward/std": 0.48018362124760944, "step": 3595 }, { "clip_ratio/high_max": 0.0005563204060308636, "clip_ratio/high_mean": 0.00027483517769724133, "clip_ratio/low_mean": 8.01667949417606e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003550019930116832, "completions/clipped_ratio": 0.00390625, "completions/max_length": 949.0, "completions/mean_length": 523.1015625, "completions/min_length": 249.0, "epoch": 0.7039499413375049, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.002975220698863268, "learning_rate": 6.38425138651091e-07, "loss": 0.00017348583787679672, "reward": 0.33235833048820496, "reward_std": 0.31178756058216095, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33235831558704376, "rewards/QAReward/std": 0.4794328510761261, "step": 3600 }, { "clip_ratio/high_max": 0.0002700698911212385, "clip_ratio/high_mean": 0.00015383182908408345, "clip_ratio/low_mean": 5.102841532789171e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020486024441197516, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 537.4518229166666, "completions/min_length": 248.33333333333334, "epoch": 0.7049276495893625, "frac_reward_zero_std": 0.03125, "grad_norm": 0.87890625, "kl": 0.0029691391624510287, "learning_rate": 6.345408587441137e-07, "loss": 0.00013405915815383195, "reward": 0.37410715222358704, "reward_std": 0.26720789074897766, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37410715222358704, "rewards/QAReward/std": 0.4056635995705922, "step": 3605 }, { "clip_ratio/high_max": 0.0005727416835725307, "clip_ratio/high_mean": 0.00023789707920514048, "clip_ratio/low_mean": 7.807138899806887e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003159684827551246, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 536.341796875, "completions/min_length": 226.5, "epoch": 0.7059053578412202, "frac_reward_zero_std": 0.015625, "grad_norm": 0.79296875, "kl": 0.00302656814455986, "learning_rate": 6.306652588947454e-07, "loss": 0.00012284715194255114, "reward": 0.313519686460495, "reward_std": 0.2832838445901871, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.313519686460495, "rewards/QAReward/std": 0.4362662136554718, "step": 3610 }, { "clip_ratio/high_max": 0.0003525549778714776, "clip_ratio/high_mean": 0.0001874673180282116, "clip_ratio/low_mean": 3.563289428711869e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022310020867735147, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1008.0, "completions/mean_length": 515.9635416666666, "completions/min_length": 255.66666666666666, "epoch": 0.7068830660930778, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84765625, "kl": 0.0029901810456067325, "learning_rate": 6.267983779730238e-07, "loss": 0.00017450847662985324, "reward": 0.36336806416511536, "reward_std": 0.2828848858674367, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36336806416511536, "rewards/QAReward/std": 0.44303499658902484, "step": 3615 }, { "clip_ratio/high_max": 0.00047207439783960583, "clip_ratio/high_mean": 0.0002317273465450853, "clip_ratio/low_mean": 7.31041538529098e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030483149457722903, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 520.537109375, "completions/min_length": 235.0, "epoch": 0.7078607743449354, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8359375, "kl": 0.003094942821189761, "learning_rate": 6.229402547615416e-07, "loss": 0.00010523756500333548, "reward": 0.4217168837785721, "reward_std": 0.2806236296892166, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4217168837785721, "rewards/QAReward/std": 0.42506812512874603, "step": 3620 }, { "clip_ratio/high_max": 0.0003064591553993523, "clip_ratio/high_mean": 0.00015749066369608045, "clip_ratio/low_mean": 3.855050163110718e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019604115514084696, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 553.2005208333334, "completions/min_length": 232.0, "epoch": 0.7088384825967932, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7890625, "kl": 0.0029477985575795175, "learning_rate": 6.190909279550556e-07, "loss": 0.00013512724544852971, "reward": 0.41600891947746277, "reward_std": 0.2695458233356476, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41600891947746277, "rewards/QAReward/std": 0.4279543062051137, "step": 3625 }, { "clip_ratio/high_max": 0.0004862891277298331, "clip_ratio/high_mean": 0.00025971790892072023, "clip_ratio/low_mean": 6.665776745649055e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032637567492201926, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 541.787109375, "completions/min_length": 253.5, "epoch": 0.7098161908486508, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.002959798509255052, "learning_rate": 6.152504361601021e-07, "loss": 0.00012332260375842453, "reward": 0.2705589681863785, "reward_std": 0.3023691624403, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2705589681863785, "rewards/QAReward/std": 0.4686335027217865, "step": 3630 }, { "clip_ratio/high_max": 0.000344787142239511, "clip_ratio/high_mean": 0.0001454663462936878, "clip_ratio/low_mean": 3.915850393241271e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018462485168129205, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 515.53125, "completions/min_length": 251.0, "epoch": 0.7107938991005084, "frac_reward_zero_std": 0.0625, "grad_norm": 0.79296875, "kl": 0.0030263370368629693, "learning_rate": 6.114188178946047e-07, "loss": 0.00017591977957636117, "reward": 0.3501906593640645, "reward_std": 0.25638548533121747, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3501906643311183, "rewards/QAReward/std": 0.44982372721036273, "step": 3635 }, { "clip_ratio/high_max": 0.00035772209521383046, "clip_ratio/high_mean": 0.00017431690357625485, "clip_ratio/low_mean": 7.787408394506201e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002521909773349762, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/mean_length": 549.294921875, "completions/min_length": 232.0, "epoch": 0.7117716073523661, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.0029485107399523256, "learning_rate": 6.075961115874916e-07, "loss": 0.00015308409929275512, "reward": 0.3135509788990021, "reward_std": 0.25896863639354706, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.31355099380016327, "rewards/QAReward/std": 0.4062660038471222, "step": 3640 }, { "clip_ratio/high_max": 0.0002815294195897877, "clip_ratio/high_mean": 0.00016241781413555144, "clip_ratio/low_mean": 4.595990758389234e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002083777217194438, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 542.4140625, "completions/min_length": 258.0, "epoch": 0.7127493156042237, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8125, "kl": 0.0029698691330850126, "learning_rate": 6.0378235557831e-07, "loss": 0.00013087965780869126, "reward": 0.3782738248507182, "reward_std": 0.2782745957374573, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3782738149166107, "rewards/QAReward/std": 0.4201439718405406, "step": 3645 }, { "clip_ratio/high_max": 0.0003846382023766637, "clip_ratio/high_mean": 0.00024200439802370965, "clip_ratio/low_mean": 7.925481477286667e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003212592215277255, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 545.634765625, "completions/min_length": 224.0, "epoch": 0.7137270238560813, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.0029450561851263046, "learning_rate": 5.999775881168397e-07, "loss": 0.00010842147748917342, "reward": 0.40290144085884094, "reward_std": 0.27589842677116394, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40290142595767975, "rewards/QAReward/std": 0.4262634813785553, "step": 3650 }, { "clip_ratio/high_max": 0.0003253112430684268, "clip_ratio/high_mean": 0.00019484565127640962, "clip_ratio/low_mean": 3.765854489756748e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002325042150914669, "completions/clipped_ratio": 0.013020833333333334, "completions/max_length": 1024.0, "completions/mean_length": 517.1080729166666, "completions/min_length": 230.0, "epoch": 0.714704732107939, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.83203125, "kl": 0.0031110307201743124, "learning_rate": 5.961818473627108e-07, "loss": 0.00010269482154399157, "reward": 0.46568702658017475, "reward_std": 0.26572780807813007, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.46568700671195984, "rewards/QAReward/std": 0.39732202887535095, "step": 3655 }, { "clip_ratio/high_max": 0.00041461200453341007, "clip_ratio/high_mean": 0.00020977841923013328, "clip_ratio/low_mean": 8.88988230144605e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002986772393342108, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 534.001953125, "completions/min_length": 247.5, "epoch": 0.7156824403597967, "frac_reward_zero_std": 0.078125, "grad_norm": 0.796875, "kl": 0.0030135916080325843, "learning_rate": 5.923951713850204e-07, "loss": 0.00011603470193222165, "reward": 0.38595688343048096, "reward_std": 0.2572702020406723, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38595688343048096, "rewards/QAReward/std": 0.4412245452404022, "step": 3660 }, { "clip_ratio/high_max": 0.00028521904023364185, "clip_ratio/high_mean": 0.00015225704992190004, "clip_ratio/low_mean": 3.857713018078357e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019083417719230057, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 530.9934895833334, "completions/min_length": 266.6666666666667, "epoch": 0.7166601486116543, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.78125, "kl": 0.002986851194873452, "learning_rate": 5.886175981619523e-07, "loss": 0.0001364084193482995, "reward": 0.3233429392178853, "reward_std": 0.2968634565671285, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3233429392178853, "rewards/QAReward/std": 0.4496353467305501, "step": 3665 }, { "clip_ratio/high_max": 0.0005225587403401733, "clip_ratio/high_mean": 0.0002791015023831278, "clip_ratio/low_mean": 7.364557968685404e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035274706315249207, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 539.416015625, "completions/min_length": 242.5, "epoch": 0.7176378568635119, "frac_reward_zero_std": 0.0625, "grad_norm": 0.82421875, "kl": 0.002970031462609768, "learning_rate": 5.848491655803937e-07, "loss": 0.00016528158448636532, "reward": 0.36040037870407104, "reward_std": 0.2822582274675369, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36040037870407104, "rewards/QAReward/std": 0.46533703804016113, "step": 3670 }, { "clip_ratio/high_max": 0.00027807645965367557, "clip_ratio/high_mean": 0.00015521016903221607, "clip_ratio/low_mean": 7.328434439841658e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002284945221617818, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 512.28125, "completions/min_length": 229.0, "epoch": 0.7186155651153696, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.80078125, "kl": 0.0032390184234827755, "learning_rate": 5.810899114355563e-07, "loss": 0.0001631081337109208, "reward": 0.3438352545102437, "reward_std": 0.27621179819107056, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3438352545102437, "rewards/QAReward/std": 0.4610046148300171, "step": 3675 }, { "clip_ratio/high_max": 0.00046922825276851654, "clip_ratio/high_mean": 0.0002559850341640413, "clip_ratio/low_mean": 0.00010218410170637071, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035816914751194415, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 534.919921875, "completions/min_length": 273.5, "epoch": 0.7195932733672272, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.0029319482389837505, "learning_rate": 5.773398734305988e-07, "loss": 0.0001564843114465475, "reward": 0.32234159111976624, "reward_std": 0.26925696432590485, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32234159111976624, "rewards/QAReward/std": 0.41395506262779236, "step": 3680 }, { "clip_ratio/high_max": 0.00032500465167686345, "clip_ratio/high_mean": 0.00019344801548868418, "clip_ratio/low_mean": 4.2971351649612186e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023641937877982855, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 543.7955729166666, "completions/min_length": 255.0, "epoch": 0.7205709816190848, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.84375, "kl": 0.003021005867049098, "learning_rate": 5.735990891762458e-07, "loss": 0.00013343923492357135, "reward": 0.4127707580725352, "reward_std": 0.2802715102831523, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4127707580725352, "rewards/QAReward/std": 0.4247203568617503, "step": 3685 }, { "clip_ratio/high_max": 0.0004646521410904825, "clip_ratio/high_mean": 0.0002073825162369758, "clip_ratio/low_mean": 8.238609152613207e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002897686092182994, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 532.3125, "completions/min_length": 251.0, "epoch": 0.7215486898709426, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0029540438670665027, "learning_rate": 5.698675961904126e-07, "loss": 0.00021351431496441364, "reward": 0.2605282589793205, "reward_std": 0.27545100450515747, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2605282813310623, "rewards/QAReward/std": 0.44347554445266724, "step": 3690 }, { "clip_ratio/high_max": 0.00032606649911031126, "clip_ratio/high_mean": 0.00016608891892246901, "clip_ratio/low_mean": 3.86115672881715e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020470047602429985, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 532.7161458333334, "completions/min_length": 236.66666666666666, "epoch": 0.7225263981228002, "frac_reward_zero_std": 0.03125, "grad_norm": 0.90625, "kl": 0.002959425840526819, "learning_rate": 5.661454318978291e-07, "loss": 0.00012775221839547157, "reward": 0.42104556163152057, "reward_std": 0.2843133608500163, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42104556163152057, "rewards/QAReward/std": 0.4266700545946757, "step": 3695 }, { "clip_ratio/high_max": 0.0005526641733013093, "clip_ratio/high_mean": 0.0002692504960577935, "clip_ratio/low_mean": 7.793819677317514e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003471886797342449, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 538.125, "completions/min_length": 238.5, "epoch": 0.7235041063746578, "frac_reward_zero_std": 0.0625, "grad_norm": 0.828125, "kl": 0.002944187168031931, "learning_rate": 5.624326336296627e-07, "loss": 0.0001508257701061666, "reward": 0.4361738860607147, "reward_std": 0.2540326938033104, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4361738860607147, "rewards/QAReward/std": 0.4078160673379898, "step": 3700 }, { "clip_ratio/high_max": 0.00032056691125035284, "clip_ratio/high_mean": 0.00014199237921275198, "clip_ratio/low_mean": 6.993986753514036e-05, "clip_ratio/low_min": 2.202400646638125e-05, "clip_ratio/region_mean": 0.0002119322423823178, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 539.89453125, "completions/min_length": 257.0, "epoch": 0.7244818146265154, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.0029709195252507923, "learning_rate": 5.587292386231451e-07, "loss": 0.00010929945856332778, "reward": 0.29015688101450604, "reward_std": 0.29100023706754047, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29015687108039856, "rewards/QAReward/std": 0.43344147006670636, "step": 3705 }, { "clip_ratio/high_max": 0.00045232094125822184, "clip_ratio/high_mean": 0.0002676896168850362, "clip_ratio/low_mean": 7.234806907945313e-05, "clip_ratio/low_min": 2.5723472936078907e-05, "clip_ratio/region_mean": 0.00034003767650574445, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 534.822265625, "completions/min_length": 244.5, "epoch": 0.7254595228783731, "frac_reward_zero_std": 0.046875, "grad_norm": 0.83203125, "kl": 0.0029076574835926296, "learning_rate": 5.550352840211998e-07, "loss": 5.335469031706452e-05, "reward": 0.3343920558691025, "reward_std": 0.27608999609947205, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3343920409679413, "rewards/QAReward/std": 0.45418688654899597, "step": 3710 }, { "clip_ratio/high_max": 0.0003527021151967347, "clip_ratio/high_mean": 0.0002019757987000048, "clip_ratio/low_mean": 4.128308646613732e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002432588953524828, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 540.1184895833334, "completions/min_length": 250.0, "epoch": 0.7264372311302307, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.83984375, "kl": 0.00302979345433414, "learning_rate": 5.513508068720674e-07, "loss": 0.00013622208498418332, "reward": 0.2953166365623474, "reward_std": 0.2726506491502126, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2953166365623474, "rewards/QAReward/std": 0.44200722376505536, "step": 3715 }, { "clip_ratio/high_max": 0.0004812191822566092, "clip_ratio/high_mean": 0.0002422584337182343, "clip_ratio/low_mean": 7.867786480346695e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003209362970665097, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 525.162109375, "completions/min_length": 241.0, "epoch": 0.7274149393820883, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.002916257968172431, "learning_rate": 5.476758441289349e-07, "loss": 0.0001341427443549037, "reward": 0.4003627300262451, "reward_std": 0.2570704221725464, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4003627151250839, "rewards/QAReward/std": 0.4621230214834213, "step": 3720 }, { "clip_ratio/high_max": 0.00032439770875498654, "clip_ratio/high_mean": 0.0001515916548669338, "clip_ratio/low_mean": 6.565972580574453e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021725138649344444, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 524.0416666666666, "completions/min_length": 245.33333333333334, "epoch": 0.7283926476339461, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0030055832117795943, "learning_rate": 5.440104326495666e-07, "loss": 0.00010277185356244445, "reward": 0.3994884689648946, "reward_std": 0.2689501444498698, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3994884689648946, "rewards/QAReward/std": 0.3968155086040497, "step": 3725 }, { "clip_ratio/high_max": 0.0004343923763372004, "clip_ratio/high_mean": 0.00024921497097238896, "clip_ratio/low_mean": 3.679897854453884e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002860139531549066, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 539.19921875, "completions/min_length": 239.5, "epoch": 0.7293703558858037, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83203125, "kl": 0.003014591056853533, "learning_rate": 5.403546091959319e-07, "loss": 0.00014663536567240953, "reward": 0.3938538730144501, "reward_std": 0.30389878153800964, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3938538730144501, "rewards/QAReward/std": 0.4445585161447525, "step": 3730 }, { "clip_ratio/high_max": 0.00031362479785457253, "clip_ratio/high_mean": 0.00018991875695064663, "clip_ratio/low_mean": 5.038674571551382e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024030550848692657, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 532.96875, "completions/min_length": 269.6666666666667, "epoch": 0.7303480641376613, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.0029852432664483784, "learning_rate": 5.367084104338381e-07, "loss": 0.00014813791494816542, "reward": 0.2932012627522151, "reward_std": 0.304767370223999, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2932012677192688, "rewards/QAReward/std": 0.4763106902440389, "step": 3735 }, { "clip_ratio/high_max": 0.00038339670281857253, "clip_ratio/high_mean": 0.00019158394425176085, "clip_ratio/low_mean": 6.104865024099126e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002526325813960284, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 528.3984375, "completions/min_length": 218.0, "epoch": 0.731325772389519, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.002948185754939914, "learning_rate": 5.330718729325621e-07, "loss": 0.00012041795998811722, "reward": 0.2959868013858795, "reward_std": 0.2967400550842285, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2959868013858795, "rewards/QAReward/std": 0.46947304904460907, "step": 3740 }, { "clip_ratio/high_max": 0.00022234604693949223, "clip_ratio/high_mean": 0.0001446669630240649, "clip_ratio/low_mean": 5.145811883267015e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019612509640865028, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 523.7708333333334, "completions/min_length": 244.66666666666666, "epoch": 0.7323034806413766, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.00297856186516583, "learning_rate": 5.294450331644851e-07, "loss": 0.00013203462585806847, "reward": 0.3095382750034332, "reward_std": 0.30422839522361755, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3095382849375407, "rewards/QAReward/std": 0.4728691279888153, "step": 3745 }, { "clip_ratio/high_max": 0.000446371897123754, "clip_ratio/high_mean": 0.00024299792712554336, "clip_ratio/low_mean": 5.5905245244503024e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029890317819081247, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 540.9609375, "completions/min_length": 264.5, "epoch": 0.7332811888932342, "frac_reward_zero_std": 0.046875, "grad_norm": 0.82421875, "kl": 0.0029805262573063375, "learning_rate": 5.258279275047247e-07, "loss": 0.00020202798768877983, "reward": 0.3267988860607147, "reward_std": 0.2756524533033371, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3267989009618759, "rewards/QAReward/std": 0.4663669317960739, "step": 3750 }, { "clip_ratio/high_max": 0.0003850183100439608, "clip_ratio/high_mean": 0.00016716518439352512, "clip_ratio/low_mean": 6.223487143870443e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002294000529218465, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 521.8606770833334, "completions/min_length": 218.66666666666666, "epoch": 0.734258897145092, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.76953125, "kl": 0.0029420996084809302, "learning_rate": 5.222205922307705e-07, "loss": 7.378481095656752e-05, "reward": 0.42497366666793823, "reward_std": 0.2559735377629598, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42497365673383075, "rewards/QAReward/std": 0.4206259449323018, "step": 3755 }, { "clip_ratio/high_max": 0.0005359981791116297, "clip_ratio/high_mean": 0.0003152754157781601, "clip_ratio/low_mean": 8.927689486881718e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004045523004606366, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 522.19921875, "completions/min_length": 256.5, "epoch": 0.7352366053969496, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0030682186596095564, "learning_rate": 5.186230635221213e-07, "loss": 0.00010005943477153778, "reward": 0.34701527655124664, "reward_std": 0.27864497900009155, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34701529145240784, "rewards/QAReward/std": 0.45359116792678833, "step": 3760 }, { "clip_ratio/high_max": 0.0002583460649475455, "clip_ratio/high_mean": 0.0001500871207099408, "clip_ratio/low_mean": 2.401771562290378e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017410482396371663, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 531.87109375, "completions/min_length": 240.0, "epoch": 0.7362143136488072, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.76953125, "kl": 0.0029484303668141365, "learning_rate": 5.150353774599226e-07, "loss": 0.0001219906029291451, "reward": 0.39230356613794964, "reward_std": 0.26356928547223407, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39230358600616455, "rewards/QAReward/std": 0.41130272547403973, "step": 3765 }, { "clip_ratio/high_max": 0.0004834810388274491, "clip_ratio/high_mean": 0.00025008791126310823, "clip_ratio/low_mean": 8.976355893537402e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033985147019848226, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 537.998046875, "completions/min_length": 256.5, "epoch": 0.7371920219006648, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84375, "kl": 0.003008140902966261, "learning_rate": 5.114575700266024e-07, "loss": 0.00015597606543451547, "reward": 0.3089355528354645, "reward_std": 0.301028847694397, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3089355528354645, "rewards/QAReward/std": 0.4817429333925247, "step": 3770 }, { "clip_ratio/high_max": 0.00038312191609293225, "clip_ratio/high_mean": 0.00018101577879860997, "clip_ratio/low_mean": 5.2795093506574633e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023381087230518461, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 535.16015625, "completions/min_length": 253.0, "epoch": 0.7381697301525225, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.83203125, "kl": 0.0030629169661551712, "learning_rate": 5.078896771055121e-07, "loss": 0.00015642557991668582, "reward": 0.3608532249927521, "reward_std": 0.26010481516520184, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36085323492685956, "rewards/QAReward/std": 0.43692625562349957, "step": 3775 }, { "clip_ratio/high_max": 0.0005793328396975994, "clip_ratio/high_mean": 0.00023865411640144885, "clip_ratio/low_mean": 0.00010810178209794686, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034675589413382115, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1024.0, "completions/mean_length": 529.5546875, "completions/min_length": 234.0, "epoch": 0.7391474384043801, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8046875, "kl": 0.0030221065040677785, "learning_rate": 5.043317344805662e-07, "loss": 0.00012321248650550842, "reward": 0.39694398641586304, "reward_std": 0.27831771969795227, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39694398641586304, "rewards/QAReward/std": 0.42330051958560944, "step": 3780 }, { "clip_ratio/high_max": 0.000348194211255759, "clip_ratio/high_mean": 0.00016211403999477625, "clip_ratio/low_mean": 4.9572574789635836e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021168660605326296, "completions/clipped_ratio": 0.048177083333333336, "completions/max_length": 1024.0, "completions/mean_length": 533.8854166666666, "completions/min_length": 262.6666666666667, "epoch": 0.7401251466562377, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78515625, "kl": 0.0030282004736363888, "learning_rate": 5.007837778358845e-07, "loss": 0.00012390834745019675, "reward": 0.35629650950431824, "reward_std": 0.2946457862854004, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35629649957021076, "rewards/QAReward/std": 0.4421846965948741, "step": 3785 }, { "clip_ratio/high_max": 0.0005557505879551172, "clip_ratio/high_mean": 0.0002165295765735209, "clip_ratio/low_mean": 0.00012341981055215002, "clip_ratio/low_min": 3.8842523645143955e-05, "clip_ratio/region_mean": 0.00033994938130490484, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 547.68359375, "completions/min_length": 244.5, "epoch": 0.7411028549080955, "frac_reward_zero_std": 0.015625, "grad_norm": 0.76953125, "kl": 0.003001772752031684, "learning_rate": 4.97245842755432e-07, "loss": 0.0001335934968665242, "reward": 0.3453569710254669, "reward_std": 0.30735747516155243, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3453569561243057, "rewards/QAReward/std": 0.4773300141096115, "step": 3790 }, { "clip_ratio/high_max": 0.00026614338858053086, "clip_ratio/high_mean": 0.00012955950805917382, "clip_ratio/low_mean": 2.968660119222477e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015924610779620706, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 539.40234375, "completions/min_length": 247.66666666666666, "epoch": 0.7420805631599531, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.83984375, "kl": 0.002888229116797447, "learning_rate": 4.937179647226625e-07, "loss": 0.00014847617130726577, "reward": 0.3355349898338318, "reward_std": 0.277192085981369, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3355350196361542, "rewards/QAReward/std": 0.45509976148605347, "step": 3795 }, { "clip_ratio/high_max": 0.0003730277065187693, "clip_ratio/high_mean": 0.00022589772124774753, "clip_ratio/low_mean": 6.545604264829308e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029135376098565757, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 516.662109375, "completions/min_length": 256.0, "epoch": 0.7430582714118107, "frac_reward_zero_std": 0.046875, "grad_norm": 0.84765625, "kl": 0.0029250345658510925, "learning_rate": 4.90200179120166e-07, "loss": 0.00016163485124707223, "reward": 0.3665395677089691, "reward_std": 0.26884618401527405, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3665395677089691, "rewards/QAReward/std": 0.4439021348953247, "step": 3800 }, { "clip_ratio/high_max": 0.00024369853781536222, "clip_ratio/high_mean": 0.00014288516249507665, "clip_ratio/low_mean": 5.004948179703206e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019293464720249176, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 517.2604166666666, "completions/min_length": 235.33333333333334, "epoch": 0.7440359796636684, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.84375, "kl": 0.0029630635399371386, "learning_rate": 4.866925212293088e-07, "loss": 0.00014990178169682622, "reward": 0.34748263160387677, "reward_std": 0.2804584801197052, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34748263160387677, "rewards/QAReward/std": 0.44035446643829346, "step": 3805 }, { "clip_ratio/high_max": 0.0005435169208794832, "clip_ratio/high_mean": 0.0002788115118164569, "clip_ratio/low_mean": 5.781756335636601e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003366290708072484, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 516.205078125, "completions/min_length": 261.5, "epoch": 0.745013687915526, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83203125, "kl": 0.0030812454875558614, "learning_rate": 4.83195026229883e-07, "loss": 9.680163348093628e-05, "reward": 0.41372379660606384, "reward_std": 0.2604994624853134, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41372379660606384, "rewards/QAReward/std": 0.42941200733184814, "step": 3810 }, { "clip_ratio/high_max": 0.00036596617428585887, "clip_ratio/high_mean": 0.00016488701221533119, "clip_ratio/low_mean": 4.649255570257083e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021137957228347659, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 541.05078125, "completions/min_length": 286.3333333333333, "epoch": 0.7459913961673836, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.0029355532489717005, "learning_rate": 4.79707729199752e-07, "loss": 0.00012609388213604688, "reward": 0.4013431469599406, "reward_std": 0.25422446926434833, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40134313702583313, "rewards/QAReward/std": 0.4372304081916809, "step": 3815 }, { "clip_ratio/high_max": 0.0004589830059558153, "clip_ratio/high_mean": 0.00023012850433588028, "clip_ratio/low_mean": 6.562119160662405e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000295749690849334, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 521.53125, "completions/min_length": 273.0, "epoch": 0.7469691044192412, "frac_reward_zero_std": 0.046875, "grad_norm": 0.80078125, "kl": 0.0029598075896501543, "learning_rate": 4.76230665114501e-07, "loss": 0.00012756073847413062, "reward": 0.3684062361717224, "reward_std": 0.25472138822078705, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3684062361717224, "rewards/QAReward/std": 0.46461567282676697, "step": 3820 }, { "clip_ratio/high_max": 0.00034197751665487885, "clip_ratio/high_mean": 0.00014439746737480164, "clip_ratio/low_mean": 4.098029457964003e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018537776777520775, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 525.1197916666666, "completions/min_length": 243.66666666666666, "epoch": 0.747946812671099, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8125, "kl": 0.003077561128884554, "learning_rate": 4.727638688470835e-07, "loss": 0.00015213986625894904, "reward": 0.36151887973149616, "reward_std": 0.2683639923731486, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36151887973149616, "rewards/QAReward/std": 0.4329140782356262, "step": 3825 }, { "clip_ratio/high_max": 0.000563970545772463, "clip_ratio/high_mean": 0.0002891867712605745, "clip_ratio/low_mean": 8.930380281526595e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037849057698622344, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 547.724609375, "completions/min_length": 260.0, "epoch": 0.7489245209229566, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.0028866488486528397, "learning_rate": 4.6930737516747267e-07, "loss": 8.499575778841972e-05, "reward": 0.36367394030094147, "reward_std": 0.27498385310173035, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3636739253997803, "rewards/QAReward/std": 0.43685972690582275, "step": 3830 }, { "clip_ratio/high_max": 0.0003333412809297442, "clip_ratio/high_mean": 0.0001799731864593923, "clip_ratio/low_mean": 6.469425570685417e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002446674392558634, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 538.8541666666666, "completions/min_length": 257.6666666666667, "epoch": 0.7499022291748142, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76171875, "kl": 0.0030306994449347258, "learning_rate": 4.6586121874231245e-07, "loss": 0.00010167228756472469, "reward": 0.3213396966457367, "reward_std": 0.26079322894414264, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3213396966457367, "rewards/QAReward/std": 0.44812701145807904, "step": 3835 }, { "clip_ratio/high_max": 0.0003910264233127236, "clip_ratio/high_mean": 0.0002113383961841464, "clip_ratio/low_mean": 6.110003232606687e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027243843069300053, "completions/clipped_ratio": 0.048828125, "completions/max_length": 1024.0, "completions/mean_length": 545.619140625, "completions/min_length": 244.5, "epoch": 0.7508799374266719, "frac_reward_zero_std": 0.015625, "grad_norm": 0.75, "kl": 0.0028371814638376235, "learning_rate": 4.624254341345715e-07, "loss": 0.0001097559928894043, "reward": 0.343691885471344, "reward_std": 0.28075607120990753, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.343691885471344, "rewards/QAReward/std": 0.4617431163787842, "step": 3840 }, { "clip_ratio/high_max": 0.0003744207206182182, "clip_ratio/high_mean": 0.00018713658791966737, "clip_ratio/low_mean": 3.676720371004194e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022390377707779408, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 524.75, "completions/min_length": 228.33333333333334, "epoch": 0.7518576456785295, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.859375, "kl": 0.0029950186144560575, "learning_rate": 4.590000558031935e-07, "loss": 0.00015560209285467864, "reward": 0.361660361289978, "reward_std": 0.2776484390099843, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3616603712240855, "rewards/QAReward/std": 0.44847307602564496, "step": 3845 }, { "clip_ratio/high_max": 0.0004593746270984411, "clip_ratio/high_mean": 0.00026693997206166386, "clip_ratio/low_mean": 6.531662220368161e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033225659281015395, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 540.865234375, "completions/min_length": 231.5, "epoch": 0.7528353539303871, "frac_reward_zero_std": 0.015625, "grad_norm": 0.80078125, "kl": 0.0028764622285962103, "learning_rate": 4.5558511810275393e-07, "loss": 0.00010329738724976779, "reward": 0.33651646971702576, "reward_std": 0.2874341756105423, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33651645481586456, "rewards/QAReward/std": 0.44827666878700256, "step": 3850 }, { "clip_ratio/high_max": 0.00034837989369407296, "clip_ratio/high_mean": 0.00013953687739558517, "clip_ratio/low_mean": 7.014810689724982e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002096849784720689, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 553.2005208333334, "completions/min_length": 272.0, "epoch": 0.7538130621822449, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.80859375, "kl": 0.002865524124354124, "learning_rate": 4.521806552831138e-07, "loss": 9.458059212192893e-05, "reward": 0.36628226439158124, "reward_std": 0.2632755935192108, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36628226439158124, "rewards/QAReward/std": 0.42687876025835675, "step": 3855 }, { "clip_ratio/high_max": 0.0005978848785161972, "clip_ratio/high_mean": 0.00024664596421644094, "clip_ratio/low_mean": 6.644414970651269e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003130901139229536, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1024.0, "completions/mean_length": 537.37109375, "completions/min_length": 237.5, "epoch": 0.7547907704341025, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.002850598143413663, "learning_rate": 4.4878670148907924e-07, "loss": 0.0001939435489475727, "reward": 0.37295620143413544, "reward_std": 0.2675887942314148, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37295618653297424, "rewards/QAReward/std": 0.4374929815530777, "step": 3860 }, { "clip_ratio/high_max": 0.0003838145872578025, "clip_ratio/high_mean": 0.00020212007220834493, "clip_ratio/low_mean": 4.064262175234035e-05, "clip_ratio/low_min": 1.7204301548190416e-05, "clip_ratio/region_mean": 0.00024276268668472768, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 541.65625, "completions/min_length": 246.66666666666666, "epoch": 0.7557684786859601, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7890625, "kl": 0.00285688741132617, "learning_rate": 4.454032907600547e-07, "loss": 0.0001149325631558895, "reward": 0.3422272900740306, "reward_std": 0.27477702498435974, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3422272950410843, "rewards/QAReward/std": 0.4513181944688161, "step": 3865 }, { "clip_ratio/high_max": 0.0003813820076175034, "clip_ratio/high_mean": 0.00019942463841289283, "clip_ratio/low_mean": 7.487161929020658e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002742962504271418, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 534.10546875, "completions/min_length": 220.5, "epoch": 0.7567461869378177, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76171875, "kl": 0.003068583644926548, "learning_rate": 4.420304570297047e-07, "loss": 0.00014521160628646612, "reward": 0.3145538866519928, "reward_std": 0.2753191590309143, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3145538866519928, "rewards/QAReward/std": 0.44087621569633484, "step": 3870 }, { "clip_ratio/high_max": 0.000438674702309072, "clip_ratio/high_mean": 0.00022183000110089778, "clip_ratio/low_mean": 3.676098713185638e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002585910027846694, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 541.0182291666666, "completions/min_length": 258.0, "epoch": 0.7577238951896754, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.0029606206342577936, "learning_rate": 4.386682341256119e-07, "loss": 0.000173790636472404, "reward": 0.3785435160001119, "reward_std": 0.30240726470947266, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37854352593421936, "rewards/QAReward/std": 0.44806090990702313, "step": 3875 }, { "clip_ratio/high_max": 0.0006529089994728565, "clip_ratio/high_mean": 0.00026674219989217816, "clip_ratio/low_mean": 4.998942094971426e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031673162011429666, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 517.1015625, "completions/min_length": 219.5, "epoch": 0.758701603441533, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8046875, "kl": 0.003114236006513238, "learning_rate": 4.353166557689401e-07, "loss": 0.00010818776208907366, "reward": 0.39506369829177856, "reward_std": 0.2641453295946121, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39506369829177856, "rewards/QAReward/std": 0.4090758115053177, "step": 3880 }, { "clip_ratio/high_max": 0.0003340858966112137, "clip_ratio/high_mean": 0.00019570805015973748, "clip_ratio/low_mean": 3.005364560522139e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002257616841234267, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 536.26171875, "completions/min_length": 256.3333333333333, "epoch": 0.7596793116933906, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8125, "kl": 0.0029337367974221706, "learning_rate": 4.319757555740925e-07, "loss": 9.009492350742221e-05, "reward": 0.3765196204185486, "reward_std": 0.28952134648958844, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37651963035265607, "rewards/QAReward/std": 0.4410744110743205, "step": 3885 }, { "clip_ratio/high_max": 0.000613125052768737, "clip_ratio/high_mean": 0.0003109941957518458, "clip_ratio/low_mean": 7.493985467590391e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003859340562485158, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 508.900390625, "completions/min_length": 268.5, "epoch": 0.7606570199452484, "frac_reward_zero_std": 0.015625, "grad_norm": 0.859375, "kl": 0.0030217316467314958, "learning_rate": 4.2864556704837715e-07, "loss": 0.00014531961642205715, "reward": 0.36231397092342377, "reward_std": 0.2768574133515358, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36231401562690735, "rewards/QAReward/std": 0.43731725215911865, "step": 3890 }, { "clip_ratio/high_max": 0.00034838058054447173, "clip_ratio/high_mean": 0.0002212016610428691, "clip_ratio/low_mean": 5.3282974113244565e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027448462788015604, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.9986979166666, "completions/min_length": 251.33333333333334, "epoch": 0.761634728197106, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.002961681317538023, "learning_rate": 4.25326123591671e-07, "loss": 8.86454712599516e-05, "reward": 0.36432499686876935, "reward_std": 0.28430697321891785, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36432499686876935, "rewards/QAReward/std": 0.4260869522889455, "step": 3895 }, { "clip_ratio/high_max": 0.00041793645359575746, "clip_ratio/high_mean": 0.00023168109473772346, "clip_ratio/low_mean": 4.7299102880060674e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027898019179701806, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 533.771484375, "completions/min_length": 237.5, "epoch": 0.7626124364489636, "frac_reward_zero_std": 0.015625, "grad_norm": 0.80078125, "kl": 0.0029040826484560966, "learning_rate": 4.2201745849608264e-07, "loss": 0.0001433312427252531, "reward": 0.3214712142944336, "reward_std": 0.2786334156990051, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3214711993932724, "rewards/QAReward/std": 0.44557125866413116, "step": 3900 }, { "clip_ratio/high_max": 0.0002617456135340035, "clip_ratio/high_mean": 0.00014293087879195808, "clip_ratio/low_mean": 3.339738759677857e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017632826347835362, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 527.73828125, "completions/min_length": 239.0, "epoch": 0.7635901447008213, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.77734375, "kl": 0.0029548311606049536, "learning_rate": 4.187196049456218e-07, "loss": 7.43487908039242e-05, "reward": 0.3014835963646571, "reward_std": 0.28206655383110046, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3014836013317108, "rewards/QAReward/std": 0.4553208152453105, "step": 3905 }, { "clip_ratio/high_max": 0.0005132897407747805, "clip_ratio/high_mean": 0.00029595273663289844, "clip_ratio/low_mean": 6.489329098258168e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036084603052586315, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 547.330078125, "completions/min_length": 250.5, "epoch": 0.7645678529526789, "frac_reward_zero_std": 0.015625, "grad_norm": 0.828125, "kl": 0.0030011995229870083, "learning_rate": 4.1543259601586286e-07, "loss": 3.476183337625116e-05, "reward": 0.3425528258085251, "reward_std": 0.2961442619562149, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3425528407096863, "rewards/QAReward/std": 0.49132516980171204, "step": 3910 }, { "clip_ratio/high_max": 0.00037877506110817194, "clip_ratio/high_mean": 0.000154419761383906, "clip_ratio/low_mean": 6.267970311455429e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002170994644984603, "completions/clipped_ratio": 0.028645833333333332, "completions/max_length": 1024.0, "completions/mean_length": 537.9674479166666, "completions/min_length": 247.66666666666666, "epoch": 0.7655455612045365, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.0029655661899596454, "learning_rate": 4.1215646467361517e-07, "loss": 0.00016788742505013943, "reward": 0.3477663000424703, "reward_std": 0.2669448256492615, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3477663000424703, "rewards/QAReward/std": 0.41803987820943195, "step": 3915 }, { "clip_ratio/high_max": 0.00046394842211157085, "clip_ratio/high_mean": 0.0002636984339915216, "clip_ratio/low_mean": 9.666536643635482e-05, "clip_ratio/low_min": 2.1588946401607247e-05, "clip_ratio/region_mean": 0.00036036379169672727, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 524.048828125, "completions/min_length": 245.0, "epoch": 0.7665232694563943, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.002988777169957757, "learning_rate": 4.088912437765936e-07, "loss": 0.0001564395846799016, "reward": 0.3175843209028244, "reward_std": 0.2655637115240097, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3175843209028244, "rewards/QAReward/std": 0.45128293335437775, "step": 3920 }, { "clip_ratio/high_max": 0.000351360731292516, "clip_ratio/high_mean": 0.0002031116106081754, "clip_ratio/low_mean": 6.159746553748846e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002647090936079621, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 533.48046875, "completions/min_length": 234.33333333333334, "epoch": 0.7675009777082519, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8203125, "kl": 0.002997124195098877, "learning_rate": 4.0563696607308606e-07, "loss": 9.126407094299794e-05, "reward": 0.37328921755154926, "reward_std": 0.2694497307141622, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37328922748565674, "rewards/QAReward/std": 0.4188243548075358, "step": 3925 }, { "clip_ratio/high_max": 0.0004895318183116615, "clip_ratio/high_mean": 0.0002646238193847239, "clip_ratio/low_mean": 8.425060368608684e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034887443180195985, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/mean_length": 518.623046875, "completions/min_length": 249.5, "epoch": 0.7684786859601095, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.002984962472692132, "learning_rate": 4.0239366420162655e-07, "loss": 0.00020144982263445855, "reward": 0.41119326651096344, "reward_std": 0.2616083323955536, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41119326651096344, "rewards/QAReward/std": 0.4422578066587448, "step": 3930 }, { "clip_ratio/high_max": 0.00025179149815812707, "clip_ratio/high_mean": 0.00015987305087037385, "clip_ratio/low_mean": 4.1746514034457506e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020161956781521438, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 531.60546875, "completions/min_length": 256.6666666666667, "epoch": 0.7694563942119671, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.7890625, "kl": 0.0030054125003516675, "learning_rate": 3.991613706906682e-07, "loss": 0.00010542487725615502, "reward": 0.37704405188560486, "reward_std": 0.26546143492062885, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37704405188560486, "rewards/QAReward/std": 0.40924322605133057, "step": 3935 }, { "clip_ratio/high_max": 0.0004940703278407454, "clip_ratio/high_mean": 0.0002381970582064241, "clip_ratio/low_mean": 3.39053753123153e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027210243279114367, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 540.001953125, "completions/min_length": 270.5, "epoch": 0.7704341024638248, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78125, "kl": 0.003030297579243779, "learning_rate": 3.9594011795825715e-07, "loss": 0.00010142514947801829, "reward": 0.32902252674102783, "reward_std": 0.28010064363479614, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32902252674102783, "rewards/QAReward/std": 0.4488862454891205, "step": 3940 }, { "clip_ratio/high_max": 0.00024820883991196754, "clip_ratio/high_mean": 0.0001574174268171191, "clip_ratio/low_mean": 4.200344847049564e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019942087819799781, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 518.3619791666666, "completions/min_length": 241.66666666666666, "epoch": 0.7714118107156824, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.81640625, "kl": 0.00303410729393363, "learning_rate": 3.9272993831170594e-07, "loss": 9.759076638147235e-05, "reward": 0.32652995983759564, "reward_std": 0.2829027275244395, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32652994990348816, "rewards/QAReward/std": 0.4192531108856201, "step": 3945 }, { "clip_ratio/high_max": 0.0005869194865226746, "clip_ratio/high_mean": 0.0002291334036272019, "clip_ratio/low_mean": 6.728487060172483e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029641828150488435, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 534.443359375, "completions/min_length": 247.0, "epoch": 0.77238951896754, "frac_reward_zero_std": 0.03125, "grad_norm": 0.79296875, "kl": 0.0029911546036601065, "learning_rate": 3.895308639472705e-07, "loss": 0.00018623368814587593, "reward": 0.36369822919368744, "reward_std": 0.27063292264938354, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36369824409484863, "rewards/QAReward/std": 0.462619885802269, "step": 3950 }, { "clip_ratio/high_max": 0.00030057246331125496, "clip_ratio/high_mean": 0.00016461629420518875, "clip_ratio/low_mean": 4.242693685228005e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002070432179607451, "completions/clipped_ratio": 0.041666666666666664, "completions/max_length": 1024.0, "completions/mean_length": 555.9036458333334, "completions/min_length": 225.66666666666666, "epoch": 0.7733672272193978, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.828125, "kl": 0.002815092960372567, "learning_rate": 3.863429269498282e-07, "loss": 0.00017885698471218348, "reward": 0.30608029663562775, "reward_std": 0.28606779376665753, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30608029663562775, "rewards/QAReward/std": 0.4185103277365367, "step": 3955 }, { "clip_ratio/high_max": 0.00046572558348998425, "clip_ratio/high_mean": 0.000256881135283038, "clip_ratio/low_mean": 5.630454688798636e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031318567343987527, "completions/clipped_ratio": 0.0546875, "completions/max_length": 1024.0, "completions/mean_length": 545.33984375, "completions/min_length": 259.0, "epoch": 0.7743449354712554, "frac_reward_zero_std": 0.046875, "grad_norm": 0.78515625, "kl": 0.002825916511937976, "learning_rate": 3.8316615929255385e-07, "loss": 0.00017615564865991473, "reward": 0.38824406266212463, "reward_std": 0.2818314731121063, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38824406266212463, "rewards/QAReward/std": 0.4384719580411911, "step": 3960 }, { "clip_ratio/high_max": 0.00028391553787514566, "clip_ratio/high_mean": 0.0001423526438884437, "clip_ratio/low_mean": 3.373672079760581e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017608937341719865, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 997.0, "completions/mean_length": 540.0091145833334, "completions/min_length": 250.33333333333334, "epoch": 0.775322643723113, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78515625, "kl": 0.002895719837397337, "learning_rate": 3.800005928366006e-07, "loss": 0.00016394510166719555, "reward": 0.4166313409805298, "reward_std": 0.25471608340740204, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41663135091463727, "rewards/QAReward/std": 0.43385130167007446, "step": 3965 }, { "clip_ratio/high_max": 0.0004964836174622178, "clip_ratio/high_mean": 0.00022108369739726186, "clip_ratio/low_mean": 5.1036504737567155e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002721201861277223, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 531.08203125, "completions/min_length": 233.0, "epoch": 0.7763003519749707, "frac_reward_zero_std": 0.03125, "grad_norm": 0.765625, "kl": 0.0029322295915335415, "learning_rate": 3.768462593307798e-07, "loss": 0.00014706337824463845, "reward": 0.38963836431503296, "reward_std": 0.29475264251232147, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38963837921619415, "rewards/QAReward/std": 0.4301145225763321, "step": 3970 }, { "clip_ratio/high_max": 0.000329832686111331, "clip_ratio/high_mean": 0.0001431265438441187, "clip_ratio/low_mean": 5.400196532718837e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019712852081283928, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 525.4322916666666, "completions/min_length": 241.66666666666666, "epoch": 0.7772780602268283, "frac_reward_zero_std": 0.03125, "grad_norm": 0.86328125, "kl": 0.0030398543924093245, "learning_rate": 3.737031904112437e-07, "loss": 0.0001194927841424942, "reward": 0.36251466969649, "reward_std": 0.26391489307085675, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3625146597623825, "rewards/QAReward/std": 0.3999740580717723, "step": 3975 }, { "clip_ratio/high_max": 0.0006142482976429165, "clip_ratio/high_mean": 0.00030455029336735607, "clip_ratio/low_mean": 4.7746585187269376e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003522968734614551, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 532.57421875, "completions/min_length": 221.0, "epoch": 0.7782557684786859, "frac_reward_zero_std": 0.015625, "grad_norm": 0.78515625, "kl": 0.003089840617030859, "learning_rate": 3.7057141760116593e-07, "loss": 5.623050965368748e-05, "reward": 0.3959023356437683, "reward_std": 0.2812371999025345, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3959023058414459, "rewards/QAReward/std": 0.42288820445537567, "step": 3980 }, { "clip_ratio/high_max": 0.00028884834609925746, "clip_ratio/high_mean": 0.00015261351363733412, "clip_ratio/low_mean": 4.954342148266733e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020215693512000145, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 536.84765625, "completions/min_length": 237.66666666666666, "epoch": 0.7792334767305436, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.80859375, "kl": 0.0030028429813683034, "learning_rate": 3.674509723104275e-07, "loss": 0.00017535868100821973, "reward": 0.32635172208150226, "reward_std": 0.2714060842990875, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3263517419497172, "rewards/QAReward/std": 0.45075011253356934, "step": 3985 }, { "clip_ratio/high_max": 0.0004368398222140968, "clip_ratio/high_mean": 0.00021522196475416423, "clip_ratio/low_mean": 6.980928446864709e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000285031262319535, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 539.34375, "completions/min_length": 239.5, "epoch": 0.7802111849824013, "frac_reward_zero_std": 0.015625, "grad_norm": 0.81640625, "kl": 0.0029273601714521645, "learning_rate": 3.6434188583530006e-07, "loss": 8.401431841775775e-06, "reward": 0.30968891084194183, "reward_std": 0.2734292298555374, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30968888103961945, "rewards/QAReward/std": 0.44033409655094147, "step": 3990 }, { "clip_ratio/high_max": 0.00028153988532722, "clip_ratio/high_mean": 0.00017582657164894044, "clip_ratio/low_mean": 4.4849164260085673e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022067573154345156, "completions/clipped_ratio": 0.032552083333333336, "completions/max_length": 1024.0, "completions/mean_length": 543.10546875, "completions/min_length": 252.66666666666666, "epoch": 0.7811888932342589, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.83984375, "kl": 0.0029054704122245313, "learning_rate": 3.6124418935813434e-07, "loss": 0.00014677937142550945, "reward": 0.3398648003737132, "reward_std": 0.29273950060208637, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3398647904396057, "rewards/QAReward/std": 0.41786662737528485, "step": 3995 }, { "clip_ratio/high_max": 0.0003782927931752056, "clip_ratio/high_mean": 0.00022469384130090474, "clip_ratio/low_mean": 4.71541796287056e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027184803038835523, "completions/clipped_ratio": 0.021484375, "completions/max_length": 1024.0, "completions/mean_length": 529.923828125, "completions/min_length": 226.5, "epoch": 0.7821666014861165, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8203125, "kl": 0.003035546187311411, "learning_rate": 3.5815791394704475e-07, "loss": 0.00014755204319953917, "reward": 0.3745280057191849, "reward_std": 0.29438483715057373, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37452802062034607, "rewards/QAReward/std": 0.44121186435222626, "step": 4000 }, { "clip_ratio/high_max": 0.0003604173893108964, "clip_ratio/high_mean": 0.00017599738202989101, "clip_ratio/low_mean": 2.897631929954514e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020497370278462767, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 535.6510416666666, "completions/min_length": 244.33333333333334, "epoch": 0.7831443097379742, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8203125, "kl": 0.0029963834676891565, "learning_rate": 3.5508309055559915e-07, "loss": 0.00018358319066464902, "reward": 0.3250542680422465, "reward_std": 0.28131826718648273, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32505423823992413, "rewards/QAReward/std": 0.4427565932273865, "step": 4005 }, { "clip_ratio/high_max": 0.0005165286944247782, "clip_ratio/high_mean": 0.0002521167742088437, "clip_ratio/low_mean": 9.85204940661788e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035063727991655467, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 509.203125, "completions/min_length": 240.0, "epoch": 0.7841220179898318, "frac_reward_zero_std": 0.109375, "grad_norm": 0.81640625, "kl": 0.0030206031631678345, "learning_rate": 3.5201975002250783e-07, "loss": 0.0001704345690086484, "reward": 0.38107773661613464, "reward_std": 0.27157849073410034, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38107772171497345, "rewards/QAReward/std": 0.47911909222602844, "step": 4010 }, { "clip_ratio/high_max": 0.00037034942070022223, "clip_ratio/high_mean": 0.00019477703608572483, "clip_ratio/low_mean": 5.533230141736567e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002501093316823244, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 496.6080729166667, "completions/min_length": 230.66666666666666, "epoch": 0.7850997262416894, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.85546875, "kl": 0.0031572130508720877, "learning_rate": 3.4896792307131617e-07, "loss": 0.00016864127246662974, "reward": 0.37278129657109577, "reward_std": 0.26229830582936603, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37278130650520325, "rewards/QAReward/std": 0.43927620848019916, "step": 4015 }, { "clip_ratio/high_max": 0.0003987884731031954, "clip_ratio/high_mean": 0.00021105015184730291, "clip_ratio/low_mean": 6.358300015563145e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000274633162189275, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 516.490234375, "completions/min_length": 237.0, "epoch": 0.7860774344935472, "frac_reward_zero_std": 0.078125, "grad_norm": 0.75, "kl": 0.003016823949292302, "learning_rate": 3.459276403100933e-07, "loss": 0.00013324797619134187, "reward": 0.3737165182828903, "reward_std": 0.2666563540697098, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3737165182828903, "rewards/QAReward/std": 0.431633323431015, "step": 4020 }, { "clip_ratio/high_max": 0.00034030391834676265, "clip_ratio/high_mean": 0.00020424412214197217, "clip_ratio/low_mean": 3.3808840089477596e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023805294767953455, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 532.9869791666666, "completions/min_length": 252.0, "epoch": 0.7870551427454048, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.78125, "kl": 0.002940852986648679, "learning_rate": 3.428989322311274e-07, "loss": 0.00015491603408008815, "reward": 0.3619476358095805, "reward_std": 0.2951982418696086, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36194764574368793, "rewards/QAReward/std": 0.4230739672978719, "step": 4025 }, { "clip_ratio/high_max": 0.00042489934712648393, "clip_ratio/high_mean": 0.00027365551213733854, "clip_ratio/low_mean": 7.8756915172562e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035241242730990054, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 545.84375, "completions/min_length": 250.5, "epoch": 0.7880328509972624, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80859375, "kl": 0.0028859309386461973, "learning_rate": 3.398818292106189e-07, "loss": 0.0001441033324226737, "reward": 0.3619598001241684, "reward_std": 0.27797937393188477, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3619598001241684, "rewards/QAReward/std": 0.4442838728427887, "step": 4030 }, { "clip_ratio/high_max": 0.0003189053968526423, "clip_ratio/high_mean": 0.00016432227566838266, "clip_ratio/low_mean": 4.658054531319067e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021090280497446657, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 512.3046875, "completions/min_length": 225.0, "epoch": 0.7890105592491201, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8515625, "kl": 0.0030380739364773033, "learning_rate": 3.3687636150837723e-07, "loss": 0.00020193916279822587, "reward": 0.41221168637275696, "reward_std": 0.27617127696673077, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4122116764386495, "rewards/QAReward/std": 0.4364592730998993, "step": 4035 }, { "clip_ratio/high_max": 0.0004400754114612937, "clip_ratio/high_mean": 0.00020050881430506707, "clip_ratio/low_mean": 6.695382471662014e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026746264193207027, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 537.212890625, "completions/min_length": 231.5, "epoch": 0.7899882675009777, "frac_reward_zero_std": 0.046875, "grad_norm": 0.75, "kl": 0.002992244018241763, "learning_rate": 3.338825592675152e-07, "loss": 0.0001295844791457057, "reward": 0.40799930691719055, "reward_std": 0.2669537290930748, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40799930691719055, "rewards/QAReward/std": 0.4254425913095474, "step": 4040 }, { "clip_ratio/high_max": 0.0002583888825029135, "clip_ratio/high_mean": 0.00017432663589715957, "clip_ratio/low_mean": 4.176172951702029e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002160883625037968, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 951.3333333333334, "completions/mean_length": 532.1484375, "completions/min_length": 275.6666666666667, "epoch": 0.7909659757528353, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.80859375, "kl": 0.003037018049508333, "learning_rate": 3.3090045251414786e-07, "loss": 0.00015721230302006005, "reward": 0.3440321385860443, "reward_std": 0.2871328989664714, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3440321187178294, "rewards/QAReward/std": 0.4189276397228241, "step": 4045 }, { "clip_ratio/high_max": 0.0004104935680516064, "clip_ratio/high_mean": 0.00022239423706196248, "clip_ratio/low_mean": 8.023116970434785e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030262540094554423, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 512.8984375, "completions/min_length": 237.0, "epoch": 0.791943684004693, "frac_reward_zero_std": 0.015625, "grad_norm": 0.84765625, "kl": 0.0029847157653421163, "learning_rate": 3.279300711570911e-07, "loss": 0.0001639614929445088, "reward": 0.4448016881942749, "reward_std": 0.26998208463191986, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4448017030954361, "rewards/QAReward/std": 0.39799147844314575, "step": 4050 }, { "clip_ratio/high_max": 0.000285412825178355, "clip_ratio/high_mean": 0.00014847639831714333, "clip_ratio/low_mean": 4.398046876303851e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019245686708018183, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 542.4427083333334, "completions/min_length": 232.33333333333334, "epoch": 0.7929213922565507, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.0029466962441802025, "learning_rate": 3.249714449875631e-07, "loss": 0.00020330999977886678, "reward": 0.317832350730896, "reward_std": 0.3124580184618632, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.317832350730896, "rewards/QAReward/std": 0.4679554800192515, "step": 4055 }, { "clip_ratio/high_max": 0.0004992147441953421, "clip_ratio/high_mean": 0.0002412359695881605, "clip_ratio/low_mean": 6.973563431529328e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031097158789634706, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 542.173828125, "completions/min_length": 250.5, "epoch": 0.7938991005084083, "frac_reward_zero_std": 0.015625, "grad_norm": 0.87109375, "kl": 0.00289596295915544, "learning_rate": 3.220246036788829e-07, "loss": 9.923804318532348e-05, "reward": 0.38652342557907104, "reward_std": 0.30188795924186707, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38652344048023224, "rewards/QAReward/std": 0.4007894694805145, "step": 4060 }, { "clip_ratio/high_max": 0.00033721764339134097, "clip_ratio/high_mean": 0.00018327589496038854, "clip_ratio/low_mean": 7.26764410501346e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002559523331001401, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 530.1145833333334, "completions/min_length": 261.3333333333333, "epoch": 0.7948768087602659, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.78125, "kl": 0.0029490959364920854, "learning_rate": 3.190895767861746e-07, "loss": 0.00014459113590419291, "reward": 0.379800150791804, "reward_std": 0.2801143129666646, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37980014085769653, "rewards/QAReward/std": 0.43357178568840027, "step": 4065 }, { "clip_ratio/high_max": 0.0005133481463417411, "clip_ratio/high_mean": 0.00020891036838293074, "clip_ratio/low_mean": 5.6503506493754684e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002654138777870685, "completions/clipped_ratio": 0.0390625, "completions/max_length": 1024.0, "completions/mean_length": 548.388671875, "completions/min_length": 249.0, "epoch": 0.7958545170121236, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.002926886128261685, "learning_rate": 3.1616639374607e-07, "loss": 0.0001040544593706727, "reward": 0.42271748185157776, "reward_std": 0.2503608912229538, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.42271748185157776, "rewards/QAReward/std": 0.38497014343738556, "step": 4070 }, { "clip_ratio/high_max": 0.0002821515663526952, "clip_ratio/high_mean": 0.00015792839694768191, "clip_ratio/low_mean": 3.564185171853751e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019357025157660246, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 536.5481770833334, "completions/min_length": 221.66666666666666, "epoch": 0.7968322252639812, "frac_reward_zero_std": 0.03125, "grad_norm": 0.83984375, "kl": 0.003000747552141547, "learning_rate": 3.1325508387641514e-07, "loss": 0.00010246917372569442, "reward": 0.4125756323337555, "reward_std": 0.2552257974942525, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.412575622399648, "rewards/QAReward/std": 0.4291192392508189, "step": 4075 }, { "clip_ratio/high_max": 0.000498860829975456, "clip_ratio/high_mean": 0.0002633398864418268, "clip_ratio/low_mean": 8.33298807265237e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003466697642579675, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1024.0, "completions/mean_length": 529.703125, "completions/min_length": 230.5, "epoch": 0.7978099335158388, "frac_reward_zero_std": 0.046875, "grad_norm": 0.7578125, "kl": 0.002939002029597759, "learning_rate": 3.1035567637597407e-07, "loss": 0.00022445085924118757, "reward": 0.36644347012043, "reward_std": 0.28405746817588806, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3664434403181076, "rewards/QAReward/std": 0.4401705861091614, "step": 4080 }, { "clip_ratio/high_max": 0.0002713432186283171, "clip_ratio/high_mean": 0.00014939727261662484, "clip_ratio/low_mean": 5.132190854055807e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002007191884331405, "completions/clipped_ratio": 0.045572916666666664, "completions/max_length": 1024.0, "completions/mean_length": 541.8671875, "completions/min_length": 261.3333333333333, "epoch": 0.7987876417676966, "frac_reward_zero_std": 0.03125, "grad_norm": 0.828125, "kl": 0.0029120282270014286, "learning_rate": 3.0746820032413624e-07, "loss": 0.0001647235592827201, "reward": 0.32852956652641296, "reward_std": 0.28499292333920795, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.32852956652641296, "rewards/QAReward/std": 0.4364874263604482, "step": 4085 }, { "clip_ratio/high_max": 0.00048713695723563434, "clip_ratio/high_mean": 0.00022763778688386082, "clip_ratio/low_mean": 6.98972085956484e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029753500130027535, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 517.736328125, "completions/min_length": 245.0, "epoch": 0.7997653500195542, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8203125, "kl": 0.003052777238190174, "learning_rate": 3.0459268468062766e-07, "loss": 0.0001077226479537785, "reward": 0.2998310476541519, "reward_std": 0.27840742468833923, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2998310327529907, "rewards/QAReward/std": 0.46888479590415955, "step": 4090 }, { "clip_ratio/high_max": 0.0003414836246520281, "clip_ratio/high_mean": 0.00013130552833899855, "clip_ratio/low_mean": 3.919865703210235e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001705041853711009, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 534.91796875, "completions/min_length": 237.0, "epoch": 0.8007430582714118, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.84765625, "kl": 0.002944136504083872, "learning_rate": 3.017291582852162e-07, "loss": 9.160153567790985e-05, "reward": 0.43831173578898114, "reward_std": 0.26541847983996075, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.43831172585487366, "rewards/QAReward/std": 0.427385816971461, "step": 4095 }, { "clip_ratio/high_max": 0.0005568042164668441, "clip_ratio/high_mean": 0.00026432760641910137, "clip_ratio/low_mean": 8.234723791247233e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034667483996599915, "completions/clipped_ratio": 0.04296875, "completions/max_length": 1024.0, "completions/mean_length": 533.392578125, "completions/min_length": 244.0, "epoch": 0.8017207665232694, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8125, "kl": 0.0029872581362724304, "learning_rate": 2.9887764985742523e-07, "loss": 0.00014791518915444613, "reward": 0.3768756240606308, "reward_std": 0.27985601127147675, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3768756091594696, "rewards/QAReward/std": 0.4324638992547989, "step": 4100 }, { "clip_ratio/high_max": 0.0002920527011156082, "clip_ratio/high_mean": 0.00017525231814943253, "clip_ratio/low_mean": 6.279672379605472e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023804904194548726, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 520.125, "completions/min_length": 248.33333333333334, "epoch": 0.8026984747751271, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.79296875, "kl": 0.002909116167575121, "learning_rate": 2.9603818799624525e-07, "loss": 8.477313676849007e-05, "reward": 0.4311363299687703, "reward_std": 0.28171368439992267, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4311363299687703, "rewards/QAReward/std": 0.42094801863034564, "step": 4105 }, { "clip_ratio/high_max": 0.0004909583018161357, "clip_ratio/high_mean": 0.00021505853510461748, "clip_ratio/low_mean": 4.575055936584249e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002608091046568006, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 532.7265625, "completions/min_length": 262.5, "epoch": 0.8036761830269847, "frac_reward_zero_std": 0.046875, "grad_norm": 0.8125, "kl": 0.0029552577529102565, "learning_rate": 2.9321080117984623e-07, "loss": 0.00019658345263451338, "reward": 0.3666217178106308, "reward_std": 0.2786809876561165, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3666217252612114, "rewards/QAReward/std": 0.43252380192279816, "step": 4110 }, { "clip_ratio/high_max": 0.00028318294789642096, "clip_ratio/high_mean": 0.00012845120509155094, "clip_ratio/low_mean": 3.506846842356026e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001635196735151112, "completions/clipped_ratio": 0.014322916666666666, "completions/max_length": 1024.0, "completions/mean_length": 530.7890625, "completions/min_length": 247.66666666666666, "epoch": 0.8046538912788423, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.79296875, "kl": 0.002963567851111293, "learning_rate": 2.903955177652918e-07, "loss": 0.00017233919352293013, "reward": 0.3294462164243062, "reward_std": 0.2739834090073903, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3294462064901988, "rewards/QAReward/std": 0.4415527284145355, "step": 4115 }, { "clip_ratio/high_max": 0.000514805200509727, "clip_ratio/high_mean": 0.00024133772822096944, "clip_ratio/low_mean": 6.893543468322605e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031027315417304636, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 514.705078125, "completions/min_length": 238.0, "epoch": 0.8056315995307001, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8046875, "kl": 0.0030107936821877957, "learning_rate": 2.8759236598825674e-07, "loss": 0.00012817134847864509, "reward": 0.4149342179298401, "reward_std": 0.25981827080249786, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4149342179298401, "rewards/QAReward/std": 0.4092281013727188, "step": 4120 }, { "clip_ratio/high_max": 0.0003324665711261332, "clip_ratio/high_mean": 0.0001513334340415895, "clip_ratio/low_mean": 2.5196719798259436e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017653015675023198, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 516.24609375, "completions/min_length": 249.33333333333334, "epoch": 0.8066093077825577, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.8359375, "kl": 0.0030256406404078006, "learning_rate": 2.848013739627412e-07, "loss": 8.15432402305305e-05, "reward": 0.34702278176943463, "reward_std": 0.25976569453875226, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34702278176943463, "rewards/QAReward/std": 0.453553169965744, "step": 4125 }, { "clip_ratio/high_max": 0.0005000879056751728, "clip_ratio/high_mean": 0.0002683416532818228, "clip_ratio/low_mean": 6.269298028200865e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033103462192229927, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 516.541015625, "completions/min_length": 237.5, "epoch": 0.8075870160344153, "frac_reward_zero_std": 0.046875, "grad_norm": 0.83984375, "kl": 0.00300090997479856, "learning_rate": 2.8202256968079107e-07, "loss": 0.00016647926531732082, "reward": 0.3746434599161148, "reward_std": 0.2591760456562042, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.374643474817276, "rewards/QAReward/std": 0.4073624461889267, "step": 4130 }, { "clip_ratio/high_max": 0.00031878291629254817, "clip_ratio/high_mean": 0.00015943469479680062, "clip_ratio/low_mean": 4.09030108130537e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020033769542351365, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 961.6666666666666, "completions/mean_length": 532.4453125, "completions/min_length": 250.0, "epoch": 0.808564724286273, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.76171875, "kl": 0.00300232176668942, "learning_rate": 2.7925598101221555e-07, "loss": 0.0001293341163545847, "reward": 0.3974004884560903, "reward_std": 0.25464072823524475, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3974004884560903, "rewards/QAReward/std": 0.42650359869003296, "step": 4135 }, { "clip_ratio/high_max": 0.000506678456440568, "clip_ratio/high_mean": 0.0002958337136078626, "clip_ratio/low_mean": 7.327212078962475e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003691058198455721, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/mean_length": 533.84765625, "completions/min_length": 247.5, "epoch": 0.8095424325381306, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0031014815904200077, "learning_rate": 2.765016357043082e-07, "loss": 0.00012693866156041623, "reward": 0.3197660893201828, "reward_std": 0.27271272242069244, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3197660893201828, "rewards/QAReward/std": 0.4508563280105591, "step": 4140 }, { "clip_ratio/high_max": 0.00033268099650740623, "clip_ratio/high_mean": 0.00015487325144931675, "clip_ratio/low_mean": 3.246673950343393e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018733999459072948, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 547.3033854166666, "completions/min_length": 257.6666666666667, "epoch": 0.8105201407899882, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.79296875, "kl": 0.002840585168451071, "learning_rate": 2.737595613815687e-07, "loss": 0.00018038197886198758, "reward": 0.4386075437068939, "reward_std": 0.27096131443977356, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.43860751390457153, "rewards/QAReward/std": 0.41971928874651593, "step": 4145 }, { "clip_ratio/high_max": 0.0005542860832065344, "clip_ratio/high_mean": 0.00025771725340746344, "clip_ratio/low_mean": 6.637630285695195e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003240935504436493, "completions/clipped_ratio": 0.025390625, "completions/max_length": 1024.0, "completions/mean_length": 539.73046875, "completions/min_length": 244.5, "epoch": 0.811497849041846, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7734375, "kl": 0.002981612691655755, "learning_rate": 2.7102978554542656e-07, "loss": 0.00012335686478763818, "reward": 0.3955078125, "reward_std": 0.26694829761981964, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3955078125, "rewards/QAReward/std": 0.4346303939819336, "step": 4150 }, { "clip_ratio/high_max": 0.0003723349771462381, "clip_ratio/high_mean": 0.00017096953233703972, "clip_ratio/low_mean": 5.790251598227769e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022887205705046654, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 541.9765625, "completions/min_length": 253.0, "epoch": 0.8124755572937036, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8203125, "kl": 0.0029248018749058247, "learning_rate": 2.683123355739637e-07, "loss": 9.084476623684168e-05, "reward": 0.3290829658508301, "reward_std": 0.2999354600906372, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3290829658508301, "rewards/QAReward/std": 0.48005032539367676, "step": 4155 }, { "clip_ratio/high_max": 0.00045257257297635076, "clip_ratio/high_mean": 0.000281363062094897, "clip_ratio/low_mean": 7.285667961696163e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003542197402566671, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 548.626953125, "completions/min_length": 236.5, "epoch": 0.8134532655455612, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.0028622816782444714, "learning_rate": 2.6560723872164115e-07, "loss": 0.00011252930853515864, "reward": 0.3540457636117935, "reward_std": 0.28010228276252747, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3540457785129547, "rewards/QAReward/std": 0.4369889944791794, "step": 4160 }, { "clip_ratio/high_max": 0.00029088967712596057, "clip_ratio/high_mean": 0.00014731244882568718, "clip_ratio/low_mean": 4.2544995085336265e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001898574410006404, "completions/clipped_ratio": 0.05078125, "completions/max_length": 1024.0, "completions/mean_length": 540.1419270833334, "completions/min_length": 245.66666666666666, "epoch": 0.8144309737974188, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.8125, "kl": 0.0028685441240668295, "learning_rate": 2.629145221190245e-07, "loss": 0.0001948013319633901, "reward": 0.35699353615442914, "reward_std": 0.28557536005973816, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35699353615442914, "rewards/QAReward/std": 0.4431333839893341, "step": 4165 }, { "clip_ratio/high_max": 0.0004432573681697249, "clip_ratio/high_mean": 0.00025122512015514077, "clip_ratio/low_mean": 6.773944478482008e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003189645707607269, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 520.38671875, "completions/min_length": 235.0, "epoch": 0.8154086820492765, "frac_reward_zero_std": 0.046875, "grad_norm": 0.83984375, "kl": 0.002994669275358319, "learning_rate": 2.6023421277251403e-07, "loss": 8.541709976270795e-05, "reward": 0.38937796652317047, "reward_std": 0.2754826247692108, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38937798142433167, "rewards/QAReward/std": 0.4522203803062439, "step": 4170 }, { "clip_ratio/high_max": 0.0003754174918867648, "clip_ratio/high_mean": 0.00016005007200874387, "clip_ratio/low_mean": 3.473746037343517e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019478752510622145, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 539.87890625, "completions/min_length": 249.33333333333334, "epoch": 0.8163863903011341, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.83984375, "kl": 0.0030131963081657886, "learning_rate": 2.575663375640709e-07, "loss": 7.861235644668341e-05, "reward": 0.29924354950586957, "reward_std": 0.2904610832532247, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29924354950586957, "rewards/QAReward/std": 0.4359846313794454, "step": 4175 }, { "clip_ratio/high_max": 0.0004027107497677207, "clip_ratio/high_mean": 0.00025083976797759535, "clip_ratio/low_mean": 7.330713706323876e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032414690940640867, "completions/clipped_ratio": 0.013671875, "completions/max_length": 1024.0, "completions/mean_length": 516.0625, "completions/min_length": 249.5, "epoch": 0.8173640985529917, "frac_reward_zero_std": 0.0625, "grad_norm": 0.82421875, "kl": 0.003023416455835104, "learning_rate": 2.5491092325095e-07, "loss": 6.342997949104756e-06, "reward": 0.4043552279472351, "reward_std": 0.27889372408390045, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4043552279472351, "rewards/QAReward/std": 0.41833943128585815, "step": 4180 }, { "clip_ratio/high_max": 0.00024549721274524927, "clip_ratio/high_mean": 0.00013572477037087082, "clip_ratio/low_mean": 5.2897262503393e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001886220241431147, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 507.7005208333333, "completions/min_length": 253.66666666666666, "epoch": 0.8183418068048495, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.84375, "kl": 0.0030334720388054846, "learning_rate": 2.522679964654294e-07, "loss": 0.0001138171530328691, "reward": 0.37755870819091797, "reward_std": 0.27316559354464215, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3775586982568105, "rewards/QAReward/std": 0.4182936946551005, "step": 4185 }, { "clip_ratio/high_max": 0.0005263890838250518, "clip_ratio/high_mean": 0.00026215628022328017, "clip_ratio/low_mean": 7.305935578187928e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003352156491018832, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 540.953125, "completions/min_length": 201.5, "epoch": 0.8193195150567071, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76953125, "kl": 0.0029802118428051473, "learning_rate": 2.496375837145464e-07, "loss": 0.00015604622894898056, "reward": 0.3350074291229248, "reward_std": 0.27245599031448364, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.335007444024086, "rewards/QAReward/std": 0.42996105551719666, "step": 4190 }, { "clip_ratio/high_max": 0.0002767068799585104, "clip_ratio/high_mean": 0.00014573908993043005, "clip_ratio/low_mean": 5.446277500595897e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002002018562052399, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 514.1184895833334, "completions/min_length": 248.33333333333334, "epoch": 0.8202972233085647, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.0030440663918852805, "learning_rate": 2.47019711379828e-07, "loss": 0.00015333506744354963, "reward": 0.35040561358133954, "reward_std": 0.2992500166098277, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35040561358133954, "rewards/QAReward/std": 0.45419494311014813, "step": 4195 }, { "clip_ratio/high_max": 0.0005608421750366687, "clip_ratio/high_mean": 0.00025666001019999387, "clip_ratio/low_mean": 7.056884933263064e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003272288711741567, "completions/clipped_ratio": 0.02734375, "completions/max_length": 1024.0, "completions/mean_length": 530.966796875, "completions/min_length": 236.0, "epoch": 0.8212749315604224, "frac_reward_zero_std": 0.015625, "grad_norm": 0.7890625, "kl": 0.003074724553152919, "learning_rate": 2.444144057170287e-07, "loss": 5.890407483093441e-05, "reward": 0.39443978667259216, "reward_std": 0.2891537845134735, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39443978667259216, "rewards/QAReward/std": 0.4448383301496506, "step": 4200 }, { "clip_ratio/high_max": 0.0002845607348717749, "clip_ratio/high_mean": 0.00015813364298082887, "clip_ratio/low_mean": 3.6096471012569965e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001942301110830158, "completions/clipped_ratio": 0.01953125, "completions/max_length": 994.6666666666666, "completions/mean_length": 540.5872395833334, "completions/min_length": 237.33333333333334, "epoch": 0.82225263981228, "frac_reward_zero_std": 0.03125, "grad_norm": 0.73828125, "kl": 0.002989676361903548, "learning_rate": 2.41821692855866e-07, "loss": 0.00017001967644318937, "reward": 0.34301990270614624, "reward_std": 0.27035050590833026, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34301990767319995, "rewards/QAReward/std": 0.4465744396050771, "step": 4205 }, { "clip_ratio/high_max": 0.0005474612931720913, "clip_ratio/high_mean": 0.0002486381330527365, "clip_ratio/low_mean": 7.197188824648038e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003206100198440254, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 526.154296875, "completions/min_length": 285.5, "epoch": 0.8232303480641376, "frac_reward_zero_std": 0.03125, "grad_norm": 0.84375, "kl": 0.0029737478587776424, "learning_rate": 2.3924159879976e-07, "loss": 9.162118658423424e-05, "reward": 0.34228672087192535, "reward_std": 0.27374157309532166, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.34228672087192535, "rewards/QAReward/std": 0.44217656552791595, "step": 4210 }, { "clip_ratio/high_max": 0.00044342614710330964, "clip_ratio/high_mean": 0.00021618445171043277, "clip_ratio/low_mean": 5.1200912275817244e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002673853537999094, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 529.7916666666666, "completions/min_length": 256.0, "epoch": 0.8242080563159953, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.796875, "kl": 0.0030351311434060335, "learning_rate": 2.3667414942557037e-07, "loss": 8.127462933771312e-05, "reward": 0.37150606513023376, "reward_std": 0.27914197246233624, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37150607506434125, "rewards/QAReward/std": 0.43100252747535706, "step": 4215 }, { "clip_ratio/high_max": 0.000604753929655999, "clip_ratio/high_mean": 0.00024490252835676076, "clip_ratio/low_mean": 5.9072888689115644e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003039754112251103, "completions/clipped_ratio": 0.060546875, "completions/max_length": 1024.0, "completions/mean_length": 556.599609375, "completions/min_length": 240.5, "epoch": 0.825185764567853, "frac_reward_zero_std": 0.015625, "grad_norm": 0.77734375, "kl": 0.002828144654631615, "learning_rate": 2.341193704833382e-07, "loss": 0.00012279923539608716, "reward": 0.3295511156320572, "reward_std": 0.28718794882297516, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.329551100730896, "rewards/QAReward/std": 0.43903347849845886, "step": 4220 }, { "clip_ratio/high_max": 0.0003060837974771857, "clip_ratio/high_mean": 0.00018196212477050722, "clip_ratio/low_mean": 3.325949801364913e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021522162714973092, "completions/clipped_ratio": 0.020833333333333332, "completions/max_length": 1024.0, "completions/mean_length": 528.125, "completions/min_length": 211.66666666666666, "epoch": 0.8261634728197106, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.83984375, "kl": 0.003001666674390435, "learning_rate": 2.3157728759602725e-07, "loss": 6.384337320923806e-05, "reward": 0.3915783266226451, "reward_std": 0.30466102560361225, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3915783266226451, "rewards/QAReward/std": 0.48149969180425006, "step": 4225 }, { "clip_ratio/high_max": 0.0004929184215143323, "clip_ratio/high_mean": 0.000259951181942597, "clip_ratio/low_mean": 6.34424592135474e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003233936382457614, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 555.25, "completions/min_length": 259.0, "epoch": 0.8271411810715682, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.002902019023895264, "learning_rate": 2.2904792625926797e-07, "loss": 8.794032037258148e-05, "reward": 0.35964861512184143, "reward_std": 0.26862262189388275, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.35964861512184143, "rewards/QAReward/std": 0.4278406947851181, "step": 4230 }, { "clip_ratio/high_max": 0.0004651436349377036, "clip_ratio/high_mean": 0.00019002462504431606, "clip_ratio/low_mean": 4.8069222248159346e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023809383856132627, "completions/clipped_ratio": 0.018229166666666668, "completions/max_length": 1024.0, "completions/mean_length": 519.3489583333334, "completions/min_length": 249.0, "epoch": 0.8281188893234259, "frac_reward_zero_std": 0.0625, "grad_norm": 0.81640625, "kl": 0.003065728861838579, "learning_rate": 2.2653131184109982e-07, "loss": 0.00017642276361584663, "reward": 0.33554429312547046, "reward_std": 0.2774463891983032, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33554429312547046, "rewards/QAReward/std": 0.47652657826741535, "step": 4235 }, { "clip_ratio/high_max": 0.0004901065258309245, "clip_ratio/high_mean": 0.00022163320682011545, "clip_ratio/low_mean": 9.420286805834621e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003158360836096108, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 521.185546875, "completions/min_length": 243.5, "epoch": 0.8290965975752835, "frac_reward_zero_std": 0.03125, "grad_norm": 0.796875, "kl": 0.003029685281217098, "learning_rate": 2.2402746958171894e-07, "loss": 5.7853420730680225e-05, "reward": 0.37691281735897064, "reward_std": 0.29131169617176056, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37691283226013184, "rewards/QAReward/std": 0.4508178234100342, "step": 4240 }, { "clip_ratio/high_max": 0.00034571050200611354, "clip_ratio/high_mean": 0.00020405075047165156, "clip_ratio/low_mean": 5.02446957398206e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002542954403907061, "completions/clipped_ratio": 0.036458333333333336, "completions/max_length": 1024.0, "completions/mean_length": 538.4114583333334, "completions/min_length": 251.0, "epoch": 0.8300743058271411, "frac_reward_zero_std": 0.03125, "grad_norm": 0.75, "kl": 0.002926244027912617, "learning_rate": 2.2153642459322287e-07, "loss": 0.00010862769559025765, "reward": 0.400546391805013, "reward_std": 0.27910976608594257, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.400546391805013, "rewards/QAReward/std": 0.4186622699101766, "step": 4245 }, { "clip_ratio/high_max": 0.0006227486883290112, "clip_ratio/high_mean": 0.0003071788756642491, "clip_ratio/low_mean": 6.722672987962142e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037440560990944507, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 538.16796875, "completions/min_length": 216.5, "epoch": 0.8310520140789989, "frac_reward_zero_std": 0.015625, "grad_norm": 0.83203125, "kl": 0.002984988037496805, "learning_rate": 2.1905820185936171e-07, "loss": 0.00014846080448478461, "reward": 0.33400142192840576, "reward_std": 0.3044060617685318, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33400142192840576, "rewards/QAReward/std": 0.46137474477291107, "step": 4250 }, { "clip_ratio/high_max": 0.0004893881734460593, "clip_ratio/high_mean": 0.00020448790164664387, "clip_ratio/low_mean": 3.7452721153385936e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024194061988964678, "completions/clipped_ratio": 0.037760416666666664, "completions/max_length": 1024.0, "completions/mean_length": 552.546875, "completions/min_length": 261.6666666666667, "epoch": 0.8320297223308565, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 0.79296875, "kl": 0.0028256270568817856, "learning_rate": 2.165928262352842e-07, "loss": 9.443595772609114e-05, "reward": 0.3671063780784607, "reward_std": 0.26736952861150104, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3671063880125682, "rewards/QAReward/std": 0.4345364769299825, "step": 4255 }, { "clip_ratio/high_max": 0.00044161612167954446, "clip_ratio/high_mean": 0.0002226921438705176, "clip_ratio/low_mean": 6.08931397437118e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028358527924865484, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/mean_length": 547.1953125, "completions/min_length": 239.5, "epoch": 0.8330074305827141, "frac_reward_zero_std": 0.015625, "grad_norm": 0.82421875, "kl": 0.0028978604823350906, "learning_rate": 2.1414032244729021e-07, "loss": 0.00020316753070801497, "reward": 0.3017297759652138, "reward_std": 0.2903878092765808, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3017297685146332, "rewards/QAReward/std": 0.4431062489748001, "step": 4260 }, { "clip_ratio/high_max": 0.0004697021562606096, "clip_ratio/high_mean": 0.0002544411225244403, "clip_ratio/low_mean": 3.648814163170755e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002909292466938496, "completions/clipped_ratio": 0.03515625, "completions/max_length": 1024.0, "completions/mean_length": 537.8411458333334, "completions/min_length": 250.0, "epoch": 0.8339851388345718, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80078125, "kl": 0.0029717348515987396, "learning_rate": 2.1170071509258297e-07, "loss": 0.00010695892851799726, "reward": 0.331640621026357, "reward_std": 0.28500524163246155, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3316406309604645, "rewards/QAReward/std": 0.4788203438123067, "step": 4265 }, { "clip_ratio/high_max": 0.00047941585071384905, "clip_ratio/high_mean": 0.0002454129862599075, "clip_ratio/low_mean": 5.478475650306791e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030019773403182625, "completions/clipped_ratio": 0.029296875, "completions/max_length": 1024.0, "completions/mean_length": 539.7421875, "completions/min_length": 215.0, "epoch": 0.8349628470864294, "frac_reward_zero_std": 0.046875, "grad_norm": 0.80078125, "kl": 0.003006558958441019, "learning_rate": 2.0927402863902195e-07, "loss": 8.76189791597426e-05, "reward": 0.4280939996242523, "reward_std": 0.2627129554748535, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4280940145254135, "rewards/QAReward/std": 0.42217838764190674, "step": 4270 }, { "clip_ratio/high_max": 0.00036748102866113186, "clip_ratio/high_mean": 0.00018401359557174146, "clip_ratio/low_mean": 5.886781145818532e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024288140702992677, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 527.7239583333334, "completions/min_length": 267.3333333333333, "epoch": 0.835940555338287, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.80859375, "kl": 0.002973565924912691, "learning_rate": 2.0686028742487651e-07, "loss": 0.0001379877678118646, "reward": 0.3472878634929657, "reward_std": 0.29487644632657367, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3472878535588582, "rewards/QAReward/std": 0.44527647892634076, "step": 4275 }, { "clip_ratio/high_max": 0.0006418951787054538, "clip_ratio/high_mean": 0.0003173695644363761, "clip_ratio/low_mean": 5.173338868189603e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036910296184942124, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 535.908203125, "completions/min_length": 252.5, "epoch": 0.8369182635901447, "frac_reward_zero_std": 0.0625, "grad_norm": 0.828125, "kl": 0.00298849199898541, "learning_rate": 2.044595156585834e-07, "loss": 0.00011486392468214035, "reward": 0.37330496311187744, "reward_std": 0.2551896572113037, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37330496311187744, "rewards/QAReward/std": 0.44161148369312286, "step": 4280 }, { "clip_ratio/high_max": 0.0005753670819103717, "clip_ratio/high_mean": 0.00021822965936735272, "clip_ratio/low_mean": 5.581903096754104e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002740486990660429, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 518.88671875, "completions/min_length": 250.66666666666666, "epoch": 0.8378959718420024, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8515625, "kl": 0.0030550121795386077, "learning_rate": 2.0207173741850342e-07, "loss": 0.00011963965371251107, "reward": 0.4276894231637319, "reward_std": 0.27997955679893494, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4276894231637319, "rewards/QAReward/std": 0.44328012069066364, "step": 4285 }, { "clip_ratio/high_max": 0.00045951567590236665, "clip_ratio/high_mean": 0.0002467838174197823, "clip_ratio/low_mean": 8.676619036123157e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003335500019602478, "completions/clipped_ratio": 0.041015625, "completions/max_length": 1024.0, "completions/mean_length": 538.09765625, "completions/min_length": 240.5, "epoch": 0.83887368009386, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80078125, "kl": 0.0029704909306019545, "learning_rate": 1.9969697665267966e-07, "loss": 0.00011517768725752831, "reward": 0.3746698200702667, "reward_std": 0.2758966535329819, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3746698200702667, "rewards/QAReward/std": 0.43867772817611694, "step": 4290 }, { "clip_ratio/high_max": 0.00034960206830874084, "clip_ratio/high_mean": 0.000124086975120008, "clip_ratio/low_mean": 3.5524835402611644e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001596118148881942, "completions/clipped_ratio": 0.029947916666666668, "completions/max_length": 1024.0, "completions/mean_length": 533.9817708333334, "completions/min_length": 270.6666666666667, "epoch": 0.8398513883457176, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 0.80078125, "kl": 0.0029391931369900704, "learning_rate": 1.9733525717859657e-07, "loss": 0.00011153124505653977, "reward": 0.3823862373828888, "reward_std": 0.28025717039903003, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3823862373828888, "rewards/QAReward/std": 0.46280749638875324, "step": 4295 }, { "clip_ratio/high_max": 0.00044844330986961725, "clip_ratio/high_mean": 0.000252344534965232, "clip_ratio/low_mean": 8.024775015655906e-05, "clip_ratio/low_min": 2.3468669678550213e-05, "clip_ratio/region_mean": 0.0003325922763906419, "completions/clipped_ratio": 0.033203125, "completions/max_length": 1024.0, "completions/mean_length": 542.474609375, "completions/min_length": 240.5, "epoch": 0.8408290965975753, "frac_reward_zero_std": 0.015625, "grad_norm": 0.87109375, "kl": 0.0029668739531189202, "learning_rate": 1.9498660268294343e-07, "loss": 0.00012696180492639543, "reward": 0.3302261680364609, "reward_std": 0.28383490443229675, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33022618293762207, "rewards/QAReward/std": 0.41345788538455963, "step": 4300 }, { "clip_ratio/high_max": 0.0003500675084069371, "clip_ratio/high_mean": 0.00013150591985322535, "clip_ratio/low_mean": 3.909639926860109e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001706023293081671, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/mean_length": 509.69140625, "completions/min_length": 237.0, "epoch": 0.8418068048494329, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 0.8671875, "kl": 0.0030482035130262376, "learning_rate": 1.9265103672137417e-07, "loss": 0.00012659155763685703, "reward": 0.39568089445432025, "reward_std": 0.2878165642420451, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.39568092425664264, "rewards/QAReward/std": 0.45612939198811847, "step": 4305 }, { "clip_ratio/high_max": 0.00036408109590411184, "clip_ratio/high_mean": 0.0002127488434780389, "clip_ratio/low_mean": 7.966904377099127e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002924178959801793, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 542.39453125, "completions/min_length": 243.5, "epoch": 0.8427845131012905, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8046875, "kl": 0.0029334927443414927, "learning_rate": 1.9032858271827243e-07, "loss": 0.00010988920694217086, "reward": 0.3048774003982544, "reward_std": 0.3080311566591263, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3048774003982544, "rewards/QAReward/std": 0.46594493091106415, "step": 4310 }, { "clip_ratio/high_max": 0.00038503409596160056, "clip_ratio/high_mean": 0.0001531415735371411, "clip_ratio/low_mean": 4.085551045136526e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019399707671254873, "completions/clipped_ratio": 0.041666666666666664, "completions/max_length": 1024.0, "completions/mean_length": 551.015625, "completions/min_length": 264.0, "epoch": 0.8437622213531483, "frac_reward_zero_std": 0.03125, "grad_norm": 0.76953125, "kl": 0.0029419029597193003, "learning_rate": 1.880192639665171e-07, "loss": 0.00012517670402303338, "reward": 0.3266642888387044, "reward_std": 0.2766266067822774, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3266642888387044, "rewards/QAReward/std": 0.42704201738039654, "step": 4315 }, { "clip_ratio/high_max": 0.0005366967525333166, "clip_ratio/high_mean": 0.00023875129409134388, "clip_ratio/low_mean": 7.958371425047517e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031833500834181904, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 544.25390625, "completions/min_length": 265.0, "epoch": 0.8447399296050059, "frac_reward_zero_std": 0.046875, "grad_norm": 0.734375, "kl": 0.0028462296817451716, "learning_rate": 1.857231036272472e-07, "loss": 9.87600302323699e-05, "reward": 0.3112723380327225, "reward_std": 0.2795201241970062, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3112723380327225, "rewards/QAReward/std": 0.4427582770586014, "step": 4320 }, { "clip_ratio/high_max": 0.00029233325039967895, "clip_ratio/high_mean": 0.00015443932497873902, "clip_ratio/low_mean": 3.514828276820481e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018958760774694382, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 543.2291666666666, "completions/min_length": 234.66666666666666, "epoch": 0.8457176378568635, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.78125, "kl": 0.0030477363616228104, "learning_rate": 1.8344012472963067e-07, "loss": 0.0001431920565664768, "reward": 0.2897791663805644, "reward_std": 0.27032653490702313, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.2897791663805644, "rewards/QAReward/std": 0.4112665156523387, "step": 4325 }, { "clip_ratio/high_max": 0.0005122284987010062, "clip_ratio/high_mean": 0.0002354433760046959, "clip_ratio/low_mean": 7.516549958381802e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031060888431966305, "completions/clipped_ratio": 0.044921875, "completions/max_length": 1024.0, "completions/mean_length": 548.962890625, "completions/min_length": 262.5, "epoch": 0.8466953461087211, "frac_reward_zero_std": 0.03125, "grad_norm": 0.79296875, "kl": 0.0027603634167462586, "learning_rate": 1.8117035017063365e-07, "loss": 0.00013592415489256383, "reward": 0.36153893172740936, "reward_std": 0.27941690385341644, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36153894662857056, "rewards/QAReward/std": 0.40176425874233246, "step": 4330 }, { "clip_ratio/high_max": 0.000276545318774879, "clip_ratio/high_mean": 0.00013678441755473613, "clip_ratio/low_mean": 2.7719915669877083e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001645043375901878, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 525.9830729166666, "completions/min_length": 235.0, "epoch": 0.8476730543605788, "frac_reward_zero_std": 0.03125, "grad_norm": 0.828125, "kl": 0.0030574051197618246, "learning_rate": 1.789138027147899e-07, "loss": 0.00016896446468308567, "reward": 0.41378453373908997, "reward_std": 0.2796175380547841, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.41378451387087506, "rewards/QAReward/std": 0.4311663905779521, "step": 4335 }, { "clip_ratio/high_max": 0.0006671840790659189, "clip_ratio/high_mean": 0.00023935675853863357, "clip_ratio/low_mean": 8.426518033957108e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003236219403333962, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 524.26171875, "completions/min_length": 242.0, "epoch": 0.8486507626124364, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.0029714960139244793, "learning_rate": 1.7667050499397282e-07, "loss": 0.00011033108457922936, "reward": 0.30613142251968384, "reward_std": 0.30749930441379547, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.30613142251968384, "rewards/QAReward/std": 0.4786885678768158, "step": 4340 }, { "clip_ratio/high_max": 0.00036229481920599935, "clip_ratio/high_mean": 0.0001728659786749631, "clip_ratio/low_mean": 4.2482520802877845e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021534849656745791, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 537.0950520833334, "completions/min_length": 251.66666666666666, "epoch": 0.849628470864294, "frac_reward_zero_std": 0.0625, "grad_norm": 0.80859375, "kl": 0.0029726655222475527, "learning_rate": 1.744404795071692e-07, "loss": 0.00014400864019989968, "reward": 0.375322421391805, "reward_std": 0.2645599792400996, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.37532244126001996, "rewards/QAReward/std": 0.4348786175251007, "step": 4345 }, { "clip_ratio/high_max": 0.0005105827818624675, "clip_ratio/high_mean": 0.00028460272587835787, "clip_ratio/low_mean": 8.184851758414879e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000366451230365783, "completions/clipped_ratio": 0.01171875, "completions/max_length": 1024.0, "completions/mean_length": 517.669921875, "completions/min_length": 259.0, "epoch": 0.8506061791161518, "frac_reward_zero_std": 0.015625, "grad_norm": 0.8203125, "kl": 0.0029448303394019604, "learning_rate": 1.7222374862025238e-07, "loss": 0.0002000484149903059, "reward": 0.44301681220531464, "reward_std": 0.28656625747680664, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.44301681220531464, "rewards/QAReward/std": 0.39420635998249054, "step": 4350 }, { "clip_ratio/high_max": 0.000442329584620893, "clip_ratio/high_mean": 0.00022580038057640195, "clip_ratio/low_mean": 4.140259770792909e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002672029659152031, "completions/clipped_ratio": 0.022135416666666668, "completions/max_length": 1024.0, "completions/mean_length": 536.703125, "completions/min_length": 246.66666666666666, "epoch": 0.8515838873680094, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 0.8203125, "kl": 0.002922180853784084, "learning_rate": 1.7002033456575883e-07, "loss": 5.5020826403051613e-05, "reward": 0.3486250738302867, "reward_std": 0.2837316294511159, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3486250638961792, "rewards/QAReward/std": 0.4296138981978099, "step": 4355 }, { "clip_ratio/high_max": 0.0005608068080618977, "clip_ratio/high_mean": 0.00029446802218444643, "clip_ratio/low_mean": 6.17030105786398e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003561710356734693, "completions/clipped_ratio": 0.017578125, "completions/max_length": 1024.0, "completions/mean_length": 535.75, "completions/min_length": 254.0, "epoch": 0.852561595619867, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8359375, "kl": 0.0029095578007400035, "learning_rate": 1.6783025944266405e-07, "loss": 0.00012885504402220248, "reward": 0.3277692645788193, "reward_std": 0.2939479351043701, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3277692496776581, "rewards/QAReward/std": 0.4580059200525284, "step": 4360 }, { "clip_ratio/high_max": 0.000456034776289016, "clip_ratio/high_mean": 0.0001745645538903773, "clip_ratio/low_mean": 6.593306316062807e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024049761705100537, "completions/clipped_ratio": 0.040364583333333336, "completions/max_length": 1024.0, "completions/mean_length": 529.9309895833334, "completions/min_length": 244.33333333333334, "epoch": 0.8535393038717247, "frac_reward_zero_std": 0.03125, "grad_norm": 0.80078125, "kl": 0.0030520183499902487, "learning_rate": 1.656535452161632e-07, "loss": 0.00010978956706821918, "reward": 0.35794374346733093, "reward_std": 0.2638590782880783, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3579437534014384, "rewards/QAReward/std": 0.43150925636291504, "step": 4365 }, { "clip_ratio/high_max": 0.0005305795115418732, "clip_ratio/high_mean": 0.00025926416856236756, "clip_ratio/low_mean": 5.676477085216902e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003160289372317493, "completions/clipped_ratio": 0.052734375, "completions/max_length": 1024.0, "completions/mean_length": 541.830078125, "completions/min_length": 242.5, "epoch": 0.8545170121235823, "frac_reward_zero_std": 0.046875, "grad_norm": 0.82421875, "kl": 0.002803700603544712, "learning_rate": 1.634902137174483e-07, "loss": 0.00022404873743653297, "reward": 0.36104367673397064, "reward_std": 0.269224651157856, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.36104367673397064, "rewards/QAReward/std": 0.49193692207336426, "step": 4370 }, { "clip_ratio/high_max": 0.00029354189755395057, "clip_ratio/high_mean": 0.00013265296001918614, "clip_ratio/low_mean": 6.722626567352563e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019987921696156262, "completions/clipped_ratio": 0.024739583333333332, "completions/max_length": 1024.0, "completions/mean_length": 531.3919270833334, "completions/min_length": 268.0, "epoch": 0.8554947203754399, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7890625, "kl": 0.0029799608513712884, "learning_rate": 1.6134028664349026e-07, "loss": 0.00010966697009280324, "reward": 0.3773380716641744, "reward_std": 0.27213863531748456, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.3773380716641744, "rewards/QAReward/std": 0.41895421346028644, "step": 4375 }, { "clip_ratio/high_max": 0.0004614585661329329, "clip_ratio/high_mean": 0.000219314283458516, "clip_ratio/low_mean": 7.108850550139323e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002904027933254838, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 518.123046875, "completions/min_length": 265.5, "epoch": 0.8564724286272977, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8671875, "kl": 0.002927587553858757, "learning_rate": 1.5920378555682163e-07, "loss": 0.00016628807643428444, "reward": 0.4083179831504822, "reward_std": 0.24491140246391296, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.4083179831504822, "rewards/QAReward/std": 0.448096364736557, "step": 4380 }, { "clip_ratio/high_max": 0.0003204236156307161, "clip_ratio/high_mean": 0.0001528916647657752, "clip_ratio/low_mean": 4.9917529395315796e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020280919852666557, "completions/clipped_ratio": 0.033854166666666664, "completions/max_length": 1024.0, "completions/mean_length": 528.9361979166666, "completions/min_length": 251.0, "epoch": 0.8574501368791553, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 0.78125, "kl": 0.002952137030661106, "learning_rate": 1.5708073188532025e-07, "loss": 0.00014542676508426665, "reward": 0.38504258791605633, "reward_std": 0.27843953172365826, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.38504258791605633, "rewards/QAReward/std": 0.4544186592102051, "step": 4385 }, { "clip_ratio/high_max": 0.0004504724289290607, "clip_ratio/high_mean": 0.0002410866436548531, "clip_ratio/low_mean": 7.193740457296371e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031302402494475244, "completions/clipped_ratio": 0.01953125, "completions/max_length": 1024.0, "completions/mean_length": 544.16796875, "completions/min_length": 273.5, "epoch": 0.8584278451310129, "frac_reward_zero_std": 0.0625, "grad_norm": 0.79296875, "kl": 0.0029806621838361024, "learning_rate": 1.5497114692199377e-07, "loss": 0.0001272534020245075, "reward": 0.3324799984693527, "reward_std": 0.29502472281455994, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.33247998356819153, "rewards/QAReward/std": 0.4396052807569504, "step": 4390 }, { "clip_ratio/high_max": 0.0003549292217940092, "clip_ratio/high_mean": 0.00015865829191170634, "clip_ratio/low_mean": 3.543282728060149e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019409111700952053, "completions/clipped_ratio": 0.026041666666666668, "completions/max_length": 1024.0, "completions/mean_length": 525.6263020833334, "completions/min_length": 264.3333333333333, "epoch": 0.8594055533828705, "frac_reward_zero_std": 0.03125, "grad_norm": 0.81640625, "kl": 0.002979703852906823, "learning_rate": 1.528750518247663e-07, "loss": 0.00012762262485921383, "reward": 0.29664403200149536, "reward_std": 0.2927919228871663, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.29664404193560284, "rewards/QAReward/std": 0.4647211829821269, "step": 4395 }, { "clip_ratio/high_max": 0.00043934080749750135, "clip_ratio/high_mean": 0.0002411194669548422, "clip_ratio/low_mean": 6.415813113562763e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003052776039112359, "completions/clipped_ratio": 0.03125, "completions/max_length": 1024.0, "completions/mean_length": 525.373046875, "completions/min_length": 250.5, "epoch": 0.8603832616347282, "frac_reward_zero_std": 0.03125, "grad_norm": 0.78515625, "kl": 0.0030499016866087914, "learning_rate": 1.507924676162663e-07, "loss": 0.00013259758707135917, "reward": 0.40972842276096344, "reward_std": 0.27650539577007294, "rewards/FormatReward/mean": 0.0, "rewards/FormatReward/std": 0.0, "rewards/QAReward/mean": 0.40972843766212463, "rewards/QAReward/std": 0.42427462339401245, "step": 4400 } ], "logging_steps": 5, "max_steps": 5114, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }